[med-svn] [segemehl] 01/02: Imported Upstream version 0.2.0+dfsg
Andreas Tille
tille at debian.org
Mon Jun 20 11:36:13 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository segemehl.
commit ebec66244be32d971401a6932da69efe022afa0c
Author: Andreas Tille <tille at debian.org>
Date: Mon Jun 20 13:31:12 2016 +0200
Imported Upstream version 0.2.0+dfsg
---
segemehl/Makefile | 219 ++
segemehl/libs/708.c | 2833 +++++++++++++++++++++++++
segemehl/libs/708.h | 15 +
segemehl/libs/SAX.c | 443 ++++
segemehl/libs/SAX.h | 21 +
segemehl/libs/alignment.c | 1079 ++++++++++
segemehl/libs/alignment.h | 71 +
segemehl/libs/aluruSort.c | 1842 +++++++++++++++++
segemehl/libs/aluruSort.h | 84 +
segemehl/libs/basic-types.h | 75 +
segemehl/libs/biofiles.c | 3628 +++++++++++++++++++++++++++++++++
segemehl/libs/biofiles.h | 292 +++
segemehl/libs/bitArray.c | 89 +
segemehl/libs/bitArray.h | 69 +
segemehl/libs/bitVector.c | 192 ++
segemehl/libs/bitVector.h | 67 +
segemehl/libs/bitvectoralg.c | 632 ++++++
segemehl/libs/bitvectoralg.h | 75 +
segemehl/libs/browsematchfiles.c | 1482 ++++++++++++++
segemehl/libs/browsematchfiles.h | 94 +
segemehl/libs/citation.h | 50 +
segemehl/libs/container.c | 168 ++
segemehl/libs/container.h | 47 +
segemehl/libs/debug.c | 91 +
segemehl/libs/debug.h | 46 +
segemehl/libs/evalmatchfiles.c | 1976 ++++++++++++++++++
segemehl/libs/evalmatchfiles.h | 131 ++
segemehl/libs/evalmatchfileshelper.c | 1705 ++++++++++++++++
segemehl/libs/evalmethylmatchfiles.c | 732 +++++++
segemehl/libs/evalmethylmatchfiles.h | 39 +
segemehl/libs/fileBins.c | 956 +++++++++
segemehl/libs/fileBins.h | 179 ++
segemehl/libs/fileio.c | 580 ++++++
segemehl/libs/fileio.h | 47 +
segemehl/libs/fqueue.c | 183 ++
segemehl/libs/fqueue.h | 35 +
segemehl/libs/fstack.c | 92 +
segemehl/libs/fstack.h | 36 +
segemehl/libs/hash.c | 346 ++++
segemehl/libs/hash.h | 87 +
segemehl/libs/info.c | 128 ++
segemehl/libs/info.h | 44 +
segemehl/libs/iupac.c | 291 +++
segemehl/libs/iupac.h | 34 +
segemehl/libs/karlin.c | 271 +++
segemehl/libs/karlin.h | 36 +
segemehl/libs/kdchain.c | 1379 +++++++++++++
segemehl/libs/kdchain.h | 100 +
segemehl/libs/kdseed.c | 1079 ++++++++++
segemehl/libs/kdseed.h | 185 ++
segemehl/libs/list.c | 474 +++++
segemehl/libs/list.h | 73 +
segemehl/libs/manopt.c | 1125 ++++++++++
segemehl/libs/manopt.h | 170 ++
segemehl/libs/manout.c | 1866 +++++++++++++++++
segemehl/libs/manout.h | 215 ++
segemehl/libs/manoutformats.h | 153 ++
segemehl/libs/matchfiles.c | 2536 +++++++++++++++++++++++
segemehl/libs/matchfiles.h | 475 +++++
segemehl/libs/matchfilesfields.c | 867 ++++++++
segemehl/libs/matchfilesfields.h | 44 +
segemehl/libs/matepairs.c | 371 ++++
segemehl/libs/matepairs.h | 55 +
segemehl/libs/matfile.c | 925 +++++++++
segemehl/libs/matfile.h | 25 +
segemehl/libs/mathematics.c | 2488 ++++++++++++++++++++++
segemehl/libs/mathematics.h | 187 ++
segemehl/libs/md5.c | 407 ++++
segemehl/libs/md5.h | 96 +
segemehl/libs/memmac.h | 7 +
segemehl/libs/memman.c | 190 ++
segemehl/libs/memman.h | 67 +
segemehl/libs/memory.c | 27 +
segemehl/libs/memory.h | 19 +
segemehl/libs/merge.c | 626 ++++++
segemehl/libs/merge.h | 97 +
segemehl/libs/ncursesext.c | 78 +
segemehl/libs/ncursesext.h | 28 +
segemehl/libs/newton.c | 88 +
segemehl/libs/nw.c | 402 ++++
segemehl/libs/nw.h | 36 +
segemehl/libs/physmem.c | 332 +++
segemehl/libs/plotmatchfiles.c | 248 +++
segemehl/libs/plotmatchfiles.h | 21 +
segemehl/libs/queue.c | 211 ++
segemehl/libs/queue.h | 66 +
segemehl/libs/radixsort.c | 168 ++
segemehl/libs/radixsort.h | 31 +
segemehl/libs/randseqs.c | 1522 ++++++++++++++
segemehl/libs/randseqs.h | 117 ++
segemehl/libs/realign.c | 3448 +++++++++++++++++++++++++++++++
segemehl/libs/realign.h | 220 ++
segemehl/libs/remapping.c | 1603 +++++++++++++++
segemehl/libs/remapping.h | 146 ++
segemehl/libs/seqclip.c | 686 +++++++
segemehl/libs/seqclip.h | 40 +
segemehl/libs/snvsplines.c | 389 ++++
segemehl/libs/snvsplines.h | 51 +
segemehl/libs/sort.c | 547 +++++
segemehl/libs/sort.h | 50 +
segemehl/libs/splicesites.c | 1251 ++++++++++++
segemehl/libs/splicesites.h | 34 +
segemehl/libs/splines.c | 214 ++
segemehl/libs/splines.h | 51 +
segemehl/libs/stack.c | 164 ++
segemehl/libs/stack.h | 71 +
segemehl/libs/startsites.c | 72 +
segemehl/libs/startsites.h | 47 +
segemehl/libs/stringutils.c | 446 ++++
segemehl/libs/stringutils.h | 72 +
segemehl/libs/sufarray/charsequence.c | 459 +++++
segemehl/libs/sufarray/charsequence.h | 320 +++
segemehl/libs/sufarray/falphabet.c | 52 +
segemehl/libs/sufarray/falphabet.h | 40 +
segemehl/libs/sufarray/mmchar.c | 174 ++
segemehl/libs/sufarray/mmchar.h | 31 +
segemehl/libs/sufarray/multicharseq.c | 393 ++++
segemehl/libs/sufarray/multicharseq.h | 87 +
segemehl/libs/sufarray/sufarray.c | 1542 ++++++++++++++
segemehl/libs/sufarray/sufarray.h | 124 ++
segemehl/libs/sufarray/sufmatch.c | 599 ++++++
segemehl/libs/sufarray/sufmatch.h | 110 +
segemehl/libs/sw.c | 1562 ++++++++++++++
segemehl/libs/sw.h | 111 +
segemehl/libs/vqueue.c | 213 ++
segemehl/libs/vqueue.h | 45 +
segemehl/libs/vstack.c | 168 ++
segemehl/libs/vstack.h | 49 +
segemehl/libs/vtprogressbar.c | 84 +
segemehl/libs/vtprogressbar.h | 24 +
segemehl/libs/zran.c | 521 +++++
segemehl/libs/zran.h | 72 +
segemehl/src/filebintest.c | 135 ++
segemehl/src/filebintest.h | 21 +
segemehl/src/genfasta.c | 383 ++++
segemehl/src/kdmatch.c | 2094 +++++++++++++++++++
segemehl/src/kdmatch.h | 107 +
segemehl/src/rsorter.c | 89 +
segemehl/src/rsorter.h | 9 +
segemehl/src/segemehl.c | 1023 ++++++++++
segemehl/src/segemehl.h | 225 ++
segemehl/src/version.h | 395 ++++
142 files changed, 62171 insertions(+)
diff --git a/segemehl/Makefile b/segemehl/Makefile
new file mode 100644
index 0000000..db21182
--- /dev/null
+++ b/segemehl/Makefile
@@ -0,0 +1,219 @@
+ CC=gcc
+ LD=${CC}
+ CFLAGS= -Wall -pedantic -std=c99 -g -O3 -DFIXINSMALL -DFIXINBACKSPLICE -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DDBGNFO -DSHOWALIGN -DDBGLEVEL=0 -DPROGNFO -Isrc -Ilibs -Ilibs/sufarray -Lsrc
+ LDFLAGS= -lm -lpthread -lz -lncurses
+ CTAGS=ctags > tags
+ LIBS=-lob -lm -lpthread
+
+
+
+ GENFASTAOBJ = libs/vtprogressbar.o\
+ libs/debug.o\
+ libs/info.o\
+ libs/seqclip.o\
+ libs/stringutils.o\
+ libs/mathematics.o\
+ libs/fileio.o\
+ libs/zran.o\
+ libs/biofiles.o\
+ libs/randseqs.o\
+ libs/memory.o\
+ libs/alignment.o\
+ libs/sw.o\
+ libs/sufarray/charsequence.o\
+ libs/manopt.o\
+ libs/iupac.o\
+ libs/bitVector.o\
+ src/genfasta.o
+
+ SEGEMEHLOBJ = libs/debug.o\
+ libs/info.o\
+ libs/md5.o\
+ libs/stringutils.o\
+ libs/708.o\
+ libs/mathematics.o\
+ libs/memory.o\
+ libs/zran.o\
+ libs/fileio.o\
+ libs/biofiles.o\
+ libs/karlin.o\
+ libs/kdseed.o\
+ libs/queue.o\
+ libs/sufarray/sufarray.o\
+ libs/sufarray/charsequence.o\
+ libs/sufarray/multicharseq.o\
+ libs/radixsort.o\
+ libs/fileBins.o\
+ libs/stack.o\
+ libs/aluruSort.o\
+ libs/sort.o\
+ libs/vtprogressbar.o\
+ libs/sufarray/mmchar.o\
+ libs/bitArray.o\
+ libs/bitVector.o\
+ libs/bitvectoralg.o\
+ libs/manout.o\
+ libs/kdchain.o\
+ libs/manopt.o\
+ libs/container.o\
+ libs/vstack.o\
+ libs/vqueue.o\
+ libs/alignment.o\
+ libs/sw.o\
+ libs/seqclip.o\
+ libs/iupac.o\
+ libs/hash.o\
+ libs/matchfilesfields.o\
+ libs/merge.o\
+ src/kdmatch.o\
+ src/segemehl.o
+
+
+ TESTMATOBJ = libs/vtprogressbar.o\
+ libs/debug.o\
+ libs/info.o\
+ libs/bitVector.o\
+ libs/seqclip.o\
+ libs/stringutils.o\
+ libs/mathematics.o\
+ libs/splines.o\
+ libs/snvsplines.o\
+ libs/evalmatchfileshelper.o\
+ libs/708.o\
+ libs/fileio.o\
+ libs/zran.o\
+ libs/biofiles.o\
+ libs/randseqs.o\
+ libs/sufarray/charsequence.o\
+ libs/memory.o\
+ libs/iupac.o\
+ libs/alignment.o\
+ libs/sw.o\
+ libs/nw.o\
+ libs/manopt.o\
+ libs/container.o\
+ libs/vstack.o\
+ libs/vqueue.o\
+ libs/sort.o\
+ libs/list.o\
+ libs/matepairs.o\
+ libs/matchfilesfields.o\
+ libs/matchfiles.o\
+ libs/evalmatchfiles.o\
+ libs/evalmethylmatchfiles.o\
+ libs/browsematchfiles.o\
+ libs/plotmatchfiles.o\
+ libs/ncursesext.o\
+ libs/splicesites.o\
+ libs/startsites.o\
+ libs/matfile.o
+
+ TESTREALIGNOBJ = libs/fileBins.o\
+ libs/manout.o\
+ libs/evalmatchfileshelper.o\
+ libs/708.o\
+ libs/debug.o\
+ libs/info.o\
+ libs/bitVector.o\
+ libs/seqclip.o\
+ libs/stringutils.o\
+ libs/mathematics.o\
+ libs/fileio.o\
+ libs/zran.o\
+ libs/biofiles.o\
+ libs/randseqs.o\
+ libs/sufarray/charsequence.o\
+ libs/sufarray/multicharseq.o\
+ libs/memory.o\
+ libs/iupac.o\
+ libs/alignment.o\
+ libs/sw.o\
+ libs/nw.o\
+ libs/manopt.o\
+ libs/container.o\
+ libs/vstack.o\
+ libs/vqueue.o\
+ libs/radixsort.o\
+ libs/sort.o\
+ libs/list.o\
+ libs/fqueue.o\
+ libs/matepairs.o\
+ libs/matchfilesfields.o\
+ libs/matchfiles.o\
+ libs/vtprogressbar.o\
+ libs/evalmatchfiles.o\
+ libs/splicesites.o\
+ libs/startsites.o\
+ libs/realign.o
+
+ TESTREMAPPINGOBJ = libs/vtprogressbar.o\
+ libs/fileBins.o\
+ libs/manout.o\
+ libs/debug.o\
+ libs/info.o\
+ libs/bitVector.o\
+ libs/seqclip.o\
+ libs/stringutils.o\
+ libs/mathematics.o\
+ libs/evalmatchfileshelper.o\
+ libs/708.o\
+ libs/fileio.o\
+ libs/zran.o\
+ libs/biofiles.o\
+ libs/randseqs.o\
+ libs/sufarray/charsequence.o\
+ libs/sufarray/multicharseq.o\
+ libs/memory.o\
+ libs/iupac.o\
+ libs/alignment.o\
+ libs/sw.o\
+ libs/nw.o\
+ libs/manopt.o\
+ libs/container.o\
+ libs/vstack.o\
+ libs/vqueue.o\
+ libs/radixsort.o\
+ libs/sort.o\
+ libs/list.o\
+ libs/fqueue.o\
+ libs/matepairs.o\
+ libs/matchfilesfields.o\
+ libs/matchfiles.o\
+ libs/evalmatchfiles.o\
+ libs/ncursesext.o\
+ libs/splicesites.o\
+ libs/startsites.o\
+ libs/realign.o\
+ libs/remapping.o
+
+all: segemehl.x lack.x testrealign.x
+
+haarz.x: ${TESTMATOBJ}
+	$(CC) $(CFLAGS) -DGSLACTIVE -c -o libs/evalmatchfiles.o libs/evalmatchfiles.c
+	$(CC) $(CFLAGS) ${TESTMATOBJ} -o $@ $(LDFLAGS) -lform -lmenu -lgsl -lgslcblas
+
+# Linking segemehl.x also appends a blank line to src/version.h (which updates that file's timestamp).
+segemehl.x: ${SEGEMEHLOBJ}
+	echo " " >> src/version.h
+	$(CC) $(CFLAGS) ${SEGEMEHLOBJ} -o $@ $(LDFLAGS)
+
+genfasta.x: ${GENFASTAOBJ}
+	$(CC) $(CFLAGS) ${GENFASTAOBJ} -o $@ $(LDFLAGS)
+
+# libs/realign.o is recompiled with -DREALIGNTEST here and without it for lack.x; both recipes overwrite the same object file.
+testrealign.x: ${TESTREALIGNOBJ}
+	$(CC) $(CFLAGS) -DREALIGNTEST -c -o libs/realign.o libs/realign.c
+	$(CC) $(CFLAGS) ${TESTREALIGNOBJ} -o $@ $(LDFLAGS)
+
+lack.x: ${TESTREMAPPINGOBJ}
+	$(CC) $(CFLAGS) -c -o libs/realign.o libs/realign.c
+	$(CC) $(CFLAGS) -DREMAPPINGTEST -c -o libs/remapping.o libs/remapping.c
+	$(CC) $(CFLAGS) ${TESTREMAPPINGOBJ} -o $@ $(LDFLAGS)
+.PHONY: all clean
+clean:
+	rm -f ${TESTMATOBJ}
+	rm -f ${SEGEMEHLOBJ} ${GENFASTAOBJ}
+	rm -f ${TESTREALIGNOBJ} ${TESTREMAPPINGOBJ}
+	rm -f *~
+
+
diff --git a/segemehl/libs/708.c b/segemehl/libs/708.c
new file mode 100644
index 0000000..8d260af
--- /dev/null
+++ b/segemehl/libs/708.c
@@ -0,0 +1,2833 @@
+/* 708.f -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+//#include "f2c.h"
+#include "limits.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "float.h"
+/* Table of constant values */
+
+static long int c__1 = 1; /* integer 1, passed by address (e.g. exparg_(&c__1)) */
+static long int c__0 = 0; /* integer 0, passed by address (e.g. exparg_(&c__0)) */
+static double c_b188 = 1.f; /* floating-point constant 1.0 */
+
+
+
+double r_sign(double *a, double *b) /* returns |*a| when *b >= 0, otherwise -|*a| */
+{
+double magnitude = (*a >= 0) ? *a : -(*a);
+if (*b >= 0) { return magnitude; }
+return -magnitude;
+}
+
+
+/* ALGORITHM 708, COLLECTED ALGORITHMS FROM ACM. */
+/* THIS WORK PUBLISHED IN TRANSACTIONS ON MATHEMATICAL SOFTWARE, */
+/* VOL. 18, NO. 3, SEPTEMBER, 1992, PP. 360-373. */
+/* PROGRAM BTST (OUTPUT, TAPE6=OUTPUT) */
+/* ----------------------------------------------------------------------- */
+
+/* SAMPLE PROGRAM USING BRATIO. GIVEN THE NONNEGATIVE VALUES */
+/* A, B, X, Y WHERE A AND B ARE NOT BOTH 0 AND X + Y = 1. THEN */
+
+/* CALL BRATIO (A, B, X, Y, W, W1, IERR) */
+
+/* COMPUTES THE VALUES */
+
+/* W = I (A,B) AND W1 = 1 - I (A,B). */
+/* X X */
+
+/* IERR IS A VARIABLE THAT REPORTS THE STATUS OF THE RESULTS. */
+/* IF NO INPUT ERRORS ARE DETECTED THEN IERR IS SET TO 0 AND */
+/* W AND W1 ARE COMPUTED. FOR MORE DETAILS SEE THE IN-LINE */
+/* DOCUMENTATION OF BRATIO. */
+
+/* THE LAST FUNCTION IN THIS PACKAGE, IPMPAR, MUST BE DEFINED */
+/* FOR THE PARTICULAR COMPUTER BEING USED. FOR DETAILS SEE THE */
+/* IN-LINE DOCUMENTATION OF IPMPAR. */
+
+/* NO DATA IS READ. THE OUTPUT FOR THE PROGRAM IS WRITTEN ON */
+/* UNIT 6. THE FIRST STATEMENT OF THIS TEXT MAY BE USED TO */
+/* BEGIN THE PROGRAM FOR THE CDC 6000-7000 SERIES COMPUTERS. */
+/* ----------------------------------------------------------------------- */
+/* Main program */
+/*
+int main(void) {
+ int l, ierr;
+ double a = 5.3f;
+ double b = 10.1f;
+ double x = .01f, y, w, w1;
+
+ for (l = 1; l <= 50; ++l) {
+ y = .5f - x + .5f;
+ bratio_(&a, &b, &x, &y, &w, &w1, &ierr);
+ if (ierr != 0) {
+ fprintf(stderr, "stop: %d!\n", ierr);
+ exit(-1);
+ }
+
+ fprintf(stderr, "I(a,b,x,y) = I(%f,%f,%f,%f) = %.16f, %.16f\n", a, b, x, y, w, w1);
+ x += .01f;
+ }
+}
+*/
+//{
+// /* Format strings */
+// static char fmt_1[] = "(\0021 X Y\002,11x,\002W\002,14x,\002W1\002"
+// "/)";
+// static char fmt_2[] = "(2f6.2,2e16.6)";
+
+// /* Builtin functions */
+// long int s_wsfe(cilist *), e_wsfe(void);
+// /* Subroutine */ int s_stop(char *, ftnlen);
+// long int do_fio(long int *, char *, ftnlen);
+
+ /* Local variables */
+// static double a, b;
+// static long int l;
+// static double w, x, y, w1;
+// static long int ierr;
+// extern /* Subroutine */ int bratio_(double *, double *, double *, double *, double *
+// , double *, long int *);
+
+// /* Fortran I/O blocks */
+// static cilist io___1 = { 0, 6, 0, fmt_1, 0 };
+// static cilist io___10 = { 0, 6, 0, fmt_2, 0 };
+
+
+// s_wsfe(&io___1);
+// e_wsfe();
+/* L2: */
+
+// a = 5.3f;
+// b = 10.1f;
+// x = .01f;
+// for (l = 1; l <= 50; ++l) {
+// y = .5f - x + .5f;
+// bratio_(&a, &b, &x, &y, &w, &w1, &ierr);
+// if (ierr != 0) {
+// s_stop("", (ftnlen)0);
+// }
+// s_wsfe(&io___10);
+// do_fio(&c__1, (char *)&x, (ftnlen)sizeof(double));
+// do_fio(&c__1, (char *)&y, (ftnlen)sizeof(double));
+// do_fio(&c__1, (char *)&w, (ftnlen)sizeof(double));
+// do_fio(&c__1, (char *)&w1, (ftnlen)sizeof(double));
+// e_wsfe();
+// x += .01f;
+/* L10: */
+ // }
+// s_stop("", (ftnlen)0);
+// return 0;
+//} /* MAIN__ */
+
+/* Subroutine */ int bratio_(double *a, double *b, double *x, double *y, double *w,
+ double *w1, long int *ierr)
+{
+ /* System generated locals (scratch temporaries) */
+ double r__1, r__2;
+ double d__1, d__2;
+
+ /* Builtin functions */
+ double pow(double, double);
+
+ /* Local variables. NOTE(review): these have static storage, so simultaneous calls would share them -- confirm callers never run this in parallel. */
+ static long int n;
+ static double t, z__, a0, b0, x0, y0;
+ static long int ind;
+ extern double bup_(double *, double *, double *, double *, long int *, double *);
+ static double eps;
+ static long int ierr1;
+ extern double bfrac_(double *, double *, double *, double *, double *, double *);
+ extern /* Subroutine */ int bgrat_(double *, double *, double *, double *, double *,
+ double *, long int *);
+ extern double apser_(double *, double *, double *, double *), bpser_(double *,
+ double *, double *, double *), basym_(double *, double *, double *, double *),
+ fpser_(double *, double *, double *, double *);
+ static double lambda;
+
+/* ----------------------------------------------------------------------- */
+
+/* EVALUATION OF THE INCOMPLETE BETA FUNCTION IX(A,B) */
+
+/* -------------------- */
+
+/* IT IS ASSUMED THAT A AND B ARE NONNEGATIVE, AND THAT X .LE. 1 */
+/* AND Y = 1 - X. BRATIO ASSIGNS W AND W1 THE VALUES */
+
+/* W = IX(A,B) */
+/* W1 = 1 - IX(A,B) */
+
+/* IERR IS A VARIABLE THAT REPORTS THE STATUS OF THE RESULTS. */
+/* IF NO INPUT ERRORS ARE DETECTED THEN IERR IS SET TO 0 AND */
+/* W AND W1 ARE COMPUTED. OTHERWISE, IF AN ERROR IS DETECTED, */
+/* THEN W AND W1 ARE ASSIGNED THE VALUE 0 AND IERR IS SET TO */
+/* ONE OF THE FOLLOWING VALUES ... */
+
+/* IERR = 1 IF A OR B IS NEGATIVE */
+/* IERR = 2 IF A = B = 0 */
+/* IERR = 3 IF X .LT. 0 OR X .GT. 1 */
+/* IERR = 4 IF Y .LT. 0 OR Y .GT. 1 */
+/* IERR = 5 IF X + Y .NE. 1 */
+/* IERR = 6 IF X = A = 0 */
+/* IERR = 7 IF Y = B = 0 */
+/* THE C RETURN VALUE IS ALWAYS 0; ERRORS ARE REPORTED ONLY THROUGH IERR. */
+/* -------------------- */
+/* WRITTEN BY ALFRED H. MORRIS, JR. */
+/* NAVAL SURFACE WARFARE CENTER */
+/* DAHLGREN, VIRGINIA */
+/* REVISED ... NOV 1991 */
+/* ----------------------------------------------------------------------- */
+/* ----------------------------------------------------------------------- */
+
+/* ****** EPS IS A MACHINE DEPENDENT CONSTANT. EPS IS THE SMALLEST */
+/* FLOATING POINT NUMBER FOR WHICH 1.0 + EPS .GT. 1.0 */
+
+ eps = DBL_EPSILON;
+
+/* ----------------------------------------------------------------------- */
+ *w = 0.f; /* both results start at 0, so every error return leaves W = W1 = 0 */
+ *w1 = 0.f;
+ if (*a < 0.f || *b < 0.f) {
+ goto L300; /* IERR = 1 */
+ }
+ if (*a == 0.f && *b == 0.f) {
+ goto L310; /* IERR = 2 */
+ }
+ if (*x < 0.f || *x > 1.f) {
+ goto L320; /* IERR = 3 */
+ }
+ if (*y < 0.f || *y > 1.f) {
+ goto L330; /* IERR = 4 */
+ }
+ z__ = *x + *y - .5f - .5f; /* X + Y - 1, with the 1 subtracted as two halves */
+ if (DABS(z__) > eps * 3.f) {
+ goto L340; /* IERR = 5: X + Y differs from 1 by more than 3*EPS */
+ }
+/* INPUT ACCEPTED. THE BOUNDARY CASES X = 0, Y = 0, A = 0, B = 0 ARE HANDLED FIRST. */
+ *ierr = 0;
+ if (*x == 0.f) {
+ goto L200;
+ }
+ if (*y == 0.f) {
+ goto L210;
+ }
+ if (*a == 0.f) {
+ goto L211; /* W = 1, W1 = 0 */
+ }
+ if (*b == 0.f) {
+ goto L201; /* W = 0, W1 = 1 */
+ }
+
+ eps = DMAX(eps,1e-15f); /* the working tolerance is never taken below 1e-15 */
+ if (DMAX(*a,*b) < eps * .001f) {
+ goto L230;
+ }
+/* IND = 1 RECORDS THAT (A,X) AND (B,Y) WERE SWAPPED; THE SWAP IS UNDONE AT L220. */
+ ind = 0;
+ a0 = *a;
+ b0 = *b;
+ x0 = *x;
+ y0 = *y;
+ if (DMIN(a0,b0) > 1.f) {
+ goto L30;
+ }
+
+/* PROCEDURE FOR A0 .LE. 1 OR B0 .LE. 1 */
+
+ if (*x <= .5f) {
+ goto L10;
+ }
+ ind = 1; /* X > 0.5: work on the swapped problem */
+ a0 = *b;
+ b0 = *a;
+ x0 = *y;
+ y0 = *x;
+
+L10:
+/* Computing MIN */
+ r__1 = eps, r__2 = eps * a0;
+ if (b0 < DMIN(r__1,r__2)) {
+ goto L80; /* FPSER */
+ }
+/* Computing MIN */
+ r__1 = eps, r__2 = eps * b0;
+ if (a0 < DMIN(r__1,r__2) && b0 * x0 <= 1.f) {
+ goto L90; /* APSER */
+ }
+ if (DMAX(a0,b0) > 1.f) {
+ goto L20;
+ }
+ if (a0 >= DMIN(.2f,b0)) {
+ goto L100; /* BPSER */
+ }
+ d__1 = (double) x0;
+ d__2 = (double) a0;
+ if (pow(d__1, d__2) <= .9f) {
+ goto L100; /* BPSER */
+ }
+ if (x0 >= .3f) {
+ goto L110; /* BPSER on the swapped arguments */
+ }
+ n = 20;
+ goto L130; /* BUP with N = 20, then BGRAT */
+
+L20:
+ if (b0 <= 1.f) {
+ goto L100;
+ }
+ if (x0 >= .3f) {
+ goto L110;
+ }
+ if (x0 >= .1f) {
+ goto L21;
+ }
+ d__1 = (double) (x0 * b0);
+ d__2 = (double) a0;
+ if (pow(d__1, d__2) <= .7f) {
+ goto L100;
+ }
+L21:
+ if (b0 > 15.f) {
+ goto L131; /* BGRAT alone; W1 still holds its initial 0 */
+ }
+ n = 20;
+ goto L130;
+
+/* PROCEDURE FOR A0 .GT. 1 AND B0 .GT. 1 */
+
+L30:
+ if (*a > *b) {
+ goto L31;
+ }
+ lambda = *a - (*a + *b) * *x;
+ goto L32;
+L31:
+ lambda = (*a + *b) * *y - *b;
+L32:
+ if (lambda >= 0.f) {
+ goto L40;
+ }
+ ind = 1; /* negative LAMBDA: swap the problem and continue with |LAMBDA| */
+ a0 = *b;
+ b0 = *a;
+ x0 = *y;
+ y0 = *x;
+ lambda = DABS(lambda);
+
+L40:
+ if (b0 < 40.f && b0 * x0 <= .7f) {
+ goto L100; /* BPSER */
+ }
+ if (b0 < 40.f) {
+ goto L140; /* BUP followed by BPSER or BGRAT */
+ }
+ if (a0 > b0) {
+ goto L50;
+ }
+ if (a0 <= 100.f) {
+ goto L120; /* BFRAC */
+ }
+ if (lambda > a0 * .03f) {
+ goto L120; /* BFRAC */
+ }
+ goto L180; /* BASYM */
+L50:
+ if (b0 <= 100.f) {
+ goto L120;
+ }
+ if (lambda > b0 * .03f) {
+ goto L120;
+ }
+ goto L180;
+
+/* EVALUATION OF THE APPROPRIATE ALGORITHM */
+/* EACH BRANCH COMPUTES ONE OF W, W1 AND DERIVES THE OTHER AS 0.5 - VALUE + 0.5. */
+L80:
+ *w = fpser_(&a0, &b0, &x0, &eps);
+ *w1 = .5f - *w + .5f;
+ goto L220;
+
+L90:
+ *w1 = apser_(&a0, &b0, &x0, &eps);
+ *w = .5f - *w1 + .5f;
+ goto L220;
+
+L100:
+ *w = bpser_(&a0, &b0, &x0, &eps);
+ *w1 = .5f - *w + .5f;
+ goto L220;
+
+L110:
+ *w1 = bpser_(&b0, &a0, &y0, &eps);
+ *w = .5f - *w1 + .5f;
+ goto L220;
+
+L120:
+ r__1 = eps * 15.f; /* tolerance handed to BFRAC */
+ *w = bfrac_(&a0, &b0, &x0, &y0, &lambda, &r__1);
+ *w1 = .5f - *w + .5f;
+ goto L220;
+
+L130:
+ *w1 = bup_(&b0, &a0, &y0, &x0, &n, &eps);
+ b0 += n;
+L131:
+ r__1 = eps * 15.f;
+ bgrat_(&b0, &a0, &y0, &x0, w1, &r__1, &ierr1); /* updates *w1 in place; ierr1 is never examined afterwards */
+ *w = .5f - *w1 + .5f;
+ goto L220;
+
+L140:
+ n = b0; /* integer part of B0 (conversion truncates) */
+ b0 -= n; /* fractional part of B0 */
+ if (b0 != 0.f) {
+ goto L141;
+ }
+ --n; /* B0 was a whole number: use N - 1 and a remainder of 1 instead */
+ b0 = 1.f;
+L141:
+ *w = bup_(&b0, &a0, &y0, &x0, &n, &eps);
+ if (x0 > .7f) {
+ goto L150;
+ }
+ *w += bpser_(&a0, &b0, &x0, &eps);
+ *w1 = .5f - *w + .5f;
+ goto L220;
+
+L150:
+ if (a0 > 15.f) {
+ goto L151;
+ }
+ n = 20;
+ *w += bup_(&a0, &b0, &x0, &y0, &n, &eps);
+ a0 += n;
+L151:
+ r__1 = eps * 15.f;
+ bgrat_(&a0, &b0, &x0, &y0, w, &r__1, &ierr1); /* updates *w in place; ierr1 is never examined afterwards */
+ *w1 = .5f - *w + .5f;
+ goto L220;
+
+L180:
+ r__1 = eps * 100.f; /* tolerance handed to BASYM */
+ *w = basym_(&a0, &b0, &lambda, &r__1);
+ *w1 = .5f - *w + .5f;
+ goto L220;
+
+/* TERMINATION OF THE PROCEDURE */
+
+L200: /* X = 0 */
+ if (*a == 0.f) {
+ goto L350; /* IERR = 6 */
+ }
+L201:
+ *w = 0.f;
+ *w1 = 1.f;
+ return 0;
+
+L210: /* Y = 0 */
+ if (*b == 0.f) {
+ goto L360; /* IERR = 7 */
+ }
+L211:
+ *w = 1.f;
+ *w1 = 0.f;
+ return 0;
+
+L220: /* common exit: exchange W and W1 if the problem was swapped */
+ if (ind == 0) {
+ return 0;
+ }
+ t = *w;
+ *w = *w1;
+ *w1 = t;
+ return 0;
+
+/* PROCEDURE FOR A AND B .LT. 1.E-3*EPS */
+
+L230:
+ *w = *b / (*a + *b);
+ *w1 = *a / (*a + *b);
+ return 0;
+
+/* ERROR RETURN */
+
+L300:
+ *ierr = 1;
+ return 0;
+L310:
+ *ierr = 2;
+ return 0;
+L320:
+ *ierr = 3;
+ return 0;
+L330:
+ *ierr = 4;
+ return 0;
+L340:
+ *ierr = 5;
+ return 0;
+L350:
+ *ierr = 6;
+ return 0;
+L360:
+ *ierr = 7;
+ return 0;
+} /* bratio_ */
+
+double fpser_(double *a, double *b, double *x, double *eps)
+{
+ /* System generated locals */
+ double ret_val;
+
+ /* Builtin functions */
+ double log(double), exp(double);
+
+ /* Local variables (static storage, rewritten on every call) */
+ static double c__, s, t, an, tol;
+ extern double exparg_(long int *);
+
+/* ----------------------------------------------------------------------- */
+
+/* EVALUATION OF I (A,B) */
+/* X */
+
+/* FOR B .LT. MIN(EPS,EPS*A) AND X .LE. 0.5. */
+/* THE INPUTS ARE ONLY READ; THE RESULT IS THE RETURN VALUE. */
+/* ----------------------------------------------------------------------- */
+
+/* SET FPSER = X**A */
+
+ ret_val = 1.f; /* X**A is taken to be 1 when A <= 1e-3*EPS */
+ if (*a <= *eps * .001f) {
+ goto L10;
+ }
+ ret_val = 0.f;
+ t = *a * log(*x);
+ if (t < exparg_(&c__1)) { /* exparg_ is defined elsewhere; presumably the smallest safe exp() argument -- confirm */
+ return ret_val; /* returns 0 */
+ }
+ ret_val = exp(t);
+
+/* NOTE THAT 1/B(A,B) = B */
+
+L10:
+ ret_val = *b / *a * ret_val;
+ tol = *eps / *a;
+ an = *a + 1.f;
+ t = *x;
+ s = t / an; /* first series term: X/(A+1) */
+L20: /* adds X**k/(A+k) for k = 2, 3, ... until a term is no larger than TOL */
+ an += 1.f;
+ t = *x * t;
+ c__ = t / an;
+ s += c__;
+ if (DABS(c__) > tol) {
+ goto L20;
+ }
+
+ ret_val *= *a * s + 1.f;
+ return ret_val;
+} /* fpser_ */
+
+double apser_(double *a, double *b, double *x, double *eps)
+{
+ /* Initialized data: G is Euler's constant. It is written without an f suffix so that it keeps full double precision (the suffix rounded it to float). */
+
+ static const double g = .5772156649015329;
+
+ /* System generated locals */
+ double ret_val;
+
+ /* Builtin functions */
+ double log(double);
+
+ /* Local variables (static storage, rewritten on every call) */
+ static double c__, j, s, t, aj, bx;
+ extern double psi_(double *);
+ static double tol;
+
+/* ----------------------------------------------------------------------- */
+/* APSER YIELDS THE INCOMPLETE BETA RATIO I(SUB(1-X))(B,A) FOR */
+/* A .LE. MIN(EPS,EPS*B), B*X .LE. 1, AND X .LE. 0.5. USED WHEN */
+/* A IS VERY SMALL. USE ONLY IF ABOVE INEQUALITIES ARE SATISFIED. */
+/* ----------------------------------------------------------------------- */
+/* THE INPUTS ARE ONLY READ; THE RESULT IS THE RETURN VALUE -A*(C + S). */
+/* -------------------- */
+ bx = *b * *x;
+ t = *x - bx;
+ if (*b * *eps > .02f) {
+ goto L10;
+ }
+ c__ = log(*x) + psi_(b) + g + t; /* psi_ is defined elsewhere in the package */
+ goto L20;
+L10:
+ c__ = log(bx) + g + t; /* form used when B*EPS > 0.02 */
+
+L20:
+ tol = *eps * 5.f * DABS(c__);
+ j = 1.f;
+ s = 0.f;
+L30: /* series loop: J = 2, 3, ... until a term is no larger than TOL */
+ j += 1.f;
+ t *= *x - bx / j;
+ aj = t / j;
+ s += aj;
+ if (DABS(aj) > tol) {
+ goto L30;
+ }
+
+ ret_val = -(*a) * (c__ + s);
+ return ret_val;
+} /* apser_ */
+
+double bpser_(double *a, double *b, double *x, double *eps)
+{
+ /* System generated locals */
+ long int i__1;
+ double ret_val;
+ double d__1, d__2;
+
+ /* Builtin functions */
+ double log(double), exp(double), pow(double, double);
+
+ /* Local variables (static storage, rewritten on every call) */
+ static double c__;
+ static long int i__, m;
+ static double n, t, u, w, z__, a0, b0, apb, tol, sum;
+ extern double gam1_(double *), gamln1_(double *), betaln_(double *, double *),
+ algdiv_(double *, double *);
+
+/* ----------------------------------------------------------------------- */
+/* POWER SERIES EXPANSION FOR EVALUATING IX(A,B) WHEN B .LE. 1 */
+/* OR B*X .LE. 0.7. EPS IS THE TOLERANCE USED. */
+/* ----------------------------------------------------------------------- */
+/* THE INPUTS ARE ONLY READ; THE RESULT IS THE RETURN VALUE (0 WHEN X = 0). */
+ ret_val = 0.f;
+ if (*x == 0.f) {
+ return ret_val;
+ }
+/* ----------------------------------------------------------------------- */
+/* COMPUTE THE FACTOR X**A/(A*BETA(A,B)) */
+/* ----------------------------------------------------------------------- */
+ a0 = DMIN(*a,*b);
+ if (a0 < 1.f) {
+ goto L10;
+ }
+ z__ = *a * log(*x) - betaln_(a, b); /* both A and B are >= 1 here; betaln_ is defined elsewhere */
+ ret_val = exp(z__) / *a;
+ goto L70;
+L10:
+ b0 = DMAX(*a,*b);
+ if (b0 >= 8.f) {
+ goto L60;
+ }
+ if (b0 > 1.f) {
+ goto L40;
+ }
+
+/* PROCEDURE FOR A0 .LT. 1 AND B0 .LE. 1 */
+
+ d__1 = (double) (*x);
+ d__2 = (double) (*a);
+ ret_val = pow(d__1, d__2);
+ if (ret_val == 0.f) {
+ return ret_val; /* X**A is 0, so the result is 0 */
+ }
+
+ apb = *a + *b;
+ if (apb > 1.f) {
+ goto L20;
+ }
+ z__ = gam1_(&apb) + 1.f;
+ goto L30;
+L20:
+ u = (double) (*a) + (double) (*b) - 1.;
+ z__ = (gam1_(&u) + 1.f) / apb;
+
+L30:
+ c__ = (gam1_(a) + 1.f) * (gam1_(b) + 1.f) / z__;
+ ret_val = ret_val * c__ * (*b / apb);
+ goto L70;
+
+/* PROCEDURE FOR A0 .LT. 1 AND 1 .LT. B0 .LT. 8 */
+
+L40:
+ u = gamln1_(&a0);
+ m = b0 - 1.f; /* conversion to long truncates */
+ if (m < 1) {
+ goto L50;
+ }
+ c__ = 1.f;
+ i__1 = m;
+ for (i__ = 1; i__ <= i__1; ++i__) { /* lowers B0 by 1 per pass, accumulating B0/(A0+B0) in C */
+ b0 += -1.f;
+/* L41: */
+ c__ *= b0 / (a0 + b0);
+ }
+ u = log(c__) + u;
+
+L50:
+ z__ = *a * log(*x) - u;
+ b0 += -1.f;
+ apb = a0 + b0;
+ if (apb > 1.f) {
+ goto L51;
+ }
+ t = gam1_(&apb) + 1.f;
+ goto L52;
+L51:
+ u = (double) a0 + (double) b0 - 1.;
+ t = (gam1_(&u) + 1.f) / apb;
+L52:
+ ret_val = exp(z__) * (a0 / *a) * (gam1_(&b0) + 1.f) / t;
+ goto L70;
+
+/* PROCEDURE FOR A0 .LT. 1 AND B0 .GE. 8 */
+
+L60:
+ u = gamln1_(&a0) + algdiv_(&a0, &b0);
+ z__ = *a * log(*x) - u;
+ ret_val = a0 / *a * exp(z__);
+L70:
+ if (ret_val == 0.f || *a <= *eps * .1f) {
+ return ret_val; /* the series is skipped when the factor is 0 or A <= 0.1*EPS */
+ }
+/* ----------------------------------------------------------------------- */
+/* COMPUTE THE SERIES */
+/* ----------------------------------------------------------------------- */
+ sum = 0.f;
+ n = 0.f;
+ c__ = 1.f;
+ tol = *eps / *a;
+L100: /* one term per pass; stops once |W| <= TOL */
+ n += 1.f;
+ c__ = c__ * (.5f - *b / n + .5f) * *x;
+ w = c__ / (*a + n);
+ sum += w;
+ if (DABS(w) > tol) {
+ goto L100;
+ }
+ ret_val *= *a * sum + 1.f;
+ return ret_val;
+} /* bpser_ */
+
+double bup_(double *a, double *b, double *x, double *y, long int *n, double *eps)
+{
+ /* System generated locals */
+ long int i__1;
+ double ret_val, r__1;
+
+ /* Builtin functions */
+ double exp(double);
+
+ /* Local variables (static storage, rewritten on every call) */
+ static double d__;
+ static long int i__, k;
+ static double l, r__, t, w;
+ static long int mu;
+ static double ap1;
+ static long int nm1, kp1;
+ static double apb;
+ extern double brcmp1_(long int *, double *, double *, double *, double *),
+ exparg_(long int *);
+
+/* ----------------------------------------------------------------------- */
+/* EVALUATION OF IX(A,B) - IX(A+N,B) WHERE N IS A POSITIVE INTEGER. */
+/* EPS IS THE TOLERANCE USED. */
+/* ----------------------------------------------------------------------- */
+/* THE INPUTS ARE ONLY READ; THE RESULT IS THE RETURN VALUE. */
+/* OBTAIN THE SCALING FACTOR EXP(-MU) AND */
+/* EXP(MU)*(X**A*Y**B/BETA(A,B))/A */
+
+ apb = *a + *b;
+ ap1 = *a + 1.f;
+ mu = 0;
+ d__ = 1.f;
+ if (*n == 1 || *a < 1.f) {
+ goto L10; /* no scaling: MU stays 0 and D stays 1 */
+ }
+ if (apb < ap1 * 1.1f) {
+ goto L10;
+ }
+ mu = (r__1 = exparg_(&c__1), DABS(r__1)); /* conversion to long truncates */
+ k = exparg_(&c__0);
+ if (k < mu) {
+ mu = k; /* MU is the smaller of the two exparg_ magnitudes */
+ }
+ t = (double) mu;
+ d__ = exp(-t);
+
+L10:
+ ret_val = brcmp1_(&mu, a, b, x, y) / *a;
+ if (*n == 1 || ret_val == 0.f) {
+ return ret_val;
+ }
+ nm1 = *n - 1;
+ w = d__;
+
+/* LET K BE THE INDEX OF THE MAXIMUM TERM */
+
+ k = 0;
+ if (*b <= 1.f) {
+ goto L40;
+ }
+ if (*y > 1e-4f) {
+ goto L20;
+ }
+ k = nm1;
+ goto L30;
+L20:
+ r__ = (*b - 1.f) * *x / *y - *a;
+ if (r__ < 1.f) {
+ goto L40;
+ }
+ k = nm1;
+ t = (double) nm1;
+ if (r__ < t) {
+ k = r__; /* conversion to long truncates */
+ }
+
+/* ADD THE INCREASING TERMS OF THE SERIES */
+
+L30:
+ i__1 = k;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ l = (double) (i__ - 1);
+ d__ = (apb + l) / (ap1 + l) * *x * d__;
+ w += d__;
+/* L31: */
+ }
+ if (k == nm1) {
+ goto L50;
+ }
+
+/* ADD THE REMAINING TERMS OF THE SERIES */
+
+L40:
+ kp1 = k + 1;
+ i__1 = nm1;
+ for (i__ = kp1; i__ <= i__1; ++i__) {
+ l = (double) (i__ - 1);
+ d__ = (apb + l) / (ap1 + l) * *x * d__;
+ w += d__;
+ if (d__ <= *eps * w) {
+ goto L50; /* stop early once a term is negligible relative to the sum */
+ }
+/* L41: */
+ }
+
+/* TERMINATE THE PROCEDURE */
+
+L50:
+ ret_val *= w;
+ return ret_val;
+} /* bup_ */
+
+double bfrac_(double *a, double *b, double *x, double *y, double *lambda, double *eps)
+{
+ /* System generated locals */
+ double ret_val, r__1;
+
+ /* Local variables (static storage, rewritten on every call) */
+ static double c__, e, n, p, r__, s, t, w, c0, c1, r0, an, bn, yp1, anp1,
+ bnp1, beta, alpha;
+ extern double brcomp_(double *, double *, double *, double *);
+
+/* ----------------------------------------------------------------------- */
+/* CONTINUED FRACTION EXPANSION FOR IX(A,B) WHEN A,B .GT. 1. */
+/* IT IS ASSUMED THAT LAMBDA = (A + B)*Y - B. */
+/* ----------------------------------------------------------------------- */
+/* THE RESULT IS BRCOMP(A,B,X,Y) TIMES THE CONVERGED RATIO R; 0 IF BRCOMP IS 0. */
+ ret_val = brcomp_(a, b, x, y);
+ if (ret_val == 0.f) {
+ return ret_val;
+ }
+
+ c__ = *lambda + 1.f;
+ c0 = *b / *a;
+ c1 = 1.f / *a + 1.f;
+ yp1 = *y + 1.f;
+
+ n = 0.f;
+ p = 1.f;
+ s = *a + 1.f;
+ an = 0.f;
+ bn = 1.f;
+ anp1 = 1.f;
+ bnp1 = c__ / c1;
+ r__ = c1 / c__; /* initial ratio ANP1/BNP1 */
+
+/* CONTINUED FRACTION CALCULATION */
+
+L10:
+ n += 1.f;
+ t = n / *a;
+ w = n * (*b - n) * *x;
+ e = *a / s;
+ alpha = p * (p + c0) * e * e * (w * *x);
+ e = (t + 1.f) / (c1 + t + t);
+ beta = n + w / s + e * (c__ + n * yp1);
+ p = t + 1.f;
+ s += 2.f;
+
+/* UPDATE AN, BN, ANP1, AND BNP1 */
+
+ t = alpha * an + beta * anp1;
+ an = anp1;
+ anp1 = t;
+ t = alpha * bn + beta * bnp1;
+ bn = bnp1;
+ bnp1 = t;
+
+ r0 = r__;
+ r__ = anp1 / bnp1;
+ if ((r__1 = r__ - r0, DABS(r__1)) <= *eps * r__) {
+ goto L20; /* successive ratios agree to within EPS*R */
+ }
+
+/* RESCALE AN, BN, ANP1, AND BNP1 */
+
+ an /= bnp1;
+ bn /= bnp1;
+ anp1 = r__;
+ bnp1 = 1.f;
+ goto L10;
+
+/* TERMINATION */
+
+L20:
+ ret_val *= r__;
+ return ret_val;
+} /* bfrac_ */
+
+double brcomp_(double *a, double *b, double *x, double *y)
+{
+ /* Initialized data: CONST = 1/SQRT(2*PI), written without an f suffix so that it keeps full double precision (the suffix rounded it to float). */
+
+ static const double const__ = .3989422804014327;
+
+ /* System generated locals */
+ long int i__1;
+ double ret_val, r__1;
+
+ /* Builtin functions */
+ double log(double), exp(double), sqrt(double);
+
+ /* Local variables (static storage, rewritten on every call) */
+ static double c__, e, h__;
+ static long int i__, n;
+ static double t, u, v, z__, a0, b0, x0, y0, apb, lnx, lny;
+ extern double gam1_(double *), rlog1_(double *), bcorr_(double *, double *),
+ gamln1_(double *);
+ static double lambda;
+ extern double betaln_(double *, double *), algdiv_(double *, double *),
+ alnrel_(double *);
+
+/* ----------------------------------------------------------------------- */
+/* EVALUATION OF X**A*Y**B/BETA(A,B) */
+/* ----------------------------------------------------------------------- */
+/* ----------------- */
+/* CONST = 1/SQRT(2*PI) */
+/* ----------------- */
+/* THE INPUTS ARE ONLY READ; THE RESULT IS THE RETURN VALUE (0 WHEN X = 0 OR Y = 0). */
+ ret_val = 0.f;
+ if (*x == 0.f || *y == 0.f) {
+ return ret_val;
+ }
+ a0 = DMIN(*a,*b);
+ if (a0 >= 8.f) {
+ goto L100;
+ }
+/* LNX AND LNY: THE LOGARITHM OF WHICHEVER OF X, Y IS .LE. 0.375 IS TAKEN DIRECTLY, THE OTHER VIA ALNREL. */
+ if (*x > .375f) {
+ goto L10;
+ }
+ lnx = log(*x);
+ r__1 = -(*x);
+ lny = alnrel_(&r__1); /* alnrel_ is defined elsewhere; presumably ln(1 + argument) -- confirm */
+ goto L20;
+L10:
+ if (*y > .375f) {
+ goto L11;
+ }
+ r__1 = -(*y);
+ lnx = alnrel_(&r__1);
+ lny = log(*y);
+ goto L20;
+L11:
+ lnx = log(*x);
+ lny = log(*y);
+
+L20:
+ z__ = *a * lnx + *b * lny;
+ if (a0 < 1.f) {
+ goto L30;
+ }
+ z__ -= betaln_(a, b);
+ ret_val = exp(z__);
+ return ret_val;
+/* ----------------------------------------------------------------------- */
+/* PROCEDURE FOR A .LT. 1 OR B .LT. 1 */
+/* ----------------------------------------------------------------------- */
+L30:
+ b0 = DMAX(*a,*b);
+ if (b0 >= 8.f) {
+ goto L80;
+ }
+ if (b0 > 1.f) {
+ goto L60;
+ }
+
+/* ALGORITHM FOR B0 .LE. 1 */
+
+ ret_val = exp(z__);
+ if (ret_val == 0.f) {
+ return ret_val;
+ }
+
+ apb = *a + *b;
+ if (apb > 1.f) {
+ goto L40;
+ }
+ z__ = gam1_(&apb) + 1.f;
+ goto L50;
+L40:
+ u = (double) (*a) + (double) (*b) - 1.;
+ z__ = (gam1_(&u) + 1.f) / apb;
+
+L50:
+ c__ = (gam1_(a) + 1.f) * (gam1_(b) + 1.f) / z__;
+ ret_val = ret_val * (a0 * c__) / (a0 / b0 + 1.f);
+ return ret_val;
+
+/* ALGORITHM FOR 1 .LT. B0 .LT. 8 */
+
+L60:
+ u = gamln1_(&a0);
+ n = b0 - 1.f; /* conversion to long truncates */
+ if (n < 1) {
+ goto L70;
+ }
+ c__ = 1.f;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) { /* lowers B0 by 1 per pass, accumulating B0/(A0+B0) in C */
+ b0 += -1.f;
+ c__ *= b0 / (a0 + b0);
+/* L61: */
+ }
+ u = log(c__) + u;
+
+L70:
+ z__ -= u;
+ b0 += -1.f;
+ apb = a0 + b0;
+ if (apb > 1.f) {
+ goto L71;
+ }
+ t = gam1_(&apb) + 1.f;
+ goto L72;
+L71:
+ u = (double) a0 + (double) b0 - 1.;
+ t = (gam1_(&u) + 1.f) / apb;
+L72:
+ ret_val = a0 * exp(z__) * (gam1_(&b0) + 1.f) / t;
+ return ret_val;
+
+/* ALGORITHM FOR B0 .GE. 8 */
+
+L80:
+ u = gamln1_(&a0) + algdiv_(&a0, &b0);
+ ret_val = a0 * exp(z__ - u);
+ return ret_val;
+/* ----------------------------------------------------------------------- */
+/* PROCEDURE FOR A .GE. 8 AND B .GE. 8 */
+/* ----------------------------------------------------------------------- */
+L100:
+ if (*a > *b) {
+ goto L101;
+ }
+ h__ = *a / *b;
+ x0 = h__ / (h__ + 1.f);
+ y0 = 1.f / (h__ + 1.f);
+ lambda = *a - (*a + *b) * *x;
+ goto L110;
+L101:
+ h__ = *b / *a;
+ x0 = 1.f / (h__ + 1.f);
+ y0 = h__ / (h__ + 1.f);
+ lambda = (*a + *b) * *y - *b;
+
+L110:
+ e = -lambda / *a;
+ if (DABS(e) > .6f) {
+ goto L111;
+ }
+ u = rlog1_(&e); /* used while |E| <= 0.6; rlog1_ is defined elsewhere */
+ goto L120;
+L111:
+ u = e - log(*x / x0);
+
+L120:
+ e = lambda / *b;
+ if (DABS(e) > .6f) {
+ goto L121;
+ }
+ v = rlog1_(&e);
+ goto L130;
+L121:
+ v = e - log(*y / y0);
+
+L130:
+ z__ = exp(-(*a * u + *b * v));
+ ret_val = const__ * sqrt(*b * x0) * z__ * exp(-bcorr_(a, b)); /* the only use of CONST: its precision bounds this branch's accuracy */
+ return ret_val;
+} /* brcomp_ */
+
+double brcmp1_(long int *mu, double *a, double *b, double *x, double *y)
+{
+ /* Initialized data: 1/sqrt(2*pi), used only on the a >= 8 and b >= 8 path (L130) */
+
+ static double const__ = .398942280401433f; /* NOTE(review): the 'f' suffix makes this a float literal, so the double holds only float precision */
+
+ /* System generated locals */
+ long int i__1;
+ double ret_val, r__1;
+
+ /* Builtin functions */
+ double log(double), sqrt(double), exp(double);
+
+ /* Local variables (static storage: shared between calls, so concurrent calls would interfere) */
+ static double c__, e, h__;
+ static long int i__, n;
+ static double t, u, v, z__, a0, b0, x0, y0, apb, lnx, lny;
+ extern double gam1_(double *), esum_(long int *, double *), rlog1_(double *),
+ bcorr_(double *, double *), gamln1_(double *);
+ static double lambda;
+ extern double betaln_(double *, double *), algdiv_(double *, double *),
+ alnrel_(double *);
+
+/* ----------------------------------------------------------------------- */
+/* EVALUATION OF EXP(MU) * (X**A*Y**B/BETA(A,B)) */
+/* The exp(mu) factor is applied through esum_(); ln(y) may be taken as alnrel_(-x), so the code relies on y = 1 - x. */
+/* ----------------- */
+/* CONST = 1/SQRT(2*PI) */
+/* ----------------- */
+
+ a0 = DMIN(*a,*b); /* smaller of a and b; DMIN is a macro defined outside this block */
+ if (a0 >= 8.f) {
+ goto L100;
+ }
+
+ if (*x > .375f) {
+ goto L10;
+ }
+ lnx = log(*x);
+ r__1 = -(*x);
+ lny = alnrel_(&r__1); /* x <= .375: ln(y) obtained as ln(1 - x) */
+ goto L20;
+L10:
+ if (*y > .375f) {
+ goto L11;
+ }
+ r__1 = -(*y);
+ lnx = alnrel_(&r__1); /* y <= .375: ln(x) obtained as ln(1 - y) */
+ lny = log(*y);
+ goto L20;
+L11:
+ lnx = log(*x);
+ lny = log(*y);
+
+L20:
+ z__ = *a * lnx + *b * lny; /* z = a*ln(x) + b*ln(y) */
+ if (a0 < 1.f) {
+ goto L30;
+ }
+ z__ -= betaln_(a, b);
+ ret_val = esum_(mu, &z__); /* exp(mu + z) */
+ return ret_val;
+/* ----------------------------------------------------------------------- */
+/* PROCEDURE FOR A .LT. 1 OR B .LT. 1 */
+/* ----------------------------------------------------------------------- */
+L30:
+ b0 = DMAX(*a,*b); /* larger of a and b; DMAX is a macro defined outside this block */
+ if (b0 >= 8.f) {
+ goto L80;
+ }
+ if (b0 > 1.f) {
+ goto L60;
+ }
+
+/* ALGORITHM FOR B0 .LE. 1 */
+
+ ret_val = esum_(mu, &z__);
+ if (ret_val == 0.f) {
+ return ret_val;
+ }
+
+ apb = *a + *b;
+ if (apb > 1.f) {
+ goto L40;
+ }
+ z__ = gam1_(&apb) + 1.f;
+ goto L50;
+L40:
+ u = (double) (*a) + (double) (*b) - 1.;
+ z__ = (gam1_(&u) + 1.f) / apb;
+
+L50:
+ c__ = (gam1_(a) + 1.f) * (gam1_(b) + 1.f) / z__;
+ ret_val = ret_val * (a0 * c__) / (a0 / b0 + 1.f);
+ return ret_val;
+
+/* ALGORITHM FOR 1 .LT. B0 .LT. 8 */
+
+L60:
+ u = gamln1_(&a0);
+ n = b0 - 1.f; /* conversion from double to long truncates */
+ if (n < 1) {
+ goto L70;
+ }
+ c__ = 1.f;
+ i__1 = n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ b0 += -1.f;
+ c__ *= b0 / (a0 + b0);
+/* L61: */
+ }
+ u = log(c__) + u;
+
+L70:
+ z__ -= u;
+ b0 += -1.f; /* b0 is decremented once more before being passed to gam1_ below */
+ apb = a0 + b0;
+ if (apb > 1.f) {
+ goto L71;
+ }
+ t = gam1_(&apb) + 1.f;
+ goto L72;
+L71:
+ u = (double) a0 + (double) b0 - 1.;
+ t = (gam1_(&u) + 1.f) / apb;
+L72:
+ ret_val = a0 * esum_(mu, &z__) * (gam1_(&b0) + 1.f) / t;
+ return ret_val;
+
+/* ALGORITHM FOR B0 .GE. 8 */
+
+L80:
+ u = gamln1_(&a0) + algdiv_(&a0, &b0);
+ r__1 = z__ - u;
+ ret_val = a0 * esum_(mu, &r__1);
+ return ret_val;
+/* ----------------------------------------------------------------------- */
+/* PROCEDURE FOR A .GE. 8 AND B .GE. 8 */
+/* ----------------------------------------------------------------------- */
+L100:
+ if (*a > *b) {
+ goto L101;
+ }
+ h__ = *a / *b;
+ x0 = h__ / (h__ + 1.f);
+ y0 = 1.f / (h__ + 1.f);
+ lambda = *a - (*a + *b) * *x;
+ goto L110;
+L101:
+ h__ = *b / *a;
+ x0 = 1.f / (h__ + 1.f);
+ y0 = h__ / (h__ + 1.f);
+ lambda = (*a + *b) * *y - *b;
+
+L110:
+ e = -lambda / *a;
+ if (DABS(e) > .6f) {
+ goto L111;
+ }
+ u = rlog1_(&e); /* |e| <= .6: e - ln(1 + e) via rlog1_ */
+ goto L120;
+L111:
+ u = e - log(*x / x0);
+
+L120:
+ e = lambda / *b;
+ if (DABS(e) > .6f) {
+ goto L121;
+ }
+ v = rlog1_(&e); /* |e| <= .6: e - ln(1 + e) via rlog1_ */
+ goto L130;
+L121:
+ v = e - log(*y / y0);
+
+L130:
+ r__1 = -(*a * u + *b * v);
+ z__ = esum_(mu, &r__1); /* exp(mu - (a*u + b*v)) */
+ ret_val = const__ * sqrt(*b * x0) * z__ * exp(-bcorr_(a, b));
+ return ret_val;
+} /* brcmp1_ */
+
+/* Subroutine */ int bgrat_(double *a, double *b, double *x, double *y, double *w, double
+ *eps, long int *ierr)
+{
+ /* System generated locals */
+ long int i__1;
+ double r__1;
+
+ /* Builtin functions */
+ double log(double), exp(double);
+
+ /* Local variables (static storage: shared between calls, so concurrent calls would interfere) */
+ static double c__[30], d__[30]; /* one slot per term of the 30-iteration loop below */
+ static long int i__;
+ static double j, l;
+ static long int n;
+ static double p, q, r__, s, t, u, v, z__, n2, t2, dj, cn, nu, bm1;
+ static long int nm1;
+ static double lnx, sum;
+ extern double gam1_(double *);
+ static double bp2n, coef;
+ extern /* Subroutine */ int grat1_(double *, double *, double *, double *, double *,
+ double *);
+ extern double algdiv_(double *, double *), alnrel_(double *);
+
+/* ----------------------------------------------------------------------- */
+/* ASYMPTOTIC EXPANSION FOR IX(A,B) WHEN A IS LARGER THAN B. */
+/* THE RESULT OF THE EXPANSION IS ADDED TO W. IT IS ASSUMED */
+/* THAT A .GE. 15 AND B .LE. 1. EPS IS THE TOLERANCE USED. */
+/* IERR IS A VARIABLE THAT REPORTS THE STATUS OF THE RESULTS. */
+/* The function always returns 0; *ierr is set to 0 on success and to 1 when the expansion cannot be computed (*w is then left unchanged). */
+
+ bm1 = *b - .5f - .5f; /* b - 1 */
+ nu = *a + bm1 * .5f; /* a + (b - 1)/2 */
+ if (*y > .375f) {
+ goto L10;
+ }
+ r__1 = -(*y);
+ lnx = alnrel_(&r__1); /* y <= .375: ln(x) obtained as ln(1 - y), so the code relies on x = 1 - y */
+ goto L11;
+L10:
+ lnx = log(*x);
+L11:
+ z__ = -nu * lnx;
+ if (*b * z__ == 0.f) {
+ goto L100;
+ }
+
+/* COMPUTATION OF THE EXPANSION */
+/* SET R = EXP(-Z)*Z**B/GAMMA(B) */
+
+ r__ = *b * (gam1_(b) + 1.f) * exp(*b * log(z__));
+ r__ = r__ * exp(*a * lnx) * exp(bm1 * .5f * lnx);
+ u = algdiv_(b, a) + *b * log(nu);
+ u = r__ * exp(-u);
+ if (u == 0.f) {
+ goto L100;
+ }
+ grat1_(b, &z__, &r__, &p, &q, eps); /* fills p and q; only q is used below */
+
+/* Computing 2nd power */
+ r__1 = 1.f / nu;
+ v = r__1 * r__1 * .25f;
+ t2 = lnx * .25f * lnx;
+ l = *w / u;
+ j = q / r__;
+ sum = j;
+ t = 1.f;
+ cn = 1.f;
+ n2 = 0.f;
+ for (n = 1; n <= 30; ++n) {
+ bp2n = *b + n2;
+ j = (bp2n * (bp2n + 1.f) * j + (z__ + bp2n + 1.f) * t) * v;
+ n2 += 2.f;
+ t *= t2;
+ cn /= n2 * (n2 + 1.f);
+ c__[n - 1] = cn;
+ s = 0.f;
+ if (n == 1) {
+ goto L21;
+ }
+ nm1 = n - 1;
+ coef = *b - n;
+ i__1 = nm1;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ s += coef * c__[i__ - 1] * d__[n - i__ - 1];
+/* L20: */
+ coef += *b;
+ }
+L21:
+ d__[n - 1] = bm1 * cn + s / n;
+ dj = d__[n - 1] * j;
+ sum += dj;
+ if (sum <= 0.f) {
+ goto L100;
+ }
+ if (DABS(dj) <= *eps * (sum + l)) { /* latest term is small relative to the running total: stop */
+ goto L30;
+ }
+/* L22: */
+ }
+
+/* ADD THE RESULTS TO W (also reached by falling out of the loop after 30 terms without meeting the tolerance) */
+
+L30:
+ *ierr = 0;
+ *w += u * sum;
+ return 0;
+
+/* THE EXPANSION CANNOT BE COMPUTED */
+
+L100:
+ *ierr = 1;
+ return 0;
+} /* bgrat_ */
+
+/* Subroutine */ int grat1_(double *a, double *x, double *r__, double *p, double *q,
+ double *eps)
+{
+ /* System generated locals */
+ double r__1;
+
+ /* Builtin functions */
+ double log(double), exp(double), sqrt(double);
+
+ /* Local variables (static storage: shared between calls, so concurrent calls would interfere) */
+ static double c__, g, h__, j, l, t, w, z__, an, am0, an0, a2n, b2n, cma;
+ extern double erf_(double *);
+ static double tol, sum;
+ extern double gam1_(double *);
+ static double a2nm1, b2nm1;
+ extern double rexp_(double *), erfc1_(long int *, double *);
+
+/* ----------------------------------------------------------------------- */
+/* EVALUATION OF THE INCOMPLETE GAMMA RATIO FUNCTIONS */
+/* P(A,X) AND Q(A,X) */
+
+/* IT IS ASSUMED THAT A .LE. 1. EPS IS THE TOLERANCE TO BE USED. */
+/* THE INPUT ARGUMENT R HAS THE VALUE E**(-X)*X**A/GAMMA(A). */
+/* The results are stored through p and q; the function itself always returns 0. */
+ if (*a * *x == 0.f) { /* a == 0 or x == 0: limiting values are chosen at L130 */
+ goto L130;
+ }
+ if (*a == .5f) { /* a == 1/2: handled through erf_/erfc1_ of sqrt(x) at L120 */
+ goto L120;
+ }
+ if (*x < 1.1f) {
+ goto L10;
+ }
+ goto L50;
+
+/* TAYLOR SERIES FOR P(A,X)/X**A */
+
+L10:
+ an = 3.f;
+ c__ = *x;
+ sum = *x / (*a + 3.f);
+ tol = *eps * .1f / (*a + 1.f);
+L11:
+ an += 1.f;
+ c__ = -c__ * (*x / an);
+ t = c__ / (*a + an);
+ sum += t;
+ if (DABS(t) > tol) { /* keep adding terms until the last one is within tol */
+ goto L11;
+ }
+ j = *a * *x * ((sum / 6.f - .5f / (*a + 2.f)) * *x + 1.f / (*a + 1.f));
+
+ z__ = *a * log(*x);
+ h__ = gam1_(a);
+ g = h__ + 1.f;
+ if (*x < .25f) {
+ goto L20;
+ }
+ if (*a < *x / 2.59f) {
+ goto L40;
+ }
+ goto L30;
+L20:
+ if (z__ > -.13394f) {
+ goto L40;
+ }
+
+L30:
+ w = exp(z__);
+ *p = w * g * (.5f - j + .5f); /* p is computed first, q = 1 - p */
+ *q = .5f - *p + .5f;
+ return 0;
+
+L40:
+ l = rexp_(&z__);
+ w = l + .5f + .5f;
+ *q = (w * j - l) * g - h__; /* q is computed first, p = 1 - q */
+ if (*q < 0.f) { /* a negative q is replaced by p = 1, q = 0 at L110 */
+ goto L110;
+ }
+ *p = .5f - *q + .5f;
+ return 0;
+
+/* CONTINUED FRACTION EXPANSION */
+
+L50:
+ a2nm1 = 1.f;
+ a2n = 1.f;
+ b2nm1 = *x;
+ b2n = *x + (1.f - *a);
+ c__ = 1.f;
+L51:
+ a2nm1 = *x * a2n + c__ * a2nm1;
+ b2nm1 = *x * b2n + c__ * b2nm1;
+ am0 = a2nm1 / b2nm1;
+ c__ += 1.f;
+ cma = c__ - *a;
+ a2n = a2nm1 + cma * a2n;
+ b2n = b2nm1 + cma * b2n;
+ an0 = a2n / b2n;
+ if ((r__1 = an0 - am0, DABS(r__1)) >= *eps * an0) { /* repeat until successive convergents agree to within eps * an0 */
+ goto L51;
+ }
+ *q = *r__ * an0;
+ *p = .5f - *q + .5f;
+ return 0;
+
+/* SPECIAL CASES */
+
+L100:
+ *p = 0.f;
+ *q = 1.f;
+ return 0;
+
+L110:
+ *p = 1.f;
+ *q = 0.f;
+ return 0;
+
+L120:
+ if (*x >= .25f) {
+ goto L121;
+ }
+ r__1 = sqrt(*x);
+ *p = erf_(&r__1);
+ *q = .5f - *p + .5f;
+ return 0;
+L121:
+ r__1 = sqrt(*x);
+ *q = erfc1_(&c__0, &r__1); /* c__0 is defined outside this block (presumably the integer 0, selecting plain erfc -- confirm) */
+ *p = .5f - *q + .5f;
+ return 0;
+
+L130:
+ if (*x <= *a) {
+ goto L100;
+ }
+ goto L110;
+} /* grat1_ */
+
+double basym_(double *a, double *b, double *lambda, double *eps)
+{
+ /* Initialized data. NOTE(review): the 'f' suffix makes e0 and e1 float literals, so the doubles carry only float precision */
+
+ static long int num = 20;
+ static double e0 = 1.12837916709551f;
+ static double e1 = .353553390593274f;
+
+ /* System generated locals */
+ long int i__1, i__2, i__3, i__4;
+ double ret_val, r__1, r__2;
+
+ /* Builtin functions */
+ double sqrt(double), exp(double);
+
+ /* Local variables (static storage: shared between calls, so concurrent calls would interfere) */
+ static double c__[21], d__[21], f, h__;
+ static long int i__, j, m, n;
+ static double r__, s, t, u, w, z__, a0[21], b0[21], j0, j1, h2, r0, r1, t0,
+ t1, w0, z0, z2, hn, zn;
+ static long int im1, mm1, np1, imj, mmj;
+ static double sum, znm1, bsum, dsum;
+ extern double erfc1_(long int *, double *), rlog1_(double *), bcorr_(double *
+ , double *);
+
+/* ----------------------------------------------------------------------- */
+/* ASYMPTOTIC EXPANSION FOR IX(A,B) FOR LARGE A AND B. */
+/* LAMBDA = (A + B)*Y - B AND EPS IS THE TOLERANCE USED. */
+/* IT IS ASSUMED THAT LAMBDA IS NONNEGATIVE AND THAT */
+/* A AND B ARE GREATER THAN OR EQUAL TO 15. */
+/* ----------------------------------------------------------------------- */
+/* ------------------------ */
+/* ****** NUM IS THE MAXIMUM VALUE THAT N CAN TAKE IN THE DO LOOP */
+/* ENDING AT STATEMENT 50. IT IS REQUIRED THAT NUM BE EVEN. */
+/* THE ARRAYS A0, B0, C, D HAVE DIMENSION NUM + 1. */
+
+/* ------------------------ */
+/* E0 = 2/SQRT(PI) */
+/* E1 = 2**(-3/2) */
+/* ------------------------ */
+/* The return value stays 0 when exp(-f) evaluates to 0 (see the early return below). */
+ ret_val = 0.f;
+ if (*a >= *b) {
+ goto L10;
+ }
+ h__ = *a / *b;
+ r0 = 1.f / (h__ + 1.f);
+ r1 = (*b - *a) / *b;
+ w0 = 1.f / sqrt(*a * (h__ + 1.f));
+ goto L20;
+L10:
+ h__ = *b / *a;
+ r0 = 1.f / (h__ + 1.f);
+ r1 = (*b - *a) / *a;
+ w0 = 1.f / sqrt(*b * (h__ + 1.f));
+
+L20:
+ r__1 = -(*lambda) / *a;
+ r__2 = *lambda / *b;
+ f = *a * rlog1_(&r__1) + *b * rlog1_(&r__2);
+ t = exp(-f);
+ if (t == 0.f) {
+ return ret_val;
+ }
+ z0 = sqrt(f);
+ z__ = z0 / e1 * .5f;
+ z2 = f + f;
+
+ a0[0] = r1 * .66666666666666663f;
+ c__[0] = a0[0] * -.5f;
+ d__[0] = -c__[0];
+ j0 = .5f / e0 * erfc1_(&c__1, &z0); /* c__1 is defined outside this block (presumably the integer 1, selecting exp(x*x)*erfc(x) -- confirm) */
+ j1 = e1;
+ sum = j0 + d__[0] * w0 * j1;
+
+ s = 1.f;
+ h2 = h__ * h__;
+ hn = 1.f;
+ w = w0;
+ znm1 = z__;
+ zn = z2;
+ i__1 = num;
+ for (n = 2; n <= i__1; n += 2) { /* two coefficients (indices n and n + 1) are produced per pass */
+ hn = h2 * hn;
+ a0[n - 1] = r0 * 2.f * (h__ * hn + 1.f) / (n + 2.f);
+ np1 = n + 1;
+ s += hn;
+ a0[np1 - 1] = r1 * 2.f * s / (n + 3.f);
+
+ i__2 = np1;
+ for (i__ = n; i__ <= i__2; ++i__) {
+ r__ = (i__ + 1.f) * -.5f;
+ b0[0] = r__ * a0[0];
+ i__3 = i__;
+ for (m = 2; m <= i__3; ++m) {
+ bsum = 0.f;
+ mm1 = m - 1;
+ i__4 = mm1;
+ for (j = 1; j <= i__4; ++j) {
+ mmj = m - j;
+/* L30: */
+ bsum += (j * r__ - mmj) * a0[j - 1] * b0[mmj - 1];
+ }
+/* L31: */
+ b0[m - 1] = r__ * a0[m - 1] + bsum / m;
+ }
+ c__[i__ - 1] = b0[i__ - 1] / (i__ + 1.f);
+
+ dsum = 0.f;
+ im1 = i__ - 1;
+ i__3 = im1;
+ for (j = 1; j <= i__3; ++j) {
+ imj = i__ - j;
+/* L40: */
+ dsum += d__[imj - 1] * c__[j - 1];
+ }
+/* L41: */
+ d__[i__ - 1] = -(dsum + c__[i__ - 1]);
+ }
+
+ j0 = e1 * znm1 + (n - 1.f) * j0;
+ j1 = e1 * zn + n * j1;
+ znm1 = z2 * znm1;
+ zn = z2 * zn;
+ w = w0 * w;
+ t0 = d__[n - 1] * w * j0;
+ w = w0 * w;
+ t1 = d__[np1 - 1] * w * j1;
+ sum += t0 + t1;
+ if (DABS(t0) + DABS(t1) <= *eps * sum) { /* the two newest terms are within tolerance: stop early */
+ goto L60;
+ }
+/* L50: */
+ }
+
+L60:
+ u = exp(-bcorr_(a, b));
+ ret_val = e0 * t * u * sum;
+ return ret_val;
+} /* basym_ */
+
+
+double exparg_(long int *l)
+{
+ /* Returns m * lnb * .99999, with m = DBL_MAX_EXP when *l == 0 and m = DBL_MIN_EXP - 1 otherwise (presumably the bounds on w for which exp(w) is representable -- confirm). */
+ double lnb = .69314718055995f; /* approximately ln(2). NOTE(review): the 'f' suffix makes this a float literal, so the double holds only float precision */
+ int m;
+ if(*l==0) {
+ m = DBL_MAX_EXP; /* macro not defined in this block (a <float.h> name; the include is presumably elsewhere in the file) */
+ return m * lnb * .99999;
+ }
+ m = DBL_MIN_EXP -1;
+ return m * lnb * .99999;
+} /* exparg_ */
+
+double esum_(long int *mu, double *x)
+{
+ /* System generated locals */
+ double ret_val;
+
+ /* Builtin functions */
+ double exp(double);
+
+ /* Local variables (static storage: shared between calls, so concurrent calls would interfere) */
+ static double w;
+
+/* ----------------------------------------------------------------------- */
+/* EVALUATION OF EXP(MU + X) */
+/* exp(mu + x) is used when mu and x have opposite signs and the sum keeps the sign of x (or is 0); otherwise exp(mu) * exp(x) is used. */
+ if (*x > 0.f) {
+ goto L10;
+ }
+
+ if (*mu < 0) { /* x <= 0 and mu < 0 */
+ goto L20;
+ }
+ w = *mu + *x;
+ if (w > 0.f) {
+ goto L20;
+ }
+ ret_val = exp(w);
+ return ret_val;
+
+L10:
+ if (*mu > 0) { /* x > 0 and mu > 0 */
+ goto L20;
+ }
+ w = *mu + *x;
+ if (w < 0.f) {
+ goto L20;
+ }
+ ret_val = exp(w);
+ return ret_val;
+
+L20:
+ w = (double) (*mu);
+ ret_val = exp(w) * exp(*x); /* the two exponentials are evaluated separately and multiplied */
+ return ret_val;
+} /* esum_ */
+
+double rexp_(double *x)
+{
+ /* Initialized data. NOTE(review): the 'f' suffix makes each literal a float constant, so these doubles carry only float precision */
+
+ static double p1 = 9.14041914819518e-10f;
+ static double p2 = .0238082361044469f;
+ static double q1 = -.499999999085958f;
+ static double q2 = .107141568980644f;
+ static double q3 = -.0119041179760821f;
+ static double q4 = 5.95130811860248e-4f;
+
+ /* System generated locals */
+ double ret_val;
+
+ /* Builtin functions */
+ double exp(double);
+
+ /* Local variables (static storage: shared between calls) */
+ static double w;
+
+/* ----------------------------------------------------------------------- */
+/* EVALUATION OF THE FUNCTION EXP(X) - 1 */
+/* ----------------------------------------------------------------------- */
+/* |x| <= .15: rational approximation in x; otherwise the result is formed from exp(x) */
+ if (DABS(*x) > .15f) {
+ goto L10;
+ }
+ ret_val = *x * (((p2 * *x + p1) * *x + 1.f) / ((((q4 * *x + q3) * *x + q2)
+ * *x + q1) * *x + 1.f));
+ return ret_val;
+
+L10:
+ w = exp(*x);
+ if (*x > 0.f) {
+ goto L20;
+ }
+ ret_val = w - .5f - .5f; /* x < -.15: exp(x) - 1, subtracted in two halves */
+ return ret_val;
+L20:
+ ret_val = w * (.5f - 1.f / w + .5f); /* x > .15: exp(x) * (1 - 1/exp(x)) */
+ return ret_val;
+} /* rexp_ */
+
+double alnrel_(double *a)
+{
+ /* Initialized data. NOTE(review): the 'f' suffix makes each literal a float constant, so these doubles carry only float precision */
+
+ static double p1 = -1.29418923021993f;
+ static double p2 = .405303492862024f;
+ static double p3 = -.0178874546012214f;
+ static double q1 = -1.62752256355323f;
+ static double q2 = .747811014037616f;
+ static double q3 = -.0845104217945565f;
+
+ /* System generated locals */
+ double ret_val;
+
+ /* Builtin functions */
+ double log(double);
+
+ /* Local variables (static storage: shared between calls) */
+ static double t, w, x, t2;
+
+/* ----------------------------------------------------------------------- */
+/* EVALUATION OF THE FUNCTION LN(1 + A) */
+/* ----------------------------------------------------------------------- */
+/* |a| <= .375: 2*t*w with t = a/(a + 2) and w a rational function of t*t; otherwise log(1 + a) directly */
+ if (DABS(*a) > .375f) {
+ goto L10;
+ }
+ t = *a / (*a + 2.f);
+ t2 = t * t;
+ w = (((p3 * t2 + p2) * t2 + p1) * t2 + 1.f) / (((q3 * t2 + q2) * t2 + q1)
+ * t2 + 1.f);
+ ret_val = t * 2.f * w;
+ return ret_val;
+
+L10:
+ x = (double) (*a) + 1.;
+ ret_val = log(x);
+ return ret_val;
+} /* alnrel_ */
+
+double rlog1_(double *x)
+{
+ /* Initialized data. NOTE(review): the 'f' suffix makes each literal a float constant, so these doubles carry only float precision */
+
+ static double a = .0566749439387324f;
+ static double b = .0456512608815524f;
+ static double p0 = .333333333333333f;
+ static double p1 = -.224696413112536f;
+ static double p2 = .00620886815375787f;
+ static double q1 = -1.27408923933623f;
+ static double q2 = .354508718369557f;
+
+ /* System generated locals */
+ double ret_val;
+
+ /* Builtin functions */
+ double log(double);
+
+ /* Local variables (static storage: shared between calls) */
+ static double h__, r__, t, w, w1;
+
+/* ----------------------------------------------------------------------- */
+/* EVALUATION OF THE FUNCTION X - LN(1 + X) */
+/* ----------------------------------------------------------------------- */
+/* x outside [-.39, .57]: computed directly as x - log(1 + x) at L100 */
+/* otherwise a reduced argument h and offset w1 are chosen by range (x < -.18, x > .18, or in between) for the series at L30 */
+ if (*x < -.39f || *x > .57f) {
+ goto L100;
+ }
+ if (*x < -.18f) {
+ goto L10;
+ }
+ if (*x > .18f) {
+ goto L20;
+ }
+
+/* ARGUMENT REDUCTION */
+
+ h__ = *x;
+ w1 = 0.f;
+ goto L30;
+
+L10:
+ h__ = (double) (*x) + .3;
+ h__ /= .7f; /* NOTE(review): mixes the double constant .3 above with float constants .7f/.3f here */
+ w1 = a - h__ * .3f;
+ goto L30;
+
+L20:
+ h__ = (double) (*x) * .75 - .25;
+ w1 = b + h__ / 3.f;
+
+/* SERIES EXPANSION */
+
+L30:
+ r__ = h__ / (h__ + 2.f);
+ t = r__ * r__;
+ w = ((p2 * t + p1) * t + p0) / ((q2 * t + q1) * t + 1.f);
+ ret_val = t * 2.f * (1.f / (1.f - r__) - r__ * w) + w1;
+ return ret_val;
+
+
+L100:
+ w = *x + .5f + .5f; /* 1 + x, added in two halves */
+ ret_val = *x - log(w);
+ return ret_val;
+} /* rlog1_ */
+
+double erf_(double *x)
+{
+ /* Initialized data. NOTE(review): the 'f' suffix makes each literal a float constant, so these doubles carry only float precision */
+
+ static double c__ = .564189583547756f;
+ static double a[5] = { 7.7105849500132e-5f,-.00133733772997339f,
+ .0323076579225834f,.0479137145607681f,.128379167095513f };
+ static double b[3] = { .00301048631703895f,.0538971687740286f,
+ .375795757275549f };
+ static double p[8] = { -1.36864857382717e-7f,.564195517478974f,
+ 7.21175825088309f,43.1622272220567f,152.98928504694f,
+ 339.320816734344f,451.918953711873f,300.459261020162f };
+ static double q[8] = { 1.f,12.7827273196294f,77.0001529352295f,
+ 277.585444743988f,638.980264465631f,931.35409485061f,
+ 790.950925327898f,300.459260956983f };
+ static double r__[5] = { 2.10144126479064f,26.2370141675169f,
+ 21.3688200555087f,4.6580782871847f,.282094791773523f };
+ static double s[4] = { 94.153775055546f,187.11481179959f,99.0191814623914f,
+ 18.0124575948747f };
+
+ /* System generated locals */
+ double ret_val;
+
+ /* Builtin functions */
+ double exp(double), r_sign(double *, double *);
+
+ /* Local variables (static storage: shared between calls) */
+ static double t, x2, ax, bot, top;
+
+/* ----------------------------------------------------------------------- */
+/* EVALUATION OF THE REAL ERROR FUNCTION */
+/* ----------------------------------------------------------------------- */
+/* |x| <= .5: x * top/bot with polynomials in t = x*x built from a[] and b[] */
+/* .5 < |x| <= 4: 1 - exp(-x*x) * top/bot with polynomials in |x| built from p[] and q[]; negated for x < 0 */
+/* 4 < |x| < 5.8: 1 - exp(-x*x) * (c - top/(x*x*bot))/|x| with polynomials in 1/(x*x) built from r[] and s[]; negated for x < 0 */
+/* |x| >= 5.8: r_sign(&c_b188, x); c_b188 is defined outside this block (presumably 1.0, giving +/-1 -- confirm) */
+/* ------------------------- */
+ ax = DABS(*x);
+ if (ax > .5f) {
+ goto L10;
+ }
+ t = *x * *x;
+ top = (((a[0] * t + a[1]) * t + a[2]) * t + a[3]) * t + a[4] + 1.f;
+ bot = ((b[0] * t + b[1]) * t + b[2]) * t + 1.f;
+ ret_val = *x * (top / bot);
+ return ret_val;
+
+L10:
+ if (ax > 4.f) {
+ goto L20;
+ }
+ top = ((((((p[0] * ax + p[1]) * ax + p[2]) * ax + p[3]) * ax + p[4]) * ax
+ + p[5]) * ax + p[6]) * ax + p[7];
+ bot = ((((((q[0] * ax + q[1]) * ax + q[2]) * ax + q[3]) * ax + q[4]) * ax
+ + q[5]) * ax + q[6]) * ax + q[7];
+ ret_val = .5f - exp(-(*x) * *x) * top / bot + .5f;
+ if (*x < 0.f) {
+ ret_val = -ret_val;
+ }
+ return ret_val;
+
+L20:
+ if (ax >= 5.8f) {
+ goto L30;
+ }
+ x2 = *x * *x;
+ t = 1.f / x2;
+ top = (((r__[0] * t + r__[1]) * t + r__[2]) * t + r__[3]) * t + r__[4];
+ bot = (((s[0] * t + s[1]) * t + s[2]) * t + s[3]) * t + 1.f;
+ ret_val = (c__ - top / (x2 * bot)) / ax;
+ ret_val = .5f - exp(-x2) * ret_val + .5f;
+ if (*x < 0.f) {
+ ret_val = -ret_val;
+ }
+ return ret_val;
+
+L30:
+ ret_val = r_sign(&c_b188, x);
+ return ret_val;
+} /* erf_ */
+
+double erfc1_(long int *ind, double *x)
+{
+ /* Initialized data. NOTE(review): the 'f' suffix makes each literal a float constant, so these doubles carry only float precision */
+
+ static double c__ = .564189583547756f;
+ static double a[5] = { 7.7105849500132e-5f,-.00133733772997339f,
+ .0323076579225834f,.0479137145607681f,.128379167095513f };
+ static double b[3] = { .00301048631703895f,.0538971687740286f,
+ .375795757275549f };
+ static double p[8] = { -1.36864857382717e-7f,.564195517478974f,
+ 7.21175825088309f,43.1622272220567f,152.98928504694f,
+ 339.320816734344f,451.918953711873f,300.459261020162f };
+ static double q[8] = { 1.f,12.7827273196294f,77.0001529352295f,
+ 277.585444743988f,638.980264465631f,931.35409485061f,
+ 790.950925327898f,300.459260956983f };
+ static double r__[5] = { 2.10144126479064f,26.2370141675169f,
+ 21.3688200555087f,4.6580782871847f,.282094791773523f };
+ static double s[4] = { 94.153775055546f,187.11481179959f,99.0191814623914f,
+ 18.0124575948747f };
+
+ /* System generated locals */
+ double ret_val, r__1;
+
+ /* Builtin functions */
+ double exp(double);
+
+ /* Local variables (static storage: shared between calls) */
+ static double e, t;
+ static double w;
+ static double ax, bot, top;
+ extern double exparg_(long int *);
+
+/* ----------------------------------------------------------------------- */
+/* EVALUATION OF THE COMPLEMENTARY ERROR FUNCTION */
+
+/* ERFC1(IND,X) = ERFC(X) IF IND = 0 */
+/* ERFC1(IND,X) = EXP(X*X)*ERFC(X) OTHERWISE */
+/* ----------------------------------------------------------------------- */
+/* The ranges |x| <= .5, .5 < |x| <= 4 and |x| > 4 use the a/b, p/q and r/s coefficient sets respectively. */
+/* For |x| > .5 the scaled value is formed first; the "FINAL ASSEMBLY" at L40 then applies exp(-x*x) and the reflection for x < 0. */
+/* ------------------------- */
+/* ------------------------- */
+/* ------------------------- */
+
+/* ABS(X) .LE. 0.5 */
+
+ ax = DABS(*x);
+ if (ax > .5f) {
+ goto L10;
+ }
+ t = *x * *x;
+ top = (((a[0] * t + a[1]) * t + a[2]) * t + a[3]) * t + a[4] + 1.f;
+ bot = ((b[0] * t + b[1]) * t + b[2]) * t + 1.f;
+ ret_val = .5f - *x * (top / bot) + .5f;
+ if (*ind != 0) {
+ ret_val = exp(t) * ret_val;
+ }
+ return ret_val;
+
+/* 0.5 .LT. ABS(X) .LE. 4 */
+
+L10:
+ if (ax > 4.f) {
+ goto L20;
+ }
+ top = ((((((p[0] * ax + p[1]) * ax + p[2]) * ax + p[3]) * ax + p[4]) * ax
+ + p[5]) * ax + p[6]) * ax + p[7];
+ bot = ((((((q[0] * ax + q[1]) * ax + q[2]) * ax + q[3]) * ax + q[4]) * ax
+ + q[5]) * ax + q[6]) * ax + q[7];
+ ret_val = top / bot;
+ goto L40;
+
+/* ABS(X) .GT. 4 */
+
+L20:
+ if (*x <= -5.6f) {
+ goto L50;
+ }
+ if (*ind != 0) {
+ goto L30;
+ }
+ if (*x > 100.f) {
+ goto L60;
+ }
+ if (*x * *x > -exparg_(&c__1)) { /* c__1 is defined outside this block (presumably the integer 1, selecting the negative bound from exparg_ -- confirm) */
+ goto L60;
+ }
+
+L30:
+/* Computing 2nd power */
+ r__1 = 1.f / *x;
+ t = r__1 * r__1;
+ top = (((r__[0] * t + r__[1]) * t + r__[2]) * t + r__[3]) * t + r__[4];
+ bot = (((s[0] * t + s[1]) * t + s[2]) * t + s[3]) * t + 1.f;
+ ret_val = (c__ - t * top / bot) / ax;
+
+/* FINAL ASSEMBLY */
+
+L40:
+ if (*ind == 0) {
+ goto L41;
+ }
+ if (*x < 0.f) {
+ ret_val = exp(*x * *x) * 2.f - ret_val;
+ }
+ return ret_val;
+L41:
+ w = (double) (*x) * (double) (*x);
+ t = w;
+ e = w - (double) t; /* t is itself a double here, so e is 0 for finite w and the (1 - e) factor below has no effect */
+ ret_val = (.5f - e + .5f) * exp(-t) * ret_val;
+ if (*x < 0.f) {
+ ret_val = 2.f - ret_val;
+ }
+ return ret_val;
+
+/* LIMIT VALUE FOR LARGE NEGATIVE X */
+
+L50:
+ ret_val = 2.f;
+ if (*ind != 0) {
+ ret_val = exp(*x * *x) * 2.f;
+ }
+ return ret_val;
+
+/* LIMIT VALUE FOR LARGE POSITIVE X */
+/* WHEN IND = 0 */
+
+L60:
+ ret_val = 0.f;
+ return ret_val;
+} /* erfc1_ */
+
+double gam1_(double *a)
+{
+ /* Initialized data. NOTE(review): the 'f' suffix makes each literal a float constant, so these doubles carry only float precision */
+
+ static double p[7] = { .577215664901533f,-.409078193005776f,
+ -.230975380857675f,.0597275330452234f,.0076696818164949f,
+ -.00514889771323592f,5.89597428611429e-4f };
+ static double q[5] = { 1.f,.427569613095214f,.158451672430138f,
+ .0261132021441447f,.00423244297896961f };
+ static double r__[9] = { -.422784335098468f,-.771330383816272f,
+ -.244757765222226f,.118378989872749f,9.30357293360349e-4f,
+ -.0118290993445146f,.00223047661158249f,2.66505979058923e-4f,
+ -1.32674909766242e-4f };
+ static double s1 = .273076135303957f;
+ static double s2 = .0559398236957378f;
+
+ /* System generated locals */
+ double ret_val;
+
+ /* Local variables (static storage: shared between calls) */
+ static double d__, t, w, bot, top;
+
+/* ------------------------------------------------------------------ */
+/* COMPUTATION OF 1/GAMMA(A+1) - 1 FOR -0.5 .LE. A .LE. 1.5 */
+/* ------------------------------------------------------------------ */
+/* t = a when a <= .5, otherwise t = a - 1; d = a - .5 records which case applies */
+/* t == 0 returns 0; t > 0 uses the p[]/q[] rational (L20); t < 0 uses r[] over (s2*t + s1)*t + 1 (L30) */
+/* ------------------- */
+/* ------------------- */
+/* ------------------- */
+ t = *a;
+ d__ = *a - .5f;
+ if (d__ > 0.f) {
+ t = d__ - .5f;
+ }
+ if (t < 0.f) {
+ goto L30;
+ } else if (t == 0) {
+ goto L10;
+ } else {
+ goto L20;
+ }
+
+L10:
+ ret_val = 0.f;
+ return ret_val;
+
+L20:
+ top = (((((p[6] * t + p[5]) * t + p[4]) * t + p[3]) * t + p[2]) * t + p[1]
+ ) * t + p[0];
+ bot = (((q[4] * t + q[3]) * t + q[2]) * t + q[1]) * t + 1.f;
+ w = top / bot;
+ if (d__ > 0.f) {
+ goto L21;
+ }
+ ret_val = *a * w; /* a <= .5 */
+ return ret_val;
+L21:
+ ret_val = t / *a * (w - .5f - .5f); /* a > .5: (t/a) * (w - 1) */
+ return ret_val;
+
+L30:
+ top = (((((((r__[8] * t + r__[7]) * t + r__[6]) * t + r__[5]) * t + r__[4]
+ ) * t + r__[3]) * t + r__[2]) * t + r__[1]) * t + r__[0];
+ bot = (s2 * t + s1) * t + 1.f;
+ w = top / bot;
+ if (d__ > 0.f) {
+ goto L31;
+ }
+ ret_val = *a * (w + .5f + .5f); /* a <= .5: a * (w + 1) */
+ return ret_val;
+L31:
+ ret_val = t * w / *a; /* a > .5 */
+ return ret_val;
+} /* gam1_ */
+
+double gamln1_(double *a)
+{
+ /* Initialized data. NOTE(review): the 'f' suffix makes each literal a float constant, so these doubles carry only float precision */
+
+ static double p0 = .577215664901533f;
+ static double p1 = .844203922187225f;
+ static double p2 = -.168860593646662f;
+ static double p3 = -.780427615533591f;
+ static double p4 = -.402055799310489f;
+ static double p5 = -.0673562214325671f;
+ static double p6 = -.00271935708322958f;
+ static double q1 = 2.88743195473681f;
+ static double q2 = 3.12755088914843f;
+ static double q3 = 1.56875193295039f;
+ static double q4 = .361951990101499f;
+ static double q5 = .0325038868253937f;
+ static double q6 = 6.67465618796164e-4f;
+ static double r0 = .422784335098467f;
+ static double r1 = .848044614534529f;
+ static double r2 = .565221050691933f;
+ static double r3 = .156513060486551f;
+ static double r4 = .017050248402265f;
+ static double r5 = 4.97958207639485e-4f;
+ static double s1 = 1.24313399877507f;
+ static double s2 = .548042109832463f;
+ static double s3 = .10155218743983f;
+ static double s4 = .00713309612391f;
+ static double s5 = 1.16165475989616e-4f;
+
+ /* System generated locals */
+ double ret_val;
+
+ /* Local variables (static storage: shared between calls) */
+ static double w, x;
+
+/* ----------------------------------------------------------------------- */
+/* EVALUATION OF LN(GAMMA(1 + A)) FOR -0.2 .LE. A .LE. 1.25 */
+/* ----------------------------------------------------------------------- */
+/* a < .6: -a * w, with w the ratio of the p0..p6 and 1,q1..q6 polynomials in a */
+/* a >= .6: x * w, with x = a - 1 and w the ratio of the r0..r5 and 1,s1..s5 polynomials in x */
+ if (*a >= .6f) {
+ goto L10;
+ }
+ w = ((((((p6 * *a + p5) * *a + p4) * *a + p3) * *a + p2) * *a + p1) * *a
+ + p0) / ((((((q6 * *a + q5) * *a + q4) * *a + q3) * *a + q2) * *a
+ + q1) * *a + 1.f);
+ ret_val = -(*a) * w;
+ return ret_val;
+
+L10:
+ x = *a - .5f - .5f; /* a - 1, subtracted in two halves */
+ w = (((((r5 * x + r4) * x + r3) * x + r2) * x + r1) * x + r0) / (((((s5 *
+ x + s4) * x + s3) * x + s2) * x + s1) * x + 1.f);
+ ret_val = x * w;
+ return ret_val;
+} /* gamln1_ */
+
+double psi_(double *xx)
+{
+ /* Initialized data */
+
+ static double piov4 = .785398163397448f;
+ static double dx0 = 1.461632144968362341262659542325721325;
+ static double p1[7] = { .0089538502298197f,4.77762828042627f,
+ 142.441585084029f,1186.45200713425f,3633.51846806499f,
+ 4138.10161269013f,1305.60269827897f };
+ static double q1[6] = { 44.8452573429826f,520.752771467162f,
+ 2210.0079924783f,3641.27349079381f,1908.310765963f,
+ 6.91091682714533e-6f };
+ static double p2[4] = { -2.12940445131011f,-7.01677227766759f,
+ -4.48616543918019f,-.648157123766197f };
+ static double q2[4] = { 32.2703493791143f,89.2920700481861f,
+ 54.6117738103215f,7.77788548522962f };
+
+ /* System generated locals */
+ double ret_val, r__1, r__2;
+
+ /* Builtin functions */
+ double cos(double), sin(double), log(double);
+
+ /* Local variables */
+ static long int i__, m, n;
+ static double w, x, z__;
+ static long int nq;
+ static double den, aug, sgn, xmx0, xmax1, upper;
+ static double xsmall;
+
+/* --------------------------------------------------------------------- */
+
+/* EVALUATION OF THE DIGAMMA FUNCTION */
+
+/* ----------- */
+
+/* PSI(XX) IS ASSIGNED THE VALUE 0 WHEN THE DIGAMMA FUNCTION CANNOT */
+/* BE COMPUTED. */
+
+/* THE MAIN COMPUTATION INVOLVES EVALUATION OF RATIONAL CHEBYSHEV */
+/* APPROXIMATIONS PUBLISHED IN MATH. COMP. 27, 123-127(1973) BY */
+/* CODY, STRECOK AND THACHER. */
+
+/* --------------------------------------------------------------------- */
+/* PSI WAS WRITTEN AT ARGONNE NATIONAL LABORATORY FOR THE FUNPACK */
+/* PACKAGE OF SPECIAL FUNCTION SUBROUTINES. PSI WAS MODIFIED BY */
+/* A.H. MORRIS (NSWC). */
+/* --------------------------------------------------------------------- */
+/* --------------------------------------------------------------------- */
+
+/* PIOV4 = PI/4 */
+/* DX0 = ZERO OF PSI TO EXTENDED PRECISION */
+
+/* --------------------------------------------------------------------- */
+/* --------------------------------------------------------------------- */
+
+/* COEFFICIENTS FOR RATIONAL APPROXIMATION OF */
+/* PSI(X) / (X - X0), 0.5 .LE. X .LE. 3.0 */
+
+/* --------------------------------------------------------------------- */
+/* --------------------------------------------------------------------- */
+
+/* COEFFICIENTS FOR RATIONAL APPROXIMATION OF */
+/* PSI(X) - LN(X) + 1 / (2*X), X .GT. 3.0 */
+
+/* --------------------------------------------------------------------- */
+/* --------------------------------------------------------------------- */
+
+/* MACHINE DEPENDENT CONSTANTS ... */
+
+/* XMAX1 = THE SMALLEST POSITIVE FLOATING POINT CONSTANT */
+/* WITH ENTIRELY INTEGER REPRESENTATION. ALSO USED */
+/* AS NEGATIVE OF LOWER BOUND ON ACCEPTABLE NEGATIVE */
+/* ARGUMENTS AND AS THE POSITIVE ARGUMENT BEYOND WHICH */
+/* PSI MAY BE REPRESENTED AS ALOG(X). */
+
+/* XSMALL = ABSOLUTE ARGUMENT BELOW WHICH PI*COTAN(PI*X) */
+/* MAY BE REPRESENTED BY 1/X. */
+
+/* --------------------------------------------------------------------- */
+ xmax1 = (double) INT_MAX;
+/* Computing MIN */
+ r__1 = xmax1, r__2 = 0.5 / (0.5 * DBL_EPSILON);
+ xmax1 = DMIN(r__1,r__2);
+ xsmall = 1e-9f;
+/* --------------------------------------------------------------------- */
+ x = *xx;
+ aug = 0.f;
+ if (x >= .5f) {
+ goto L200;
+ }
+/* --------------------------------------------------------------------- */
+/* X .LT. 0.5, USE REFLECTION FORMULA */
+/* PSI(1-X) = PSI(X) + PI * COTAN(PI*X) */
+/* --------------------------------------------------------------------- */
+ if (DABS(x) > xsmall) {
+ goto L100;
+ }
+ if (x == 0.f) {
+ goto L400;
+ }
+/* --------------------------------------------------------------------- */
+/* 0 .LT. ABS(X) .LE. XSMALL. USE 1/X AS A SUBSTITUTE */
+/* FOR PI*COTAN(PI*X) */
+/* --------------------------------------------------------------------- */
+ aug = -1.f / x;
+ goto L150;
+/* --------------------------------------------------------------------- */
+/* REDUCTION OF ARGUMENT FOR COTAN */
+/* --------------------------------------------------------------------- */
+L100:
+ w = -x;
+ sgn = piov4;
+ if (w > 0.f) {
+ goto L120;
+ }
+ w = -w;
+ sgn = -sgn;
+/* --------------------------------------------------------------------- */
+/* MAKE AN ERROR EXIT IF X .LE. -XMAX1 */
+/* --------------------------------------------------------------------- */
+L120:
+ if (w >= xmax1) {
+ goto L400;
+ }
+ nq = (long int) w;
+ w -= (double) nq;
+ nq = (long int) (w * 4.f);
+ w = (w - (double) nq * .25f) * 4.f;
+/* --------------------------------------------------------------------- */
+/* W IS NOW RELATED TO THE FRACTIONAL PART OF 4.0 * X. */
+/* ADJUST ARGUMENT TO CORRESPOND TO VALUES IN FIRST */
+/* QUADRANT AND DETERMINE SIGN */
+/* --------------------------------------------------------------------- */
+ n = nq / 2;
+ if (n + n != nq) {
+ w = 1.f - w;
+ }
+ z__ = piov4 * w;
+ m = n / 2;
+ if (m + m != n) {
+ sgn = -sgn;
+ }
+/* --------------------------------------------------------------------- */
+/* DETERMINE FINAL VALUE FOR -PI*COTAN(PI*X) */
+/* --------------------------------------------------------------------- */
+ n = (nq + 1) / 2;
+ m = n / 2;
+ m += m;
+ if (m != n) {
+ goto L140;
+ }
+/* --------------------------------------------------------------------- */
+/* CHECK FOR SINGULARITY */
+/* --------------------------------------------------------------------- */
+ if (z__ == 0.f) {
+ goto L400;
+ }
+/* --------------------------------------------------------------------- */
+/* USE COS/SIN AS A SUBSTITUTE FOR COTAN, AND */
+/* SIN/COS AS A SUBSTITUTE FOR TAN */
+/* --------------------------------------------------------------------- */
+ aug = sgn * (cos(z__) / sin(z__) * 4.f);
+ goto L150;
+L140:
+ aug = sgn * (sin(z__) / cos(z__) * 4.f);
+L150:
+ x = 1.f - x;
+L200:
+ if (x > 3.f) {
+ goto L300;
+ }
+/* --------------------------------------------------------------------- */
+/* 0.5 .LE. X .LE. 3.0 */
+/* --------------------------------------------------------------------- */
+ den = x;
+ upper = p1[0] * x;
+
+ for (i__ = 1; i__ <= 5; ++i__) {
+ den = (den + q1[i__ - 1]) * x;
+ upper = (upper + p1[i__]) * x;
+/* L210: */
+ }
+
+ den = (upper + p1[6]) / (den + q1[5]);
+ xmx0 = (double) x - dx0;
+ ret_val = den * xmx0 + aug;
+ return ret_val;
+/* --------------------------------------------------------------------- */
+/* IF X .GE. XMAX1, PSI = LN(X) */
+/* --------------------------------------------------------------------- */
+L300:
+ if (x >= xmax1) {
+ goto L350;
+ }
+/* --------------------------------------------------------------------- */
+/* 3.0 .LT. X .LT. XMAX1 */
+/* --------------------------------------------------------------------- */
+ w = 1.f / (x * x);
+ den = w;
+ upper = p2[0] * w;
+
+ for (i__ = 1; i__ <= 3; ++i__) {
+ den = (den + q2[i__ - 1]) * w;
+ upper = (upper + p2[i__]) * w;
+/* L310: */
+ }
+
+ aug = upper / (den + q2[3]) - .5f / x + aug;
+L350:
+ ret_val = aug + log(x);
+ return ret_val;
+/* --------------------------------------------------------------------- */
+/* ERROR RETURN */
+/* --------------------------------------------------------------------- */
+L400:
+ ret_val = 0.f;
+ return ret_val;
+} /* psi_ */
+
+/*
+ * betaln_ -- evaluation of the logarithm of the beta function.
+ *
+ * a0, b0:  pointers to the two arguments; both are only read.  The routine
+ *          works on a = min(*a0, *b0) and b = max(*a0, *b0) and picks a
+ *          branch depending on a (a < 1, 1 <= a < 8, a >= 8).
+ * returns: the computed logarithm.
+ *
+ * NOTE(review): presumably both arguments must be positive -- confirm
+ * against the callers.
+ *
+ * Changes with respect to the f2c output: the working variables are
+ * automatic instead of `static`, so the routine keeps no state between
+ * calls, and E is a double literal (the former `f` suffix rounded it to
+ * single precision before it was stored in the double).
+ */
+double betaln_(double *a0, double *b0)
+{
+    /* E = 0.5*LN(2*PI) */
+    static const double e = .918938533204673;
+
+    /* Builtin functions */
+    double log(double);
+
+    /* Local variables */
+    long int i__, n;
+    double a, b, c__, h__, u, v, w, z__, r__1;
+    extern double gamln_(double *), bcorr_(double *, double *),
+        algdiv_(double *, double *), alnrel_(double *),
+        gsumln_(double *, double *);
+
+    a = DMIN(*a0,*b0);
+    b = DMAX(*a0,*b0);
+    if (a >= 8.) {
+        goto L60;
+    }
+    if (a >= 1.) {
+        goto L20;
+    }
+/* ----------------------------------------------------------------------- */
+/*                   PROCEDURE WHEN A .LT. 1 */
+/* ----------------------------------------------------------------------- */
+    if (b >= 8.) {
+        goto L10;
+    }
+    r__1 = a + b;
+    return gamln_(&a) + (gamln_(&b) - gamln_(&r__1));
+L10:
+    return gamln_(&a) + algdiv_(&a, &b);
+/* ----------------------------------------------------------------------- */
+/*                PROCEDURE WHEN 1 .LE. A .LT. 8 */
+/* ----------------------------------------------------------------------- */
+L20:
+    if (a > 2.) {
+        goto L30;
+    }
+    if (b > 2.) {
+        goto L21;
+    }
+    return gamln_(&a) + gamln_(&b) - gsumln_(&a, &b);
+L21:
+    w = 0.;
+    if (b < 8.) {
+        goto L40;
+    }
+    return gamln_(&a) + algdiv_(&a, &b);
+
+/*                REDUCTION OF A WHEN B .LE. 1000 */
+
+L30:
+    if (b > 1e3) {
+        goto L50;
+    }
+    n = a - 1.;                 /* truncating conversion, as in the original */
+    w = 1.;
+    for (i__ = 1; i__ <= n; ++i__) {
+        a += -1.;
+        h__ = a / b;
+        w *= h__ / (h__ + 1.);
+    }
+    w = log(w);
+    if (b < 8.) {
+        goto L40;
+    }
+    return w + gamln_(&a) + algdiv_(&a, &b);
+
+/*                 REDUCTION OF B WHEN B .LT. 8 */
+
+L40:
+    n = b - 1.;
+    z__ = 1.;
+    for (i__ = 1; i__ <= n; ++i__) {
+        b += -1.;
+        z__ *= b / (a + b);
+    }
+    return w + log(z__) + (gamln_(&a) + (gamln_(&b) - gsumln_(&a, &b)));
+
+/*                REDUCTION OF A WHEN B .GT. 1000 */
+
+L50:
+    n = a - 1.;
+    w = 1.;
+    for (i__ = 1; i__ <= n; ++i__) {
+        a += -1.;
+        w *= a / (a / b + 1.);
+    }
+    return log(w) - n * log(b) + (gamln_(&a) + algdiv_(&a, &b));
+/* ----------------------------------------------------------------------- */
+/*                   PROCEDURE WHEN A .GE. 8 */
+/* ----------------------------------------------------------------------- */
+L60:
+    w = bcorr_(&a, &b);
+    h__ = a / b;
+    c__ = h__ / (h__ + 1.);
+    u = -(a - .5) * log(c__);
+    v = b * alnrel_(&h__);
+    /* the two subtractions are ordered by the relative size of u and v */
+    if (u <= v) {
+        return log(b) * -.5 + e + w - u - v;
+    }
+    return log(b) * -.5 + e + w - v - u;
+} /* betaln_ */
+
+/*
+ * gsumln_ -- evaluation of the function ln(Gamma(a + b))
+ *            for 1 <= a <= 2 and 1 <= b <= 2.
+ *
+ * a, b:    pointers to the two arguments; both are only read.
+ * returns: the value assembled from gamln1_(), alnrel_() and log(),
+ *          depending on where x = a + b - 2 falls (x <= 0.25,
+ *          0.25 < x <= 1.25, x > 1.25).
+ *
+ * The scratch variables are automatic (the f2c output kept x `static`
+ * and handed its address to the helpers), so the routine holds no state
+ * between calls.
+ */
+double gsumln_(double *a, double *b)
+{
+    /* Builtin functions */
+    double log(double);
+
+    /* Local variables */
+    double x, shifted;
+    extern double gamln1_(double *), alnrel_(double *);
+
+    x = (double) (*a) + (double) (*b) - 2.;
+
+    if (x > 1.25) {
+        shifted = x - 1.;
+        return gamln1_(&shifted) + log(x * (x + 1.));
+    }
+    if (x > .25) {
+        return gamln1_(&x) + alnrel_(&x);
+    }
+    shifted = x + 1.;
+    return gamln1_(&shifted);
+} /* gsumln_ */
+
+/*
+ * bcorr_ -- evaluation of DEL(a0) + DEL(b0) - DEL(a0 + b0), where
+ *           ln(Gamma(a)) = (a - 0.5)*ln(a) - a + 0.5*ln(2*pi) + DEL(a).
+ *           It is assumed that a0 >= 8 and b0 >= 8.
+ *
+ * a0, b0:  pointers to the two arguments; both are only read.
+ * returns: the correction term described above.
+ *
+ * The series coefficients are double literals (the former `f` suffix
+ * rounded them to single precision) and the working variables are
+ * automatic rather than `static`.
+ */
+double bcorr_(double *a0, double *b0)
+{
+    /* Series coefficients */
+    static const double c0 = .0833333333333333;
+    static const double c1 = -.00277777777760991;
+    static const double c2 = 7.9365066682539e-4;
+    static const double c3 = -5.9520293135187e-4;
+    static const double c4 = 8.37308034031215e-4;
+    static const double c5 = -.00165322962780713;
+
+    /* Local variables */
+    double a, b, c__, h__, t, w, x, x2, s3, s5, s7, s9, s11, r__1;
+
+    a = DMIN(*a0,*b0);
+    b = DMAX(*a0,*b0);
+
+    h__ = a / b;
+    c__ = h__ / (h__ + 1.);
+    x = 1. / (h__ + 1.);
+    x2 = x * x;
+
+/*                SET SN = (1 - X**N)/(1 - X) */
+
+    s3 = x + x2 + 1.;
+    s5 = x + x2 * s3 + 1.;
+    s7 = x + x2 * s5 + 1.;
+    s9 = x + x2 * s7 + 1.;
+    s11 = x + x2 * s9 + 1.;
+
+/*                SET W = DEL(B) - DEL(A + B) */
+
+    r__1 = 1. / b;
+    t = r__1 * r__1;
+    w = ((((c5 * s11 * t + c4 * s9) * t + c3 * s7) * t + c2 * s5) * t + c1 *
+        s3) * t + c0;
+    w *= c__ / b;
+
+/*                   COMPUTE  DEL(A) + W */
+
+    r__1 = 1. / a;
+    t = r__1 * r__1;
+    return (((((c5 * t + c4) * t + c3) * t + c2) * t + c1) * t + c0) / a + w;
+} /* bcorr_ */
+
+/*
+ * algdiv_ -- computation of ln(Gamma(b)/Gamma(a+b)) when b >= 8.
+ *
+ * In this algorithm, DEL(x) is the function defined by
+ * ln(Gamma(x)) = (x - 0.5)*ln(x) - x + 0.5*ln(2*pi) + DEL(x).
+ *
+ * a, b:    pointers to the two arguments; both are only read.
+ * returns: the logarithm described above.
+ *
+ * The series coefficients are double literals (the former `f` suffix
+ * rounded them to single precision) and the working variables are
+ * automatic rather than `static`.
+ */
+double algdiv_(double *a, double *b)
+{
+    /* Series coefficients */
+    static const double c0 = .0833333333333333;
+    static const double c1 = -.00277777777760991;
+    static const double c2 = 7.9365066682539e-4;
+    static const double c3 = -5.9520293135187e-4;
+    static const double c4 = 8.37308034031215e-4;
+    static const double c5 = -.00165322962780713;
+
+    /* Builtin functions */
+    double log(double);
+
+    /* Local variables */
+    double c__, d__, h__, t, u, v, w, x, x2, s3, s5, s7, s9, s11, r__1;
+    extern double alnrel_(double *);
+
+    if (*a <= *b) {
+        h__ = *a / *b;
+        c__ = h__ / (h__ + 1.);
+        x = 1. / (h__ + 1.);
+        d__ = *b + (*a - .5);
+    } else {
+        h__ = *b / *a;
+        c__ = 1. / (h__ + 1.);
+        x = h__ / (h__ + 1.);
+        d__ = *a + (*b - .5);
+    }
+
+/*                SET SN = (1 - X**N)/(1 - X) */
+
+    x2 = x * x;
+    s3 = x + x2 + 1.;
+    s5 = x + x2 * s3 + 1.;
+    s7 = x + x2 * s5 + 1.;
+    s9 = x + x2 * s7 + 1.;
+    s11 = x + x2 * s9 + 1.;
+
+/*                SET W = DEL(B) - DEL(A + B) */
+
+    r__1 = 1. / *b;
+    t = r__1 * r__1;
+    w = ((((c5 * s11 * t + c4 * s9) * t + c3 * s7) * t + c2 * s5) * t + c1 *
+        s3) * t + c0;
+    w *= c__ / *b;
+
+/*                    COMBINE THE RESULTS */
+
+    r__1 = *a / *b;
+    u = d__ * alnrel_(&r__1);
+    v = *a * (log(*b) - 1.);
+    /* the two subtractions are ordered by the relative size of u and v */
+    if (u <= v) {
+        return w - u - v;
+    }
+    return w - v - u;
+} /* algdiv_ */
+
+/*
+ * gamln_ -- evaluation of ln(Gamma(a)) for positive a.
+ *
+ *     WRITTEN BY ALFRED H. MORRIS
+ *          NAVAL SURFACE WARFARE CENTER
+ *          DAHLGREN, VIRGINIA
+ *
+ * a:       pointer to the argument; only read.
+ * returns: gamln1_(a) - ln(a) for a <= 0.8, gamln1_(a - 1) for
+ *          0.8 < a <= 2.25, a downward product reduction followed by
+ *          gamln1_() for 2.25 < a < 10, and the series in 1/a**2
+ *          for a >= 10.
+ *
+ * D and the series coefficients are double literals (the former `f`
+ * suffix rounded them to single precision, and made the 0.8 branch
+ * threshold slightly larger than 0.8).  The working variables are
+ * automatic rather than `static`.
+ */
+double gamln_(double *a)
+{
+    /* D = 0.5*(LN(2*PI) - 1) */
+    static const double d__ = .418938533204673;
+    static const double c0 = .0833333333333333;
+    static const double c1 = -.00277777777760991;
+    static const double c2 = 7.9365066682539e-4;
+    static const double c3 = -5.9520293135187e-4;
+    static const double c4 = 8.37308034031215e-4;
+    static const double c5 = -.00165322962780713;
+
+    /* Builtin functions */
+    double log(double);
+
+    /* Local variables */
+    long int i__, n;
+    double t, w, r__1;
+    extern double gamln1_(double *);
+
+    /* written as !(a > 0.8) so that a NaN takes the same branch as before */
+    if (!(*a > .8)) {
+        return gamln1_(a) - log(*a);
+    }
+
+    if (*a <= 2.25) {
+        t = *a - .5 - .5;
+        return gamln1_(&t);
+    }
+
+    if (*a < 10.) {
+        n = *a - 1.25;          /* truncating conversion, as in the original */
+        t = *a;
+        w = 1.;
+        for (i__ = 1; i__ <= n; ++i__) {
+            t += -1.;
+            w = t * w;
+        }
+        r__1 = t - 1.;
+        return gamln1_(&r__1) + log(w);
+    }
+
+    r__1 = 1. / *a;
+    t = r__1 * r__1;
+    w = (((((c5 * t + c4) * t + c3) * t + c2) * t + c1) * t + c0) / *a;
+    return d__ + w + (*a - .5) * (log(*a) - 1.);
+} /* gamln_ */
+
diff --git a/segemehl/libs/708.h b/segemehl/libs/708.h
new file mode 100644
index 0000000..4eff7ac
--- /dev/null
+++ b/segemehl/libs/708.h
@@ -0,0 +1,15 @@
+
+/*
+ *
+ * 708.h
+ *
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 11/03/2013 09:01:21 AM EST
+ *
+ */
+
+/*
+ * Prototype of bratio_(); the definition is not part of this header.
+ * NOTE(review): presumably the entry point of the incomplete beta ratio
+ * code in 708.c, with w/w1 as outputs and ierr as an error flag -- confirm
+ * against the definition.
+ */
+int bratio_(double *a, double *b, double *x, double *y, double *w,
+ double *w1, long int *ierr);
+
diff --git a/segemehl/libs/SAX.c b/segemehl/libs/SAX.c
new file mode 100644
index 0000000..cdf514b
--- /dev/null
+++ b/segemehl/libs/SAX.c
@@ -0,0 +1,443 @@
+
+/*
+ * SAX.c
+ * Keoghs symbolic aggregate approximation
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 07/06/2011 09:19:20 PM CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <ctype.h>
+#include "basic-types.h"
+#include "info.h"
+#include "manopt.h"
+#include "stringutils.h"
+#include "mathematics.h"
+#include "fileio.h"
+#include "SAX.h"
+
+/*---------------------------------- bl_SAX ----------------------------------
+ *
+ * @brief  piecewise aggregate approximation of a series C of length n into
+ *         w windows, followed by a symbolic encoding of every window.
+ *         For window i the values C[(n/w)*i+1 .. (n/w)*(i+1)-1] are summed,
+ *         the sum is divided by (double)w/n, and the symbol is selected
+ *         from equichar[b-1] by comparing against the first b entries of
+ *         equiprob[b-1]. (*SxP)[i] receives P[(n/w)*i+1].
+ *         The caller provides *SxP with room for at least w entries and
+ *         owns the returned 0-terminated string of length w.
+ *         NOTE(review): dividing by w/n scales the sum up instead of
+ *         averaging it, and each window skips its first element; confirm
+ *         that both are intended.
+ * @author Steve Hoffmann
+ *
+ */
+
+char *
+bl_SAX (void *space, double *C, Uint n, Uint w, Uint b, Uint *P, Uint **SxP)
+{
+
+  Uint win, pos, lvl, first, end;
+  double *paa;
+  char *word;
+  char sym;
+
+  assert(b > 0 && b < 5);
+
+  paa = ALLOCMEMORY(space, NULL, double, w);
+  word = ALLOCMEMORY(space, NULL, char, w+1);
+
+  /* aggregate every window */
+  for(win=0; win < w; win++) {
+    first = (n/w)*(win)+1;
+    end = (n/w)*(win+1);
+
+    paa[win] = 0;
+    (*SxP)[win] = P[first];
+    for(pos=first; pos < end; pos++) {
+      paa[win] += C[pos];
+    }
+    paa[win] /= (double)w/n;
+  }
+
+  /* the last breakpoint that is exceeded determines the symbol */
+  for(win=0; win < w; win++) {
+    sym = equichar[b-1][0];
+    for(lvl=0; lvl < b; lvl++) {
+      if(paa[win] > equiprob[b-1][lvl]) {
+        sym = equichar[b-1][lvl+1];
+      }
+    }
+    word[win] = sym;
+  }
+
+  FREEMEMORY(space, paa);
+
+  word[w] = 0;
+  return word;
+}
+
+
+/*
+ * descendRSS -- pick a position next to k from Q[u..k) or Q(k..v).
+ *
+ * space:   unused.
+ * Q:       series that is inspected.
+ * u, k, v: positions that must satisfy u < k < v; otherwise a diagnostic
+ *          is written to stderr and k is returned unchanged.
+ * returns: min_1 (selected while scanning [u,k)) if the left mean is
+ *          smaller than the right mean, otherwise min_2 (selected while
+ *          scanning (k,v)).
+ *
+ * The sums are divided by k-u+1 and v-k+1, as in the original code.
+ * NOTE(review): the two scans use different selection rules (`||` with
+ * Q[min_1] < 5.0 vs. `&&` with Q[min_2] > 5.0); confirm this is intended.
+ *
+ * The unreachable exit(-1) after the early return, the assert that
+ * repeated the guard, and the unreachable final return were removed.
+ */
+Uint
+descendRSS(void *space, double *Q, Uint u, Uint k, Uint v) {
+  Uint i, min_1 = k, min_2 = k;
+  double mu_1 = .0, mu_2 = .0;
+
+  if(u >= k || k >= v) {
+    fprintf(stderr, "%d < %d < %d ?\n", u, k, v);
+    return k;
+  }
+
+  /* left side: [u, k) */
+  for(i=u; i < k; i++) {
+    mu_1 += Q[i];
+    if(Q[i] < Q[min_1] || Q[min_1] < 5.0) {
+      min_1 = i;
+    }
+  }
+  mu_1 /= k-u+1;
+
+  /* right side: (k, v) */
+  for(i=k+1; i < v; i++) {
+    mu_2 += Q[i];
+    if(Q[i] < Q[min_2] && Q[min_2] > 5.0) {
+      min_2 = i;
+    }
+  }
+  mu_2 /= v-k+1;
+
+  return (mu_1 < mu_2) ? min_1 : min_2;
+}
+
+
+/*
+ * slidingdiffRSS -- slide a window of width w in steps of off over Q and
+ * segment the first differences D[j] = Q[i+j+1] - Q[i+j] of each window
+ * with bl_RSSmatrix(). Windows whose sum does not exceed t = 0.1*w are
+ * skipped. From the returned models the one with the (epsilon-relaxed)
+ * smallest BIC is taken and its breaks are registered in bl2 when the
+ * mean of Q left or right of the break exceeds 5.0 and the break is more
+ * than 20 positions beyond the last registered one.
+ *
+ * space:   handle passed through to ALLOCMEMORY/FREEMEMORY/bl_RSSmatrix.
+ * Q:       series; positions 0..n-1 are read.
+ * n, w:    length of the series and window width.
+ * off:     step between two windows.
+ *
+ * Output: progress messages on stderr, two track sections on stdout. The
+ * "events" track only consists of its header, because this function never
+ * registered events (the former bl/k variables were never written).
+ *
+ * Fixes: the loop bound n-w no longer wraps around for n <= w, and D and
+ * bl2 are released before returning.
+ */
+void
+slidingdiffRSS(void *space, double *Q, Uint n, Uint w, Uint off) {
+  double f = 0.12;
+  Uint i, j, u, minarg2, noofbreaks = (w/(f*w)),
+       next = 0, last = 0, pos = 0, *bl2 = NULL, q = 0;
+  double epsilon = 0.05, sum = 0, t = w*0.10, lsum, rsum, *D;
+  breakpoints_t *bp2;
+
+  fprintf(stderr, "looking for %d breakpoints\n", noofbreaks);
+  D = ALLOCMEMORY(space, NULL, double, w+1);
+
+  /* n > w keeps the unsigned difference n-w from wrapping around */
+  for(i=0; n > w && i < n-w; i+=off) {
+    for(j=0, sum=0; j < w; j++) {
+      sum += Q[i+j];
+      D[j] = Q[i+j+1] - Q[i+j];
+    }
+
+    if(sum > t) {
+
+      bp2 = bl_RSSmatrix(space, D, w, f*w, noofbreaks);
+
+      /* model selection by BIC */
+      minarg2 = 0;
+      for(j=1; j < noofbreaks+1; j++) {
+        if(bp2[j].RSS > .00000000000000000001 &&
+            bp2[j].BIC < bp2[minarg2].BIC-(epsilon*bp2[minarg2].BIC)) {
+          minarg2 = j;
+        }
+      }
+
+      if(bp2[minarg2].RSS > 0.0) {
+
+        last = 0;
+
+        for(j=0; j < bp2[minarg2].noofbreaks; j++) {
+          if(j < bp2[minarg2].noofbreaks-1) {
+            next = bp2[minarg2].breaks[j+1];
+          } else {
+            next = w-1;
+          }
+
+          pos = bp2[minarg2].breaks[j];
+
+          /* sums of Q between the neighbouring breaks */
+          for(u=last, lsum=0; u < pos; u++) {
+            lsum += Q[i+u];
+          }
+          for(u=pos, rsum=0; u < next; u++) {
+            rsum += Q[i+u];
+          }
+          fprintf(stderr, "entered minarg2 to register %d breaks (l:%f,r:%f)\n",
+              bp2[minarg2].noofbreaks, lsum, rsum);
+          if(lsum/(pos-last) > 5.0 || rsum/(next-pos) > 5.0 ) {
+            if(q==0 || bl2[q-1] < i+pos-20) {
+              bl2 = ALLOCMEMORY(space, bl2, Uint, q+1);
+              bl2[q] = i+pos;
+              q++;
+            }
+          }
+          last = bp2[minarg2].breaks[j];
+        }
+      }
+
+      for(j=1; j < noofbreaks+1; j++) {
+        if(bp2[j].noofbreaks)
+          FREEMEMORY(space, bp2[j].breaks);
+      }
+      FREEMEMORY(space, bp2);
+
+    }
+  }
+
+  fprintf(stdout, "track name=events description=\"transcription events\" useScore=0\n");
+
+  fprintf(stdout, "track name=changes description=\"transcription changes\" useScore=0\n");
+  for(i=0; i < q; i++) {
+    fprintf(stdout, "%s\t%d\t%d\tchange\n", "chr15", bl2[i], bl2[i]);
+  }
+
+  FREEMEMORY(space, D);
+  if(bl2) {
+    FREEMEMORY(space, bl2);
+  }
+}
+
+
+/*
+ * slidingRSS -- slide a window of width w in steps of off over Q and
+ * segment each window with bl_RSSmatrix(). Windows whose sum does not
+ * exceed t = 0.1*w are skipped. From the returned models the one with the
+ * (epsilon-relaxed) smallest BIC is taken; its breaks are shifted by the
+ * window start i to positions in Q. For every break a position `min` is
+ * chosen within at most 100 positions to the left or right, and it is
+ * registered in bl when the mean of Q on either side exceeds 5.0 and `min`
+ * is more than 20 positions beyond the last registered one. The scan
+ * stops once more than 1000 events are registered.
+ *
+ * space:   handle passed through to ALLOCMEMORY/FREEMEMORY/bl_RSSmatrix.
+ * Q:       series; positions 0..n-1 are read.
+ * n, w:    length of the series and window width.
+ * off:     step between two windows.
+ *
+ * Output: progress messages on stderr, two track sections on stdout. The
+ * "changes" track only consists of its header, because this function
+ * never registered changes (the former bl2/q variables were never
+ * written).
+ *
+ * Fixes: the loop bound n-w no longer wraps around for n <= w, and bl is
+ * released before returning. A commented-out "correction" pass based on
+ * descendRSS() was dropped.
+ */
+void
+slidingRSS(void *space, double *Q, Uint n, Uint w, Uint off) {
+  double f = 0.12;
+  Uint i, j, u, minarg, noofbreaks = (w/(f*w)), k=0,
+       next = 0, last = 0, pos = 0, *bl = NULL;
+  double epsilon = 0.05, sum = 0, t = w*0.10, lsum, rsum;
+  Uint min_1, min_2, min;
+  double mu_1 = .0, mu_2 = .0;
+
+  breakpoints_t *bp;
+
+  fprintf(stderr, "looking for %d breakpoints\n", noofbreaks);
+
+  /* n > w keeps the unsigned difference n-w from wrapping around */
+  for(i=0; n > w && i < n-w; i+=off) {
+    for(j=0, sum=0; j < w; j++) {
+      sum += Q[i+j];
+    }
+
+    if(sum > t) {
+      bp = bl_RSSmatrix(space, &Q[i], w, f*w, noofbreaks);
+
+      /* model selection by BIC */
+      minarg = 0;
+      for(j=1; j < noofbreaks+1; j++) {
+        if(bp[j].RSS > .00000000000000000001 &&
+            bp[j].BIC < bp[minarg].BIC-(epsilon*bp[minarg].BIC)) {
+          minarg = j;
+        }
+      }
+
+      if(bp[minarg].RSS > 0.0) {
+        /* window-relative breaks -> positions in Q */
+        for(j=0; j < bp[minarg].noofbreaks; j++) {
+          bp[minarg].breaks[j] += i;
+        }
+
+        //evaluate and register
+        last = 0;
+        for(j=0; j < bp[minarg].noofbreaks; j++) {
+          if(j < bp[minarg].noofbreaks-1) {
+            next = bp[minarg].breaks[j+1];
+          } else {
+            next = i+w-1;
+          }
+
+          pos = bp[minarg].breaks[j];
+
+          /* look at most 100 positions to either side */
+          if(pos > 100) last = MAX(last, pos - 100);
+          if(next > pos + 100) next = pos+100;
+
+          min_1 = pos;
+          for(u=last, lsum=0; u < pos; u++) {
+            lsum += Q[u];
+            if(Q[u] < Q[min_1] || Q[u] < 5.0) {
+              min_1 = u;
+            }
+          }
+
+          mu_1 = (double)lsum/(pos-last+1);
+
+          min_2 = pos;
+          for(u=pos, rsum=0; u < next; u++) {
+            rsum += Q[u];
+            if(Q[u] < Q[min_2] && Q[min_2] > 5.0) {
+              min_2 = u;
+            }
+          }
+
+          mu_2 = (double)rsum/(next-pos+1);
+
+          if(mu_1 < mu_2) {
+            min = min_1;
+          } else {
+            min = min_2;
+          }
+
+          if(lsum/(pos-last) > 5.0 || rsum/(next-pos) > 5.0 ) {
+            if(k==0 || bl[k-1] < min-20) {
+              bl = ALLOCMEMORY(space, bl, Uint, k+1);
+              bl[k] = min;
+              fprintf(stderr, "%d\t", min);
+              k++;
+            } else {
+              fprintf(stderr, "%d*\t", min);
+            }
+          }
+          last = bp[minarg].breaks[j];
+        }
+
+        fprintf(stderr, "\n");
+      }
+
+      for(j=1; j < noofbreaks+1; j++) {
+        if(bp[j].noofbreaks)
+          FREEMEMORY(space, bp[j].breaks);
+      }
+      FREEMEMORY(space, bp);
+
+    }
+    if(k > 1000) break;
+  }
+
+  fprintf(stdout, "track name=events description=\"transcription events\" useScore=0\n");
+  for(i=0; i < k; i++) {
+    fprintf(stdout, "%s\t%d\t%d\tevent\n", "chr15", bl[i], bl[i]);
+  }
+
+  fprintf(stdout, "track name=changes description=\"transcription changes\" useScore=0\n");
+
+  if(bl) {
+    FREEMEMORY(space, bl);
+  }
+}
+
+
+/*
+ * main -- test driver: reads the first column of the tab separated file
+ * "startdist2.out" into Q and runs slidingRSS() on it with a window of
+ * 1000 and a step of 500.
+ *
+ * argc, argv: unused.
+ * returns:    0 after the segmentation has been run.
+ *
+ * The former version called exit(-1) right after slidingRSS(), i.e. it
+ * reported failure on the regular path and never released csv and Q. The
+ * statements behind that exit were unreachable and would have used the
+ * never allocated arrays C and P and divided by k == 0; they were removed
+ * together with the commented-out experiments.
+ */
+int
+main(int argc, char **argv) {
+  stringset_t **csv;
+  double *Q;
+  Uint lines, i;
+  void *space = NULL;
+
+  csv = readcsv(space, "startdist2.out", "\t", &lines);
+  Q = ALLOCMEMORY(space, NULL, double, lines);
+
+  fprintf(stderr, "read csv.\n");
+
+  for(i=0; i < lines; i++) {
+    Q[i] = atof(csv[i]->strings[0].str);
+  }
+
+  fprintf(stderr, "converted csv.\n");
+  fprintf(stderr, "starting segmentation.\n");
+
+  /* lines-1 must not wrap around for an empty input */
+  slidingRSS(space, Q, (lines > 0) ? lines-1 : 0, 1000, 500);
+
+  for(i=0; i < lines; i++) {
+    destructStringset(space, csv[i]);
+  }
+
+  FREEMEMORY(space, csv);
+  FREEMEMORY(space, Q);
+
+  return 0;
+}
diff --git a/segemehl/libs/SAX.h b/segemehl/libs/SAX.h
new file mode 100644
index 0000000..abbd169
--- /dev/null
+++ b/segemehl/libs/SAX.h
@@ -0,0 +1,21 @@
+#ifndef SAX_H
+#define SAX_H
+
+/*
+ *
+ * SAX.h
+ * Keoghs symbolic aggregate approximation
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 07/06/2011 09:38:28 PM CEST
+ *
+ */
+
+
+/*
+ * Lookup tables of bl_SAX(), both selected by row b-1:
+ * equiprob[b-1][0..b-1] are the ascending thresholds a window value is
+ * compared against, equichar[b-1][0..b] are the symbols assigned
+ * (NOTE(review): the values look like equiprobable breakpoints of a
+ * standard normal distribution -- confirm). '-' and trailing .0 entries
+ * pad the unused columns.
+ * The tables are `static` because they are defined in a header: every
+ * translation unit that includes it gets its own copy instead of a
+ * duplicate external definition.
+ */
+static const double equiprob[4][4] = { {.0,.0,.0,.0}, {-.43,.43,.0,.0}, {-.67,.0,.67,.0}, {-.84,-.25,.25,.84}};
+static const char equichar[4][5] = {{'a','b','-','-','-'}, {'a','b','c','-','-'}, {'a','b','c','d','-'}, {'a','b','c','d','e'}};
+
+char* bl_SAX (void *space, double *C, Uint n, Uint w, Uint b, Uint *P, Uint **SxP);
+
+#endif
diff --git a/segemehl/libs/alignment.c b/segemehl/libs/alignment.c
new file mode 100644
index 0000000..8ddbc44
--- /dev/null
+++ b/segemehl/libs/alignment.c
@@ -0,0 +1,1079 @@
+
+/*
+ * alignment.c
+ * implementation to handle alignments
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 02/03/2009 02:50:06 PM CET
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <ctype.h>
+#include <math.h>
+#include "mathematics.h"
+#include "basic-types.h"
+#include "alignment.h"
+#include "iupac.h"
+#include "debug.h"
+#include "charsequence.h"
+
+/* one display character per edit operation; indexed with the eop field of
+ * a Multieop (presumably R = Replacement, D = Deletion, I = Insertion --
+ * confirm against the enum order in alignment.h) */
+const char decodeEop[] = {'R','D','I'};
+/* characters for the codes 0..5; the inverse of the table built by
+ * getNTcodekey(), with 'N' for code 5 */
+const char ntdecode[] = {'A', 'C', 'G', 'T', '-', 'N' };
+
+/*
+ * getNTcodekey -- build a 256 entry table that maps a character to its
+ * nucleotide code: A/a -> 0, C/c -> 1, G/g -> 2, T/t -> 3, '-' -> 4 and
+ * every other character -> 5.
+ *
+ * space:   handle passed to ALLOCMEMORY.
+ * returns: the newly allocated table; the caller owns it.
+ */
+char*
+getNTcodekey(void *space) {
+  /* upper/lower case pairs in code order: pair number == code */
+  static const char symbols[] = {'A','a','C','c','G','g','T','t'};
+  char *table;
+  int idx;
+
+  table = ALLOCMEMORY(space, NULL, char, 256);
+  memset(table, 5, sizeof(char)*256);
+
+  for(idx=0; idx < 8; idx++) {
+    table[(int)symbols[idx]] = idx/2;
+  }
+  table['-'] = 4;
+
+  return table;
+}
+
+/*
+ * initAlignment -- set up an empty alignment of u and v.
+ *
+ * al:         alignment to initialise.
+ * u, ulen:    first sequence and its length.
+ * uoff:       offset into u; must be smaller than ulen.
+ * v, vlen:    second sequence and its length.
+ * voff:       offset into v; must be smaller than vlen.
+ *
+ * The sequences are referenced, not copied. al->meops gets zeroed room
+ * for ulen+vlen operations and has to be released with wrapAlignment().
+ * The buffer is now obtained with calloc(); before, the unchecked result
+ * of malloc() was handed directly to memset().
+ */
+void
+initAlignment(Alignment *al,
+    char *u, Uint ulen, Uint uoff,
+    char *v, Uint vlen, Uint voff) {
+
+  assert(uoff < ulen && voff < vlen);
+
+  al->u = u;
+  al->v = v;
+  al->ulen = ulen;
+  al->vlen = vlen;
+  al->uoff = uoff;
+  al->voff = voff;
+  al->numofmeops = 0;
+  al->meops = calloc(ulen+vlen, sizeof(Multieop));
+  /* the function cannot report an error, so fail loudly */
+  assert(al->meops != NULL);
+}
+
+/*
+ * wrapAlignment -- release the operation list of an alignment and reset
+ * all of its fields. The Alignment structure itself and the sequences u
+ * and v are not freed.
+ *
+ * al->meops is set to NULL after the free so that a second call, or a
+ * later access, does not touch the released buffer.
+ */
+void
+wrapAlignment(Alignment *al) {
+  free(al->meops);
+  al->meops = NULL;
+  al->numofmeops = 0;
+  al->u = NULL;
+  al->v = NULL;
+  al->vlen = 0;
+  al->ulen = 0;
+  al->uoff = 0;
+  al->voff = 0;
+}
+
+/*
+ * copyAlignment -- copy the alignment `from` into `to`.
+ *
+ * The operation list is duplicated into a new buffer with room for
+ * from->ulen + from->vlen operations; the sequence pointers u and v are
+ * shared with `from`. Whatever to->meops referred to before is not freed
+ * here.
+ */
+void
+copyAlignment(Alignment *to, Alignment *from) {
+  size_t bytes = sizeof(Multieop)*(from->ulen+from->vlen);
+
+  /* private copy of the operation list */
+  to->meops = malloc(bytes);
+  memmove(to->meops, from->meops, bytes);
+  to->numofmeops = from->numofmeops;
+
+  /* shared sequences and their coordinates */
+  to->u = from->u;
+  to->ulen = from->ulen;
+  to->uoff = from->uoff;
+  to->v = from->v;
+  to->vlen = from->vlen;
+  to->voff = from->voff;
+}
+
+/*
+ * countEops -- count the edit operations of an alignment.
+ *
+ * al:   alignment to inspect.
+ * mat:  receives the number of replacement steps for which matchIUPAC()
+ *       holds.
+ * mis:  receives the number of the remaining replacement steps.
+ * ins:  receives the summed steps of all operations that are neither
+ *       Replacement nor Deletion.
+ * del:  receives the summed steps of all Deletion operations.
+ *
+ * Deletions advance the position in v, insertions the position in u,
+ * replacements advance both.
+ */
+void
+countEops(Alignment *al, Uint *mat, Uint *mis, Uint *ins, Uint *del) {
+  Uint op, step, upos = 0, vpos = 0;
+
+  *mat = 0;
+  *mis = 0;
+  *ins = 0;
+  *del = 0;
+
+  for(op=0; op < al->numofmeops; op++) {
+    if(al->meops[op].eop == Replacement) {
+      for(step=0; step < al->meops[op].steps; step++) {
+        if(matchIUPAC(al->u[upos+al->uoff], al->v[vpos+al->voff])) {
+          *mat += 1;
+        } else {
+          *mis += 1;
+        }
+        upos++;
+        vpos++;
+      }
+    } else if(al->meops[op].eop == Deletion) {
+      *del += al->meops[op].steps;
+      vpos += al->meops[op].steps;
+    } else {
+      *ins += al->meops[op].steps;
+      upos += al->meops[op].steps;
+    }
+  }
+}
+
+/*
+ * getEdist -- edit distance of an alignment: the steps of all operations
+ * other than Replacement plus the replacement steps for which
+ * matchIUPAC() does not hold.
+ */
+Uint
+getEdist(Alignment *al) {
+  Uint op, step, upos = 0, vpos = 0;
+  Uint dist = 0;
+
+  for(op=0; op < al->numofmeops; op++) {
+    if(al->meops[op].eop == Replacement) {
+      for(step=0; step < al->meops[op].steps; step++) {
+        if(!matchIUPAC(al->u[upos+al->uoff], al->v[vpos+al->voff])) {
+          dist++;
+        }
+        upos++;
+        vpos++;
+      }
+      continue;
+    }
+
+    /* gap: counts fully and advances only one of the sequences */
+    dist += al->meops[op].steps;
+    if(al->meops[op].eop == Deletion) {
+      vpos += al->meops[op].steps;
+    } else {
+      upos += al->meops[op].steps;
+    }
+  }
+
+  return dist;
+}
+
+/*
+ * getBisulfiteMismatches -- count bisulfite mismatches of an alignment.
+ *
+ * A copy of the query al->u (al->ulen characters) is passed through
+ * bl_reconvertBisulfite(). A replacement step counts if matchIUPAC()
+ * holds for the characters of al->u and al->v, but the character of the
+ * reconverted copy differs from the one of al->v.
+ *
+ * al:        alignment to inspect.
+ * bisulfite: passed on to bl_reconvertBisulfite().
+ * returns:   the number of such steps.
+ */
+Uint
+getBisulfiteMismatches(Alignment *al, Uint bisulfite){
+  Uint op, step, upos = 0, vpos = 0;
+  Uint count = 0;
+  char *reconv;
+
+  reconv = malloc(al->ulen);
+  memmove(reconv, al->u, al->ulen);
+  bl_reconvertBisulfite(reconv, al->ulen, bisulfite);
+
+  for(op=0; op < al->numofmeops; op++) {
+    if(al->meops[op].eop == Replacement) {
+      for(step=0; step < al->meops[op].steps; step++) {
+        /* match on the converted query, mismatch on the reconverted one */
+        if(matchIUPAC(al->u[upos+al->uoff], al->v[vpos+al->voff]) &&
+            reconv[upos+al->uoff] != al->v[vpos+al->voff]) {
+          count++;
+        }
+        upos++;
+        vpos++;
+      }
+    } else if(al->meops[op].eop == Deletion) {
+      vpos += al->meops[op].steps;
+    } else {
+      upos += al->meops[op].steps;
+    }
+  }
+
+  free(reconv);
+  return count;
+}
+
+/*
+ * getWrongStrandBisulfiteMismatches -- count the replacement steps that
+ * do not match between al->u and al->v, but do match once a copy of the
+ * query has been converted with the other bisulfite run.
+ *
+ * al:        alignment to inspect.
+ * bisulfite: current run; the other run is bisulfite+1 for odd and
+ *            bisulfite-1 for even values (e.g. 1=>2, 2=>1).
+ * returns:   the number of such steps.
+ */
+Uint getWrongStrandBisulfiteMismatches(Alignment *al, Uint bisulfite){
+  Uint op, step, upos = 0, vpos = 0;
+  Uint count = 0;
+  char *other;
+
+  other = malloc(al->ulen);
+  memmove(other, al->u, al->ulen);
+  /* convert the copy with the other bisulfite run */
+  bl_convertBisulfite(other, al->ulen, (bisulfite % 2) ? bisulfite + 1 : bisulfite - 1, 0);
+
+  for(op=0; op < al->numofmeops; op++) {
+    if(al->meops[op].eop == Replacement) {
+      for(step=0; step < al->meops[op].steps; step++) {
+        /* mismatch on the converted query, match in the other run */
+        if(!matchIUPAC(al->u[upos+al->uoff], al->v[vpos+al->voff]) &&
+            matchIUPAC(other[upos+al->uoff], al->v[vpos+al->voff])) {
+          count++;
+        }
+        upos++;
+        vpos++;
+      }
+    } else if(al->meops[op].eop == Deletion) {
+      vpos += al->meops[op].steps;
+    } else {
+      upos += al->meops[op].steps;
+    }
+  }
+
+  free(other);
+  return count;
+}
+
+/*
+ * getSoftClipScores -- score the polyA part and the adapter part of an
+ * alignment separately.
+ *
+ * al:        alignment to score.
+ * polyAlen:  an operation belongs to the polyA part if the query position
+ *            (including al->uoff) at its start is below polyAlen,
+ *            otherwise to the adapter part.
+ * scores:    scores[0] is added for a matching, scores[1] for a
+ *            mismatching replacement step.
+ * indel:     added once per step of any other operation.
+ * pAscr:     receives the score of the polyA part.
+ * adscr:     receives the score of the adapter part.
+ * adlen:     receives the number of operations (not steps) in the
+ *            adapter part.
+ *
+ * In the polyA part a step with an 'N' in either sequence is scored as a
+ * match. The reference character of that test is now read at position l,
+ * like in the matchIUPAC() call of the same condition; it used to be read
+ * at the query position k.
+ */
+void
+getSoftClipScores(Alignment *al, int polyAlen, int *scores, int indel,
+    int *pAscr, int *adscr, int *adlen) {
+
+  Uint i,j,k=0,l=0;
+  int polyAscore=0, adapterScore=0, adapterLen=0;
+  int inPolyA, mismatch;
+  int *score;
+  char uc, vc;
+
+  for(i=0; i < al->numofmeops; i++) {
+
+    /* the part is decided once per operation, at its first query position */
+    inPolyA = (k+al->uoff < polyAlen);
+    score = (inPolyA) ? &polyAscore : &adapterScore;
+    if(!inPolyA) {
+      adapterLen++;
+    }
+
+    if(al->meops[i].eop != Replacement) {
+      *score += indel * al->meops[i].steps;
+      if (al->meops[i].eop == Deletion) {
+        l+= al->meops[i].steps;
+      } else {
+        k+= al->meops[i].steps;
+      }
+    } else {
+      for(j=0; j < al->meops[i].steps; j++) {
+        uc = al->u[k+al->uoff];
+        vc = al->v[l+al->voff];
+
+        if(inPolyA) {
+          mismatch = (uc != 'N' && vc != 'N' && !matchIUPAC(uc, vc));
+        } else {
+          mismatch = !matchIUPAC(uc, vc);
+        }
+        *score += (mismatch) ? scores[1] : scores[0];
+
+        k++; l++;
+      }
+    }
+  }
+
+  *pAscr = polyAscore;
+  *adscr = adapterScore;
+  *adlen = adapterLen;
+
+  return;
+}
+
+/*
+ * getSWScore -- score of an alignment.
+ *
+ * al:      alignment to score.
+ * scores:  scores[0] is added for every replacement step for which
+ *          matchIUPAC() holds, scores[1] for every other replacement step.
+ * indel:   added once per step of any operation that is not a
+ *          Replacement.
+ * returns: the sum over all operations.
+ */
+int
+getSWScore(Alignment *al, int *scores, int indel) {
+  Uint op, step, upos = 0, vpos = 0;
+  int total = 0;
+
+  for(op=0; op < al->numofmeops; op++) {
+    if(al->meops[op].eop == Replacement) {
+      for(step=0; step < al->meops[op].steps; step++) {
+        if(matchIUPAC(al->u[upos+al->uoff], al->v[vpos+al->voff])) {
+          total += scores[0];
+        } else {
+          total += scores[1];
+        }
+        upos++;
+        vpos++;
+      }
+      continue;
+    }
+
+    /* gap: deletions advance v, everything else advances u */
+    total += indel * al->meops[op].steps;
+    if(al->meops[op].eop == Deletion) {
+      vpos += al->meops[op].steps;
+    } else {
+      upos += al->meops[op].steps;
+    }
+  }
+
+  return total;
+}
+
+/*
+ * getAlignScore -- score of an alignment; the computation is the same as
+ * in getSWScore().
+ *
+ * al:      alignment to score.
+ * scores:  scores[0] is the score of a matching, scores[1] the score of
+ *          a mismatching replacement step (decided by matchIUPAC()).
+ * indel:   score of a single step of any other operation.
+ * returns: the sum over all operations.
+ */
+int
+getAlignScore(Alignment *al, int *scores, int indel) {
+  Uint op, step, upos = 0, vpos = 0;
+  int total = 0;
+
+  for(op=0; op < al->numofmeops; op++) {
+    if(al->meops[op].eop != Replacement) {
+      total += indel * al->meops[op].steps;
+      if(al->meops[op].eop == Deletion) {
+        vpos += al->meops[op].steps;
+      } else {
+        upos += al->meops[op].steps;
+      }
+      continue;
+    }
+
+    for(step=0; step < al->meops[op].steps; step++, upos++, vpos++) {
+      if(matchIUPAC(al->u[upos+al->uoff], al->v[vpos+al->voff])) {
+        total += scores[0];
+      } else {
+        total += scores[1];
+      }
+    }
+  }
+
+  return total;
+}
+
+
+
+/*
+ * getSubstringEdist -- inspect the part of an alignment whose query
+ * positions k satisfy u <= k < v.
+ *
+ * al:      alignment to inspect.
+ * u, v:    half open interval of query positions (relative to al->uoff).
+ * returns: the number of replacement steps inside the interval for which
+ *          matchIUPAC() holds.
+ *
+ * NOTE(review): despite its name the function returns this match count;
+ * the edit distance was accumulated by the original code but never
+ * returned. Confirm against the callers which of the two they expect.
+ */
+int
+getSubstringEdist(Alignment *al, Uint u, Uint v) {
+
+  Uint op, step, upos = 0, vpos = 0;
+  Uint matches = 0;
+
+  for(op=0; op < al->numofmeops; op++) {
+    if(al->meops[op].eop == Replacement) {
+      for(step=0; step < al->meops[op].steps; step++) {
+        if(upos >= u && upos < v &&
+            matchIUPAC(al->u[upos+al->uoff], al->v[vpos+al->voff])) {
+          matches += 1;
+        }
+        upos++;
+        vpos++;
+      }
+    } else if(al->meops[op].eop == Deletion) {
+      vpos += al->meops[op].steps;
+    } else {
+      upos += al->meops[op].steps;
+    }
+  }
+
+  return matches;
+}
+
+/*
+ * showmultieoplist -- print the operation list of an alignment to dev as
+ * "[<op> <steps>, <op> <steps>, ...]" followed by a newline, with <op>
+ * taken from decodeEop. An empty list is printed as "[]".
+ *
+ * The former version printed every operation with a trailing ", " and
+ * then one more entry read from meops[numofmeops], i.e. from behind the
+ * last valid operation.
+ */
+void
+showmultieoplist(FILE *dev, Alignment *al) {
+
+  Uint i;
+
+  fprintf(dev, "[");
+  for(i=0; i < al->numofmeops; i++) {
+    if(i > 0) {
+      fprintf(dev, ", ");
+    }
+    fprintf(dev, "%c %d", decodeEop[al->meops[i].eop], al->meops[i].steps);
+  }
+  fprintf(dev, "]\n");
+}
+
+/*
+ * appendMeopToken -- write "<op><steps>;" to buf at offset cur and return
+ * the offset behind the token (where sprintf put the terminating 0).
+ */
+static Uint
+appendMeopToken(char *buf, Uint cur, char op, Uint steps) {
+  return cur + (Uint) sprintf(&buf[cur], "%c%d;", op, steps);
+}
+
+/*
+ * multieopstring -- encode an alignment as a string of "<op><count>;"
+ * tokens with the operations C (clipping), M (run of matching replacement
+ * steps), S (run of mismatching replacement steps), D (deletion) and
+ * I (insertion).
+ *
+ * al:        alignment to encode.
+ * leftclip:  clipped positions at the left end, 0 for none.
+ * rightclip: clipped positions at the right end, 0 for none.
+ * rev:       if set, the operations are visited from last to first and
+ *            the two clip lengths swap their places.
+ * returns:   a malloc'ed, 0-terminated string the caller has to free, or
+ *            NULL if the allocation failed.
+ *
+ * NOTE(review): with rev set the positions in u and v are still advanced
+ * from the front while the operations are visited from the back; confirm
+ * that the callers pass sequences for which this is intended.
+ *
+ * Fixes: the result is now an empty string (instead of an uninitialised
+ * buffer) if there is neither an operation nor clipping, and a failed
+ * allocation is no longer written to.
+ */
+char *
+multieopstring(Alignment *al, Uint leftclip, Uint rightclip, unsigned char rev) {
+  Uint i, j, k, q=0, p=0, cur=0, steps, msteps, ssteps;
+  char *meopstr;
+  char eopc=0;
+
+  meopstr = (char*) malloc(sizeof(char)*(3*(al->vlen+al->ulen+leftclip+rightclip)+1));
+  if(meopstr == NULL) {
+    return NULL;
+  }
+  meopstr[0] = '\0';
+
+  /* leading clip */
+  if(leftclip || (rightclip && rev)) {
+    steps = (rev) ? rightclip : leftclip;
+    cur = appendMeopToken(meopstr, cur, 'C', steps);
+  }
+
+  for(k=0; k < al->numofmeops; k++) {
+    i = (rev) ? al->numofmeops - k -1 : k;
+    steps=0;
+
+    //if Replacement occured
+    if (al->meops[i].eop == Replacement) {
+      msteps=0;
+      ssteps=0;
+      //iter over all steps; a token is emitted whenever the run type changes
+      for (j=0; j < al->meops[i].steps; j++) {
+        if (!matchIUPAC(al->u[j+p+al->uoff], al->v[j+q+al->voff])) {
+          if (j==0 || eopc == 'S') {
+            ssteps++;
+          } else {
+            cur = appendMeopToken(meopstr, cur, eopc, msteps);
+            msteps=0;
+            ssteps=1;
+          }
+          eopc = 'S';
+        } else {
+          if (j==0 || eopc == 'M') {
+            msteps++;
+          } else {
+            cur = appendMeopToken(meopstr, cur, eopc, ssteps);
+            msteps=1;
+            ssteps=0;
+          }
+          eopc = 'M';
+        }
+      }
+      /* the run that is still open is emitted below */
+      steps = msteps + ssteps;
+      assert(msteps == 0 || ssteps == 0);
+      //set string ptrs
+      p+=j;
+      q+=j;
+    }
+    //if deletion occured
+    if (al->meops[i].eop == Deletion) {
+      eopc = 'D';
+      steps = al->meops[i].steps;
+      q+=steps;
+    }
+    //if insertions occured
+    if(al->meops[i].eop == Insertion) {
+      eopc = 'I';
+      steps = al->meops[i].steps;
+      p+=steps;
+    }
+
+    cur = appendMeopToken(meopstr, cur, eopc, steps);
+  }
+
+  /* trailing clip */
+  if(rightclip || (leftclip && rev)) {
+    steps = (rev) ? leftclip : rightclip;
+    cur = appendMeopToken(meopstr, cur, 'C', steps);
+  }
+  return meopstr;
+}
+
+/*
+ * mdstring
+ *
+ * Builds a newly allocated MD-style difference string for the alignment:
+ * the length of each run of matching positions, the v character at each
+ * mismatch, and "^" followed by the v characters of each Deletion.
+ * Insertions only advance the position in u. A "0" is written wherever
+ * two non-match items would otherwise touch and at a non-match end.
+ * With rev set the multi edit operations are visited from last to first.
+ *
+ * Returns NULL if the allocation fails; the caller frees the result.
+ */
+char*
+mdstring(Alignment *al, unsigned char rev) {
+  Uint i, j, k, q=0, p=0, cur=0, msteps=0, ssteps=0;
+  char *mdstr;
+  char eopc=0;
+
+  /* zero-filled, so the result is terminated at every point in time; the
+   * slack covers short results such as "0" or "0^A0", which do not fit
+   * into 3*(vlen+ulen)+1 bytes */
+  mdstr = calloc(3*(al->vlen+al->ulen)+5, sizeof(char));
+  if(mdstr == NULL) return NULL;
+
+  for(k=0; k < al->numofmeops; k++) {
+    i = (rev) ? al->numofmeops - k - 1 : k;
+
+    if (al->meops[i].eop == Replacement) {
+      for (j=0; j < al->meops[i].steps; j++) {
+        if (!matchIUPAC(al->u[j+p+al->uoff], al->v[j+q+al->voff])) {
+          /* flush the run of matches that ends here */
+          if(msteps) {
+            cur += sprintf(&mdstr[cur], "%u", msteps);
+            msteps = 0;
+          }
+          /* no match in front of this mismatch: write an explicit 0 */
+          if(eopc != 'M') {
+            mdstr[cur++] = '0';
+          }
+          mdstr[cur++] = al->v[j+q+al->voff];
+          ssteps++;
+          eopc = 'S';
+        } else {
+          if (msteps) {
+            msteps++;
+          } else {
+            msteps=1;
+            ssteps=0;
+          }
+          eopc = 'M';
+        }
+      }
+      assert(msteps == 0 || ssteps == 0);
+      p+=j;
+      q+=j;
+    }
+
+    if (al->meops[i].eop == Deletion) {
+      /* a deletion is always preceded by a match count, possibly 0 */
+      if (msteps) {
+        cur += sprintf(&mdstr[cur], "%u", msteps);
+        msteps = 0;
+      } else {
+        mdstr[cur++] = '0';
+      }
+
+      eopc = 'D';
+      mdstr[cur++] = '^';
+      for(j=0; j < al->meops[i].steps; j++) {
+        mdstr[cur++] = al->v[j+q+al->voff];
+      }
+      q+=al->meops[i].steps;
+    }
+
+    /* insertions do not show up in the string and leave eopc untouched */
+    if(al->meops[i].eop == Insertion) {
+      p+=al->meops[i].steps;
+    }
+  }
+
+  if(eopc != 'M') {
+    mdstr[cur++] = '0';
+  }
+
+  if (msteps) {
+    cur += sprintf(&mdstr[cur], "%u", msteps);
+  }
+
+  return mdstr;
+}
+
+
+/*-------------------------- bl_cigarGetAlignString --------------------------
+ *
+ * @brief decode a cigar string into one character per operation step:
+ *        'S' for S, 'M' for M, N, X and =, 'D' for D and 'I' for I.
+ *        H and P consume their count without producing output. X and =
+ *        are handled like M, as in bl_cigarGetAlignLen; before, these
+ *        four letters ended up in the number buffer and made the next
+ *        count wrong.
+ *        Returns a newly allocated string to be freed by the caller, or
+ *        NULL if the cigar holds no producing operation or memory runs
+ *        out.
+ * @author Steve Hoffmann
+ *
+ */
+
+char*
+bl_cigarGetAlignString(char *cigar) {
+  Uint i, len, allen=0, cur=0;
+  int nops;
+  char opch;
+  char *buffer, *grown, *string = NULL;
+
+  len = strlen(cigar);
+  /* +1 keeps the number buffer terminated in every case */
+  buffer = calloc(len+1, sizeof(char));
+  if(buffer == NULL) return NULL;
+
+  for(i=0; i < len; i++) {
+    switch (cigar[i]) {
+      case 'S':
+        opch = 'S';
+        break;
+      case 'M':
+      case 'N':
+      case 'X':
+      case '=':
+        opch = 'M';
+        break;
+      case 'D':
+        opch = 'D';
+        break;
+      case 'I':
+        opch = 'I';
+        break;
+      case 'H':
+      case 'P':
+        opch = 0;
+        break;
+      default :
+        /* part of the count in front of the next operation */
+        buffer[cur++] = cigar[i];
+        continue;
+    }
+
+    /* an operation letter closes the count collected so far */
+    nops = atoi(buffer);
+    memset (buffer, 0, len);
+    cur = 0;
+
+    if(opch == 0) continue;
+    if(nops < 0) nops = 0;
+
+    /* keep the old block reachable in case realloc fails */
+    grown = realloc(string, allen+nops+1);
+    if(grown == NULL) {
+      free(string);
+      free(buffer);
+      return NULL;
+    }
+    string = grown;
+    memset(&string[allen], opch, nops);
+    allen += nops;
+    string[allen] = 0;
+  }
+
+  free(buffer);
+  return string;
+}
+
+
+/*--------------------------- bl_cigarGetAlignLen ----------------------------
+ *
+ * @brief sum up the counts of the cigar operations M, X, = and D.
+ *        The counts of I, N, S, H and P are read and dropped; every other
+ *        character is collected as part of the next count.
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_cigarGetAlignLen(char *cigar) {
+  Uint pos, cigarlen, total=0, fill=0;
+  char *numbuf;
+  char c;
+
+  cigarlen = strlen(cigar);
+  numbuf = calloc(cigarlen, sizeof(char));
+
+  for(pos=0; pos < cigarlen; pos++) {
+    c = cigar[pos];
+
+    if(c == 'M' || c == 'X' || c == '=' || c == 'D') {
+      /* operations that contribute to the alignment length */
+      total += atoi(numbuf);
+    } else if(c != 'I' && c != 'N' && c != 'S' && c != 'H' && c != 'P') {
+      /* not an operation letter: keep it for the pending count */
+      numbuf[fill++] = c;
+      continue;
+    }
+
+    /* every operation letter starts a fresh count */
+    memset(numbuf, 0, cigarlen);
+    fill = 0;
+  }
+
+  free(numbuf);
+  return total;
+}
+
+/*
+ * bl_mdGetDiffString
+ *
+ * Expands an MD string into one character per position: 'M' for every
+ * position covered by a number, 'D' for every letter that follows a '^'
+ * and the letter itself for every other letter. A number ends the
+ * deletion started by '^'.
+ *
+ * Returns a newly allocated string or NULL if MD yields no position.
+ */
+char*
+bl_mdGetDiffString(char *MD) {
+  Uint i, k=0, MDlen, allen=0, buffersize=100;
+  int nops;
+  char chr;
+  unsigned char del = 0;
+  char *buffer = NULL;
+  char *alignstr = NULL;
+
+  buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+  memset(buffer, 0, sizeof(char)*buffersize);
+  MDlen = strlen(MD);
+
+  /* i == MDlen is an extra round that flushes a trailing number */
+  for(i=0; i <= MDlen; i++) {
+
+    /* the cast keeps isalpha defined for characters above 127 */
+    if(i < MDlen && !isalpha((unsigned char)MD[i]) && MD[i] != '^') {
+      if(k >= buffersize-2) {
+        buffersize += 100;
+        buffer = ALLOCMEMORY(space, buffer, char, buffersize);
+      }
+      buffer[k] = MD[i];
+      buffer[k+1] = 0;
+      k++;
+      del = 0;
+      continue;
+    }
+
+    /* a letter, a '^' or the end closes the number collected so far */
+    nops = atoi(buffer);
+    if(nops > 0) {
+      alignstr = ALLOCMEMORY(space, alignstr, char, allen+nops+1);
+      memset(&alignstr[allen], 'M', nops);
+      alignstr[allen+nops] = 0;
+      allen+=nops;
+    }
+    memset(buffer, 0, sizeof(char)*buffersize);
+    k=0;
+
+    if(i == MDlen) break;
+
+    if(MD[i] == '^') {
+      del = 1;
+    } else {
+      chr = (del) ? 'D' : MD[i];
+      alignstr = ALLOCMEMORY(space, alignstr, char, allen+2);
+      alignstr[allen] = chr;
+      alignstr[allen+1] = 0;
+      allen += 1;
+    }
+  }
+
+  FREEMEMORY(space, buffer);
+  return alignstr;
+}
+
+/*
+ * cigarstring
+ *
+ * Builds a newly allocated string of "<steps><op>" items for the
+ * alignment: 'M' for a Replacement (matches and mismatches are not told
+ * apart), 'D' for a Deletion, 'I' for an Insertion and clipch for the
+ * clipped ends. With rev set the multi edit operations are visited from
+ * last to first and the two clip lengths swap sides.
+ *
+ * Returns NULL if the allocation fails; the caller frees the result.
+ */
+char*
+cigarstring(Alignment *al, Uint leftclip, Uint rightclip, char clipch, unsigned char rev) {
+  Uint i, k, cur=0, steps;
+  char *meopstr;
+  char eopc=0;
+
+  /* the extra bytes cover "0<clipch>" items of a clip of length zero */
+  meopstr = malloc(sizeof(char)*(3*(al->vlen+al->ulen+leftclip+rightclip)+9));
+  if(meopstr == NULL) return NULL;
+  /* stay a valid (empty) string even if no item is emitted below */
+  meopstr[0] = '\0';
+
+  if(leftclip || (rightclip && rev)) {
+    steps = (rev) ? rightclip : leftclip;
+    cur += sprintf(&meopstr[cur], "%u%c", steps, clipch);
+  }
+
+  for(k=0; k < al->numofmeops; k++) {
+    i = (rev) ? al->numofmeops - k - 1 : k;
+    steps = al->meops[i].steps;
+
+    switch(al->meops[i].eop) {
+      case Replacement:
+        /* the former per-step loop counted every step as 'M'; its flush
+         * branch could never run. An empty Replacement keeps the
+         * previous operation character, as before. */
+        if(steps > 0) eopc = 'M';
+        break;
+      case Deletion:
+        eopc = 'D';
+        break;
+      case Insertion:
+        eopc = 'I';
+        break;
+      default:
+        steps = 0;
+        break;
+    }
+
+    cur += sprintf(&meopstr[cur], "%u%c", steps, eopc);
+  }
+
+  if(rightclip || (leftclip && rev)) {
+    steps = (rev) ? leftclip : rightclip;
+    cur += sprintf(&meopstr[cur], "%u%c", steps, clipch);
+  }
+
+  return meopstr;
+}
+
+/* prints the multieop list of each of the size alignments in al to stdout */
+void
+showDynmultieoplist(Alignment* al, int size) {
+  Alignment *cursor = al;
+  int left = size;
+
+  while(left > 0) {
+    showmultieoplist(stdout, cursor);
+    cursor++;
+    left--;
+  }
+}
+
+/*
+ * bl_showAlignRows
+ *
+ * Helper shared by showAlign() and showAlignLF(). Builds the three rows
+ * of the visual alignment (u, match markers, v) and writes them to dev in
+ * chunks of 60 columns. Each row is followed by lf; the very last v row
+ * gets its lf only if closing is set, in which case one more lf ends the
+ * output. Nothing is written for an alignment without edit operations or
+ * if memory runs out.
+ */
+static void
+bl_showAlignRows(Alignment *al, FILE *dev, char lf, unsigned char closing) {
+  Uint i, j, k, nlines;
+  Uint p = 0, q = 0, r = 0;
+  int len;
+  char *utemp, *vtemp, *comp;
+
+  if(al->numofmeops == 0) return;
+
+  /* +1: room for the terminator when every column is an indel and the
+   * rows are ulen+vlen characters long */
+  utemp = malloc(sizeof(char)*(al->vlen+al->ulen+1));
+  vtemp = malloc(sizeof(char)*(al->vlen+al->ulen+1));
+  comp = malloc(sizeof(char)*(al->vlen+al->ulen+1));
+
+  if(utemp == NULL || vtemp == NULL || comp == NULL) {
+    free(utemp);
+    free(vtemp);
+    free(comp);
+    return;
+  }
+
+  /* p and q walk through u and v, r is the current output column */
+  for (i=0; i < al->numofmeops; i++) {
+    for (j=0; j < al->meops[i].steps; j++) {
+      switch(al->meops[i].eop) {
+        case Replacement:
+          utemp[r] = al->u[p+al->uoff];
+          vtemp[r] = al->v[q+al->voff];
+          /* only real matches get a marker */
+          comp[r] = matchIUPAC(utemp[r], vtemp[r]) ? '|' : ' ';
+          p++;
+          q++;
+          break;
+        case Deletion:
+          utemp[r] = '-';
+          vtemp[r] = al->v[q+al->voff];
+          comp[r] = ' ';
+          q++;
+          break;
+        case Insertion:
+          utemp[r] = al->u[p+al->uoff];
+          vtemp[r] = '-';
+          comp[r] = ' ';
+          p++;
+          break;
+        default:
+          /* unknown operation: produces no column */
+          continue;
+      }
+      r++;
+    }
+  }
+
+  utemp[r] = '\0';
+  vtemp[r] = '\0';
+  comp[r] = '\0';
+
+  nlines = r/60;
+  nlines += (r % 60) ? 1 : 0;
+
+  for(k=0; k < nlines; k++) {
+    /* the last chunk may be shorter than 60 columns */
+    len = (r - k*60 < 60) ? (int)(r - k*60) : 60;
+    fprintf(dev, "%.*s%c", len, &utemp[k*60], lf);
+    fprintf(dev, "%.*s%c", len, &comp[k*60], lf);
+    fprintf(dev, "%.*s", len, &vtemp[k*60]);
+    if(closing || k+1 < nlines) {
+      fprintf(dev, "%c", lf);
+    }
+  }
+
+  if(closing) {
+    fprintf(dev, "%c", lf);
+  }
+
+  free(utemp);
+  free(comp);
+  free(vtemp);
+}
+
+/*
+ * showAlign
+ *
+ * Writes the visual representation of al to dev: blocks of three lines
+ * (u, match markers, v) of at most 60 columns, followed by an empty line.
+ */
+void
+showAlign(Alignment* al, FILE *dev) {
+  bl_showAlignRows(al, dev, '\n', 1);
+}
+
+/*
+ * showAlignLF
+ *
+ * Like showAlign, but the rows are separated by the character lf and
+ * nothing is written after the last v row.
+ */
+void
+showAlignLF(Alignment* al, FILE *dev, char lf) {
+  bl_showAlignRows(al, dev, lf, 0);
+}
+
+/*
+ * insertEop
+ *
+ * Appends one edit operation to al: the last multieop grows by one step
+ * if it is of the same type, otherwise a new multieop with a single step
+ * is opened behind it. The meops array is not resized here.
+ */
+void
+insertEop(Alignment *al, Eoptype eop) {
+  Multieop *slot;
+
+  /* same type as the last multieop: just extend it */
+  if (al->numofmeops > 0 && al->meops[al->numofmeops-1].eop == eop) {
+    al->meops[al->numofmeops-1].steps++;
+    return;
+  }
+
+  /* first operation or change of type: open the next multieop */
+  slot = &al->meops[al->numofmeops];
+  slot->eop = eop;
+  slot->steps = 1;
+  al->numofmeops++;
+}
+
+
+/* reverses the order of the multi edit operations of al in place */
+void
+revMeops(Alignment *al) {
+  Multieop *front, *back, held;
+
+  if (al->numofmeops == 0) return;
+
+  front = al->meops;
+  back = al->meops + (al->numofmeops-1);
+
+  /* swap from both ends until the two cursors meet */
+  while (front < back) {
+    held = *front;
+    *front = *back;
+    *back = held;
+
+    front++;
+    back--;
+  }
+}
+
+
+/*
+ * getValignlen
+ *
+ * Returns the number of steps of all Replacements and Deletions of al;
+ * Insertions are not counted.
+ */
+Uint
+getValignlen(Alignment *al) {
+  Uint n, total=0;
+  Multieop *op = al->meops;
+
+  for(n=al->numofmeops; n > 0; n--, op++) {
+    if(op->eop == Replacement || op->eop == Deletion) {
+      total += op->steps;
+    }
+  }
+
+  return total;
+}
+
+/*
+ * getUalignlen
+ *
+ * Returns the number of steps of all Replacements and Insertions of al;
+ * Deletions are not counted.
+ */
+Uint
+getUalignlen(Alignment *al) {
+  Uint n, total=0;
+  Multieop *op = al->meops;
+
+  for(n=al->numofmeops; n > 0; n--, op++) {
+    if(op->eop == Replacement || op->eop == Insertion) {
+      total += op->steps;
+    }
+  }
+
+  return total;
+}
+
+
+
diff --git a/segemehl/libs/alignment.h b/segemehl/libs/alignment.h
new file mode 100644
index 0000000..31fb9da
--- /dev/null
+++ b/segemehl/libs/alignment.h
@@ -0,0 +1,71 @@
+#ifndef ALIGNMENT_H
+#define ALIGNMENT_H
+
+/*
+ *
+ * alignment.h
+ * alignment representation
+ *
+ * An alignment of two sequences u and v is stored as a list of
+ * "multi edit operations": runs of equal edit operations together
+ * with their length.
+ *
+ * idea:
+ * Stephan Kurtz, Gordon Gremme. Foundations
+ * of sequence analysis, University Hamburg, 2005
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 02/03/2009 11:56:27 AM CET
+ *
+ */
+
+#include "basic-types.h"
+
+typedef enum
+{
+ Replacement, Deletion, Insertion /* Replacement: one symbol of u and of v (match or mismatch); Deletion: one symbol of v only; Insertion: one symbol of u only */
+} Eoptype;
+
+typedef struct
+{
+ Eoptype eop; /* type of the edit operation */
+ Uint steps; /* number of consecutive operations of this type */
+} Multieop;
+
+typedef struct {
+ char *u; /* first sequence */
+ char *v; /* second sequence */
+ Uint ulen; /* length of u; used to size output buffers */
+ Uint vlen; /* length of v; used to size output buffers */
+
+ /*start of alignment (use in approx string matching, local align)*/
+ Uint uoff; /* index in u of the first aligned symbol */
+ Uint voff; /* index in v of the first aligned symbol */
+ Multieop *meops; /* the multi edit operations, in alignment order */
+ Uint numofmeops; /* number of entries used in meops */
+
+} Alignment;
+
+
+void copyAlignment(Alignment *to, Alignment *from);
+void showmultieoplist(FILE *dev, Alignment *al); /* prints the multieops as a bracketed list */
+void showDynmultieoplist(Alignment* al, int size); /* showmultieoplist on stdout for an array of size alignments */
+void showAlign(Alignment* al, FILE *dev); /* prints u, match markers and v in lines of 60 columns */
+void showAlignLF(Alignment* al, FILE *dev, char); /* like showAlign with the given character as line separator */
+void initAlignment(Alignment *al, char *u, Uint ulen, Uint uoff, char *v, Uint vlen, Uint voff);
+void insertEop(Alignment *al, Eoptype eop); /* appends one edit operation, merging it into the last multieop if the type is equal */
+void revMeops(Alignment *al); /* reverses the order of the multieops in place */
+void wrapAlignment(Alignment *al);
+Uint getEdist(Alignment *al);
+Uint getBisulfiteMismatches(Alignment *al, Uint bisulfite);
+Uint getWrongStrandBisulfiteMismatches(Alignment *al, Uint bisulfite);
+void countEops(Alignment *al, Uint *mat, Uint *mis, Uint *ins, Uint *del);
+char * multieopstring(Alignment *al, Uint leftpad, Uint rightpad, unsigned char rev); /* allocated string of "<op><steps>;" items */
+Uint getUalignlen(Alignment *al); /* steps of all Replacements and Insertions */
+Uint getValignlen(Alignment *al); /* steps of all Replacements and Deletions */
+int getSubstringEdist(Alignment *al, Uint u, Uint v);
+int getAlignScore(Alignment *al, int *scores, int indel);
+char* cigarstring(Alignment *al, Uint leftpad, Uint rightpad, char clipch, unsigned char rev); /* allocated string of "<steps><op>" items */
+char* mdstring(Alignment *al, unsigned char rev); /* allocated MD-style difference string */
+Uint bl_cigarGetAlignLen(char *cigar); /* sum of the M, X, = and D counts of a cigar string */
+char* bl_cigarGetAlignString(char *cigar); /* cigar string expanded to one character per step */
+char* bl_mdGetDiffString(char *MD); /* MD string expanded to one character per position */
+char* getNTcodekey(void *space);
+void getSoftClipScores(Alignment *al, int polyAlen, int *scores, int indel, int *pAscr, int *adscr, int *adlen) ;
+#endif
diff --git a/segemehl/libs/aluruSort.c b/segemehl/libs/aluruSort.c
new file mode 100644
index 0000000..08010f5
--- /dev/null
+++ b/segemehl/libs/aluruSort.c
@@ -0,0 +1,1842 @@
+
+/*
+ * aluruSort.c
+ * implementation of
+ * space efficient linear time construction
+ * of suffix arrays
+ *
+ * Ko, Pang and Aluru, Srinivas
+ * Iowa State University
+ *
+ * kind support provided by RSR:
+ * Pravda et Tanger-Glasgow sur couleur3.ch
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 07/11/2007 05:15:28 PM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 55 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-09-11 13:39:04 +0200 (Thu, 11 Sep 2008) $
+ *
+ * Id: $Id: aluruSort.c 55 2008-09-11 11:39:04Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/trunk/libs/aluruSort.c $
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "basic-types.h"
+#include "memory.h"
+#include "mathematics.h"
+#include "aluruSort.h"
+#include "sort.h"
+#include "bitArray.h"
+#include "info.h"
+#include "debug.h"
+
+#define INTSIZE 64
+
+
+/*
+ * getinterval
+ *
+ * Stores the smallest and the largest of the len values of s in *min
+ * and *max. s must hold at least one value.
+ *
+ * Plain comparisons replace the former sign-mask arithmetic, which relied
+ * on the result of right-shifting negative values and on a fixed width
+ * of 64 bits.
+ */
+inline void getinterval(Uint *s, Uint len, Uint *min, Uint *max) {
+  Uint i;
+  Uint lo = s[0];
+  Uint hi = s[0];
+
+  for(i=1; i < len; i++) {
+    if(s[i] > hi) hi = s[i];
+    if(s[i] < lo) lo = s[i];
+  }
+
+  *max = hi;
+  *min = lo;
+}
+
+/*
+ * getAluruArray
+ *
+ * Returns a newly allocated array with room for len entries that holds,
+ * in ascending order, the positions of all characters of s that differ
+ * from delim. The number of positions stored is not reported; entries
+ * behind the last stored position are not initialized.
+ */
+ Uint*
+getAluruArray(void *space, char *s, Uint len, char delim)
+{
+  Uint i, k=0;
+  Uint *ptr;
+
+  /* the elements are Uint; the former request was sized with Uint* */
+  ptr = ALLOCMEMORY(space, NULL, Uint, len);
+
+  for(i=0; i < len; i++) {
+    if(s[i] != delim){
+      ptr[k]= i;
+      k++;
+    }
+  }
+  return ptr;
+}
+
+
+
+
+/*-------------------------------- distCount ---------------------------------
+ *
+ * @brief helper function to accumulate Qdistances: counts how often each
+ *        distance 1..maxdist occurs in Qdist and returns, in a new array
+ *        of maxdist+1 entries, for every distance d at index d-1 the
+ *        number of entries with a smaller distance (exclusive prefix
+ *        sums). Leading zero distances are skipped.
+ * @author Steve Hoffmann
+ *
+ */
+
+ Uint*
+distCount (void *space, Uint *Qdist, Uint len, Uint maxdist)
+{
+  Uint i,
+       j=0,
+       *distAcc,
+       temp,
+       prev;
+
+  distAcc = ALLOCMEMORY(space, NULL, Uint, maxdist+1);
+  memset(distAcc, 0, sizeof(Uint)*(maxdist+1));
+
+  /* bounded by len: without the bound the scan left the array if all
+   * distances were zero */
+  while(j < len && Qdist[j] == 0) j++;
+
+  for(i=j; i < len; i++) {
+    distAcc[Qdist[i]-1]++;
+  }
+
+  /* turn the counts into start offsets */
+  prev = distAcc[0];
+  distAcc[0] = 0;
+
+  for(i=1; i <= maxdist; i++) {
+    temp = distAcc[i];
+    distAcc[i] = prev + distAcc[i-1];
+    prev = temp;
+  }
+
+  return distAcc;
+}
+
+
+/*---------------------------------- Qdist -----------------------------------
+ *
+ * @brief getting the Qdistance of each sequence member: the distance to
+ *        the closest position to the left whose bit in cl equals Q, or 0
+ *        if there is no such position.
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint*
+Qdist(void *space, bitarray cl, Uint len, unsigned char Q) {
+  Uint pos;
+  Uint lastQ = 0;
+  unsigned char seen = 0;
+  Uint *distances;
+
+  distances = ALLOCMEMORY(space, NULL, Uint, len);
+
+  for (pos=0; pos < len; pos++) {
+    /* the distance refers to positions strictly left of pos */
+    if (seen) {
+      distances[pos] = pos - lastQ;
+    } else {
+      distances[pos] = 0;
+    }
+
+    if (getbit(cl, (Uint) pos) == Q) {
+      lastQ = pos;
+      seen = 1;
+    }
+  }
+
+  return distances;
+}
+
+
+
+
+/*--------------------------------- Qmaxdist ---------------------------------
+ *
+ * @brief getting the max(d(s,l)) \forall s,l \in S: scans the positions
+ *        len-2 down to 0 and returns the largest value reached by a
+ *        counter that starts at 1, is reset to 1 at every position whose
+ *        bit in cl equals Q and grows by one at every other position.
+ *        Returns 0 for len < 2.
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+Qmaxdist(void *space, bitarray cl, Uint len, unsigned char Q) {
+  Lint i;
+  Lint dist=0,
+       pre=1;
+
+  /* the start index is computed signed: len-2 evaluated as Uint wrapped
+   * to a huge value for len < 2 */
+  for (i=(Lint)len-2; i >= 0; i--) {
+    if(dist < pre) dist = pre;
+
+    /* plain branches replace the former sign-mask arithmetic */
+    if((getbit(cl, (Uint) i) != 0) == (Q != 0)) {
+      pre = 1;
+    } else {
+      pre++;
+    }
+  }
+
+  return (Uint)dist;
+}
+
+
+/*
+ * classifyint
+ *
+ * Classifies the positions of the integer sequence s: position i gets
+ * bit 1 (counted in *noL) if s[i] is greater than the next differing
+ * value to its right and bit 0 (counted in *noS) if it is smaller. The
+ * last position joins the class that is smaller so far (bit 1 on a tie).
+ * A run of equal values directly in front of the last position is left
+ * at bit 0 and not counted. Returns the new bit array.
+ */
+bitarray
+classifyint(void *space, Uint* s, Uint len, Uint *noL, Uint *noS) {
+  Uint i, j=0, k;
+  int type;
+  bitarray a;
+
+  a = initbitarray(space, len);
+  setbitarray(a, len, 0);
+
+  *noL = 0;
+  *noS = 0;
+
+  /* nothing to classify; len-1 below would wrap around */
+  if(len == 0) return a;
+
+  /* j counts the equal values waiting for the next differing value */
+  for (i=0; i+1 < len; i++) {
+    if (s[i] == s[i+1]) {
+      j++;
+      continue;
+    }
+
+    type = (s[i] > s[i+1]) ? 1 : 0;
+    for(k=i-j; k <= i; k++) {
+      setbit(a,k,type);
+    }
+    if(type) {
+      *noL += j+1;
+    } else {
+      *noS += j+1;
+    }
+    j=0;
+  }
+
+  if(*noS < *noL) {
+    setbit(a, len-1, 0);
+    *noS+=1;
+  }
+  else
+  {
+    setbit(a, len-1, 1);
+    *noL+=1;
+  }
+
+  return a;
+}
+
+
+
+/*
+ * classify
+ *
+ * Classifies the positions of the character sequence s: position i gets
+ * bit 1 (counted in *noL) if s[i] is greater than the next differing
+ * character to its right and bit 0 (counted in *noS) if it is smaller.
+ * The last position joins the class that is smaller so far (bit 1 on a
+ * tie). A run of equal characters directly in front of the last position
+ * is left at bit 0 and not counted. Returns the new bit array.
+ */
+bitarray
+classify(void *space, char* s, Uint len, Uint *noL, Uint *noS) {
+  Uint i, j=0, k;
+  int type;
+  bitarray a;
+
+  a = initbitarray(space, len);
+  setbitarray(a, len, 0);
+
+  *noL = 0;
+  *noS = 0;
+
+  /* nothing to classify; len-1 below would wrap around */
+  if(len == 0) return a;
+
+  /* j counts the equal characters waiting for the next differing one */
+  for (i=0; i+1 < len; i++) {
+    if (s[i] == s[i+1]) {
+      j++;
+      continue;
+    }
+
+    type = (s[i] > s[i+1]) ? 1 : 0;
+    for(k=i-j; k <= i; k++) {
+      setbit(a,k,type);
+    }
+    if(type) {
+      *noL += j+1;
+    } else {
+      *noS += j+1;
+    }
+    j=0;
+  }
+
+  if(*noS < *noL) {
+    setbit(a, len-1, 0);
+    *noS+=1;
+  }
+  else
+  {
+    setbit(a, len-1, 1);
+    *noL+=1;
+  }
+
+  return a;
+}
+
+
+/*
+ * countingsort
+ *
+ * Sorts the positions 0..len-1 of s by their character with a stable
+ * counting sort and returns them in a new array. Characters are mapped
+ * to 0..alphasize-1 by subtracting alphaoffset; alphasize must be at
+ * least 1. In bckts (len bits) the bit of the last position of every
+ * bucket is set.
+ */
+ Uint*
+countingsort(void *space,
+    char *s,
+    Uint len,
+    bitarray bckts,
+    Uint alphasize,
+    Uint alphaoffset)
+{
+  Uint i,
+       resc,
+       offset,
+       *buffer,
+       *A;
+
+  /* one counter per character; the former request of len entries was too
+   * small whenever alphasize exceeded len */
+  buffer = ALLOCMEMORY(space, NULL, Uint, alphasize);
+  A = ALLOCMEMORY(space, NULL, Uint , len);
+
+  /*use buffer to count chars first*/
+  memset(buffer, 0, sizeof(Uint)*alphasize);
+  for(i=0; i < len; i++) {
+    resc = (Uint) s[i] - alphaoffset;
+    buffer[resc]++;
+  }
+
+  /* counts become start offsets; the loop has to start at 1, as in
+   * countingsortint, since starting at 0 reads buffer[-1] and drops the
+   * size of the first bucket */
+  offset = buffer[0];
+  buffer[0]=0;
+
+  for(i=1; i < alphasize; i++) {
+    resc = buffer[i];
+    buffer[i] = offset + buffer[i-1];
+    offset = resc;
+  }
+
+  for(i=0; i < len; i++) {
+    resc = (Uint) s[i] - alphaoffset;
+    A[buffer[resc]] = i;
+    buffer[resc]++;
+  }
+
+  /*the bucket borders*/
+  setbitarray(bckts, len, 0);
+  for(i=0; i < alphasize; i++) {
+    /* buffer[i] is 0 as long as only empty buckets were seen */
+    if(buffer[i] > 0) {
+      setbit(bckts, buffer[i]-1, 1);
+    }
+  }
+
+  FREEMEMORY(space, buffer);
+  return A;
+}
+
+/*-------------------------------- getlistsL ---------------------------------
+ *
+ * @brief walks A bucket by bucket from its last to its first entry and
+ *        moves every position with a distance > 0 to the next free slot
+ *        of the list of that distance (slots are taken from accdist).
+ *        Afterwards A holds the lists, each entry reduced by its
+ *        distance, and the bits in list mark the ends of the groups.
+ *        dist is overwritten with slot numbers; returns A.
+ *        NOTE(review): presumably dist and accdist are the results of
+ *        Qdist() and distCount() -- confirm against the caller.
+ *
+ */
+ Uint*
+getlistsL(void *space,
+ Uint *A,
+ Uint len,
+ Uint *dist,
+ Uint *accdist,
+ Uint maxdist,
+ bitarray bckts,
+ bitarray list,
+ Uint listlen)
+{
+ Lint i=len-1,
+ j,
+ pos,
+ tmp,
+ start,
+ end;
+ unsigned char firstelem;
+
+ NFO("getlistsL: memsetting list of %u elements\n", listlen);
+ setbitarray(list, listlen, 0);
+
+ NFO("getlistsL: iter from %lld down to 0\n", i);
+ while(i >= 0) {
+ end = i; /* last entry of the bucket handled in this round */
+ if(i > 0) {
+ if(!getbit(bckts, (Uint) i-1)) /* a set bit at i-1 means that a bucket starts at i */
+ firstelem = 0;
+ else
+ firstelem = 1;
+ } else
+ firstelem =1;
+
+ while (!firstelem) {
+
+ tmp = dist[A[i]];
+/* printf("tmp %d\n", tmp);*/
+ if (tmp > 0) {
+ pos = accdist[tmp-1]; /* next free slot in the list of distance tmp */
+ dist[A[i]]=pos;
+ setbit(list, pos, 1);
+ accdist[tmp-1] += 1;
+ } else {
+ dist[A[i]]= -1; /* distance 0: the position is in no list */
+ }
+
+ i--;
+
+ if(i > 0) {
+ if(!getbit(bckts, (Uint) i-1))
+ firstelem = 0;
+ else
+ firstelem = 1;
+ } else {
+ firstelem = 1;
+ }
+ }
+
+ tmp = dist[A[i]]; /* same handling for the first entry of the bucket */
+ if(tmp != 0) {
+ pos = accdist[tmp-1];
+ dist[A[i]] = pos;
+ setbit(list, pos, 1);
+ accdist[tmp-1] += 1;
+
+ } else {
+ dist[A[i]] = (Uint)-1;
+ }
+
+ start=i;
+ for(j=end; j >= start; j--) { /* keep only the bit of the last slot of a run of neighbouring slots */
+ pos = dist[A[j]];
+ if (pos != (Uint)-1 && pos != listlen -1) {
+ if(getbit(list, (Uint) pos+1))
+ setbit(list, (Uint) pos, 0);
+ }
+ }
+ i--;
+ }
+
+ NFO("scanning A (%u elems)\n", len);
+ for(i=0; i < len; i++) { /* write every position to the slot it was given */
+ if(dist[i] != (Uint)-1) {
+ A[dist[i]] = i;
+ }
+ }
+
+ NFO("scanning accdist (%u elems) (1)\n", maxdist);
+ for(i=0; i < maxdist; i++) { /* accdist[i] is now the end of list i: mark it as border */
+ if(accdist[i] > 0) {
+ setbit(list, accdist[i]-1, 1);
+ } else {
+ setbit(list, accdist[i], 1);
+ }
+ }
+
+ NFO("scanning accdist (%u elems) (2)\n", maxdist);
+ for(i=0; i < maxdist; i++) { /* entries of list i are reduced by the distance i+1 */
+ if (i==0) {
+ j=0;
+ } else {
+ j = accdist[i-1];
+ }
+ while(j < accdist[i]) {
+ A[j] = A[j] -i -1;
+ j++;
+ }
+ }
+
+ MSG("getlistsL: exit\n");
+ return A;
+}
+
+/*-------------------------------- getlistsS ---------------------------------
+ *
+ * @brief walks A bucket by bucket from its first to its last entry and
+ *        moves every position with a distance > 0 to the next free slot
+ *        of the list of that distance (slots are taken from accdist).
+ *        Afterwards A holds the lists, each entry reduced by its
+ *        distance, and the bits in list mark the ends of the groups.
+ *        dist is overwritten with slot numbers; returns A.
+ *        NOTE(review): presumably dist and accdist are the results of
+ *        Qdist() and distCount() -- confirm against the caller.
+ *
+ */
+ Uint*
+getlistsS(void *space,
+ Uint *A,
+ Uint len,
+ Uint *dist,
+ Uint *accdist,
+ Uint maxdist,
+ bitarray bckts,
+ bitarray list,
+ Uint listlen)
+{
+ Uint i=0,
+ j,
+ pos,
+ tmp,
+ start,
+ end;
+
+ NFO("getlistsS: memsetting list of %u elements\n", listlen);
+ setbitarray(list, listlen, 0);
+
+ NFO("getlistsS: iter up to %u\n", len);
+ while(i < len) {
+ start = i; /* first entry of the bucket handled in this round */
+
+ while(getbit(bckts, (Uint) i) != 1 && i < len) { /* NOTE(review): the bit is read before i < len is tested */
+ tmp = dist[A[i]];
+ if(tmp != (Uint)-1 && tmp > 0) {
+ pos = accdist[tmp-1]; /* next free slot in the list of distance tmp */
+ dist[A[i]] = pos;
+ setbit(list, pos, 1);
+ accdist[tmp-1] += 1;
+ } else {
+ dist[A[i]] = (Uint)-1; /* the position is in no list */
+ }
+ i++;
+ }
+
+ tmp = dist[A[i]]; /* last entry of the bucket; NOTE(review): i == len is possible here if bit len-1 of bckts is not set */
+ if (tmp != 0) {
+ pos = accdist[tmp-1];
+ dist[A[i]] = pos;
+ setbit(list, pos, 1);
+ accdist[tmp-1] += 1;
+ } else {
+ dist[A[i]] = (Uint) -1;
+ }
+ end = i;
+
+ for(j= start; j < end; j++) { /* keep only the bit of the last slot of a run of neighbouring slots; entry end itself is not visited */
+ pos = dist[A[j]];
+ if (pos != (Uint)-1 && pos != listlen -1) {
+ if (getbit(list, (Uint) pos+1)) {
+ setbit(list,pos, 0);
+ }
+ }
+ }
+ i++;
+ }
+
+ MSG("getlistsS: scan A\n");
+ for(i=0; i < len; i++) { /* write every position to the slot it was given */
+ if(dist[i] != (Uint)-1)
+ A[dist[i]]=i;
+ }
+
+ MSG("getlistsS: set accidst\n");
+ for(i=0; i < maxdist; i++) { /* accdist[i] is now the end of list i: mark it as border */
+ if (accdist[i] == 0) NFO("getlistsS: i=%u accdist=0!!\n", i);
+ else
+ setbit(list, accdist[i]-1, 1);
+ }
+
+ for(i=0; i < maxdist; i++) { /* entries of list i are reduced by the distance i+1 */
+ j = (i==0) ? 0 : accdist[i-1];
+ while(j < accdist[i]) {
+ A[j] = A[j] - i -1;
+ j++;
+ }
+ }
+
+ MSG("getlistsS: exiting\n");
+ return A;
+}
+/*-------------------------------- sortlistS ---------------------------------
+ *
+ * @brief refines the buckets of B with the groups of list. The groups
+ *        are visited in order; a set bit in listb marks the last entry
+ *        of a group. The members of a group are moved to the front of
+ *        their bucket and split off as a bucket of their own, marked in
+ *        bckts. B is rewritten in place from the final bucket numbers.
+ *        len is the size of the position range, lenB the size of B.
+ *
+ */
+ void
+sortlistS(void *space,
+ Uint *B,
+ Uint lenB,
+ Uint len,
+ bitarray bckts,
+ Uint *list,
+ bitarray listb,
+ Uint listlen)
+{
+ Lint *rev,
+ *left,
+ bcktno,
+ i,
+ j,
+ new,
+ bcktright;
+
+ MSG("sortlistS: allocating stuff\n");
+ rev = ALLOCMEMORY(space, NULL, Lint, len); /* position -> index of the right end of its bucket, -1 if not in B */
+ left = ALLOCMEMORY(space, NULL, Lint, lenB); /* right end of a bucket -> its left end */
+
+ memset(rev, -1, len*sizeof(Lint));
+ memset(left, -1, lenB*sizeof(Lint));
+
+ bcktright= lenB -1;
+
+ NFO("sortlistS: iterating %u elems", lenB);
+ for(i=lenB-1; i > 0; i--) { /* scan B from the right to find the bucket borders */
+ rev[B[i]] = bcktright;
+ if(getbit(bckts, (Uint) i-1) == 1) { /* a bucket ends at i-1, so the current one starts at i */
+ left[bcktright] = i;
+ bcktright = i-1;
+ }
+ }
+
+ rev[B[0]] = bcktright;
+ left[bcktright] = 0;
+
+ NFO("sortlistS: looping %u elems", listlen);
+ i=0;
+ while (i < listlen) {
+ j=i;
+ while(!getbit(listb, (Uint)j)) { /* pass 1: count the members of the group per bucket */
+ left[rev[list[j]]] += 1;
+ j++;
+ }
+
+ left[rev[list[j]]] += 1; /* last entry of the group */
+
+ j=i;
+ while(!getbit(listb, (Uint) j)) { /* pass 2: the members get the end of the new front part as bucket number */
+ new = left[rev[list[j]]] - 1;
+ rev[list[j]] = new;
+ j++;
+ }
+
+ new = left[rev[list[j]]] - 1;
+ rev[list[j]] = new;
+
+ /*correct the values*/
+
+ j=i;
+ while (!getbit(listb, (Uint) j)) { /* pass 3: set up left[] for the new bucket and mark its border */
+ new = rev[list[j]];
+ if (left[new] == -1) {
+ left[new] = new;
+ } else {
+ left[new] -= 1;
+ }
+ setbit(bckts, new, 1);
+ j++;
+ }
+
+ /*last elem*/
+ new = rev[list[j]];
+ if(left[new] == -1) {
+ left[new] = new;
+ } else {
+ left[new] -=1;
+ }
+
+ setbit(bckts, new, 1);
+ i=j+1; /* continue with the next group */
+ }
+
+
+ NFO("sortlistS: iterating %u elems", len);
+ for(i=0; i < len; i++) { /* fill every bucket from its left end in ascending position order */
+ bcktno = rev[i];
+ if(bcktno > -1) {
+ B[left[bcktno]] = i;
+ left[bcktno] += 1;
+ }
+ }
+
+ MSG("sortlistsS: exiting happily!\n");
+ FREEMEMORY(space, rev);
+ FREEMEMORY(space, left);
+}
+/*-------------------------------- sortlistL ---------------------------------
+ *
+ * @brief refines the buckets of B with the groups of list. The groups
+ *        are visited in order; a set bit in listb marks the last entry
+ *        of a group. The members of a group are moved to the back of
+ *        their bucket and split off as a bucket of their own, marked in
+ *        bckts. B is rewritten in place from the final bucket numbers.
+ *        len is the size of the position range, lenB the size of B.
+ *
+ */
+ void
+sortlistL(void *space,
+ Uint *B,
+ Uint lenB,
+ Uint len,
+ bitarray bckts,
+ Uint *list,
+ bitarray listb,
+ Uint listlen)
+{
+ Lint *rev,
+ *right,
+ bcktno,
+ i,
+ j,
+ new,
+ bcktleft;
+
+ MSG("sortlistL: allocating stuff\n");
+ rev = ALLOCMEMORY(space, NULL, Lint, len); /* position -> index of the left end of its bucket, -1 if not in B */
+ right = ALLOCMEMORY(space, NULL, Lint, lenB); /* left end of a bucket -> its right end */
+
+ memset(rev, -1, len*sizeof(Lint));
+ memset(right, -1, lenB*sizeof(Lint));
+
+ NFO("sortlistL: iterating %u elems", lenB);
+ bcktleft=0;
+ for(i=0; i < lenB; i++) { /* scan B from the left to find the bucket borders */
+
+ rev[B[i]] = bcktleft;
+ if(getbit(bckts, (Uint) i) == 1) { /* the current bucket ends at i */
+ right[bcktleft] = i;
+ bcktleft = i+1;
+ }
+ }
+
+ NFO("sortlistL: looping %u elems", listlen);
+ i=0;
+ while (i < listlen) {
+
+ j=i;
+ while(getbit(listb, (Uint) j) == 0) { /* pass 1: count the members of the group per bucket */
+ right[rev[list[j]]] -= 1;
+ j++;
+ }
+
+ right[rev[list[j]]] -= 1; /* last entry of the group */
+
+ j=i;
+ while(getbit(listb, (Uint) j) == 0) { /* pass 2: the members get the start of the new back part as bucket number */
+ new = right[rev[list[j]]] + 1;
+ rev[list[j]] = new;
+ j++;
+ }
+
+ new = right[rev[list[j]]] + 1;
+ rev[list[j]] = new;
+
+ /*correct elems*/
+
+ j=i;
+ while (getbit(listb, (Uint) j) == 0) { /* pass 3: set up right[] for the new bucket and mark the border in front of it */
+ new = rev[list[j]];
+ if (right[new] == -1) {
+ right[new] = new;
+ } else {
+ right[new] += 1;
+ }
+ if(new > 0) {
+ setbit(bckts, new-1, 1);
+ }
+ j++;
+ }
+
+ /*correct last elem*/
+
+ new = rev[list[j]];
+ if(right[new] == -1) {
+ right[new] = new;
+ } else {
+ right[new] +=1;
+ }
+
+ if(new > 0) {
+ setbit(bckts, new-1, 1);
+ }
+
+ i=j+1; /* continue with the next group */
+ }
+
+ NFO("sortlistL: iterating %u elems", len);
+ for(i=0; i < len; i++) { /* fill every bucket from its right end, so positions end up descending inside a bucket */
+ bcktno = rev[i];
+ if(bcktno > -1) {
+ B[right[bcktno]] = i;
+ right[bcktno] -= 1;
+ }
+ }
+
+ MSG("sortlistsL: exiting happily!\n");
+ FREEMEMORY(space, rev);
+ FREEMEMORY(space, right);
+}
+
+
+/*
+ * countingsortint
+ *
+ * Sorts the positions 0..len-1 of the integer sequence s by their value
+ * with a stable counting sort and returns them in a new array. The
+ * values are mapped to 0..sigma-1 with sigma = max - min + 1. In bckts
+ * (len bits) the bit of the last position of every bucket is set.
+ */
+ Uint*
+countingsortint(void *space,
+    Uint *s,
+    Uint len,
+    bitarray bckts)
+{
+  Uint i,
+       resc,
+       offset,
+       *buffer,
+       *A,
+       min,
+       max,
+       sigma;
+
+  getinterval(s, len, &min, &max);
+  sigma = max - min +1;
+
+  MSG("countingsortint: init buffers and A\n");
+  /* one counter per value of the interval; the former request of len
+   * entries was too small whenever sigma exceeded len */
+  buffer = ALLOCMEMORY(space, NULL, Uint, sigma);
+  A = ALLOCMEMORY(space, NULL, Uint , len);
+
+  MSG( "setting buffer to zero\n");
+  /*use buffer to count chars first*/
+  memset(buffer, 0, sizeof(Uint)*sigma);
+
+  MSG("countsortint: scanning buffer (1 of 3)\n");
+  for(i=0; i < len; i++) {
+    resc = (Uint) (s[i] - min);
+    buffer[resc]++;
+  }
+
+  /* counts become start offsets */
+  offset = buffer[0];
+  buffer[0]=0;
+
+  MSG("countsortint: scanning buffer (2 of 3)\n");
+  for(i=1; i < sigma; i++) {
+    resc = buffer[i];
+    buffer[i] = offset + buffer[i-1];
+    offset = resc;
+  }
+
+  MSG("countsortint: scanning buffer (3 of 3)\n");
+  for(i=0; i < len; i++) {
+    resc = (Uint) (s[i] - min);
+    A[buffer[resc]] = i;
+    buffer[resc]++;
+  }
+
+  MSG("countsortint: scanning buffer (to set borders)\n");
+  /*the bucket borders*/
+  setbitarray(bckts, len, 0);
+  for(i=0; i < sigma; i++) {
+    /* min occurs in s, so buffer[i] is at least 1 here */
+    setbit(bckts, buffer[i]-1, 1);
+  }
+
+  MSG("countsortint: exiting\n");
+  FREEMEMORY(space, buffer);
+  return A;
+}
+
+/*------------------------------ substringsort -------------------------------
+ *
+ * @brief sorts the bucketno entries of A by the substrings of s that
+ *        start at them, one character offset (0..dist) per pass. Each
+ *        pass is a counting sort of the still unsorted ranges with the
+ *        key (character << 1) + (1 if the bit in cl is not set). skip
+ *        holds at a range start the end of the range (positive) or, as
+ *        negative value, where to go on for ranges that are finished.
+ *        Bucket ends are marked in bckts; returns A.
+ *        NOTE(review): the key is negative for characters below 0 --
+ *        presumably s holds 7 bit characters only; confirm.
+ *
+ */
+ Uint*
+substringsort(void *space,
+ char *s,
+ Uint *A,
+ bitarray cl,
+ Uint len,
+ bitarray bckts,
+ Uint bucketno,
+ Uint dist,
+ Uint Q)
+{
+ Uint bufferlen = 255 * 2, /* number of counters: two keys per character value */
+ i,
+ j=0,
+ idx,
+ offset;
+ Lint start,
+ end,
+ resc,
+ prevCount,
+ tempBucketTest=0,
+ *tmp,
+ *skip,
+ *buffer;
+ Lint type = (Q) ? 0 : 1;
+
+ MSG("setting bit array to zero\n");
+ setbitarray(bckts, bucketno, 0);
+
+ MSG("allocating space for buckets and buffers\n");
+ buffer = ALLOCMEMORY(space, NULL, Lint, bufferlen+1);
+ skip = ALLOCMEMORY(space, NULL, Lint, bucketno+1);
+ tmp = ALLOCMEMORY(space, NULL, Lint, bucketno+1);
+
+ MSG("memsetting\n");
+
+ memset(skip, 0, sizeof(Lint)*(bucketno+1));
+ skip[0] = bucketno; /* at the start all entries form one unsorted range */
+
+ for(i=0; i <= dist; i++) { /*offset*/
+ start =0;
+ offset =0; /*prevPos*/
+ while(start < bucketno) {
+ offset = start;
+ while (skip[start] < 0 && start < bucketno) { /* follow the negative links over finished ranges */
+ start = (Lint) -skip[start];
+ }
+ end = skip[start] - 1;
+ skip[offset] = -start; /* shortcut for the next pass */
+
+ memset(buffer, 0, sizeof(Lint)*bufferlen);
+
+ if(start < bucketno) {
+
+ for(j=start; j <=end; j++) { /*i*/
+ tempBucketTest++;
+ tmp[j] = A[j]; /* keep a copy of the range, A is overwritten below */
+
+ idx = A[j] + i;
+ resc = ((Lint) s[idx]) << 1;
+ resc += (!getbit(cl, (Uint) idx)) ? 1 : 0;
+ buffer[resc] += 1;
+ }
+
+ prevCount = buffer[0]; /* counts become start offsets inside A, beginning at start */
+ buffer[0] = start;
+
+ for(j=1; j < bufferlen; j++) {
+ resc = buffer[j];
+ buffer[j] = buffer[j-1]+prevCount;
+ prevCount = resc;
+ }
+
+ for(j=start; j <= end; j++) { /* distribute the copied entries by key */
+ tempBucketTest++;
+ idx = tmp[j] + i;
+ resc = ((Lint) s[idx]) << 1;
+ resc += (!getbit(cl, (Uint)idx)) ? 1 : 0;
+ A[buffer[resc]] = tmp[j];
+ buffer[resc]++;
+ }
+
+ /*bucket boundaries*/
+
+ j=1;
+ if(i > 0) {
+
+ if(buffer[type] > start) {
+ setbit(bckts, buffer[type]-1, 1);
+ skip[start] = -buffer[0];
+ }
+
+ for(j=1; j < bufferlen; j++) { /* buffer[j-1] is now the start and buffer[j] the end of the bucket of key j */
+
+ if(buffer[j] == buffer[j-1]+1) { /* a single entry: finished */
+ setbit(bckts, buffer[j]-1, 1);
+ skip[buffer[j-1]] = -buffer[j];
+
+ } else
+ if (buffer[j] > buffer[j-1] + 1) {
+ setbit(bckts, buffer[j]-1, 1);
+ resc = (Q) ? -((j & 1)^1): -(j & 1); /* resc is -1 or 0, chosen by the type bit of the key and by Q */
+ resc = (buffer[j] ^ resc) - resc; /* negates buffer[j] if resc is -1 */
+ skip[buffer[j-1]] = resc;
+ }
+ }
+ }
+ /*if first bucket greater start not empty*/
+ else {
+ if(buffer[type] > start) {
+ setbit(bckts, buffer[type]-1, 1);
+ skip[start] = -buffer[0];
+ }
+
+ for(j=1; j < bufferlen; j++) {
+ if(buffer[j] == buffer[j-1] +1) {
+ setbit(bckts, buffer[j]-1, 1);
+ skip[buffer[j-1]] = -buffer[j];
+ } else if(buffer[j] > buffer[j-1] +1) {
+ setbit(bckts, buffer[j]-1, 1);
+ skip[buffer[j-1]] = buffer[j]; /* in the first pass larger buckets always stay unsorted */
+ }
+ }
+ }
+ if(type && start == end) {
+ skip[start] = -(end - 1);
+ setbit(bckts, start, 1);
+ }
+
+ start = end + 1;
+ }
+ }
+ }
+
+ MSG("substring sort ... ok\n");
+ FREEMEMORY(space, skip);
+ FREEMEMORY(space, tmp);
+ FREEMEMORY(space, buffer);
+
+ return A;
+}
+
+
+/*
+ * arrayB: scans A in order and copies every entry whose class bit in cl
+ * equals the requested type (1 if Q is non-zero, 0 otherwise) into a newly
+ * allocated array B of lenB elements. Bucket borders set in bcktsA are
+ * carried over to bcktsB at the position of the last element copied so far.
+ *
+ * @param space   allocator handle passed through to ALLOCMEMORY
+ * @param A       input array of lenA entries
+ * @param lenA    number of entries in A
+ * @param lenB    capacity of the result array B
+ * @param bcktsA  bucket borders of A
+ * @param bcktsB  bucket borders of B (cleared, then filled here)
+ * @param cl      class bit array, indexed by the values stored in A
+ * @param Q       selects which class is copied
+ * @return the newly allocated array B; the caller releases it
+ */
+ Uint*
+arrayB (void *space,
+    Uint *A,
+    Uint lenA,
+    Uint lenB,
+    bitarray bcktsA,
+    bitarray bcktsB,
+    bitarray cl,
+    unsigned char Q)
+{
+  Uint i;
+  Uint *B;
+  Lint j = 0;
+  unsigned char type = Q ? 1 : 0;
+
+  NFO("arrayB: allocating B with %u elements\n", lenB);
+  B = ALLOCMEMORY(space, NULL, Uint, lenB);
+  memset(B, 0, sizeof(Uint)*lenB);
+  setbitarray(bcktsB, lenB, 0);
+
+  NFO("arrayB: iterating to lenA=%u\n", lenA);
+  for(i=0; i < lenA; i++) {
+    if (getbit(cl, (Uint) A[i]) == type) {
+      /* check the capacity before writing, not afterwards */
+      if (j >= (Lint) lenB) {
+        DBG("arrayB: j=%lld in B out of bounds!\n", j);
+      } else {
+        B[j] = A[i];
+        j++;
+      }
+    }
+    /*copy bckt-borders*/
+    if(getbit(bcktsA, (Uint)i) ==1 && j-1 >= 0) {
+      setbit(bcktsB, j-1, 1);
+    }
+  }
+
+  MSG("arrayB: exiting\n");
+  return B;
+}
+
+
+/*
+ * Tprime: builds the reduced text. First every position stored in B is
+ * given the number of bucket borders (bits set in bcktsB) seen before it;
+ * then the text positions 0..len-1 are scanned and, for each position of
+ * the selected class (class bit set if Q is non-zero, class bit clear
+ * otherwise), that number is appended to the result.
+ *
+ * @param space   allocator handle passed through to ALLOCMEMORY
+ * @param len     length of the text / of the class array cl
+ * @param B       positions of the selected class, lenB entries
+ * @param lenB    number of entries in B and capacity of the result
+ * @param bcktsB  bucket borders of B
+ * @param cl      class bit array over the text positions
+ * @param Q       selects which class is used
+ * @return newly allocated array of lenB entries; the caller releases it
+ */
+ Uint*
+Tprime(void *space,
+    Uint len,
+    Uint *B,
+    Uint lenB,
+    bitarray bcktsB,
+    bitarray cl,
+    unsigned char Q)
+{
+  Uint i,
+       j=0,
+       *tprime;
+  Lint *buffer;
+  Lint cur=0;
+  int selected;
+
+  MSG("tprime: init arrays\n");
+  buffer = ALLOCMEMORY(space, NULL, Lint, len);
+  tprime = ALLOCMEMORY(space, NULL, Uint, lenB);
+
+  memset(buffer, 0, sizeof(Lint)*len);
+
+  MSG( "tprime: scan B\n");
+  for(i=0; i < lenB; i++) {
+    buffer[B[i]] = cur;
+    cur += getbit(bcktsB,(Uint) i) ? 1 : 0;
+  }
+
+  NFO("tprime: iterating i=%u elements with lenB=%u\n", len, lenB);
+  for(i=0; i < len; i++) {
+    selected = Q ? (getbit(cl, (Uint)i) != 0) : (getbit(cl, (Uint)i) == 0);
+    if (!selected) {
+      continue;
+    }
+    /* an explicit branch avoids touching tprime[j] for unselected
+     * positions, which used to read uninitialised memory and, once j had
+     * reached lenB, the slot behind the allocation */
+    if (j >= lenB) {
+      DBG( "j=%u out of bounds\n", j);
+      break;
+    }
+    tprime[j] = (Uint) buffer[i];
+    j++;
+  }
+
+  MSG( "tprime: exit\n");
+  FREEMEMORY(space, buffer);
+  return tprime;
+}
+
+
+ /*
+ * reconstruct: replaces every entry B[i] by conv[B[i]], where conv is a
+ * table of text positions collected in the first loop according to the
+ * class bit array cl and the flag Q. B is modified in place.
+ *
+ * NOTE(review): the shift pair on cur looks intended to turn the value 1
+ * into an all-ones mask; that only holds if INTSIZE equals the bit width
+ * of Lint -- INTSIZE is defined elsewhere, confirm.
+ * NOTE(review): conv[j] is read in every iteration before it is
+ * necessarily written; if j reaches lenB before the loop ends the access
+ * is one past the allocation -- confirm against callers.
+ */
+ void
+reconstruct(void *space,
+ Uint len,
+ Uint *B,
+ Uint lenB,
+ bitarray cl,
+ unsigned int Q)
+{
+ Lint *conv;
+ Lint i,
+ j=0,
+ cur,
+ inv;
+
+ MSG("reconstruct: init\n");
+ conv = ALLOCMEMORY(space, NULL, Lint, lenB);
+
+ NFO("reconstruct: iteration over %u elems\n", len);
+ for(i=0; i < len; i++) {
+ /* cur starts as 1 when the class bit of i is clear, 0 otherwise */
+ cur = (getbit(cl, (Uint)i)) ? 0 : 1;
+ cur = cur << (INTSIZE-1);
+ cur = cur >> (INTSIZE-1);
+ inv = ~cur;
+ if(Q) {
+ /* keep i where inv is set, keep the old slot content elsewhere */
+ conv[j] = ((i & inv) | (((Lint)conv[j]) & cur));
+ j = j + (1 & inv);
+ } else {
+ /* keep i where cur is set, keep the old slot content elsewhere */
+ conv[j] = ((i & cur) | (((Lint)conv[j]) & inv));
+ j = j + (1 & cur);
+ }
+ }
+
+ NFO("reconstruct: scan B (size: %u)\n", lenB);
+ for(i=0; i < lenB; i++) {
+ /* translate the entry of B through the conv table */
+ cur = B[i];
+ B[i] = (Uint) conv[cur];
+ }
+
+ MSG("reconstruct: exit");
+ FREEMEMORY(space, conv);
+ return;
+}
+
+
+/*
+ * aluruSuffixArrayS: builds the suffix array of the character text T from
+ * the array B by one left-to-right pass. Slots not yet filled take the next
+ * entry of B; for every entry, the preceding text position is placed at the
+ * front of its character bucket if its class bit in cl is set.
+ *
+ * Characters are indexed as unsigned char so that bytes >= 0x80 cannot
+ * produce a negative index, and the bucket starts are computed for all 256
+ * slots of the count table.
+ * NOTE(review): other stages of the sort index the text as plain char;
+ * confirm the ordering for bytes >= 0x80 is consistent with them.
+ *
+ * @param space  allocator handle passed through to ALLOCMEMORY
+ * @param T      text of len characters
+ * @param len    length of T and of the result
+ * @param B      array of seed suffixes
+ * @param lenB   number of entries in B (not used here)
+ * @param cl     class bit array over the text positions
+ * @return newly allocated suffix array of len entries
+ */
+ Uint*
+aluruSuffixArrayS(void *space,
+    char* T,
+    Uint len,
+    Uint *B,
+    Uint lenB,
+    bitarray cl)
+{
+  Lint *count;
+  Uint *sarray;
+  Lint i,
+       j;
+  Lint tmp;
+  Lint offset;
+  bitarray b;
+
+  count = ALLOCMEMORY(space, NULL, Lint, 256);
+  sarray = ALLOCMEMORY(space, NULL, Uint, len);
+  b = initbitarray(space, len);
+
+  memset(count, 0, 256*sizeof(Lint));
+  setbitarray(b, len, 0);
+
+  /* character histogram */
+  for(i=0; i < len; i++) {
+    tmp = (Lint)(unsigned char) T[i];
+    count[tmp]++;
+  }
+
+  /* turn the histogram into bucket start positions for all 256 slots */
+  offset = count[0];
+  count[0] = 0;
+
+  for(i=1; i < 256; i++) {
+    tmp = count[i];
+    count[i] = count[i-1] + offset;
+    offset = tmp;
+  }
+
+  j=0;
+
+  for(i=0; i < len; i++) {
+    /* an empty slot takes the next seed from B */
+    if (!getbit(b, (Uint) i)) {
+      sarray[i] = B[j];
+      setbit(b, i, 1);
+      j++;
+    }
+
+    /* induce the predecessor of the suffix in slot i */
+    offset = (Lint) sarray[i] -1;
+    if (offset >= 0 && getbit(cl, (Uint) offset)) {
+      tmp = (Lint)(unsigned char) T[offset];
+      if (count[tmp] > i) {
+        sarray[count[tmp]] = offset;
+        setbit(b, count[tmp], 1);
+        count[tmp] += 1;
+      }
+    }
+  }
+  FREEMEMORY(space, count);
+  FREEMEMORY(space, b);
+  return sarray;
+}
+
+/*
+ * aluruSuffixArrayL: builds the suffix array of the character text T from
+ * the array B by one right-to-left pass. Slots not yet filled take the next
+ * entry of B (from its end); for every entry, the preceding text position
+ * is placed at the back of its character bucket if its class bit in cl is
+ * clear.
+ *
+ * The count table holds 256 slots and characters are indexed as unsigned
+ * char, so no byte value can index outside the table.
+ * NOTE(review): other stages of the sort index the text as plain char;
+ * confirm the ordering for bytes >= 0x80 is consistent with them.
+ *
+ * @param space  allocator handle passed through to ALLOCMEMORY
+ * @param T      text of len characters
+ * @param len    length of T and of the result
+ * @param B      array of seed suffixes
+ * @param lenB   number of entries in B
+ * @param cl     class bit array over the text positions
+ * @return newly allocated suffix array of len entries
+ */
+ Uint*
+aluruSuffixArrayL(void *space,
+    char* T,
+    Uint len,
+    Uint *B,
+    Uint lenB,
+    bitarray cl)
+{
+  Lint *count;
+  Uint *sarray;
+  Lint i,
+       j;
+  Lint tmp;
+  Lint offset;
+  bitarray b;
+
+  MSG( "aluruSuffixArrayL: initalizning arrays\n");
+  count = ALLOCMEMORY(space, NULL, Lint, 256);
+  sarray = ALLOCMEMORY(space, NULL, Uint, len);
+  b = initbitarray(space, len);
+
+  MSG("aluruSuffixArrayL: memsetting count\n");
+  memset(count, 0, 256 *sizeof(Lint));
+
+  MSG("aluruSuffixArrayL: setting b\n");
+  setbitarray(b, len, 0);
+
+  /* character histogram */
+  for(i=0; i < len; i++) {
+    tmp = (Lint)(unsigned char) T[i];
+    count[tmp]++;
+  }
+
+  /* turn the histogram into bucket end positions */
+  count[0] = count[0] -1;
+
+  for(i=1; i < 256; i++) {
+    count[i] = count[i-1] + count[i];
+  }
+
+  /* widen before subtracting so that a zero length cannot wrap around */
+  j = (Lint) lenB - 1;
+
+  MSG( "aluruSuffixArrayL: iteration\n");
+  for(i = (Lint) len - 1; i >= 0; i--) {
+    /* an empty slot takes the next seed from the end of B */
+    if (!getbit(b, (Uint) i)) {
+      sarray[i] = B[j];
+      setbit(b, (Uint) i, 1);
+      j--;
+    }
+
+    /* induce the predecessor of the suffix in slot i */
+    offset = (Lint) sarray[i] -1;
+    if (offset >= 0 && !getbit(cl, (Uint) offset)) {
+      tmp = (Lint)(unsigned char) T[offset];
+      if (count[tmp] < i) {
+        sarray[count[tmp]] = offset;
+        setbit(b, (Uint) count[tmp], 1);
+        count[tmp] -= 1;
+      }
+    }
+  }
+
+  FREEMEMORY(space, count);
+  FREEMEMORY(space, b);
+  MSG("aluruSuffixArrayL: exit ok\n");
+  return sarray;
+}
+
+
+/*
+ * aluruSuffixArraySint: integer-alphabet variant of the left-to-right
+ * suffix array construction. The count table has max-min+1 slots and is
+ * indexed by symbol minus min; the same shift is applied when inducing
+ * predecessors, so both phases address the same bucket.
+ *
+ * @param space  allocator handle passed through to ALLOCMEMORY
+ * @param T      integer text of len symbols
+ * @param len    length of T and of the result
+ * @param B      array of seed suffixes
+ * @param lenB   number of entries in B (not used here)
+ * @param cl     class bit array over the text positions
+ * @return newly allocated suffix array of len entries
+ */
+ Uint*
+aluruSuffixArraySint(void *space,
+    Uint* T,
+    Uint len,
+    Uint *B,
+    Uint lenB,
+    bitarray cl)
+{
+  Uint *count;
+  Uint *sarray;
+  Uint i,
+       j,
+       min,
+       max,
+       sigma;
+  Uint tmp;
+  Lint offset;
+  bitarray b;
+
+  getinterval(T, len, &min, &max);
+  sigma = max - min +1;
+
+  count = ALLOCMEMORY(space, NULL, Uint, sigma);
+  sarray = ALLOCMEMORY(space, NULL, Uint, len);
+  memset(count, 0, sigma*sizeof(Uint));
+
+  b = initbitarray(space, len);
+  setbitarray(b, len, 0);
+
+  /* symbol histogram, shifted by the smallest symbol */
+  for(i=0; i < len; i++) {
+    tmp = (Uint) T[i] - min;
+    count[tmp]++;
+  }
+
+  /* turn the histogram into bucket start positions */
+  offset = count[0];
+  count[0] = 0;
+
+  for(i=1; i < sigma; i++) {
+    tmp = count[i];
+    count[i] = count[i-1] + offset;
+    offset = tmp;
+  }
+
+  j=0;
+  for(i=0; i < len; i++) {
+    /* an empty slot takes the next seed from B */
+    if (!getbit(b, (Uint) i)) {
+      sarray[i] = B[j];
+      setbit(b, i, 1);
+      j++;
+    }
+
+    /* induce the predecessor of the suffix in slot i */
+    offset = (Lint) sarray[i] -1;
+    if (offset >= 0 && getbit(cl, (Uint) offset)) {
+      /* same shift as in the histogram, otherwise min > 0 indexes past
+       * the end of count */
+      tmp = (Uint) T[offset] - min;
+      if (count[tmp] > i) {
+        sarray[count[tmp]] = offset;
+        setbit(b, count[tmp], 1);
+        count[tmp] += 1;
+      }
+    }
+  }
+  FREEMEMORY(space, count);
+  FREEMEMORY(space, b);
+
+  return sarray;
+}
+
+
+/*
+ * aluruSuffixArrayLint: integer-alphabet variant of the right-to-left
+ * suffix array construction. The count table has max-min+1 slots and is
+ * indexed by symbol minus min; the same shift is applied when inducing
+ * predecessors, so both phases address the same bucket.
+ *
+ * @param space  allocator handle passed through to ALLOCMEMORY
+ * @param T      integer text of len symbols
+ * @param len    length of T
+ * @param B      array of seed suffixes
+ * @param lenB   number of entries in B
+ * @param cl     class bit array over the text positions
+ * @return newly allocated, zero-initialised array of len+1 entries
+ */
+ Uint*
+aluruSuffixArrayLint(void *space,
+    Uint* T,
+    Uint len,
+    Uint *B,
+    Uint lenB,
+    bitarray cl)
+{
+  Uint *count;
+  Uint *sarray;
+  Uint min,
+       max,
+       sigma;
+  Uint tmp;
+  Lint offset;
+  Lint i;
+  Lint j;
+  bitarray b;
+
+  getinterval(T, len, &min, &max);
+  sigma = max - min +1;
+
+  count = ALLOCMEMORY(space, NULL, Uint, sigma);
+  sarray = ALLOCMEMORY(space, NULL, Uint, len+1);
+  memset(sarray, 0, (len+1)*sizeof(Uint));
+  memset(count, 0, sigma*sizeof(Uint));
+  b = initbitarray(space, len);
+  setbitarray(b, len, 0);
+
+  /* symbol histogram, shifted by the smallest symbol */
+  for(i=0; i < len; i++) {
+    tmp = T[i] - min;
+    count[tmp]++;
+  }
+
+  /* turn the histogram into bucket end positions */
+  count[0] = count[0] -1;
+
+  for(i=1; i < sigma; i++) {
+    count[i] = count[i-1] + count[i];
+  }
+
+  /* widen before subtracting so that a zero length cannot wrap around */
+  j = (Lint) lenB - 1;
+
+  for(i = (Lint) len - 1; i >= 0; i--) {
+    /* an empty slot takes the next seed from the end of B */
+    if (!getbit(b, (Uint) i)) {
+      sarray[i] = B[j];
+      setbit(b, i, 1);
+      j--;
+    }
+
+    /* induce the predecessor of the suffix in slot i */
+    offset = (Lint) sarray[i] -1;
+    if (offset >= 0 && !getbit(cl, (Uint) offset)) {
+      /* same shift as in the histogram, otherwise min > 0 indexes past
+       * the end of count */
+      tmp = T[offset] - min;
+      if (count[tmp] < i) {
+        sarray[count[tmp]] = offset;
+        setbit(b, count[tmp], 1);
+        count[tmp] -= 1;
+      }
+    }
+  }
+  FREEMEMORY(space, count);
+  FREEMEMORY(space, b);
+  return sarray;
+}
+
+
+/*
+ * alurusortint: suffix sorting driver for an integer text s of *l symbols.
+ * Classifies the positions, picks the S branch when the class bit of the
+ * last position is clear and the L branch otherwise, sorts the selected
+ * suffixes via distance lists, recurses on the reduced text if the buckets
+ * are not yet all singletons, and finally builds the full array with
+ * aluruSuffixArraySint / aluruSuffixArrayLint.
+ *
+ * @param space allocator handle passed through to the helpers
+ * @param s     integer text
+ * @param l     in: length of s; written back with the same length on exit
+ * @return the array produced by aluruSuffixArraySint/Lint
+ */
+Uint*
+alurusortint(void *space, Uint *s, Uint *l) {
+ bitarray cl=NULL,
+ bcktsA=NULL,
+ bcktsB=NULL,
+ bcktslist=NULL;
+ Uint *B,
+ *A,
+ *tprime;
+ Uint noL;
+ Uint noS;
+ Uint maxdist,
+ *dist,
+ *accDist,
+ *list,
+ listlen;
+ Uint len = *l;
+
+ MSG("alurusortint: classify int\n");
+ cl = classifyint(space, s, len, &noL, &noS);
+
+ MSG("alurusortint: getting bit\n");
+ /* shortcut: a single suffix of the S class, located at the last position */
+ if(!getbit(cl, (Uint) len-1) && noS ==1) {
+
+ MSG("alurusortint: aluruSuffixArraySint\n");
+ /*printf("fewintS\n");*/
+ B = ALLOCMEMORY(space, NULL, Uint, 1);
+ B[0] = len-1;
+ A = aluruSuffixArraySint(space, s, len, B, noS, cl);
+ FREEMEMORY(space, B);
+ FREEMEMORY(space, cl);
+
+ *l = len;
+ return A;
+ }
+
+ MSG("alurusortint: init bcktsA\n");
+ bcktsA = initbitarray(space, len);
+ MSG("alurusortint: countingsort\n");
+ A=countingsortint(space, s, len, bcktsA);
+
+ /*sort type S suffixes*/
+ if(!getbit(cl, (Uint) len-1)) {
+
+ MSG("alurusortint: Sorting type S suffixes. Init bcktsB.\n");
+ NFO("%d\t%d\t%d\n\n", noS, noL, len);
+
+ /* NOTE(review): bcktsB (still NULL here) is passed where the other
+ * calls pass space */
+ bcktsB = initbitarray(bcktsB, noS);
+ B = arrayB(space, A, len, noS, bcktsA, bcktsB, cl, 0);
+ MSG("alurusortint: enter Qmaxdist\n");
+ maxdist = Qmaxdist(space, cl, len, 0);
+
+ MSG("alurusortint: enter Qdist\n");
+ dist = Qdist(space, cl, len, 0);
+
+
+ MSG("alurusortint: enter distCount\n");
+ accDist = distCount(space, dist, len, maxdist);
+
+ listlen = accDist[maxdist];
+ bcktslist = initbitarray(space, listlen);
+
+
+ MSG("alurusortint: enter get listsS\n");
+ /*list points to modified A*/
+ list = getlistsS(space, A, len, dist, accDist, maxdist, bcktsA,
+ bcktslist, listlen);
+
+ MSG("alurusortint: freeing stuff\n");
+ FREEMEMORY(space, bcktsA);
+ FREEMEMORY(space, accDist);
+ FREEMEMORY(space, dist);
+
+ MSG("alurusortint: enter sortlistsS\n");
+ sortlistS(space, B, noS, len, bcktsB, list, bcktslist, listlen);
+ FREEMEMORY(space, list);
+ FREEMEMORY(space, bcktslist);
+
+ /* no recursion needed when valbitarray reports all noS bits as 1 */
+ if(valbitarray(bcktsB, noS, 1)) {
+
+ FREEMEMORY(space, bcktsB);
+ /* printf("valbitarraysortedS.\n");*/
+
+ MSG( "alurusortint: valbitarraysortedS.\n");
+ A = aluruSuffixArraySint(space, s, len, B, noS, cl);
+
+ FREEMEMORY(space, B);
+ FREEMEMORY(space, cl);
+
+ *l = len;
+ return A;
+ }
+
+ MSG("alurusortint: enter tprime\n");
+ tprime = Tprime(space, len, B, noS, bcktsB, cl, 0);
+ FREEMEMORY(space, B);
+ FREEMEMORY(space, bcktsB);
+
+ /* recurse on the reduced text, then map the result back */
+ MSG("alurusortint: enter alurusortint\n");
+ B = alurusortint(space, tprime, &noS);
+ FREEMEMORY(space, tprime);
+
+ MSG("reconstructintS\n");
+ reconstruct(space, len, B, noS, cl, 0);
+
+ A = aluruSuffixArraySint(space, s, len, B, noS, cl);
+ FREEMEMORY(space, B);
+ FREEMEMORY(space, cl);
+
+ *l = len;
+ return A;
+
+ } else {
+
+ /*type L suffixes*/
+
+ /*printf("Sorting type L suffixes\n");
+ printf("%d\t%d\t%d\n\n", noS, noL, len);*/
+
+ /* NOTE(review): bcktsB / bcktslist (still NULL here) are passed where
+ * the other calls pass space */
+ bcktsB = initbitarray(bcktsB, noL);
+ B = arrayB(space, A, len, noL, bcktsA, bcktsB, cl, 1);
+
+ maxdist = Qmaxdist(space, cl, len, 1);
+ dist = Qdist(space, cl, len, 1);
+ accDist = distCount(space, dist, len, maxdist);
+
+ listlen = accDist[maxdist];
+ bcktslist = initbitarray(bcktslist, listlen);
+
+ /*list points to modified A*/
+
+ MSG("alurusortint: enter get listsL\n");
+ list = getlistsL(space, A, len, dist, accDist, maxdist, bcktsA,
+ bcktslist, listlen);
+
+ FREEMEMORY(space, bcktsA);
+ FREEMEMORY(space, accDist);
+ FREEMEMORY(space, dist);
+
+
+ MSG("alurusortint: sort listsL\n");
+ sortlistL(space, B, noL, len, bcktsB, list, bcktslist, listlen);
+
+ FREEMEMORY(space, bcktslist);
+ FREEMEMORY(space, list);
+
+ /* no recursion needed when valbitarray reports all noL bits as 1 */
+ if(valbitarray(bcktsB, noL, 1)) {
+
+ FREEMEMORY(space, bcktsB);
+ MSG("alurusortint: valbitarraysortedL.\n");
+ A = aluruSuffixArrayLint(space, s, len, B, noL, cl);
+
+ FREEMEMORY(space, B);
+ FREEMEMORY(space, cl);
+ *l = len;
+ return A;
+ }
+
+ MSG("alurusortint: enter tprime\n");
+ tprime = Tprime(space, len, B, noL, bcktsB, cl, 1);
+ FREEMEMORY(space, bcktsB);
+ FREEMEMORY(space, B);
+
+
+ /* recurse on the reduced text, then map the result back */
+ MSG("alurusortint: enter alurusortint\n");
+ B = alurusortint(space, tprime, &noL);
+ FREEMEMORY(space, tprime);
+
+ MSG("reconstructintL\n");
+ reconstruct(space, len, B, noL, cl, 1);
+
+ A = aluruSuffixArrayLint(space, s, len, B, noL, cl);
+ FREEMEMORY(space, B);
+ FREEMEMORY(space, cl);
+
+ *l = len;
+ return A;
+ }
+}
+
+
+/*
+ * alurusort: suffix sorting driver for a character text s of *l characters.
+ * Classifies the positions, picks the S branch when the class bit of the
+ * last position is clear and the L branch otherwise, sorts the selected
+ * suffixes with substringsort, hands the reduced text to alurusortint if
+ * the buckets are not yet all singletons, and finally builds the full
+ * array with aluruSuffixArrayS / aluruSuffixArrayL.
+ *
+ * @param space ignored: overwritten with NULL at the start of the function
+ * @param s     character text
+ * @param l     in: length of s; written back with the same length on exit
+ * @return the array produced by aluruSuffixArrayS/L
+ */
+Uint*
+alurusort(void *space, char *s, Uint *l) {
+ bitarray cl=NULL,
+ bckts=NULL;
+ Uint *B,
+ *A,
+ *tprime,
+ i,
+ j;
+ Uint noL;
+ Uint noS;
+ Uint dist;
+ Uint len = *l;
+
+ /* the caller's allocator handle is discarded here */
+ space=NULL;
+
+ MSG("alurusort: classify\n");
+ cl = classify(space, s, len, &noL, &noS);
+
+ MSG( "alurusort: getting bit\n");
+ /* shortcut: a single suffix of the S class, located at the last position */
+ if(!getbit(cl, (Uint) len-1) && noS ==1) {
+ MSG("alurusort: fewcharS\n");
+ B = ALLOCMEMORY(space, NULL, Uint, 1);
+ B[0] = len-1;
+ A = aluruSuffixArrayS(space, s, len, B, noS, cl);
+
+ FREEMEMORY(space, B);
+ FREEMEMORY(space, cl);
+ *l = len;
+ return A;
+ }
+
+ if(!getbit(cl, (Uint) len-1)) {
+
+ NFO("not bit alurusort: alloc B of size %u\n", noS);
+ B = ALLOCMEMORY(space, NULL, Uint, noS);
+
+ NFO("alurusort: initbitarray of size %u\n", noS);
+ bckts = initbitarray(bckts, noS);
+
+ NFO("alurusort: Qmaxdist in cl of size %u\n", len);
+ dist = Qmaxdist(space, cl, len, 0);
+
+ MSG("alurusort: scan B\n");
+
+ /* B[j] is overwritten with i until a position with a clear class bit
+ * advances j, so B ends up listing those positions in text order */
+ j=0;
+ for(i=0; i < len; i++) {
+ B[j] = i;
+ if (j > len) DBG("%u > %u\n", j, len);
+ j += (!getbit(cl, (Uint) i)) ? 1 : 0;
+ }
+
+ MSG("alurusort: substringsort\n");
+ B = substringsort(space, s, B, cl, len, bckts, noS, dist, 0);
+
+ MSG("checking valbitarray\n");
+ /* no recursion needed when valbitarray reports all noS bits as 1 */
+ if(valbitarray(bckts, noS, 1)) {
+
+ FREEMEMORY(space, bckts);
+ /*printf("valbitarraysortedcharS.\n");*/
+
+ MSG("aluruSuffixArrayS start (if cond 1)\n");
+ A = aluruSuffixArrayS(space, s, len, B, noS, cl);
+
+ FREEMEMORY(space, B);
+ FREEMEMORY(space, cl);
+
+ *l = len;
+ return A;
+ }
+
+ MSG("enter Tprime calculation");
+ tprime = Tprime(space, len, B, noS, bckts, cl, 0);
+ FREEMEMORY(space, B);
+ FREEMEMORY(space, bckts);
+
+ /* sort the reduced integer text, then map the result back */
+ MSG("enter alursortint \n");
+ B = alurusortint(space, tprime, &noS);
+ FREEMEMORY(space, tprime);
+
+ MSG("reconstructcharS\n");
+ reconstruct(space, len, B, noS, cl, 0);
+
+ MSG( "enter aluruSuffixArrayS start\n");
+ A = aluruSuffixArrayS(space, s, len, B, noS, cl);
+ FREEMEMORY(space, B);
+ FREEMEMORY(space, cl);
+
+ *l = len;
+ return A;
+
+ } else {
+
+ NFO("bit alurusort: alloc B of size %u\n", noL);
+ B = ALLOCMEMORY(space, NULL, Uint, noL);
+
+ NFO( "alurusort: initbitarray of size %u\n", noL);
+ bckts = initbitarray(bckts, noL);
+
+ NFO( "alurusort: Qmaxdist in cl of size %u\n", len);
+ dist = Qmaxdist(space, cl, len, 1);
+
+ /* (!getbit - 1) is -1 when the class bit is set and 0 otherwise, so j
+ * advances exactly on positions whose class bit is set */
+ j=0;
+ for(i=0; i < len; i++) {
+ B[j] = i;
+ j -= ((Lint) (!getbit(cl, (Uint) i)) -1);
+ }
+
+ MSG("enter alurusort: substringsort\n");
+ B = substringsort(space, s, B, cl, len, bckts, noL, dist, 1);
+ MSG( "checking valbitarray\n");
+ /* no recursion needed when valbitarray reports all noL bits as 1 */
+ if(valbitarray(bckts, noL, 1)) {
+ FREEMEMORY(space, bckts);
+ /*printf("valbitarraysortedcharL.\n");*/
+ MSG( "aluruSuffixArrayL start (if cond 1)\n");
+ A = aluruSuffixArrayL(space, s, len, B, noL, cl);
+
+ FREEMEMORY(space, B);
+ FREEMEMORY(space, cl);
+
+ *l = len;
+ return A;
+ }
+
+ MSG("enter Tprime calculation");
+ tprime = Tprime(space, len, B, noL, bckts, cl, 1);
+ FREEMEMORY(space, B);
+ FREEMEMORY(space, bckts);
+
+ /* sort the reduced integer text, then map the result back */
+ MSG("enter alursortint \n");
+ B = alurusortint(space, tprime, &noL);
+ FREEMEMORY(space, tprime);
+
+ /*printf("reconstructcharL\n");*/
+
+ MSG("enter reconstruction\n");
+ reconstruct(space, len, B, noL, cl, 1);
+ MSG("enter aluruSuffixArrayL start\n");
+ A = aluruSuffixArrayL(space, s, len, B, noL, cl);
+
+ FREEMEMORY(space, B);
+ FREEMEMORY(space, cl);
+ *l = len;
+
+ return A;
+ }
+}
+
+/*
+ * showQDlist: debugging aid; prints a "list <index>" heading for each of
+ * the n vectors in qdlist, dumps the vector and ends it with a newline.
+ */
+void
+showQDlist(vector_t **qdlist, Uint n) {
+  Uint listno = 0;
+
+  while (listno < n) {
+    printf("list %d\n", listno);
+    dumpVector(qdlist[listno]);
+    printf("\n");
+    listno++;
+  }
+}
+
+
+/*
+ * showAluruBuckets: debugging aid; for each of the n buckets prints a
+ * heading followed by every element together with a running index over
+ * all buckets and the value R holds for that element.
+ */
+void
+showAluruBuckets(Alurubucket *bckts, Uint *R, Uint n) {
+  Uint bucket;
+  Uint slot;
+  Uint running = 0;
+
+  for (bucket = 0; bucket < n; bucket++) {
+    printf("bucket %d\n", bucket);
+    slot = 0;
+    while (slot < bckts[bucket].noofelems) {
+      printf("A[%d]=%d, R[%d]=%d", running, bckts[bucket].elems[slot], bckts[bucket].elems[slot], R[bckts[bucket].elems[slot]]);
+      running++;
+      slot++;
+    }
+    printf("\n");
+  }
+
+}
+
+/*
+ * bcktcmpANSI: qsort-style comparator ordering Alurubucket records by
+ * ascending id; yields -1, 0 or 1.
+ */
+int
+bcktcmpANSI(const void *a, const void *b) {
+  const Alurubucket *lhs = (const Alurubucket *) a;
+  const Alurubucket *rhs = (const Alurubucket *) b;
+
+  if (lhs->id == rhs->id) {
+    return 0;
+  }
+
+  return (lhs->id < rhs->id) ? -1 : 1;
+}
+
+
+/*
+ * bcktcmp: index-based comparator over an array of Alurubucket records.
+ * Yields 2 if the id at index a is smaller than the id at index b, 1 if it
+ * is larger and 0 if both are equal. The info argument is not used.
+ */
+Uint
+bcktcmp(Uint a, Uint b, void *arr, void *info) {
+  Alurubucket *table = (Alurubucket*) arr;
+  Uint lhs = table[a].id;
+  Uint rhs = table[b].id;
+
+  if (lhs == rhs) {
+    return 0;
+  }
+
+  return (lhs < rhs) ? 2 : 1;
+}
+
+
+/*
+ * sortAluruSubstrings: walks all nooflists vectors in qdlist and, for every
+ * stored suffix whose class entry in cl equals Q, prints the suffix and the
+ * value R holds for it. No data is modified; space, bckts, noofbckts and
+ * len are not used.
+ */
+ void
+sortAluruSubstrings (void *space,
+    vector_t** qdlist,
+    Uint nooflists,
+    Alurubucket *bckts,
+    Uint noofbckts,
+    Uint *R,
+    char *cl,
+    Uint len,
+    char Q)
+{
+  Uint listno = 0;
+  Uint pos;
+
+  while (listno < nooflists) {
+    for (pos = 0; pos < LENGTHVEC(qdlist[listno]); pos++) {
+      if (cl[VECTOR(qdlist[listno],pos)] != Q) {
+        continue;
+      }
+      printf("sorting suffix %d at pos %d\n", VECTOR(qdlist[listno],pos),
+          R[VECTOR(qdlist[listno],pos)]);
+    }
+    listno++;
+  }
+
+}
+
+/*
+ * getAluruBuckets: groups the positions of s by character into buckets,
+ * fills *inv with a rank for every position (buckets visited in the order
+ * returned by quickSort, elements in insertion order) and returns the
+ * buckets sorted by id with qsort.
+ *
+ * @param space        allocator handle passed through to ALLOCMEMORY
+ * @param s            text of len characters
+ * @param len          length of s
+ * @param noofbuckets  out: number of buckets created
+ * @param inv          out: newly allocated array of len ranks
+ * @return the bucket array
+ *
+ * NOTE(review): srtidx is not released here; whether quickSort allocates
+ * it is not visible from this function -- confirm.
+ */
+Alurubucket*
+getAluruBuckets(void *space,
+ char *s,
+ Uint len,
+ Uint *noofbuckets,
+ Uint **inv) {
+ Uint i, j, no=0, b, k=0;
+ Uint *srtidx;
+
+ Alurubucket *bckts = NULL;
+
+ for(i=0; i< len; i++) {
+ /* look up the bucket for s[i]; b == no means there is none yet */
+ BUCKETRET(bckts, no, (Uint) s[i], b);
+ if (b == no) {
+ bckts = ALLOCMEMORY(space, bckts, Alurubucket, ++no);
+ BUCKETINIT(bckts[b], (Uint) s[i]);
+ }
+ BUCKETADD(space, bckts[b], i);
+ }
+
+ srtidx = quickSort(space, bckts, no, bcktcmp, NULL);
+ (*inv) = ALLOCMEMORY(space, NULL, Uint, len);
+
+ /* assign consecutive ranks, bucket by bucket */
+ for(i=0; i< no; i++) {
+ for(j=0; j< bckts[srtidx[i]].noofelems; j++) {
+ printf("R[%d]=%d\n", bckts[srtidx[i]].elems[j], k );
+ (*inv)[bckts[srtidx[i]].elems[j]] = k++;
+ }
+ }
+
+ qsort(bckts, no, sizeof(Alurubucket), bcktcmpANSI);
+ *noofbuckets = no;
+ return bckts;
+}
+
+/*
+ * getQdistList: distributes all bucket elements over one vector per
+ * distance value. The element e goes to list[d[e]]; d[max]+1 lists are
+ * created, where max is the index returned by uarraymax(d, len).
+ *
+ * @param space   allocator handle passed through to ALLOCMEMORY
+ * @param bckts   array of bcktno buckets
+ * @param bcktno  number of buckets
+ * @param d       distance per text position, len entries
+ * @param len     number of entries in d
+ * @return newly allocated array of d[max]+1 vectors
+ */
+vector_t**
+getQdistList(void *space,
+    Alurubucket* bckts,
+    Uint bcktno,
+    Uint *d,
+    Uint len) {
+  Uint max,
+       nooflists,
+       i,
+       j,
+       l;
+  vector_t **list;
+
+  max = uarraymax (d, len);
+  nooflists = d[max]+1;
+  list = ALLOCMEMORY(space, NULL, vector_t*, nooflists);
+
+  for (i=0; i < nooflists; i++) {
+    list[i] = ALLOCMEMORY(space, NULL, vector_t, 1);
+    INITVECTOR(list[i]);
+  }
+
+  for(i=0; i < bcktno; i++) {
+    /* iterate over the element count of the bucket; the loop used to be
+     * bounded by the element value itself */
+    for (j=0; j < bckts[i].noofelems; j++) {
+      l = d[bckts[i].elems[j]];
+      printf("A[%d]=%d l=%d\n",i, bckts[i].elems[j], l);
+      appendvector(space, list[l], bckts[i].elems[j]);
+    }
+  }
+  return list;
+}
+
diff --git a/segemehl/libs/aluruSort.h b/segemehl/libs/aluruSort.h
new file mode 100644
index 0000000..c3567a9
--- /dev/null
+++ b/segemehl/libs/aluruSort.h
@@ -0,0 +1,84 @@
+#ifndef ALURUSORT_H
+#define ALURUSORT_H
+
+/*
+ *
+ * aluruSort.h
+ * declarations for aluruSort.c
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 07/11/2007 07:06:04 PM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 31 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-16 14:11:09 +0200 (Fri, 16 May 2008) $
+ *
+ * Id: $Id: aluruSort.h 31 2008-05-16 12:11:09Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/aluruSort.h $
+ */
+
+#include "mathematics.h"
+#include "bitArray.h"
+
+/* BUCKETRET: linear search over the first N buckets of X for the id I.
+ * B is left at the index of the match, or at N if no bucket has that id. */
+#define BUCKETRET(X, N, I, B) for(B=0; B < (N); B++) { \
+ if(X[B].id == (I)) break; \
+ }
+
+/* BUCKETINIT: sets the id of bucket X to I and resets elems, noofelems and
+ * allocsize; the front member is not touched. Expands to several
+ * statements that are not wrapped in a block. */
+#define BUCKETINIT(X,I) (X).id = (I);\
+ (X).elems=NULL; \
+ (X).noofelems=0;\
+ (X).allocsize=0
+/* BUCKETSWAP: exchanges the elements at indices A and B of the bucket that
+ * X points to. */
+#define BUCKETSWAP(X,A,B) {\
+ Uint resc = (X)->elems[(A)]; \
+ (X)->elems[(A)] = (X)->elems[(B)];\
+ (X)->elems[(B)] = resc;\
+ }
+/* BUCKETINC: number of element slots added per growth step in BUCKETADD */
+#define BUCKETINC 1000
+/* BUCKETADD: appends E to bucket X, growing elems by BUCKETINC slots via
+ * ALLOCMEMORY on allocator S when the bucket is full. */
+#define BUCKETADD(S,X,E) {\
+ if((X).allocsize <= (X).noofelems) {\
+ (X).elems = ALLOCMEMORY((S), (X).elems, Uint, ((X).allocsize+=BUCKETINC)); \
+ }\
+ (X).elems[(X).noofelems++] = (E);\
+ }
+/* BUCKETFRONT: post-increments the front member of the bucket X points to */
+#define BUCKETFRONT(X) (X)->front++
+/* SUFA: walks the N buckets of X accumulating noofelems until the bucket
+ * containing the global index I is reached.
+ * NOTE(review): ITER and CUMSUM are declared inside the braces but used in
+ * the expression after the closing brace, so this macro does not look
+ * usable as written -- confirm whether it is referenced anywhere. */
+#define SUFA(X, N, I) {\
+ Uint ITER;\
+ Uint CUMSUM=0;\
+ for(ITER=0; ITER < N; ITER++) {\
+ if(CUMSUM+X[ITER].noofelems > I) {\
+ break;\
+ }\
+ CUMSUM+=X[ITER].noofelems;\
+ }\
+ } X[ITER].elems[I-CUMSUM]
+/* Alurubucket: growable list of Uint elements tagged with an id */
+typedef struct {
+ Uint id; /* key the bucket is looked up by (BUCKETRET) */
+ Uint *elems; /* element storage, grown by BUCKETADD */
+ Uint noofelems; /* number of elements currently stored */
+ Uint front; /* incremented by BUCKETFRONT; not set by BUCKETINIT */
+ Uint allocsize; /* number of element slots allocated */
+
+} Alurubucket;
+
+Alurubucket* getAluruBuckets(void *, char *, Uint, Uint *, Uint **);
+Uint* Qdist(void *, bitarray, Uint, unsigned char);
+bitarray classify(void *, char* s, Uint, Uint*, Uint*);
+Uint* getAluruArray(void *space, char *s, Uint len, char delim);
+vector_t** getQdistList(void *space, Alurubucket*, Uint, Uint *, Uint);
+void showAluruBuckets(Alurubucket *bckts, Uint*, Uint n);
+void sortAluruSubstrings (void *space,
+ vector_t** qdlist,
+ Uint nooflists,
+ Alurubucket *bckts,
+ Uint noofbckts,
+ Uint *R,
+ char *cl,
+ Uint len,
+ char Q);
+
+Uint* alurusortint(void *space, Uint *s, Uint *l);
+Uint* alurusort(void *space, char *s, Uint *l);
+extern void getinterval(Uint *s, Uint len, Uint *min, Uint *max);
+#endif
diff --git a/segemehl/libs/basic-types.h b/segemehl/libs/basic-types.h
new file mode 100644
index 0000000..4ea0c13
--- /dev/null
+++ b/segemehl/libs/basic-types.h
@@ -0,0 +1,75 @@
+#ifndef BASIC_TYPES_H
+#define BASIC_TYPES_H
+
+#include <stdint.h>
+
+/* CRLF: '\r' when built under Cygwin, a blank otherwise */
+#ifdef __CYGWIN__
+
+#define CRLF '\r'
+
+#else
+#define CRLF ' '
+#endif
+
+/* size constants; their users are outside this header */
+#define MAXBUFFERSIZE 10000
+#define BASEINC 10000
+#define MAX_INT_LENGTH 50
+/* short names for the basic integer types used throughout the project */
+typedef unsigned char Uchar;
+typedef unsigned int Uint;
+typedef signed long long Lint;
+typedef signed long long int LLint;
+typedef signed int Sint;
+typedef unsigned char BOOL;
+
+/* boolean constants; TRUE/FALSE are only defined if not already present */
+#define True 1
+#define False 0
+
+#ifndef TRUE
+#define TRUE True
+#endif
+
+#ifndef FALSE
+#define FALSE False
+#endif
+
+/* pair of int values */
+typedef struct {
+ int a,
+ b;
+} PairSint;
+
+/* pair of Lint (signed long long) values */
+typedef struct{
+ Lint a,
+ b;
+} PairLSint;
+
+/* pair of Uint (unsigned int) values */
+typedef struct {
+ Uint a,
+ b;
+} PairUint;
+
+
+/* triple of Uint (unsigned int) values */
+typedef struct {
+ Uint a,
+ b,
+ c;
+} TripleUint;
+
+
+/* triple of int values */
+typedef struct {
+ int a,
+ b,
+ c;
+} TripleSint;
+
+/* quadruple of int values */
+typedef struct {
+ int a,
+ b,
+ c,
+ d;
+} QuadSint;
+
+
+
+
+#endif
+
diff --git a/segemehl/libs/biofiles.c b/segemehl/libs/biofiles.c
new file mode 100644
index 0000000..a2abd2c
--- /dev/null
+++ b/segemehl/libs/biofiles.c
@@ -0,0 +1,3628 @@
+
+/*
+ * biofiles.c
+ * helper functions to handle file types
+ * used in bioinformatics
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 07/10/2007 01:56:15 PM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 76 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-11-11 16:34:21 +0100 (Tue, 11 Nov 2008) $
+ *
+ * Id: $Id: biofiles.c 76 2008-11-11 15:34:21Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/biofiles.c $
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <time.h>
+#include <errno.h>
+#include "debug.h"
+#include "zlib.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "biofiles.h"
+#include "fileio.h"
+#include "seqclip.h"
+#include "charsequence.h"
+#include "assert.h"
+#include "zran.h"
+#include "info.h"
+#include "bitVector.h"
+
+
+
+/*------------------------------- bl_fastaInit -------------------------------
+ *
+ * @brief initialize the fasta struct
+ * @author Steve Hoffmann
+ *
+ */
+
+fasta_t*
+bl_fastaInit(void *space) {
+  fasta_t *fasta;
+
+  fasta = ALLOCMEMORY(space, NULL, fasta_t, 1);
+
+  /* no sequence data loaded yet */
+  fasta->seqs = NULL;
+  fasta->quals = NULL;
+  fasta->matestart = NULL;
+  fasta->noofseqs = 0;
+  fasta->active_noofseqs = 0;
+  fasta->active_noofmates = 0;
+  fasta->minlen = 0;
+  fasta->maxlen = 0;
+  fasta->hasMates = 0;
+
+  /* no input files registered yet */
+  fasta->filenames = NULL;
+  fasta->matefilenames = NULL;
+  fasta->nooffiles = 0;
+  fasta->filetotal = 0;
+  fasta->gzip = 0;
+
+  /* no index and no active chunk */
+  fasta->hasIndex = 0;
+  fasta->findex = NULL;
+  fasta->gzindex = NULL;
+  fasta->chunkindex = NULL;
+  fasta->matechunkindex = NULL;
+  fasta->curchunk = 0;
+  fasta->chunkIsActive = 0;
+
+  return fasta;
+}
+
+
+/*---------------------------- bl_fastaHasIndex -----------------------------
+ *
+ * @brief returns 1 if fasta is divided into chunks, 0 otherwise
+ * @author Steve Hoffmann
+ *
+ */
+
+unsigned char
+bl_fastaHasIndex (fasta_t *f)
+{
+  /* hand back the hasIndex flag of the fasta struct */
+  unsigned char indexed = f->hasIndex;
+
+  return indexed;
+}
+
+/*-------------------------- bl_fastxInitFileIndex ---------------------------
+ *
+ * @brief allocate a file index with room for noofaccesspoints zeroed
+ * access points; size starts at 0
+ * @author Steve Hoffmann
+ *
+ */
+
+fastxfileindex_t*
+bl_fastxInitFileIndex (void *space, Uint noofaccesspoints)
+{
+  fastxfileindex_t *index = ALLOCMEMORY(space, NULL, fastxfileindex_t, 1);
+
+  index->allocated = noofaccesspoints;
+  index->size = 0;
+  index->ap = ALLOCMEMORY(space, NULL, seqaccesspoint_t, noofaccesspoints);
+  memset(index->ap, 0, sizeof(seqaccesspoint_t)*noofaccesspoints);
+
+  return index;
+}
+
+
+/*--------------------------- bl_fastxInitSeqIndex ---------------------------
+ *
+ * @brief initialize a sequence index
+ * @author Steve Hoffmann
+ *
+ */
+
+fastxseqindex_t*
+bl_fastxInitSeqIndex (void *space, Uint noofaccesspoints)
+{
+  fastxseqindex_t *index = ALLOCMEMORY(space, NULL, fastxseqindex_t, 1);
+
+  /* room for noofaccesspoints zeroed access points, none in use yet */
+  index->allocated = noofaccesspoints;
+  index->size = 0;
+  index->ap = ALLOCMEMORY(space, NULL, seqaccesspoint_t, noofaccesspoints);
+  memset(index->ap, 0, sizeof(seqaccesspoint_t)*noofaccesspoints);
+
+  return index;
+}
+
+
+
+/*----------------------- bl_fastaGetDescriptionLength -----------------------
+ *
+ * @brief get the length of the fasta description
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_fastaGetDescriptionLength(fasta_t *f, Uint elem) {
+  Uint pos = elem;
+
+  /* with an index, elem has to be mapped to its slot in the chunk */
+  if (bl_fastaHasIndex(f)) {
+    pos = bl_fastxGetChunkElem (NULL, f, elem);
+    if (pos == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      exit(-1);
+    }
+  }
+
+  return f->seqs[pos]->descrlen;
+}
+
+
+/*-------------------------- bl_fastaGetDescription --------------------------
+ *
+ * @brief returns the descriptions of a fasta sequence
+ * @author Steve Hoffmann
+ *
+ */
+
+
+char*
+bl_fastaGetDescription(fasta_t *f, Uint elem) {
+  Uint pos = elem;
+
+  /* with an index, elem has to be mapped to its slot in the chunk */
+  if (bl_fastaHasIndex(f)) {
+    pos = bl_fastxGetChunkElem (NULL, f, elem);
+    if (pos == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      exit(-1);
+    }
+  }
+
+  return f->seqs[pos]->description;
+}
+
+
+
+
+/*--------------------- bl_fastaGetMateDescriptionLength ---------------------
+ *
+ * @brief get the length of the fasta mate description
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_fastaGetMateDescriptionLength(fasta_t *f, Uint elem) {
+  Uint pos = elem;
+
+  /* with an index, elem has to be mapped to its slot in the chunk */
+  if (bl_fastaHasIndex(f)) {
+    pos = bl_fastxGetChunkElem (NULL, f, elem);
+    if (pos == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      exit(-1);
+    }
+  }
+
+  /* the length is kept in the noofinfo member of the sequence */
+  return f->seqs[pos]->noofinfo;
+}
+
+
+/*------------------------ bl_fastaGetMateDescription ------------------------
+ *
+ * @brief returns the mate description of a fasta sequence (stored in the
+ * info member of the sequence)
+ * @author Steve Hoffmann
+ *
+ */
+
+
+char*
+bl_fastaGetMateDescription(fasta_t *f, Uint elem) {
+  Uint pos = elem;
+
+  /* with an index, elem has to be mapped to its slot in the chunk */
+  if (bl_fastaHasIndex(f)) {
+    pos = bl_fastxGetChunkElem (NULL, f, elem);
+    if (pos == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      exit(-1);
+    }
+  }
+
+  return (char*) f->seqs[pos]->info;
+}
+
+/*------------------------ bl_fastaGetSequenceLength -------------------------
+ *
+ * @brief returns the length of a fasta sequence
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Returns the length of sequence elem after removing the 5' and 3' clips of
+ * the first read. When mates are present, the length is taken from the
+ * mate start offset (matestart - 1) instead of the total length.
+ * Terminates the program if the chunk lookup fails.
+ */
+Uint
+bl_fastaGetSequenceLength(fasta_t *f, Uint elem) {
+  Uint k, clip3=0, clip5=0;
+
+  if(bl_fastaHasIndex(f)) {
+    k = bl_fastxGetChunkElem (NULL, f, elem);
+    if(k == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      /* without the exit, k == (Uint)-1 would be used as an index below */
+      exit(-1);
+    }
+  } else {
+    k = elem;
+  }
+
+  clip3 = f->seqs[k]->clip3[0];
+  clip5 = f->seqs[k]->clip5[0];
+
+  if (bl_fastaHasMate(f)) {
+    return f->matestart[k]-1-clip3-clip5;
+  }
+
+  return f->seqs[k]->length-clip3-clip5;
+}
+
+
+/*--------------------------- bl_fastaGetSequence ----------------------------
+ *
+ * @brief return the fasta sequence
+ * @author Steve Hoffmann
+ *
+ */
+
+char*
+bl_fastaGetSequence(fasta_t *f, Uint elem) {
+  Uint pos = elem;
+  Uint skip5;
+
+  /* with an index, elem has to be mapped to its slot in the chunk */
+  if (bl_fastaHasIndex(f)) {
+    pos = bl_fastxGetChunkElem (NULL, f, elem);
+    if (pos == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      exit(-1);
+    }
+  }
+
+  /* the returned pointer starts behind the 5' clip of the first read */
+  skip5 = f->seqs[pos]->clip5[0];
+  return &f->seqs[pos]->sequence[skip5];
+}
+
+
+/*----------------------------- bl_fastaHasMate ------------------------------
+ *
+ * @brief returns 1 if fasta has a mate pair, 0 otherwise
+ * @author Steve Hoffmann
+ *
+ */
+
+
+unsigned char
+bl_fastaHasMate(fasta_t *f) {
+  /* mates are present if mate offsets are loaded ... */
+  if (f->matestart != NULL) {
+    return 1;
+  }
+
+  /* ... or if an indexed file set is flagged as having mates */
+  return (unsigned char)(f->hasMates && f->hasIndex);
+}
+
+/*--------------------------- bl_fastaGetMateStart ---------------------------
+ *
+ * @brief get the start pos of mate sequence in string
+ * @author Steve Hoffmann
+ *
+ */
+
+
+Uint
+bl_fastaGetMateStart(fasta_t *f, Uint elem) {
+  Uint pos = elem;
+
+  /* no mates: report offset 0 */
+  if (!bl_fastaHasMate(f)) {
+    return 0;
+  }
+
+  /* with an index, elem has to be mapped to its slot in the chunk */
+  if (bl_fastaHasIndex(f)) {
+    pos = bl_fastxGetChunkElem(NULL, f, elem);
+    if (pos == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      exit(-1);
+    }
+  }
+
+  return f->matestart[pos];
+
+}
+
+
+
+/*-------------------------- bl_fastaGetMateLength ---------------------------
+ *
+ * @brief return length of mate sequence
+ * @author Steve Hoffmann
+ *
+ */
+
+
+/*
+ * Returns the length of the mate of sequence elem, i.e. the part of the
+ * stored sequence from the mate start offset to its end, minus the 5' and
+ * 3' clips of the mate. Returns 0 when no mates are present. Terminates
+ * the program if the chunk lookup fails.
+ */
+Uint
+bl_fastaGetMateLength(fasta_t *f, Uint elem) {
+  Uint k, clip3=0, clip5=0;
+
+  if (!bl_fastaHasMate(f)) {
+    return 0;
+  }
+
+  if(bl_fastaHasIndex(f)) {
+    k = bl_fastxGetChunkElem(NULL, f, elem);
+
+    if(k == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      /* without the exit, k == (Uint)-1 would be used as an index below */
+      exit(-1);
+    }
+  } else {
+    k = elem;
+  }
+
+  clip3 = f->seqs[k]->clip3[1];
+  clip5 = f->seqs[k]->clip5[1];
+
+  return (f->seqs[k]->length - f->matestart[k]) - clip3 - clip5;
+}
+
+
+/*----------------------------- bl_fastaGetMate ------------------------------
+ *
+ * @brief returns the mate sequence
+ * @author Steve Hoffmann
+ *
+ */
+
+
+/*
+ * Returns a pointer to the mate of sequence elem: the stored sequence
+ * advanced by the mate start offset and the 5' clip of the mate. Returns
+ * NULL when no mates are present. Terminates the program if the chunk
+ * lookup fails.
+ */
+char*
+bl_fastaGetMate(fasta_t *f, Uint elem) {
+  char *res;
+  /* Uint, like in the other accessors: a char would truncate the clip */
+  Uint clip5 = 0;
+  Uint k;
+
+  if (!bl_fastaHasMate(f)) {
+    return NULL;
+  }
+
+  if(bl_fastaHasIndex(f)) {
+    k = bl_fastxGetChunkElem(NULL, f, elem);
+
+    if(k == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      /* without the exit, k == (Uint)-1 would be used as an index below */
+      exit(-1);
+    }
+  } else {
+    k = elem;
+  }
+
+  clip5 = f->seqs[k]->clip5[1];
+
+  res = f->seqs[k]->sequence;
+  return &res[f->matestart[k]+clip5];
+
+}
+
+/*---------------------------- bl_fastaGetClipPos ----------------------------
+ *
+ * @brief return the clipping positions
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Stores the 5' and 3' clip values of the first read of sequence elem in
+ * *p5 and *p3. Terminates the program if the chunk lookup fails.
+ */
+ void
+bl_fastaGetClipPos (fasta_t *f, Uint elem, Uint *p5, Uint *p3)
+{
+  Uint k;
+
+  if(bl_fastaHasIndex(f)) {
+    k = bl_fastxGetChunkElem(NULL, f, elem);
+
+    if(k == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      /* without the exit, k == (Uint)-1 would be used as an index below */
+      exit(-1);
+    }
+  } else {
+    k = elem;
+  }
+
+  *p5 = f->seqs[k]->clip5[0];
+  *p3 = f->seqs[k]->clip3[0];
+
+  return ;
+}
+
+/*-------------------------- bl_fastaGetMateClipPos --------------------------
+ *
+ * @brief return the clipping positions
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Stores the 5' and 3' clip values of the mate of sequence elem in *p5 and
+ * *p3. Terminates the program if the chunk lookup fails.
+ */
+ void
+bl_fastaGetMateClipPos (fasta_t *f, Uint elem, Uint *p5, Uint *p3)
+{
+
+  Uint k;
+
+  if(bl_fastaHasIndex(f)) {
+    k = bl_fastxGetChunkElem(NULL, f, elem);
+
+    if(k == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      /* without the exit, k == (Uint)-1 would be used as an index below */
+      exit(-1);
+    }
+  } else {
+    k = elem;
+  }
+
+  *p5 = f->seqs[k]->clip5[1];
+  *p3 = f->seqs[k]->clip3[1];
+
+  return ;
+}
+
+/*--------------------------- bl_fastaDestructMate ---------------------------
+ *
+ * @brief free mate information
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fastaDestructMate (void *space, fasta_t *f)
+{
+  /* the mate offsets must exist before they can be released */
+  assert(f->matestart);
+
+  FREEMEMORY(space,f->matestart);
+}
+
+/*---------------------------- bl_fastaHasQuality ----------------------------
+ *
+ * @brief returns 1 if fasta has quality information, 0 otherwise
+ * @author Steve Hoffmann
+ *
+ */
+
+unsigned char
+bl_fastaHasQuality(fasta_t *f) {
+  /* quality information is present iff the quals array is set */
+  if (f->quals == NULL) {
+    return 0;
+  }
+
+  return 1;
+}
+
+
+/*---------------------------- bl_fastaGetQuality ----------------------------
+ *
+ * @brief return the quality string of sequence elem, starting behind the
+ *        5' clip stored in clip5[0]
+ * @param f    fasta container
+ * @param elem sequence number; translated with bl_fastxGetChunkElem when
+ *             the container has an index
+ * @return pointer into the stored quality string, or NULL if the container
+ *         has no quality information
+ * @author Steve Hoffmann
+ *
+ */
+
+
+char*
+bl_fastaGetQuality(fasta_t* f, Uint elem) {
+  Uint k = elem, clip5 = 0;
+
+  if (!bl_fastaHasQuality(f)) {
+    return NULL;
+  }
+
+  if(bl_fastaHasIndex(f)) {
+    k = bl_fastxGetChunkElem(NULL, f, elem);
+
+    if(k == -1) {
+      DBG("retrieval of quality %d failed. Exit forced.\n", elem);
+      /* k is (Uint)-1 here: abort as the setters do instead of
+       * indexing seqs[]/quals[] far out of bounds */
+      exit(-1);
+    }
+  }
+
+  clip5 = f->seqs[k]->clip5[0];
+
+  return &f->quals[k]->sequence[clip5];
+}
+
+/*-------------------------- bl_fastaGetMateQuality --------------------------
+ *
+ * @brief return the quality string of the mate of sequence elem, i.e. the
+ *        stored quality string at offset matestart[k] plus the mate's
+ *        5' clip (clip5[1])
+ * @param f    fasta container
+ * @param elem sequence number; translated with bl_fastxGetChunkElem when
+ *             the container has an index
+ * @return pointer into the stored quality string, or NULL if the container
+ *         has no mates or no quality information
+ * @author Steve Hoffmann
+ *
+ */
+
+char*
+bl_fastaGetMateQuality(fasta_t *f, Uint elem) {
+  char *res;
+  /* clip5 must be as wide as the stored clip value; a plain char would
+   * truncate clips above CHAR_MAX */
+  Uint k = elem, clip5 = 0;
+
+  if (!bl_fastaHasMate(f) || !bl_fastaHasQuality(f)) {
+    return NULL;
+  }
+
+  if (bl_fastaHasIndex(f)){
+    k = bl_fastxGetChunkElem(NULL, f, elem);
+
+    if(k == -1) {
+      DBG("retrieval of quality %d failed. Exit forced.\n", elem);
+      /* k is (Uint)-1 here: abort as the setters do instead of
+       * indexing seqs[]/quals[] far out of bounds */
+      exit(-1);
+    }
+  }
+
+  clip5 = f->seqs[k]->clip5[1];
+
+  res = f->quals[k]->sequence;
+  return &res[f->matestart[k]+clip5];
+}
+
+/*------------------------- bl_fastaDestructQuality --------------------------
+ *
+ * @brief destruct the quality records of all active sequences
+ * @param space handed to destructSequence
+ * @param f     fasta container; f->quals must be set (asserted)
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fastaDestructQuality(void *space, fasta_t *f)
+{
+  Uint pos = 0;
+
+  assert(f->quals);
+
+  /* the quals array itself is not released in this function */
+  while (pos < f->active_noofseqs) {
+    destructSequence(space, f->quals[pos]);
+    pos++;
+  }
+}
+
+#ifdef HASHING
+/*---------------------------- bl_fastaGetQuantity -----------------------------
+ *
+ * @brief get quantity of fasta sequence in terms of tag count
+ *        (number of input sequences with equal nucleotides)
+ * @param f    fasta container
+ * @param elem sequence number; translated with bl_fastxGetChunkElem when
+ *             the container has an index
+ * @return the quantity field of the sequence record
+ * @author Christian Otto
+ *
+ */
+Uint
+bl_fastaGetQuantity(fasta_t *f, Uint elem){
+  Uint k;
+
+  if(!bl_fastaHasIndex(f))
+    return f->seqs[elem]->quantity;
+
+  k = bl_fastxGetChunkElem(NULL, f, elem);
+
+  if(k == -1) {
+    DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+    /* k is (Uint)-1 here: abort as bl_fastaSetQuantity does instead of
+     * indexing seqs[] far out of bounds */
+    exit(-1);
+  }
+
+  return f->seqs[k]->quantity;
+}
+
+/*------------------------ bl_fastaSetQuantity ------------------------
+ *
+ * @brief store the tag count (number of input sequences with equal
+ *        nucleotides) of a fasta sequence
+ * @param f        fasta container
+ * @param elem     sequence number; translated with bl_fastxGetChunkElem
+ *                 when the container has an index
+ * @param quantity value written to the quantity field
+ * @author Christian Otto
+ *
+ */
+void
+bl_fastaSetQuantity (fasta_t *f, Uint elem, Uint quantity) {
+  Uint pos = elem;
+
+  if(bl_fastaHasIndex(f)) {
+    pos = bl_fastxGetChunkElem(NULL, f, elem);
+
+    /* the lookup signals failure with (Uint)-1 */
+    if(pos == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      exit(-1);
+    }
+  }
+
+  f->seqs[pos]->quantity = quantity;
+}
+
+#endif
+
+/*------------------------ bl_fastaSetMateDescription ------------------------
+ *
+ * @brief strip the leading '@' or '>' from descr in place and attach the
+ *        buffer to sequence elem as its info string
+ * @param f     fasta container
+ * @param elem  position in f->seqs (used directly, no index translation)
+ * @param descr description buffer; it is modified and then referenced by
+ *              the record, not copied
+ * @param len   length of descr including the leading marker
+ * @author Steve Hoffmann
+ *
+ */
+
+
+void
+bl_fastaSetMateDescription(fasta_t *f, Uint elem, char *descr, Uint len) {
+  Uint stripped = len-1;
+
+  assert(descr[0] == '@' || descr[0] == '>');
+
+  /* shift everything one to the left to drop the marker character */
+  memmove(descr, descr+1, stripped);
+  descr[stripped] = 0;
+
+  f->seqs[elem]->info = descr;
+  f->seqs[elem]->noofinfo = stripped;
+}
+
+
+/*-------------------------- bl_fastaSetDescription --------------------------
+ *
+ * @brief strip the leading '@' or '>' from descr in place and attach the
+ *        buffer to sequence elem as its description
+ * @param f     fasta container
+ * @param elem  position in f->seqs (used directly, no index translation)
+ * @param descr description buffer; it is modified and then referenced by
+ *              the record, not copied
+ * @param len   length of descr including the leading marker
+ * @author Steve Hoffmann
+ *
+ */
+
+
+void
+bl_fastaSetDescription(fasta_t *f, Uint elem, char *descr, Uint len) {
+  Uint stripped = len-1;
+
+  assert(descr[0] == '@' || descr[0] == '>');
+
+  /* shift everything one to the left to drop the marker character */
+  memmove(descr, descr+1, stripped);
+  descr[stripped] = 0;
+
+  f->seqs[elem]->description = descr;
+  f->seqs[elem]->descrlen = stripped;
+}
+
+
+/*--------------------------- bl_fastaSetSequence ----------------------------
+ *
+ * @brief attach a sequence buffer to record elem. If the record already
+ *        holds a sequence, the new one is appended behind it, separated by
+ *        a '\0', and the start of the appended part is recorded in
+ *        f->matestart[elem].
+ * @param space handed to the memory macros
+ * @param f     fasta container
+ * @param elem  position in f->seqs (used directly, no index translation)
+ * @param seq   sequence buffer; ownership passes to the record and the
+ *              buffer may be reallocated
+ * @param len   length of seq
+ * @author Steve Hoffmann
+ *
+ */
+
+
+void
+bl_fastaSetSequence(void *space, fasta_t *f, Uint elem, char *seq, Uint len) {
+
+  Uint prevlen = f->seqs[elem]->length;
+  Uint total = prevlen + len;
+  char *prevseq = f->seqs[elem]->sequence;
+
+  if (prevlen != 0) {
+    /* layout after merging: <old> '\0' <new> '\0' */
+    total = len + prevlen + 1;
+    seq = ALLOCMEMORY(space, seq, char, total+1);
+
+    f->matestart = ALLOCMEMORY(space, f->matestart, Uint, elem+1);
+    f->matestart[elem] = prevlen+1;
+
+    /* move the new sequence to the back first, then copy the old one
+     * into the gap at the front */
+    memmove(&seq[prevlen+1], seq, len);
+    memmove(seq, prevseq, prevlen);
+    seq[prevlen] = '\0';
+    seq[total] = '\0';
+
+    FREEMEMORY(space, prevseq);
+  }
+
+  f->seqs[elem]->length = total;
+  f->seqs[elem]->sequence = seq;
+}
+
+
+
+/*----------------------------- bl_fastaSoftClip -----------------------------
+ *
+ * @brief determine soft clips for sequence elem with bl_seqclipSoft3Prime
+ *        and bl_seqclipSoft5Prime and add them to clip3[0] and clip5[0]
+ * @param space handed to the clipping routines
+ * @param f     fasta container
+ * @param elem  sequence number
+ * @param p5, p5len, p5scr        5' pattern, its length and a parameter
+ *                                forwarded to bl_seqclipSoft5Prime
+ * @param p3, p3len, p3acc, pAlen 3' pattern, its length and parameters
+ *                                forwarded to bl_seqclipSoft3Prime
+ * @return number of newly clipped positions (5' + 3'), or 0 if nothing
+ *         was stored because old and new clips together would reach the
+ *         sequence length
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_fastaSoftClip (void *space, fasta_t *f, Uint elem,
+    char *p5, Uint p5len, Uint p5scr, char *p3, Uint p3len, Uint p3acc, Uint pAlen)
+{
+  Uint seqlen, left=0, right=0, pos=elem;
+  char *nts;
+  CharSequence *rec;
+
+  nts = bl_fastaGetSequence(f, elem);
+  seqlen = bl_fastaGetSequenceLength(f, elem);
+
+  /* a side is only examined if a non-empty pattern was given */
+  if (p3 && p3len) {
+    right = bl_seqclipSoft3Prime(space, nts, seqlen, p3, p3len, p3acc, pAlen);
+  }
+  if (p5 && p5len) {
+    left = bl_seqclipSoft5Prime(space, nts, seqlen, p5, p5len, p5scr);
+  }
+
+  if(bl_fastaHasIndex(f)) {
+    pos = bl_fastxGetChunkElem(NULL, f, elem);
+    if(pos == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      exit(-1);
+    }
+  }
+
+  rec = f->seqs[pos];
+
+  /* refuse clips that would consume the whole sequence */
+  if(rec->clip5[0] + rec->clip3[0] + left + right >= seqlen) {
+    return 0;
+  }
+
+  rec->clip5[0] += left;
+  rec->clip3[0] += right;
+
+  return left+right;
+}
+
+/*--------------------------- bl_fastaMateSoftClip ---------------------------
+ *
+ * @brief determine soft clips for the mate of sequence elem with
+ *        bl_seqclipSoft3Prime and bl_seqclipSoft5Prime and add them to
+ *        clip3[1] and clip5[1]
+ * @param space handed to the clipping routines
+ * @param f     fasta container
+ * @param elem  sequence number
+ * @param p5, p5len, p5scr        5' pattern, its length and a parameter
+ *                                forwarded to bl_seqclipSoft5Prime
+ * @param p3, p3len, p3acc, pAlen 3' pattern, its length and parameters
+ *                                forwarded to bl_seqclipSoft3Prime
+ * @return number of newly clipped positions (5' + 3'); 0 if the container
+ *         has no mates or if old and new clips together would reach the
+ *         mate length
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_fastaMateSoftClip (void *space, fasta_t *f, Uint elem,
+    char *p5, Uint p5len, Uint p5scr, char *p3, Uint p3len, Uint p3acc, Uint pAlen)
+{
+  Uint matelen, left=0, right=0, pos=elem;
+  char *nts;
+  CharSequence *rec;
+
+  if (!bl_fastaHasMate(f)) {
+    return 0;
+  }
+
+  nts = bl_fastaGetMate(f, elem);
+  matelen = bl_fastaGetMateLength(f, elem);
+
+  /* a side is only examined if a non-empty pattern was given */
+  if (p3 && p3len) {
+    right = bl_seqclipSoft3Prime(space, nts, matelen, p3, p3len, p3acc, pAlen);
+  }
+  if (p5 && p5len) {
+    left = bl_seqclipSoft5Prime(space, nts, matelen, p5, p5len, p5scr);
+  }
+
+  if(bl_fastaHasIndex(f)) {
+    pos = bl_fastxGetChunkElem(NULL, f, elem);
+    if(pos == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      exit(-1);
+    }
+  }
+
+  rec = f->seqs[pos];
+
+  /* refuse clips that would consume the whole mate */
+  if(rec->clip5[1] + rec->clip3[1] + left + right >= matelen) {
+    return 0;
+  }
+
+  rec->clip5[1] += left;
+  rec->clip3[1] += right;
+
+  return left+right;
+}
+
+
+/*----------------------------- bl_fastaHardClip -----------------------------
+ *
+ * @brief add fixed clip lengths to clip5[0] and clip3[0] of sequence elem
+ * @param space unused in this function
+ * @param f     fasta container
+ * @param elem  sequence number
+ * @param p5    number of positions to add to the 5' clip
+ * @param p3    number of positions to add to the 3' clip
+ * @return p5+p3, or 0 if nothing was stored because old and new clips
+ *         together would reach the sequence length
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_fastaHardClip (void *space, fasta_t *f, Uint elem, Uint p5, Uint p3)
+{
+  Uint pos = elem, seqlen;
+  CharSequence *rec;
+
+  seqlen = bl_fastaGetSequenceLength(f, elem);
+
+  if(bl_fastaHasIndex(f)) {
+    pos = bl_fastxGetChunkElem(NULL, f, elem);
+    if(pos == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      exit(-1);
+    }
+  }
+
+  rec = f->seqs[pos];
+
+  /* refuse clips that would consume the whole sequence */
+  if(rec->clip5[0] + rec->clip3[0] + p3 + p5 >= seqlen) {
+    return 0;
+  }
+
+  rec->clip5[0] += p5;
+  rec->clip3[0] += p3;
+
+  return p5+p3;
+}
+
+/*--------------------------- bl_fastaMateHardClip ---------------------------
+ *
+ * @brief add fixed clip lengths to clip5[1] and clip3[1] of sequence elem,
+ *        the slots used for the mate
+ * @param space unused in this function
+ * @param f     fasta container
+ * @param elem  sequence number
+ * @param p5    number of positions to add to the 5' clip
+ * @param p3    number of positions to add to the 3' clip
+ * @return p5+p3; 0 if the container has no mates or if old and new clips
+ *         together would reach the mate length
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_fastaMateHardClip (void *space, fasta_t *f, Uint elem, Uint p5, Uint p3)
+{
+  Uint pos = elem, matelen;
+  CharSequence *rec;
+
+  if(!bl_fastaHasMate(f)) {
+    return 0;
+  }
+
+  matelen = bl_fastaGetMateLength(f, elem);
+
+  if(bl_fastaHasIndex(f)) {
+    pos = bl_fastxGetChunkElem(NULL, f, elem);
+    if(pos == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      exit(-1);
+    }
+  }
+
+  rec = f->seqs[pos];
+
+  /* refuse clips that would consume the whole mate */
+  if(rec->clip5[1] + rec->clip3[1] + p3 + p5 >= matelen) {
+    return 0;
+  }
+
+  rec->clip5[1] += p5;
+  rec->clip3[1] += p3;
+
+  return p5+p3;
+}
+
+/*------------------------------ bl_fastaSetClip ------------------------------
+ *
+ * @brief overwrite clip5[0] and clip3[0] of sequence elem
+ * @param f    fasta container
+ * @param elem sequence number; translated with bl_fastxGetChunkElem when
+ *             the container has an index
+ * @param p5   new 5' clip value
+ * @param p3   new 3' clip value
+ *
+ */
+
+void
+bl_fastaSetClip (fasta_t *f, Uint elem, Uint p5, Uint p3) {
+  Uint pos = elem;
+
+  if(bl_fastaHasIndex(f)) {
+    pos = bl_fastxGetChunkElem(NULL, f, elem);
+    if(pos == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      exit(-1);
+    }
+  }
+
+  /* unlike the clip functions, this replaces the stored values */
+  f->seqs[pos]->clip5[0] = p5;
+  f->seqs[pos]->clip3[0] = p3;
+}
+
+/*---------------------------- bl_fastaSetMateClip ----------------------------
+ *
+ * @brief overwrite clip5[1] and clip3[1] of sequence elem, the slots used
+ *        for the mate
+ * @param f    fasta container
+ * @param elem sequence number; translated with bl_fastxGetChunkElem when
+ *             the container has an index
+ * @param p5   new 5' clip value
+ * @param p3   new 3' clip value
+ *
+ */
+
+void
+bl_fastaSetMateClip (fasta_t *f, Uint elem, Uint p5, Uint p3) {
+  Uint pos = elem;
+
+  if(bl_fastaHasIndex(f)) {
+    pos = bl_fastxGetChunkElem(NULL, f, elem);
+    if(pos == -1) {
+      DBG("retrieval of sequence %d failed. Exit forced.\n", elem);
+      exit(-1);
+    }
+  }
+
+  /* unlike the clip functions, this replaces the stored values */
+  f->seqs[pos]->clip5[1] = p5;
+  f->seqs[pos]->clip3[1] = p3;
+}
+
+
+
+
+
+/*---------------------------- bl_fastaSetQuality ----------------------------
+ *
+ * @brief attach a quality buffer to record elem. If the record already
+ *        holds a quality string, the new one is appended behind it,
+ *        separated by a '\0'.
+ * @param space handed to the memory macros
+ * @param f     fasta container; f->quals[elem] must exist (asserted)
+ * @param elem  position in f->quals (used directly, no index translation)
+ * @param qlty  quality buffer; ownership passes to the record and the
+ *              buffer may be reallocated
+ * @param len   length of qlty
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fastaSetQuality(void *space, fasta_t* f, Uint elem, char *qlty, Uint len) {
+
+  Uint prevlen, total;
+  char *prevqlty;
+
+  assert(f->quals && f->quals[elem]);
+
+  prevlen = f->quals[elem]->length;
+  prevqlty = f->quals[elem]->sequence;
+  total = len + prevlen;
+
+  if (prevlen != 0) {
+    /* layout after merging: <old> '\0' <new> '\0' */
+    total = len + prevlen + 1;
+    qlty = ALLOCMEMORY(space, qlty, char, total+1);
+
+    /* move the new string to the back first, then copy the old one
+     * into the gap at the front */
+    memmove(&qlty[prevlen+1], qlty, len);
+    memmove(qlty, prevqlty, prevlen);
+    qlty[prevlen] = '\0';
+    qlty[total] = '\0';
+
+    FREEMEMORY(space, prevqlty);
+  }
+
+  f->quals[elem]->length = total;
+  f->quals[elem]->sequence = qlty;
+}
+
+
+/*---------------------------- bl_fastaAddQuality ----------------------------
+ *
+ * @brief grow f->quals by one slot and put a fresh record from
+ *        initSequence at position active_noofseqs
+ * @param space handed to the memory macros and initSequence
+ * @param f     fasta container; active_noofseqs is not changed here
+ * @author Steve Hoffmann
+ *
+ */
+
+
+void
+bl_fastaAddQuality(void *space, fasta_t *f) {
+  Uint slot = f->active_noofseqs;
+
+  f->quals = ALLOCMEMORY(space, (f->quals), CharSequence*, slot+1);
+  assert(f->quals != NULL);
+
+  f->quals[slot] = initSequence(space);
+}
+
+
+/*--------------------------- bl_fastaAddSequence ----------------------------
+ *
+ * @brief grow f->seqs by one slot, put a fresh record from initSequence at
+ *        position active_noofseqs and zero all four of its clip values
+ * @param space handed to the memory macros and initSequence
+ * @param f     fasta container; active_noofseqs is not changed here
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fastaAddSequence(void *space, fasta_t *f) {
+  Uint slot = f->active_noofseqs;
+  Uint side;
+
+  f->seqs = ALLOCMEMORY(space, (f->seqs), CharSequence*, slot+1);
+  assert(f->seqs != NULL);
+
+  f->seqs[slot] = initSequence(space);
+
+  /* index 0 and 1 of clip3/clip5 both start unclipped */
+  for (side = 0; side < 2; side++) {
+    f->seqs[slot]->clip3[side] = 0;
+    f->seqs[slot]->clip5[side] = 0;
+  }
+}
+
+
+/*------------------------------- bl_fastaAdd --------------------------------
+ *
+ * @brief append one sequence with its description to the container and
+ *        update minlen, maxlen and the sequence counters
+ * @param space    handed to the allocating helpers
+ * @param f        fasta container
+ * @param descr    description buffer starting with '@' or '>'
+ * @param descrlen length of descr
+ * @param sequence sequence buffer
+ * @param seqlen   length of sequence
+ * @param n        slot to fill; must equal f->active_noofseqs (asserted)
+ * @author Steve Hoffmann
+ *
+ */
+
+
+void
+bl_fastaAdd(void *space,
+    fasta_t *f,
+    char *descr,
+    Uint descrlen,
+    char *sequence,
+    Uint seqlen,
+    Uint n) {
+
+  assert(n == f->active_noofseqs);
+
+  bl_fastaAddSequence(space, f);
+  bl_fastaSetDescription(f, n, descr, descrlen);
+  bl_fastaSetSequence(space, f, n, sequence, seqlen);
+
+  if (seqlen < f->minlen) {
+    f->minlen = seqlen;
+  }
+  if (seqlen > f->maxlen) {
+    f->maxlen = seqlen;
+  }
+
+  f->active_noofseqs += 1;
+
+  /* the total count is only advanced for containers without an index */
+  if(!f->hasIndex) {
+    f->noofseqs += 1;
+  }
+}
+
+
+/*------------------------------- bl_fastxAdd --------------------------------
+ *
+ * @brief append one fasta or fastq record (sequence, description and
+ *        optionally a quality string) to the container and update minlen,
+ *        maxlen and the sequence counters
+ * @param space    handed to the allocating helpers
+ * @param f        fasta container
+ * @param descr    description buffer starting with '@' or '>'
+ * @param descrlen length of descr
+ * @param sequence sequence buffer
+ * @param quality  quality buffer or NULL; presence must be consistent with
+ *                 the records already stored (asserted)
+ * @param seqlen   length of sequence (also used as quality length)
+ * @param n        slot to fill; must equal f->active_noofseqs (asserted)
+ * @author Steve Hoffmann
+ *
+ */
+
+
+void
+bl_fastxAdd(void *space,
+    fasta_t *f,
+    char *descr,
+    Uint descrlen,
+    char *sequence,
+    char *quality,
+    Uint seqlen,
+    Uint n) {
+
+  /* none of the helpers below changes active_noofseqs, so the target slot
+   * can be read once */
+  Uint slot = f->active_noofseqs;
+
+  assert(n == f->active_noofseqs);
+
+  bl_fastaAddSequence(space, f);
+  bl_fastaSetDescription(f, slot, descr, descrlen);
+  bl_fastaSetSequence(space, f, slot, sequence, seqlen);
+
+  if (seqlen < f->minlen) {
+    f->minlen = seqlen;
+  }
+  if (seqlen > f->maxlen) {
+    f->maxlen = seqlen;
+  }
+
+  if (!quality) {
+    assert(!bl_fastaHasQuality(f));
+  } else {
+    assert(n==0 || bl_fastaHasQuality(f));
+    bl_fastaAddQuality(space, f);
+    bl_fastaSetQuality(space, f, slot, quality, seqlen);
+  }
+
+  f->active_noofseqs += 1;
+
+  /* the total count is only advanced for containers without an index */
+  if (!f->hasIndex) {
+    f->noofseqs += 1;
+  }
+}
+
+
+/*--------------------------- bl_fastaCheckMateID ----------------------------
+ *
+ * @brief check if fasta description matches with mate pair description.
+ *        The ids match if their first '/'-delimited tokens are equal or,
+ *        failing that, if their first blank-delimited tokens are equal.
+ * @param f       fasta container
+ * @param elem    position in f->seqs whose description is compared
+ * @param mateid  '\0'-terminated description of the mate
+ * @param matelen length of mateid (not needed for the comparison)
+ * @return 1 on match, 0 otherwise (also when either string is NULL)
+ * @author Steve Hoffmann
+ *
+ */
+
+/* compare the first tokens of a and b with respect to the delimiter set
+ * delim; leading delimiters are skipped as strtok would do, and a string
+ * without any token counts as an empty token */
+static unsigned char
+bl_fastaFirstTokenEqual(const char *a, const char *b, const char *delim)
+{
+  size_t alen, blen;
+
+  a += strspn(a, delim);
+  b += strspn(b, delim);
+  alen = strcspn(a, delim);
+  blen = strcspn(b, delim);
+
+  return (unsigned char)(alen == blen && memcmp(a, b, alen) == 0);
+}
+
+unsigned char
+bl_fastaCheckMateID(fasta_t* f, Uint elem, char *mateid, Uint matelen) {
+
+  char *desc;
+
+  /* no copies are made, so the length is not needed any more */
+  (void) matelen;
+
+  desc = f->seqs[elem]->description;
+
+  if (desc == NULL || mateid == NULL) {
+    return 0;
+  }
+
+  /* working on the strings in place avoids strtok (which returns NULL for
+   * empty ids and keeps hidden static state) and the temporary buffers */
+  if (bl_fastaFirstTokenEqual(desc, mateid, "/")) {
+    return 1;
+  }
+
+  return bl_fastaFirstTokenEqual(desc, mateid, " ");
+}
+
+
+/*----------------------------- bl_fastaAddMate ------------------------------
+ *
+ * @brief attach a mate to the already stored record n: set its info
+ *        string, append its sequence and update minlen, maxlen and the
+ *        mate counter
+ * @param space    handed to bl_fastaSetSequence
+ * @param f        fasta container
+ * @param descr    mate description starting with '@' or '>'
+ * @param descrlen length of descr
+ * @param sequence mate sequence buffer
+ * @param seqlen   length of sequence
+ * @param n        position of the record the mate belongs to
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fastaAddMate(void *space,
+    fasta_t *f,
+    char *descr,
+    Uint descrlen,
+    char *sequence,
+    Uint seqlen,
+    Uint n) {
+
+  bl_fastaSetMateDescription(f, n, descr, descrlen);
+  /* the id comparison only happens when assertions are compiled in */
+  assert(bl_fastaCheckMateID(f, n, descr, descrlen));
+  bl_fastaSetSequence(space, f, n, sequence, seqlen);
+
+  if (seqlen < f->minlen) {
+    f->minlen = seqlen;
+  }
+  if (seqlen > f->maxlen) {
+    f->maxlen = seqlen;
+  }
+
+  f->active_noofmates += 1;
+}
+
+
+/*----------------------------- bl_fastxAddMate ------------------------------
+ *
+ * @brief attach a mate, with or without quality string, to the already
+ *        stored record n and update minlen, maxlen and the mate counter
+ * @param space    handed to the allocating helpers
+ * @param f        fasta container
+ * @param descr    mate description starting with '@' or '>'
+ * @param descrlen length of descr
+ * @param sequence mate sequence buffer
+ * @param quality  mate quality buffer or NULL; presence must agree with
+ *                 the container (asserted)
+ * @param seqlen   length of sequence (also used as quality length)
+ * @param n        position of the record the mate belongs to
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fastxAddMate(void *space,
+    fasta_t *f,
+    char *descr,
+    Uint descrlen,
+    char *sequence,
+    char *quality,
+    Uint seqlen,
+    Uint n) {
+
+  bl_fastaSetMateDescription(f, n, descr, descrlen);
+  /* the id comparison only happens when assertions are compiled in */
+  assert(bl_fastaCheckMateID(f, n, descr, descrlen));
+
+  bl_fastaSetSequence(space, f, n, sequence, seqlen);
+
+  if (!quality) {
+    assert(!bl_fastaHasQuality(f));
+  } else {
+    assert(bl_fastaHasQuality(f));
+    bl_fastaSetQuality(space, f, n, quality, seqlen);
+  }
+
+  if (seqlen < f->minlen) {
+    f->minlen = seqlen;
+  }
+  if (seqlen > f->maxlen) {
+    f->maxlen = seqlen;
+  }
+
+  f->active_noofmates += 1;
+}
+
+
+/*------------------------------- bl_fastaChop -------------------------------
+ *
+ * @brief split the active sequences of f into `pieces` consecutive parts.
+ *        Every part gets its own seqs (and quals) pointer array, but the
+ *        records themselves and the matestart table are shared with f.
+ *        The last part additionally receives the division remainder.
+ * @param space  handed to the memory macros
+ * @param f      fasta container to split
+ * @param pieces number of parts (used as divisor)
+ * @return array of `pieces` fasta containers
+ * @author Steve Hoffmann
+ *
+ */
+
+fasta_t**
+bl_fastaChop(void *space, fasta_t* f, Uint pieces) {
+  Uint size, r, i, j, offset=0, count, seqlen;
+  fasta_t **chops, *part;
+
+  size = f->active_noofseqs/pieces;
+  r = f->active_noofseqs-(size*pieces);
+  assert((pieces*size)+r == f->active_noofseqs);
+
+  chops = ALLOCMEMORY(space, NULL, fasta_t*, pieces);
+
+  for(i=0; i < pieces; i++) {
+
+    part = bl_fastaInit(NULL);
+    chops[i] = part;
+
+    /* all parts have `size` sequences, the last one takes the rest too */
+    count = (i < pieces-1) ? size : size+r;
+    part->active_noofseqs = count;
+    part->noofseqs = count;
+
+    part->seqs = ALLOCMEMORY(space, NULL, CharSequence*, part->active_noofseqs);
+    if (bl_fastaHasQuality(f)) {
+      part->quals =
+        ALLOCMEMORY(space, NULL, CharSequence*, part->active_noofseqs);
+    }
+
+    /* the part's matestart points into the table owned by f */
+    if (bl_fastaHasMate(f)) {
+      part->matestart = &f->matestart[offset];
+    }
+
+    for(j=0; j < part->active_noofseqs; j++) {
+      part->seqs[j] = f->seqs[j+offset];
+
+      if (bl_fastaHasQuality(f)) {
+        part->quals[j] = f->quals[j+offset];
+      }
+
+      seqlen = f->seqs[j+offset]->length;
+      if (seqlen < part->minlen) {
+        part->minlen = seqlen;
+      }
+      if (seqlen > part->maxlen) {
+        part->maxlen = seqlen;
+      }
+    }
+
+    offset += part->active_noofseqs;
+  }
+
+  return chops;
+}
+
+
+
+/*---------------------------- bl_fastxChopIndex -----------------------------
+ *
+ * @brief split an indexed container into `pieces` containers, each owning
+ *        a consecutive slice of the chunk index (and of the mate chunk
+ *        index if present). Every piece starts as a byte copy of f with no
+ *        active chunk and no loaded sequences; the first pieces get one
+ *        chunk more until the division remainder is used up.
+ * @param space  handed to the memory macros
+ * @param f      container with index (asserted); must have at least
+ *               `pieces` chunks (asserted)
+ * @param pieces number of parts
+ * @return array of `pieces` fasta containers
+ * @author Steve Hoffmann
+ *
+ */
+
+/* copy `count` access points of src starting at `first` into a new index
+ * and make their cumulative sequence counts relative to the slice */
+static fastxseqindex_t*
+bl_fastxSliceSeqIndex(void *space, fastxseqindex_t *src, Uint first, Uint count)
+{
+  Uint j, base = 0;
+  fastxseqindex_t *part;
+
+  part = ALLOCMEMORY(space, NULL, fastxseqindex_t, 1);
+  part->ap = ALLOCMEMORY(space, NULL, seqaccesspoint_t, count);
+  memmove(part->ap, &src->ap[first], count * sizeof(seqaccesspoint_t));
+  part->size = count;
+  part->allocated = count;
+
+  /* sequences in front of the slice no longer count */
+  if (first) {
+    base = src->ap[first-1].cumnoofseqs;
+  }
+
+  for(j=0; j < count; j++) {
+    part->ap[j].cumnoofseqs -= base;
+  }
+
+  return part;
+}
+
+fasta_t**
+bl_fastxChopIndex(void *space, fasta_t *f, Uint pieces)
+{
+  Uint i, noofchunks, chunksperpiece, rest=0, chunks, chunkoff=0;
+
+  fasta_t **chops, *part;
+  fastxseqindex_t *chunkindex = NULL;
+  fastxseqindex_t *matechunkindex = NULL;
+
+  assert(f->hasIndex);
+  assert(pieces <= f->chunkindex->size);
+
+  chops = ALLOCMEMORY(space, NULL, fasta_t*, pieces);
+  noofchunks = f->chunkindex->size;
+  chunksperpiece = noofchunks/pieces;
+  rest = noofchunks - (pieces*chunksperpiece);
+
+  for(i=0; i < pieces; i++) {
+
+    /* start from a byte copy of f, then detach everything chunk related */
+    part = bl_fastaInit(space);
+    chops[i] = part;
+    memmove(part, f, sizeof(fasta_t));
+    part->chunkIsActive = 0;
+    part->active_noofseqs = 0;
+    part->active_noofmates = 0;
+    part->seqs = NULL;
+    part->quals = NULL;
+    part->matestart = NULL;
+
+    chunks = chunksperpiece;
+    if(rest > 0) {
+      rest--;
+      chunks++;
+    }
+
+    chunkindex = bl_fastxSliceSeqIndex(space, f->chunkindex, chunkoff, chunks);
+    if(f->matechunkindex) {
+      matechunkindex =
+        bl_fastxSliceSeqIndex(space, f->matechunkindex, chunkoff, chunks);
+    }
+
+    part->chunkindex = chunkindex;
+    part->matechunkindex = matechunkindex;
+    if (matechunkindex)
+      assert (chunkindex->ap[chunks-1].cumnoofseqs == matechunkindex->ap[chunks-1].cumnoofseqs);
+    part->noofseqs = chunkindex->ap[chunks-1].cumnoofseqs;
+    chunkoff += chunks;
+  }
+
+  return chops;
+}
+
+
+
+
+/*------------------------------- bl_fastxScan -------------------------------
+ *
+ * @brief scan fasta or fastq format: count the records of a file starting
+ *        at offset, collect access points for later seeking and report
+ *        sequence length and quality character ranges
+ * @param space    handed to the memory macros
+ * @param filename file to scan
+ * @param index    gzip random access index; if non-NULL the file is read
+ *                 through the gzidx routines, otherwise as plain text
+ * @param offset   position at which scanning starts
+ * @param findex   receives the access points: one for the start offset and
+ *                 one for each record start lying more than SPAN behind
+ *                 the previous access point or ending the max-th record
+ * @param max      stop after this many records; 0 scans to the end
+ * @param minlen   receives the shortest record length seen
+ * @param maxlen   receives the longest record length seen
+ * @param minq     receives the smallest quality character seen (255 if none)
+ * @param maxq     receives the largest quality character seen (0 if none)
+ * @return number of records counted
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_fastxScan(
+    void *space,
+    char *filename, struct access *index,
+    off_t offset, fastxfileindex_t* findex,
+    Uint max, Uint *minlen, Uint *maxlen, unsigned char *minq, unsigned char *maxq)
+{
+
+  /* int, not char: EOF must stay distinguishable from every data byte */
+  int ch;
+  char idchar=0;
+  int ret=0;
+  off_t curseqoffset, lastindexoffset=0;
+  /* parser state: which part of a record the current character is in */
+  unsigned char desc = 0;
+  unsigned char fastq = 0;
+  unsigned char qualdesc = 0;
+  unsigned char qual = 0;
+  unsigned char seq = 0;
+  unsigned char minqual = 255, maxqual = 0;
+  FILE *fp;
+  Uint seqlen = 0;
+  Uint n = 0;
+  Uint len = 0;
+
+  struct gzidxfile *gzf = NULL;
+
+  if (index) {
+    fp = fopen(filename, "rb");
+    if (fp == NULL) {
+      fprintf(stderr, "Couldnt open %s for reading. Exit forced.\n", filename);
+      exit(-1);
+    }
+    gzf = bl_initgzidxfile(fp, index, offset, MEDIUMCHUNK);
+
+  } else {
+
+    fp = fopen(filename, "r");
+    if (fp == NULL) {
+      fprintf(stderr, "Couldnt open %s for reading. Exit forced.\n", filename);
+      exit(-1);
+    }
+
+    ret = fseeko(fp, offset, SEEK_SET);
+
+    if (ret == -1) {
+      MSG("fseeko failed. Exit forced.\n");
+      exit(-1);
+    }
+  }
+
+  if(findex->size+2 >= findex->allocated) {
+    findex->ap = ALLOCMEMORY(space, findex->ap,
+        seqaccesspoint_t, findex->allocated+11);
+    findex->allocated += 11;
+  }
+
+  /* first access point: the start offset itself */
+  findex->ap[findex->size].offset = offset;
+  findex->ap[findex->size].noofseqs = 0;
+  findex->size++;
+
+  lastindexoffset = offset;
+
+  while((ch = (index) ? bl_getgzidxc(gzf) : getc(fp)) != EOF) {
+
+    /* the first '@' or '>' decides between fastq and fasta */
+    if((ch == '@' || ch == '>') && !idchar) {
+      desc = 1;
+      fastq = (ch == '@');
+      idchar = (char) ch;
+    }
+
+    /* an id character starts a new record if the previous one is complete:
+     * fasta needs a non-empty sequence, fastq a quality string of the same
+     * length as the sequence */
+    if(ch==idchar
+        && ((!fastq && len > 0) || (qual && len > 0 && len == seqlen))
+      )
+    {
+      seq = 0;
+      qual = 0;
+      desc = 1;
+
+      /* n counts completed records, so the first one arrives with n == 0
+       * and initializes the range (as the end-of-file case below does) */
+      if (n == 0 || *minlen > len) {
+        *minlen = len;
+      }
+
+      if(n == 0 || *maxlen < len) {
+        *maxlen = len;
+      }
+
+      n++;
+      seqlen = 0;
+      len = 0;
+
+      if(index) {
+        curseqoffset = bl_ftellgzidx(gzf);
+      } else {
+        curseqoffset = ftello(fp);
+        if (curseqoffset == -1) {
+          MSG("ftello failed. Exit forced.\n");
+          exit(-1);
+        }
+      }
+
+      if(curseqoffset > lastindexoffset + SPAN || (max && n == max)) {
+
+        if(findex->size+2 >= findex->allocated) {
+          findex->ap = ALLOCMEMORY(space, findex->ap,
+              seqaccesspoint_t, findex->allocated+11);
+          findex->allocated += 11;
+        }
+
+        /* the id character has been consumed already, hence the -1 */
+        findex->ap[findex->size].offset = curseqoffset-1;
+        findex->ap[findex->size].noofseqs = n-1;
+        findex->size++;
+
+        lastindexoffset = curseqoffset;
+      }
+
+      if(max && n == max) {
+        break;
+      }
+    }
+
+
+    if(qual && len > seqlen) {
+      NFO("fastq error: qual string > nt string: %d\n", n);
+      exit(-1);
+    }
+
+    /* line terminators are not quality values and must not enter the range */
+    if(qual && ch != '\n' && ch != '\r') {
+      if(ch < minqual) minqual = (unsigned char) ch;
+      if(ch > maxqual) maxqual = (unsigned char) ch;
+    }
+
+    /* '+' behind a non-empty sequence opens the quality description line */
+    if(fastq && ch=='+' && seq && len > 0) {
+      seq = 0;
+      qualdesc = 1;
+      seqlen = len;
+      len = 0;
+    }
+
+    if(!desc && !qualdesc && ch =='\n') {
+      /*do nothing.*/
+    } else {
+      if(desc && ch == '\n') {
+        len = 0;
+        desc = 0;
+        seq = 1;
+      } else if (fastq && qualdesc && ch == '\n') {
+        len = 0;
+        qualdesc = 0;
+        qual = 1;
+      } else {
+        if (ch == '\r') continue;
+        len++;
+      }
+    }
+  }
+
+  /* the last record of the file is not followed by an id character */
+  if ((!fastq && seq && len > 0)||(fastq && qual && len > 0 && len == seqlen)){
+    if (n == 0 || *minlen > len) {
+      *minlen = len;
+    }
+    if(n == 0 || *maxlen < len) {
+      *maxlen = len;
+    }
+    n++;
+  }
+
+
+  *minq = minqual;
+  *maxq = maxqual;
+
+  fclose(fp);
+  if(index) bl_destructgzidxfile(gzf);
+  FREEMEMORY(space, gzf);
+
+  return n;
+}
+
+
+/*--------------------------- bl_fastxChunkIndex ----------------------------
+ *
+ * @brief adjust offset-index for n seqs to chunks with approx k seqs
+ *
+ *        For every file j, n[j] sequences are divided into chunks of about
+ *        k sequences and one access point per chunk is appended to the
+ *        returned index (noofseqs, cumulative count over all files, file
+ *        offset, file id). The first chunk of a file gets offset 0. For
+ *        each further chunk the scan index findex[j] is searched for the
+ *        last access point in front of the chunk start; if sequences have
+ *        to be skipped from there, the file is rescanned from that access
+ *        point with bl_fastxScan limited to `skip` records, and the offset
+ *        of the last access point written to the temporary index is used.
+ *
+ * @param space     handed to the memory macros and helpers
+ * @param filenames one file name per file
+ * @param gzindex   per-file gzip access indices, or NULL for plain files
+ * @param findex    per-file access points as produced by bl_fastxScan
+ * @param n         number of sequences per file
+ * @param nooffiles number of files
+ * @param total     sum of sequences; only used to size the new index
+ * @param k         requested sequences per chunk (used as divisor)
+ * @return the new chunk index
+ *
+ * NOTE(review): idx is created with chunks+1000 as size argument and is
+ *   never grown in this function - presumably that argument is a capacity;
+ *   confirm it cannot be exceeded for many small files.
+ * NOTE(review): the quality arguments are handed to bl_fastxScan as
+ *   (&maxqual, &minqual) although its parameters are (minq, maxq); the
+ *   values are not read afterwards in this function.
+ *
+ * @author Steve Hoffmann
+ *
+ */
+
+fastxseqindex_t*
+bl_fastxChunkIndex (void *space, char **filenames, struct access **gzindex,
+ fastxfileindex_t **findex, Uint *n, Uint nooffiles, Uint total, Uint k)
+{
+ Uint i=0, j, chunks, cur=0, skip, chunksize, curchunksize=0, rest=0,
+ nseqs=0, nseqsfile=0, minlen=0, maxlen=0, off=0;
+ unsigned char minqual = 255, maxqual = 0;
+ //Uint l;
+ //char ch;
+ // FILE *fp=NULL;
+
+ fastxseqindex_t *idx;
+ fastxfileindex_t *tmp;
+
+ chunks = (k>total) ? 1 : total/k; /* estimate over all files, for sizing only */
+ idx = bl_fastxInitSeqIndex(space, chunks+1000);
+
+
+ //fprintf(stderr, "start fastxchunkindex\n");
+
+ for(j=0; j < nooffiles; j++) {
+
+ chunks = (k>n[j]) ? 1 : n[j]/k; /* number of chunks for this file */
+ chunksize = (k > n[j]) ? n[j] : k+((n[j]%k)/chunks); /* remainder spread evenly */
+ rest = (n[j]%k) % chunks; /* what is still left: one extra sequence for the
+ * first `rest` chunks */
+
+ curchunksize = chunksize;
+
+ if(rest > 0) {
+ rest--;
+ curchunksize++;
+ }
+
+ nseqs += curchunksize;
+ idx->ap[idx->size].noofseqs = curchunksize;
+ idx->ap[idx->size].cumnoofseqs = nseqs;
+ idx->ap[idx->size].offset = 0; /* first chunk starts at the file start */
+ idx->ap[idx->size].fileid = j;
+ idx->size++;
+
+
+ tmp = bl_fastxInitFileIndex(space, chunks);
+ nseqsfile = curchunksize; /* sequences of this file covered so far */
+
+ for(i=1; i < chunks; i++) {
+
+ cur = 0;
+ while (cur+1 < findex[j]->size &&
+ findex[j]->ap[cur+1].noofseqs < nseqsfile)
+ cur++; /* last access point whose successor lies before the chunk start */
+
+ if(findex[j]->ap[cur].noofseqs) {
+ off = findex[j]->ap[cur].noofseqs+1;
+ } else {
+ off = 0;
+ }
+
+ skip = nseqsfile - off; /* records between access point and chunk start */
+
+
+ // fprintf(stderr, "chunk %d in file %d (curchunksize %d): skiping %d (= nseqsfile:%d - findex.noofseqs:%d)\n",
+ // i, j, curchunksize, skip, nseqsfile, findex[j]->ap[cur].noofseqs);
+ // fprintf(stderr, "at offset: %lu\n", findex[j]->ap[cur].offset);
+ // fprintf(stderr, "@%d\n", nseqsfile);
+
+
+ if(skip) {
+ if(gzindex) {
+ bl_fastxScan(space, filenames[j], gzindex[j],
+ findex[j]->ap[cur].offset, tmp, skip, &minlen, &maxlen, &maxqual, &minqual);
+ } else {
+ bl_fastxScan(space, filenames[j], NULL,
+ findex[j]->ap[cur].offset, tmp, skip, &minlen, &maxlen, &maxqual, &minqual);
+ }
+ } else {
+ if(tmp->size+2 >= tmp->allocated) {
+ tmp->ap = ALLOCMEMORY(space, tmp->ap,
+ seqaccesspoint_t, tmp->allocated+11);
+ tmp->allocated += 11;
+ }
+
+ tmp->ap[tmp->size].offset = findex[j]->ap[cur].offset;
+ tmp->ap[tmp->size].noofseqs = curchunksize;
+ tmp->size++;
+ }
+
+ /*===================
+
+ fp = fopen(filenames[j],"r");
+ if (fp == NULL) {
+ fprintf(stderr, "Couldnt open %s for reading. Exit forced.\n", filenames[j]);
+ exit(-1);
+ }
+
+ fseeko(fp, tmp->ap[tmp->size-1].offset, SEEK_SET);
+
+ for(l=0; l < 50; l++) {
+ ch = getc(fp);
+ fprintf(stderr, "%c", ch);
+ }
+
+ fprintf(stderr, "\n");
+ fclose(fp);
+ ===================== */
+
+ curchunksize = chunksize;
+
+ if(rest > 0) {
+ rest--;
+ curchunksize++;
+ }
+
+ nseqs += curchunksize;
+ nseqsfile += curchunksize;
+ idx->ap[idx->size].noofseqs = curchunksize;
+ idx->ap[idx->size].cumnoofseqs = nseqs;
+ idx->ap[idx->size].offset = tmp->ap[tmp->size-1].offset; /* newest entry of tmp */
+ idx->ap[idx->size].fileid = j;
+ idx->size++;
+ }
+
+
+ FREEMEMORY(space, tmp->ap);
+ FREEMEMORY(space, tmp);
+ }
+
+ return idx;
+}
+
+
+
+
+/*----------------------------- bl_fastxGetChunk -----------------------------
+ *
+ * @brief return chunknumber for sequence k, i.e. the position i in the
+ *        chunk index with cumnoofseqs[i-1] <= k < cumnoofseqs[i]
+ * @param fasta container with chunk index
+ * @param k     sequence number
+ * @return chunk number, or -1 if k is not below fasta->noofseqs or the
+ *         chunk index is missing or empty. If the index does not contain
+ *         k although it should, the index is dumped via DBG and the
+ *         program exits.
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+bl_fastxGetChunk (fasta_t *fasta, Uint k)
+{
+  Uint chunksize, i, chunks, cumcur, cumprev;
+
+  if (k >= fasta->noofseqs || !fasta->chunkindex)
+    return -1;
+
+  chunks = fasta->chunkindex->size;
+
+  /* without any chunk there is not even an ap[0] to look at */
+  if (chunks == 0)
+    return -1;
+
+  /* first guess assuming equally sized chunks; an empty first chunk would
+   * otherwise be a division by zero */
+  chunksize = fasta->chunkindex->ap[0].noofseqs;
+  i = (chunksize > 0) ? k/chunksize : 0;
+  if (i >= chunks) i=0;
+
+  /* correct the guess upwards, then downwards */
+  while (i < chunks && fasta->chunkindex->ap[i].cumnoofseqs <= k) i++;
+  while (i>0 && fasta->chunkindex->ap[i-1].cumnoofseqs > k) i--;
+
+  /* i == chunks means that k lies behind the last chunk; ap[i] must not
+   * be read in this case */
+  if (i >= chunks || fasta->chunkindex->ap[i].cumnoofseqs <= k
+      || ( i>0 && fasta->chunkindex->ap[i-1].cumnoofseqs > k)) {
+
+    cumcur = (i < chunks) ? fasta->chunkindex->ap[i].cumnoofseqs : 0;
+    cumprev = (i > 0) ? fasta->chunkindex->ap[i-1].cumnoofseqs : 0;
+
+    DBG("chunk not found: chunks:%d, i:%d, idx[i]:%d, idx[i-1]:%d, k:%d\n",
+        chunks, i, cumcur, cumprev, k);
+
+    /* dump the complete index to ease debugging */
+    i=0;
+    DBG("list: chunks:%d, i:%d, idx[i]:%d, idx[i-1]:%d, k:%d, fid:%d\n",
+        chunks, i, fasta->chunkindex->ap[i].cumnoofseqs, 0, k,
+        fasta->chunkindex->ap[i].fileid);
+
+    for(i=1; i < chunks; i++) {
+
+      DBG("list: chunks:%d, i:%d, idx[i]:%d, idx[i-1]:%d, k:%d, fid:%d\n",
+          chunks, i, fasta->chunkindex->ap[i].cumnoofseqs,
+          fasta->chunkindex->ap[i-1].cumnoofseqs, k,
+          fasta->chunkindex->ap[i].fileid);
+    }
+
+    exit(-1);
+    return -1;
+  }
+
+  return i;
+}
+
+
+
+/*------------------------- bl_fastxGetChunkElem -------------------------
+ *
+ * @brief load chunk if necessary and return position of sequence k.
+ *        If the active chunk already contains k, only the position within
+ *        that chunk is computed. Otherwise the loaded sequences are
+ *        destructed, the chunk containing k is looked up and read from
+ *        its file (plus the mate file if the container has mates).
+ * @param space handed to the reading and destructing routines
+ * @param f     container with chunk index
+ * @param k     sequence number
+ * @return position of sequence k within the (now) active chunk
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_fastxGetChunkElem (void *space, fasta_t *f, Uint k)
+{
+  Uint off=0, fid;
+  int cur = f->curchunk;
+
+  if (f->chunkIsActive) {
+    if(!(f->chunkindex->ap[cur].cumnoofseqs > k &&
+          (cur==0 || f->chunkindex->ap[cur-1].cumnoofseqs <= k)))
+    {
+      /* k lies outside of the active chunk: drop what is loaded */
+      bl_fastxDestructSequence(space, f);
+      f->active_noofseqs = 0;
+    } else {
+      /* hit: subtract the sequences of all chunks in front */
+      if (cur>0) {
+        off = f->chunkindex->ap[cur-1].cumnoofseqs;
+      }
+      assert(k>=off);
+      return k-off;
+    }
+  }
+
+  cur = bl_fastxGetChunk (f, k);
+
+  assert(cur > -1);
+  fid = f->chunkindex->ap[cur].fileid;
+
+  if (!f->gzip) {
+    f = bl_fastxRead(space, f, f->filenames[fid],
+        f->upper, f->lower, f->chunkindex->ap[cur].offset,
+        0, f->chunkindex->ap[cur].noofseqs,
+        bl_fastxAdd);
+
+    if (f->hasMates) {
+      f = bl_fastxRead(space, f, f->matefilenames[fid],
+          f->upper, f->lower, f->matechunkindex->ap[cur].offset,
+          0, f->matechunkindex->ap[cur].noofseqs,
+          bl_fastxAddMate);
+    }
+  } else {
+    f = bl_fastxgzRead(space, f, f->filenames[fid], f->gzindex[fid],
+        f->upper, f->lower, f->chunkindex->ap[cur].offset, 0,
+        f->chunkindex->ap[cur].noofseqs, bl_fastxAdd);
+
+    if (f->hasMates) {
+      f = bl_fastxgzRead(space, f, f->matefilenames[fid], f->mategzindex[fid],
+          f->upper, f->lower, f->matechunkindex->ap[cur].offset, 0,
+          f->matechunkindex->ap[cur].noofseqs, bl_fastxAddMate);
+    }
+  }
+
+  if (cur>0) {
+    off = f->chunkindex->ap[cur-1].cumnoofseqs;
+  }
+
+  f->chunkIsActive = 1;
+  f->curchunk = cur;
+
+  assert(k>=off);
+  return k-off;
+}
+
+/*------------------------------ bl_fastxIndex ------------------------------
+ *
+ * @brief build index of fasta or fastq file(s): every file is scanned with
+ *        bl_fastxScan and a chunk index is derived with
+ *        bl_fastxChunkIndex. Without isMate a new container is created and
+ *        returned; with isMate the index is attached as mate index to the
+ *        given container after checking that file count, sequence counts
+ *        and chunk sizes agree with the query index (the program exits on
+ *        any mismatch).
+ * @param space     handed to the memory macros and helpers
+ * @param set       container of the query sequences; only used if isMate
+ * @param filenames files to index; the array is referenced by the
+ *                  container, not copied
+ * @param nooffiles number of files
+ * @param isMate    non-zero if the files hold the mates for `set`
+ * @param gzip      non-zero if the files are gzip compressed
+ * @param pieces    number of pieces the sequences will be split into
+ *                  later; it bounds the chunk size (0 is treated as 1)
+ * @return the indexed container, with no chunk loaded
+ * @author Steve Hoffmann
+ *
+ */
+
+fasta_t*
+bl_fastxIndex(void *space, fasta_t* set, char **filenames,
+    Uint nooffiles, unsigned char isMate, unsigned char gzip, Uint pieces) {
+
+  Uint i;
+  int len=0;
+  Uint total=0, *ftotal=NULL;
+  Uint chunksize = 10000, minlen=0, maxlen=0, seqsperpiece, noofchunks, rest;
+  unsigned char minqual =0, maxqual = 0;
+
+  struct access **gzindex = NULL;
+  fastxfileindex_t **findex = NULL;
+  fastxseqindex_t *chunkindex;
+
+
+  if(gzip) {
+    gzindex = ALLOCMEMORY(space, NULL, struct access*, nooffiles);
+    for(i=0; i < nooffiles; i++) {
+      gzindex[i] = bl_zranGetIndex(filenames[i], &len);
+    }
+  }
+
+  findex = ALLOCMEMORY(space, NULL, fastxfileindex_t*, nooffiles);
+  ftotal = ALLOCMEMORY(space, NULL, Uint, nooffiles);
+
+  for(i=0; i < nooffiles; i++) {
+    findex[i] = bl_fastxInitFileIndex(space, len);
+
+    if(gzip)
+      ftotal[i] = bl_fastxScan(space, filenames[i], gzindex[i],
+          0, findex[i], 0, &minlen, &maxlen, &minqual, &maxqual);
+    else
+      ftotal[i] = bl_fastxScan(space, filenames[i], NULL,
+          0, findex[i], 0, &minlen, &maxlen, &minqual, &maxqual);
+
+    total += ftotal[i];
+  }
+
+  /* pieces is a divisor below */
+  if (pieces == 0) {
+    pieces = 1;
+  }
+
+  seqsperpiece = MAX(1, total / pieces);
+
+  /* a chunk must not be larger than a piece; otherwise the default chunk
+   * size is enlarged so that the chunks fill a piece evenly */
+  if (seqsperpiece < chunksize) {
+    chunksize = seqsperpiece;
+  } else {
+    noofchunks = seqsperpiece / chunksize;
+    rest = seqsperpiece - (noofchunks * chunksize);
+    chunksize += rest/noofchunks;
+  }
+  chunkindex = bl_fastxChunkIndex(space, filenames, gzindex, findex,
+      ftotal, nooffiles, total, chunksize);
+
+  if(!isMate) {
+
+    set = bl_fastaInit(space);
+    set->hasIndex = 1;
+    set->active_noofseqs=0;
+
+
+    set->chunkIsActive = 0;
+    set->chunkindex = chunkindex;
+    set->findex = findex;
+    set->gzindex = gzindex;
+    set->filenames = filenames;
+    set->gzip = gzip;
+    set->noofseqs = total;
+    set->upper = 1;
+    set->lower = 0;
+    set->nooffiles = nooffiles;
+    set->filetotal = ftotal;
+    set->minlen = minlen;
+    set->maxlen = maxlen;
+    set->minqual = minqual;
+    set->maxqual = maxqual;
+
+  } else {
+
+    if (set == NULL || set->nooffiles != nooffiles ||
+        set->noofseqs != total || set->chunkindex == NULL ||
+        set->chunkindex->size != chunkindex->size) {
+      MSG("1: Reading mates failed: mate and query files differ in size!\n");
+      /* the details can only be printed for what actually exists */
+      if (set != NULL) {
+        NFO("set->nooffiles %d = %d nooffiles\n", set->nooffiles, nooffiles);
+        NFO("set->noofseqs %d = %d noofseqs\n", set->noofseqs, total);
+        if (set->chunkindex != NULL) {
+          NFO("set->chunkindex->size %d = %d chunkindex->size",
+              set->chunkindex->size, chunkindex->size);
+        }
+      }
+      exit(-1);
+    }
+
+    for(i=0; i < nooffiles; i++) {
+      if (ftotal[i] != set->filetotal[i]) {
+        MSG("2: Reading mates failed: mate and query files differ in size!\n");
+        exit(-1);
+      }
+    }
+
+    for(i=0; i < chunkindex->size; i++) {
+      if(chunkindex->ap[i].noofseqs != set->chunkindex->ap[i].noofseqs) {
+        MSG("3: Reading mates failed: mate and query files differ in size!\n");
+        exit(-1);
+      }
+    }
+
+    /* the per-file totals equal those of the query files */
+    FREEMEMORY(space, ftotal);
+
+    set->matefilenames = filenames;
+    set->matechunkindex = chunkindex;
+    set->matefindex = findex;
+    set->mategzindex = gzindex;
+    set->hasMates = 1;
+
+    /* widen the length and quality ranges by those of the mates */
+    if(set->maxlen < maxlen) {
+      set->maxlen = maxlen;
+    }
+
+    if(set->minlen > minlen) {
+      set->minlen = minlen;
+    }
+
+    if(set->minqual > minqual) {
+      set->minqual = minqual;
+    }
+
+    if(set->maxqual < maxqual) {
+      set->maxqual = maxqual;
+    }
+
+  }
+
+  set->active_noofseqs = 0;
+  set->chunkIsActive = 0;
+
+  return set;
+}
+
+/*------------------------------- bl_fastxRead -------------------------------
+ *
+ * @brief read fasta or fastq format
+ *        Streams the file character by character and calls handler once for
+ *        every complete record. The description handed to the handler still
+ *        contains its leading id character ('>' or '@'); the quality pointer
+ *        is NULL for fasta records.
+ * @param space    handed through to the memory macros and to the handler
+ * @param fasta    container to fill; a new one is created if NULL
+ * @param filename file to read
+ * @param upper    if set, sequence characters are converted to upper case
+ * @param lower    if set (and upper is not), sequence characters are
+ *                 converted to lower case
+ * @param offset   if > 0, position to seek to before reading
+ * @param startseq number passed to the handler for the first record
+ * @param lastseq  if non-zero, reading stops at the first record whose
+ *                 number is >= lastseq-1; that record is still delivered
+ * @param handler  callback (space, fasta, descr, descrlen, seq, qual, len, n)
+ * @return the (possibly newly created) fasta container
+ * @author Steve Hoffmann
+ *
+ */
+
+fasta_t*
+bl_fastxRead(
+    void *space,
+    fasta_t* fasta,
+    char* filename,
+    unsigned char upper,
+    unsigned char lower, off_t offset, Uint startseq, Uint lastseq,
+    //unsigned char gzip, struct access *index,
+    void (*handler)
+    (void *, fasta_t*, char *, Uint, char *, char *, Uint, Uint))
+{
+
+  FILE *fp;
+  /* int, not char: the value has to be able to represent EOF distinctly
+   * from every valid byte (a 0xFF byte must not end the loop, and the loop
+   * must terminate where plain char is unsigned) */
+  int ch;
+  char *buffer;
+  char *descrbuffer = NULL;
+  char *seqbuffer = NULL;
+  char *qualbuffer = NULL;
+  char idchar=0;
+  int ret=0;
+  unsigned char desc = 0;
+  unsigned char fastq = 0;
+  unsigned char qualdesc = 0;
+  unsigned char qual = 0;
+  unsigned char seq = 0;
+  unsigned char gzip = 0;       /* gzip support is disabled in this reader */
+  struct gzidxfile *gzf = NULL;
+  struct access * index = NULL;
+
+  Uint descrlength = 0;
+  Uint seqlen = 0;
+  Uint buffersize = MAXBUFFERSIZE;
+  Uint n = startseq;
+  Uint len = 0;
+
+  buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+  if (fasta == NULL) fasta = bl_fastaInit(space);
+
+  fp = fopen(filename, gzip ? "rb" : "r");
+
+  /* check the stream before anything else touches it */
+  if (fp == NULL) {
+    NFO("fastxRead: Couldn't open file '%s': %d. Exit forced.\n", filename, errno);
+    exit(-1);
+  }
+
+  if(gzip) {
+    gzf = bl_initgzidxfile(fp, index, offset, MEDIUMCHUNK);
+  }
+
+  if(offset > 0) {
+    ret = fseeko(fp, offset, SEEK_SET);
+    if (ret == -1) {
+      NFO("fastxRead: fseeko failed for file %s. Exit forced.\n", filename);
+      exit(-1);
+    }
+  }
+
+
+  while((ch = (gzip) ? bl_getgzidxc(gzf) : getc(fp)) != EOF) {
+
+    /* grow the working buffer before it can overflow */
+    if(len == buffersize-1) {
+      buffersize = 2*buffersize+1;
+      buffer = ALLOCMEMORY(space, buffer, char, buffersize);
+    }
+
+    /* the very first '@' or '>' fixes the format for the whole file */
+    if((ch == '@' || ch == '>') && !idchar) {
+      desc = 1;
+      fastq = (ch == '@');
+      idchar = ch;
+    }
+
+    /* fastq only: '+' ends the sequence, keep it and start the quality id */
+    if(fastq && ch=='+' && seq && len > 0) {
+      buffer = ALLOCMEMORY(space, buffer, char, len+1);
+      buffer[len] = '\0';
+      seq = 0;
+      qualdesc = 1;
+
+      seqbuffer = buffer;
+      seqlen = len;
+      len = 0;
+
+      buffersize = MAXBUFFERSIZE;
+      buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+    }
+
+    if(qual && len > seqlen) {
+      NFO("fastq format error: quality string longer than nt string: %s\n", descrbuffer);
+      exit(-1);
+    }
+
+    assert(!qual || len <= seqlen); // v-- && seqlen > 0 produces segfault!!
+    /* an id character after a complete record starts the next record */
+    if(ch==idchar && ((!fastq && len > 0 ) || (qual && len > 0 && len == seqlen))) {
+
+      /* the pending record is delivered after the loop */
+      if(lastseq && n >= lastseq-1) {
+        break;
+      }
+
+      seq = 0;
+      qual = 0;
+      desc = 1;
+
+      buffer = ALLOCMEMORY(space, buffer, char, len+1);
+      buffer[len] = '\0';
+
+      assert (!fastq || seqbuffer);
+
+      /* fasta: buffer holds the sequence; fastq: it holds the qualities */
+      if (!seqbuffer) {
+        seqbuffer = buffer;
+      } else {
+        qualbuffer = buffer;
+      }
+
+      handler(space, fasta, descrbuffer, descrlength,
+          seqbuffer, qualbuffer, len, n);
+      n++;
+
+      descrlength = 0;
+      descrbuffer = NULL;
+      seqlen = 0;
+      seqbuffer = NULL;
+      qualbuffer = NULL;
+
+      len = 0;
+      buffersize = MAXBUFFERSIZE;
+      buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+    }
+
+    if(!desc && !qualdesc && ch =='\n') {
+      /*do nothing.*/
+    } else {
+      if(desc && ch == '\n') {
+        /* end of description line */
+        buffer = ALLOCMEMORY(space, buffer, char, len+1);
+        buffer[len] = '\0';
+
+        descrbuffer = buffer;
+        descrlength = len;
+
+        len = 0;
+        buffersize = MAXBUFFERSIZE;
+        buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+        desc = 0;
+        seq = 1;
+      } else if (fastq && qualdesc && ch == '\n') {
+        /* the '+' line is discarded */
+        FREEMEMORY(space, buffer);
+        len = 0;
+        buffersize = MAXBUFFERSIZE;
+        buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+        qualdesc = 0;
+        qual = 1;
+      } else {
+        if (ch == '\r') continue;
+        len++;
+        /* case conversion applies to sequence characters only */
+        if (upper && !desc && !qualdesc && !qual) {
+          buffer[len-1]=(char)toupper(ch);
+        } else if (lower && !desc && !qualdesc && !qual) {
+          buffer[len-1]=(char)tolower(ch);
+        } else {
+          buffer[len-1]=(char) ch;
+        }
+      }
+    }
+  }
+
+
+  /* deliver the last record if it is complete */
+  if((!fastq && len > 0) || (qual && len > 0 && len == seqlen)) {
+    buffer = ALLOCMEMORY(space, buffer, char, len+1);
+    buffer[len] = '\0';
+
+    assert (!fastq || seqbuffer);
+
+    if (!seqbuffer) {
+      seqbuffer = buffer;
+    } else {
+      qualbuffer = buffer;
+    }
+
+    if (descrbuffer == NULL)
+      DBG("empty descr buffer after loop n=%d\n", n);
+
+    handler(space, fasta, descrbuffer, descrlength,
+        seqbuffer, qualbuffer, len, n);
+  } else {
+    /* nothing complete is left: release what was never handed out */
+    FREEMEMORY(space, buffer);
+    if(descrbuffer) {
+      FREEMEMORY(space, descrbuffer);
+    }
+    if(seqbuffer) {
+      FREEMEMORY(space, seqbuffer);
+    }
+  }
+
+  if(gzip) {
+    bl_destructgzidxfile(gzf);
+    FREEMEMORY(space, gzf);
+  }
+
+  fclose(fp);
+  return fasta;
+}
+
+
+/*------------------------------ bl_fastgzRead -------------------------------
+ *
+ * @brief randomly access zipped fasta gz using the index
+ * to populate fasta struct
+ *
+ * The file is consumed in pieces of at most SPAN bytes obtained with
+ * extract(fp, idx, offset, ...); each piece is parsed with the same
+ * fasta/fastq state machine as bl_fastxRead and handler is called once per
+ * record. Reading ends when extract delivers 0 bytes or when lastseq is
+ * reached (lastseq == 0 means no limit).
+ *
+ * NOTE(review): after the loop the handler is called unconditionally with
+ * whatever is in the working buffer; unlike bl_fastxRead there is no check
+ * that a complete record is pending - confirm that callers never pass an
+ * empty or truncated file.
+ *
+ * @param space    handed through to the memory macros and to the handler
+ * @param fasta    container to fill; a new one is created if NULL
+ * @param filename gzipped file to read
+ * @param idx      access index handed to extract
+ * @param upper    if set, sequence characters are converted to upper case
+ * @param lower    if set (and upper is not), sequence characters are
+ *                 converted to lower case
+ * @param offset   start position handed to extract; advanced by the number
+ *                 of bytes delivered after every piece
+ * @param startseq number passed to the handler for the first record
+ * @param lastseq  if non-zero, stop at the first record numbered >= lastseq-1
+ * @param handler  callback (space, fasta, descr, descrlen, seq, qual, len, n)
+ * @return the (possibly newly created) fasta container
+ * @author Steve Hoffmann
+ *
+ */
+
+
+fasta_t*
+bl_fastxgzRead (void *space, fasta_t *fasta, char *filename, struct access *idx,
+ unsigned char upper, unsigned char lower, off_t offset, Uint startseq, Uint lastseq,
+ void (*handler)(void *, fasta_t*, char *, Uint, char *, char *, Uint, Uint))
+{
+
+ FILE *fp = NULL;
+ char ch = 0;
+ char *buffer;
+ //char *check;
+ unsigned char *gzbuffer;
+ char *descrbuffer = NULL;
+ char *seqbuffer = NULL;
+ char *qualbuffer = NULL;
+ char idchar=0;
+ int nb;
+ int gzbufpos=0;
+ unsigned char desc = 0;
+ unsigned char fastq = 0;
+ unsigned char qualdesc = 0;
+ unsigned char qual = 0;
+ unsigned char seq = 0;
+ unsigned char finalize = 0;
+ int ret;
+ Uint descrlength = 0;
+ Uint seqlen = 0;
+ Uint buffersize = MAXBUFFERSIZE;
+ Uint n = startseq;
+ Uint len = 0;
+
+ //fprintf(stderr, "reading %d sequences with offset %d\n", nseqs, offset);
+
+ if (fasta == NULL) fasta = bl_fastaInit(space);
+
+ fp = fopen(filename, "rb");
+
+ if (fp == NULL) {
+ DBG("fastxgzRead: Couldn't open file '%s': %s. Exit forced.\n", filename, strerror(errno));
+ exit(-1);
+ }
+
+ /* access block and read it entirely */
+
+ buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+ gzbuffer = ALLOCMEMORY(space, NULL, char, SPAN);
+// check = ALLOCMEMORY(space, NULL, char, 1001);
+
+ nb = extract(fp, idx, offset, gzbuffer, SPAN); /* nb: bytes delivered, negative on error */
+
+ /*
+ memset(check, 0, 1001);
+ memmove(check, gzbuffer, MIN(nb,1000));
+ fprintf(stderr, "%s\n-------\n", check);
+ */
+
+ if(nb < 0) {
+ DBG("extraction failed (%s)\n",
+ nb == Z_MEM_ERROR ? "out of memory" : "input corrupted");
+ fclose(fp);
+ exit(-1);
+ }
+
+ do {
+
+ finalize = 0;
+ while(gzbufpos < nb) { /* parse the current piece byte by byte */
+ ch = gzbuffer[gzbufpos++];
+
+ if(len == buffersize-1) { /* grow the working buffer before it overflows */
+ buffersize = 2*buffersize+1;
+ buffer = ALLOCMEMORY(space, buffer, char, buffersize);
+ }
+
+ if((ch == '@' || ch == '>') && !idchar) { /* first id character fixes the format */
+ desc = 1;
+ fastq = (ch == '@');
+ idchar = ch;
+ len = 0;
+ //fprintf(stderr, "found idchar: %c\n", idchar);
+ }
+
+ /*fastq only: store the sequence and read the quality string*/
+ if(fastq && ch=='+' && seq && len > 0) {
+ buffer = ALLOCMEMORY(space, buffer, char, len+1);
+ buffer[len] = '\0';
+ seq = 0;
+ qualdesc = 1;
+
+ seqbuffer = buffer;
+ seqlen = len;
+ len = 0;
+
+ buffersize = MAXBUFFERSIZE;
+ buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+ }
+
+ if(qual && len > seqlen) {
+ DBG("%s: qual longer than nt string (n=%d). Exit.\n", descrbuffer, n);
+ exit(-1);
+ }
+
+ /*reading a new id*/
+ if(ch==idchar && ((!fastq && len > 0) || (qual && len > 0 && len == seqlen))) {
+
+ seq = 0;
+ qual = 0;
+ desc = 1;
+
+ if(lastseq && n >= lastseq-1) { /* limit reached: pending record is delivered after the loop */
+ finalize = 1;
+ break;
+ }
+
+ buffer = ALLOCMEMORY(space, buffer, char, len+1);
+ buffer[len] = '\0';
+ assert (!fastq || seqbuffer);
+
+ /*switch fasta:fastq*/
+ if (!seqbuffer) {
+ seqbuffer = buffer;
+ } else {
+ qualbuffer = buffer;
+ }
+
+ if (descrbuffer == NULL)
+ DBG("empty descr buffer in loop n=%d\n", n);
+
+ handler(space, fasta, descrbuffer, descrlength,
+ seqbuffer, qualbuffer, len, n);
+
+ n++;
+
+ descrlength = 0;
+ descrbuffer = NULL;
+ seqlen = 0;
+ seqbuffer = NULL;
+ qualbuffer = NULL;
+
+ len = 0;
+ buffersize = MAXBUFFERSIZE;
+ buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+ }
+
+ if(!desc && !qualdesc && ch =='\n') {
+ /*do nothing.*/
+ } else {
+ if(desc && ch == '\n') {
+ /*store description*/
+ buffer = ALLOCMEMORY(space, buffer, char, len+1);
+ buffer[len] = '\0';
+
+ descrbuffer = buffer;
+ descrlength = len;
+
+ len = 0;
+ buffersize = MAXBUFFERSIZE;
+ buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+ desc = 0;
+ seq = 1;
+ } else if (fastq && qualdesc && ch == '\n') {
+ /*toss quality string description*/
+ FREEMEMORY(space, buffer);
+ len = 0;
+ buffersize = MAXBUFFERSIZE;
+ buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+ qualdesc = 0;
+ qual = 1;
+ } else {
+ if (ch == '\r') continue;
+ len++;
+ if (upper && !desc && !qualdesc && !qual) { /* case conversion for sequence characters only */
+ buffer[len-1]=(char)toupper((int)ch);
+ } else if (lower && !desc && !qualdesc && !qual) {
+ buffer[len-1]=(char)tolower((int)ch);
+ } else {
+ buffer[len-1]=(char) ch;
+ }
+ }
+ }
+ }
+
+ FREEMEMORY(space, gzbuffer);
+
+ if (!finalize) {
+ /*read next chunk*/
+ offset += nb; /* continue right behind the bytes just parsed */
+ gzbuffer = ALLOCMEMORY(space, NULL, char, SPAN);
+ nb = extract(fp, idx, offset, gzbuffer, SPAN);
+
+ /*
+ memmove(check, gzbuffer, MIN(nb,1000));
+ fprintf(stderr, "%s\n-------\n", check);
+ */
+
+ if(nb < 0) {
+ DBG("extraction failed (%s)\n",
+ nb == Z_MEM_ERROR ? "out of memory" : "input corrupted");
+ exit(-1);
+ } else if (nb == 0) { /* no more data */
+ FREEMEMORY(space, gzbuffer);
+ break;
+ }
+
+ gzbufpos = 0;
+ } else {
+ break;
+ }
+
+ } while (1);
+
+
+ buffer = ALLOCMEMORY(space, buffer, char, len+1); /* final record: delivered without a completeness check */
+ buffer[len] = '\0';
+
+ assert (!fastq || seqbuffer);
+
+ if (!seqbuffer) {
+ seqbuffer = buffer;
+ } else {
+ qualbuffer = buffer;
+ }
+
+ if (descrbuffer == NULL)
+ DBG("empty descr buffer after loop n=%d\n", n);
+
+ handler(space, fasta, descrbuffer, descrlength,
+ seqbuffer, qualbuffer, len, n);
+
+ ret = fclose(fp);
+ if(ret == EOF) {
+ fprintf(stderr, "Couldnt close file!\n");
+ exit(-1);
+ }
+ return fasta;
+}
+
+
+/*------------------------------ bl_fastxGetSet ------------------------------
+ *
+ * @brief read a set of fasta or fastq files
+ *        All files must be of the same kind: either every name carries a
+ *        ".gz"/".gzip" suffix after its prefix or none does; a mix ends the
+ *        program. With index == 0 all sequences are loaded right away,
+ *        otherwise only an index over the files is built.
+ * @param space     handed through to the memory macros and readers
+ * @param filenames the files to read
+ * @param nooffiles number of entries in filenames
+ * @param upper     convert sequences to upper case
+ * @param lower     convert sequences to lower case
+ * @param index     non-zero: build an index via bl_fastxIndex
+ * @param pieces    handed to bl_fastxIndex
+ * @return the filled (or indexed) sequence set
+ * @author Steve Hoffmann
+ *
+ */
+
+fasta_t*
+bl_fastxGetSet(void *space, char **filenames, unsigned int nooffiles,
+    unsigned char upper, unsigned char lower, unsigned char index, Uint pieces) {
+
+  Uint f, extpos, nextseq = 0;
+  int idxlen = 0;
+  unsigned char gzip = 2;      /* 2: not decided yet, 1: gzipped, 0: plain */
+  unsigned char zipped;
+  struct access *gzidx;
+  fasta_t *set;
+
+  /* decide plain vs. gzipped and insist on a uniform set of files */
+  for(f = 0; f < nooffiles; f++) {
+    extpos = bl_fileprefixlen(filenames[f]);
+    zipped = (strncmp(&filenames[f][extpos], ".gz", 3) == 0 ||
+              strncmp(&filenames[f][extpos], ".gzip", 3) == 0);
+
+    if(zipped && gzip == 0) {
+      MSG("Provide fastx files either gzipped xor plain. Exit forced.\n");
+      exit(-1);
+    }
+
+    if(!zipped && gzip == 1) {
+      MSG("Provide fastx files either gzip'd xor plain. Exit forced.\n");
+      exit(-1);
+    }
+
+    gzip = zipped;
+  }
+
+  if(index) {
+    return bl_fastxIndex(space, NULL, filenames, nooffiles, 0, gzip, pieces);
+  }
+
+  set = bl_fastaInit(space);
+  set->filenames = filenames;
+  set->nooffiles = nooffiles;
+  set->gzip = gzip;
+  set->upper = upper;
+  set->lower = lower;
+
+  /* load file after file, numbering sequences consecutively */
+  for(f = 0; f < nooffiles; f++) {
+    if(gzip) {
+      gzidx = bl_zranGetIndex(filenames[f], &idxlen);
+      set = bl_fastxgzRead(space, set, filenames[f], gzidx,
+          upper, lower, 0, nextseq, 0, bl_fastxAdd);
+
+      nextseq = set->active_noofseqs;
+
+      FREEMEMORY(space, gzidx->list);
+      FREEMEMORY(space, gzidx);
+    } else {
+      set = bl_fastxRead(space, set, filenames[f], upper, lower,
+          0, nextseq, 0, //gzip, gzindex,
+          bl_fastxAdd);
+
+      nextseq = set->active_noofseqs;
+    }
+  }
+
+  return set;
+}
+
+
+/*---------------------------- bl_fastxGetMateSet ----------------------------
+ *
+ * @brief get a set of fastq or fasta mate sequence files
+ *        The mate files must be uniformly plain or gzipped (".gz"/".gzip"
+ *        suffix after the file prefix); a mix ends the program. With
+ *        index == 0 the mates are loaded right away through bl_fastxAddMate,
+ *        otherwise bl_fastxIndex attaches a mate index to set.
+ * @param space     handed through to the memory macros and readers
+ * @param set       the query set (must not be NULL)
+ * @param filenames the mate files
+ * @param nooffiles number of entries in filenames
+ * @param upper     convert sequences to upper case
+ * @param lower     convert sequences to lower case
+ * @param index     non-zero: build an index via bl_fastxIndex
+ * @param pieces    handed to bl_fastxIndex
+ * @return the resulting set
+ * @author Steve Hoffmann
+ *
+ */
+
+fasta_t*
+bl_fastxGetMateSet(void *space, fasta_t* set, char** filenames,
+    unsigned int nooffiles, unsigned char upper, unsigned lower,
+    unsigned char index, Uint pieces) {
+
+  Uint f, extpos, nextmate = 0;
+  int idxlen = 0;
+  unsigned char gzip = 2;      /* 2: not decided yet, 1: gzipped, 0: plain */
+  unsigned char zipped;
+  struct access *gzidx;
+
+  assert (set != NULL);
+
+  /* decide plain vs. gzipped and insist on a uniform set of files */
+  for(f = 0; f < nooffiles; f++) {
+    extpos = bl_fileprefixlen(filenames[f]);
+    zipped = (strncmp(&filenames[f][extpos], ".gz", 3) == 0 ||
+              strncmp(&filenames[f][extpos], ".gzip", 4) == 0);
+
+    if(zipped && gzip == 0) {
+      MSG("Provide fastx files either gzipped xor txt. Exit forced.\n");
+      exit(-1);
+    }
+
+    if(!zipped && gzip == 1) {
+      MSG("Provide fastx files either gzip'd xor txt. Exit forced.\n");
+      exit(-1);
+    }
+
+    gzip = zipped;
+  }
+
+  if(index) {
+    return bl_fastxIndex(space, set, filenames, nooffiles, 1, gzip, pieces);
+  }
+
+  /* NOTE(review): the caller's set is replaced by a fresh container here, so
+   * the mates are not added to the query set that was passed in - looks
+   * unintended, confirm against bl_fastxAddMate and the callers */
+  set = bl_fastaInit(space);
+
+  for(f = 0; f < nooffiles; f++) {
+    if(gzip) {
+      gzidx = bl_zranGetIndex(filenames[f], &idxlen);
+      set = bl_fastxgzRead(space, set, filenames[f], gzidx,
+          upper, lower, 0, nextmate, 0, bl_fastxAddMate);
+
+      nextmate = set->active_noofmates;
+
+      FREEMEMORY(space, gzidx->list);
+      FREEMEMORY(space, gzidx);
+    } else {
+      set = bl_fastxRead(space, set, filenames[f], upper, lower,
+          0, nextmate, 0, bl_fastxAddMate);
+
+      nextmate = set->active_noofmates;
+    }
+  }
+
+  return set;
+}
+
+
+/*------------------------------ bl_fastxIDcmp -------------------------------
+ *
+ * @brief compare fasta ids
+ *        Two ids are considered equal if
+ *        - the strings are identical, or
+ *        - b equals the first whitespace-delimited token of a, or
+ *        - b starts with "chr" followed by the first token of a, or
+ *        - a starts with "chr" followed by the first token of b.
+ * @param a first id (NUL-terminated)
+ * @param b second id (NUL-terminated)
+ * @return 0 if the ids are considered equal, -1 otherwise
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+bl_fastxIDcmp (char *a, char *b)
+{
+
+  static const char prefix[] = "chr";
+  size_t alen, blen, plen, atok, btok;
+
+  if(strcmp(a,b) == 0) return 0;
+
+  alen = strlen(a);
+  blen = strlen(b);
+  plen = sizeof(prefix) - 1;
+
+  /* length of the first token of a; the cast keeps isspace defined for
+   * characters outside the 7-bit range */
+  for(atok=0; atok < alen; atok++) {
+    if (isspace((unsigned char)a[atok])) break;
+  }
+
+  if(blen <= atok && strncmp(a, b, atok) == 0) {
+    return 0;
+  }
+
+  for(btok=0; btok < blen; btok++) {
+    if (isspace((unsigned char)b[btok])) break;
+  }
+
+  /* "chr" + token(a) is a prefix of b; compared in place, no temporary
+   * string (and hence no allocation that could fail) is needed */
+  if(blen >= plen+atok && strncmp(b, prefix, plen) == 0 &&
+      strncmp(b+plen, a, atok) == 0) {
+    return 0;
+  }
+
+  /* "chr" + token(b) is a prefix of a */
+  if(alen >= plen+btok && strncmp(a, prefix, plen) == 0 &&
+      strncmp(a+plen, b, btok) == 0) {
+    return 0;
+  }
+
+  return -1;
+}
+
+/*----------------------------- bl_fastxFindIdx ------------------------------
+ *
+ * @brief return the index for a fasta with name id or description
+ *        A sequence matches if its description equals id or if id equals
+ *        the first whitespace-delimited token of the description.
+ * @param id  the name to look for
+ * @param set the sequence set to search
+ * @return index of the first match, or -1 converted to Uint if none matches
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_fastxFindIDIdx (char *id, fasta_t *set)
+{
+  Uint seq, tok;
+  char *descr;
+
+  for(seq = 0; seq < set->noofseqs; seq++) {
+
+    descr = bl_fastaGetDescription(set, seq);
+
+    /* full description matches */
+    if(strcmp(id, descr) == 0) {
+      return seq;
+    }
+
+    /* otherwise compare against the first token only */
+    tok = 0;
+    while(tok < bl_fastaGetDescriptionLength(set, seq) &&
+        !isspace((int)descr[tok])) {
+      tok++;
+    }
+
+    if(strlen(id) <= tok && strncmp(id, descr, tok) == 0) {
+      return seq;
+    }
+  }
+
+  return -1;
+}
+
+/*------------------------ bl_fastxDestructChunkIndex ------------------------
+ *
+ * @brief destruct the chunk index
+ *        Frees the chunk index and, if the set has mates, the mate chunk
+ *        index. Nothing happens if there is no chunk index.
+ * @param space handed through to the memory macros
+ * @param f     the sequence set
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fastxDestructChunkIndex (void *space, fasta_t *f)
+{
+
+  if(!f->chunkindex) {
+    return;
+  }
+
+  FREEMEMORY(space, f->chunkindex->ap);
+  FREEMEMORY(space, f->chunkindex);
+
+  if(f->hasMates) {
+    FREEMEMORY(space, f->matechunkindex->ap);
+    FREEMEMORY(space, f->matechunkindex);
+  }
+
+  f->chunkindex = NULL;
+  f->matechunkindex = NULL;
+}
+
+/*------------------------- bl_fastxDestructIndex -------------------------
+ *
+ * @brief free sequence index
+ *        Releases the chunk index, the per-file totals, the per-file
+ *        indices and - for gzipped sets - the gz access indices, including
+ *        the mate counterparts if the set has mates. Resets curchunk,
+ *        noofseqs, chunkIsActive and gzip.
+ * @param space handed through to the memory macros
+ * @param f     the sequence set
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fastxDestructIndex (void *space, fasta_t *f)
+{
+
+  Uint file;
+
+  bl_fastxDestructChunkIndex(space, f);
+
+  if(f->filetotal) {
+    FREEMEMORY(space, f->filetotal);
+  }
+
+  /* per-file indices */
+  if(f->findex) {
+    for(file = 0; file < f->nooffiles; file++) {
+      FREEMEMORY(space, f->findex[file]->ap);
+      FREEMEMORY(space, f->findex[file]);
+
+      if(f->hasMates) {
+        FREEMEMORY(space, f->matefindex[file]->ap);
+        FREEMEMORY(space, f->matefindex[file]);
+      }
+    }
+
+    FREEMEMORY(space, f->findex);
+
+    if(f->hasMates) {
+      FREEMEMORY(space, f->matefindex);
+    }
+
+    f->findex = NULL;
+    f->matefindex = NULL;
+  }
+
+  f->chunkIsActive = 0;
+  f->noofseqs = 0;
+  f->curchunk = 0;
+
+  if(!f->gzip) {
+    return;
+  }
+
+  /* gz access indices */
+  for(file = 0; file < f->nooffiles; file++) {
+    if(f->gzindex) {
+      FREEMEMORY(space, f->gzindex[file]->list);
+      FREEMEMORY(space, f->gzindex[file]);
+    }
+
+    if(f->hasMates && f->mategzindex) {
+      FREEMEMORY(space, f->mategzindex[file]->list);
+      FREEMEMORY(space, f->mategzindex[file]);
+    }
+  }
+
+  if(f->gzindex) {
+    FREEMEMORY(space, f->gzindex);
+  }
+
+  if(f->hasMates && f->mategzindex) {
+    FREEMEMORY(space, f->mategzindex);
+  }
+
+  f->gzip = 0;
+}
+
+/*------------------------- bl_fastxDestructSequence -------------------------
+ *
+ * @brief destruct the sequences held by the fasta struct
+ *        Destroys all active sequences, the mate start table (if the set
+ *        has mates) and the qualities (if present), then resets the
+ *        corresponding pointers and counters. The index structures are not
+ *        touched.
+ * @param space handed through to the memory macros
+ * @param f     the sequence set
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fastxDestructSequence(void *space, fasta_t* f) {
+  Uint cur = 0;
+
+  while(cur < f->active_noofseqs) {
+    destructSequence(space, f->seqs[cur]);
+    cur++;
+  }
+
+  FREEMEMORY(space, f->seqs);
+
+  if(bl_fastaHasMate(f)) {
+    FREEMEMORY(space, f->matestart);
+  }
+
+  if(bl_fastaHasQuality(f)) {
+    bl_fastaDestructQuality(space, f);
+    FREEMEMORY(space, f->quals);
+  }
+
+  /* leave the container in a defined, empty state */
+  f->seqs = NULL;
+  f->quals = NULL;
+  f->matestart = NULL;
+  f->active_noofseqs = 0;
+  f->chunkIsActive = 0;
+  f->maxlen = 0;
+  f->minlen = 0;
+}
+
+
+/*----------------------------- bl_fastaDestruct -----------------------------
+ *
+ * @brief destruct fasta: sequences first, then all index structures.
+ *        The fasta_t struct itself is not freed here.
+ * @param space handed through to the destructors
+ * @param f     the sequence set
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fastaDestruct (void *space, fasta_t *f)
+{
+  bl_fastxDestructSequence(space, f);
+  bl_fastxDestructIndex(space, f);
+}
+
+
+/*------------------------------- bl_fastxDump -------------------------------
+ *
+ * @brief dump the fastx
+ *        Handler for bl_fastxRead/bl_fastxgzRead that prints a record to
+ *        stderr. The description is printed as received, i.e. with its
+ *        leading id character; for the '+' line that character is skipped.
+ *        The two quality lines are written only if a quality string is
+ *        present: the readers pass quality == NULL for fasta records, and
+ *        handing NULL to a %s conversion is undefined.
+ * @param space    unused
+ * @param fasta    unused
+ * @param desc     description including the id character (may be NULL)
+ * @param desclen  unused
+ * @param sequence the sequence
+ * @param quality  the quality string or NULL
+ * @param quallen  unused
+ * @param n        unused
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fastxDump( void *space,
+    fasta_t *fasta,
+    char *desc,
+    Uint desclen,
+    char *sequence,
+    char *quality,
+    Uint quallen,
+    Uint n)
+{
+  fprintf(stderr, "%s\n", desc ? desc : "");
+  fprintf(stderr, "%s\n", sequence ? sequence : "");
+
+  if(quality) {
+    /* guard the empty description: &desc[1] would point behind the NUL */
+    fprintf(stderr, "+%s\n", (desc && desc[0]) ? &desc[1] : "");
+    fprintf(stderr, "%s\n", quality);
+  }
+}
+
+/*-------------------------- bl_annotationtrackInit --------------------------
+ *
+ * @brief init: put a track into the empty state (no name, no description,
+ *        no items)
+ * @param track the track to initialize
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_annotationtrackInit (annotationtrack_t *track)
+{
+  /* strings */
+  track->trackname = NULL;
+  track->description = NULL;
+  track->tracknamelen = 0;
+  track->descriptionlen = 0;
+
+  /* item list */
+  track->items = NULL;
+  track->noofitems = 0;
+}
+
+/*-------------------------- bl_annotationitemInit --------------------------
+ *
+ * @brief init item: reset every field the readers and writers in this file
+ *        use to a defined default. This includes frame, which bl_GFFwrite
+ *        prints for every item although only bl_GFFread (column 8) ever
+ *        sets it; items live in uninitialized array memory, so it has to be
+ *        cleared here.
+ * @param item the item to initialize
+ * @param type item type tag (e.g. GFFITEM, BEDITEM)
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_annotationitemInit (annotationitem_t *item, unsigned char type)
+{
+
+  item->type = type;
+  item->chromname = NULL;
+  item->chromnamelen = 0;
+  item->source = NULL;
+  item->sourcelen = 0;
+  item->start = 0;
+  item->end = 0;
+  item->name = NULL;
+  item->namelen = 0;
+  item->score = .0;
+  item->strand = '0';
+  item->frame = 0;
+  item->thickStart = 0;
+  item->thickEnd = 0;
+  item->itemRgb = NULL;
+  item->blockCount = 0;
+  item->blockSizes = NULL;
+  item->blockStarts = NULL;
+  item->blockStrands = NULL;
+  item->blockRefseqs = NULL;
+  item->noofovl = 0;
+  item->firstovl = -1;    /* -1: no overlapping predecessor recorded yet */
+  item->level = 0;
+  item->noofattributes = 0;
+  item->attributes = NULL;
+  item->attributelen = NULL;
+
+  return ;
+}
+
+/*------------------------- bl_annotationitemDestruct -----------------------
+ *
+ * @brief destruct annotation item: free all memory owned by the item; the
+ *        item struct itself is not freed
+ * @param space handed through to the memory macros
+ * @param item  the item
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_annotationitemDestruct (void *space, annotationitem_t *item)
+{
+
+  Uint a;
+
+  /* strings */
+  if(item->chromname) { FREEMEMORY(space, item->chromname); }
+  if(item->name) { FREEMEMORY(space, item->name); }
+  if(item->itemRgb) { FREEMEMORY(space, item->itemRgb); }
+
+  /* block tables
+   * NOTE(review): only the blockRefseqs table is released, the strings it
+   * points to (allocated per block in bl_BEDread) are not - confirm whether
+   * they are owned elsewhere */
+  if(item->blockSizes) { FREEMEMORY(space, item->blockSizes); }
+  if(item->blockStarts) { FREEMEMORY(space, item->blockStarts); }
+  if(item->blockRefseqs) { FREEMEMORY(space, item->blockRefseqs); }
+  if(item->blockStrands) { FREEMEMORY(space, item->blockStrands); }
+
+  if(item->source) { FREEMEMORY(space, item->source); }
+
+  /* attributes and their length table */
+  if(item->noofattributes) {
+    a = 0;
+    while(a < item->noofattributes) {
+      FREEMEMORY(space, item->attributes[a]);
+      a++;
+    }
+    FREEMEMORY(space, item->attributes);
+    FREEMEMORY(space, item->attributelen);
+  }
+}
+/*------------------------ bl_annotationitem_cmp_track -----------------------
+ *
+ * @brief find annotation item in track: compares the item at position
+ *        'item' of the track with elem, first by chromosome name, then the
+ *        end of the track item against the start of elem
+ * @param item  index into the item list of the track
+ * @param track the track (annotationtrack_t*)
+ * @param elem  the item searched for (annotationitem_t*)
+ * @param nfo   unused
+ * @return 2 if the track item sorts before elem, 1 if it sorts after elem,
+ *         0 if chromosomes agree and the track item's end equals elem's start
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_annotationitem_cmp_track (Uint item, void *track, void *elem, void *nfo)
+{
+  annotationitem_t *cur = &((annotationtrack_t*) track)->items[item];
+  annotationitem_t *key = (annotationitem_t*) elem;
+  int order = strcmp(cur->chromname, key->chromname);
+
+  if(order != 0) {
+    return (order < 0) ? 2 : 1;
+  }
+
+  if(cur->end != key->start) {
+    return (cur->end < key->start) ? 2 : 1;
+  }
+
+  return 0;
+}
+
+
+/*--------------------------- bl_annotationitem_cmp --------------------------
+ *
+ * @brief compare annotation lines (qsort comparator): orders by chromosome
+ *        name, then start, then end, then strand character
+ * @param a first item (annotationitem_t*)
+ * @param b second item (annotationitem_t*)
+ * @return the strcmp result if the chromosome names differ, otherwise
+ *         -1, 0 or 1
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+bl_annotationitem_cmp (void const *a, void const *b)
+{
+  const annotationitem_t *lhs = a;
+  const annotationitem_t *rhs = b;
+  int order = strcmp(lhs->chromname, rhs->chromname);
+
+  if(order) {
+    return order;
+  }
+
+  if(lhs->start != rhs->start) {
+    return (lhs->start < rhs->start) ? -1 : 1;
+  }
+
+  if(lhs->end != rhs->end) {
+    return (lhs->end < rhs->end) ? -1 : 1;
+  }
+
+  if(lhs->strand != rhs->strand) {
+    return (lhs->strand < rhs->strand) ? -1 : 1;
+  }
+
+  return 0;
+}
+
+/*------------------------ bl_annotationtrackDestruct -------------------------
+ *
+ * @brief destruct: free the name, the description and all items of a
+ *        track; the track struct itself is not freed
+ * @param space handed through to the memory macros
+ * @param track the track
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_annotationtrackDestruct (void *space, annotationtrack_t *track)
+{
+
+  Uint cur = 0;
+
+  if(track->trackname) { FREEMEMORY(space, track->trackname); }
+  if(track->description) { FREEMEMORY(space, track->description); }
+
+  while(cur < track->noofitems) {
+    bl_annotationitemDestruct(space, &track->items[cur]);
+    cur++;
+  }
+
+  track->noofitems = 0;
+
+  if(track->items) { FREEMEMORY(space, track->items); }
+}
+
+
+
+/*------------------------------- bl_GFFwrite --------------------------------
+ *
+ * @brief write GFF to file filename: one line per item with the columns
+ *        chromname, source, name, start, end, strand, frame and - if there
+ *        are any - the attributes joined by ';'. Ends the program if the
+ *        file cannot be opened.
+ *        NOTE(review): the score column that bl_GFFread expects between end
+ *        and strand is not written - confirm whether that is intended.
+ * @param filename file to create
+ * @param set      the track to write
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_GFFwrite(char *filename, annotationtrack_t *set) {
+  Uint line, a;
+  FILE *out;
+  annotationitem_t *cur;
+
+  out = fopen(filename, "w");
+  if (out == NULL) {
+    fprintf(stderr, "couldn't open %s - exit forced", filename);
+    exit(-1);
+  }
+
+  for(line = 0; line < set->noofitems; line++) {
+    cur = &set->items[line];
+
+    fprintf(out,"%s\t%s\t%s\t", cur->chromname, cur->source, cur->name);
+    fprintf(out,"%d\t%d\t%c\t", cur->start, cur->end, cur->strand);
+    fprintf(out,"%d", cur->frame);
+
+    /* attribute column only if there is something to put into it */
+    if(cur->noofattributes) {
+      fprintf(out,"\t");
+    }
+
+    for(a = 0; a < cur->noofattributes; a++) {
+      if(a > 0) {
+        fprintf(out,";");
+      }
+      fprintf(out,"%s", cur->attributes[a]);
+    }
+
+    fprintf(out,"\n");
+  }
+
+  fclose(out);
+}
+
+/*------------------- bl_annotationtrackAssignTrackLevel -------------------
+ *
+ * @brief assign the track numbers (display levels) to annotation items
+ *
+ * First pass: for every item i the following items k are visited until one
+ * starts behind items[i].end+1 or lies on another chromosome; each visited
+ * item gets its overlap counter increased and remembers the first i that
+ * reached it (firstovl). The early break relies on the items being sorted;
+ * bl_BEDread and bl_GFFread sort with bl_annotationitem_cmp before calling.
+ *
+ * Second pass: items with fewer than two overlaps take the overlap count as
+ * level. For the others the levels of the earlier items between firstovl
+ * and i that reach items[i].start on the same chromosome are marked in a
+ * bit vector and the lowest unmarked level (searched below 255) is taken.
+ *
+ * @param track the track whose items receive their level
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_annotationtrackAssignTrackLevel(annotationtrack_t *track)
+{
+
+ Uint i, k, p;
+ bitvector a;
+
+ for(i=0; i< track->noofitems; i++ ) {
+ for(k=i+1; k < track->noofitems; k++) {
+ if(track->items[i].end+1 <= track->items[k].start ||
+ strcmp(track->items[i].chromname, track->items[k].chromname))
+ break;
+ if(track->items[k].firstovl == -1) /* -1 is the value set by bl_annotationitemInit */
+ track->items[k].firstovl = i;
+ track->items[k].noofovl++;
+ }
+ }
+
+ for(i=0; i < track->noofitems; i++) {
+ if(track->items[i].noofovl < 2) {
+ track->items[i].level = track->items[i].noofovl;
+ } else {
+ a=initbitvector(NULL, 255);
+ for(k=track->items[i].firstovl; k < i; k++) {
+ if(track->items[k].end+1 >= track->items[i].start &&
+ !strcmp(track->items[i].chromname, track->items[k].chromname)) {
+ bitvector_setbit(a, track->items[k].level, 1); /* level is in use by an overlapping predecessor */
+ }
+ }
+ for(p=0; p < 255; p++) { /* p stays 255 if every level is taken */
+ if (bitvector_getbit(a,p) == 0) break;
+ }
+ track->items[i].level = p;
+ FREEMEMORY(space, a); /* NOTE(review): 'space' is neither a parameter nor a local here; presumably FREEMEMORY ignores its first argument - confirm */
+ }
+ }
+
+ return ;
+}
+
+
+/*-------------------------------- bl_BEDread --------------------------------
+ *
+ * @brief read a bed file
+ *        Lines starting with '#' are skipped, a line starting with "track"
+ *        supplies the track name and description, every other line with at
+ *        least three tab-separated columns becomes one annotation item
+ *        (up to 12 columns; more columns end the program). Column 12
+ *        (block starts) also accepts entries of the form
+ *        refseq:start:strand. Any field that cannot be parsed ends the
+ *        program. The items are sorted with bl_annotationitem_cmp and get
+ *        their track level assigned before the track is returned.
+ * @param space    handed through to the memory macros
+ * @param filename the bed file
+ * @return the newly allocated track
+ * @author Steve Hoffmann
+ *
+ */
+
+annotationtrack_t*
+bl_BEDread (void *space, char *filename)
+{
+  stringset_t **set;
+  annotationtrack_t *track;
+  annotationitem_t *item;
+  char *str, *pch, *tmp;
+  Uint linecount, i, j, k, u, v, noofstr, len, ulen;
+
+  track = ALLOCMEMORY(space, NULL, annotationtrack_t, 1);
+  bl_annotationtrackInit(track);
+  set = readcsv(space, filename, "\t", &linecount);
+  track->items = ALLOCMEMORY(space, NULL, annotationitem_t, linecount);
+
+  for(i=0; i < linecount; i++) {
+    noofstr = set[i]->noofstrings;
+
+    if(noofstr) {
+      str = set[i]->strings[0].str;
+      len = strlen(str);
+
+      /* the three line kinds are handled in one if/else chain so that
+       * every line reaches destructStringset below (the former 'continue'
+       * statements skipped it and leaked the line) */
+      if(strncmp(str, "#", 1) == 0) {
+        //comment line
+      } else if(len >= 5 && !strncmp(str, "track", 5)) {
+        //track description
+        for(j=1; j < noofstr; j++) {
+          str = set[i]->strings[j].str;
+          len = strlen(str);
+
+          if(len > 5 && !strncmp(str, "name=", 5)) {
+            track->tracknamelen = len-5;
+            track->trackname = ALLOCMEMORY(space, NULL, char, len-4);
+            memmove(track->trackname, &str[5], len-5);
+            track->trackname[len-5] = '\0';
+          }
+
+          if(len > 12 && !strncmp(str, "description=", 12)) {
+            track->descriptionlen = len-12;
+            track->description = ALLOCMEMORY(space, NULL, char, len-11);
+            /* the value starts behind the 12 characters of the key */
+            memmove(track->description, &str[12], len-12);
+            track->description[len-12] = '\0';
+          }
+        }
+      } else if(noofstr >= 3) {
+        //real data
+        item = &track->items[track->noofitems];
+        bl_annotationitemInit(item, BEDITEM);
+
+        for(j=0; j < noofstr; j++) {
+          str = set[i]->strings[j].str;
+          len = strlen(str);
+
+          switch(j) {
+            case 0:
+              item->chromnamelen = len;
+              item->chromname = ALLOCMEMORY(space, NULL, char, len+1);
+              memmove(item->chromname, str, len);
+              item->chromname[len] = '\0';
+              break;
+            case 1:
+              item->start = atoi(str);
+              if(!item->start && str[0] != '0') {
+                DBG("BED '%s' %d:%d: atoi failed", filename, i, j);
+                exit(-1);
+              }
+              break;
+            case 2:
+              item->end = atoi(str);
+              if(!item->end && str[0] != '0') {
+                DBG("BED '%s' %d:%d: atoi failed", filename, i, j);
+                exit(-1);
+              }
+              break;
+            case 3:
+              item->namelen = len;
+              item->name = ALLOCMEMORY(space, NULL, char, len+1);
+              memmove(item->name, str, len);
+              item->name[len] = '\0';
+              break;
+            case 4:
+              item->score = atof(str);
+              if(item->score == 0.0 && str[0] != '0' && str[0] != '.') {
+                DBG("BED '%s' %d:%d: %f(%s) :atof failed", filename, i, j, item->score, str);
+                exit(-1);
+              }
+              break;
+            case 5:
+              if(str[0] != '-' && str[0] != '+' && str[0] != '.') {
+                DBG("BED '%s' %d:%d: atof failed", filename, i, j);
+                exit(-1);
+              }
+              item->strand = str[0];
+              break;
+            case 6:
+              item->thickStart = atoi(str);
+              if(!item->thickStart && str[0] != '0') {
+                DBG("BED '%s' %d:%d: %s:atoi failed", filename, i, j, str);
+                exit(-1);
+              }
+              break;
+            case 7:
+              item->thickEnd = atoi(str);
+              if(!item->thickEnd && str[0] != '0') {
+                DBG("BED '%s' %d:%d: atoi failed", filename, i, j);
+                exit(-1);
+              }
+              break;
+            case 8:
+              k = 0;
+              pch = strtok(str, ",");
+              while(pch != NULL) {
+                /* grow the existing array; allocating from NULL for every
+                 * component dropped the earlier components and leaked */
+                item->itemRgb = ALLOCMEMORY(space, item->itemRgb, Uint, k+1);
+                item->itemRgb[k] = atoi(pch);
+                if(!item->itemRgb[k] && pch[0] != '0' && k > 2) {
+                  DBG("BED '%s' %d:%d: atoi failed", filename, i, j);
+                  exit(-1);
+                }
+                k++;
+                pch = strtok(NULL, ",");
+              }
+              /* a single value means "no colour" */
+              if(k == 1) {
+                FREEMEMORY(space, item->itemRgb);
+                item->itemRgb = NULL;
+              }
+              if(k != 1 && k != 3) {
+                DBG("BED '%s' %d:%d: wrong igb code", filename, i, j);
+                exit(-1);
+              }
+              break;
+            case 9:
+              item->blockCount = atoi(str);
+              if(!item->blockCount && str[0] != '0') {
+                DBG("BED '%s' %d:%d: %s: atoi failed", filename, i, j, str);
+                exit(-1);
+              }
+              break;
+            case 10:
+              k = 0;
+              pch = strtok(str, ",");
+              while(pch != NULL) {
+                item->blockSizes = ALLOCMEMORY(space, item->blockSizes, Uint, k+1);
+                item->blockSizes[k] = atoi(pch);
+                if(!item->blockSizes[k] && pch[0] != '0') {
+                  DBG("BED '%s' %d:%d: %s: atoi failed", filename, i, j, pch);
+                  exit(-1);
+                }
+                k++;
+                pch = strtok(NULL, ",");
+              }
+              if(k != item->blockCount) {
+                DBG("BED '%s' %d:%d: %d!=%d: wrong block count", filename, i, j, k, item->blockCount);
+                exit(-1);
+              }
+              break;
+            case 11:
+              k = 0;
+              pch = strtok(str, ",");
+              while(pch != NULL) {
+                item->blockStarts = ALLOCMEMORY(space, item->blockStarts, Uint, k+1);
+                item->blockRefseqs = ALLOCMEMORY(space, item->blockRefseqs, char*, k+1);
+                item->blockStrands = ALLOCMEMORY(space, item->blockStrands, char, k+1);
+                ulen = strlen(pch);
+                for(u=0; u < ulen; u++) {
+                  if(pch[u] == ':') break;
+                }
+                if(u < ulen) {
+                  /* extended entry refseq:start:strand */
+                  assert(u>0);
+
+                  item->blockRefseqs[k] = ALLOCMEMORY(space, NULL, char, u+1);
+                  memmove(item->blockRefseqs[k], pch, u);
+                  item->blockRefseqs[k][u] = 0;
+                  v = u+1;
+
+                  for(u=v; u < ulen; u++) {
+                    if(pch[u]==':') break;
+                  }
+                  assert(u>v);
+
+                  tmp = ALLOCMEMORY(space, NULL, char, u-v+1);
+                  memmove(tmp, &pch[v], u-v);
+                  tmp[u-v] = 0;
+                  item->blockStarts[k] = atoi(tmp);
+
+                  if(!item->blockStarts[k] && tmp[0] != '0') {
+                    DBG("BED '%s' %d:%d: atoi failed while reading extension", filename, i, j);
+                    exit(-1);
+                  }
+                  /* the temporary copy of the number is no longer needed */
+                  FREEMEMORY(space, tmp);
+
+                  assert(pch[u+1]=='-' || pch[u+1] == '+');
+                  item->blockStrands[k] = pch[u+1];
+
+                } else {
+                  /* plain entry: only a start, strand taken from the item */
+                  item->blockStarts[k] = atoi(pch);
+                  item->blockRefseqs[k] = NULL;
+                  item->blockStrands[k] = item->strand;
+                  if(!item->blockStarts[k] && pch[0] != '0') {
+                    DBG("BED '%s' %d:%d: atoi failed", filename, i, j);
+                    exit(-1);
+                  }
+                }
+                k++;
+                pch = strtok(NULL, ",");
+              }
+              if(k != item->blockCount) {
+                DBG("BED '%s' %d:%d: wrong block count", filename, i, j);
+                exit(-1);
+              }
+              break;
+            default:
+              DBG("'%s' not in BED format\n", filename);
+              exit(-1);
+              break;
+          }
+        }
+
+        track->noofitems++;
+      }
+
+    }
+    destructStringset(space, set[i]);
+  }
+
+  qsort(track->items, track->noofitems, sizeof(annotationitem_t),
+      bl_annotationitem_cmp);
+
+  bl_annotationtrackAssignTrackLevel(track);
+
+  FREEMEMORY(space,set);
+  return track;
+}
+
+
+/*---------------------------- bl_GFFAddAttribute ----------------------------
+ *
+ * @brief add an attribute to annotation item: the first len characters of
+ *        attr are copied into a new NUL-terminated string that is appended
+ *        to the item's attribute list; the caller keeps ownership of attr
+ * @param space handed through to the memory macros
+ * @param item  the item to extend
+ * @param attr  attribute text
+ * @param len   number of characters to take from attr
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_GFFAddAttribute (void *space, annotationitem_t *item, char *attr, Uint len)
+{
+
+  Uint slot = item->noofattributes;
+  char *copy;
+
+  /* make room for one more entry in both parallel tables */
+  item->attributes = ALLOCMEMORY(space, item->attributes, char *, slot+1);
+  item->attributelen = ALLOCMEMORY(space, item->attributelen, Uint, slot+1);
+
+  copy = ALLOCMEMORY(space, NULL, char, len+1);
+  memmove(copy, attr, len);
+  copy[len] = 0;
+
+  item->attributes[slot] = copy;
+  item->attributelen[slot] = len;
+  item->noofattributes = slot+1;
+}
+
+/*-------------------------------- bl_GFFread --------------------------------
+ *
+ * @brief read a GFF file
+ *        Lines starting with '#' are skipped, a line starting with "track"
+ *        supplies the track name and description, every other line with at
+ *        least three tab-separated columns becomes one annotation item with
+ *        the columns chromname, source, name, start, end, score, strand,
+ *        frame and ';'-separated attributes (more columns end the
+ *        program). Any field that cannot be parsed ends the program. The
+ *        items are sorted with bl_annotationitem_cmp and get their track
+ *        level assigned before the track is returned.
+ * @param space    handed through to the memory macros
+ * @param filename the GFF file
+ * @return the newly allocated track
+ * @author Steve Hoffmann
+ *
+ */
+
+annotationtrack_t*
+bl_GFFread (void *space, char *filename)
+{
+  stringset_t **set;
+  annotationtrack_t *track;
+  annotationitem_t *item;
+  char *str, *pch;
+  Uint linecount, i, j, p, noofstr, len, pchlen;
+
+  track = ALLOCMEMORY(space, NULL, annotationtrack_t, 1);
+  bl_annotationtrackInit(track);
+  set = readcsv(space, filename, "\t", &linecount);
+  track->items = ALLOCMEMORY(space, NULL, annotationitem_t, linecount);
+
+  for(i=0; i < linecount; i++) {
+    noofstr = set[i]->noofstrings;
+
+    if(noofstr) {
+      str = set[i]->strings[0].str;
+      len = strlen(str);
+
+      /* the three line kinds are handled in one if/else chain so that
+       * every line reaches destructStringset below (the former 'continue'
+       * statements skipped it and leaked the line) */
+      if(strncmp(str, "#", 1) == 0) {
+        //comment line
+      } else if(len >= 5 && !strncmp(str, "track", 5)) {
+        //track description
+        for(j=1; j < noofstr; j++) {
+          str = set[i]->strings[j].str;
+          len = strlen(str);
+
+          if(len > 5 && !strncmp(str, "name=", 5)) {
+            track->tracknamelen = len-5;
+            track->trackname = ALLOCMEMORY(space, NULL, char, len-4);
+            memmove(track->trackname, &str[5], len-5);
+            track->trackname[len-5] = '\0';
+          }
+
+          if(len > 12 && !strncmp(str, "description=", 12)) {
+            track->descriptionlen = len-12;
+            track->description = ALLOCMEMORY(space, NULL, char, len-11);
+            /* the value starts behind the 12 characters of the key */
+            memmove(track->description, &str[12], len-12);
+            track->description[len-12] = '\0';
+          }
+        }
+      } else if(noofstr >= 3) {
+        //real data
+        item = &track->items[track->noofitems];
+        bl_annotationitemInit(item, GFFITEM);
+
+        for(j=0; j < noofstr; j++) {
+          str = set[i]->strings[j].str;
+          len = strlen(str);
+
+          switch(j) {
+            case 0:
+              item->chromnamelen = len;
+              item->chromname = ALLOCMEMORY(space, NULL, char, len+1);
+              memmove(item->chromname, str, len);
+              item->chromname[len] = '\0';
+              break;
+            case 1:
+              item->sourcelen = len;
+              item->source = ALLOCMEMORY(space, NULL, char, len+1);
+              memmove(item->source, str, len);
+              item->source[len] = '\0';
+              break;
+            case 2:
+              item->namelen = len;
+              item->name = ALLOCMEMORY(space, NULL, char, len+1);
+              memmove(item->name, str, len);
+              item->name[len] = '\0';
+              break;
+            case 3:
+              item->start = atoi(str);
+              if(!item->start && str[0] != '0') {
+                DBG("GFF '%s' %d:%d: atoi failed", filename, i, j);
+                exit(-1);
+              }
+              break;
+            case 4:
+              item->end = atoi(str);
+              if(!item->end && str[0] != '0') {
+                DBG("GFF '%s' %d:%d: atoi failed", filename, i, j);
+                exit(-1);
+              }
+              break;
+            case 5:
+              item->score = atof(str);
+              if(item->score == 0.0 && str[0] != '0' && str[0] != '.') {
+                DBG("GFF '%s' %d:%d: %f(%s) :atof failed", filename, i, j, item->score, str);
+                exit(-1);
+              }
+              break;
+            case 6:
+              if(str[0] != '-' && str[0] != '+' && str[0] != '.') {
+                DBG("GFF '%s' %d:%d: strand failed", filename, i, j);
+                exit(-1);
+              }
+              item->strand = str[0];
+              break;
+            case 7:
+              item->frame = atoi(str);
+              /* a result of 0 is fine for "." (no frame) and for a literal
+               * "0"; the latter used to be rejected as a parse failure */
+              if((!item->frame && str[0] != '.' && str[0] != '0') || item->frame > 2) {
+                DBG("GFF '%s' %d:%d: %s:atoi frame failed", filename, i, j, str);
+                exit(-1);
+              }
+              break;
+            case 8:
+              pch = strtok(str, ";");
+
+              while(pch != NULL) {
+                pchlen = strlen(pch);
+
+                /* skip leading white space of the attribute */
+                for(p=0; p < pchlen && isspace((unsigned char)pch[p]); p++);
+                if(p < pchlen) {
+                  bl_GFFAddAttribute(space, item, &pch[p], strlen(&pch[p]));
+                }
+                pch = strtok(NULL, ";");
+              }
+
+              break;
+            default:
+              DBG("'%s' not in GFF format\n", filename);
+              exit(-1);
+              break;
+          }
+        }
+
+        track->noofitems++;
+      }
+
+    }
+    destructStringset(space, set[i]);
+  }
+
+  qsort(track->items, track->noofitems, sizeof(annotationitem_t),
+      bl_annotationitem_cmp);
+
+  bl_annotationtrackAssignTrackLevel(track);
+
+  FREEMEMORY(space,set);
+  return track;
+}
+
+
+/*------------------------ bl_annotationtrackGetStats ------------------------
+ *
+ * @brief get number of different loci in annotation track
+ *        Consecutive items with identical start, end and strand form one
+ *        locus. Every item receives an attribute "loci_cnt <k> <m>", where
+ *        m is the number of items of its locus and k the 1-based position
+ *        of the item within it.
+ *        NOTE(review): the chromosome name is not part of the comparison -
+ *        confirm that this is intended.
+ * @param space handed through to the memory macros
+ * @param track the track; expected to be sorted so that equal loci are
+ *              adjacent
+ * @return always 0
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_annotationtrackGetStats (void *space, annotationtrack_t *track)
+{
+
+  Uint i=0,j,noofdups, len;
+  char *attr;
+  annotationitem_t *a, *b;
+
+  while(i < track->noofitems) {
+    a = &track->items[i];
+
+    /* find the end of the run of items sharing the locus of item i */
+    for(j=i+1; j < track->noofitems; j++) {
+      b = &track->items[j];
+      if(a->start != b->start || a->end != b->end || a->strand != b->strand) {
+        break;
+      }
+    }
+
+    noofdups = j-i;
+
+    for(j=i; j < i+noofdups; j++) {
+      b = &track->items[j];
+
+      len = snprintf(NULL, 0, "loci_cnt %d %d", j-i+1, noofdups);
+      attr = ALLOCMEMORY(space, NULL, char, len+1);
+      snprintf(attr, len+1,"loci_cnt %d %d", j-i+1, noofdups);
+      bl_GFFAddAttribute(space, b, attr, len);
+      /* bl_GFFAddAttribute stores its own copy of the text */
+      FREEMEMORY(space, attr);
+    }
+
+    i+=noofdups;
+
+  }
+
+  return 0;
+}
+
+
+
+/*------------------------------- bl_BEDwrite --------------------------------
+ *
+ * @brief write a bed to a file
+ *
+ * One line per item. chromname, start and end are always written. The
+ * optional columns follow in BED order, and each one is only written if all
+ * columns before it were written and its own value is present or non-zero:
+ * name, score (>= 0), strand, thickStart, thickEnd, then itemRgb (or "0"
+ * if there is none), blockCount, blockSizes and blockStarts. Block starts
+ * with a reference sequence are written as refseq:start:strand.
+ *
+ * The stream is closed at the end, so the caller must not use or close fp
+ * afterwards.
+ *
+ * @param track the track to write
+ * @param fp    open stream to write to; closed by this function
+ * @author Steve Hoffmann
+ *
+ */
+
+ void
+bl_BEDwrite (annotationtrack_t *track, FILE *fp)
+{
+ Uint i,j;
+ annotationitem_t *b;
+
+
+ for(i=0; i < track->noofitems; i++) {
+ b = &track->items[i];
+ fprintf(fp,"%s\t%d\t%d\t", b->chromname, b->start, b->end);
+ if(b->name) {
+ fprintf(fp,"%s\t", b->name);
+ if(b->score >=0) {
+ fprintf(fp, "%f\t", b->score);
+ if(b->strand) { /* bl_annotationitemInit sets strand to '0', so this also holds for items without strand */
+ fprintf(fp, "%c\t", b->strand);
+ if(b->thickStart) {
+ fprintf(fp, "%d\t", b->thickStart);
+ if(b->thickEnd) {
+ fprintf(fp, "%d\t", b->thickEnd);
+ if(b->itemRgb) {
+ fprintf(fp, "%d,%d,%d\t", b->itemRgb[0], b->itemRgb[1], b->itemRgb[2]);
+ } else {
+ fprintf(fp, "0\t");
+ }
+ if(b->blockCount) {
+ fprintf(fp, "%d\t", b->blockCount);
+ if(b->blockSizes) {
+ for(j=0; j < b->blockCount; j++) {
+ fprintf(fp, "%d", b->blockSizes[j]);
+ if (j < b->blockCount-1) fprintf(fp, ",");
+ else fprintf(fp, "\t");
+ }
+ if(b->blockStarts) {
+ for(j=0; j < b->blockCount; j++) {
+ if(b->blockRefseqs && b->blockRefseqs[j]) {
+ fprintf(fp, "%s:%d:%c", b->blockRefseqs[j],
+ b->blockStarts[j], b->blockStrands[j]);
+ } else {
+ fprintf(fp, "%d", b->blockStarts[j]);
+ }
+
+ if (j < b->blockCount-1) fprintf(fp, ",");
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ fprintf(fp, "\n");
+ }
+
+ fclose(fp);
+ return ;
+}
+
+
+/*-------------------------------- bl_GTFread --------------------------------
+ *
+ * @brief read a GTF file and construct a list of gene models
+ *        Not implemented: the file is not opened and NULL is always
+ *        returned.
+ * @param filename the GTF file (currently unused)
+ * @return always NULL
+ * @author Steve Hoffmann
+ *
+ */
+
+geneset_t*
+bl_GTFread (char *filename)
+{
+ return NULL;
+}
+
+
diff --git a/segemehl/libs/biofiles.h b/segemehl/libs/biofiles.h
new file mode 100644
index 0000000..f827116
--- /dev/null
+++ b/segemehl/libs/biofiles.h
@@ -0,0 +1,292 @@
+#ifndef _BIOFILES_
+#define _BIOFILES_
+
+/*
+ *
+ * biofiles.h
+ * declarations
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 07/10/2007 02:32:29 PM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: biofiles.h 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/biofiles.h $
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "stringutils.h"
+#include "basic-types.h"
+#include "charsequence.h"
+#include "zran.h"
+
+#define ID 1
+#define IDEND 2
+#define SEQEND 3
+#define QUALIDEND 4
+#define END 5
+
+#define GFFITEM 0
+#define BEDITEM 1
+#define SNPITEM 2
+
+/* One access point of a sequence index. Only names and types are visible
+ * in this header, so the per-field notes are assumptions to be confirmed
+ * against the indexing code in biofiles.c. */
+typedef struct seqaccesspoint_s {
+ off_t offset; /* presumably a byte offset into the indexed file -- TODO confirm */
+ Uint fileid; /* presumably identifies the file the offset refers to -- TODO confirm */
+ Uint noofseqs; /* presumably a sequence count for this access point -- TODO confirm */
+ Uint cumnoofseqs; /* presumably a cumulative sequence count -- TODO confirm */
+} seqaccesspoint_t;
+
+
+/* Array of access points with bookkeeping counters. Has the same member
+ * layout as fastxfileindex_t; used by fasta_t for the chunk indices. */
+typedef struct fastxseqindex_s {
+ seqaccesspoint_t *ap; /* access point array */
+ Uint size; /* presumably the number of entries in use -- TODO confirm */
+ Uint allocated; /* presumably the number of entries allocated -- TODO confirm */
+} fastxseqindex_t;
+
+
+/* Array of access points with bookkeeping counters. Has the same member
+ * layout as fastxseqindex_t; used by fasta_t for the per-file indices. */
+typedef struct fastxfileindex_s {
+ seqaccesspoint_t *ap; /* access point array */
+ Uint size; /* presumably the number of entries in use -- TODO confirm */
+ Uint allocated; /* presumably the number of entries allocated -- TODO confirm */
+} fastxfileindex_t;
+
+
+/* Container for a set of sequences (and optionally their mates) read
+ * from one or more fasta/fastq files, together with the index structures
+ * used for chunk-wise access. The group comments below are derived from
+ * member names and the accessor prototypes in this header; exact
+ * semantics should be confirmed in biofiles.c. */
+typedef struct fasta_s {
+
+ /* sequences and, presumably per sequence, their qualities */
+ CharSequence** seqs;
+ CharSequence** quals;
+
+ /* counters and statistics -- meaning inferred from names, TODO confirm */
+ Uint *matestart;
+ Uint noofseqs;
+ Uint active_noofseqs;
+ Uint active_noofmates;
+ Uint minlen;
+ Uint maxlen;
+ char minqual;
+ char maxqual;
+ Uint curchunk;
+ Uint offset;
+
+ /* flags; `upper`/`lower` mirror the parameters of the bl_fastxRead
+ * family declared below */
+ unsigned char lower;
+ unsigned char upper;
+ unsigned char gzip;
+ unsigned char hasMates;
+ unsigned char hasIndex;
+ unsigned char chunkIsActive;
+
+ /* input files; presumably arrays of length nooffiles -- TODO confirm */
+ Uint nooffiles;
+ Uint *filetotal;
+ char **filenames;
+ char **matefilenames;
+
+ /* chunk indices for the sequences and their mates */
+ fastxseqindex_t *chunkindex;
+ fastxseqindex_t *matechunkindex;
+
+ /* per-file indices for the sequences and their mates */
+ fastxfileindex_t **findex;
+ fastxfileindex_t **matefindex;
+
+ /* `struct access` is not declared in this header; it presumably comes
+ * from the included zran.h -- TODO confirm */
+ struct access **gzindex;
+ struct access **mategzindex;
+
+} fasta_t;
+
+
+
+
+/* One annotation record. The same struct is used for GFF, BED and SNP
+ * items; format-specific members are grouped below. */
+typedef struct {
+
+ /* presumably one of GFFITEM, BEDITEM, SNPITEM -- TODO confirm */
+ unsigned char type;
+ /*
+ *
+ * chromname is gff seqname
+ *
+ */
+ char *chromname;
+ Uint chromnamelen;
+
+ /*
+ * BED and GFF: 1-offset
+ * for personalSNP: start with 0-offset
+ * end base is not part of the feature, i.e. if
+ * end = 100 the last feature base is 99. see below.
+ *
+ */
+
+ Uint start;
+ Uint end;
+
+ /*
+ * GFF: name is the feature key
+ * BED: name is the name of the actual feature
+ * for personalSNP track name is alleles A,C,T,G separated by '/'.
+ * Leading '-' is indel: insertion if start-end=0
+ *
+ * */
+
+ char *name;
+ Uint namelen;
+ double score;
+ unsigned char strand;
+
+ /*GFF fields*/
+ unsigned char frame;
+ char *source;
+ Uint sourcelen;
+ Uint noofattributes;
+ char **attributes;
+ Uint *attributelen;
+
+ /*BED fields*/
+ Uint thickStart;
+ Uint thickEnd;
+ Uint *itemRgb; /* bl_BEDwrite reads three entries: [0],[1],[2] */
+ Uint blockCount;
+ Uint* blockSizes; /* indexed 0..blockCount-1 by bl_BEDwrite */
+ Uint* blockStarts; /* indexed 0..blockCount-1 by bl_BEDwrite */
+ Uint noofovl;
+ Uint firstovl;
+ Uint level;
+ /*extension*/
+ char **blockRefseqs; /* optional per-block reference name; may be NULL */
+ char *blockStrands; /* per-block strand character, read together with blockRefseqs */
+
+ /*SNPitem*/
+ Uint alleleCount;//number of alleles in name
+ Uint *alleleFreq;//from comma separated list of number of observed alleles - if unknown 0
+ Uint *alleleScores;//from a comma separated list - if unknown 0
+
+} annotationitem_t;
+
+
+
+/* A named collection of annotation items. */
+typedef struct {
+
+ char *trackname;
+ Uint tracknamelen;
+ char *description;
+ Uint descriptionlen;
+ Uint noofitems; /* number of entries in items; bl_BEDwrite iterates items[0..noofitems-1] */
+ annotationitem_t *items;
+
+} annotationtrack_t;
+
+
+/* Start/end pair of a coding segment; stored per exon in exon_t.
+ * Coordinate convention is not visible here -- TODO confirm. */
+typedef struct {
+ Uint start;
+ Uint end;
+} cds_t;
+
+/* One exon of a gene model together with its coding segments. */
+typedef struct {
+ Uint start;
+ Uint end;
+ char *refchr; /* presumably the reference/chromosome name -- TODO confirm */
+ char strand;
+ Uint noofcds; /* presumably the number of entries in cds -- TODO confirm */
+ cds_t *cds;
+} exon_t;
+
+/* One gene model: an identifier plus its exons. */
+typedef struct {
+ char *id;
+ char direction;
+ Uint noofexons; /* presumably the number of entries in exons -- TODO confirm */
+ exon_t *exons;
+ Uint startcodon;
+ Uint stopcodon;
+} gene_t;
+
+
+/* A list of gene models; this is the return type of bl_GTFread. */
+typedef struct {
+ Uint noofgenes; /* presumably the number of entries in genes -- TODO confirm */
+ gene_t *genes;
+} geneset_t;
+
+
+fasta_t**
+bl_fastxChopIndex(void *space, fasta_t *f, Uint pieces);
+void bl_fastaGetClipPos (fasta_t *f, Uint elem, Uint *p5, Uint *p3);
+void bl_fastaGetMateClipPos (fasta_t *f, Uint elem, Uint *p5, Uint *p3);
+Uint bl_fastaSoftClip (void *space, fasta_t *f, Uint elem,
+ char *p5, Uint p5len, Uint p5scr, char *p3, Uint p3len, Uint p3scr, Uint pAlen);
+Uint bl_fastaMateSoftClip (void *space, fasta_t *f, Uint elem,
+ char *p5, Uint p5len, Uint p5scr, char *p3, Uint p3len, Uint p3scr, Uint pAlen);
+Uint bl_fastaHardClip (void *space, fasta_t *f, Uint elem,
+ Uint p5, Uint p3);
+Uint bl_fastaMateHardClip (void *space, fasta_t *f, Uint elem,
+ Uint p5, Uint p3);
+void bl_fastaDestruct(void *space, fasta_t* f);
+void bl_fastxDestructSequence(void *space, fasta_t* f);
+int bl_fastxGetChunk (fasta_t *fasta, Uint k);
+fasta_t** bl_fastaChop(void *space, fasta_t* f, Uint pieces);
+Uint bl_fastaGetMateDescriptionLength(fasta_t *f, Uint elem);
+char* bl_fastaGetMateDescription(fasta_t *f, Uint elem);
+unsigned char bl_fastaHasQuality(fasta_t *f);
+fasta_t* bl_fastaInit(void *);
+void bl_fastaAdd(void *space, fasta_t*, char *desc, Uint,
+ char* sequence, Uint, Uint);
+fasta_t* bl_fastaRead(void *space, fasta_t*, char* filename,
+ unsigned char upper, unsigned char lower, unsigned int n,
+ void (*handler) (void *, fasta_t*, char*, Uint, char*, Uint, Uint));
+fasta_t* bl_fastxGetSet(void *space, char **filenames, unsigned int nooffiles,
+ unsigned char upper, unsigned char lower, unsigned char index, Uint pieces);
+fasta_t* bl_fastaGetSet(void *space, char **filenames, unsigned int nooffiles,
+ unsigned char upper, unsigned char lower);
+Uint bl_fastaGetMateStart(fasta_t *f, Uint elem);
+void bl_fastxDestructChunkIndex (void *space, fasta_t *);
+fasta_t* bl_fastxGetMateSet(void *space, fasta_t* set, char** filenames,
+ unsigned int nooffiles, unsigned char upper, unsigned lower,
+ unsigned char index, Uint pieces);
+void bl_fastxDestructSequence(void *space, fasta_t* f);
+fasta_t* bl_fastxgzRead (void *space, fasta_t *fasta, char *filename, struct access *idx,
+ unsigned char upper, unsigned char lower, off_t offset, Uint startseq, Uint lastseq,
+ void (*handler)(void *, fasta_t*, char *, Uint, char *, char *, Uint, Uint));
+fasta_t* bl_fastxRead(void *space, fasta_t* fasta, char* filename,
+ unsigned char upper, unsigned char lower, off_t offset, Uint startseq, Uint lastseq,
+ void (*handler) (void *, fasta_t*, char *, Uint, char *, char *, Uint, Uint));
+void bl_fastxAdd(void *space, fasta_t *f, char *desc, Uint descrlen,
+ char *sequence, char *quality, Uint seqlen, Uint sNo);
+void bl_fastxAddMate(void *space, fasta_t *f, char *desc, Uint desclen,
+ char *sequence, char *quality, Uint seqlen, Uint sNo);
+void bl_fastxDump( void *space, fasta_t *fasta, char *desc, Uint desclen,
+ char *sequence, char *quality, Uint quallen, Uint sNo);
+Uint bl_fastxGetChunkElem (void *space, fasta_t *f, Uint k);
+fasta_t* bl_fastxIndex(void *space, fasta_t *f, char **filenames, Uint nooffiles,
+ unsigned char isMate, unsigned char gzip, Uint pieces);
+Uint bl_fastaGetDescriptionLength(fasta_t *f, Uint elem);
+char* bl_fastaGetDescription(fasta_t *f, Uint elem);
+Uint bl_fastaGetSequenceLength(fasta_t *f, Uint elem);
+char* bl_fastaGetSequence(fasta_t *f, Uint elem);
+char* bl_fastaGetQuality(fasta_t* f, Uint elem);
+#ifdef HASHING
+Uint bl_fastaGetQuantity(fasta_t* f, Uint elem);
+void bl_fastaSetQuantity(fasta_t* f, Uint elem, Uint quantity);
+#endif
+unsigned char bl_fastaHasMate(fasta_t *f);
+Uint bl_fastaGetMateLength(fasta_t *f, Uint elem);
+char* bl_fastaGetMate(fasta_t *f, Uint elem);
+char* bl_fastaGetMateQuality(fasta_t *f, Uint elem);
+fasta_t* bl_fastxgzIndex(void *space, char *gzfilename);
+fastxseqindex_t* bl_fastxChunkIndex (void *space, char **filenames, struct access **gzindex,
+ fastxfileindex_t **findex, Uint *n, Uint nooffiles, Uint total, Uint k);
+void bl_fastaSetMateClip (fasta_t *f, Uint elem, Uint p5, Uint p3);
+void bl_fastaSetClip (fasta_t *f, Uint elem, Uint p5, Uint p3);
+int bl_rm(void *space, char *filename);
+Uint bl_fastxFindIDIdx (char *id, fasta_t *set);
+annotationtrack_t* bl_BEDread (void *space, char *filename);
+void bl_BEDwrite (annotationtrack_t *track, FILE *dev);
+void bl_annotationtrackDestruct (void *space, annotationtrack_t *track);
+annotationtrack_t* bl_GFFread (void *space, char *filename);
+Uint bl_annotationitem_cmp_track (Uint item, void *track, void *elem, void *nfo);
+int bl_fastxIDcmp (char *a, char *b);
+void bl_GFFAddAttribute (void *space, annotationitem_t *item, char *attr, Uint len);
+void bl_GFFwrite(char *filename, annotationtrack_t *set);
+Uint bl_annotationtrackGetStats (void *space, annotationtrack_t *track);
+
+#endif
diff --git a/segemehl/libs/bitArray.c b/segemehl/libs/bitArray.c
new file mode 100644
index 0000000..9b12f0d
--- /dev/null
+++ b/segemehl/libs/bitArray.c
@@ -0,0 +1,89 @@
+
+/*
+ * bitArray.c
+ * implementations
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 07/14/2007 04:15:14 PM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 33 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-27 10:00:08 +0200 (Tue, 27 May 2008) $
+ *
+ * Id: $Id: bitArray.c 33 2008-05-27 08:00:08Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/bitArray.c $
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "basic-types.h"
+#include "memory.h"
+#include "bitArray.h"
+
+/* Allocate a bit array with room for `len` bits (rounded up to whole
+ * bytes) and report the byte count on stderr. The memory is requested
+ * through ALLOCMEMORY; nothing in this function initialises it. */
+bitarray
+initbitarray(void *space, Uint len) {
+  Uint bytes = len / 8;
+  bitarray arr;
+
+  /* a trailing partial byte needs one more byte */
+  if (len % 8 != 0) {
+    bytes++;
+  }
+
+  fprintf(stderr, "init bit array of %u\n", bytes);
+  arr = ALLOCMEMORY(space, NULL, unsigned char, bytes);
+
+  return arr;
+}
+
+/* Resize bit array `a` through ALLOCMEMORY so that it has room for
+ * `len` bits (rounded up to whole bytes); returns the new pointer. */
+bitarray
+resizebitarray(void *space, bitarray a, Uint len) {
+  Uint bytes = len / 8;
+
+  if (len % 8 != 0) {
+    bytes++;
+  }
+
+  a = ALLOCMEMORY(space, a, unsigned char, bytes);
+  return a;
+}
+
+/* Fill every byte that holds one of the first `len` bits of `a`:
+ * all ones if `val` is non-zero, all zeros otherwise. */
+void
+setbitarray(bitarray a, Uint len, unsigned char val) {
+  Uint bytes = len / 8;
+  int fill = 0;
+
+  if (len % 8 != 0) {
+    bytes++;
+  }
+  if (val) {
+    fill = 255;
+  }
+
+  memset(a, fill, bytes);
+}
+
+/*
+ * Check whether all of the first `len` bits of `a` equal `val`.
+ *
+ * `val` is interpreted like in setbitarray: non-zero means "bit set".
+ * Returns 1 if every bit matches, 0 otherwise.
+ *
+ * Fix: whole bytes used to be compared against 255 regardless of `val`,
+ * so a query for val == 0 could never succeed once len >= 8.
+ */
+unsigned char
+valbitarray(bitarray a, Uint len, unsigned char val) {
+  Uint i;
+  unsigned char bit = (val) ? 1 : 0;
+  unsigned char full = (val) ? 255 : 0;
+
+  /* whole bytes first */
+  for(i=0; i < (len/8); i++) {
+    if (a[i] != full)
+      return 0;
+  }
+  /* then the len%8 bits of the trailing partial byte, last bit first */
+  for(i=0; i < (len%8); i++){
+    if (getbit(a, len-i-1) != bit)
+      return 0;
+  }
+
+  return 1;
+}
+
+/* Print the first `len` bits of `a` to stdout as "0 "/"1 " tokens,
+ * followed by a newline. */
+void
+dumpbitarray(bitarray a, Uint len) {
+  Uint pos = 0;
+
+  while (pos < len) {
+    printf("%d ", getbit(a, pos));
+    pos++;
+  }
+  printf("\n");
+}
+
diff --git a/segemehl/libs/bitArray.h b/segemehl/libs/bitArray.h
new file mode 100644
index 0000000..a689f8a
--- /dev/null
+++ b/segemehl/libs/bitArray.h
@@ -0,0 +1,69 @@
+#ifndef BITARRAY_H
+#define BITARRAY_H
+
+/*
+ *
+ * bitArray.h
+ * declarations for bit arrays
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 07/14/2007 04:15:27 PM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: bitArray.h 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/bitArray.h $
+ */
+
+typedef unsigned char* bitarray;
+
+bitarray initbitarray(void *, Uint length);
+void dumpbitarray(bitarray a, Uint len);
+unsigned char valbitarray(bitarray a, Uint len, unsigned char val);
+void setbitarray(bitarray a, Uint len, unsigned char val);
+bitarray resizebitarray(void *space, bitarray, Uint len);
+/* Set bit `pos` of `a` to the lowest bit of `val`. Bits are numbered
+ * from the most significant bit of each byte (bit 0 is 0x80 of a[0]). */
+static inline void
+setbit(bitarray a, Uint pos, unsigned char val) {
+  const Uint idx = pos >> 3;
+  const unsigned char mask = (unsigned char)(1u << (7 - (pos & 7)));
+
+  if (val & 1) {
+    a[idx] |= mask;
+  } else {
+    a[idx] &= (unsigned char) ~mask;
+  }
+}
+
+/* Return bit `pos` of `a` as 0 or 1. Bits are numbered from the most
+ * significant bit of each byte (bit 0 is 0x80 of a[0]). */
+static inline unsigned char
+getbit(bitarray a, Uint pos) {
+  const unsigned char cell = a[pos >> 3];
+  const Uint down = 7 - (pos & 7);
+
+  return (unsigned char)((cell >> down) & 1);
+}
+
+
+#endif
diff --git a/segemehl/libs/bitVector.c b/segemehl/libs/bitVector.c
new file mode 100644
index 0000000..35837b1
--- /dev/null
+++ b/segemehl/libs/bitVector.c
@@ -0,0 +1,192 @@
+
+/*
+ * bitVector.c
+ * implementations
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 07/14/2007 04:15:14 PM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 93 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-12-07 16:58:47 +0100 (Sun, 07 Dec 2008) $
+ *
+ * Id: $Id: bitArray.c 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/bitArray.c $
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "basic-types.h"
+#include "memory.h"
+#include "bitVector.h"
+
+/* Allocate a zero-initialised bit vector with room for `len` bits,
+ * rounded up to whole words. Uses calloc directly; `space` is unused. */
+inline bitvector
+initbitvector(void *space, Uint len) {
+  Uint words = len/BITVECTOR_WORDSIZE;
+
+  if (len % (BITVECTOR_WORDSIZE) != 0) {
+    words += 1;
+  }
+
+  return calloc(words, sizeof(bitvector_t));
+}
+
+/* Resize bit vector `a` through ALLOCMEMORY so that it has room for
+ * `len` bits (rounded up to whole words); returns the new pointer. */
+bitvector
+resizebitvector(void *space, bitvector a, Uint len) {
+  Uint words = len/BITVECTOR_WORDSIZE;
+
+  if (len % BITVECTOR_WORDSIZE != 0) {
+    words += 1;
+  }
+
+  a = ALLOCMEMORY(space, a, bitvector_t, words);
+  return a;
+}
+
+/* Fill every word that holds one of the first `len` bits of `a`:
+ * all ones if `val` is non-zero, all zeros otherwise. */
+inline void
+setbitvector(bitvector a, Uint len, unsigned char val) {
+  Uint words = len/BITVECTOR_WORDSIZE;
+  int fill = 0;
+
+  if (len % BITVECTOR_WORDSIZE != 0) {
+    words += 1;
+  }
+  if (val) {
+    fill = 255;
+  }
+
+  memset(a, fill, words*(sizeof(bitvector_t)));
+}
+
+/*
+ * Check whether all of the first `len` bits of `a` equal `val`.
+ *
+ * `val` is interpreted like in setbitvector: non-zero means "bit set".
+ * Returns 1 if every bit matches, 0 otherwise.
+ *
+ * Fix: whole words used to be compared against the constant 255, which
+ * is neither an all-ones nor an all-zeros bitvector_t, and `val` was
+ * ignored for them; vectors produced by setbitvector were therefore
+ * rejected as soon as len reached one full word.
+ */
+unsigned char
+valbitvector(bitvector a, Uint len, unsigned char val) {
+  Uint i;
+  unsigned char bit = (val) ? 1 : 0;
+  bitvector_t full = (val) ? ~(bitvector_t)0 : 0;
+
+  /* whole words first */
+  for(i=0; i < (len/BITVECTOR_WORDSIZE); i++) {
+    if (a[i] != full)
+      return 0;
+  }
+  /* then the remaining bits of the trailing partial word, last first */
+  for(i=0; i < (len%BITVECTOR_WORDSIZE); i++){
+    if (bitvector_getbit(a, len-i-1) != bit)
+      return 0;
+  }
+
+  return 1;
+}
+
+/* Print the first `len` bits of `a` to stdout as "0 "/"1 " tokens,
+ * followed by a newline. */
+void
+dumpbitvector(bitvector a, Uint len) {
+  Uint pos = 0;
+
+  while (pos < len) {
+    printf("%d ", bitvector_getbit(a, pos));
+    pos++;
+  }
+  printf("\n");
+}
+
+/* dest = a & b, word by word, over the len/BITVECTOR_WORDSIZE whole
+ * words; a trailing partial word is not processed. */
+void
+bitvectorAND(bitvector dest, bitvector a, bitvector b, Uint len) {
+  const int words = len/BITVECTOR_WORDSIZE;
+  int w = 0;
+
+  while (w < words) {
+    dest[w] = a[w] & b[w];
+    w++;
+  }
+}
+
+/* dest = a | b, word by word, over the len/BITVECTOR_WORDSIZE whole
+ * words; a trailing partial word is not processed. */
+void
+bitvectorOR(bitvector dest, bitvector a, bitvector b, Uint len) {
+  const int words = len/BITVECTOR_WORDSIZE;
+  int w = 0;
+
+  while (w < words) {
+    dest[w] = a[w] | b[w];
+    w++;
+  }
+}
+
+/* dest = ~a, word by word, over the len/BITVECTOR_WORDSIZE whole words;
+ * a trailing partial word is not processed. */
+void
+bitvectorNOT(bitvector dest, bitvector a, Uint len) {
+  const int words = len/BITVECTOR_WORDSIZE;
+  int w = 0;
+
+  while (w < words) {
+    dest[w] = ~a[w];
+    w++;
+  }
+}
+
+/* dest = a ^ b, word by word, over the len/BITVECTOR_WORDSIZE whole
+ * words; a trailing partial word is not processed. */
+void
+bitvectorXOR(bitvector dest, bitvector a, bitvector b, Uint len) {
+  const int words = len/BITVECTOR_WORDSIZE;
+  int w = 0;
+
+  while (w < words) {
+    dest[w] = a[w] ^ b[w];
+    w++;
+  }
+}
+
+/* Multi-word addition dest = a + b over the len/BITVECTOR_WORDSIZE whole
+ * words, word 0 being the least significant one. Returns the carry out
+ * of the last processed word (0 or 1). */
+unsigned char
+bitvectorADD(bitvector dest, bitvector a, bitvector b, Uint len){
+  const int words = len/BITVECTOR_WORDSIZE;
+  unsigned char carry = 0;
+  int w;
+
+  for(w = 0; w < words; w++) {
+    dest[w] = a[w] + b[w] + carry;
+    /* with an incoming carry the sum may wrap around to exactly a[w] or
+     * b[w], hence "<=" in that case and "<" otherwise; the operands are
+     * re-read after the store, as before */
+    carry = carry
+      ? ((dest[w] <= a[w]) || (dest[w] <= b[w]))
+      : ((dest[w] < a[w]) || (dest[w] < b[w]));
+  }
+  return carry;
+}
+
+
+/* Shift `a` by `shift` bits towards higher word indices / higher bit
+ * positions and store the result in `dest`. Only the
+ * len/BITVECTOR_WORDSIZE whole words are considered.
+ *
+ * Words are processed from the highest index downwards, so source words
+ * are read before they could be overwritten when dest == a.
+ *
+ * NOTE(review): the words dest[0 .. wordshift-1] are never written, so
+ * they keep their previous content instead of being cleared.
+ * NOTE(review): in the offset != 0 branch dest[wordshift] is written
+ * unconditionally, also when wordshift >= n (shift >= len) -- that looks
+ * like an out-of-bounds write; confirm callers never pass such a shift. */
+void
+bitvectorLSHIFT(bitvector dest, bitvector a, Uint len, Uint shift) {
+ int i;
+
+ int wordshift = shift/BITVECTOR_WORDSIZE; /* whole words to move */
+ int offset = shift % BITVECTOR_WORDSIZE; /* remaining bit shift inside a word */
+ int n = len/BITVECTOR_WORDSIZE;
+ int suboffset = BITVECTOR_WORDSIZE - offset;
+
+ if (offset == 0) {
+ /* pure word move */
+ for(i=n-1; i >= wordshift; --i) {
+ dest[i] = a[i-wordshift];
+ }
+ } else {
+ /* each target word combines the shifted source word with the bits
+ * spilling over from the next lower source word */
+ for(i=n-1; i > wordshift; --i) {
+ dest[i] = (a[i-wordshift] << offset) | (a[i-wordshift-1] >> suboffset);
+ }
+ /* lowest target word has no lower neighbour */
+ dest[wordshift] = a[0] << offset;
+ }
+}
+
+/* Shift `a` by `shift` bits towards lower word indices / lower bit
+ * positions and store the result in `dest`. Only the
+ * len/BITVECTOR_WORDSIZE whole words are considered.
+ *
+ * Words are processed from index 0 upwards, so source words are read
+ * before they could be overwritten when dest == a.
+ *
+ * NOTE(review): the words dest[limit+1 .. n-1] are never written, so
+ * they keep their previous content instead of being cleared.
+ * NOTE(review): in the offset != 0 branch dest[limit] and a[n-1] are
+ * accessed unconditionally; limit is negative when wordshift >= n and
+ * n-1 is negative when len is smaller than one word -- confirm callers
+ * never pass such arguments. */
+void
+bitvectorRSHIFT(bitvector dest, bitvector a, Uint len, Uint shift) {
+ int i;
+
+ int wordshift = shift/BITVECTOR_WORDSIZE; /* whole words to move */
+ int offset = shift % BITVECTOR_WORDSIZE; /* remaining bit shift inside a word */
+ int n = len/BITVECTOR_WORDSIZE;
+ int limit = n - wordshift -1; /* highest target word index that receives data */
+ int suboffset = BITVECTOR_WORDSIZE - offset;
+
+ if (offset == 0) {
+ /* pure word move */
+ for(i=0; i <= limit; ++i) {
+ dest[i] = a[i+wordshift];
+ }
+ } else {
+ /* each target word combines the shifted source word with the bits
+ * spilling over from the next higher source word */
+ for(i=0; i < limit; ++i) {
+ dest[i] = (a[i+wordshift] >> offset) | (a[i+wordshift+1] << suboffset);
+ }
+ /* highest target word has no higher neighbour */
+ dest[limit] = a[n-1] >> offset;
+ }
+}
+
+
+
diff --git a/segemehl/libs/bitVector.h b/segemehl/libs/bitVector.h
new file mode 100644
index 0000000..0575e61
--- /dev/null
+++ b/segemehl/libs/bitVector.h
@@ -0,0 +1,67 @@
+#ifndef BITVECTOR_H
+#define BITVECTOR_H
+
+/*
+ *
+ * bitVector.h
+ * declarations for bit vectors
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 07/14/2007 04:15:27 PM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 93 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-12-07 16:58:47 +0100 (Sun, 07 Dec 2008) $
+ *
+ * Id: $Id: bitArray.h 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/bitArray.h $
+ */
+
+#include "basic-types.h"
+
+#define BITVECTOR_WORDSIZE (sizeof(unsigned long long int)*8)
+
+
+typedef unsigned long long int bitvector_t;
+typedef bitvector_t* bitvector;
+
+extern bitvector initbitvector(void *, Uint length);
+void dumpbitvector(bitvector a, Uint len);
+unsigned char valbitvector(bitvector a, Uint len, unsigned char val);
+extern void setbitvector(bitvector a, Uint len, unsigned char val);
+bitvector resizebitvector(void *space, bitvector, Uint len);
+void wrapBitmatrix(void *space, bitvector *, Uint m);
+
+/* Set bit `pos` of `a` according to `val` (intended values: 0 or 1).
+ * Bit 0 is the least significant bit of a[0]. */
+static inline void
+bitvector_setbit(bitvector a, Uint pos, unsigned char val) {
+  const int word = pos/BITVECTOR_WORDSIZE;
+  const int bit = pos & (BITVECTOR_WORDSIZE - 1);
+  const bitvector_t mask = (bitvector_t)1 << bit;
+  /* all ones for val == 1, all zeros for val == 0 */
+  const bitvector_t fill = (bitvector_t)-val;
+
+  /* branch-free merge: take the masked bit from `fill`, keep the rest */
+  a[word] = a[word] ^ ((fill ^ a[word]) & mask);
+}
+
+/* Return bit `pos` of `a` as 0 or 1. Bit 0 is the least significant
+ * bit of a[0]. */
+static inline unsigned char
+bitvector_getbit(bitvector a, Uint pos) {
+  const int word = pos/BITVECTOR_WORDSIZE;
+  const int bit = pos & (BITVECTOR_WORDSIZE - 1);
+
+  return (unsigned char)(((bitvector_t)a[word] >> bit) & 1);
+}
+
+
+#endif
diff --git a/segemehl/libs/bitvectoralg.c b/segemehl/libs/bitvectoralg.c
new file mode 100644
index 0000000..54e0566
--- /dev/null
+++ b/segemehl/libs/bitvectoralg.c
@@ -0,0 +1,632 @@
+
+/*
+ * bitvectoralg.c
+ * implementation of Gene Myers
+ * bitvector algorithm
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 05/23/2008 06:12:54 PM CEST
+ *
+ */
+
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "basic-types.h"
+#include "mathematics.h"
+#include "bitVector.h"
+#include "memory.h"
+#include "alignment.h"
+#include "iupac.h"
+
+
+/*-------------------------------- alphacheck ---------------------------------
+ *
+ * @brief alphabet check
+ * @author Steve Hoffmann
+ *
+ */
+
+/* Return 1 if `c` is one of the nucleotide letters A, C, G, T in either
+ * case, 0 for every other character. */
+unsigned char
+alphacheck (char c) {
+  switch (c) {
+    case 'A': case 'a':
+    case 'C': case 'c':
+    case 'G': case 'g':
+    case 'T': case 't':
+      return 1;
+    default:
+      return 0;
+  }
+}
+
+
+/*---------------------------- getstringalphabet -----------------------------
+ *
+ * @brief return the alphabet of a string
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Collect the distinct characters of string[0..len-1] in order of first
+ * occurrence.
+ *
+ * Returns a NUL-terminated array allocated through ALLOCMEMORY (owned by
+ * the caller) and stores the number of distinct characters in *asize.
+ * For len == 0 the result is NULL and *asize is 0.
+ *
+ * Fix: the "seen" table was indexed with (Uint)string[i]; where char is
+ * signed, a byte >= 0x80 turned into a huge index and the access went
+ * out of bounds. The index now goes through unsigned char. The table
+ * itself lives on the stack instead of being allocated and freed.
+ */
+char*
+getstringalphabet (void *space, char *string, Uint len, Uint *asize)
+{
+  char *alphabet=NULL;
+  unsigned char found[256] = {0};
+  unsigned char c;
+  Uint i, l = 0;
+
+  for(i=0; i < len; i++) {
+    c = (unsigned char) string[i];
+    if(!found[c]) {
+      /* room for the new character plus the terminating NUL */
+      alphabet = ALLOCMEMORY(space, alphabet, char, l+2);
+      alphabet[l] = string[i];
+      alphabet[l+1] = 0;
+      found[c] = 1;
+      l++;
+    }
+  }
+
+  *asize = l;
+  return alphabet;
+}
+
+
+
+/*-------------------------------- encodetab ---------------------------------
+ *
+ * @brief brutal ascii encoding
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Build a lookup table that maps every byte value (0..255) to the index
+ * of that character in `alphabet`; characters that do not occur in the
+ * alphabet map to `asize`. The table is allocated through ALLOCMEMORY
+ * and owned by the caller.
+ *
+ * Fixes:
+ *  - the table had 255 entries, so byte value 255 had no slot;
+ *  - the default was written with memset, which fills bytes rather than
+ *    Uint elements, so entries did not hold `asize`;
+ *  - alphabet characters are indexed through unsigned char so bytes
+ *    >= 0x80 cannot produce an out-of-range index.
+ *
+ * NOTE(review): `space` is not a parameter of this function; this only
+ * compiles if ALLOCMEMORY discards its first argument or a file-scope
+ * `space` exists -- left as in the original.
+ * NOTE(review): the callers visible in this file index a peq array of
+ * `asize` entries with the looked-up value, so the default `asize` is
+ * itself one past the end there -- confirm how unknown characters are
+ * meant to be handled.
+ */
+Uint*
+encodetab(char *alphabet, Uint asize) {
+  Uint i;
+  Uint *tab;
+
+  tab = ALLOCMEMORY(space, NULL, Uint, 256);
+  for(i=0; i < 256; i++) {
+    tab[i] = asize;
+  }
+  for(i=0; i < asize; i++) {
+    tab[(unsigned char)alphabet[i]] = i;
+  }
+  return tab;
+}
+
+
+/*---------------------------------- getpeq ----------------------------------
+ *
+ * @brief returns pattern mask for each char in alphabet
+ * @author Steve Hoffmann
+ *
+ */
+
+/* Build one match mask per alphabet symbol: in masks[s], bit p is set
+ * iff matchIUPAC(query[p], alphabet[s]) holds. Every mask spans
+ * qlen/BITVECTOR_WORDSIZE + 1 words. The array of masks comes from
+ * ALLOCMEMORY, the masks themselves from initbitvector; `enctab` is not
+ * used here. */
+bitvector*
+getpeq(void *space,
+    char *query,
+    Uint qlen,
+    char *alphabet,
+    Uint asize,
+    Uint *enctab) {
+
+  const Uint words = qlen/BITVECTOR_WORDSIZE + 1;
+  const Uint nbits = BITVECTOR_WORDSIZE*words;
+  bitvector *masks;
+  Uint sym, pos;
+
+  masks = ALLOCMEMORY(space, NULL, bitvector*, asize);
+
+  for(sym = 0; sym < asize; sym++) {
+    masks[sym] = initbitvector(space, nbits);
+    setbitvector(masks[sym], nbits, 0);
+
+    for(pos = 0; pos < qlen; pos++) {
+      if (!matchIUPAC(query[pos], alphabet[sym])) {
+        continue;
+      }
+      bitvector_setbit(masks[sym], pos, 1);
+    }
+  }
+
+  return masks;
+}
+
+
+/*------------------------------ myersbitvector ------------------------------
+ *
+ * @brief approx. string matching: calculate min edist of query and subject
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Scan `subject` column by column with the bit-parallel recurrence,
+ * keeping only the current column (Pv/Mv) and a running `score` that
+ * starts at qlen.
+ *
+ * Returns a pair: res.b is the smallest score seen that is <= k and
+ * res.a the subject index i at which it was seen (on ties the later
+ * index wins because of "<="). If no column reaches a score <= k the
+ * result is a = -1, b = qlen.
+ *
+ * `peq` holds the per-symbol match masks (see getpeq) and `enctab` maps
+ * a subject character to its mask index. `query`, `alphabet` and `asize`
+ * are not used in the body. Pv and Mv are allocated and released here.
+ *
+ * NOTE(review): enctab is indexed with (Uint)subject[i]; if char is
+ * signed and the byte is >= 0x80 this becomes a very large index.
+ */
+PairSint
+myersbitvector(
+ void *space,
+ char *query,
+ Uint qlen,
+ char *subject,
+ Uint slen,
+ char *alphabet,
+ Uint asize,
+ Uint *enctab,
+ Uint k,
+ bitvector *peq) {
+
+ bitvector
+ Pv,
+ Mv,
+ Eq;
+ PairSint res;
+
+ Uint score=qlen,
+ i,
+ j,
+ wordno,
+ bits;
+ bitvector_t check,
+ temp,
+ carryxh,
+ carryph,
+ carrymh,
+ Ph=0,
+ Mh=0,
+ Xv=0,
+ Xh=0;
+
+ res.a = -1;
+ res.b = qlen;
+ /* words per column: always one more than qlen/BITVECTOR_WORDSIZE */
+ wordno = qlen/BITVECTOR_WORDSIZE;
+ bits = qlen & (BITVECTOR_WORDSIZE-1);
+ wordno++;
+
+ Pv = initbitvector(space, wordno*BITVECTOR_WORDSIZE);
+ Mv = initbitvector(space, wordno*BITVECTOR_WORDSIZE);
+
+ /* single-bit mask at position qlen % BITVECTOR_WORDSIZE; tested against
+ * Ph/Mh of the last word to update the score */
+ check = 0;
+ bitvector_setbit(&check, bits, 1);
+
+ setbitvector(Pv, wordno*BITVECTOR_WORDSIZE, 1);
+ setbitvector(Mv, wordno*BITVECTOR_WORDSIZE, 0);
+
+
+ for(i=0; i < slen; i++) {
+
+ Eq = peq[enctab[(Uint)subject[i]]];
+ carryxh = carryph = carrymh = 0;
+
+ for(j=0; j < wordno; j++) {
+
+ Xv = Eq[j] | Mv[j];
+ /* multi-word addition; carryxh propagates into the next word */
+ temp = ((Eq[j] & Pv[j]) + Pv[j] + carryxh);
+ Xh = (temp ^ Pv[j]) | Eq[j];
+
+ /* overflow test of the addition above, with/without incoming carry */
+ if (carryxh)
+ carryxh = (temp <= (Eq[j] & Pv[j]) || temp <= Pv[j]);
+ else
+ carryxh = (temp < (Eq[j] & Pv[j]) || temp < Pv[j]);
+
+ Ph = Mv[j] | ~(Xh | Pv[j]);
+ Mh = Pv[j] & Xh;
+
+ //check if last word
+ if (j == wordno-1) {
+ if (Ph & check)
+ score+=1;
+ else if(Mh & check)
+ score-=1;
+ }
+
+ /*Ph = Ph << 1; with carry*/
+ temp = (Ph << 1) | carryph;
+ carryph = Ph >> (BITVECTOR_WORDSIZE-1);
+ Ph = temp;
+
+ /* Mh = Mh << 1; with carry */
+ temp = (Mh << 1) | carrymh;
+ carrymh = Mh >> (BITVECTOR_WORDSIZE-1);
+ Mh = temp;
+
+ Pv[j] = Mh | ~(Xv | Ph);
+ Mv[j] = Ph & Xv;
+
+ }
+
+ /* remember the best (and, on ties, latest) column within the threshold */
+ if (score <= k && score <= res.b) {
+ res.a = i;
+ res.b = score;
+ }
+ }
+
+ FREEMEMORY(space, Pv);
+ FREEMEMORY(space, Mv);
+
+ return res;
+}
+
+
+/*------------------------------ myersbitmatrix ------------------------------
+ *
+ * @brief modified bitvector algorithm to return bitmatrix for backtracking
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Same column recurrence as myersbitvector, but every column is stored
+ * so that it can be backtracked later (see bitvectorbacktrack).
+ *
+ * `D` is a caller-provided array of row pointers: rows D[0..slen] are
+ * used as Pv columns and rows D[dim+1 .. dim+1+slen] as Mv columns;
+ * each row must hold at least qlen/BITVECTOR_WORDSIZE + 1 words.
+ * Column 0 is initialised here (Pv all ones, Mv all zeros); column i+1
+ * is computed from column i for every subject position i.
+ *
+ * *res receives the best score <= k (res->b) and the subject index at
+ * which it was seen (res->a, later index wins on ties); a = -1 and
+ * b = qlen if no column qualifies. Returns D.
+ *
+ * `query`, `alphabet` and `asize` are not used in the body.
+ *
+ * NOTE(review): enctab is indexed with (Uint)subject[i]; if char is
+ * signed and the byte is >= 0x80 this becomes a very large index.
+ */
+bitvector*
+myersbitmatrix(
+ void *space,
+ char *query,
+ Uint qlen,
+ char *subject,
+ Uint slen,
+ char *alphabet,
+ Uint asize,
+ Uint *enctab,
+ Uint k,
+ bitvector *peq,
+ PairSint *res,
+ bitvector *D,
+ Uint dim) {
+
+ bitvector
+ *Pv,
+ *Mv,
+ MvP,
+ PvP,
+ Eq;
+
+ Uint score=qlen,
+ i,
+ j,
+ wordno,
+ bits;
+ bitvector_t check,
+ temp,
+ carryxh,
+ carryph,
+ carrymh,
+ Ph=0,
+ Mh=0,
+ Xv=0,
+ Xh=0;
+
+ res->a = -1;
+ res->b = qlen;
+ /* words per column: always one more than qlen/BITVECTOR_WORDSIZE */
+ wordno = qlen/BITVECTOR_WORDSIZE;
+ bits = qlen & (BITVECTOR_WORDSIZE-1);
+ wordno++;
+
+ /* Pv columns start at D[0], Mv columns at D[dim+1] */
+ Pv = D;
+ Mv = &Pv[dim+1];
+
+ memset(Pv[0], 255, wordno*(sizeof(bitvector_t)));
+ memset(Mv[0], 0, wordno*(sizeof(bitvector_t)));
+
+ /* single-bit mask at position qlen % BITVECTOR_WORDSIZE; tested against
+ * Ph/Mh of the last word to update the score */
+ check = 0;
+ bitvector_setbit(&check, bits, 1);
+
+ for(i=0; i < slen; i++) {
+
+ Eq = peq[enctab[(Uint)subject[i]]];
+ carryxh = carryph = carrymh = 0;
+
+ /* previous column */
+ MvP = Mv[i];
+ PvP = Pv[i];
+
+ for(j=0; j < wordno; j++) {
+
+ Xv = Eq[j] | MvP[j];
+ /* multi-word addition; carryxh propagates into the next word */
+ temp = ((Eq[j] & PvP[j]) + PvP[j] + carryxh);
+ Xh = (temp ^ PvP[j]) | Eq[j];
+
+ /* overflow test of the addition above, with/without incoming carry */
+ if (carryxh)
+ carryxh = (temp <= (Eq[j] & PvP[j]) || temp <= PvP[j]);
+ else
+ carryxh = (temp < (Eq[j] & PvP[j]) || temp < PvP[j]);
+
+ Ph = MvP[j] | ~(Xh | PvP[j]);
+ Mh = PvP[j] & Xh;
+
+ //check if last word
+ if (j == wordno-1) {
+ if (Ph & check) {
+ score+=1;
+ // printf("%d,%d:%d hout: %d\n", i, j, score, 1);
+ }
+ else if(Mh & check) {
+ score-=1;
+ // printf("%d,%d:%d hout: %d\n", i, j, score, -1);
+ }
+ }
+
+ /*Ph = Ph << 1; with carry*/
+ temp = (Ph << 1) | carryph;
+ carryph = Ph >> (BITVECTOR_WORDSIZE-1);
+ Ph = temp;
+
+ /* Mh = Mh << 1; with carry */
+ temp = (Mh << 1) | carrymh;
+ carrymh = Mh >> (BITVECTOR_WORDSIZE-1);
+ Mh = temp;
+
+ /* store the new column */
+ Pv[i+1][j] = Mh | ~(Xv | Ph);
+ Mv[i+1][j] = Ph & Xv;
+
+ }
+
+ /* remember the best (and, on ties, latest) column within the threshold */
+ if (score <= k && score <= res->b) { // && i < slen - 1) {
+ res->a = i;
+ res->b = score;
+ // fprintf(stderr, "%d: %d wordno:%d\n", i, score, wordno);
+ }
+ }
+
+ return Pv;
+}
+
+
+
+
+/*---------------------------- myersblockbitmatrix ----------------------------
+ *
+ * @brief modified bitvector algorithm to return bitmatrix for backtracking
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Block-wise variant of myersbitmatrix: per subject position only the
+ * words 0..y of the column are updated, where y is the index of the last
+ * active word. y grows by one when the score at the lower boundary of
+ * the active zone allows it and shrinks while the score of word y is at
+ * least k + BITVECTOR_WORDSIZE.
+ *
+ * `D` is used as in myersbitmatrix: Pv columns at D[0..], Mv columns at
+ * D[dim+1..]. score[j] tracks the score at the output bit of word j;
+ * W[j] is the mask selecting that bit (`last` for inner words, `check`
+ * for the final word).
+ *
+ * *res receives the best final-word score <= k (res->b) and the subject
+ * index at which it was seen (res->a); unlike myersbitmatrix the last
+ * subject position (i == slen-1) is never recorded. Returns D.
+ *
+ * `query`, `alphabet` and `asize` are not used in the body.
+ *
+ * NOTE(review): enctab is indexed with (Uint)subject[i]; if char is
+ * signed and the byte is >= 0x80 this becomes a very large index.
+ * NOTE(review): W is obtained from ALLOCMEMORY and only W[0..y] and
+ * W[wordno-1] are set before the main loop; further entries are set when
+ * a zone is opened.
+ */
+bitvector*
+myersblockbitmatrix(
+ void *space,
+ char *query,
+ Uint qlen,
+ char *subject,
+ Uint slen,
+ char *alphabet,
+ Uint asize,
+ Uint *enctab,
+ Uint k,
+ bitvector *peq,
+ PairSint *res,
+ bitvector *D,
+ Uint dim) {
+
+ bitvector
+ *Pv,
+ *Mv,
+ MvP,
+ PvP,
+ Eq,
+ W;
+
+ Uint *score,
+ i,
+ j,
+ y,
+ wordno,
+ bits;
+ int hout=0;
+ bitvector_t check,
+ last,
+ first,
+ temp,
+ carryxh,
+ carryph,
+ carrymh,
+ Ph=0,
+ Mh=0,
+ Xv=0,
+ Xh=0,
+ w;
+
+ res->a = -1;
+ res->b = qlen;
+ /* words per column: always one more than qlen/BITVECTOR_WORDSIZE */
+ wordno = qlen/BITVECTOR_WORDSIZE;
+ bits = qlen & (BITVECTOR_WORDSIZE-1);
+ wordno++;
+
+ /* Pv columns start at D[0], Mv columns at D[dim+1] */
+ Pv = D;
+ Mv = &Pv[dim+1];
+
+ memset(Pv[0], 255, wordno*(sizeof(bitvector_t)));
+ memset(Mv[0], 0, wordno*(sizeof(bitvector_t)));
+
+ /* bit masks: `check` = bit qlen % WORDSIZE, `last` = top bit of a word,
+ * `first` = bit 0 */
+ check = 0;
+ bitvector_setbit(&check, bits, 1);
+ last = 0;
+ bitvector_setbit(&last, BITVECTOR_WORDSIZE-1, 1);
+ first = 0;
+ bitvector_setbit(&first, 0, 1);
+
+ score = calloc(wordno+1, sizeof(Uint));
+ W = ALLOCMEMORY(space, NULL, bitvector_t, wordno);
+ /* initial number of active words derived from k, capped at wordno-1 */
+ y = MIN((Uint) ceil((double)k/(double)BITVECTOR_WORDSIZE)-1, (wordno-1));
+
+ for(i=0; i <= y ; i++) {
+ score[i] = (i+1)*BITVECTOR_WORDSIZE;
+ W[i] = last;
+ }
+ score[wordno-1] = qlen;
+ W[wordno-1] = check;
+
+ for(i=0; i < slen; i++) {
+
+ Eq = peq[enctab[(Uint)subject[i]]];
+ carryxh = carryph = carrymh = 0;
+
+ /* previous column */
+ MvP = Mv[i];
+ PvP = Pv[i];
+
+ /* update the active words 0..y; hout keeps the horizontal delta of
+ * the last word processed */
+ for(j=0; j <= y; j++) {
+
+ Xv = Eq[j] | MvP[j];
+ temp = ((Eq[j] & PvP[j]) + PvP[j] + carryxh);
+ Xh = (temp ^ PvP[j]) | Eq[j];
+
+ if (carryxh)
+ carryxh = (temp <= (Eq[j] & PvP[j]) || temp <= PvP[j]);
+ else
+ carryxh = (temp < (Eq[j] & PvP[j]) || temp < PvP[j]);
+
+ Ph = MvP[j] | ~(Xh | PvP[j]);
+ Mh = PvP[j] & Xh;
+
+ w = W[j];
+
+ hout = 0;
+ if (Ph & w) {
+ score[j] += 1;
+ hout = 1;
+ } else if (Mh & w) {
+ score[j] -= 1;
+ hout = -1;
+ }
+
+ /*Ph = Ph << 1; with carry*/
+ temp = (Ph << 1) | carryph;
+ carryph = Ph >> (BITVECTOR_WORDSIZE-1);
+ Ph = temp;
+
+ temp = (Mh << 1) | carrymh;
+ carrymh = Mh >> (BITVECTOR_WORDSIZE-1);
+ Mh = temp;
+
+ Pv[i+1][j] = Mh | ~(Xv | Ph);
+ Mv[i+1][j] = Ph & Xv;
+ }
+
+
+ /* the loop above leaves j == y+1; open that word as a new zone if the
+ * boundary score permits */
+ if (y < wordno-1 && score[y]-hout <= k
+ && ((Eq[y+1] & first) || hout < 0)) {
+
+ y += 1;
+
+ /* initialise word j of the previous column like column 0 */
+ memset(&Pv[i][j], 255,(sizeof(bitvector_t)));
+ memset(&Mv[i][j], 0, (sizeof(bitvector_t)));
+
+ MvP = Mv[i];
+ PvP = Pv[i];
+
+ //since we open a new zone here we have to update the score
+ Xv = Eq[j] | MvP[j];
+ temp = ((Eq[j] & PvP[j]) + PvP[j] + carryxh);
+ Xh = (temp ^ PvP[j]) | Eq[j];
+
+ Ph = MvP[j] | ~(Xh | PvP[j]);
+ Mh = PvP[j] & Xh;
+
+ //check if last word
+ if (j < wordno-1) {
+ score[j] = score[j-1] + BITVECTOR_WORDSIZE - hout;
+ W[j] = last;
+ w = last;
+ } else {
+ score[j] = score[j-1] + bits - hout;
+ w = check;
+ }
+
+ if (Ph & w) {
+ score[j] += 1;
+ } else if (Mh & w) {
+ score[j] -= 1;
+ }
+
+ temp = (Ph << 1) | carryph;
+ carryph = Ph >> (BITVECTOR_WORDSIZE-1);
+ Ph = temp;
+
+ temp = (Mh << 1) | carrymh;
+ carrymh = Mh >> (BITVECTOR_WORDSIZE-1);
+ Mh = temp;
+
+ Pv[i+1][j] = Mh | ~(Xv | Ph);
+ Mv[i+1][j] = Ph & Xv;
+
+ } else {
+
+ /* drop trailing words whose score is out of reach */
+ while(y > 0 && score[y] >= k + BITVECTOR_WORDSIZE) {
+ y -= 1;
+ }
+ }
+
+ /* only the score of the final word counts as a result */
+ if (score[wordno-1] <= k && score[wordno-1] <= res->b && i < slen - 1) {
+ res->a = i;
+ res->b = score[wordno-1];
+ // fprintf(stderr, "block score: %d:%d (%d); wordno:%d\n", i, score[wordno-1], score[0], wordno-1);
+ }
+ }
+
+ FREEMEMORY(space, W);
+ FREEMEMORY(space, score);
+ return Pv;
+}
+
+
+/*---------------------------- bitvectorbacktrack ----------------------------
+ *
+ * @brief backtracking in bitmatrix
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Reconstruct the edit operations from a stored bit matrix `D` (layout
+ * as written by myersbitmatrix: Pv columns at D[0..], Mv columns at
+ * D[dim+1..]).
+ *
+ * Backtracking starts at row i = k-1 and column j = l and appends one
+ * edit operation per step to `al`; since the walk runs backwards the
+ * operations are reversed with revMeops at the end. If the walk stops
+ * at a column j > 0, al->voff and al->vlen are adjusted accordingly.
+ * Returns `al`.
+ */
+Alignment*
+bitvectorbacktrack(Alignment *al, bitvector *D, Uint dim, Uint k, Uint l) {
+ int i=k-1,
+ j=l;
+ bitvector *Pv = D;
+ bitvector *Mv = &D[dim+1];
+
+ // fprintf(stderr, "dim:%d\n", dim);
+
+ while (i > 0 && j > 0) {
+ if (bitvector_getbit(Pv[j], i)) {
+ /* positive vertical delta in column j: step up one row */
+ insertEop(al, Insertion);
+ i--;
+ } else {
+ /* otherwise move one column to the left: a deletion if the Mv bit of
+ * the previous column is set, else a diagonal step */
+ if (bitvector_getbit(Mv[j-1], i)) {
+ insertEop(al, Deletion);
+ } else {
+ insertEop(al, Replacement);
+ i--;
+ }
+ j--;
+ }
+ }
+
+ if (i==0 && j > 0) {
+ /* row 0 reached with subject left: one final diagonal step */
+ insertEop(al, Replacement);
+ } else {
+ /* subject exhausted: the remaining rows 0..i become insertions */
+ while(i>=0) {
+ insertEop(al, Insertion);
+ i--;
+ }
+ /*no insertions in app. string matching at the end*/
+ }
+
+ /*adjust subject boundaries*/
+ if(j>0) {
+ al->voff = j-1;
+ /* subtract j from vlen, but never more than vlen itself */
+ al->vlen -= (al->vlen > j) ? j : al->vlen;
+ }
+
+ revMeops(al);
+ return al;
+}
+
+/*------------------------------ wrapBitmatrix -------------------------------
+ *
+ * @brief destruct bit matrix
+ * @author Steve Hoffmann
+ *
+ */
+
+/* Release the rows D[0..m-1] of a bit matrix with FREEMEMORY. The row
+ * pointer array `D` itself is not released here. */
+void
+wrapBitmatrix(void *space, bitvector *D, Uint m) {
+  Uint row = 0;
+
+  while (row < m) {
+    FREEMEMORY(space, D[row]);
+    row++;
+  }
+
+  return;
+}
+
diff --git a/segemehl/libs/bitvectoralg.h b/segemehl/libs/bitvectoralg.h
new file mode 100644
index 0000000..cecc5da
--- /dev/null
+++ b/segemehl/libs/bitvectoralg.h
@@ -0,0 +1,75 @@
+
+/*
+ *
+ * bitvectoralg.h
+ *
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 05/26/2008 12:26:03 PM CEST
+ *
+ */
+#include "basic-types.h"
+#include "alignment.h"
+#include "bitVector.h"
+
+PairSint myersbitvector( void *space,
+ char *query,
+ Uint qlen,
+ char *subject,
+ Uint slen,
+ char *alphabet,
+ Uint asize,
+ Uint *enctab,
+ Uint k,
+ bitvector *peq);
+
+bitvector*
+myersbitmatrix( void *space,
+ char *query,
+ Uint qlen,
+ char *subject,
+ Uint slen,
+ char *alphabet,
+ Uint asize,
+ Uint *enctab,
+ Uint k,
+ bitvector *peq,
+ PairSint *res,
+ bitvector *D,
+ Uint dim);
+
+bitvector*
+myersblockbitmatrix(
+ void *space,
+ char *query,
+ Uint qlen,
+ char *subject,
+ Uint slen,
+ char *alphabet,
+ Uint asize,
+ Uint *enctab,
+ Uint k,
+ bitvector *peq,
+ PairSint *res,
+ bitvector *D,
+ Uint dim);
+
+bitvector*
+getpeq(void *space,
+ char *query,
+ Uint qlen,
+ char *alphabet,
+ Uint asize,
+ Uint *enctab);
+
+Uint*
+encodetab(char *alphabet, Uint asize) ;
+
+
+char*
+getstringalphabet (void *space, char *string, Uint len, Uint *asize);
+
+Alignment*
+bitvectorbacktrack(Alignment *al, bitvector *D, Uint dim, Uint k, Uint l);
+
diff --git a/segemehl/libs/browsematchfiles.c b/segemehl/libs/browsematchfiles.c
new file mode 100644
index 0000000..fcbcec6
--- /dev/null
+++ b/segemehl/libs/browsematchfiles.c
@@ -0,0 +1,1482 @@
+
+/*
+ * browsematchfiles.c
+ * a small browser
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 06.10.2010 01:39:45 CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include "alignment.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "matfile.h"
+#include "bitVector.h"
+#include "matchfiles.h"
+#include "browsematchfiles.h"
+#include "plotmatchfiles.h"
+#include "info.h"
+#include <form.h>
+#include <menu.h>
+#include <curses.h>
+#include "ncursesext.h"
+
+#define MAXCHROMSIZE 1500000000
+
+
+/*--------------------------- bl_matchfileGetInfo ----------------------------
+ *
+ * @brief display window with stats and cs info
+ * @author Steve Hoffmann
+ *
+ */
+
+
+/*
+ * Creates the shadowed info window for the cross section at the current
+ * view offset and fills it with per-nucleotide and per-frame statistics.
+ * The window is only written to, not refreshed; it is returned to the
+ * caller.
+ */
+SHADOWEDWINDOW*
+bl_matchfileGetInfo(matchfileView_t *view) {
+  int j, k, highlight;
+  Uint *ntcounts, curptr=0;
+  double *ntprobs, *ntvarpos, *ntreadpos, *ntedist, *ntredund;
+  char NT[]={'A', 'C', 'G', 'T', '-'};
+  WINDOW *win;
+  SHADOWEDWINDOW *shadowed;
+  matchfileCross_t *cs;
+
+  shadowed = newshadowedwin(36, 70, 9, 12);
+  win = dershadowedwin(shadowed, 34, 67, 1, 1);
+  shadowedwbkgd(shadowed, COLOR_PAIR(2), COLOR_PAIR(8));
+  box(win,0,0);
+  wbkgd(win, COLOR_PAIR(2));
+  mvwprintw(win, 22, 2, "(Press 'q' to exit)");
+
+  if(view->curframe && view->curframe->cs) {
+    cs = view->curframe->cs;
+
+    /* translate the pad column into a frame index; a column whose imap
+     * entry is 0 falls back to the nearest column to its left that has
+     * a non-zero entry (or to column 0) */
+    if(view->imap[view->offset-1]) {
+      curptr = view->imap[view->offset-1];
+    } else {
+      k = view->offset-1;
+      while(view->imap[k] == 0 && k > 0) k--;
+      curptr = view->imap[k];
+    }
+
+    bl_matchfileTest(NULL, 0, 0, view->curframe->start+curptr,
+        &view->curframe->cs[curptr], view->curframe->ref[curptr],
+        view->idx, 1, NULL);
+
+    wattron(win, A_BOLD);
+    mvwprintw(win, 1, 2, "cross section %d:",
+        view->curframe->start+curptr);
+    mvwprintw(win, 1, 40, "r: %c <> %c :c",
+        cs[curptr].ref, cs[curptr].cons);
+    wattroff(win, A_BOLD);
+
+    ntredund = bl_matchfileGetNTRedundancy(view->curframe, curptr);
+    ntcounts = bl_matchfileGetNTCounts(&view->curframe->cs[curptr]);
+    ntprobs = bl_matchfileGetNTError(view->curframe, curptr);
+    ntedist = bl_matchfileGetNTEdist(view->curframe, curptr);
+    ntreadpos = bl_matchfileGetNTReadPos(view->curframe, curptr);
+    ntvarpos = bl_matchfileGetNTReadPosVar(&view->curframe->cs[curptr]);
+
+    mvwprintw(win, 3, 2, "coverage: %d ",
+        cs[curptr].len);
+    mvwprintw(win, 3, 21, "5'-ends: %d",
+        cs[curptr].starts);
+    mvwprintw(win, 3, 38, "3'-ends: %d",
+        cs[curptr].ends);
+
+    wattron(win, A_UNDERLINE);
+    mvwprintw(win, 4, 20, " ");
+    mvwprintw(win, 4, 26, " ");
+    mvwprintw(win, 4, 35, " ");
+    mvwprintw(win, 4, 44, " ");
+    wattroff(win,A_UNDERLINE);
+    mvwprintw(win, 5, 2,
+        " # log(E) edst multi rpos s(rpos) ");
+    mvwprintw(win, 6, 2,
+        " ---------------------------------------------------- ");
+
+    /* two rows per nucleotide that occurs in this cross section:
+     * the raw values, then log-scaled values in the row below */
+    for(j=0,k=0; j < 5; j++) {
+
+      if(ntcounts[(int)NT[j]]) {
+        /* reference and consensus characters are printed in bold; the
+         * same test switches bold on and off again */
+        highlight = (NT[j] == cs[curptr].ref ||
+            NT[j] == cs[curptr].cons);
+
+        if(highlight) {
+          wattron(win, A_BOLD);
+        }
+        mvwprintw(win, 7+k, 2, "%c:", NT[j]);
+        if(highlight) {
+          wattroff(win, A_BOLD);
+        }
+
+        mvwprintw(win, 7+k, 8, "%d", ntcounts[(int)NT[j]]);
+        mvwprintw(win, 7+k, 16, "%.3f", ntprobs[(int)NT[j]]);
+        mvwprintw(win, 7+k, 26, "%.3f", ntedist[(int)NT[j]]);
+        mvwprintw(win, 7+k, 35, "%.1f", ntredund[(int)NT[j]]);
+        mvwprintw(win, 7+k, 44, "%.1f", ntreadpos[(int)NT[j]]);
+        mvwprintw(win, 7+k, 52, "%.3f", ntvarpos[(int)NT[j]]);
+
+        mvwprintw(win, 7+k+1, 26, "%.1f",
+            log(((double)view->stats->RR[(int)MIN(ntedist[(int)NT[j]],6)] + 1.0)/
+              ((double)view->stats->RR_N + 1.0)));
+
+        mvwprintw(win, 7+k+1, 35, "%.1f",
+            log(((double)view->stats->MM[(int)MIN(ntredund[(int)NT[j]],50)] + 1.0)/
+              ((double)view->stats->MM_N + 1.0)));
+
+        mvwprintw(win, 7+k+1, 52, "%.1f",
+            log(univarnormcdf(ntvarpos[(int)NT[j]], view->stats->V_mu,
+                view->stats->V_sd)));
+
+        k+=2;
+      }
+    }
+
+    wattron(win, A_BOLD);
+    mvwprintw(win, 7+k+1, 2, "frame statistics");
+    wattroff(win, A_BOLD);
+
+    wattron(win, A_UNDERLINE);
+    mvwprintw(win, 7+k+2, 2, " ");
+    wattroff(win, A_UNDERLINE);
+    mvwprintw(win, 7+k+3, 2, "coverage: %.2f",
+        view->curframestats->mean_cov);
+
+    /* one row per nucleotide with the frame-wide means */
+    for(j=0; j < 5; j++) {
+
+      mvwprintw(win, 7+k+5, 2, "%c:", NT[j]);
+      mvwprintw(win, 7+k+5, 8, "%d",
+          view->curframestats->ntcnt[(int)NT[j]]);
+      mvwprintw(win, 7+k+5, 16, "%.3f",
+          view->curframestats->mean_err[(int)NT[j]]);
+      mvwprintw(win, 7+k+5, 26, "%.3f",
+          view->curframestats->mean_dis[(int)NT[j]]);
+      mvwprintw(win, 7+k+5, 35, "%.1f",
+          view->curframestats->mean_mul[(int)NT[j]]);
+      mvwprintw(win, 7+k+5, 44, "%.1f",
+          view->curframestats->mean_pos[(int)NT[j]]);
+      mvwprintw(win, 7+k+5, 52, "%.1f",
+          view->curframestats->mean_sde[(int)NT[j]]);
+      k++;
+    }
+
+
+    mvwprintw(win, 7+k+6, 2, "P(V)=%.2f, P(NV)=%.2f, P(Entropy=%.3f)=%.2f",
+        log((double)view->stats->X/view->stats->N),
+        log((double)view->stats->P/view->stats->N),
+        cs[curptr].entropy, cs[curptr].pentropy);
+
+    mvwprintw(win, 7+k+7, 2, "P(V)*P(D|V)=%.2f,%.2f P(NV)*P(D|NV)=%.2f,%.2f S=%.2f",
+        cs[curptr].p_refx, cs[curptr].s_refx, cs[curptr].p_ref,
+        cs[curptr].s_ref, logadd(cs[curptr].p_ref, cs[curptr].p_refx));
+
+    mvwprintw(win, 7+k+8, 2, "P(V)*P(D|V)=%.2f,%.2f P(NV)*P(D|NV)=%.2f,%.2f S=%.2f",
+        cs[curptr].p_consx, cs[curptr].s_consx, cs[curptr].p_cons,
+        cs[curptr].s_cons, logadd(cs[curptr].p_cons, cs[curptr].p_consx));
+
+    FREEMEMORY(space, ntcounts);
+    FREEMEMORY(space, ntprobs);
+    FREEMEMORY(space, ntedist);
+    FREEMEMORY(space, ntredund);
+    FREEMEMORY(space, ntreadpos);
+    FREEMEMORY(space, ntvarpos);
+
+  } else if(view->curframe) {
+
+    mvwprintw(win, 1, 2, "no cross section at position %d available",
+        view->curframe->start+curptr);
+
+  } else {
+
+    /* no frame loaded at all: there is no start position to report */
+    mvwprintw(win, 1, 2, "no cross section available");
+  }
+
+  return shadowed;
+}
+
+
+/*------------------------ bl_matchfileSelectChrMenu -------------------------
+ *
+ * @brief display menu for chromosome selection
+ * @author Steve Hoffmann
+ *
+ */
+
+
+/*
+ * Shows a scrollable menu with one entry per sequence of the fasta set
+ * plus an "Exit" entry. Returns a newly allocated copy of the selected
+ * sequence description, or NULL if the user chose "Exit" or pressed F1.
+ */
+char*
+bl_matchfileSelectChrMenu(fasta_t *set) {
+  ITEM **it;
+  MENU *me;
+  WINDOW *shadoww, *mainwin, *win, *sub;
+  char **mi, *newchrname = NULL;
+  int ch, i, sel, nchr;
+  size_t desclen;
+
+  nchr = set->noofseqs;
+  sel = nchr+1;   /* any value > nchr means "nothing selected yet" */
+
+  /* nchr entries + "Exit" + terminating NULL */
+  it = (ITEM **)calloc(nchr+2, sizeof(ITEM *));
+  mi = ALLOCMEMORY(space, NULL, char*, nchr);
+
+  for(i=0;i < nchr; i++) {
+    mi[i] = ALLOCMEMORY(space, NULL, char, log10(nchr)+3);
+    snprintf(mi[i], log10(nchr)+2, "%d", i);
+    it[i] = new_item(mi[i], set->seqs[i]->description);
+  }
+
+  it[i] = new_item("E","Exit");
+  it[i+1] = NULL;
+  me = new_menu(it);
+
+  shadoww = newwin(13, 63, 11, 12);
+  mainwin = newwin(13, 63, 10, 10);
+  win = derwin(mainwin, 11, 60, 1, 1);
+  /* keep the sub-window pointer so that it can be deleted below */
+  sub = derwin(win, 6, 38, 3, 1);
+  set_menu_win (me, win);
+  set_menu_sub (me, sub);
+  set_menu_format(me, 5, 1);
+  set_menu_mark(me, " * ");
+
+  box(win, 0, 0);
+  mvwaddstr(win, 1, 2, "Select Reference");
+  set_menu_fore(me, COLOR_PAIR(5)|A_REVERSE);
+  set_menu_back(me, COLOR_PAIR(6));
+  set_menu_grey(me, COLOR_PAIR(7));
+  wbkgd(shadoww, COLOR_PAIR(8));
+  wbkgd(mainwin, COLOR_PAIR(2));
+  wbkgd(win, COLOR_PAIR(2));
+  post_menu(me);
+
+  attron(COLOR_PAIR(1));
+  mvwprintw(win, 9, 2, "Use PageUp, PageDown and Arrows to scroll (F1 exits).");
+  attroff(COLOR_PAIR(1));
+
+  refresh();
+  wrefresh(shadoww);
+  wrefresh(mainwin);
+
+  while(sel > nchr && (ch=getch()) != KEY_F(1))
+  {
+    switch(ch)
+    {
+      case KEY_DOWN:
+        menu_driver(me, REQ_DOWN_ITEM);
+        break;
+      case KEY_UP:
+        menu_driver(me, REQ_UP_ITEM);
+        break;
+      case KEY_NPAGE:
+        menu_driver(me, REQ_SCR_DPAGE);
+        break;
+      case KEY_PPAGE:
+        menu_driver(me, REQ_SCR_UPAGE);
+        break;
+      case 0xA: /* Return/Enter key (ASCII line feed) */
+        sel = item_index(current_item(me));
+        break;
+      default:
+        break;
+    }
+
+    wrefresh(shadoww);
+    wrefresh(mainwin);
+  }
+
+  unpost_menu(me);
+  free_menu(me);
+
+  /* i <= nchr: the trailing "Exit" item has to be released as well */
+  for(i = 0; i <= nchr; ++i) {
+    free_item(it[i]);
+  }
+
+  for(i = 0; i < nchr; ++i) {
+    free(mi[i]);
+  }
+
+  free(it);
+  free(mi);
+
+  /* derived windows first, then their parents */
+  delwin(sub);
+  delwin(win);
+  delwin(shadoww);
+  delwin(mainwin);
+
+  if(sel < nchr) {
+    desclen = strlen(set->seqs[sel]->description);
+    newchrname = ALLOCMEMORY(space, NULL, char, desclen+1);
+    memset(newchrname, 0, desclen+1);
+    memmove(newchrname, set->seqs[sel]->description, desclen);
+  }
+
+  return newchrname;
+}
+
+/*------------------------ bl_matchfileJumpToInitForm ------------------------
+ *
+ * @brief initialize the form to jump to position
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Creates and posts the two-field "jump to" form in the top screen line:
+ * field 0 takes an integer position, field 1 a chromosome name.
+ * The NULL-terminated field array is handed back through *flds; the
+ * posted form is returned.
+ */
+FORM*
+bl_matchfileJumpToInitForm(FIELD ***flds) {
+  FIELD **fi;
+  FORM *fo;
+  int i;
+
+  fi = (FIELD **)calloc(3, sizeof(FIELD *));
+  assert(fi);
+  fi[0] = new_field(1, 15, 0, COLS-20, 0, 0);
+  fi[1] = new_field(1, 15, 0, COLS-45, 0, 0);
+  fi[2] = 0;
+
+  /* TYPE_INTEGER reads (int padding, long vmin, long vmax) from the
+   * variable argument list, so the bounds must be passed as long */
+  set_field_type(fi[0], TYPE_INTEGER, 0, 1L, 999999999L);
+  set_field_type(fi[1], TYPE_REGEXP, "^.*$");
+
+  for(i=0; i < 2; i++) {
+    set_field_fore(fi[i], COLOR_PAIR(2));
+    set_field_back(fi[i], COLOR_PAIR(2));
+    field_opts_on(fi[i], O_EDIT);
+  }
+  field_opts_off(fi[1], O_AUTOSKIP);
+  field_opts_on(fi[1], O_STATIC);
+
+
+  fo = new_form(fi);
+  post_form(fo);
+
+  mvaddstr(0, COLS-25, "pos: ");
+  mvaddstr(0, COLS-50, "chr: ");
+
+  *flds = fi;
+  return fo;
+}
+
+
+/*----------------------- bl_matchfileJumpToWrapField ------------------------
+ *
+ * @brief destruct the form to jump
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileJumpToWrapField(FIELD **fi) {
+  int fieldno;
+
+  /* the two form fields first, then the array that held them */
+  for(fieldno = 0; fieldno < 2; fieldno++) {
+    free_field(fi[fieldno]);
+  }
+  free(fi);
+
+  /* hide the cursor again */
+  curs_set(0);
+}
+
+
+/*--------------------------- bl_matchfileViewQuit ---------------------------
+ *
+ * @brief leave curses mode (registered with atexit by bl_matchfileViewInit)
+ * @author Steve Hoffmann
+ *
+ */
+
+/* Ends curses mode. The explicit (void) gives the function a real
+ * prototype, matching the void (*)(void) handler type atexit() expects. */
+void
+bl_matchfileViewQuit(void) {
+  endwin();
+}
+
+/*----------------------- bl_matchfileItemDisplayName ------------------------
+ *
+ * @brief build the display name of an annotation item
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Returns a newly allocated, NUL-terminated display name for item k of
+ * the track. For GFF items that carry a "Name=" or "name=" attribute the
+ * result is "<item name> <attribute value>", or just the attribute value
+ * if the item name is empty; otherwise it is a copy of the item name.
+ * The result is never NULL-by-design and has to be freed by the caller.
+ */
+char*
+bl_matchfileItemDisplayName (annotationtrack_t *track, Uint k)
+{
+
+  Uint i, nlen, dlen;
+  char *name = NULL, *value;
+  annotationitem_t *items;
+
+
+  items = track->items;
+  nlen = strlen(items[k].name);
+
+  if(items[k].type == GFFITEM) {
+    for(i=0; i < items[k].noofattributes; i++) {
+
+      if(items[k].attributelen[i] > 5 &&
+          (strncmp(items[k].attributes[i], "Name=", 5) == 0 ||
+           strncmp(items[k].attributes[i], "name=", 5) == 0)) {
+
+        /* skip the 5 characters of the "Name=" key */
+        value = &items[k].attributes[i][5];
+        dlen = strlen(value);
+
+        if(nlen) {
+          name = ALLOCMEMORY(space, NULL, char, nlen+dlen+2);
+          memmove(name, items[k].name, nlen);
+          name[nlen] = ' ';
+          memmove(&name[nlen+1], value, dlen);
+          name[nlen+1+dlen] = 0;
+        } else {
+          /* empty item name: fall back to the attribute value alone
+           * instead of returning NULL */
+          name = ALLOCMEMORY(space, NULL, char, dlen+1);
+          memmove(name, value, dlen);
+          name[dlen] = 0;
+        }
+
+        return name;
+      }
+    }
+  }
+
+  /* no usable name attribute: plain copy of the item name */
+  name = ALLOCMEMORY(space, NULL, char, nlen+1);
+  memmove(name, items[k].name, nlen);
+  name[nlen] = 0;
+
+  return name;
+}
+
+
+/*--------------------- bl_matchfileDrawAnnotationTrack ----------------------
+ *
+ * @brief draw the annotation track
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Redraws the annotation pad of the view: first the extent of every
+ * annotation item that covers a displayed position, then the item names.
+ * Does nothing if the view has no annotation track. The old pad is
+ * deleted and view->annotationpad is replaced by the new one.
+ */
+ void
+bl_matchfileDrawAnnotationTrack (matchfileView_t *view)
+{
+  Uint i=0, k, pos=0, noofitems = 0, off;
+  int dpos=0, dend=0, dstart=0;
+  Uint *imap, curptr, endpos, startitem=0;
+  annotationitem_t *items = NULL;
+  char *itemname, trackchar;
+  int attr, istart, iend, fstart, fend, itemnamelen, p, lastidx;
+  Uint xoff = 3, yoff=0;
+  WINDOW *pad;
+
+  attr = A_BOLD | COLOR_PAIR(4) ;
+
+  if(!view->annotation) return;
+
+  if(view->annotationpad) {
+    delwin(view->annotationpad);
+  }
+
+  pad = newpad(MAXPADLINES, COLS);
+  off = view->offset;
+  imap = view->imap;
+
+  /* reference position of the first displayed column; a column whose
+   * imap entry is 0 falls back to the nearest non-zero column on its left */
+  if(imap[view->offset-1]) {
+    curptr = view->curframe->start + imap[off-1];
+  } else {
+    k = view->offset-1;
+    while(imap[k] == 0 && k > 0) k--;
+    curptr = view->curframe->start + imap[k];
+  }
+
+  view->annotationoffset = 0;
+  noofitems = view->annotation->noofitems;
+  items = view->annotation->items;
+
+  /* first item on the current chromosome that ends at or after curptr */
+  for(k=0; k < noofitems; k++) {
+    if(!strcmp(items[k].chromname, view->curframe->chrname) &&
+        items[k].end >= curptr) break;
+  }
+
+  startitem = k;
+
+  /* pass 1: draw the extent of every item column by column */
+  for(i=0; i < COLS-xoff; i++) {
+    if(i+off-1 == 0 || imap[i+off-1] > 0) {
+      pos = view->curframe->start+imap[i+off-1];
+
+      for(k=startitem; k < noofitems; k++) {
+        if(!strcmp(items[k].chromname, view->curframe->chrname)) {
+          if(pos >= items[k].start && pos <= items[k].end) {
+
+            if(items[k].strand == '+') {
+              trackchar = '>';
+            } else {
+              trackchar = '<';
+            }
+
+            if(items[k].start == pos) {
+              wattrset(pad, COLOR_PAIR(BLUEONYELLOW));
+              mvwprintw(pad, yoff+items[k].level, i, "|");
+            } else if (items[k].end == pos) {
+              wattrset(pad, A_BOLD | COLOR_PAIR(REDONYELLOW));
+              mvwprintw(pad, yoff+items[k].level, i, "|");
+            } else {
+              wattrset(pad, A_BOLD | COLOR_PAIR(BLUEONYELLOW));
+              if(i%2) mvwprintw(pad, yoff+items[k].level, i, "%c", trackchar);
+              else mvwprintw(pad, yoff+items[k].level, i, " ");
+            }
+          }
+        }
+        /* stop once the items start well beyond the visible region */
+        if(pos+COLS+10 < items[k].start) {
+          break;
+        }
+      }
+    }
+  }
+
+  wattrset(pad, COLOR_PAIR(BLUEONYELLOW));
+  endpos = view->curframe->start+view->map[off-1+COLS-1-(2*xoff)];
+
+  /* view->map is only filled for frame indices 0..width-1
+   * (see bl_matchfileViewDrawFrame) */
+  lastidx = (int)view->curframe->width - 1;
+
+  /* pass 2: write the item names into the drawn extents */
+  for(k=startitem; lastidx >= 0 && k < noofitems; k++) {
+
+    if(!strcmp(items[k].chromname, view->curframe->chrname)) {
+      if(OVERLAP(items[k].start, items[k].end, curptr, endpos)) {
+
+        itemname = bl_matchfileItemDisplayName(view->annotation, k);
+        if(itemname == NULL) continue;
+        itemnamelen = (int)strlen(itemname);
+
+        fstart = MAX(0, (int)items[k].start-(int)view->curframe->start);
+        fend = (int)items[k].end - (int)view->curframe->start;
+
+        /* items may extend far beyond the loaded frame; clamp the frame
+         * indices so that map[] is never read out of bounds */
+        if(fstart > lastidx) fstart = lastidx;
+        if(fend > lastidx) fend = lastidx;
+        if(fend < 0) fend = 0;
+
+        istart = view->map[fstart];
+        iend = view->map[fend];
+
+        /* all column arithmetic is done in int to avoid unsigned wrap */
+        dstart = MAX(istart-(int)off+2, 0);
+        dend = MIN(COLS-1, iend-((int)off-1));
+
+        /* center the name if the visible extent is wide enough */
+        if(dend - dstart > itemnamelen+2) {
+          dpos = dstart+(((dend-dstart)-itemnamelen)/2);
+        } else {
+          dpos = dstart;
+        }
+
+        if(itemnamelen+2 > (dend - dpos)) {
+          /* not enough room: print as many characters as fit */
+          for(p=dpos; p < dend && p-dstart < itemnamelen ; p++) {
+            mvwaddch(pad, yoff+items[k].level, p, itemname[p-dstart]);
+          }
+        } else {
+          mvwprintw(pad, yoff+items[k].level, dpos, " %s ", itemname);
+        }
+
+        FREEMEMORY(space, itemname);
+      }
+    }
+  }
+
+  view->annotationpad = pad;
+  attrset(A_NORMAL | attr);
+
+  return ;
+}
+
+/*-------------------------- bl_matchfileDrawRuler ---------------------------
+ *
+ * @brief draw the ruler
+ * @author Steve Hoffmann
+ *
+ */
+
+
+void
+bl_matchfileDrawRuler (matchfileView_t *view) {
+  Uint xoff = 3;
+  Uint *imap = view->imap;
+  Uint off = view->offset;
+  Uint col, padcol, k, pos = 0, curptr;
+  Uint ones = 0, tens = 0, hundreds = 0;
+  Uint prevtens = 100, prevhundreds = 1000;
+  int attr = A_BOLD | COLOR_PAIR(4);
+
+  /* reference position of the leftmost column: walk left until a column
+   * with a non-zero imap entry (or column 0) is reached */
+  k = off-1;
+  while(imap[k] == 0 && k > 0) k--;
+  curptr = view->curframe->start + imap[k];
+
+  attrset(attr);
+
+  mvprintw(0 ,3+xoff,"%d (chr: '%s')",
+      curptr, view->curframe->chrname);
+
+  mvaddch (0, 0+xoff, ACS_ULCORNER);
+
+  for(col=0; col < COLS-(2*xoff); col++) {
+    padcol = col+off-1;
+
+    /* columns without a frame position are left untouched */
+    if(padcol != 0 && imap[padcol] == 0) continue;
+
+    pos = view->curframe->start+imap[padcol];
+
+    /* line 3: last digit of the position, underlined */
+    ones = pos%10;
+    attrset(A_UNDERLINE | attr);
+    mvprintw(3, col+xoff, "%d", ones);
+    attrset(A_NORMAL | attr);
+
+    /* line 2: tens digit, printed only where it changes */
+    tens = (pos+1-ones)%100;
+    if(prevtens == tens && col) {
+      mvprintw(2, col+xoff, " ");
+    } else {
+      mvprintw(2, col+xoff, "%d", tens/10);
+    }
+    prevtens = tens;
+
+    /* line 1: hundreds digit, printed only where it changes */
+    hundreds = (pos+1-ones-tens)%1000;
+    if(prevhundreds == hundreds && col) {
+      mvprintw(1, col+xoff, " ");
+    } else {
+      mvprintw(1, col+xoff, "%d", hundreds/100);
+    }
+    prevhundreds = hundreds;
+  }
+
+  attrset(A_NORMAL | attr);
+}
+
+
+
+/*------------------------ bl_matchfileViewUpdateFrame -------------------------
+ *
+ * @brief load a frame
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileViewUpdateFrame(void *space, matchfileView_t *vw, char *chrom,
+    Uint start, Uint width)
+{
+  char *chrname = chrom;
+  Uint framestart = 1;
+  Uint framewidth = 2*width;
+
+  /* drop the previously loaded frame, if there is one */
+  if(vw->curframe && vw->curframe->cs) {
+    bl_matchfileDestructView(space, vw);
+  }
+
+  /* no chromosome given: use the first one of the index */
+  if(chrname == NULL) {
+    chrname = vw->file->index->chromnames[0];
+  }
+
+  /* the frame begins width positions before start where possible;
+   * otherwise it begins at 1 and start itself becomes the offset */
+  if(start > width) {
+    framestart = start-width;
+    vw->offset = width;
+  } else {
+    vw->offset = start;
+  }
+
+  vw->curframe = bl_matchfileGetFrame(space, vw->file, chrname, framestart,
+      framewidth, vw->set, 20000, NULL);
+
+  bl_matchfileGetConsensus(vw->curframe);
+  vw->curframestats = bl_matchfileFrameStats(space, vw->curframe);
+}
+
+/*------------------------ bl_matchfileViewDrawFrame -------------------------
+ *
+ * @brief draw the frame to the view
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Draws the current frame of the view into a fresh pad:
+ * row 0 reference, row 1 consensus, row 2 coverage digit, row 3 digit for
+ * the number of read starts, rows 4.. the aligned read characters.
+ * Builds view->map (frame index -> pad column) and view->imap (pad column
+ * -> frame index) and converts view->offset from a frame index to a pad
+ * column. frameno is not used.
+ */
+void
+bl_matchfileViewDrawFrame(void *space, matchfileView_t *view,
+    matchfilePanel_t *panel, Uint frameno) {
+
+  Uint i=0, v, j, k, l, m, starts, cover, mapsize;
+  Uint *imap, *map, maxdel=0;
+  char *dels;
+  matchfileCross_t *cs, *xs;
+  int col, curcol=0, errcol=0;
+
+  delwin(view->pad);
+  view->pad = newpad(MAXPADLINES, view->curframe->width+2);
+  /* check the pad before it is used for the first time */
+  assert(view->pad);
+  wclear(view->pad);
+
+
+  cs = view->curframe->cs;
+
+  mapsize = view->curframe->width*10;
+  map = calloc(mapsize, sizeof(Uint));
+  imap = calloc(mapsize, sizeof(Uint));
+  assert(map && imap);
+
+  for(k=0, i=0; i < view->curframe->width; i++) {
+    map[i] = k-maxdel;
+    /* k grows by maxdel+1 per position and may outrun the allocation
+     * when there are many long deletions */
+    if(k < mapsize) {
+      imap[k] = i;
+    }
+
+    if(view->curframe->ref) {
+
+      bl_matchfileTest(space, 0, 0, view->curframe->start+i,
+          &view->curframe->cs[i], view->curframe->ref[i], view->idx, 0, NULL);
+
+      /* choose the column colours from the test results */
+      if( (cs[i].p_consx != log(0) &&
+            cs[i].p_consx > cs[i].p_cons) ||
+          (cs[i].p_refx != log(0) &&
+           cs[i].p_refx > cs[i].p_ref) ||
+          !isinf(cs[i].p_hom)) {
+
+        if(!isinf(cs[i].p_hom)) {
+          errcol = COLOR_PAIR(REDONWHITE);
+          curcol = COLOR_PAIR(BLUEONWHITE);
+        } else if(cs[i].p_consx > cs[i].p_cons &&
+            cs[i].p_refx > cs[i].p_ref) {
+          errcol = COLOR_PAIR(REDONYELLOW);
+          curcol = COLOR_PAIR(BLUEONYELLOW);
+        } else {
+          errcol = COLOR_PAIR(REDONBLUE);
+          curcol = COLOR_PAIR(WHITEONBLUE);
+        }
+
+      } else {
+        curcol = COLOR_PAIR(WHITEONBLACK);
+        errcol = COLOR_PAIR(REDONBLACK);
+      }
+    }
+
+    if(view->curframe->ref){
+      mvwaddchattr(view->pad, 0, k, A_DIM | curcol,
+          view->curframe->ref[i]);
+    }
+
+    col = (view->curframe->ref && cs[i].cons != view->curframe->ref[i])
+      ? errcol : curcol;
+    mvwaddchattr(view->pad, 1, k, A_UNDERLINE | col, cs[i].cons);
+
+    /* both counters are shown as a single digit: value/10, capped at 9 */
+    starts = (cs[i].starts/10 > 9) ? 9 : cs[i].starts/10;
+    cover = (cs[i].len/10 > 9) ? 9 : cs[i].len/10;
+
+    wattrset(view->pad, curcol);
+    mvwprintw(view->pad, 2, k, "%d", cover);
+    wattrset(view->pad, curcol | A_UNDERLINE);
+    mvwprintw(view->pad, 3, k, "%d", starts);
+    dels = calloc((cs[i].maxrow*2)+1, sizeof(char));
+
+    /* determine the longest deletion at this position across all views */
+    for(v=0, maxdel=0; v < panel->noofviews; v++) {
+      xs = panel->views[v]->curframe->cs;
+      for(l=0; l < xs[i].noofdels; l++) {
+        maxdel = MAX(maxdel, xs[i].dels[l].len);
+      }
+    }
+
+    /* mark the rows that carry a deletion at this position */
+    if(dels) {
+      for(l=0; l < cs[i].noofdels; l++) {
+        dels[cs[i].dels[l].row] = 1;
+      }
+    }
+
+
+    for(j=0; j < cs[i].len; j++) {
+
+      if(cs[i].row[j] < MAXPADLINES){
+        col = (view->curframe->ref &&
+            cs[i].chars[j] != view->curframe->ref[i]) ? errcol : curcol;
+        mvwaddchattr(view->pad, cs[i].row[j]+4, k, col, cs[i].chars[j]);
+      }
+
+      assert(cs[i].row[j] <= cs[i].maxrow);
+      wattrset(view->pad, A_BOLD | COLOR_PAIR(4));
+
+      if(dels && dels[cs[i].row[j]]) {
+        /* find the deletion that belongs to this row */
+        for(l=0; l < cs[i].noofdels; l++) {
+          if(cs[i].dels[l].row == cs[i].row[j]) {
+            break;
+          }
+        }
+
+        /* deleted characters, padded with '^' up to maxdel columns */
+        for(m=0; m < cs[i].dels[l].len; m++) {
+          mvwaddch(view->pad, cs[i].row[j]+4, k+m+1,
+              cs[i].dels[l].string[m]);
+        }
+
+        for(;m < maxdel;m++) {
+          mvwaddch(view->pad, cs[i].row[j]+4, k+m+1, '^');
+        }
+
+      } else {
+
+        /* row without deletion: fill the gap columns, except for
+         * entries whose feature character is '$' */
+        for(m=0; m < maxdel; m++) {
+          if(cs[i].feat[j] != '$'){
+            mvwaddch(view->pad, cs[i].row[j]+4, k+m+1, '^');
+          }
+        }
+      }
+    }
+
+
+    FREEMEMORY(space, dels);
+    k+=maxdel+1;
+  }
+
+  view->offset = map[view->offset];
+  view->map = map;
+  view->imap = imap;
+}
+
+/*----------------------- bl_matchfileViewDrawPanel ------------------------
+ *
+ * @brief draw the current frame of every view of the panel
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileViewDrawPanel (void *space, matchfilePanel_t *panel)
+{
+  Uint viewno = 0;
+
+  /* redraw each view in turn; the view number is passed along as frameno */
+  while(viewno < panel->noofviews) {
+    bl_matchfileViewDrawFrame(space, panel->views[viewno], panel, viewno);
+    viewno++;
+  }
+}
+
+/*----------------------- bl_matchfileViewUpdatePanel ------------------------
+ *
+ * @brief update the panel
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileViewUpdatePanel (void *space, matchfilePanel_t *panel,
+    char *chrom, Uint start, Uint width)
+{
+  Uint viewno = 0;
+
+  /* load the same chromosome region into every view of the panel */
+  while(viewno < panel->noofviews) {
+    bl_matchfileViewUpdateFrame(space, panel->views[viewno], chrom, start,
+        width);
+    viewno++;
+  }
+}
+
+/*----------------------- bl_matchfileRefreshViewPanel -----------------------
+ *
+ * @brief panel refresh
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Copies the pads of all views (and the annotation pad of view 0, if
+ * there is an annotation track) to the screen and redraws the box around
+ * the active view. activeview == 0 selects the annotation track,
+ * activeview == i+1 selects view i.
+ */
+void
+bl_matchfileRefreshViewPanel (void *space, matchfilePanel_t *panel)
+{
+  Uint i, lpp;
+  Uint al = 0;
+  Uint xoff = 3;
+  int avail;
+
+  /* nothing to show; also protects the division and views[0] below */
+  if(panel->noofviews == 0) return;
+
+  /* lines per view, at least 10. Computed in int: on terminals with
+   * fewer than 9 lines an unsigned LINES-1-8 would wrap around */
+  avail = (LINES-1-8)/(int)panel->noofviews;
+  lpp = (avail > 10) ? (Uint)avail : 10;
+
+  /* the annotation track takes 5 extra lines above the views */
+  if(panel->views[0]->annotation) {
+    al = 5;
+  }
+
+  refresh();
+
+
+  for(i=0; i < panel->noofviews; i++) {
+
+    if(i+1 == panel->activeview){
+      delwin(panel->activebox);
+
+      panel->activebox = newwin(lpp, COLS-2,
+          4+al+(i*lpp)+1, panel->smincol[i]+1);
+
+      wbkgd(panel->activebox, COLOR_PAIR(1));
+      box(panel->activebox, ACS_VLINE, ACS_HLINE);
+    }
+
+    wrefresh(panel->activebox);
+
+    prefresh(panel->views[i]->pad, panel->views[i]->scroll,
+        panel->views[i]->offset-1,
+        4+al+(i*lpp)+2, panel->smincol[i]+3,
+        4+al+((i+1)*lpp)-3, COLS-1-3);
+
+    refresh();
+  }
+
+  if(panel->views[0]->annotation) {
+    if(panel->activeview == 0){
+      delwin(panel->activebox);
+      panel->activebox = newwin(7, COLS-2, 4, 1);
+
+      wbkgd(panel->activebox, COLOR_PAIR(1));
+      box(panel->activebox, ACS_VLINE, ACS_HLINE);
+    }
+
+    wrefresh(panel->activebox);
+    prefresh(panel->views[0]->annotationpad,
+        panel->views[0]->annotationscroll, 0, 5, 3, 9, COLS-1-xoff);
+    refresh();
+  }
+
+  return ;
+}
+
+/*---------------------- bl_matchfileControlScreenSize -----------------------
+ *
+ * @brief control the size of the screen
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+bl_matchfileControlScreenSize ( )
+{
+  /* the browser needs at least 11 columns */
+  int wideenough = !(COLS-1 < 10);
+
+  if(!wideenough) {
+    mvprintw(0,0,"screensize!");
+  }
+
+  return wideenough;
+}
+
+
+/*----------------------- bl_matchfileViewUpdateScreen -----------------------
+ *
+ * @brief user interface
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Event loop of the browser. Reads keys until the exit key is pressed:
+ *   a / LEFT / UP      offset +1 (UP: +100)
+ *   d / RIGHT / DOWN   offset -1 (DOWN: -100)
+ *   m                  chromosome selection menu
+ *   PgDn / PgUp        scroll the active view or the annotation track
+ *   n / p              next / previous active view
+ *   P, E, C, W, S      gnuplot output (see the bl_matchfile*GNUPLOT calls)
+ *   i                  toggle the cross section info window
+ *   :                  "jump to" form (position and chromosome)
+ *   q                  quit, or close the info window while it is shown
+ * New frames are loaded when the offset gets close to a frame border.
+ */
+void
+bl_matchfileViewUpdateScreen(void *space, matchfilePanel_t *panel,
+    Uint width, fasta_t *set) {
+  int ch,
+      nch;
+  int curexit = 'q', redraw;
+  Uint chrnamelen, pos, k, i1, margin, refend, vwidth, vstart, j, chrlen;
+  FIELD **fi;
+  FORM *fo;
+  SHADOWEDWINDOW *infowin = NULL;
+  unsigned char showcsinfo=0;
+  char *chrname = NULL,
+       *newchrname;
+  matchfileView_t **views;
+
+  views = panel->views;
+
+  /* the drawing routines index imap[offset-1]; an offset of 0 (e.g. a
+   * viewer started at position 0) would wrap around to a huge index */
+  for(j=0; j < panel->noofviews; j++) {
+    if(views[j]->offset < 1) views[j]->offset = 1;
+  }
+
+  bl_matchfileDrawRuler(panel->views[0]);
+  bl_matchfileDrawAnnotationTrack (panel->views[0]);
+
+  bl_matchfileRefreshViewPanel (space, panel);
+
+  while((ch=getch()) != curexit) {
+
+    if(!bl_matchfileControlScreenSize()) continue;
+
+    /*
+     * 'a' / left: offset +1, key up: offset +100
+     *
+     * */
+
+    if(ch == 'a' || ch == KEY_LEFT || ch == KEY_UP) {
+      clear();
+
+      redraw = 0;
+      for(j=0; j < panel->noofviews; j++) {
+        if(ch != KEY_UP) {
+          views[j]->offset++;
+        } else {
+          views[j]->offset += 100;
+        }
+
+        vstart = views[j]->curframe->start;
+        vwidth = views[j]->curframe->width;
+        margin = views[j]->offset + COLS + 100;
+        refend = vstart + margin;
+        chrlen = views[j]->curframe->chrlen;
+
+        /* close to the right frame border: load a frame around the
+         * current position */
+        if(margin >= vwidth && refend < MIN(chrlen, MAXCHROMSIZE)) {
+
+          k = views[j]->offset;
+
+          if(views[j]->imap[k]) {
+            pos = vstart + views[j]->imap[k];
+          } else {
+            while(views[j]->imap[k] == 0 && k > 0) k--;
+            pos = vstart + views[j]->imap[k];
+          }
+
+          bl_matchfileViewUpdateFrame(space, views[j], chrname, pos, width);
+          redraw = 1;
+          clear();
+        }
+      }
+
+      if(redraw) bl_matchfileViewDrawPanel (space, panel);
+    }
+
+    /*
+     * 'd' / right: offset -1, key down: offset -100 (never below 1)
+     *
+     * */
+
+    if(ch == 'd' || ch == KEY_RIGHT || ch == KEY_DOWN) {
+      clear();
+
+      redraw = 0;
+      for(j=0; j < panel->noofviews; j++) {
+
+        if(ch != KEY_DOWN) {
+          views[j]->offset--;
+        } else {
+          if (views[j]->offset < 100) {
+            views[j]->offset = 1;
+          } else {
+            views[j]->offset -= 100;
+          }
+        }
+
+        vstart = views[j]->curframe->start;
+
+        /* left frame border reached: load a frame around its start */
+        if(views[j]->offset <= 1 && vstart > 1
+            && views[j]->offset + vstart < MAXCHROMSIZE) {
+
+          bl_matchfileViewUpdateFrame(space, views[j], chrname, vstart, width);
+          redraw = 1;
+          clear();
+        }
+        if (views[j]->offset < 1) views[j]->offset = 1;
+      }
+
+      if(redraw) bl_matchfileViewDrawPanel (space, panel);
+      bl_matchfileDrawRuler(views[0]);
+      bl_matchfileDrawAnnotationTrack (views[0]);
+    }
+
+    /*
+     * menu to change chromosomes
+     *
+     **/
+
+    if(ch == 'm') {
+      newchrname = bl_matchfileSelectChrMenu(set);
+      if(newchrname) {
+        if(chrname) {
+          FREEMEMORY(space, chrname);
+        }
+        chrname = newchrname;
+
+        for(j=0; j < panel->noofviews; j++) {
+          bl_matchfileViewUpdateFrame(space, views[j], chrname, 100, width);
+        }
+        clear();
+        bl_matchfileViewDrawPanel (space, panel);
+      }
+    }
+
+    /*
+     * vertical scrolling of the active view (activeview > 0) or of the
+     * annotation track (activeview == 0)
+     *
+     **/
+
+    if(ch == KEY_NPAGE) {
+      if(panel->activeview > 0 &&
+          panel->views[panel->activeview-1]->scroll+1 < MAXPADLINES-5) {
+        panel->views[panel->activeview-1]->scroll++;
+      }
+
+      if(!panel->activeview &&
+          panel->views[0]->annotationscroll+1 < MAXPADLINES-5) {
+        panel->views[0]->annotationscroll++;
+      }
+    }
+
+    if(ch == KEY_PPAGE) {
+      if(panel->activeview > 0 &&
+          panel->views[panel->activeview-1]->scroll > 0) {
+        panel->views[panel->activeview-1]->scroll--;
+      }
+
+      if(!panel->activeview && panel->views[0]->annotationscroll > 0) {
+        panel->views[0]->annotationscroll--;
+      }
+    }
+
+    /*
+     * change active panel
+     *
+     **/
+
+    if (ch == 'n') {
+      if(panel->activeview+1 <= panel->noofviews) {
+        panel->activeview++;
+      }
+      clear();
+    }
+
+    if (ch == 'p') {
+      if(panel->activeview >= 1) {
+        panel->activeview--;
+      }
+      clear();
+    }
+
+    /*
+     * gnuplot output
+     *
+     **/
+
+    if(ch == 'P') {
+      bl_matchfilePERRGNUPLOT(space, views[0]->file->index);
+    }
+
+    if(ch == 'E') {
+      bl_matchfileQERRGNUPLOT(space, views[0]->file->index);
+    }
+
+    if(ch == 'C') {
+      bl_matchfileCOVGNUPLOT(space, views[0]->curframe);
+    }
+
+    if(ch == 'W'){
+      bl_matchfileSUBGNUPLOT(space, views[0]->file->index);
+    }
+
+    if(ch == 'S') {
+      bl_matchfileRSSGNUPLOT(space, views[0]->curframe,
+          views[0]->curframestats);
+    }
+
+    /*
+     * toggle the info window; while it is shown the exit key is
+     * disabled so that 'q' closes the window instead of the browser
+     *
+     **/
+
+    if(ch == 'i' || (showcsinfo && ch == 'q')) {
+      if(showcsinfo) {
+        delshadowedwin(infowin);
+        infowin = NULL;
+        showcsinfo = 0;
+        curexit = 'q';
+      } else {
+        showcsinfo = 1;
+        curexit = 0;
+      }
+    }
+
+    /*
+     * jump to position / chromosome
+     *
+     **/
+
+    if(ch == ':') {
+
+      clear();
+      fo = bl_matchfileJumpToInitForm(&fi);
+      bl_matchfileDrawRuler(views[0]);
+      bl_matchfileDrawAnnotationTrack (views[0]);
+      bl_matchfileRefreshViewPanel (space, panel);
+
+      curs_set(1);
+      move(0,COLS-20);
+
+      while((nch=wgetch(stdscr)) != KEY_F(1)) {
+        switch(nch) {
+          case KEY_BTAB:
+            form_driver(fo, REQ_END_LINE);
+            form_driver(fo, REQ_PREV_FIELD);
+            form_driver(fo, REQ_END_LINE);
+            break;
+          case 9: /* tab */
+            form_driver(fo, REQ_END_LINE);
+            form_driver(fo, REQ_NEXT_FIELD);
+            form_driver(fo, REQ_END_LINE);
+            break;
+          case KEY_LEFT:
+          case '\b':
+          case KEY_BACKSPACE:
+            form_driver(fo, REQ_DEL_PREV);
+            break;
+          case '\n':
+            form_driver(fo, REQ_END_LINE);
+            form_driver(fo, REQ_CLR_FIELD);
+            break;
+          default:
+            form_driver(fo, nch);
+            break;
+        }
+        bl_matchfileRefreshViewPanel (space, panel);
+        if(nch == '\n') break;
+      }
+
+      curs_set(0);
+      i1 = atoi(field_buffer(fi[0], 0));
+      chrnamelen = strlen(field_buffer(fi[1],0));
+      newchrname = strtrim(NULL, field_buffer(fi[1],0), &chrnamelen);
+
+      if(newchrname) {
+        if(chrname) {
+          FREEMEMORY(space, chrname);
+        }
+        chrname = newchrname;
+
+        for(j=0; j < panel->noofviews; j++) {
+
+          bl_matchfileViewUpdateFrame(space, views[j], chrname, 100, width);
+        }
+        bl_matchfileViewDrawPanel (space, panel);
+      }
+
+      unpost_form(fo);
+      free_form(fo);
+      bl_matchfileJumpToWrapField(fi);
+
+      /* i1 is unsigned: a negative input wraps to a large value and is
+       * rejected by the upper bound */
+      if(i1 > 0 && i1 < MAXCHROMSIZE) {
+        for(j=0; j < panel->noofviews; j++) {
+          bl_matchfileViewUpdateFrame(space, views[j], chrname, i1, width);
+        }
+        bl_matchfileViewDrawPanel (space, panel);
+      }
+
+      for(j=0; j < panel->noofviews; j++) {
+        if(views[j]->offset < 1) views[j]->offset = 1;
+      }
+    }
+
+    bl_matchfileDrawRuler(views[0]);
+    bl_matchfileDrawAnnotationTrack (views[0]);
+    bl_matchfileRefreshViewPanel (space, panel);
+
+    if(showcsinfo) {
+      delshadowedwin(infowin);
+      infowin = bl_matchfileGetInfo(views[0]);
+      refresh();
+      shadowedwrefresh(infowin);
+    }
+  }
+
+  if(chrname) {
+    FREEMEMORY(space, chrname);
+  }
+}
+
+
+/*--------------------------- bl_matchfileViewInit ---------------------------
+ *
+ * @brief initialize the match file viewer
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileViewInit() {
+  /* foreground and background of colour pairs 1..21, in pair order */
+  static const short palette[][2] = {
+    {COLOR_WHITE,   COLOR_BLACK},    /*  1 */
+    {COLOR_BLACK,   COLOR_WHITE},    /*  2 */
+    {COLOR_RED,     COLOR_BLACK},    /*  3 */
+    {COLOR_BLUE,    COLOR_BLACK},    /*  4 */
+    {COLOR_WHITE,   COLOR_BLUE},     /*  5 */
+    {COLOR_BLUE,    COLOR_GREEN},    /*  6 */
+    {COLOR_BLACK,   COLOR_BLUE},     /*  7 */
+    {COLOR_BLACK,   COLOR_BLACK},    /*  8 */
+    {COLOR_BLUE,    COLOR_YELLOW},   /*  9 */
+    {COLOR_RED,     COLOR_YELLOW},   /* 10 */
+    {COLOR_BLUE,    COLOR_MAGENTA},  /* 11 */
+    {COLOR_WHITE,   COLOR_GREEN},    /* 12 */
+    {COLOR_RED,     COLOR_GREEN},    /* 13 */
+    {COLOR_WHITE,   COLOR_YELLOW},   /* 14 */
+    {COLOR_RED,     COLOR_WHITE},    /* 15 */
+    {COLOR_BLUE,    COLOR_WHITE},    /* 16 */
+    {COLOR_GREEN,   COLOR_WHITE},    /* 17 */
+    {COLOR_MAGENTA, COLOR_WHITE},    /* 18 */
+    {COLOR_CYAN,    COLOR_WHITE},    /* 19 */
+    {COLOR_GREEN,   COLOR_YELLOW},   /* 20 */
+    {COLOR_RED,     COLOR_BLUE}      /* 21 */
+  };
+  const short npairs = (short)(sizeof(palette)/sizeof(palette[0]));
+  short pairno;
+
+  /* enter curses mode and make sure it is left again at exit */
+  initscr();
+  atexit(bl_matchfileViewQuit);
+  clear();
+  noecho();
+  curs_set(0);
+  cbreak();
+  keypad(stdscr, TRUE);
+  start_color();
+
+  for(pairno = 1; pairno <= npairs; pairno++) {
+    init_pair(pairno, palette[pairno-1][0], palette[pairno-1][1]);
+  }
+}
+
+
+
+/*------------------------- bl_matchfileDestructView -------------------------
+ *
+ * @brief remove the view from the heap
+ * @author Steve Hoffmann
+ *
+ */
+
+
+/*
+ * Releases the frame, the frame statistics and the two column maps of a
+ * view. The view structure itself and its curses pads are not touched.
+ * Safe to call on a view that has not loaded a frame yet.
+ */
+void
+bl_matchfileDestructView(void *space, matchfileView_t *view) {
+
+  if(view->curframe) {
+    bl_matchfileDestructCross(space, view->curframe->cs,
+        view->curframe->width);
+    FREEMEMORY(space, view->curframe->cs);
+    FREEMEMORY(space, view->curframe);
+  }
+
+  if(view->curframestats) {
+    bl_matchfileDestructFrameStats(space, view->curframestats);
+    FREEMEMORY(space, view->curframestats);
+  }
+
+  FREEMEMORY(space, view->imap);
+  FREEMEMORY(space, view->map);
+}
+
+/*-------------------------- bl_matchfileInitView ----------------------------
+ *
+ * @brief init the views
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Allocates a view for one match file and initializes every field.
+ * No frame is loaded and no pad is created here.
+ */
+matchfileView_t*
+bl_matchfileInitView(void *space, matchfile_t *file,
+    matchfileindex_t *idx, fasta_t *set, Uint fw)
+{
+  matchfileView_t *view;
+  matchfileSampleStats_t *stats = idx->stats;
+
+  assert(file->index);
+  assert(file->index->chromnames);
+  assert(file->index->chromnames[0]);
+
+  view = ALLOCMEMORY(space, NULL, matchfileView_t, 1);
+  view->file = file;
+  view->set = set;
+  view->stats = stats;
+  /* view->idx is passed to bl_matchfileTest by the drawing code and must
+   * therefore not be left uninitialized */
+  view->idx = idx;
+  view->curframe = NULL;
+  view->curframestats = NULL;
+  view->annotation = NULL;
+  view->annotationoffset = 0;
+  view->annotationpad = NULL;
+  view->annotationscroll = 0;
+  view->pad = NULL;
+  view->scroll = 0;
+  view->map = NULL;
+  view->imap = NULL;
+  view->offset = 0;
+  /* NOTE(review): width is not read anywhere in browsematchfiles.c;
+   * storing the frame width fw here is an assumption -- confirm */
+  view->width = fw;
+  return view;
+}
+
+/*------------------------ bl_matchfileInitViewPanel -------------------------
+ *
+ * @brief init the view panel
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Allocates the panel with one view per match file. Every view gets the
+ * same annotation track and an initial pad of fw columns; the screen
+ * rows are split evenly among the views (at most 20 rows per view).
+ */
+matchfilePanel_t*
+bl_matchfileInitViewPanel(void *space, matchfile_t **files,
+    Uint nooffiles, fasta_t *set, annotationtrack_t *track, Uint fw)
+{
+
+  Uint i;
+  int nrow = 0, rowsum = 0;
+  matchfilePanel_t *panel;
+  matchfileView_t **views;
+
+
+  panel = ALLOCMEMORY(space, NULL, matchfilePanel_t, 1);
+  views = ALLOCMEMORY(space, NULL, matchfileView_t*, nooffiles);
+
+  panel->activeview = 0;
+  panel->activebox = NULL;
+  panel->pminrow = ALLOCMEMORY(space, NULL, int, nooffiles);
+  panel->pmincol = ALLOCMEMORY(space, NULL, int, nooffiles);
+  panel->sminrow = ALLOCMEMORY(space, NULL, int, nooffiles);
+  panel->smincol = ALLOCMEMORY(space, NULL, int, nooffiles);
+  panel->smaxrow = ALLOCMEMORY(space, NULL, int, nooffiles);
+  panel->smaxcol = ALLOCMEMORY(space, NULL, int, nooffiles);
+
+  /* rows per view: the usable lines divided by the number of files.
+   * The whole difference is divided (not just the 4), in int, and an
+   * empty file list does not divide by zero */
+  if(nooffiles > 0) {
+    nrow = (LINES-1-4)/(int)nooffiles;
+  }
+  if(nrow > 20) nrow = 20;
+  if(nrow < 0) nrow = 0;
+
+  for(i=0; i < nooffiles; i++) {
+    views[i] = bl_matchfileInitView(space, files[i],
+        files[i]->index, set, fw);
+
+    views[i]->annotation = track;
+
+    views[i]->pad = newpad(MAXPADLINES, fw);
+    assert(views[i]->pad);
+
+    panel->pminrow[i] = 5;
+    panel->pmincol[i] = 0;
+    panel->sminrow[i] = rowsum+4;
+    panel->smincol[i] = 0;
+    panel->smaxrow[i] = rowsum+4+nrow;
+    panel->smaxcol[i] = COLS-1;
+    rowsum += nrow;
+  }
+
+  panel->noofviews = nooffiles;
+  panel->views = views;
+
+  return panel;
+}
+
+
+/*---------------------- bl_matchfileDestructViewPanel -----------------------
+ *
+ * @brief destruct the view panel
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Releases everything the panel owns: the views with their frames and
+ * curses pads, the box around the active view and the geometry arrays.
+ * The panel structure itself is left to the caller.
+ */
+void
+bl_matchfileDestructViewPanel(void *space, matchfilePanel_t *panel)
+{
+  Uint i;
+
+  for(i=0; i < panel->noofviews; i++) {
+    bl_matchfileDestructView(space, panel->views[i]);
+
+    /* bl_matchfileDestructView does not delete the curses pads */
+    if(panel->views[i]->pad) {
+      delwin(panel->views[i]->pad);
+    }
+    if(panel->views[i]->annotationpad) {
+      delwin(panel->views[i]->annotationpad);
+    }
+
+    FREEMEMORY(space, panel->views[i]);
+  }
+
+  if(panel->activebox) {
+    delwin(panel->activebox);
+    panel->activebox = NULL;
+  }
+
+  FREEMEMORY(space, panel->pminrow);
+  FREEMEMORY(space, panel->pmincol);
+  FREEMEMORY(space, panel->sminrow);
+  FREEMEMORY(space, panel->smincol);
+  FREEMEMORY(space, panel->smaxrow);
+  FREEMEMORY(space, panel->smaxcol);
+  FREEMEMORY(space, panel->views);
+  return ;
+}
+
+/*---------------------------- bl_matchfileViewer ----------------------------
+ *
+ * @brief start the browser
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Entry point of the browser: initializes curses, builds a panel with one
+ * view per match file, loads the region around start, runs the event
+ * loop and releases the panel afterwards.
+ */
+void
+bl_matchfileViewer(void *space, matchfile_t **files, Uint nooffiles,
+    fasta_t *set, annotationtrack_t *track, Uint start, Uint width) {
+
+  matchfilePanel_t *panel;
+  Uint fw;
+
+
+  bl_matchfileViewInit();
+
+  /* COLS is only updated while input is read, so the message is shown
+   * and getch() waits for a key or a resize event; without it the loop
+   * would spin forever on a terminal that is too narrow */
+  while(!bl_matchfileControlScreenSize()) {
+    refresh();
+    getch();
+  }
+
+  fw = (start<= width) ? width : 2*width;
+  panel = bl_matchfileInitViewPanel(space, files, nooffiles, set, track, fw);
+
+  bl_matchfileViewUpdatePanel(space, panel, NULL, start, width);
+  bl_matchfileViewDrawPanel (space, panel);
+  bl_matchfileViewUpdateScreen(space, panel, width, set);
+
+  bl_matchfileDestructViewPanel(space, panel);
+  FREEMEMORY(space, panel);
+}
+
diff --git a/segemehl/libs/browsematchfiles.h b/segemehl/libs/browsematchfiles.h
new file mode 100644
index 0000000..9126702
--- /dev/null
+++ b/segemehl/libs/browsematchfiles.h
@@ -0,0 +1,94 @@
+#ifndef BROWSE_MAT_FILE_H
+#define BROWSE_MAT_FILE_H
+
+/*
+ *
+ * browsematchfiles.h
+ * a small browser
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 06.10.2010 01:48:30 CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "manout.h"
+#include "matchfiles.h"
+#include "evalmatchfiles.h"
+#include <ncurses.h>
+
+#define MAXPADLINES 100 /* line count passed to newpad() for each view pad */
+/* The ids below look like ncurses color-pair numbers named FOREGROUND-on- */
+/* BACKGROUND; presumably registered via init_pair elsewhere. Id 8 is unused. */
+#define WHITEONBLACK 1
+#define BLACKONWHITE 2
+#define REDONBLACK 3
+#define BLUEONBLACK 4
+#define WHITEONBLUE 5
+#define BLUEONGREEN 6
+#define BLACKONBLUE 7
+#define BLUEONYELLOW 9
+#define REDONYELLOW 10
+#define BLUEONMAGENTA 11
+#define WHITEONGREEN 12
+#define REDONGREEN 13
+#define WHITEONYELLOW 14
+#define REDONWHITE 15
+#define BLUEONWHITE 16
+#define GREENONWHITE 17
+#define MAGENTAONWHITE 18
+#define CYANONWHITE 19
+#define GREENONYELLOW 20
+#define REDONBLUE 21
+
+
+typedef struct { /* state of one browser view (one per match file) */
+ matchfile_t *file;
+ fasta_t *set;
+ matchfileFrame_t *curframe;
+ matchfileFrameStats_t *curframestats;
+ matchfileSampleStats_t *stats;
+ matchfileindex_t *idx;
+/* NOTE(review): field meanings below are inferred from names only -- confirm in browsematchfiles.c */
+ annotationtrack_t *annotation;
+ Uint annotationoffset;
+
+ WINDOW* annotationpad;
+ Uint annotationscroll;
+ WINDOW* pad; /* ncurses pad, created with newpad(MAXPADLINES, fw) by the panel constructor */
+ Uint scroll;
+ Uint *map;
+ Uint *imap;
+ Uint offset;
+ Uint width;
+
+} matchfileView_t;
+
+typedef struct { /* the set of views shown together on the screen */
+/* the six int arrays hold one entry per view; named like the prefresh() arguments (presumably passed to it) */
+ Uint noofviews; /* number of entries in views and in each array below */
+ Uint activeview;
+ matchfileView_t **views;
+ WINDOW *activebox;
+ int *pminrow; /* set to 5 by the panel constructor */
+ int *pmincol; /* set to 0 by the panel constructor */
+ int *sminrow; /* first screen row of the view */
+ int *smincol; /* set to 0 by the panel constructor */
+ int *smaxrow; /* sminrow plus the view's row count */
+ int *smaxcol; /* set to COLS-1 by the panel constructor */
+
+} matchfilePanel_t;
+
+void
+bl_matchfileViewer(void *space, matchfile_t **files, Uint nooffiles,
+ fasta_t *set, annotationtrack_t *annotation, Uint start, Uint width ); /* browser entry point */
+/* bl_matchfileDestructView does not free the view struct; the panel destructor frees it afterwards */
+void
+bl_matchfileDestructView(void *space, matchfileView_t *view);
+
+#endif
diff --git a/segemehl/libs/citation.h b/segemehl/libs/citation.h
new file mode 100644
index 0000000..3189d12
--- /dev/null
+++ b/segemehl/libs/citation.h
@@ -0,0 +1,50 @@
+
+/*
+ *
+ * citation.h
+ * Beertime!
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 07/25/2007 10:50:10 PM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: citation.h 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/citation.h $
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+ char* cite[] = { "\"Beertime!\" (A. Torda)\0",
+ "\"Ick fahr nur noch die janz jrossen Poette, wa!\" (Apotheker Lenz)\0",
+ "\"Nochn' schoenes Bier verhaften?\" (M. Mosisch)\0",
+ "\"Mahlzeit!\" (Ditsche, Ingo, Schildkroede)\0",
+ "\"Halt die Klappe, ich hab' Feierabend.\" (Schildkroede)\0",
+ "\"Gehen Sie vorsichtig mit dem Begriff der Unendlichkeit um!\" (Shorty)\0",
+ "\"Die Ficker!\" (Thommy)\0",
+ "\"Hab' ich gerade inner Bild gelesen!\" (Bienchen)\0",
+ "\"Tschüss, Herr Kayser!\" (Shorty)\0",
+ "\"Ich bin neu in der Hamburger Schule\" (Tocotronic)\0",
+ "\"Es wäre nicht zum aushalten, wäre er echt\" (Kettcar)\0",
+ "\"Stefan Kurtz uses suffix arrays to fix his bike.\" (A. Torda)\0",
+ "\"Ich hol' jetzt die Hilti!\" (Ein verzweifelter Bauarbeiter)\0",
+ "\"Kaeff'chen?\" (Lars)\0",
+ "\"Wir sind hier nicht in Seattle Dirk!\" (Tocotronic)\0"};
+
+ /* number of quotes, derived from the table so the two cannot drift apart */
+ unsigned citenumber = sizeof(cite) / sizeof(cite[0]);
+ /* citerand: reseeds rand() with the current time and returns one quote */
+ char* citerand() {
+ unsigned r; /* plain unsigned: this header does not include basic-types.h */
+ srand((unsigned)time(NULL));
+ r = (unsigned)rand() % citenumber;
+ /* r < citenumber, hence always a valid index into cite[] */
+ return cite[r];
+ }
+
+
diff --git a/segemehl/libs/container.c b/segemehl/libs/container.c
new file mode 100644
index 0000000..73d4214
--- /dev/null
+++ b/segemehl/libs/container.c
@@ -0,0 +1,168 @@
+/**
+ * container.c
+ * implementation of a simple container for objects of defined size
+ *
+ * @author Christian Otto
+ * @email christian at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Tue Oct 14 16:31:33 CEST 2008
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 73 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-10-29 10:03:28 +0100 (Wed, 29 Oct 2008) $
+ * Id: $Id$
+ * Url: $URL$
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "debug.h"
+#include "basic-types.h"
+#include "container.h"
+
+/*--------------------------- bl_containerInit ---------------------------------
+ * Allocates room for allocelem elements of sizeofelem bytes; exits on bad args or OOM.
+ * @brief init container
+ * @author Christian Otto
+ * @note c must point to caller-provided storage; the struct is not allocated here
+ */
+void bl_containerInit(Container *c, int allocelem, size_t sizeofelem){
+ if (allocelem <= 0){
+ DBG("container.c: Attempt to initialize a container of size %d.\
+Exit forced.\n", allocelem);
+ exit(-1);
+ }
+ if (sizeofelem == 0){ /* size_t is unsigned: zero is the only invalid value */
+ DBG("container.c: Attempt to initialize a container with sizeofelem %zu.\
+Exit forced.\n", sizeofelem);
+ exit(-1);
+ }
+ c->contspace = malloc(allocelem * sizeofelem);
+ if (c->contspace == NULL){
+ DBG("container.c: Memory allocation failed. Exit forced.\n", NULL);
+ exit(-1);
+ }
+ c->nextfree = 0; /* container starts out empty */
+ c->allocelem = allocelem;
+ c->sizeofelem = sizeofelem;
+}
+
+/*-------------------------- bl_containerDestruct ------------------------------
+ * Calls rmv (when not NULL) on each stored element, frees the storage, zeroes the counters.
+ * @brief destruct container,
+ * remove method for elems as parameter possible
+ * @author Christian Otto
+ * @note c itself is not freed and c->contspace is left dangling (not reset to NULL)
+ */
+void bl_containerDestruct(Container *c, void (*rmv) (void*)){
+ int i;
+ char *p;
+ if (rmv != NULL){
+ p = (char *) c->contspace; /* byte pointer for element address arithmetic */
+ for(i = 0; i < c->nextfree; i++){
+ rmv(p + (i * c->sizeofelem)); /* rmv receives the address of element i */
+ }
+ }
+ free(c->contspace);
+ c->nextfree = 0;
+ c->allocelem = 0;
+ c->sizeofelem = 0;
+}
+
+/*--------------------------- bl_containerIsEmpty ------------------------------
+ * The container is empty exactly when nextfree is 0.
+ * @brief returns if the container is empty
+ * @author Christian Otto
+ * @return non-zero if no element is stored, 0 otherwise
+ */
+BOOL bl_containerIsEmpty(Container *c){
+ return (c->nextfree == 0);
+}
+
+/*--------------------------- bl_containerResize -------------------------------
+ * Reallocates the storage to hold allocelem + inc elements; exits if inc <= 0 or on OOM.
+ * @brief expands the size of the container by a given value
+ * @author Christian Otto
+ * @note inc is a number of elements, not bytes; nextfree is unchanged
+ */
+void bl_containerResize(Container *c, int inc){
+ if (inc <= 0){
+ DBG("container.c: Reallocation with %d senseless. Exit forced.\n", inc);
+ exit(-1);
+ }
+ c->contspace = realloc(c->contspace, (c->allocelem + inc) * c->sizeofelem); /* old pointer is overwritten; the failure path below exits */
+ if (c->contspace == NULL){
+ DBG("container.c: Memory reallocation failed. Exit forced.\n", NULL);
+ exit(-1);
+ }
+ c->allocelem += inc;
+}
+
+/*---------------------------- bl_containerAdd ---------------------------------
+ * Copies sizeofelem bytes from elem into the next free slot, growing by BASEINC when full.
+ * @brief adds element at the end of the container
+ * @author Christian Otto
+ * @note the element is copied bytewise; the container does not keep the elem pointer
+ */
+void bl_containerAdd(Container *c, void *elem){
+ char *p;
+ if (c->nextfree == c->allocelem){ /* no free slot left */
+ bl_containerResize(c, BASEINC);
+ }
+ p = (char *) c->contspace; /* fetched after the resize, which may move the storage */
+ memmove(p + (c->nextfree * c->sizeofelem), elem, c->sizeofelem);
+ c->nextfree++;
+}
+
+/*---------------------------- bl_containerGet ---------------------------------
+ * Returns a pointer into the container's own storage (no copy is made).
+ * @brief returns Nth object in the container
+ * with N = 0,..,numofelems - 1
+ * @author Christian Otto
+ * @return address of element n, or NULL if the container is empty or n is out of range
+ */
+void* bl_containerGet(Container *c, int n){
+ char *p;
+ if (bl_containerIsEmpty(c) || n < 0 || n >= c->nextfree){
+ return NULL;
+ }
+ p = (char *) c->contspace;
+ return (p + (n * c->sizeofelem)); /* may be invalidated by a later realloc (Add/Resize/Merge) */
+}
+
+/*--------------------------- bl_containerMerge --------------------------------
+ * Appends a bytewise copy of all elements of s to c; s itself is left unchanged.
+ * @brief merges two containers together
+ * @author Christian Otto
+ * @note exits if the element sizes differ; only the sizes are compared, not the types
+ */
+void bl_containerMerge(Container *c, Container *s){
+ int size;
+ char *p;
+ if (c->sizeofelem != s->sizeofelem){
+ DBG("container.c: Merge of containers with different data types failed.\
+Exit forced.\n", NULL);
+ exit(-1);
+ }
+ size = s->nextfree + c->nextfree; /* element count after the merge */
+ if (size >= c->allocelem){
+ bl_containerResize(c, s->nextfree + BASEINC); /* room for all of s plus BASEINC spare slots */
+ }
+ p = (char *) c->contspace;
+ memmove(p + (c->nextfree * c->sizeofelem), s->contspace,
+ s->nextfree * s->sizeofelem);
+ c->nextfree = size;
+}
+
+/*----------------------------- bl_containerSize -------------------------------
+ * The count is the nextfree index (an int) converted to Uint on return.
+ * @brief returns number of elements in the container
+ * @author Christian Otto
+ * @return number of stored elements, not the allocated capacity
+ */
+Uint bl_containerSize(Container *c){
+ return c->nextfree;
+}
diff --git a/segemehl/libs/container.h b/segemehl/libs/container.h
new file mode 100644
index 0000000..aa61c30
--- /dev/null
+++ b/segemehl/libs/container.h
@@ -0,0 +1,47 @@
+/**
+ * container.h
+ * implementation of a simple container for objects of defined size
+ *
+ * @author Christian Otto
+ * @email christian at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Tue Oct 14 16:31:33 CEST 2008
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 73 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-10-29 10:03:28 +0100 (Wed, 29 Oct 2008) $
+ * Id: $Id$
+ * Url: $URL$
+ */
+
+#ifndef CONTAINER_H
+#define CONTAINER_H
+
+#include <stdlib.h>
+#include "basic-types.h"
+
+#define CONTINC 100 /* default growth step, in elements */
+#ifndef BASEINC
+#define BASEINC CONTINC /* step used by bl_containerAdd/Merge unless BASEINC was defined before */
+#endif
+/* growable array of equally sized elements stored by value */
+typedef struct {
+ void *contspace; /* malloc'ed element storage */
+ int nextfree; /* index of the next free slot == number of stored elements */
+ int allocelem; /* capacity in elements */
+ size_t sizeofelem; /* size of one element in bytes */
+} Container;
+
+void bl_containerInit(Container *c, int allocelem, size_t sizeofelem); /* exits on invalid arguments or OOM */
+void bl_containerDestruct(Container *c, void (*rmv) (void*)); /* rmv may be NULL */
+BOOL bl_containerIsEmpty(Container *c);
+void bl_containerResize(Container *c, int inc); /* inc in elements, must be > 0 */
+void bl_containerAdd(Container *c, void *elem); /* copies sizeofelem bytes from elem */
+void* bl_containerGet(Container *c, int n); /* NULL if n is out of range */
+void bl_containerMerge(Container *c, Container *s); /* appends a copy of s to c */
+Uint bl_containerSize(Container *c); /* number of stored elements */
+
+#endif /* CONTAINER_H */
diff --git a/segemehl/libs/debug.c b/segemehl/libs/debug.c
new file mode 100644
index 0000000..39d02e5
--- /dev/null
+++ b/segemehl/libs/debug.c
@@ -0,0 +1,91 @@
+
+/*
+ * debug.c
+ * debug messages
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 08/26/2007 06:49:02 PM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: debug.c 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/debug.c $
+ *
+ */
+
+ #include <stdarg.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include "debug.h"
+
+ FILE *dbgdevice = NULL;
+
+ int /* returns what vfprintf returns for the message */
+ debugmsg( const char *file,
+ const int line,
+ const char *fmt, ...) {
+ /* printf-style message to dbgdevice; file/line are printed only if DBGNFO is defined */
+ if (dbgdevice == NULL) { /* first use and no setdebugdevice() call so far */
+ dbgdevice = DBGDEVICE;
+ }
+
+ int ret;
+ va_list ap;
+ va_start(ap, fmt);
+#ifdef DBGNFO
+ fprintf(dbgdevice, "[%s] file: %s, line: %d: ", "SEGEMEHL", file, line);
+#endif
+ ret = vfprintf(dbgdevice, fmt, ap);
+ va_end(ap);
+
+ return ret;
+ }
+
+
+void /* redirect all debug output to the file filename */
+setdebugdevice(char *filename) {
+ FILE *fp;
+ /* "w": an existing file is truncated */
+ fp = fopen(filename, "w");
+ if (fp == NULL) {
+ DBG("Couldn't open file '%s'. Exit forced.\n", filename);
+ exit(-1);
+ }
+ /* a previously installed device is not closed here */
+ dbgdevice = fp;
+}
+
+int /* returns vfprintf's result, or 0 if the message was suppressed */
+debuglevel( const char *file,
+ const int line,
+ int level,
+ const char *fmt, ... ) {
+ /* like debugmsg, but prints only if the compile-time DBGLEVEL is >= level */
+ int ret=0;
+ va_list ap;
+
+ if (dbgdevice == NULL) { /* fall back to the default device on first use */
+ dbgdevice = DBGDEVICE;
+ }
+
+ if (DBGLEVEL >= level) {
+
+ va_start(ap, fmt);
+#ifdef DBGNFO
+ fprintf(dbgdevice, "[%s] file: %s, line: %d: ", "segemehl", file, line);
+#endif
+ ret = vfprintf(dbgdevice, fmt, ap);
+ va_end(ap);
+ }
+
+ return ret;
+}
+
+
+
+
diff --git a/segemehl/libs/debug.h b/segemehl/libs/debug.h
new file mode 100644
index 0000000..9aa29dd
--- /dev/null
+++ b/segemehl/libs/debug.h
@@ -0,0 +1,46 @@
+ #ifndef DEBUG_H
+ #define DEBUG_H
+
+/*
+ *
+ * debug.h
+ * debug messages
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 08/26/2007 07:17:44 PM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: debug.h 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/debug.h $
+ */
+
+ #include <stdarg.h>
+ #include <stdio.h>
+ #include <string.h>
+
+#ifndef DBGLEVEL
+#define DBGLEVEL 0
+#endif
+/* DBGLEVEL: threshold checked by debuglevel(); DBGDEVICE: stream used until setdebugdevice() is called */
+#ifndef DBGDEVICE
+#define DBGDEVICE stderr
+#endif
+/* all macros expand to X, __VA_ARGS__: at least one argument after the format is required (callers pass NULL) */
+#define DBGL(L, X, ... ) debuglevel (__FILE__, __LINE__, L, X, __VA_ARGS__)
+#define DBG(X, ...) debugmsg(__FILE__, __LINE__, X, __VA_ARGS__)
+#define DBGEXIT(X, ...) { debugmsg (__FILE__, __LINE__, X, __VA_ARGS__); \
+ exit(-1); }
+/* DBGEXIT is a bare { } block, not do/while(0): a following ';' adds an empty statement, so avoid it before 'else' */
+/* DBGEXIT calls exit(); this header does not include <stdlib.h> itself */
+/*deprecated*/
+#define DEBUG(X, ...) debugmsg(__FILE__, __LINE__, X, __VA_ARGS__)
+/* file name, line number, [level,] printf-style format and arguments */
+int debugmsg(const char *, const int, const char *fmt, ...);
+int debuglevel(const char *, const int, int, const char *fmt, ...);
+
+#endif
diff --git a/segemehl/libs/evalmatchfiles.c b/segemehl/libs/evalmatchfiles.c
new file mode 100644
index 0000000..b97da0c
--- /dev/null
+++ b/segemehl/libs/evalmatchfiles.c
@@ -0,0 +1,1976 @@
+
+/*
+ * evalmatchfiles.c
+ * evaluation/statistics
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 06.10.2010 20:26:00 CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include <math.h>
+#include <time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "sort.h"
+#include "alignment.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "matfile.h"
+#include "bitVector.h"
+#include "info.h"
+#include "vtprogressbar.h"
+#include "fileio.h"
+#include "matchfilesfields.h"
+#include "matchfiles.h"
+#include "debug.h"
+#include "evalmatchfiles.h"
+#include "biofiles.h"
+#include "splicesites.h"
+
+
+
+/*-------------------------- bl_matchfileSampleCmp ---------------------------
+ * Compares two entries of a PairUint array by field a first, then by field b.
+ * @brief cmp chromosome positions sampled below
+ * @author Steve Hoffmann
+ * @return 1 if entry elemA is greater, 2 if it is smaller, 0 if equal; info is unused
+ */
+
+Uint
+bl_matchfileSampleCmp (Uint elemA, Uint elemB, void *toSort, void *info)
+{
+ PairUint *samples;
+ samples = (PairUint*) toSort;
+ /* primary key a: the sampler stores the chromosome index there */
+ if(samples[elemA].a > samples[elemB].a)
+ return 1;
+ if(samples[elemA].a < samples[elemB].a)
+ return 2;
+ /* secondary key b: the sampled position */
+ if(samples[elemA].b > samples[elemB].b)
+ return 1;
+ if(samples[elemA].b < samples[elemB].b)
+ return 2;
+
+ return 0;
+}
+
+
+/*---------------------- bl_matchfileGetCrossConsError -----------------------
+ * Fraction of the characters in cross section pos that differ from its consensus char.
+ * @brief get the error e of a cross section based on the consensus
+ * (not reference)
+ * @author Steve Hoffmann
+ * @return mismatches / len, or 0 for an empty cross section
+ */
+
+double
+bl_matchfileGetCrossConsError (matchfileFrame_t *frame, Uint pos)
+{
+ Uint j;
+ double e=0;
+
+ for(j=0; j < frame->cs[pos].len; j++) {
+ if(frame->cs[pos].chars[j] != frame->cs[pos].cons)
+ e+=1.0; /* count one mismatch */
+ }
+
+ if(frame->cs[pos].len) { /* avoid dividing by zero */
+ e /= (double)frame->cs[pos].len;
+ }
+
+ return e;
+}
+
+/*---------------------- bl_matchfileGetCrossRefError ------------------------
+ * Fraction of the characters in cross section pos that differ from its reference char.
+ * @brief get the error e of a cross section based on the reference
+ * @author Steve Hoffmann
+ * @return mismatches / len, or 0 for an empty cross section
+ */
+
+double
+bl_matchfileGetCrossRefError (matchfileFrame_t *frame, Uint pos)
+{
+ Uint j;
+ double e=0;
+
+ for(j=0; j < frame->cs[pos].len; j++) {
+ if(frame->cs[pos].chars[j] != frame->cs[pos].ref)
+ e+=1.0; /* count one mismatch */
+ }
+
+ if(frame->cs[pos].len) { /* avoid dividing by zero */
+ e /= (double)frame->cs[pos].len;
+ }
+
+ return e;
+}
+
+/*------------------------- bl_matchfileGetErrorDensity --------------------------
+ * Appends the consensus error of cross section pos to the sample arrays, if it is > 0.
+ * @brief get sample statistics
+ * @author Steve Hoffmann
+ * @note nfo must be a matchfileSampleStats_t*; space and framestats are unused
+ */
+
+ void
+bl_matchfileGetErrorDensity(void *space, matchfileFrame_t *frame,
+ Uint pos, matchfileFrameStats_t *framestats, void *nfo)
+{
+ double e, er=.0; /* er is never changed, so e-er below equals e */
+ matchfileSampleStats_t *stats = (matchfileSampleStats_t*) nfo;
+
+ if (stats->e_N+1 >= stats->n) return; /* stop once e_N+1 reaches the limit n */
+
+ Uint mincover = stats->mincover;
+ Uint maxcover = stats->maxcover;
+
+ /* only cross sections with mincover <= len <= maxcover are sampled */
+ if (frame->cs[pos].len < mincover || frame->cs[pos].len > maxcover) {
+ return;
+ }
+
+ e = bl_matchfileGetCrossConsError(frame, pos);
+
+ if(e > 0 && stats->e_N < stats->n) { /* error-free positions are not recorded */
+ stats->eraw[stats->e_N]=e;
+ stats->e[stats->e_N++]=e-er;
+ }
+
+ return ;
+}
+
+
+/*------------------------- bl_matchfileGetNTCounts --------------------------
+ * Returns a newly allocated 256-entry histogram indexed by character code.
+ * @brief for each cross section in the frame:
+ * Get the count for all nucleotides
+ * @author Steve Hoffmann
+ * @note the caller owns the returned array and must free it
+ */
+
+Uint*
+bl_matchfileGetNTCounts(matchfileCross_t *cs) {
+ Uint j, *cnt;
+ /* NOTE(review): no 'space' is declared here; presumably ALLOCMEMORY discards that argument */
+ cnt = ALLOCMEMORY(space, NULL, Uint, 256);
+ memset(cnt, 0, sizeof(Uint)*256);
+
+ for(j=0; j < cs->len; j++) {
+ cnt[(unsigned char)cs->chars[j]]++; /* unsigned char: a signed char >= 0x80 would index below cnt[0] */
+ }
+
+ return cnt;
+}
+
+/*----------------------- bl_matchfileGetPlusCounts -----------------------
+ * Like bl_matchfileGetNTCounts, but counts only characters whose strand is '+'.
+ * @brief for each cross section in the frame:
+ * Get the count for all strands
+ * @author Steve Hoffmann
+ * @note the caller owns the returned 256-entry array and must free it
+ */
+
+Uint*
+bl_matchfileGetPlusCounts(matchfileCross_t *cs) {
+ Uint j, *cnt;
+ /* NOTE(review): no 'space' is declared here; presumably ALLOCMEMORY discards that argument */
+ cnt = ALLOCMEMORY(space, NULL, Uint, 256);
+ memset(cnt, 0, sizeof(Uint)*256);
+
+ for(j=0; j < cs->len; j++) { /* unsigned char index: a signed char >= 0x80 would index below cnt[0] */
+ if(cs->strands[j] == '+') cnt[(unsigned char)cs->chars[j]]++;
+ }
+
+ return cnt;
+}
+
+
+/*------------------------- bl_matchfileGetConsensus -------------------------
+ * For every column: sets ref, entropy, longentropy and cons; may gap-align the deletions.
+ * @brief calculate the consensus bases in a frame
+ * @author Steve Hoffmann
+ * @note cons stays '^' for columns without coverage; the entropies are capped at 2.0
+ */
+
+void
+bl_matchfileGetConsensus(matchfileFrame_t *frame) {
+ Uint i, j, *cnt, max;
+ double lentropy;
+ double rentropy;
+ double longentropy;
+ char gapalign = 0;
+ Uint *tab = NULL;
+ /* symbol table handed to shannonentropy: A,C,G,T -> 1..4, every other byte -> 0 */
+ tab = ALLOCMEMORY(NULL, NULL, Uint, 256);
+ memset(tab, 0, sizeof(Uint)*256);
+
+ tab['A'] = 1;
+ tab['C'] = 2;
+ tab['G'] = 3;
+ tab['T'] = 4;
+
+ for(i=0; i < frame->width; i++) {
+
+ /* defaults, kept when the frame carries no reference */
+ frame->cs[i].entropy = 2.0;
+ frame->cs[i].longentropy = 2.0;
+
+ if (frame->ref) {
+ frame->cs[i].ref = frame->ref[i];
+ lentropy = 2.0;
+ rentropy = 2.0;
+ longentropy = 2.0;
+ /* 21-char window: centred if both sides have room, else shifted left and/or right (the right one wins) */
+ if(frame->start+i > 11 && frame->start+i+11 < frame->chrlen) {
+ longentropy=
+ shannonentropy(NULL, &frame->chrseq[frame->start+i-10], 21, 6, tab);
+ } else {
+ if(frame->start+i > 21) {
+ longentropy=
+ shannonentropy(NULL, &frame->chrseq[frame->start+i-21], 21, 6, tab);
+ }
+ if(frame->start+i+21 < frame->chrlen) {
+ longentropy =
+ shannonentropy(NULL, &frame->chrseq[frame->start+i], 21, 6, tab);
+ }
+ }
+ /* 10-char windows to the left (lentropy) and to the right (rentropy) of the position */
+ if(frame->start+i > 11 && frame->start+i+11 < frame->chrlen) {
+ lentropy=
+ shannonentropy(NULL, &frame->chrseq[frame->start+i-10], 10, 6, tab);
+ rentropy =
+ shannonentropy(NULL, &frame->chrseq[frame->start+i], 10, 6, tab);
+ } else {
+ if(frame->start+i > 11) {
+ lentropy=
+ shannonentropy(NULL, &frame->chrseq[frame->start+i-10], 10, 6, tab);
+ }
+ if(frame->start+i+11 < frame->chrlen) {
+ rentropy =
+ shannonentropy(NULL, &frame->chrseq[frame->start+i], 10, 6, tab);
+ }
+ }
+
+ frame->cs[i].entropy = MIN(MIN(lentropy,rentropy),2.0); /* the lower of both sides */
+ // frame->cs[i].entropy = MIN(MAX(lentropy,rentropy),2.0);
+ frame->cs[i].longentropy = MIN(longentropy,2.0);
+ }
+
+ frame->cs[i].cons = '^'; /* placeholder for "no coverage" */
+ cnt = bl_matchfileGetNTCounts(&frame->cs[i]);
+
+ if(frame->cs[i].len){
+ max = uarraymax(cnt, 255); /* presumably the index of the largest count; 255 is passed although cnt has 256 entries -- confirm */
+ frame->cs[i].cons = (char)max;
+ }
+
+ FREEMEMORY(space, cnt); /* NOTE(review): no 'space' is declared in this function; presumably the macro discards it */
+
+ gapalign = 0;
+ if(frame->cs[i].noofdels > 1) { /* align only with several deletions, one of them longer than 1 */
+ for(j=0; j < frame->cs[i].noofdels; j++) {
+ if(frame->cs[i].dels[j].len > 1) {
+ gapalign = 1;
+ }
+ }
+ if(gapalign)
+ bl_matchfileGapAlign(frame->cs[i].dels, frame->cs[i].noofdels);
+ }
+ }
+
+ FREEMEMORY(space, tab);
+ return;
+}
+
+
+/*-------------------------- bl_matchfileCrossPrint --------------------------
+ * One line per column: index, len, starts, ends, chars, quals, then the row and edist lists.
+ * @brief print a cross section to stderr
+ * @author Steve Hoffmann
+ * @note chars and quals are printed with %s, i.e. they are assumed to be NUL-terminated; space is unused
+ */
+
+void
+bl_matchfileCrossPrint(void *space, matchfileFrame_t *frame) {
+ Uint i, j, width; //, start;
+ matchfileCross_t *cs;
+
+ cs = frame->cs;
+ //not used: start = frame->start;
+ width = frame->width;
+ for(i=0; i < width; i++) {
+ fprintf(stderr, "%d: %d\t%d\t%d\t%s\t%s\t", i, cs[i].len,
+ cs[i].starts, cs[i].ends, cs[i].chars,
+ cs[i].quals);
+ for(j=0; j < cs[i].len; j++) { /* comma-separated row values */
+ fprintf(stderr, "%d,", cs[i].row[j]);
+ }
+ fprintf(stderr, "\t");
+
+ for(j=0; j < cs[i].len; j++) { /* comma-separated edist values */
+ fprintf(stderr, "%d,", cs[i].edist[j]);
+ }
+ fprintf(stderr, "\n");
+ }
+}
+
+
+/*----------------------- bl_matchfileGetScoreSample ------------------------
+ * Runs bl_matchfileTest on cross section pos and stores max(scr_sample, scr_cons) in stats->s.
+ * @brief get a sample of the scores
+ * @author Steve Hoffmann
+ * @note nfo must be a matchfileindex_t* whose stats member is set; framestats is unused
+ */
+
+ void
+bl_matchfileGetScoreSample (void *space, matchfileFrame_t *frame,
+ Uint pos, matchfileFrameStats_t *framestats, void *nfo)
+{
+
+ matchfileindex_t *idx = (matchfileindex_t *) nfo;
+ matchfileSampleStats_t *stats = (matchfileSampleStats_t *) idx->stats;
+ Uint mincover = stats->mincover;
+ Uint maxcover = stats->maxcover;
+
+ if(stats->s_N+1 > stats->n) return; /* sample array is full */
+ if(frame->cs[pos].len < mincover || frame->cs[pos].len > maxcover) {
+ return;
+ }
+ /* presumably fills scr_sample and scr_cons of the cross section -- confirm */
+ bl_matchfileTest(space, 0, 0, pos, &frame->cs[pos], frame->cs[pos].ref,
+ idx, 0, NULL);
+ /* log(0) is -infinity: record the sample only if at least one score exceeds it */
+ if(frame->cs[pos].scr_sample > log(0) || frame->cs[pos].scr_cons > log(0)) {
+ stats->s[stats->s_N++] = MAX(frame->cs[pos].scr_sample,
+ frame->cs[pos].scr_cons);
+ } else {
+ // fprintf(stdout, "%f, %f, %f, rt:%f, rq:%f, mm:%f, minrp:%f, maxrp:%f\n", frame->cs[pos].p_refx, frame->cs[pos].p_ref, log(ecdf(frame->cs[pos].ee, stats->ecdf)), frame->cs[pos].diff_rt, frame->cs[pos].diff_rq, frame->cs[pos].diff_mm, stats->minrp, stats->maxrp);
+ }
+
+ return ;
+}
+
+/*------------------------- bl_matchfileGetConditional ----------------------
+ * Updates the read-position, quality, edit-distance and match-count counters in stats.
+ * @brief sample values for the scores
+ * @author Steve Hoffmann
+ * @note nfo must be a matchfileSampleStats_t*; space and framestats are unused
+ */
+
+ void
+bl_matchfileGetConditionals(void *space, matchfileFrame_t *frame,
+ Uint pos, matchfileFrameStats_t *framestats, void *nfo)
+{
+ matchfileSampleStats_t *stats = (matchfileSampleStats_t *) nfo;
+ double e, pxx = stats->pxx; /* pxx: upper bound on the consensus error for a position to be used */
+ int rpos;
+ Uint j;
+
+ /* skip positions whose coverage is outside [mincover, maxcover] */
+ if(frame->cs[pos].len < stats->mincover
+ || frame->cs[pos].len > stats->maxcover) {
+ return;
+ }
+
+ e = bl_matchfileGetCrossConsError(frame, pos);
+
+ if(e < pxx) {
+
+ for(j=0; j < frame->cs[pos].len; j++) {
+
+ /* position within the read in percent of the read length; NOTE(review): readlen == 0 is not guarded */
+ rpos =
+ trunc(((double)(((double)frame->cs[pos].readpos[j]*100.0)
+ /((double)frame->cs[pos].readlen[j]))));
+
+ if(stats->RP_N[rpos] < 3000000000) { /* counters stop growing at 3e9 */
+ MATRIX2D(stats->R_N, 255, rpos, (int)frame->cs[pos].quals[j])++; /* all observations per (rpos, quality) */
+ stats->RP_N[rpos] += 1;
+ }
+
+ if(stats->RQ_N[(int)frame->cs[pos].quals[j]] < 3000000000) { /* NOTE(review): quals is indexed as int; assumes codes >= 0 */
+ if(frame->cs[pos].chars[j] != frame->cs[pos].cons) {
+ stats->RQ[(int)frame->cs[pos].quals[j]]++; /* mismatches per quality */
+ }
+ stats->RQ_N[(int)frame->cs[pos].quals[j]]+=1; /* all observations per quality */
+ }
+
+ if(stats->RP[rpos] < 3000000000) {
+ if(frame->cs[pos].chars[j] != frame->cs[pos].cons) {
+ MATRIX2D(stats->R, 255, rpos, (int)frame->cs[pos].quals[j])++; /* mismatches per (rpos, quality) */
+ stats->RP[rpos] += 1;
+ }
+ }
+ }
+ }
+
+ if(e > 0 && e <pxx) { /* the histograms below use only positions that show some error */
+ for(j=0; j < frame->cs[pos].len; j++) {
+
+ if(stats->RR_N < 3000000000) {
+ if(frame->cs[pos].edist[j] <= 10)
+ stats->RR[(int)frame->cs[pos].edist[j]]++;
+ else
+ stats->RR[10]++; /* edit distances above 10 share the last bin */
+ stats->RR_N++;
+ }
+
+ if(stats->MM_N < 3000000000) {
+ if(frame->cs[pos].matchcnt[j] <= 50)
+ stats->MM[frame->cs[pos].matchcnt[j]]++;
+ else
+ stats->MM[50]++; /* match counts above 50 share the last bin */
+ stats->MM_N++;
+ }
+ }
+ }
+
+ return ;
+}
+
+
+/*--------------------------- bl_matchfileSmallMap ---------------------------
+ * Reads the match file once; per chromosome, counts covered positions in bins of 255 bases.
+ * @brief get a small map to quickly find expressed/covered sites
+ * @author Steve Hoffmann
+ * @return one byte array per chromosome (NULL if unseen); *mapsize receives their lengths. Caller frees both.
+ */
+
+unsigned char**
+bl_matchfileSmallMap (void *space, matchfile_t* file, Uint **mapsize)
+{
+
+ FILE *fp = NULL;
+ stringset_t *fields = NULL;
+ char *buffer = NULL, *curchrom = NULL, *filename;
+ unsigned char **map = NULL;
+ Uint buffersize = 1024, len = 0, curstart = 0,
+ curend = 0, i, j, bin, lastbin=0, id, lastid=-1, *msz,
+ noofseqs =0;
+ int ch; /* int, not char: it has to be able to hold EOF */
+ matchfileindex_t *index;
+ unsigned char header = 1;
+ unsigned char gzip, fmt;
+ struct gzidxfile *gzf = NULL;
+
+ filename = file->filename;
+ gzip = file->gzip;
+ fmt = file->fmt;
+ index = file->index;
+
+ noofseqs = index->noofchroms;
+
+ /* open and verify the file before the handle is given to the gz reader */
+ fp = fopen(filename, (gzip) ? "rb" : "r");
+
+ if(fp == NULL) {
+ DBGEXIT("Couldn't open file %s. Exit forced!\n", filename);
+ }
+
+ if (gzip) {
+ //gzindex = bl_zranGetIndex(filename, &gzlen);
+ gzf = bl_initgzidxfile(fp, file->index->gzindex, 0, LARGECHUNK);
+ }
+
+ buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+ map = ALLOCMEMORY(space, NULL, char*, noofseqs);
+ memset(map, 0, sizeof(char*)*noofseqs);
+ msz = ALLOCMEMORY(space, NULL, Uint, noofseqs);
+ memset(msz, 0, sizeof(Uint)*noofseqs);
+
+ while((ch = (gzip) ? bl_getgzidxc(gzf) : getc(fp)) != EOF) {
+
+ if(len == buffersize-1) { /* line buffer full: grow it */
+ buffersize = 2*buffersize+1;
+ buffer = ALLOCMEMORY(space, buffer, char, buffersize);
+ }
+
+ if(ch == '\n' && len > 0) { /* a complete non-empty line is in the buffer */
+
+ buffer = ALLOCMEMORY(space, buffer, char, len+1);
+ buffer[len] = '\0';
+ header = (header) ? bl_matchfileIsHeader(buffer, len, fmt) : header; /* once a non-header line is seen, stop testing */
+
+ if (!header) {
+ fields = tokensToStringset(space, "\t", buffer, len);
+ curstart = bl_matchfileGetStartPos(fields, fmt);
+ curend = bl_matchfileGetEndPos(fields, fmt);
+ curchrom = bl_matchfileGetChrom(fields, fmt);
+
+ if(curend+1 > 0 && curchrom) {
+ /* find the chromosome: exact name, or the index name up to its first whitespace */
+ for(id=0; id < index->noofchroms; id++) {
+ if(strcmp(curchrom, index->chromnames[id]) == 0) {
+ break;
+ }
+
+ for(j=0; j < strlen(index->chromnames[id]); j++) {
+ if (isspace(index->chromnames[id][j])) break;
+ }
+
+ if(strlen(curchrom) <= j &&
+ strncmp(curchrom, index->chromnames[id], j) == 0) {
+ break;
+ }
+ }
+
+ if(id != lastid) {
+ lastbin = -1;
+ lastid = id;
+ NFO("mapping chrom id:%d\n", id);
+ }
+
+ if (id >= index->noofchroms) {
+ DBGEXIT("reference sequence '%s' not found\n", curchrom);
+ }
+
+ for(i=curstart; i < curend; i++) {
+ bin = i/255;
+ if (bin != lastbin) {
+ if(bin >= msz[id]) { /* grow the map; zero only the newly added bins */
+ map[id] = ALLOCMEMORY(space, map[id], unsigned char, bin+1);
+ memset(&map[id][msz[id]], 0, sizeof(char)*((bin+1)-msz[id]));
+ msz[id] = bin+1; /* real length; the old value 'bin' made the next growth wipe the last bin */
+ }
+ }
+ if(map[id][bin] < 255) map[id][bin]++; /* saturate at 255 instead of wrapping to 0 */
+ lastbin = bin;
+ }
+ }
+
+ destructStringset(space, fields);
+ }
+
+ buffer = ALLOCMEMORY(space, buffer, char, buffersize);
+ len = 0;
+
+ } else {
+ if(ch != '\n') buffer[len++] = ch;
+ }
+ }
+
+ if(gzip) {
+ bl_destructgzidxfile(gzf);
+ FREEMEMORY(space, gzf);
+ }
+
+ FREEMEMORY(space, buffer);
+ fclose(fp);
+
+ *mapsize = msz;
+ return map;
+}
+
+/*------------------ bl_evalmatchfileSampleCrossSections -------------------
+ * Draws up to n random genome positions, sorts them and calls f on each of them.
+ * @brief sample and execute f on it
+ * @author Steve Hoffmann
+ * @return always 0. If maps is not NULL, only positions in bins with a count > 200 are accepted.
+ */
+
+Uint
+bl_matchfileSampleCrossSections(void *space,
+ matchfile_t *file, fasta_t *set, Uint n,
+ void (*f)(void *, matchfileFrame_t*, Uint,
+ matchfileFrameStats_t *, void *), unsigned char **maps, Uint *mapsize, void *info)
+{
+ PairUint *samplepos;
+ Uint i=0, r=0, j=0, k, *cumchrlen,
+ *order, prev=0, nchr, curchrom=0, curstart=0;
+ matchfileFrame_t *frame = NULL;
+ matchfileFrameStats_t *stats = NULL;
+ Uint maxcover = 20000;
+ Uint setsize = 10000000; /* upper bound on the number of random draws */
+ //Uint setsize = 100000;
+
+ //init random number generator
+ srand((unsigned)(time(0)));
+ nchr = file->index->noofchroms;
+
+ samplepos = ALLOCMEMORY(space, NULL, PairUint, n+1);
+ memset(samplepos, 0, sizeof(PairUint)*(n+1)); /* zero all n+1 entries (was sizeof*n+1) */
+ cumchrlen = ALLOCMEMORY(space, NULL, Uint, nchr);
+ memset(cumchrlen, 0, sizeof(Uint)*nchr);
+/*
+ if(mymaps) {
+ maps = mymaps;
+ } else {
+ MSG("generating small map\n");
+ //sum up the length of the references (i.e. chromosomes)
+ maps = bl_matchfileSmallMap (space, file, &mapsize);
+ MSG("map generated\n");
+ }
+*/
+ cumchrlen[0] = file->index->matchend[0] - file->index->matchstart[0] + 1;
+ for(i=1; i < nchr; i++) {
+ assert(file->index->matchend[i] >= file->index->matchstart[i]);
+ cumchrlen[i] = (file->index->matchend[i] -
+ file->index->matchstart[i]) + cumchrlen[i-1];
+/* NFO("chr %d: length: %d, sum: %d\n", i, (file->index->matchend[i] -
+ file->index->matchstart[i]), cumchrlen[i]);
+ */
+ }
+
+// fprintf(stderr, "printint small map\n");
+// for(i=0; i < nchr; i++) {
+// for(j=0; j < mapsize[i]; j++) {
+// fprintf(stdout, "chr %d\t%d\n", i, maps[i][j]);
+// }
+// }
+// fprintf(stderr, "small map printed\n");
+
+
+ //randomize n positions across the genome and determine their
+ //chromosomes
+ i = 0;
+ j = 0;
+
+ while(i < n && j < setsize) {
+ k=0;
+
+ samplepos[i].b =
+ (Uint)(((double)cumchrlen[nchr-1]*rand()/RAND_MAX+1))+1;
+
+ while(samplepos[i].b > cumchrlen[k] && k+1 < nchr) k++;
+ samplepos[i].a = k;
+
+ // if(k > 0) {
+ // fprintf(stderr, "violation: i=%d, samplepos[i].b=%d, k=%d,
+ // nchr=%d, cumchrlen[k]=%d\n",
+ // i, samplepos[i].b, k, nchr, cumchrlen[k]);
+ // for(j=0; j < k; k++) {
+ // fprintf(stderr, "cumchrlen[%d]=%d\n", j, cumchrlen[j]);
+ // }
+ // exit(-1);
+ // }
+
+ prev = (k == 0) ? 0 : cumchrlen[k-1];
+ /* keep the draw if there is no map, or if its 255-base bin exists and has a count > 200 */
+ if(!maps || (maps[samplepos[i].a]
+ && mapsize[samplepos[i].a] > (samplepos[i].b - prev)/255
+ && maps[samplepos[i].a][(samplepos[i].b - prev)/255] > 200)) {
+ i++;
+ r++;
+ }
+
+ j++;
+ }
+
+ NFO("sampling %d positions.\n", i);
+
+ /*
+ for(i=0; i < nchr; i++) {
+ if(maps[i]) {
+ FREEMEMORY(space, maps[i]);
+ }
+ }
+
+ FREEMEMORY(space, maps);
+ FREEMEMORY(space, mapsize);
+ */
+
+ if(j == setsize && r < (int)(0.8*((double)n))) {
+ NFO("current sample size %d is below the minimum\n", r);
+ /*FREEMEMORY(space, samplepos);
+ FREEMEMORY(space, cumchrlen);
+ return 0;*/
+ }
+
+// for(i=0; i < n; i++) {
+// assert(samplepos[i].a < 1);
+// }
+
+ //sort
+ order = quickSort(space, samplepos, n, bl_matchfileSampleCmp, NULL); /* NOTE(review): sorts and evaluates all n slots even if fewer were accepted; the rest are zero */
+
+ initProgressBarVT();
+ // for(i=0; i < n; i++) {
+ // assert(samplepos[order[i]].a < 1);
+ // }
+
+ //evaluate
+ //to increase speed a frame of size FRAMESIZE is loaded
+ for(i=0; i < n; i++) {
+
+ progressBarVT("positions sampled.", n, i, 25);
+ //is position on a new chromosome or in a new frame?
+ if(!frame || samplepos[order[i]].a != curchrom ||
+ samplepos[order[i]].b-prev+1 >= frame->start+frame->width) {
+
+ if(frame) {
+ bl_matchfileDestructFrame(space, frame);
+ frame = NULL;
+ //bl_matchfileDestructFrameStats(space, stats);
+ }
+
+ curchrom = samplepos[order[i]].a;
+ curstart = samplepos[order[i]].b;
+ prev = (samplepos[order[i]].a == 0) ? 0 : cumchrlen[samplepos[order[i]].a-1];
+
+ // fprintf(stderr, "getting frame for '%s', curstart '%d', prev '%d'\n",
+ // file->index->chromnames[samplepos[order[i]].a], curstart, prev);
+
+ frame = bl_matchfileGetFrame(space, file,
+ file->index->chromnames[samplepos[order[i]].a],
+ curstart-prev+1, 20000, set, maxcover, NULL);
+
+ // fprintf(stderr, "getting consensus\n" );
+ bl_matchfileGetConsensus(frame);
+ // stats = bl_matchfileFrameStats (space, frame);
+ }
+
+ // fprintf(stderr, "evaluation of %d\n", samplepos[order[i]].b-curstart);
+ f(space, frame, samplepos[order[i]].b-curstart, stats, info); /* offset relative to the frame's first sample; stats is always NULL here */
+ }
+ fprintf(stderr, "\n");
+ NFO("%d positions sampled.\n", n);
+
+ if(frame) {
+ bl_matchfileDestructFrame(space,frame);
+ frame = NULL;
+ // bl_matchfileDestructFrameStats(space, stats);
+ }
+
+ FREEMEMORY(space, order);
+ FREEMEMORY(space, samplepos);
+ FREEMEMORY(space, cumchrlen);
+
+ return 0;
+}
+
+/*---------------------------- bl_matchfileCensus ----------------------------
+ * Walks every chromosome in frames of framesize positions; f receives frame, offset and nfo.
+ * @brief call f on every position with coverage (cs.len > 0) of every chromosome
+ * @author Steve Hoffmann
+ * @note p_hom of each visited cross section is reset to log(0) before f is called
+ */
+
+void
+bl_matchfileCensus (void *space, matchfile_t *file,
+ fasta_t *set, Uint framesize,
+ void (*f)(void *, matchfileFrame_t*, Uint, matchfileFrameStats_t *, void *), void *nfo)
+{
+ Uint i, k, ret, n=0, nchr = 0, curchrom = 0, pos=0, chridx, maxchr;
+ matchfileFrame_t **frames = NULL;
+
+ matchfileFrameStats_t *stats = NULL; /* never set: f always receives NULL */
+ Uint maxcover = 20000;
+
+
+ nchr = MAX(nchr, file->index->noofchroms);
+ frames = ALLOCMEMORY(space, NULL, matchfileFrame_t*, 1);
+ memset(frames, 0, sizeof(matchfileFrame_t*)*1);
+
+ for(k=0; k < nchr; k++) {
+
+ chridx = bl_fastxFindIDIdx(file->index->chromnames[k], set);
+ maxchr = bl_fastaGetSequenceLength(set, chridx);
+ n = file->index->matchend[k]+1;
+
+
+ NFO("evaluating chr: %d '%s' len:%d\n", chridx, file->index->chromnames[k], maxchr);
+ n = MIN(n, maxchr); /* never walk past the end of the reference sequence */
+
+ //evaluate
+ //to increase speed a frame of size FRAMESIZE is loaded
+
+ for(i=1; i < n; i++) {
+
+
+ if(file->index->matchend[k]>0) {
+
+
+
+
+ //is position on a new chromosome or in a new frame?
+ if(!frames[0] || k != curchrom ||
+ i >= frames[0]->start + frames[0]->width) {
+
+ curchrom = k;
+
+ if(frames[0]) {
+ bl_matchfileDestructFrame(space, frames[0]);
+ frames[0] = NULL;
+ }
+ /* positions outside the matched range of this chromosome are skipped */
+ if(file->index->matchend[k] < i ||
+ file->index->matchstart[k] > i)
+ continue;
+
+ frames[0] = bl_matchfileGetFrame(space, file,
+ file->index->chromnames[k], i, framesize, set, maxcover, NULL);
+ bl_matchfileGetConsensus(frames[0]);
+
+ }
+
+ pos = i - frames[0]->start; /* offset of position i within the frame */
+ frames[0]->cs[pos].p_hom = log(0);
+ ret = 0;
+
+ if(frames[0]->cs[pos].len) {
+ f(space, frames[0], pos, stats, nfo);
+ }
+ }
+
+ // if(groups) {
+
+ /*iter the groups*/
+ // for(j=0; j < maxgroupno; j++) {
+ // if(g->s_consx[j] > g->s_cons[j] ||
+ // g->s_refx[j] > g->s_ref[j]) {
+ // exclusive = 1;
+
+ // for(u=0; u < maxgroupno; u++) {
+ // if(u != j && (g->s_consx[u] > g->s_cons[u] ||
+ // g->s_refx[u] > g->s_ref[u])) {
+ // exclusive = 0;
+ // }
+ // }
+
+ // if(exclusive) {
+ // bl_matchfileTestGroupsPrint (g, j, frames[j], pos);
+ // }
+ // }
+ // }
+
+ // bl_matchfileTestGroupsDestruct (space, g);
+ // }
+ }
+
+
+ /* no frame may have been loaded for this chromosome, so guard like every other call site */
+ if(frames[0]) {
+ bl_matchfileDestructFrame(space,frames[0 ]);
+ frames[0] = NULL;
+ }
+ }
+
+
+
+ FREEMEMORY(space, frames);
+ return ;
+}
+
+/*-------------------------------- phred2prob --------------------------------
+ *    
+ * @brief  convert an ASCII encoded phred score to a probability,
+ *         i.e. 10^(-(ascii-offset)/10)
+ * @param  ascii   quality character
+ * @param  offset  value subtracted from ascii to obtain the phred score
+ * @param  p       if non-zero, results of 1.0 or more are replaced by 0.99
+ * @return the probability
+ * @author Steve Hoffmann 
+ *   
+ */
+
+double
+phred2prob (char ascii, char offset, char p)
+{
+  double phred = (double)ascii - (double)offset;
+  double prob = pow(10, phred/-10.0);
+
+  if(p) {
+    if(prob >= 1.0) {
+      return 0.99;
+    }
+  }
+
+  return prob;
+}
+
+
+
+/*------------------------ bl_matchfileTestNonVariant ------------------------
+ *
+ * @brief score a cross section against the character ref. For every read
+ *        whose character has a nucleotide code below 5, log terms for read
+ *        position (var_rt), base quality (var_rq), edit distance (var_rr)
+ *        and multiple matches (var_mm) are stored in css and added up.
+ *        Reads that show ref and reads that show another character use
+ *        complementary terms. css->sub[i] holds the running sum after read i.
+ * @param cs cross section providing chars, quals, readpos, readlen,
+ *        matchcnt and edist for cs->len reads
+ * @param stats sample statistics (RP, RP_N, RQ, RQ_N, RR, RR_N, MM, MM_N)
+ * @param ref character the reads are compared with
+ * @param css receives the per read terms and their means; initialised here
+ *        with bl_matchfileCrossStatsInit
+ * @param minrp subtracted as log(1-minrp) from the position term of reads
+ *        that differ from ref
+ * @param maxrp subtracted as log(maxrp) from the position term of reads
+ *        that show ref
+ * @param minrq only read when usenativequal is 0 (it is hard wired to 1)
+ * @param maxrq only read when usenativequal is 0 (it is hard wired to 1)
+ * @return the sum of all terms divided by cs->len
+ * @author Steve Hoffmann
+ *
+ */
+
+ double
+bl_matchfileTestNonVariant ( matchfileCross_t *cs,
+ matchfileSampleStats_t *stats, char ref,
+ matchfileCrossStats_t *css, double minrp, double maxrp, double minrq,
+ double maxrq)
+{
+
+ double p = .0, *nv, errors = .0; //, k;
+ char *ch, *rq;
+ unsigned char *ed;
+ Uint len, i, rpos;
+ uint32_t *rp, *mc, *rl;
+ char usenativequal=1; // selects the phred based quality term below
+
+ len = cs->len;
+ ch = cs->chars;
+ rp = cs->readpos;
+ rq = cs->quals;
+ mc = cs->matchcnt;
+ ed = cs->edist;
+ rl = cs->readlen;
+
+ css->mean_rt = .0;
+ css->mean_rq = .0;
+ css->mean_rr = .0;
+ css->mean_mm = .0;
+
+ nv = bl_matchfileGetNTReadPosVar(cs); // not read in this function, only freed at the end
+
+ bl_matchfileCrossStatsInit(css, len);
+
+ for(i=0; i < len; i++) {
+ css->var_rt[i] = .0;
+ css->var_rq[i] = .0;
+ css->var_rr[i] = .0;
+ css->var_mm[i] = .0;
+
+ if(ntcode[(int)ch[i]] < 5 ) {
+
+
+ rpos = trunc(((double)(((double)rp[i]*100.0)/((double)rl[i])))); // read position as truncated percentage of the read length
+
+ if((int)ch[i] == (int)ref) {
+
+ //READ POSITION
+ //SUGGESTION RED
+ css->var_rt[i] =
+ log(1.0-(((double)stats->RP[rpos] + 1.0)/((double) stats->RP_N[rpos] + 1.0))) - log(maxrp);
+
+ //STANDARD
+ // css->var_rt[i] = log((((double)stats->RP[rpos] + 1.0)
+ // /((double) stats->RP_N[rpos] + 1.0))) ;
+
+//ORIGINAL
+// css->var_rt[i] = log(1.0-(((double)stats->RP[rpos] + 1.0)
+// /((double) stats->RP_N[rpos] + 1.0)))
+// - log(maxrp);
+
+ //READ QUALITY
+ if(usenativequal) {
+ css->var_rq[i] = log(1.0-phred2prob(rq[i], 64, 1));
+ //css->var_rq[i] = log(1.0-pow(10,((double)((double)rq[i]-64.0)/-10.0)));
+ } else {
+ css->var_rq[i] = log(1.0-(((double)stats->RQ[(int)rq[i]] + 1.0)
+ /((double)stats->RQ_N[(int)rq[i]] + 1.0))) - log(maxrq);
+ }
+
+ //READ ERROR
+ css->var_rr[i] = log(((double)stats->RR[MIN(ed[i],10)] + 1.0)/ // edit distances above 10 share bin 10
+ ((double)stats->RR_N + 1.0));
+
+
+ //MULTIPLE MATCHES
+ css->var_mm[i] = .0;
+ if( stats->MM_N > stats->MM[MIN(mc[i],10)])
+ css->var_mm[i] = MAX(MINMMPROB, // the term is floored at MINMMPROB
+ log(((double)stats->MM[MIN(mc[i],10)]+1.0)/
+ ((double)stats->MM_N + 1.0)));
+
+ } else {
+
+ errors++; // counted but never read afterwards
+
+ //READ POSITION
+//SUGGESTION RED
+
+ css->var_rt[i] =
+ log((((double)stats->RP[rpos] + 1.0)/((double) stats->RP_N[rpos] + 1.0))) - log(1-minrp);
+
+//ORIGINAL
+// css->var_rt[i] = log(minrp) - log(1.0-(((double)stats->RP[rpos] + 1.0)
+// /((double) stats->RP_N[rpos] + 1.0)));
+
+//STANDARD
+// css->var_rt[i] =
+// log(1.0-(((double)stats->RP[rpos] + 1.0)
+// /((double) stats->RP_N[rpos] + 1.0)));
+
+ //READ QUALITY
+ if(usenativequal) {
+ css->var_rq[i] = log(phred2prob(rq[i], 64, 1));
+// css->var_rq[i] = log(pow(10,((double)((double)rq[i]-64.0)/-10.0)));
+ } else {
+ css->var_rq[i] = log(minrq) -
+ log(1.0-(((double)stats->RQ[(int)rq[i]] + 1.0)
+ /((double) stats->RQ_N[(int)rq[i]] + 1.0)));
+ }
+
+ //READ ERROR
+ css->var_rr[i] = log(1.0-((double)stats->RR[MIN(ed[i],10)] + 1.0)/
+ ((double)stats->RR_N + 1.0));
+
+
+ //MULTIPLE MATCHES
+ css->var_mm[i] = .0;
+ if( stats->MM_N > stats->MM[MIN(mc[i],10)]) {
+ css->var_mm[i] = MAX(MINMMPROB,
+ log(1.0-(((double)stats->MM[MIN(mc[i],10)]+1.0)/
+ ((double)stats->MM_N + 1.0))));
+ }
+ }
+
+ p += css->var_rt[i];
+ p += css->var_rq[i];
+ p += css->var_rr[i];
+ p += css->var_mm[i];
+
+ css->mean_rt += css->var_rt[i];
+ css->mean_rq += css->var_rq[i];
+ css->mean_rr += css->var_rr[i];
+ css->mean_mm += css->var_mm[i];
+
+ css->sub[i] = p; // running total up to and including read i
+ }
+ }
+
+ css->mean_rt /= (double)len; // means are taken over all reads, including skipped ones
+ css->mean_rq /= (double)len;
+ css->mean_rr /= (double)len;
+ css->mean_mm /= (double)len;
+
+ FREEMEMORY(space, nv); // NOTE(review): no identifier named space is declared in this function - confirm the macro ignores its first argument
+ return p/(double)len;
+}
+
+
+/*-------------------------- bl_matchfileTestVariant -------------------------
+ *
+ * @brief score a cross section with the same log terms for every read,
+ *        regardless of whether the read shows ref. A read that differs
+ *        from ref and has a non-zero edit distance is looked up with its
+ *        edit distance reduced by one. css->sub[i] holds the running sum
+ *        after read i.
+ * @param cs cross section providing chars, quals, readpos, readlen,
+ *        matchcnt and edist for cs->len reads
+ * @param stats sample statistics (RP, RP_N, RQ, RQ_N, RR, RR_N, MM, MM_N)
+ * @param ref character the reads are compared with
+ * @param css receives the per read terms and their means; initialised here
+ *        with bl_matchfileCrossStatsInit
+ * @param minrp not read in this function
+ * @param maxrp subtracted as log(maxrp) from the read position term
+ * @param minrq not read in this function
+ * @param maxrq only read when usenativequal is 0 (it is hard wired to 1)
+ * @return the sum of all terms divided by cs->len
+ * @author Steve Hoffmann
+ *
+ */
+
+double
+bl_matchfileTestVariant (matchfileCross_t *cs,
+ matchfileSampleStats_t *stats, char ref, matchfileCrossStats_t *css,
+ double minrp, double maxrp, double minrq, double maxrq)
+{
+
+
+ double px = .0, *nv; //, k;
+ char *ch, *rq;
+ unsigned char *ed;
+ Uint len, i, curerr, rpos;
+ uint32_t *rp, *mc, *rl;
+ char usenativequal=1; // selects the phred based quality term below
+
+
+ len = cs->len;
+ ch = cs->chars;
+ rp = cs->readpos;
+ rq = cs->quals;
+ mc = cs->matchcnt;
+ ed = cs->edist;
+ nv = bl_matchfileGetNTReadPosVar(cs); // not read in this function, only freed at the end
+ rl = cs->readlen;
+
+ bl_matchfileCrossStatsInit(css, len);
+
+ css->mean_rt = .0;
+ css->mean_rq = .0;
+ css->mean_rr = .0;
+ css->mean_mm = .0;
+
+ /*variant*/
+ for(i=0; i < len; i++) {
+
+ css->var_rt[i] = .0;
+ css->var_rq[i] = .0;
+ css->var_rr[i] = .0;
+ css->var_mm[i] = .0;
+
+ if(ntcode[(int)ch[i]] < 5 ) {
+
+ rpos = trunc(((double)(((double)rp[i]*100.0)/((double)rl[i])))); // read position as truncated percentage of the read length
+ //ALTERNATIVE RED
+ css->var_rt[i] =
+ log(1.0-(((double)stats->RP[rpos] + 1.0)/((double) stats->RP_N[rpos] + 1.0))) - log(maxrp);
+
+ //ORIGINAL
+ //css->var_rt[i] = log(1.0-(((double)stats->RP[rpos] + 1.0)
+ // /((double) stats->RP_N[rpos] + 1.0))) - log(maxrp);
+
+ //STANDARD
+ //css->var_rt[i] = log((((double)stats->RP[rpos] + 1.0)
+ // /((double) stats->RP_N[rpos] + 1.0))) ;
+
+ if(usenativequal) {
+ css->var_rq[i] = log(1.0-phred2prob(rq[i],64,1));
+ //css->var_rq[i] = log(1.0-pow(10,((double)((double)rq[i]-64.0)/-10.0)));
+ } else {
+ css->var_rq[i] = log(1.0-(((double)stats->RQ[(int)rq[i]] + 1.0)
+ /((double)stats->RQ_N[(int)rq[i]] + 1.0))) - log(maxrq);
+ }
+
+ curerr = ((int)ch[i] != (int)ref && ed[i]); // 1 if the read differs from ref and has a non-zero edit distance
+
+ css->var_rr[i] = log(((double)stats->RR[MIN(ed[i]-curerr,10)]+1.0)/ // edit distances above 10 share bin 10
+ ((double)stats->RR_N + 1.0));
+
+
+ if( stats->MM_N > stats->MM[MIN(mc[i],10)])
+ css->var_mm[i] = MAX(MINMMPROB, log(((double)stats->MM[MIN(mc[i],10)]+1.0)/ // the term is floored at MINMMPROB
+ ((double)stats->MM_N + 1.0)));
+
+ px += css->var_rt[i];
+ px += css->var_rq[i];
+ px += css->var_rr[i];
+ px += css->var_mm[i];
+
+ css->mean_rt += css->var_rt[i];
+ css->mean_rq += css->var_rq[i];
+ css->mean_rr += css->var_rr[i];
+ css->mean_mm += css->var_mm[i];
+
+ css->sub[i] = px; // running total up to and including read i
+ }
+ }
+
+ css->mean_rt /= (double)len; // means are taken over all reads, including skipped ones
+ css->mean_rq /= (double)len;
+ css->mean_rr /= (double)len;
+ css->mean_mm /= (double)len;
+
+ FREEMEMORY(space, nv); // NOTE(review): no identifier named space is declared in this function - confirm the macro ignores its first argument
+ return px/(double)len;
+}
+
+
+
+/*--------------------------- bl_matchfileTestCons ---------------------------
+ *
+ * @brief test the consensus vs. reference. Log terms for read position,
+ *        base quality, edit distance and multiple matches are assigned
+ *        only for reads that show the character cons. For other reads
+ *        with a nucleotide code below 5 the values already present in css
+ *        are added (presumably the ones set by bl_matchfileCrossStatsInit -
+ *        TODO confirm). The quality term is multiplied by QUALFACTOR.
+ * @param cs cross section providing chars, quals, readpos, readlen,
+ *        matchcnt and edist for cs->len reads
+ * @param stats sample statistics (RP, RP_N, RQ, RQ_N, RR, RR_N, MM, MM_N)
+ * @param cons character the reads are compared with
+ * @param css receives the per read terms, running sums and means
+ * @param minrp subtracted as log(1-minrp) from the read position term
+ * @param maxrp not read in this function
+ * @param minrq only read when usenativequal is 0
+ * @param maxrq not read in this function
+ * @param usenativequal non-zero: quality term from phred2prob with offset
+ *        64; zero: quality term from stats->RQ and stats->RQ_N
+ * @return the sum of all terms (not divided by cs->len)
+ * @author Steve Hoffmann
+ *
+ */
+
+double
+bl_matchfileTestCons (matchfileCross_t *cs,
+ matchfileSampleStats_t *stats, char cons, matchfileCrossStats_t *css,
+ double minrp, double maxrp, double minrq, double maxrq, char usenativequal)
+{
+
+ double p = .0, *nv;
+ char *ch, *rq;
+ unsigned char *ed;
+ Uint len, i, rpos;
+ uint32_t *rp, *mc, *rl;
+
+ len = cs->len;
+ ch = cs->chars;
+ rp = cs->readpos;
+ rq = cs->quals;
+ mc = cs->matchcnt;
+ ed = cs->edist;
+ rl = cs->readlen;
+
+ css->mean_rt = .0;
+ css->mean_rq = .0;
+ css->mean_rr = .0;
+ css->mean_mm = .0;
+
+ nv = bl_matchfileGetNTReadPosVar(cs); // not read in this function, only freed at the end
+ bl_matchfileCrossStatsInit(css, len);
+
+ for(i=0; i < len; i++) {
+
+ if(ntcode[(int)ch[i]] < 5 ) {
+
+ rpos = trunc(((double)(((double)rp[i]*100.0)/((double)rl[i])))); // read position as truncated percentage of the read length
+
+ if((int)ch[i] == (int)cons) {
+
+
+ css->var_rt[i] =
+ log((((double)stats->RP[rpos] + 1.0)/((double) stats->RP_N[rpos] + 1.0))) - log(1-minrp);
+
+/* css->var_rt[i] = log(minrp) -
+ log(1.0-(((double)stats->RP[rpos] + 1.0)
+ /((double) stats->RP_N[rpos] + 1.0)));
+*/
+ if(usenativequal) {
+ // css->var_rq[i] = log(1.0-pow(10,phred2prob(rq[i],64,1)));
+ css->var_rq[i] = log(phred2prob(rq[i], 64, 1));
+
+ //css->var_rq[i] = log(1.0-pow(10,((double)((double)rq[i]-64.0)/-10.0)));
+ } else {
+ css->var_rq[i] = log(minrq) - log(((double)stats->RQ[(int)rq[i]] + 1.0)
+ /((double) stats->RQ_N[(int)rq[i]] + 1.0));
+ }
+
+ css->var_rq[i]*=QUALFACTOR;
+
+ css->var_rr[i] = log(((double)stats->RR[MIN(ed[i],10)] + 1.0)/ // edit distances above 10 share bin 10
+ ((double)stats->RR_N + 1.0));
+
+ css->var_mm[i] = .0;
+
+ if( stats->MM_N > stats->MM[MIN(mc[i],10)])
+ css->var_mm[i] = MAX(MINMMPROB, log(((double)stats->MM[MIN(mc[i],10)]+1.0)/ // the term is floored at MINMMPROB
+ ((double)stats->MM_N + 1.0)));
+
+ }
+
+ p += css->var_rt[i]; // for reads that differ from cons nothing was assigned above
+ p += css->var_rq[i];
+ p += css->var_rr[i];
+ p += css->var_mm[i];
+ css->sub[i] = p; // running total up to and including read i
+
+ css->mean_rt += css->var_rt[i];
+ css->mean_rq += css->var_rq[i];
+ css->mean_rr += css->var_rr[i];
+ css->mean_mm += css->var_mm[i];
+ }
+ }
+
+ css->mean_rt /= (double)len;
+ css->mean_rq /= (double)len;
+ css->mean_rr /= (double)len;
+ css->mean_mm /= (double)len;
+
+ FREEMEMORY(space, nv); // NOTE(review): no identifier named space is declared in this function - confirm the macro ignores its first argument
+
+ return p;
+}
+
+
+/*---------------------- bl_matchfileGetStandardization ----------------------
+ *    
+ * @brief  derive the standardization constants of a sample: bin 10 of MM
+ *         and RR is overwritten with 0.01 percent of MM_N and RR_N (plus
+ *         one), and minrp/maxrp and minrq/maxrq become the smallest and
+ *         largest value of 1-(X[i]+1)/(X_N[i]+1) over the bins that hold
+ *         enough observations (RP[i] > 1 and RQ[i] > 10, respectively).
+ *         Finally the standardized flag is set.
+ * @param  space  not used
+ * @param  stats  sample statistics, modified in place
+ * @author Steve Hoffmann 
+ *   
+ */
+
+void
+bl_matchfileGetStandardization (void *space, matchfileSampleStats_t *stats)
+{
+  Uint bin;
+  double comp;
+
+  stats->MM[10] = (int)((double)stats->MM_N*0.0001)+1;
+  stats->RR[10] = (int)((double)stats->RR_N*0.0001)+1;
+
+  /*start values lie outside of the range the loops can produce*/
+  stats->minrp = 100;
+  stats->maxrp = -100;
+  stats->maxrq = -100;
+  stats->minrq = 10;
+  stats->currq = .0;
+
+  /*read position bins*/
+  for(bin=0; bin < 100; bin++) {
+    if(!(stats->RP[bin] > 1)) {
+      continue;
+    }
+
+    comp = 1.0-(((double)stats->RP[bin]+1.0)/(stats->RP_N[bin]+1.0));
+
+    if(stats->maxrp < comp) {
+      stats->maxrp = comp;
+    }
+    if(stats->minrp > comp) {
+      stats->minrp = comp;
+    }
+  }
+
+  /*read quality bins; ratios of 1.0 or more are ignored*/
+  for(bin=0; bin < 255; bin++) {
+    if(!(stats->RQ[bin] > 10)) {
+      continue;
+    }
+
+    stats->currq = (((double)stats->RQ[bin]+1.0)/(stats->RQ_N[bin]+1.0));
+
+    if(stats->maxrq < 1.0-stats->currq && stats->currq < 1.0) {
+      stats->maxrq = 1.0-stats->currq;
+    }
+    if(stats->minrq > 1.0-stats->currq && stats->currq < 1.0) {
+      stats->minrq = 1.0-stats->currq;
+    }
+  }
+
+  stats->standardized = 1;
+}
+
+/*----------------------------- bl_matchfileTest -----------------------------
+ *
+ * @brief test a cross section for variation. The fields s_cons, s_consx,
+ *        s_ref, s_refx, p_hom and scr_* of cs are reset first. If stats
+ *        is available and the coverage cs->len lies within
+ *        [mincover, maxcover], the most frequent character is stored in
+ *        cs->cons, the cross section is scored with
+ *        bl_matchfileTestNonVariant and bl_matchfileTestVariant against
+ *        both cs->cons and ref, and the scores are combined into
+ *        cs->scr_ref, cs->scr_cons and cs->scr_sample.
+ * @param space forwarded to the allocation macros and the standardization
+ * @param fidx not read in this function
+ * @param cidx not read in this function
+ * @param pos not read in this function
+ * @param cs cross section; receives all results
+ * @param ref reference character at this position
+ * @param idx match file index; only idx->stats is read
+ * @param show not read in this function
+ * @param nfo not read in this function
+ * @return always 0
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileTest(void *space, Uint fidx, Uint cidx, Uint pos,
+ matchfileCross_t *cs, char ref, matchfileindex_t *idx,
+ unsigned char show, void *nfo)
+{
+ matchfileSampleStats_t *stats = idx->stats;
+ matchfileCrossStats_t css;
+ Uint *cnt, *strands, conscnt, second, secondcnt, secondplus, secondminus, secondminimum;
+ cs->s_cons = 1;
+ cs->s_consx = 0;
+ cs->s_ref = 1;
+ cs->s_refx = 0;
+ cs->p_hom = log(0);
+ cs->scr_ref = log(0);
+ cs->scr_cons = log(0);
+ cs->scr_sample = log(0);
+
+ if(!stats || cs->len < stats->mincover
+ || cs->len > stats->maxcover) {
+ return 0;
+ }
+
+ cnt = bl_matchfileGetNTCounts(cs);
+ strands = bl_matchfileGetPlusCounts(cs);
+
+ cs->cons = (char) uarraymax(cnt, 255); // the result is used as an index into cnt on the next line
+ conscnt = cnt[(int) cs->cons];
+ second = (char) uarraysecond(cnt, 255, cs->cons);
+ secondcnt = cnt[(int) second];
+ assert(secondcnt <= conscnt);
+
+ if(secondcnt == 0) {
+ if(cs->cons != ref) {
+ secondcnt = conscnt; // only one character seen and it differs from ref: test it
+ second = cs->cons;
+ } else {
+ FREEMEMORY(NULL, cnt); // only the reference character seen: nothing to test
+ FREEMEMORY(NULL, strands);
+ return 0;
+ }
+ }
+
+ secondplus = strands[(int) second];
+ secondminus = secondcnt -secondplus;
+ secondminimum = MIN(secondplus, secondminus);
+ secondminimum = secondcnt- secondminimum; // now holds the larger of the two strand counts
+
+ cs->ee = (double)secondcnt/(double)cs->len;
+
+ FREEMEMORY(space, strands);
+ FREEMEMORY(space, cnt);
+
+
+ if(!stats->standardized) bl_matchfileGetStandardization (space, stats);
+
+ cs->p_cons = bl_matchfileTestNonVariant (cs, stats, cs->cons, &css, stats->minrp,
+ stats->maxrp, stats->minrq, stats->maxrq);
+
+ bl_matchfileCrossStatsDestruct (&css);
+
+ cs->p_consx = bl_matchfileTestVariant (cs, stats, cs->cons, &css, stats->minrp,
+ stats->maxrp, stats->minrq, stats->maxrq);
+
+ bl_matchfileCrossStatsDestruct (&css);
+
+
+ cs->p_refx = bl_matchfileTestVariant (cs, stats, ref, &css, stats->minrp,
+ stats->maxrp, stats->minrq, stats->maxrq);
+
+ cs->diff_mm = 0;
+ cs->diff_rt = css.mean_rt; // the diff_* fields hold the means of the variant test against ref
+ cs->diff_rq = css.mean_rq;
+ cs->diff_rr = css.mean_rr;
+ cs->diff_mm = css.mean_mm;
+
+ bl_matchfileCrossStatsDestruct (&css);
+
+ cs->p_ref = bl_matchfileTestNonVariant (cs, stats, ref, &css, stats->minrp,
+ stats->maxrp, stats->minrq, stats->maxrq);
+
+// cs->diff_rt -= css.mean_rt;
+// cs->diff_rq -= css.mean_rq;
+// cs->diff_rr -= css.mean_rr;
+// cs->diff_mm -= css.mean_mm;
+
+ bl_matchfileCrossStatsDestruct (&css);
+
+ if(cs->cons != ref)
+ cs->ee = (double)conscnt /(double)cs->len;
+ else
+ cs->ee = (double)secondcnt/(double)cs->len;
+
+ cs->pee = log(ecdf(cs->ee, stats->ecdf));
+
+ cs->secondminimum= secondminimum;
+ cs->secondcnt = secondcnt;
+
+ if(stats->strand)
+ cs->pbinom = log(pbinom(secondminimum, secondcnt, 0.5, 1));
+ else
+ cs->pbinom = 0;
+
+
+ cs->scr_ref = (cs->p_refx - cs->p_ref) + log(ecdf(cs->ee, stats->ecdf))
+ + cs->pbinom;
+
+ cs->ee = (double)secondcnt/(double)cs->len; // ee is reset before the consensus score is computed
+
+ cs->scr_cons = (cs->p_consx - cs->p_cons) + log(ecdf(cs->ee, stats->ecdf))
+ + cs->pbinom;
+
+ cs->scr_sample = cs->scr_ref;
+
+ //cs->scr_sample = (cs->p_refx - cs->p_ref) + log(ecdf(cs->ee, stats->ecdf))
+ // + cs->pbinom;
+
+
+
+// if (cs->cons != cs->ref && isinf(p_consx) && isinf(p_refx)
+// && cs->p_cons-PX+P > cs->p_ref) {
+// cs->p_hom = cs->p_cons;
+// }
+
+ return 0;
+}
+
+
+
+/*---------------------------- bl_matchfileSNPvcf ----------------------------
+ *    
+ * @brief  print one VCF style record for a position to stdout. ALT lists
+ *         every base of A,C,G,T,N (upper case) that was seen in upper or
+ *         lower case and differs from ref; INFO holds the depth (DP) and
+ *         the count of each listed allele (AC). ";INDEL" is appended if
+ *         there are more '-' characters than alternative bases.
+ * @param  f      frame; supplies the chromosome name and start position
+ * @param  p      offset of the position within the frame
+ * @param  ref    reference character
+ * @param  depth  value printed as DP
+ * @param  cnt    per character counts, indexed by character code
+ * @param  phred  value printed in the QUAL column
+ * @author Steve Hoffmann 
+ *   
+ */
+
+void
+bl_matchfileSNPvcf (matchfileFrame_t *f, Uint p, char ref, Uint depth, Uint *cnt, Uint phred)
+{
+  static const char uppercase[] = "ACGTN";
+  static const char lowercase[] = "acgtn";
+  char id[] = ".";
+  char alt[50] = {0};
+  char info[1000] = {0};
+  Uint b, used, altlen = 0, altcnt = 0;
+
+  used = snprintf(info, 1000, "DP=%d;AC=", depth);
+
+  for(b=0; b < 5; b++) {
+
+    /*skip bases that were not seen and the reference base itself*/
+    if(cnt[(Uint)uppercase[b]] == 0 && cnt[(Uint)lowercase[b]] == 0) {
+      continue;
+    }
+    if(uppercase[b] == ref || lowercase[b] == ref) {
+      continue;
+    }
+
+    /*every allele but the first one is preceded by a comma*/
+    if(altlen > 0) {
+      alt[altlen++] = ',';
+      used += snprintf(info+used, 1000-used, ",");
+    }
+
+    alt[altlen++] = uppercase[b];
+    altcnt += cnt[(Uint)uppercase[b]]+cnt[(Uint)lowercase[b]];
+    used += snprintf(info+used, 1000-used,"%d", cnt[(Uint)uppercase[b]]+cnt[(Uint)lowercase[b]]);
+  }
+
+  if(altcnt < cnt[(Uint)'-']) {
+    used += snprintf(info+used, 1000-used, ";INDEL");
+  }
+
+  printf("%s\t%d\t%s\t",f->chrname, f->start+p, id);
+  printf("%c\t%s\t%d\tPASS\t%s\n",ref, alt, phred, info);
+}
+
+/*----------------------- bl_matchfileVariationHandler -----------------------
+ *    
+ * @brief  handle variation output: if the larger of the two scores exceeds
+ *         cut, or if ret is non-zero, the position is printed with
+ *         bl_matchfileSNPvcf. The quality is 10*score/ln(10), clamped to
+ *         the range that bl_matchfileSNPvcf can print.
+ *         Indels are not collected here; indelvcf is passed through.
+ * @param  f         frame holding the position
+ * @param  p         offset of the position within the frame
+ * @param  scr_ref   score of the test against the reference
+ * @param  scr_cons  score of the test against the consensus
+ * @param  cut       score threshold
+ * @param  ret       non-zero forces the output
+ * @param  indelvcf  returned unchanged
+ * @return indelvcf
+ * @author Steve Hoffmann 
+ *   
+ */
+
+indelvcf_t*
+bl_matchfileVariationHandler (matchfileFrame_t *f, Uint p, double scr_ref, double scr_cons, double cut, 
+    char ret, indelvcf_t *indelvcf)
+{
+  /*bl_matchfileSNPvcf prints the quality with %d, so the upper limit is
+   *the largest value that survives this (half of the Uint range)*/
+  const Uint maxphred = ((Uint)-1) >> 1;
+  double maxscr = MAX(scr_cons, scr_ref);
+  double phred;
+  Uint intphred, depth, *cnt;
+  char ref = f->ref[p];
+
+  if(!(maxscr > cut || ret)) {
+    return indelvcf;
+  }
+
+  phred = 10*(maxscr/log(10));
+
+  /*converting a negative, infinite or NaN double to an unsigned integer
+   *is undefined: clamp explicitly*/
+  if(!(phred > 0)) {
+    intphred = 0;
+  } else if(phred >= (double)maxphred) {
+    intphred = maxphred;
+  } else {
+    intphred = (Uint)phred;
+  }
+
+  depth = bl_matchfileGetCov(&f->cs[p], 1);
+  cnt = bl_matchfileGetNTCounts(&f->cs[p]);
+
+  bl_matchfileSNPvcf (f, p, ref, depth, cnt, intphred);
+
+  FREEMEMORY(NULL, cnt);
+
+  return indelvcf;
+}
+
+
+/*----------------------- bl_matchfileTestGroupsReset ------------------------
+ *    
+ * @brief  reset group tests: the count table of every group is freed and
+ *         all per group values are set to zero. The arrays themselves stay
+ *         allocated.
+ * @param  g  group test container; every g->cnt[i] must be NULL or a
+ *            pointer that may be freed
+ * @author Steve Hoffmann 
+ *   
+ */
+
+void
+bl_matchfileTestGroupsReset (matchfileTestGroups_t *g)
+{
+  Uint i;
+
+  /*this function has no space parameter: NULL is passed to the macro*/
+  for(i=0; i < g->noofgroups; i++) {
+    FREEMEMORY(NULL, g->cnt[i]);
+  }
+
+  memset(g->cnt, 0, sizeof(Uint*)*g->noofgroups);
+  memset(g->p_cons, 0, sizeof(double)*g->noofgroups);
+  memset(g->p_ref, 0, sizeof(double)*g->noofgroups);
+  memset(g->p_consx, 0, sizeof(double)*g->noofgroups);
+  memset(g->p_refx, 0, sizeof(double)*g->noofgroups);
+  memset(g->scr_cons, 0, sizeof(double)*g->noofgroups);
+  memset(g->scr_ref, 0, sizeof(double)*g->noofgroups);
+  memset(g->type, 0, sizeof(char)*g->noofgroups);
+
+  return ;
+}
+
+/*------------------------ bl_matchfileTestGroupsInit ------------------------
+ *    
+ * @brief  init the group tests: allocate all per group arrays for n groups
+ *         and set them to zero
+ * @param  space  forwarded to the allocation macro
+ * @param  g      container to initialise; previous contents are not freed
+ * @param  n      number of groups
+ * @author Steve Hoffmann 
+ *   
+ */
+
+void
+bl_matchfileTestGroupsInit (void *space, matchfileTestGroups_t *g, Uint n)
+{
+  g->noofgroups = n;
+  g->p_cons = ALLOCMEMORY(space, NULL, double, g->noofgroups);
+  g->p_ref = ALLOCMEMORY(space, NULL, double, g->noofgroups);
+  g->p_consx = ALLOCMEMORY(space, NULL, double, g->noofgroups);
+  g->p_refx = ALLOCMEMORY(space, NULL, double, g->noofgroups);
+  g->scr_cons = ALLOCMEMORY(space, NULL, double, g->noofgroups);
+  g->scr_ref = ALLOCMEMORY(space, NULL, double, g->noofgroups);
+  /*cnt holds one pointer per group, so the element type is Uint* */
+  g->cnt = ALLOCMEMORY(space, NULL, Uint*, g->noofgroups);
+  g->type = ALLOCMEMORY(space, NULL, char, g->noofgroups);
+
+  /*the reset frees every g->cnt[i]: the fresh slots must be NULL first*/
+  memset(g->cnt, 0, sizeof(Uint*)*g->noofgroups);
+
+  bl_matchfileTestGroupsReset(g);
+
+  return ;
+}
+
+
+/*---------------------- bl_matchfileTestGroupsDestruct ----------------------
+ *    
+ * @brief  destruct the groups: free the count table of every group and all
+ *         per group arrays. The container g itself is not freed.
+ * @param  space  forwarded to the deallocation macro
+ * @param  g      container to tear down
+ * @author Steve Hoffmann 
+ *   
+ */
+
+void
+bl_matchfileTestGroupsDestruct (void *space, matchfileTestGroups_t *g)
+{
+  Uint i;
+
+  for(i=0; i < g->noofgroups; i++) {
+    FREEMEMORY(space, g->cnt[i]);
+  }
+
+  FREEMEMORY(space, g->scr_cons);
+  FREEMEMORY(space, g->scr_ref);
+  FREEMEMORY(space, g->cnt);
+  FREEMEMORY(space, g->type);
+  FREEMEMORY(space, g->p_cons);
+  FREEMEMORY(space, g->p_ref);
+  FREEMEMORY(space, g->p_consx);
+  FREEMEMORY(space, g->p_refx);
+
+  return ;
+}
+
+
+/*--------------------- bl_matchfileTestGroupsAddResult ----------------------
+ *    
+ * @brief  add a test result to a group: the scores of the cross section
+ *         and a freshly computed nucleotide count table are stored in
+ *         slot no. A result stored there before is replaced.
+ * @param  g   group test container, initialised with
+ *             bl_matchfileTestGroupsInit
+ * @param  no  index of the group
+ * @param  cs  cross section that was tested
+ * @author Steve Hoffmann 
+ *   
+ */
+
+void
+bl_matchfileTestGroupsAddResult (matchfileTestGroups_t *g, Uint no, 
+    matchfileCross_t *cs)
+{
+
+  Uint *cnt = bl_matchfileGetNTCounts(cs);
+
+  g->scr_cons[no] = cs->scr_cons;
+  g->scr_ref[no] = cs->scr_ref;
+  g->p_cons[no] = cs->p_cons;
+  g->p_ref[no] = cs->p_ref;
+  g->p_consx[no] = cs->p_consx;
+  g->p_refx[no] = cs->p_refx;
+
+  /*several files may share one group: release the table of the previous
+   *result instead of losing the pointer*/
+  if(g->cnt[no]) {
+    FREEMEMORY(NULL, g->cnt[no]);
+  }
+  g->cnt[no] = cnt;
+
+  return ;
+}
+
+
+
+/*----------------------- bl_matchfileTestGroupsPrint ------------------------
+ *    
+ * @brief  print group tests. Currently a stub: none of the arguments is
+ *         read and nothing is printed.
+ * @author Steve Hoffmann 
+ *   
+ */
+
+void
+bl_matchfileTestGroupsPrint (matchfileTestGroups_t *g, Uint no, 
+    matchfileFrame_t *f, Uint p)
+{
+  /*intentionally empty*/
+}
+
+/*---------------------- bl_matchfileEvalCrossSections -----------------------
+ *    
+ * @brief  evaluate all cross sections: for every chromosome and every
+ *         position up to the last match position, the test f is run on the
+ *         cross section of each file that covers the position. Without
+ *         groups the result is handed to bl_matchfileVariationHandler;
+ *         with groups it is stored in the group container.
+ * @param  space      forwarded to allocation macros, frame functions and f
+ * @param  files      match files to evaluate
+ * @param  groups     group number of each file, or NULL
+ * @param  nooffiles  number of entries in files (and groups)
+ * @param  set        reference sequences, used for chromosome lengths
+ * @param  framesize  width requested for each frame that is loaded
+ * @param  f          test (space, file index, chromosome index, position,
+ *                    cross section, reference char, index, 0, nfo)
+ * @param  nfo        opaque pointer handed through to f
+ * @author Steve Hoffmann 
+ *   
+ */
+
+
+void
+bl_matchfileEvalCrossSections (void *space, matchfile_t **files, 
+    int *groups, Uint nooffiles, fasta_t *set, Uint framesize, 
+    Uint (*f)(void *, Uint fidx, Uint cidx, Uint pos, matchfileCross_t*, char ref, 
+      matchfileindex_t *, unsigned char, void *), void *nfo)
+{
+  Uint i, j, k, ret, n=0, nchr = 0, curchrom = 0, pos=0, maxgroupno=0, chridx, maxchr;
+  double scr_ref, scr_cons, cut;
+  matchfileFrame_t **frames = NULL;
+  matchfileTestGroups_t *g = NULL;
+  Uint maxcover = 20000;
+  /*handed to and returned by the variation handler: must start defined*/
+  indelvcf_t *indel = NULL;
+
+  for(j=0; j < nooffiles; j++) {
+    nchr = MAX(nchr, files[j]->index->noofchroms);
+  }
+
+  if(groups) {
+
+    g = ALLOCMEMORY(space, NULL, matchfileTestGroups_t, 1);
+    for(i=0; i < nooffiles; i++) {
+      maxgroupno = (groups[i] > maxgroupno) ? groups[i] : maxgroupno;
+    }
+    maxgroupno+=1;
+
+    /*the container is set up once and only reset per position*/
+    bl_matchfileTestGroupsInit(space, g, maxgroupno);
+  }
+
+  frames = ALLOCMEMORY(space, NULL, matchfileFrame_t*, nooffiles);
+  memset(frames, 0, sizeof(matchfileFrame_t*)*nooffiles);
+
+  for(k=0; k < nchr; k++) {
+
+    /*NOTE(review): chromosome k is looked up in every file's index although
+     *nchr is the maximum over all files - confirm all files share the same
+     *chromosome list*/
+    chridx = bl_fastxFindIDIdx(files[0]->index->chromnames[k], set);
+    maxchr = bl_fastaGetSequenceLength(set, chridx);
+
+    for(j=0, n=0; j < nooffiles; j++) {
+      n = MAX(n, files[j]->index->matchend[k]+1);
+    }
+
+    NFO("evaluating chr: %d '%s' len:%d\n", chridx, files[0]->index->chromnames[k], maxchr);
+    n = MIN(n, maxchr);
+
+    /*to increase speed a frame of size framesize is loaded*/
+    for(i=1; i < n; i++) {
+
+      /*start each position with empty groups*/
+      if(groups) {
+        bl_matchfileTestGroupsReset(g);
+      }
+
+      for(j=0; j < nooffiles; j++) {
+
+        if(!(files[j]->index->matchend[k] > 0)) {
+          continue;
+        }
+
+        /*is position on a new chromosome or in a new frame?*/
+        if(!frames[j] || k != curchrom || 
+            i >= frames[j]->start + frames[j]->width) {
+
+          curchrom = k;
+
+          if(frames[j]) { 
+            bl_matchfileDestructFrame(space, frames[j]);
+            frames[j] = NULL;
+          }
+
+          /*positions outside of the matched interval are skipped*/
+          if(files[j]->index->matchend[k] < i || 
+              files[j]->index->matchstart[k] > i) {
+            continue;
+          }
+
+          frames[j] = bl_matchfileGetFrame(space, files[j], 
+              files[j]->index->chromnames[k], i, framesize, set, maxcover, NULL); 
+          bl_matchfileGetConsensus(frames[j]);
+        }
+
+        pos = i - frames[j]->start;
+        frames[j]->cs[pos].p_hom = log(0);
+        ret = 0;
+
+        if(frames[j]->cs[pos].len) {
+          ret = f(space, j, k, i, &frames[j]->cs[pos], frames[j]->ref[pos], 
+              files[j]->index, 0, nfo);
+        }
+
+        scr_ref = frames[j]->cs[pos].scr_ref;
+        scr_cons = frames[j]->cs[pos].scr_cons;
+        cut = files[j]->index->stats->cut;
+
+        if(!groups) { 
+          /*determine type of variation*/
+          indel = bl_matchfileVariationHandler (frames[j], pos, scr_ref, scr_cons, cut, ret, indel);
+        }
+
+        if(groups && frames[j]->cs[pos].len){
+          bl_matchfileTestGroupsAddResult (g, groups[j], &frames[j]->cs[pos]);
+        }
+      }
+    }
+
+    for(j=0; j < nooffiles; j++){
+      if(frames[j]) bl_matchfileDestructFrame(space,frames[j]);
+      frames[j] = NULL;
+    }
+  }
+
+
+  if(groups) {
+    bl_matchfileTestGroupsDestruct(space, g);
+    FREEMEMORY(space, g);
+  }
+
+  FREEMEMORY(space, frames);
+  return ;
+}
+
+
+
+
+
+/*-------------------------- bl_matchfileConsensus --------------------------
+ *    
+ * @brief a simple consensus calling based on majority voting,
+ *        considering deletions as well (not done by bl_matchfileGetConsensus).
+ *        The deletion strings of the cross section (with '^' removed) are
+ *        counted; if the most frequent one is supported by at least half
+ *        of the reads, a deletion line is printed. The line with reference
+ *        and consensus character is always printed.
+ * @param space forwarded to the allocation macros
+ * @param fidx  index of the match file in the array passed as nfo
+ * @param cidx  index of the chromosome in the file's index
+ * @param pos   position that is printed
+ * @param cs    cross section to evaluate
+ * @param ref   not read; cs->ref is printed instead
+ * @param idx   not read
+ * @param show  not read
+ * @param nfo   array of match files (matchfile_t **)
+ * @return always 0
+ * @author Christian Otto
+ *   
+ */
+Uint
+bl_matchfileConsensus ( void *space, Uint fidx, Uint cidx, Uint pos, matchfileCross_t *cs, 
+    char ref, matchfileindex_t *idx, unsigned char show, void *nfo )
+{
+  Uint i, j, k, max, *cnt = NULL, len = 0;
+  char *del, **dels = NULL;
+
+  matchfile_t **files = (matchfile_t **) nfo;
+
+  /* compile deletion string freq table */
+  for (i = 0; i < cs->noofdels; i++){
+    del = ALLOCMEMORY(space, NULL, char, cs->dels[i].len + 1);
+    for (j = 0, k = 0; j < cs->dels[i].len; j++){
+      if (cs->dels[i].string[j] != '^')
+        del[k++] = cs->dels[i].string[j];
+    }
+    del[k] = '\0';
+
+    /* stop at the first hit: j == len must mean "not seen before" */
+    for (j = 0; j < len; j++){
+      if (strcmp(dels[j], del) == 0){
+        cnt[j]++;
+        break;
+      }
+    }
+
+    if (j == len){
+      /* new deletion string: the table takes ownership of del */
+      dels = ALLOCMEMORY(space, dels, char*, len+1);
+      cnt = ALLOCMEMORY(space, cnt, Uint, len+1);
+      dels[len] = del;
+      cnt[len] = 1;
+      len++;
+    } else {
+      FREEMEMORY(space, del);
+    }
+  }
+
+  if (len > 0){
+    max = uarraymax(cnt, len);
+
+    if (2 * cnt[max] >= cs->len){
+      fprintf(stdout, "%s\t%d\t-\t%s\t%d\n", files[fidx]->index->chromnames[cidx],
+          pos, dels[max], cs->len);
+    }
+  }
+
+  fprintf(stdout, "%s\t%d\t%c\t%c\t%d\n", files[fidx]->index->chromnames[cidx],
+      pos, cs->ref, cs->cons, cs->len);
+
+  if (dels){
+    for (i = 0; i < len; i++){
+      FREEMEMORY(space, dels[i]);
+    }
+    FREEMEMORY(space, dels);
+    FREEMEMORY(space, cnt);
+  }
+  return 0;
+}
+
+/*----------------------------- bl_matchfileGetCov -----------------------------
+ *    
+ * @brief  get coverage in cross section w/ or w/o considering deleted bases
+ * @param  cs        cross section
+ * @param  allowdel  non-zero: return cs->len; zero: return cs->len minus
+ *                   the number of '-' characters
+ * @return the coverage
+ * @author Christian Otto
+ *   
+ */
+Uint
+bl_matchfileGetCov(matchfileCross_t *cs, unsigned char allowdel){
+  Uint total = cs->len;
+  Uint *counts = bl_matchfileGetNTCounts(cs);
+  Uint gaps = counts[(Uint)'-'];
+
+  free(counts);
+
+  if (allowdel){
+    return total;
+  }
+  return total - gaps;
+}
+
+/*--------------------------- bl_matchfileGetContext ---------------------------
+ *    
+ * @brief  get sequence context of given length w/r/t to given strand.
+ *         On PLUS_STRAND the context starts at pos and runs to the right;
+ *         on MINUS_STRAND it starts at pos, runs to the left and is
+ *         complemented. Positions beyond the sequence stay 'N'. For any
+ *         other strand value, pos 0, pos beyond the sequence or an unknown
+ *         sequence index the context consists of 'N' only.
+ * @param  fasta   reference sequences
+ * @param  idx     index of the sequence
+ * @param  pos     position in the sequence, starting with 1
+ * @param  strand  PLUS_STRAND or MINUS_STRAND
+ * @param  len     length of the context
+ * @return newly allocated, null terminated string of length len; the
+ *         caller frees it
+ * @author Christian Otto
+ *   
+ */
+char *
+bl_matchfileGetContext(fasta_t *fasta, Uint idx, Uint pos, Uint strand, Uint len){
+  Uint i, seqlen;
+  char *seq, *context;
+
+  /* this function has no space parameter: NULL is passed to the macro */
+  context = ALLOCMEMORY(NULL, NULL, char, len+1);
+  memset(context, 'N', len);
+  context[len] = '\0';
+
+  /* pos 0 is not a valid position; without this guard pos+i-1 wraps
+   * around for i = 0 and the plus strand context comes out shifted */
+  if (!strand || pos == 0 || idx >= fasta->noofseqs)
+    return context;
+
+  seq = bl_fastaGetSequence(fasta, idx);
+  seqlen = bl_fastaGetSequenceLength(fasta, idx);
+
+  if (pos > seqlen)
+    return context;
+
+  for (i = 0; i < len; i++){
+    if (strand == PLUS_STRAND && pos+i-1 < seqlen){
+      context[i] = seq[pos+i-1];
+    }
+    else if (strand == MINUS_STRAND && pos >= i+1) {
+      context[i] = charComplementChar(seq[pos-i-1]);
+    }
+  }
+  return context;
+}
+
diff --git a/segemehl/libs/evalmatchfiles.h b/segemehl/libs/evalmatchfiles.h
new file mode 100644
index 0000000..a3b5627
--- /dev/null
+++ b/segemehl/libs/evalmatchfiles.h
@@ -0,0 +1,131 @@
+#ifndef EVALMATCHFILE_H
+#define EVALMATCHFILE_H
+
+/*
+ *
+ * evalmatchfiles.h
+ * evaluate matchfiles
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 10/14/2010 12:07:57 AM CEST
+ *
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 418 $
+ * Author: $Author: steve $
+ * Date: $Date: 2015-01-05 05:17:35 -0500 (Mon, 05 Jan 2015) $
+ * Id: $Id: evalmatchfiles.h 418 2015-01-05 10:17:35Z steve $
+ * Url: $URL: http://www2.bioinf.uni-leipzig.de/svn5/segemehl/libs/evalmatchfiles.h $
+ */
+
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "basic-types.h"
+#include "matchfiles.h"
+#include "biofiles.h"
+
+
+#define MAX_STARTSITES 20000
+#define EPSILON_NV 0.0000001
+#define SDFRACTION 3
+#define MINSUBPROB -4.0
+#define MINRVPROB -3.0
+#define MINMMPROB -3.0 /* lower bound of the multiple match log term (var_mm) */
+#define MINEEPROB -7.0
+#define STRANDPENALTY -3.0
+#define QUALFACTOR 1.0 /* factor applied to the quality log term in bl_matchfileTestCons */
+#define PLUS_STRAND 1 /* strand value: context is read to the right (bl_matchfileGetContext) */
+#define MINUS_STRAND (1 << 1) /* strand value: context is read to the left and complemented */
+#define BOTH_STRANDS (PLUS_STRAND | MINUS_STRAND)
+#define EEWEIGHT .0
+#define SPWEIGHT 0.75
+#define RQWEIGHT 1.0
+#define RRWEIGHT -1.0
+#define RTWEIGHT 1500.0
+#define MMWEIGHT 1.5
+
+
+typedef struct { /* test results per group of match files; every array has noofgroups entries */
+ Uint noofgroups; /* number of groups */
+ char *type; /* zeroed on reset; presumably a variation type per group - TODO confirm */
+ double *scr_cons; /* scr_cons of the cross section added last */
+ double *scr_ref; /* scr_ref of the cross section added last */
+ double *p_consx; /* p_consx of the cross section added last */
+ double *p_cons; /* p_cons of the cross section added last */
+ double *p_refx; /* p_refx of the cross section added last */
+ double *p_ref; /* p_ref of the cross section added last */
+ Uint **cnt; /* nucleotide count table (bl_matchfileGetNTCounts) per group, owned by the container */
+} matchfileTestGroups_t;
+
+
+typedef struct { /* NOTE(review): state for collecting an indel VCF record; no visible code fills it - confirm the field meanings */
+ char *ref; /* presumably the reference string of the indel */
+ char *phreds; /* presumably one quality per collected position */
+ char len; /* presumably the number of collected positions */
+ char **alleles; /* presumably the allele counts per collected position */
+ Uint pos; /* presumably the start position of the record */
+
+} indelvcf_t;
+
+extern char * ntcode;
+extern char * ntdecode;
+
+ void /* run test f on every covered cross section of all files; groups may be NULL */
+bl_matchfileEvalCrossSections (void *space, matchfile_t **files, int *groups, Uint nooffiles, fasta_t *set, Uint framesize, 
+    Uint (*f)(void *, Uint fidx, Uint cidx, Uint pos, matchfileCross_t*, char ref, matchfileindex_t *, unsigned char, void *), void *nfo);
+
+matchfileFrameStats_t *bl_matchfileFrameStats (void *space, matchfileFrame_t *frame);
+void bl_matchfileDestructFrame(void *space, matchfileFrame_t *frame);
+void bl_matchfileGetConsensus(matchfileFrame_t *frame);
+Uint* bl_matchfileGetNTCounts(matchfileCross_t *cs); /* count table indexed by character code; callers free the result */
+double* bl_matchfileGetNTError(matchfileFrame_t *frame, Uint pos);
+double* bl_matchfileGetNTRedundancy(matchfileFrame_t *frame, Uint pos);
+double* bl_matchfileGetNTEdist(matchfileFrame_t *frame, Uint pos);
+double* bl_matchfileGetNTReadPos(matchfileFrame_t *frame, Uint pos);
+double* bl_matchfileGetNTReadPosVar(matchfileCross_t *); /* callers free the result */
+matchfileFrameStats_t* bl_matchfileFrameStats(void *space, matchfileFrame_t *frame); /* same declaration as at the top of this list */
+void bl_matchfileDestructFrameStats(void *space, matchfileFrameStats_t *stats);
+void bl_matchfileRSSGNUPLOT(void *space, matchfileFrame_t *frame, matchfileFrameStats_t *stats);
+void bl_matchfileCOVGNUPLOT(void *space, matchfileFrame_t *frame);
+extern FILE *popen( const char *command, const char *modes);
+extern int pclose(FILE *stream);
+Uint bl_matchfileSampleCmp (Uint elemA, Uint elemB, void *toSort, void *info);
+void bl_matchfileGetErrorDensity(void *space, matchfileFrame_t *frame, Uint pos, matchfileFrameStats_t *, void *nfo);
+Uint bl_matchfileTest(void *space, Uint fidx, Uint cidx, Uint pos, matchfileCross_t *cs, char ref, matchfileindex_t *, unsigned char, void *nfo); /* variation test; writes its scores into cs and returns 0 */
+Uint bl_matchfileSampleCrossSections(void *space, matchfile_t *file, fasta_t *set, Uint n,
+ void (*f)(void *, matchfileFrame_t*, Uint, matchfileFrameStats_t *, void *), unsigned char **maps, Uint *mapsize, void *info);
+void bl_matchfileGetConditionals (void *space, matchfileFrame_t *frame,
+ Uint pos, matchfileFrameStats_t *stats, void *nfo);
+matchfileSampleStats_t*
+bl_matchfileInitSampleStats (void *space, Uint maxsample, Uint maxcover, Uint mincover, double minfrac, char entropyfilter,
+ Uint areasize, double maxareae);
+void bl_matchfileDestructSampleStats (void *space, matchfileSampleStats_t *stats);
+void bl_matchfileDumpSampleStats (matchfileSampleStats_t *stats);
+void bl_matchfileFitGEV (void *space, matchfileSampleStats_t *stats);
+unsigned char** bl_matchfileSmallMap (void *space, matchfile_t* file, Uint **mapsize);
+Uint bl_matchfileSimpleGEV (void *space, Uint fidx, Uint cidx, Uint pos, matchfileCross_t *cs,
+ char ref, matchfileindex_t *idx, unsigned char show, void *nfo);
+Uint bl_matchfileSimpleGATK ( void *space, Uint fidx, Uint cidx, Uint pos, matchfileCross_t *cs,
+ char ref, matchfileindex_t *idx, unsigned char show, void *nfo);
+Uint bl_matchfileConsensus ( void *space, Uint fidx, Uint cidx, Uint pos, matchfileCross_t *cs,
+ char ref, matchfileindex_t *idx, unsigned char show, void *nfo); /* majority consensus incl. deletions; nfo is the array of match files */
+Uint bl_matchfileGetCov(matchfileCross_t *cs, unsigned char allowdel); /* coverage; '-' characters are subtracted unless allowdel is set */
+char *bl_matchfileGetContext(fasta_t *fasta, Uint idx, Uint pos, Uint strand, Uint len); /* newly allocated context string of length len */
+void bl_matchfileGetScoreSample (void *space, matchfileFrame_t *frame, Uint pos, matchfileFrameStats_t *framestats, void *nfo);
+
+void
+bl_matchfileCrossStatsDestruct (matchfileCrossStats_t *css);
+void
+bl_matchfileCrossStatsInit (matchfileCrossStats_t *css, Uint len);
+void
+bl_matchfileCensus (void *space, matchfile_t *file,
+ fasta_t *set, Uint framesize,
+ void (*f)(void *, matchfileFrame_t*, Uint, matchfileFrameStats_t *, void *), void *nfo); /* calls f on every covered cross section of one file */
+
+#endif
diff --git a/segemehl/libs/evalmatchfileshelper.c b/segemehl/libs/evalmatchfileshelper.c
new file mode 100644
index 0000000..cbb57d1
--- /dev/null
+++ b/segemehl/libs/evalmatchfileshelper.c
@@ -0,0 +1,1705 @@
+
+/*
+ * evalmatchfileshelper.c
+ * helper functions
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 12/24/2013 02:48:45 AM CET
+ *
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include <math.h>
+#include <time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "sort.h"
+#include "alignment.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "matfile.h"
+#include "bitVector.h"
+#include "info.h"
+#include "vtprogressbar.h"
+#include "fileio.h"
+#include "matchfilesfields.h"
+#include "matchfiles.h"
+#include "debug.h"
+#include "evalmatchfiles.h"
+#include "biofiles.h"
+#include "splicesites.h"
+
+
+/*-------------------------- bl_matchfileSimpleGEV ---------------------------
+ *
+ * @brief  simple GEV calling: scores "no variant" against "variant" for one
+ *         cross section and prints the site to stdout when "variant" wins
+ * @param  space  not used
+ * @param  fidx   not used
+ * @param  cidx   not used
+ * @param  pos    position printed together with a call
+ * @param  cs     cross section; len, chars and quals are read
+ * @param  ref    reference character the read characters are compared with
+ * @param  idx    index; idx->stats provides gev_mu[0], gev_si[0], gev_xi[0]
+ * @param  show   not used
+ * @param  nfo    points to a char flag; non-zero adds the quality terms
+ * @return always 0
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileSimpleGEV (void *space, Uint fidx, Uint cidx, Uint pos, matchfileCross_t *cs,
+    char ref, matchfileindex_t *idx, unsigned char show, void *nfo )
+{
+  matchfileSampleStats_t *samplestats = idx->stats;
+  char *withqual = (char*) nfo;
+  char *bases, *phred;
+  double lnull = .0, lvar = .0, qnull = .0, qvar = .0;
+  double rate, perr, cdf;
+  Uint depth, k, mismatches = 0;
+
+  depth = cs->len;
+  if(depth < 2) {
+    return 0;
+  }
+
+  bases = cs->chars;
+  phred = cs->quals;
+
+  for(k=0; k < depth; k++) {
+    if((int)bases[k] == (int)ref) {
+      continue;
+    }
+
+    mismatches++;
+
+    if(*withqual) {
+      //error probability derived from the quality character (offset 64)
+      //high error probability favors the null score, low favors the variant
+      perr = pow(10.0,((double)((double)phred[k]-64.0)/-10.0));
+      qnull += log(perr)+log(0.9999);
+      qvar += log(1.0-perr)+log(0.0001);
+    }
+  }
+
+  if(!mismatches) {
+    return 0;
+  }
+
+  //mismatch rate of the cross section evaluated under the raw-error GEV
+  rate = (double)mismatches/(double)depth;
+  cdf = gevcdf(rate,samplestats->gev_mu[0],samplestats->gev_si[0],samplestats->gev_xi[0]);
+  lnull = log(0.999)+log(1-cdf)+qnull;
+  lvar = log(0.001)+log(cdf)+qvar;
+
+  if(lvar > lnull) {
+    fprintf(stdout, "chr21\t%d\t%d\t%f\t%f\t%f\t%f\t%f\t%s\n", pos, depth, rate, lnull, lvar, qnull, qvar, cs->chars);
+  }
+  fflush(stdout);
+
+  return 0;
+}
+
+
+
+/*-------------------------- bl_matchfileSimpleGATK --------------------------
+ *
+ * @brief a simple GATK implementation w/o fragment stuff
+ * @author Steve Hoffmann
+ * @info modeled after https://github.com/broadgsa/gatk/blob/master/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
+ *
+ *
+ * Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object
+ * calculates:
+ *
+ * P(G | D) = P(G) * P(D | G)
+ *
+ * where
+ *
+ * P(D | G) = sum_i log10 P(bi | G)
+ *
+ * and
+ *
+ * P(bi | G) = 1 - P(error | q1) if bi is in G
+ *           = P(error | q1) / 3 if bi is not in G
+ *
+ * for homozygous genotypes and for heterozygous genotypes:
+ *
+ * P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G
+ *           = P(error | q1) / 3 if bi is not in G
+ *
+ * for each of the 10 unique diploid genotypes AA, AC, AG, .., TT
+ *
+ * Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space.
+ *
+ * @param  space  passed to ALLOCMEMORY/FREEMEMORY
+ * @param  cidx   printed with a call
+ * @param  pos    printed with a call
+ * @param  cs     cross section; len, chars and quals are read
+ * @param  ref    reference character; selects the start genotype
+ * @param  fidx, idx, show, nfo  not used
+ * @return always 0; calls are written to stdout
+ */
+
+Uint
+bl_matchfileSimpleGATK ( void *space, Uint fidx, Uint cidx, Uint pos, matchfileCross_t *cs,
+    char ref, matchfileindex_t *idx, unsigned char show, void *nfo )
+{
+
+  char *ch, *rq, n1, n2;
+  double *gtslik = NULL, bl, perr;
+  double HETLIK = log10(10e-3);
+  char *gts[] = {"AA", "AC", "AG", "AT", "CC", "CG", "CT", "GG", "GT", "TT"};
+  char *obs;
+  Uint len, i, j, bj=0;
+
+  len = cs->len;
+  if(len < 2) return 0;
+
+  ch = cs->chars;
+  rq = cs->quals;
+
+  //homozygous genotypes start at 0, heterozygous ones at the prior HETLIK
+  gtslik = ALLOCMEMORY(space, NULL, double, 10);
+  memset(gtslik, 0, sizeof(double)*10);
+  gtslik[1] = HETLIK;
+  gtslik[2] = HETLIK;
+  gtslik[3] = HETLIK;
+  gtslik[5] = HETLIK;
+  gtslik[6] = HETLIK;
+  gtslik[8] = HETLIK;
+
+  //obs[c] is set if character c occurs in the cross section
+  obs = ALLOCMEMORY(space, NULL, char, 256);
+  memset(obs, 0, sizeof(char)*256);
+
+
+  //AA, AC AG, AT, CC, CG, CT, GG, GT, TT
+  // 0,  1, 2,  3,  4,  5,  6,  7,  8,  9
+
+  for(j=0; j < 10; j++) {
+    n1 = gts[j][0];
+    n2 = gts[j][1];
+
+    for(i=0; i < len; i++) {
+      obs[(int)ch[i]] = 1;
+      //error probability from the quality character (offset 64)
+      perr = pow(10.0,((double)((double)rq[i]-64.0)/-10.0));
+
+      if(n1 == n2) {
+        if((int)ch[i] != (int)n1) {
+          gtslik[j] += log10(perr)-log10(3.0);
+        } else {
+          gtslik[j] += log10(1.0-perr);
+        }
+      } else {
+        if((int)ch[i] != (int)n1 && (int)ch[i] != (int)n2) {
+          gtslik[j] += log10(perr)-log10(3.0);
+        } else {
+          gtslik[j] += log10(1.0-(perr/2) + (perr/6));
+        }
+      }
+    }
+  }
+
+  //start with the homozygous reference genotype (AA for unknown ref)
+  switch(ref) {
+    case 'A':
+    case 'a':
+      bj = 0;
+      break;
+    case 'C':
+    case 'c':
+      bj = 4;
+      break;
+    case 'G':
+    case 'g':
+      bj = 7;
+      break;
+    case 'T':
+    case 't':
+      bj = 9;
+      break;
+    default:
+      bj = 0;
+  }
+
+  bl = gtslik[bj];
+
+  //a genotype replaces the current best only if both alleles were observed
+  for(j=0; j < 10; j++) {
+    //NOTE(review): hard-coded position, looks like debugging residue
+    if(pos == 9413199) fprintf(stdout, "GT:%s\t%f\n", gts[j], gtslik[j]);
+    if(gtslik[j] > bl && gtslik[j] != 0.0 && obs[(int)gts[j][0]] && obs[(int)gts[j][1]]) {
+      bl = gtslik[j];
+      bj = j;
+    }
+  }
+
+  n1 = gts[bj][0];
+  n2 = gts[bj][1];
+
+  if(ref != (int) 'N' && (n1 != (int)ref || n2 != (int)ref)) {
+    fprintf(stdout, "CS[%d]:%s\n", len, ch);
+    fprintf(stdout, "%d\t%d\t%c\t%c\t%c\n", cidx, pos, ref, n1, n2);
+    //NOTE(review): terminates the process for one hard-coded position
+    if(pos == 9413199) exit(-1);
+  }
+
+  //obs was never released before: one 256 byte leak per call
+  FREEMEMORY(space, obs);
+  FREEMEMORY(space, gtslik);
+  return 0;
+}
+
+/*----------------- bl_matchfileGetCrossConsErrorQualScaled -----------------
+ *
+ * @brief  quality scaled error of a cross section relative to its consensus
+ *         (not the reference): summed error probabilities of characters that
+ *         differ from the consensus divided by the summed (1 - error
+ *         probability) of characters that agree with it
+ * @param  frame  frame holding the cross sections
+ * @param  pos    index of the cross section within the frame
+ * @return the ratio, or 0 if the cross section is empty or the divisor is 0
+ * @author Steve Hoffmann
+ *
+ */
+
+double
+bl_matchfileGetCrossConsErrorQualScaled (matchfileFrame_t *frame, Uint pos)
+{
+  matchfileCross_t *cross = &frame->cs[pos];
+  double agree = .0, differ = .0, perr;
+  Uint k;
+
+  for(k=0; k < cross->len; k++) {
+    //error probability from the quality character (offset 64)
+    perr = pow(10,((double)((double)cross->quals[k]-64.0)/-10.0));
+
+    if(cross->chars[k] == cross->cons) {
+      agree += 1.0-perr;
+    } else {
+      differ += perr;
+    }
+  }
+
+  if(!cross->len || !agree) {
+    return .0;
+  }
+
+  return differ/agree;
+}
+
+
+
+/*------------------------ bl_matchfileGetStrandBias -------------------------
+ *
+ * @brief  fraction of entries of a cross section that lie on the '+' strand
+ * @param  frame  frame holding the cross sections
+ * @param  pos    index of the cross section within the frame
+ * @return number of '+' strand entries divided by the cross section length;
+ *         0 for an empty cross section
+ * @author Steve Hoffmann
+ *
+ */
+
+double
+bl_matchfileGetStrandBias (matchfileFrame_t *frame, Uint pos)
+{
+  matchfileCross_t *cross = &frame->cs[pos];
+  double plus = 0;
+  Uint k = 0;
+
+  while(k < cross->len) {
+    if(cross->strands[k] == '+') {
+      plus += 1;
+    }
+    k++;
+  }
+
+  if(!cross->len) {
+    return plus;
+  }
+
+  return plus / cross->len;
+}
+/*----------------------- bl_matchfileGetNTRedundancy ------------------------
+ *
+ * @brief  for each character of a cross section: the mean of the matchcnt
+ *         values (multiple read hit count) of the entries with that character
+ * @param  frame  frame holding the cross sections
+ * @param  pos    index of the cross section within the frame
+ * @return newly allocated array of 256 doubles indexed by character; entries
+ *         of characters that do not occur are 0; the caller releases it
+ * @author Steve Hoffmann
+ *
+ */
+
+double*
+bl_matchfileGetNTRedundancy(matchfileFrame_t *frame, Uint pos) {
+  matchfileCross_t *cross = &frame->cs[pos];
+  double *mean;
+  Uint *cnt, k;
+  int nt;
+
+  mean = ALLOCMEMORY(space, NULL, double, 256);
+  cnt = ALLOCMEMORY(space, NULL, Uint, 256);
+  memset(mean, 0, sizeof(double)*256);
+  memset(cnt, 0, sizeof(Uint)*256);
+
+  //sum up per character
+  for(k=0; k < cross->len; k++) {
+    nt = (int)cross->chars[k];
+    mean[nt] += (int)cross->matchcnt[k];
+    cnt[nt]++;
+  }
+
+  //turn sums into means
+  for(k=0; k < 256; k++) {
+    if(cnt[k]) {
+      mean[k] /= cnt[k];
+    }
+  }
+
+  FREEMEMORY(space, cnt);
+  return mean;
+}
+
+
+
+/*-------------------------- bl_matchfileGetNTError --------------------------
+ *
+ * @brief  for each character of a cross section: log10 of the mean of
+ *         10^(qual/-10) over the entries with that character, accumulated
+ *         in log10 space with log10add
+ * @param  frame  frame holding the cross sections
+ * @param  pos    index of the cross section within the frame
+ * @return newly allocated array of 256 doubles indexed by character; entries
+ *         of characters that do not occur keep the value log10(0); the
+ *         caller releases it
+ * @author Steve Hoffmann
+ *
+ */
+
+double*
+bl_matchfileGetNTError(matchfileFrame_t *frame, Uint pos) {
+  matchfileCross_t *cross = &frame->cs[pos];
+  double *logerr;
+  Uint *cnt, k;
+  int nt;
+
+  logerr = ALLOCMEMORY(space, NULL, double, 256);
+  cnt = ALLOCMEMORY(space, NULL, Uint, 256);
+  memset(cnt, 0, sizeof(Uint)*256);
+
+  //log10(0) is the neutral element of the log space summation
+  for(k=0; k < 256; k++) {
+    logerr[k] = log10(0);
+  }
+
+  //NOTE(review): the quality value is used without the offset of 64 that
+  //other functions in this file subtract - confirm this is intended
+  for(k=0; k < cross->len; k++) {
+    nt = (int)cross->chars[k];
+    logerr[nt] = log10add(logerr[nt], (((double)cross->quals[k])/-10.));
+    cnt[nt]++;
+  }
+
+  //division by the count in log space
+  for(k=0; k < 256; k++) {
+    if(cnt[k]) {
+      logerr[k] -= log10(cnt[k]);
+    }
+  }
+
+  FREEMEMORY(space, cnt);
+  return logerr;
+}
+
+/*--------------------------- bl_matchfileGetNTQual ----------------------------
+ *
+ * @brief  get for each NT the average qual in a cross section
+ * @param  frame  frame holding the cross sections
+ * @param  pos    index of the cross section within the frame
+ * @return newly allocated array of 256 doubles indexed by character; entries
+ *         of characters that do not occur are 0; the caller releases it
+ * @author Steve Hoffmann
+ *
+ */
+
+
+double*
+bl_matchfileGetNTQual(matchfileFrame_t *frame, Uint pos) {
+  Uint j, *c;
+  double *r;
+
+  r = ALLOCMEMORY(space, NULL, double, 256);
+  c = ALLOCMEMORY(space, NULL, Uint, 256);
+  //clear the whole accumulator: r holds doubles, not Uints
+  memset(r, 0, sizeof(double)*256);
+  memset(c, 0, sizeof(Uint)*256);
+
+  for(j=0; j < frame->cs[pos].len; j++) {
+    r[(int)frame->cs[pos].chars[j]]
+      += frame->cs[pos].quals[j];
+    c[(int)frame->cs[pos].chars[j]]++;
+  }
+
+  for(j=0; j < 256; j++) {
+    if(c[j]) r[j] /= c[j];
+  }
+
+  FREEMEMORY(space, c);
+  return r;
+}
+
+
+/*------------------------- bl_matchfileGetNTReadPos -------------------------
+ *
+ * @brief  get for each NT the average read position in a cross section
+ * @param  frame  frame holding the cross sections
+ * @param  pos    index of the cross section within the frame
+ * @return newly allocated array of 256 doubles indexed by character; entries
+ *         of characters that do not occur are 0; the caller releases it
+ * @author Steve Hoffmann
+ *
+ */
+
+
+double*
+bl_matchfileGetNTReadPos(matchfileFrame_t *frame, Uint pos) {
+  Uint j, *c;
+  double *r;
+
+  r = ALLOCMEMORY(space, NULL, double, 256);
+  c = ALLOCMEMORY(space, NULL, Uint, 256);
+  //clear the whole accumulator: r holds doubles, not Uints
+  memset(r, 0, sizeof(double)*256);
+  memset(c, 0, sizeof(Uint)*256);
+
+  for(j=0; j < frame->cs[pos].len; j++) {
+    r[(int)frame->cs[pos].chars[j]]
+      += frame->cs[pos].readpos[j];
+    c[(int)frame->cs[pos].chars[j]]++;
+  }
+
+  for(j=0; j < 256; j++) {
+    if(c[j]) r[j] /= c[j];
+  }
+
+  FREEMEMORY(space, c);
+  return r;
+}
+
+
+/*----------------------- bl_matchfileGetNTReadPosVar ------------------------
+ *
+ * @brief  for each character of a cross section: var() of the relative read
+ *         positions (readpos divided by readlen) of the entries with that
+ *         character
+ * @param  cs  the cross section; len, chars, readpos and readlen are read
+ * @return newly allocated array of 256 doubles indexed by character; entries
+ *         of characters that do not occur are 0; the caller releases it
+ * @author Steve Hoffmann
+ *
+ */
+
+
+double*
+bl_matchfileGetNTReadPosVar(matchfileCross_t *cs) {
+  double **samples, *variance;
+  int *cnt, k, nt;
+
+  samples = ALLOCMEMORY(space, NULL, double*, 256);
+  cnt = ALLOCMEMORY(space, NULL, Uint, 256);
+  variance = ALLOCMEMORY(space, NULL, double, 256);
+  memset(samples, 0, sizeof(double*)*256);
+  memset(cnt, 0, sizeof(Uint)*256);
+  memset(variance, 0, sizeof(double)*256);
+
+  //collect the relative read positions per character; each list grows by
+  //one element per entry
+  for(k=0; k < cs->len; k++) {
+    nt = (int)cs->chars[k];
+
+    samples[nt] = ALLOCMEMORY(space, samples[nt], double, cnt[nt]+1);
+    samples[nt][cnt[nt]] = (double)cs->readpos[k] / (double)cs->readlen[k];
+    cnt[nt]++;
+  }
+
+  for(k=0; k < 256; k++) {
+    if(cnt[k]) {
+      variance[k] = var(samples[k],cnt[k]);
+    }
+    FREEMEMORY(space, samples[k]);
+  }
+
+  FREEMEMORY(space, samples);
+  FREEMEMORY(space, cnt);
+  return variance;
+}
+
+
+
+
+/*-------------------------- bl_matchfileGetNTEdist --------------------------
+ *
+ * @brief  for one cross section of the frame: get for each NT the average
+ *         edist of the reads showing that NT
+ * @param  frame  frame holding the cross sections
+ * @param  pos    index of the cross section within the frame
+ * @return newly allocated array of 256 doubles indexed by character; entries
+ *         of characters that do not occur are 0; the caller releases it
+ * @author Steve Hoffmann
+ *
+ */
+
+double*
+bl_matchfileGetNTEdist(matchfileFrame_t *frame, Uint pos) {
+  Uint j, *c;
+  double *e;
+
+  e = ALLOCMEMORY(space, NULL, double, 256);
+  c = ALLOCMEMORY(space, NULL, Uint, 256);
+  //clear the whole accumulator: e holds doubles, not Uints
+  memset(e, 0, sizeof(double)*256);
+  memset(c, 0, sizeof(Uint)*256);
+
+
+  for(j=0; j < frame->cs[pos].len; j++) {
+    e[(int)frame->cs[pos].chars[j]]
+      += (int)frame->cs[pos].edist[j];
+    c[(int)frame->cs[pos].chars[j]]++;
+  }
+
+  for(j=0; j < 256; j++) {
+    if(c[j]) e[j] /= c[j];
+  }
+
+  FREEMEMORY(space, c);
+  return e;
+}
+
+
+
+/*---------------------- bl_matchfileGetRegionMotif ----------------------
+ *
+ * @brief  encode the consensus characters around a position as a base 4
+ *         number (A=0, C=1, G=2, T=3). The window is read from
+ *         frame->cs[pos-left-1] to frame->cs[pos+right-1]; the last
+ *         character of the window is the least significant digit
+ * @param  frame  frame holding the cross sections
+ * @param  pos    position within the frame
+ * @param  left   window extension to the left
+ * @param  right  window extension to the right
+ * @return the motif code; -1 (converted to Uint) if the window does not fit
+ *         into the frame or contains a consensus character other than ACGT
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileGetRegionMotif (matchfileFrame_t *frame,
+    Uint pos, Uint left, Uint right)
+{
+  Uint first, last, cur, digit, code=0;
+
+  assert(frame->width > right);
+
+  //window has to be located inside the frame
+  if(left >= pos) {
+    return -1;
+  }
+
+  if(pos + right >= frame->width-1) {
+    return -1;
+  }
+
+  first = pos - left;
+  last = pos + right;
+  cur = last;
+
+  //first > 0 holds here, thus the unsigned counter terminates the loop
+  while(cur >= first) {
+
+    switch(frame->cs[cur-1].cons) {
+      case 'A':
+        digit = 0;
+        break;
+      case 'C':
+        digit = 1;
+        break;
+      case 'G':
+        digit = 2;
+        break;
+      case 'T':
+        digit = 3;
+        break;
+      default:
+        return -1;
+    }
+
+    if(digit) {
+      code += digit*pow(4,(last-cur));
+    }
+
+    cur--;
+  }
+
+  return code;
+}
+
+/*---------------------- bl_matchfileGetRegionConsError ----------------------
+ *
+ * @brief  error of a region of cross sections relative to the consensus
+ *         (not the reference): for each non-empty cross section in the
+ *         window the fraction of characters that differ from the consensus
+ *         is summed up; the sum is divided by range
+ * @param  frame  frame holding the cross sections (width must exceed range)
+ * @param  pos    position the window is placed around
+ * @param  range  size of the window; the window is shifted to stay within
+ *                the frame
+ * @return summed mismatch fractions divided by range
+ * @author Steve Hoffmann
+ *
+ */
+
+double
+bl_matchfileGetRegionConsError (matchfileFrame_t *frame,
+    Uint pos, Uint range)
+{
+  Uint first, last, site, k;
+  double sitefrac, total=0;
+
+  assert(frame->width > range);
+
+  if(range/2 >= pos) {
+    //window would start before the frame
+    first = 0;
+    last = range;
+  } else if(range/2 + pos >= frame->width-1) {
+    //window would end behind the frame
+    first = frame->width-1-range;
+    last = frame->width-1;
+  } else {
+    first = pos - range/2;
+    last = pos + range/2-1;
+  }
+
+  for(site=first; site < last; site++) {
+    matchfileCross_t *cross = &frame->cs[site];
+
+    //empty cross sections do not contribute
+    if(!cross->len) {
+      continue;
+    }
+
+    sitefrac = 0;
+    for(k=0; k < cross->len; k++) {
+      if(cross->chars[k] != cross->cons) {
+        sitefrac++;
+      }
+    }
+
+    sitefrac /= (double)cross->len;
+    total += sitefrac;
+  }
+
+  return total/(double) range;
+}
+
+
+/*---------------------------- bl_matchfileFitGEV ----------------------------
+ *
+ * @brief  fit a generalized extreme value distribution to raw and adjust
+ *         errors. Both stats->eraw and stats->e are sorted in place. The
+ *         parameters are initialized with gevLmoment, refined with gevmle
+ *         and stored at index 0 (raw) and index 1 (adjust) of
+ *         stats->gev_mu, gev_si, gev_xi (negated k) and gev_ll
+ * @param  space  not used
+ * @param  stats  sample statistics; e_N values are read from eraw and e
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileFitGEV (void *space, matchfileSampleStats_t *stats)
+{
+
+  double m=.0, s=.0, k=.0;
+  double mx=.0, sx=.0, kx=.0;
+
+  //NFO("evaluating sample stats with %d values (raw error).\n", stats->e_N);
+  qsort(stats->eraw, stats->e_N, sizeof(double), cmp_dbl_qsort);
+
+  gevLmoment(stats->eraw, stats->e_N, &m, &s, &k);
+  NFO("lmoment m:%f, s:%f, xi:%f \n", m, s, -k);
+  NFO("log-likelihood:%f\n", gevll( stats->eraw, stats->e_N, m, s, k));
+  gevmle(NULL, stats->eraw, stats->e_N, &m, &s, &k, 10000, stats->eraw[0],
+      stats->eraw[stats->e_N-1]);
+  NFO("gev m:%f, s:%f, xi:%f \n", m, s, -k);
+  NFO("log-likelihood:%f\n", gevll( stats->eraw, stats->e_N, m, s, k));
+  //report the fitted location m; mx is still 0 at this point
+  NFO("cdf example for mu%f =%f\n", m, gevcdf(m, m, s, -k ));
+  NFO("cdf example for 0.1 %f\n", gevcdf(0.1, m, s, -k ));
+  NFO("cdf example for 0.2 %f\n", gevcdf(0.2, m, s, -k ));
+  NFO("cdf example for 0.3 %f\n", gevcdf(0.3, m, s, -k ));
+  NFO("cdf example for 0.4 %f\n", gevcdf(0.4, m, s, -k ));
+  NFO("cdf example for 0.5 %f\n", gevcdf(0.5, m, s, -k ));
+  NFO("cdf example for 0.6 %f\n", gevcdf(0.6, m, s, -k ));
+
+  stats->gev_mu[0] = m;
+  stats->gev_si[0] = s;
+  stats->gev_xi[0] = -k;
+  //likelihood of the raw error fit: use its own parameters m, s, k
+  //(mx, sx, kx belong to the adjust error fit and are not yet computed)
+  stats->gev_ll[0] = gevll(stats->eraw, stats->e_N, m, s, k);
+
+  NFO("evaluating sample stats with %d values (adj error).\n", stats->e_N);
+  qsort(stats->e, stats->e_N, sizeof(double), cmp_dbl_qsort);
+  gevLmoment(stats->e, stats->e_N, &mx, &sx, &kx);
+
+  NFO("lmoment m:%f, s:%f, xi:%f \n", mx, sx, -kx);
+  NFO("log-likelihood:%f\n", gevll(stats->e, stats->e_N, mx, sx, kx));
+
+  gevmle(NULL, stats->e, stats->e_N, &mx, &sx, &kx, 10000, stats->e[0],
+      stats->e[stats->e_N-1]);
+  NFO("gev m:%f, s:%f, xi:%f \n", mx, sx, -kx);
+  NFO("log-likelihood:%f\n", gevll( stats->e, stats->e_N, mx, sx, kx));
+  NFO("variance:%f (ll:%f)\n",gevvar(mx,sx,-kx), gevcdf(gevvar(mx,sx,-kx)+mx,
+        mx, sx, -kx));
+  NFO("cdf example for mu%f =%f\n", mx, gevcdf(mx, mx, sx, -kx ));
+  NFO("cdf example for 0.1 %f\n", gevcdf(0.1, mx, sx, -kx ));
+  NFO("cdf example for 0.2 %f\n", gevcdf(0.2, mx, sx, -kx ));
+  NFO("cdf example for 0.3 %f\n", gevcdf(0.3, mx, sx, -kx ));
+  NFO("cdf example for 0.4 %f\n", gevcdf(0.4, mx, sx, -kx ));
+  NFO("cdf example for 0.5 %f\n", gevcdf(0.5, mx, sx, -kx ));
+  NFO("cdf example for 0.6 %f\n", gevcdf(0.6, mx, sx, -kx ));
+
+  stats->gev_mu[1] = mx;
+  stats->gev_si[1] = sx;
+  stats->gev_xi[1] = -kx;
+  stats->gev_ll[1] = gevll(stats->e, stats->e_N, mx, sx, kx);
+
+
+  return ;
+}
+
+
+/*----------------------- bl_matchfileInitSampleStats ------------------------
+ *
+ * @brief  allocate and initialize sample stats
+ * @param  space          passed to ALLOCMEMORY
+ * @param  maxsample      number of elements of the e, b, s, eraw, entropy,
+ *                        V and Vx arrays; stored in stats->n
+ * @param  mincover       stored in stats->mincover
+ * @param  maxcover       stored in stats->maxcover
+ * @param  minfrac        stored in stats->minfrac
+ * @param  entropyfilter  stored in stats->entropyfilter
+ * @param  areasize       stored in stats->areasize
+ * @param  maxareae       stored in stats->maxareae
+ * @return the new structure
+ * @author Steve Hoffmann
+ *
+ * NOTE(review): the prototype in evalmatchfiles.h lists maxcover before
+ * mincover; arguments are bound by position, confirm against the callers.
+ */
+
+matchfileSampleStats_t*
+bl_matchfileInitSampleStats (void *space, Uint maxsample, Uint mincover,
+    Uint maxcover, double minfrac, char entropyfilter, Uint areasize, double maxareae)
+{
+
+  matchfileSampleStats_t *stats;
+
+  stats = ALLOCMEMORY(space, NULL, matchfileSampleStats_t, 1);
+  stats->standardized = 0;
+  stats->n = maxsample;
+  stats->px = .0;
+  stats->pxx = .0;
+  stats->b_ll = .0;
+  stats->e = ALLOCMEMORY(space, NULL, double, maxsample);
+  stats->b = ALLOCMEMORY(space, NULL, double, maxsample);
+  stats->s = ALLOCMEMORY(space, NULL, double, maxsample);
+  stats->eraw = ALLOCMEMORY(space, NULL, double, maxsample);
+  stats->entropy = ALLOCMEMORY(space, NULL, double, maxsample);
+
+  //bl_matchfileDestructSampleStats tests this pointer, it must not be
+  //left uninitialized
+  stats->ecdf = NULL;
+
+  stats->entropydensity = NULL;
+  stats->entropydensitystep = 0;
+  stats->entropydensitylen = 0;
+  stats->mincover = mincover;
+  stats->maxcover = maxcover;
+  stats->areasize = areasize;
+  stats->maxareae = maxareae;
+  stats->minfrac = minfrac;
+  stats->entropyfilter = entropyfilter;
+  stats->e_N = 0;
+  stats->s_N = 0;
+  stats->V_N = 0;
+  stats->V_mu = .1;
+  stats->V_sd = .1;
+  stats->V_ll = 0;
+  stats->Vx_N = 0;
+  stats->Vx_mu = .1;
+  stats->Vx_sd = .1;
+  stats->Vx_ll = 0;
+  stats->P = 0;
+  stats->X = 0;
+  stats->N = 0;
+
+  stats->RR_N = 0;
+  stats->RR = ALLOCMEMORY(space, NULL, Uint, 11);
+  memset(stats->RR, 0, sizeof(Uint)*11);
+
+  stats->MM_N = 0;
+  stats->MM = ALLOCMEMORY(space, NULL, Uint, 51);
+  memset(stats->MM, 0, sizeof(Uint)*51);
+
+  stats->e_mu = ALLOCMEMORY(space, NULL, double, 2);
+  stats->e_sd = ALLOCMEMORY(space, NULL, double, 2);
+  stats->gev_mu = ALLOCMEMORY(space, NULL, double, 2);
+  stats->gev_si = ALLOCMEMORY(space, NULL, double, 2);
+  stats->gev_xi = ALLOCMEMORY(space, NULL, double, 2);
+  stats->gev_ll = ALLOCMEMORY(space, NULL, double, 2);
+
+  stats->e_mu[0] = 0.1;
+  stats->e_mu[1] = 0.6;
+  stats->e_sd[0] = 0.1;
+  stats->e_sd[1] = 0.6;
+  stats->e_ll =.0;
+
+  //default GEV parameters, index 0 and 1 are overwritten by
+  //bl_matchfileFitGEV
+  stats->gev_mu[0] = 0.044763;
+  stats->gev_mu[1] = 0.020171;
+  stats->gev_si[0] = 0.022864;
+  stats->gev_si[1] = 0.031077;
+  stats->gev_xi[0] = 0.212219;
+  stats->gev_xi[1] = -0.041355;
+  stats->gev_ll[0] = 6291.208397;
+  stats->gev_ll[1] = 5908.074411;
+
+
+  stats->S = ALLOCMEMORY(space, NULL, double, 6*6);
+  stats->S_N = ALLOCMEMORY(space, NULL, Uint, 6);
+  stats->Sx = ALLOCMEMORY(space, NULL, double, 6*6);
+  stats->Sx_N = ALLOCMEMORY(space, NULL, double, 6);
+  stats->R = ALLOCMEMORY(space, NULL, Uint, 100*255);
+  stats->R_N = ALLOCMEMORY(space, NULL, Uint, 100*255);
+  stats->V = ALLOCMEMORY(space, NULL, double, maxsample);
+  stats->Vx = ALLOCMEMORY(space, NULL, double, maxsample);
+  stats->RP = ALLOCMEMORY(space, NULL, Uint, 100);
+  stats->RP_N = ALLOCMEMORY(space, NULL, Uint, 100);
+  stats->RQ = ALLOCMEMORY(space, NULL, Uint, 255);
+  stats->RQ_N = ALLOCMEMORY(space, NULL, Uint, 255);
+  stats->MO = ALLOCMEMORY(space, NULL, Uint, 1024);
+  stats->MO_N = ALLOCMEMORY(space, NULL, Uint, 1024);
+  memset(stats->S, 0, sizeof(double)*(6*6));
+  memset(stats->S_N, 0, sizeof(Uint)*6);
+  memset(stats->Sx, 0, sizeof(double)*(6*6));
+  //Sx_N is allocated with 6 doubles: clear the complete allocation
+  memset(stats->Sx_N, 0, sizeof(double)*6);
+  memset(stats->R, 0, sizeof(Uint)*(100*255));
+  memset(stats->RP, 0, sizeof(Uint)*(100));
+  memset(stats->RQ, 0, sizeof(Uint)*(255));
+  memset(stats->R_N, 0, sizeof(Uint)*(100*255));
+  memset(stats->RP_N, 0, sizeof(Uint)*(100));
+  memset(stats->RQ_N, 0, sizeof(Uint)*(255));
+  memset(stats->MO, 0, sizeof(Uint)*(1024));
+  memset(stats->MO_N, 0, sizeof(Uint)*(1024));
+
+  memset(stats->V, 0, sizeof(double)*maxsample);
+  memset(stats->Vx, 0, sizeof(double)*maxsample);
+  memset(stats->s, 0, sizeof(double)*maxsample);
+  return stats;
+}
+
+
+/*--------------------- bl_matchfileDestructSampleStats ----------------------
+ *
+ * @brief destruct sample stats: releases the arrays held by the structure.
+ *        The structure itself is not released here, and the release of
+ *        stats->b and stats->s is commented out.
+ * @param space passed to FREEMEMORY
+ * @param stats the sample stats whose members are released
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileDestructSampleStats (void *space, matchfileSampleStats_t *stats)
+{
+
+ FREEMEMORY(space, stats->entropydensity);
+ FREEMEMORY(space, stats->e);
+
+ // NOTE(review): b and s are allocated in bl_matchfileInitSampleStats but
+ // not released here - confirm whether this is intended
+ //if(stats->b) FREEMEMORY(space, stats->b);
+ //if(stats->s) FREEMEMORY(space, stats->s);
+
+ FREEMEMORY(space, stats->eraw);
+ FREEMEMORY(space, stats->entropy);
+ FREEMEMORY(space, stats->e_mu);
+ FREEMEMORY(space, stats->e_sd);
+ FREEMEMORY(space, stats->S);
+ FREEMEMORY(space, stats->S_N);
+ FREEMEMORY(space, stats->Sx);
+ FREEMEMORY(space, stats->Sx_N);
+ FREEMEMORY(space, stats->R);
+ FREEMEMORY(space, stats->R_N);
+ FREEMEMORY(space, stats->V);
+ FREEMEMORY(space, stats->Vx);
+ FREEMEMORY(space, stats->RP);
+ FREEMEMORY(space, stats->RQ);
+ FREEMEMORY(space, stats->RP_N);
+ FREEMEMORY(space, stats->RQ_N);
+ FREEMEMORY(space, stats->RR);
+ FREEMEMORY(space, stats->MM);
+ FREEMEMORY(space, stats->MO);
+ FREEMEMORY(space, stats->MO_N);
+
+ // the ecdf is optional: it is destructed and released only if present
+ if(stats->ecdf) {
+ ecdf_destruct(stats->ecdf);
+ FREEMEMORY(space, stats->ecdf);
+ }
+
+ // the four gev arrays are released together, guarded by gev_mu
+ if(stats->gev_mu) {
+ FREEMEMORY(space, stats->gev_mu);
+ FREEMEMORY(space, stats->gev_si);
+ FREEMEMORY(space, stats->gev_xi);
+ FREEMEMORY(space, stats->gev_ll);
+ }
+
+ return ;
+}
+
+/*-------------------------- bl_matchfileFrameStats --------------------------
+ *
+ * @brief  get descriptive statistics for a frame. For each character
+ *         (index 0..255) the count, mean log10 error, mean read position,
+ *         mean match count, fraction of entries that differ from the
+ *         reference and an averaged standard deviation of read positions
+ *         are collected, together with the distribution of the number of
+ *         read starts per position and the mean coverage
+ * @param  space  passed to ALLOCMEMORY/FREEMEMORY
+ * @param  frame  the frame; width and cs[0..width-1] are read
+ * @return newly allocated statistics; the member arrays are released by
+ *         bl_matchfileDestructFrameStats
+ * @author Steve Hoffmann
+ *
+ */
+
+
+matchfileFrameStats_t *
+bl_matchfileFrameStats (void *space, matchfileFrame_t *frame) {
+  Uint j, pos=0, ch, qu, rp, mc, rss, *c, *f, *n,
+       *D, D_ymax=0, D_ymax_1=0, D_xmax=0, noofstarts=0;
+  matchfileFrameStats_t* stats;
+  int **s;
+  double *e, *r, *v, *y, *d, x=0;
+
+  stats = ALLOCMEMORY(space, NULL, matchfileFrameStats_t, 1);
+  v = ALLOCMEMORY(space, NULL, double, 256);
+  d = ALLOCMEMORY(space, NULL, double, 256);
+  e = ALLOCMEMORY(space, NULL, double, 256);
+  //c is an array of Uint counters: allocate and clear it with its own type
+  c = ALLOCMEMORY(space, NULL, Uint, 256);
+  r = ALLOCMEMORY(space, NULL, double, 256);
+  y = ALLOCMEMORY(space, NULL, double, 256);
+  f = ALLOCMEMORY(space, NULL, Uint, 256);
+  D = ALLOCMEMORY(space, NULL, Uint, MAX_STARTSITES);
+
+  memset(v, 0, sizeof(double)*256);
+  memset(c, 0, sizeof(Uint)*256);
+  memset(r, 0, sizeof(double)*256);
+  memset(y, 0, sizeof(double)*256);
+  memset(d, 0, sizeof(double)*256);
+  memset(f, 0, sizeof(Uint) *256);
+  memset(D, 0, sizeof(Uint)*MAX_STARTSITES);
+
+  //log10(0) is the neutral element for the summation with log10add
+  for(j=0; j < 256; j++) e[j] = log10(0);
+
+  for(pos=0; pos < frame->width; pos++) {
+
+    x += frame->cs[pos].len;
+    //s[ch] collects the read positions seen for ch at this position,
+    //n[ch] is the number of collected values
+    s = ALLOCMEMORY(space, NULL, int*, 256);
+    n = ALLOCMEMORY(space, NULL, Uint, 256);
+    memset(s, 0, sizeof(int*)*256);
+    memset(n, 0, sizeof(Uint)*256);
+    f[(int)frame->cs[pos].ref]++;
+    rss = frame->cs[pos].starts;
+    noofstarts += rss;
+
+    //start counts beyond the histogram are put into the last bin
+    if(rss >= MAX_STARTSITES) {
+      rss = MAX_STARTSITES-1;
+      D_xmax = MAX_STARTSITES;
+    } else {
+      D_xmax = (rss > D_xmax) ? rss: D_xmax;
+    }
+
+    D[rss]++;
+    if(D[rss] > D_ymax) {
+      D_ymax = D[rss];
+    }
+
+    //maximum of the histogram without the bin for zero starts
+    if(rss > 0 && D[rss] > D_ymax_1) {
+      D_ymax_1 = D[rss];
+    }
+
+    for(j=0; j < frame->cs[pos].len; j++) {
+      ch = frame->cs[pos].chars[j];
+      qu = frame->cs[pos].quals[j];
+      rp = frame->cs[pos].readpos[j];
+      mc = frame->cs[pos].matchcnt[j];
+
+
+      if(ch != frame->cs[pos].ref) {
+        d[ch]++;
+      }
+
+      //the rows of s hold int values
+      s[ch] = ALLOCMEMORY(space, s[ch], int, n[ch]+1);
+      s[ch][n[ch]] = rp;
+      e[ch] = log10add(e[ch],(qu/-10.));
+      r[ch] += rp;
+      y[ch] += mc;
+      c[ch]++;
+      n[ch]++;
+    }
+
+    for(j=0; j < 256; j++) {
+      if(n[j]) v[j] += sqrt(var_int(s[j],n[j]));
+      FREEMEMORY(space, s[j]);
+    }
+
+    FREEMEMORY(space, n);
+    FREEMEMORY(space, s);
+  }
+
+  for(j=0; j < 256; j++) {
+    if(c[j]) e[j] -= log10(c[j]);
+    if(c[j]) r[j] /= c[j];
+    if(c[j]) y[j] /= c[j];
+    //NOTE(review): f[j] counts reference characters and may be 0 for a
+    //character that only occurs in reads - confirm the intended divisor
+    if(c[j]) v[j] /= f[j];
+    if(c[j]) d[j] /= c[j];
+  }
+
+
+  stats->ntcnt = c;
+  stats->mean_err = e;
+  stats->mean_sde = v;
+  stats->mean_pos = r;
+  stats->mean_mul = y;
+  stats->mean_dis = d;
+  stats->char_frq = f;
+  stats->mean_cov = x/frame->width;
+  stats->dist_rss_xmax = D_xmax;
+  stats->dist_rss_ymax = D_ymax;
+  stats->dist_rss_ymax_1 = D_ymax_1;
+  stats->dist_rss = D;
+  stats->rss = noofstarts;
+
+  return stats;
+}
+
+
+
+/*------------------------ bl_matchfileDestructFrame -------------------------
+ *
+ * @brief destruct the frame: the cross sections are handed to
+ *        bl_matchfileDestructCross, then the cross section array and the
+ *        frame itself are released
+ * @param space passed to bl_matchfileDestructCross and FREEMEMORY
+ * @param frame the frame to destruct; must not be used afterwards
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileDestructFrame(void *space, matchfileFrame_t *frame)
+{
+
+ bl_matchfileDestructCross(space, frame->cs, frame->width);
+ FREEMEMORY(space, frame->cs);
+ FREEMEMORY(space, frame);
+
+ return ;
+}
+
+/*---------------------- bl_matchfileDestructFrameStats ----------------------
+ *
+ * @brief destruct the frame statistics structure: releases the member
+ *        arrays assigned in bl_matchfileFrameStats. The structure itself
+ *        is not released here.
+ * @param space passed to FREEMEMORY
+ * @param stats the frame statistics whose members are released
+ * @author Steve Hoffmann
+ *
+ */
+
+
+void
+bl_matchfileDestructFrameStats(void *space, matchfileFrameStats_t *stats) {
+ FREEMEMORY(space, stats->mean_err);
+ FREEMEMORY(space, stats->mean_sde);
+ FREEMEMORY(space, stats->mean_pos);
+ FREEMEMORY(space, stats->mean_mul);
+ FREEMEMORY(space, stats->mean_dis);
+ FREEMEMORY(space, stats->dist_rss);
+ FREEMEMORY(space, stats->char_frq);
+ FREEMEMORY(space, stats->ntcnt);
+}
+
+/*------------------------ bl_matchfileDumpSampleStats -------------------------
+ *
+ * @brief dump the stats to stderr. This function is not read-only:
+ *        stats->eraw and stats->e are sorted in place and stats->px is
+ *        overwritten several times while quantiles of the fitted GEV
+ *        distributions are searched.
+ * @param stats the sample statistics to dump
+ * @author Steve Hoffmann
+ *
+ */
+
+ void
+bl_matchfileDumpSampleStats (matchfileSampleStats_t *stats)
+{
+
+ double mx =.0 , sx = .0, kx = .0;
+ //double ws[]={0.95, 0.05};
+ Uint i;
+ //Uint j;
+ //
+/*
+ stats->V_mu = .1;
+ stats->V_sd = .1;
+ for(i=0; i < stats->entropydensitylen; i++) {
+ fprintf(stderr, "%.6f\t%.6f\n", (double)i*0.05, stats->entropydensity[i]);
+ }
+stats->V_ll = 0;
+ stats->Vx_mu = .1;
+ stats->Vx_sd = .1;
+ stats->Vx_ll = 0;
+
+ ws[0] = 1.0;
+ fprintf(stderr, "fitting %d\n", stats->V_N);
+
+ stats->V_ll=gmm(NULL, stats->V, stats->V_N, 1,
+ &stats->V_mu, &stats->V_sd, ws, 1, 100000);
+
+ stats->Vx_ll=gmm(NULL, stats->Vx, stats->Vx_N, 1,
+ &stats->Vx_mu, &stats->Vx_sd, ws, 1, 100000);
+
+*/
+ // adjust errors, followed by a separator and the s values
+ for(i=0; i < stats->e_N; i++) {
+ fprintf(stderr, "%.6f\n", stats->e[i]);
+ }
+
+ fprintf(stderr, "-------------\n");
+
+ for(i=0; i < stats->s_N; i++) {
+ fprintf(stderr, "%.6f\n", stats->s[i]);
+ }
+
+ // return;
+
+
+ fprintf(stderr, "px: %f\n",stats->px);
+ fprintf(stderr, "pxx: %f\n",stats->pxx);
+
+ fprintf(stderr, "edensity:\n");
+
+ fprintf(stderr, "mu={%f }, sd=%f, ll=%f\n",
+ stats->e_mu[0], stats->e_sd[0], stats->e_ll);
+
+ fprintf(stderr, "mu={%f }, sd=%f, ll=%f\n",
+ stats->e_mu[1], stats->e_sd[1], stats->e_ll);
+
+
+ /*
+ fprintf(stderr, "noise matrix\n");
+
+ for(i=30; i < 100; i++) {
+ for(j=0; j < 100; j++) {
+ if(MATRIX2D(stats->R_N, 255, j, i))
+ fprintf(stderr, "%d %d %f\n", i, j,
+ (double)MATRIX2D(stats->R, 255, j, i)/MATRIX2D(stats->R_N, 255, j, i));
+ }
+ }
+ */
+
+ // NOTE(review): no guard against stats->N == 0 in the two ratios below
+ fprintf(stderr, "P=%d, X=%d, N=%d\n", stats->P, stats->X, stats->N);
+ fprintf(stderr, "P(X)=%f log:%f\n",(double)stats->X/stats->N, log((double)stats->X/stats->N));
+ fprintf(stderr, "P(N)=%f log:%f\n",(double)stats->P/stats->N, log((double)stats->P/stats->N));
+
+ // motif table: only entries with a non-zero MO_N are printed
+ if(stats->MO) {
+ fprintf(stderr, "motif");
+ for(i=0; i < 1024; i++) {
+ if(stats->MO_N[i]) {
+ fprintf(stderr, "%d %d %d %f\n", i, stats->MO[i], stats->MO_N[i], log((double)stats->MO[i]/(double)stats->MO_N[i]));
+ }
+ }
+ }
+
+ fprintf(stderr, "readpos\n");
+ for(i=0; i < 100; i++) {
+ if(stats->RP_N[i])
+ fprintf(stderr, "%d %d %d %f\n", i, stats->RP[i], stats->RP_N[i], log((double)stats->RP[i]/(double)stats->RP_N[i]));
+ }
+
+ fprintf(stderr, "readqual\n");
+ for(i=0; i < 255; i++) {
+ if(stats->RQ_N[i])
+ fprintf(stderr, "%d %d %d %f\n", i, stats->RQ[i], stats->RQ_N[i], (double)stats->RQ[i] / stats->RQ_N[i]);
+ }
+
+ fprintf(stderr, "readerror\n");
+ for(i=0; i < 11; i++) {
+ fprintf(stderr, "%d %d %d %f\n", i, stats->RR[i], stats->RR_N, (double)stats->RR[i] / stats->RR_N);
+ }
+
+ // NOTE(review): MM is allocated with 51 entries in
+ // bl_matchfileInitSampleStats, only the first 50 are printed here
+ fprintf(stderr, "multiple matches\n");
+ for(i=0; i < 50; i++) {
+ fprintf(stderr, "%d %d %d %f\n", i, stats->MM[i], stats->MM_N, (double)stats->MM[i] / stats->MM_N);
+ }
+
+ fprintf(stderr, "readstartvar\n");
+ for(i=0; i < stats->V_N; i++) {
+ fprintf(stderr, "%f\n", stats->V[i]);
+ }
+
+ fprintf(stderr, "readstartvar gaussian model\n");
+
+ fprintf(stderr, "mu=%f, sd=%f, ll=%f\n",
+ stats->V_mu, stats->V_sd, stats->V_ll);
+
+ fprintf(stderr, "readstartvar X");
+
+ for(i=0; i < stats->Vx_N; i++) {
+ fprintf(stderr, "%f\n", stats->Vx[i]);
+ }
+
+ fprintf(stderr, "readstartvar X gaussian model\n");
+
+ fprintf(stderr, "mu=%f, sd=%f, ll=%f\n",
+ stats->Vx_mu, stats->Vx_sd, stats->Vx_ll);
+
+ if(stats->b) {
+ fprintf(stderr, "strand bias\n");
+ for(i=0; i < stats->e_N; i++) {
+ fprintf(stderr, "%.6f\n", stats->b[i]);
+ }
+ }
+
+ fprintf(stderr, "raw error\n");
+
+ for(i=0; i < stats->e_N; i++) {
+ fprintf(stderr, "%.6f\n", stats->eraw[i]);
+ }
+
+ fprintf(stderr, "adjust error\n");
+
+ for(i=0; i < stats->e_N; i++) {
+ fprintf(stderr, "%.6f\n", stats->e[i]);
+ }
+
+ fprintf(stderr, "entropy\n");
+
+ for(i=0; i < stats->e_N; i++) {
+ fprintf(stderr, "%.6f\n", stats->entropy[i]);
+ }
+
+ fprintf(stderr, "entropydensity: %d\n", stats->entropydensitylen);
+
+ for(i=0; i < stats->entropydensitylen; i++) {
+ fprintf(stderr, "%.6f\t%.6f\n", (double)i*0.05, stats->entropydensity[i]);
+ }
+
+ // GEV fit of the raw errors; sorts stats->eraw in place
+ // NOTE(review): eraw[e_N-1] is read without a check for e_N == 0
+ qsort(stats->eraw, stats->e_N, sizeof(double), cmp_dbl_qsort);
+ gevLmoment(stats->eraw, stats->e_N, &mx, &sx, &kx);
+
+ NFO("lmoment m:%f, s:%f, xi:%f \n", mx, sx, -kx);
+ NFO("log-likelihood:%f\n", gevll(stats->eraw, stats->e_N, mx, sx, kx));
+
+ gevmle(NULL, stats->eraw, stats->e_N, &mx, &sx, &kx, 10000, stats->eraw[0], stats->eraw[stats->e_N-1]);
+ NFO("gev m:%f, s:%f, xi:%f \n", mx, sx, -kx);
+ NFO("log-likelihood:%f\n", gevll(stats->eraw, stats->e_N, mx, sx, kx));
+
+ // linear search, starting at mx in steps of 0.00001, for the first value
+ // whose gevcdf reaches 0.99; the result is left in stats->px
+ stats->px = mx;
+ while(gevcdf(stats->px, mx, sx, -kx) < 0.99)
+ stats->px += 0.00001;
+
+ fprintf(stderr, "eraw px: %f\n", stats->px);
+
+ // GEV fit of the adjust errors; sorts stats->e in place
+ qsort(stats->e, stats->e_N, sizeof(double), cmp_dbl_qsort);
+ gevLmoment(stats->e, stats->e_N, &mx, &sx, &kx);
+
+ NFO("lmoment m:%f, s:%f, xi:%f \n", mx, sx, -kx);
+ NFO("log-likelihood:%f\n", gevll(stats->e, stats->e_N, mx, sx, kx));
+ gevmle(NULL, stats->e, stats->e_N, &mx, &sx, &kx, 10000, stats->e[0], stats->e[stats->e_N-1]);
+ NFO("gev m:%f, s:%f, xi:%f \n", mx, sx, -kx);
+ NFO("log-likelihood:%f\n", gevll(stats->e, stats->e_N, mx, sx, kx));
+
+ // same search for the 0.99, 0.95 and 0.90 levels of the adjust error fit;
+ // stats->px finally holds the value found for 0.90
+ stats->px = mx;
+ while(gevcdf(stats->px, mx, sx, -kx) < 0.99)
+ stats->px += 0.00001;
+
+ fprintf(stderr, "99: %f\n", stats->px);
+
+ stats->px = mx;
+ while(gevcdf(stats->px, mx, sx, -kx) < 0.95)
+ stats->px += 0.00001;
+
+ fprintf(stderr, "95: %f\n", stats->px);
+
+ stats->px = mx;
+ while(gevcdf(stats->px, mx, sx, -kx) < 0.90)
+ stats->px += 0.00001;
+
+ fprintf(stderr, "90: %f\n", stats->px);
+
+
+ return ;
+}
+/*----------------------- bl_matchfileTabulateRelPos ------------------------
+ *
+ * @brief helper of bl_matchfileTabulateFull: position of a nucleotide
+ *        within its read in percent of the read length (truncated)
+ * @param readpos position of the nucleotide in the read
+ * @param readlen length of the read
+ * @return trunc(100*readpos/readlen)
+ *
+ */
+
+static uint32_t
+bl_matchfileTabulateRelPos (uint32_t readpos, uint32_t readlen)
+{
+  return trunc(((double)readpos*100.0)/((double)readlen));
+}
+
+/*--------------------- bl_matchfileTabulateDoubleList ----------------------
+ *
+ * @brief helper of bl_matchfileTabulateFull: write n doubles as a comma
+ *        separated list terminated by a tab; an empty list yields only
+ *        the tab so that the column count of the table stays constant
+ * @param dev stream to write to
+ * @param v   values to be written
+ * @param n   number of values in v
+ *
+ */
+
+static void
+bl_matchfileTabulateDoubleList (FILE *dev, double *v, uint32_t n)
+{
+  uint32_t i;
+
+  for(i=0; i < n; i++) {
+    fprintf(dev, "%s%f", (i) ? "," : "", v[i]);
+  }
+  fprintf(dev, "\t");
+
+  return ;
+}
+
+/*------------------------ bl_matchfileTabulateFull -------------------------
+ *
+ * @brief full tabulation of all factors for the calculation of snvs.
+ *        Writes one tab separated line to dev: position, reference and
+ *        consensus, the four probabilities, the per-read columns of the
+ *        cross section (chars, strands, error count and rate, qualities,
+ *        edit distances, read positions, match counts, relative read
+ *        positions, read position variances), the factor lists of the
+ *        four cross statistics in the order rv, rn, cv, cn and finally
+ *        the values looked up in the sample statistics.
+ *        Per-read lists are comma separated; for an empty cross section
+ *        they are left empty.
+ * @param dev     stream to write to
+ * @param pos     position that is tabulated
+ * @param cvcss   cross stats, consensus as variant
+ * @param cnvcss  cross stats, consensus as non-variant
+ * @param rvcss   cross stats, reference as variant
+ * @param rnvcss  cross stats, reference as non-variant
+ * @param cs      the cross section
+ * @param stats   sample statistics (RP, RP_N, RR, RR_N, V_mu, V_sd, MM,
+ *                MM_N and the first gev parameter set are read)
+ * @param ref     reference nucleotide
+ * @param cons    consensus nucleotide
+ * @param p_cons, p_ref, p_consx, p_refx  probabilities, written verbatim
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileTabulateFull (FILE *dev, Uint pos, matchfileCrossStats_t* cvcss, matchfileCrossStats_t* cnvcss, matchfileCrossStats_t* rvcss, matchfileCrossStats_t* rnvcss,
+    matchfileCross_t *cs, matchfileSampleStats_t *stats, char ref, char cons, double p_cons, double p_ref, double p_consx, double p_refx)
+{
+  double *nv;
+  char *ch, *rq, *st;
+  unsigned char *ed;
+  uint32_t *rp, *mc, *rl, i, len, errors=0;
+  double e = .0, v_mu;
+  double rvsp, cvsp, rvee, cvee, rnee, cnee;
+  /* the strand penalty columns of both non-variant sets are written as
+   * constant zero; the strandpenalty fields of rnvcss/cnvcss are not read */
+  double rnsp = .0, cnsp = .0;
+
+  ch = cs->chars;
+  st = cs->strands;
+  rq = cs->quals;
+  ed = cs->edist;
+  rp = cs->readpos;
+  mc = cs->matchcnt;
+  rl = cs->readlen;
+  len = cs->len;
+  /* NOTE(review): nv is never released here - confirm whether
+   * bl_matchfileGetNTReadPosVar hands over ownership of the array */
+  nv = bl_matchfileGetNTReadPosVar(cs);
+
+  rvsp = rvcss->strandpenalty;
+  rvee = rvcss->var_ee;
+  rnee = rnvcss->var_ee;
+  cvsp = cvcss->strandpenalty;
+  cvee = cvcss->var_ee;
+  cnee = cnvcss->var_ee;
+
+  /* shifted mean used for the read position variance cdf */
+  v_mu = stats->V_mu-(stats->V_sd/SDFRACTION);
+
+  fprintf(dev, "%d\t%c\t%c\t%f\t%f\t%f\t%f\t", pos, ref, cons, p_cons,
+      p_ref, p_consx, p_refx);
+
+  /* nucleotides; every char differing from ref (except N) is an error */
+  for(i=0; i < len; i++) {
+    fprintf(dev, "%c", ch[i]);
+    if((int)ch[i] != (int)ref && ch[i] != 'N') {
+      errors++;
+    }
+  }
+  fprintf(dev, "\t");
+
+  for(i=0; i < len; i++) {
+    fprintf(dev, "%c", st[i]);
+  }
+  fprintf(dev, "\t");
+
+  /* error rate; stays zero for an empty cross section */
+  if(len) {
+    e = (double)errors/(double)len;
+  }
+
+  fprintf(dev, "%d\t%f\t", errors, e);
+
+  for(i=0; i < len; i++) fprintf(dev, "%c", rq[i]);
+  fprintf(dev, "\t");
+
+  for(i=0; i < len; i++) fprintf(dev, "%s%d", (i) ? "," : "", ed[i]);
+  fprintf(dev, "\t");
+
+  for(i=0; i < len; i++) fprintf(dev, "%s%d", (i) ? "," : "", rp[i]);
+  fprintf(dev, "\t");
+
+  for(i=0; i < len; i++) fprintf(dev, "%s%d", (i) ? "," : "", mc[i]);
+  fprintf(dev, "\t");
+
+  for(i=0; i < len; i++) {
+    fprintf(dev, "%s%d", (i) ? "," : "",
+        bl_matchfileTabulateRelPos(rp[i], rl[i]));
+  }
+  fprintf(dev, "\t");
+
+  for(i=0; i < len; i++) fprintf(dev, "%s%f", (i) ? "," : "", nv[(int)ch[i]]);
+  fprintf(dev, "\t");
+
+  /* reference as variant */
+  bl_matchfileTabulateDoubleList(dev, rvcss->var_rt, len);
+  bl_matchfileTabulateDoubleList(dev, rvcss->var_rq, len);
+  bl_matchfileTabulateDoubleList(dev, rvcss->var_rr, len);
+  bl_matchfileTabulateDoubleList(dev, rvcss->var_rv, len);
+  bl_matchfileTabulateDoubleList(dev, rvcss->var_mm, len);
+  fprintf(dev, "%f\t", rvsp);
+  fprintf(dev, "%f\t", rvee);
+
+  /* reference as non-variant */
+  bl_matchfileTabulateDoubleList(dev, rnvcss->var_rt, len);
+  bl_matchfileTabulateDoubleList(dev, rnvcss->var_rq, len);
+  bl_matchfileTabulateDoubleList(dev, rnvcss->var_rr, len);
+  bl_matchfileTabulateDoubleList(dev, rnvcss->var_rv, len);
+  bl_matchfileTabulateDoubleList(dev, rnvcss->var_mm, len);
+  fprintf(dev, "%f\t", rnsp);
+  fprintf(dev, "%f\t", rnee);
+
+  /* consensus as variant */
+  bl_matchfileTabulateDoubleList(dev, cvcss->var_rt, len);
+  bl_matchfileTabulateDoubleList(dev, cvcss->var_rq, len);
+  bl_matchfileTabulateDoubleList(dev, cvcss->var_rr, len);
+  bl_matchfileTabulateDoubleList(dev, cvcss->var_rv, len);
+  bl_matchfileTabulateDoubleList(dev, cvcss->var_mm, len);
+  fprintf(dev, "%f\t", cvsp);
+  fprintf(dev, "%f\t", cvee);
+
+  /* consensus as non-variant */
+  bl_matchfileTabulateDoubleList(dev, cnvcss->var_rt, len);
+  bl_matchfileTabulateDoubleList(dev, cnvcss->var_rq, len);
+  bl_matchfileTabulateDoubleList(dev, cnvcss->var_rr, len);
+  bl_matchfileTabulateDoubleList(dev, cnvcss->var_rv, len);
+  bl_matchfileTabulateDoubleList(dev, cnvcss->var_mm, len);
+  fprintf(dev, "%f\t", cnsp);
+  fprintf(dev, "%f\t", cnee);
+
+  /* sample statistics looked up with the relative position of each read
+   * (previously the relative position of the last read was repeated) */
+  for(i=0; i < len; i++) {
+    fprintf(dev, "%s%f", (i) ? "," : "",
+        (double)stats->RP[bl_matchfileTabulateRelPos(rp[i], rl[i])]);
+  }
+  fprintf(dev, "\t");
+
+  for(i=0; i < len; i++) {
+    fprintf(dev, "%s%f", (i) ? "," : "",
+        (double)stats->RP_N[bl_matchfileTabulateRelPos(rp[i], rl[i])]);
+  }
+  fprintf(dev, "\t");
+
+  /* edit distances above 10 share the last bin */
+  for(i=0; i < len; i++) {
+    fprintf(dev, "%s%f", (i) ? "," : "", (double)stats->RR[MIN(ed[i],10)]);
+  }
+  fprintf(dev, "\t");
+
+  fprintf(dev, "%f\t", (double)stats->RR_N);
+
+  for(i=0; i < len; i++) {
+    fprintf(dev, "%s%f", (i) ? "," : "",
+        univarnormcdf(nv[(int)ch[i]], v_mu, stats->V_sd));
+  }
+  fprintf(dev, "\t");
+
+  /* match counts above 10 share the last bin */
+  for(i=0; i < len; i++) {
+    fprintf(dev, "%s%f", (i) ? "," : "", (double)stats->MM[MIN(mc[i],10)]);
+  }
+  fprintf(dev, "\t");
+
+  fprintf(dev, "%f\t", (double)stats->MM_N);
+
+  fprintf(dev, "%f\n", gevcdf(e, stats->gev_mu[0], stats->gev_si[0], stats->gev_xi[0]));
+
+  return ;
+}
+
+
+/*----------------------- bl_matchfileCrossStatsPrint ------------------------
+ *
+ * @brief human readable dump of the per-nucleotide factors kept in a
+ *        cross statistics set, followed by its pentropy and strand penalty
+ * @param fp     stream to write to
+ * @param css    cross statistics to be dumped
+ * @param cs     cross section the statistics belong to
+ * @param stats  sample statistics (V_mu, V_sd, MM and MM_N are read)
+ * @param ref    reference nucleotide
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileCrossStatsPrint (FILE *fp, matchfileCrossStats_t *css, matchfileCross_t *cs,
+    matchfileSampleStats_t *stats, char ref)
+{
+  uint32_t k, relpos;
+  double *posvar, perr;
+
+  posvar = bl_matchfileGetNTReadPosVar(cs);
+
+  fprintf(fp, "----stats----\n");
+
+  k = 0;
+  while(k < cs->len) {
+
+    /* position within the read in percent of the read length */
+    relpos = trunc(((double)cs->readpos[k]*100.0)/((double)cs->readlen[k]));
+    /* error probability derived from the quality char (offset 64) */
+    perr = pow(10,((double)cs->quals[k]-64.0)/-10.0);
+
+    fprintf(fp, "------------------------------");
+    fprintf(fp, "nucleotide %i\n", k);
+    fprintf(fp, "P(%c -> %c) = %f\n", ref, cs->chars[k], css->var_s[k]);
+    fprintf(fp, "RP(%d)=%f\n", (int)relpos, css->var_rt[k]);
+    fprintf(fp, "RQ(%d)=%f (p:%f, %f)\n", (int)cs->quals[k], css->var_rq[k],
+        perr, log(perr));
+    fprintf(fp, "RR(%d)=%f\n", cs->edist[k], css->var_rr[k]);
+    fprintf(fp, "RV(%f)=%f (mu: %f sd:%f)\n", posvar[(int)cs->chars[k]],
+        css->var_rv[k], stats->V_mu-(stats->V_sd/SDFRACTION), stats->V_sd);
+    fprintf(fp, "MM(%d)=%f (%d/%d )\n", cs->matchcnt[k], css->var_mm[k],
+        stats->MM[MIN(cs->matchcnt[k],10)], stats->MM_N);
+    fprintf(fp, "subtotal: %f\n", css->sub[k]);
+
+    k++;
+  }
+
+  fprintf(fp, "------------------------------");
+  fprintf(fp, "pentropy %f\n", css->pentropy);
+  fprintf(fp, "strandpenalty %f\n", css->strandpenalty);
+
+  return ;
+}
+
+/*----------------------------- bl_matchfileTestCheck -----------------------------
+ *
+ * @brief test
+ * @author Steve Hoffmann
+ *
+ *
+
+Uint
+bl_matchfileTestCheck(void *space, Uint fidx, Uint cidx, Uint pos, matchfileCross_t *cs,
+ char ref, matchfileSampleStats_t *stats, unsigned char show, void *nfo)
+{
+ matchfileCrossStats_t cnvcss, cvcss, rnvcss, rvcss;
+ FILE *fp = NULL;
+ double p_cons = .0, p_consx = .0, p_ref = .0, p_refx = .0, PX, P;
+ Uint *cnt, i;
+
+ cs->s_cons = 1;
+ cs->s_consx = 0;
+ cs->s_ref = 1;
+ cs->s_refx = 0;
+ cs->p_hom = log(0);
+
+ //override
+ //stats->maxcover = 10000;
+
+ fp = stdout;
+
+ if(!stats || cs->len < stats->mincover
+ || cs->len > stats->maxcover) {
+ return 0;
+ }
+
+ if(cs->len && !cs->cons){
+ cnt = bl_matchfileGetNTCounts(cs);
+ cs->cons = (char) uarraymax(cnt, 255);
+ FREEMEMORY(space, cnt);
+ }
+
+ for(i=0; i < cs->len; i++) {
+ if(cs->chars[i] != cs->cons ||
+ cs->chars[i] != cs->ref) break;
+ }
+
+ if(i == cs->len) {
+ return 0;
+ }
+
+ if(!stats->standardized) bl_matchfileGetStandardization (space, stats);
+
+ PX = log((double)stats->X/stats->N);
+ P = log((double)stats->P/stats->N);
+ //PX = MAX(PX, -3.5);
+ //P = MAX(P, -0.03);
+
+ PX = log(0.01);
+ P = log(0.99);
+
+ p_cons = bl_matchfileTestNonVariant (cs, stats, cs->cons, &cnvcss, stats->minrp, stats->maxrp, stats->minrq, stats->maxrq);
+ p_consx = bl_matchfileTestVariant (cs, stats, cs->cons, &cvcss, stats->minrp, stats->maxrp, stats->minrq, stats->maxrq);
+ p_ref = bl_matchfileTestNonVariant (cs, stats, ref, &rnvcss, stats->minrp, stats->maxrp, stats->minrq, stats->maxrq);
+ p_refx = bl_matchfileTestVariant (cs, stats, ref, &rvcss, stats->minrp, stats->maxrp, stats->minrq, stats->maxrq);
+
+
+ cs->p_cons = P+p_cons;
+ cs->p_consx = PX+p_consx;
+
+ cs->p_ref = P+p_ref;
+ cs->p_refx = PX+p_refx;
+
+
+ bl_matchfileTabulateFull (fp, pos, &cvcss, &cnvcss, &rvcss, &rnvcss, cs, stats,
+ ref, cs->cons, cs->p_cons, cs->p_ref, cs->p_consx, cs->p_refx);
+
+ bl_matchfileCrossStatsDestruct (&cnvcss);
+ bl_matchfileCrossStatsDestruct (&cvcss);
+ bl_matchfileCrossStatsDestruct (&rnvcss);
+ bl_matchfileCrossStatsDestruct (&rvcss);
+
+
+ return 0;
+
+}
+
+*/
+
+
+
+/*------------------------ bl_matchfileCrossStatsInit ------------------------
+ *
+ * @brief initialize cross stats: the scalar results are set to zero and
+ *        one array of len doubles is allocated for each per-nucleotide
+ *        factor (var_s, var_rt, var_rq, var_rr, var_rv, var_mm, sub).
+ *        The arrays are not initialized.
+ * @param css cross stats to be initialized
+ * @param len number of nucleotides the arrays have to hold
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileCrossStatsInit (matchfileCrossStats_t *css, Uint len)
+{
+
+  /* scalar results */
+  css->strandpenalty = .0;
+  css->pentropy = .0;
+  css->var_ee = .0;
+
+  /* per-nucleotide factor arrays */
+  css->len = len;
+  css->var_s = ALLOCMEMORY(space, NULL, double, len);
+  css->var_rt = ALLOCMEMORY(space, NULL, double, len);
+  css->var_rq = ALLOCMEMORY(space, NULL, double, len);
+  css->var_rr = ALLOCMEMORY(space, NULL, double, len);
+  css->var_rv = ALLOCMEMORY(space, NULL, double, len);
+  css->var_mm = ALLOCMEMORY(space, NULL, double, len);
+  css->sub = ALLOCMEMORY(space, NULL, double, len);
+
+  return ;
+}
+
+/*---------------------- bl_matchfileCrossStatsDestruct ----------------------
+ *
+ * @brief destruct cross stats: releases every per-nucleotide array that
+ *        is set; the structure itself is not freed
+ * @param css cross stats whose arrays are released
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileCrossStatsDestruct (matchfileCrossStats_t *css)
+{
+
+  if (css->var_s) {
+    FREEMEMORY(space, css->var_s);
+  }
+  if (css->var_rt) {
+    FREEMEMORY(space, css->var_rt);
+  }
+  if (css->var_rq) {
+    FREEMEMORY(space, css->var_rq);
+  }
+  if (css->var_rr) {
+    FREEMEMORY(space, css->var_rr);
+  }
+  if (css->var_rv) {
+    FREEMEMORY(space, css->var_rv);
+  }
+  if (css->var_mm) {
+    FREEMEMORY(space, css->var_mm);
+  }
+  if (css->sub) {
+    FREEMEMORY(space, css->sub);
+  }
+
+  return ;
+}
+
+/*-------------------------- bl_matchfileTestPrint ---------------------------
+ *
+ * @brief dump the test result of one frame position to stdout as a tab
+ *        separated line. The type column is
+ *          H  if p_hom is not infinite,
+ *          B  if both p_consx > p_cons and p_refx > p_ref,
+ *          C  if only p_consx > p_cons,
+ *          R  otherwise
+ * @param f frame holding the cross sections
+ * @param p position within the frame
+ * @author Steve Hoffmann
+ *
+ */
+
+ void
+bl_matchfileTestPrint (matchfileFrame_t *f, Uint p)
+{
+
+  matchfileCross_t *x = &f->cs[p];
+  char type = 'R';
+
+  if(!isinf(x->p_hom)) {
+    type = 'H';
+  } else if(x->p_consx > x->p_cons) {
+    type = (x->p_refx > x->p_ref) ? 'B' : 'C';
+  }
+
+  printf("%s\t%d\t%c\t%c\t%c\t%s\t%f\t%f\t%f\t%f\t%f\t%f\t%d\t%d\t%f\t%f\t%f\n",
+      f->chrname, f->start+p, type,
+      f->ref[p], x->cons,
+      x->chars, x->diff_rt, x->diff_rq,
+      x->diff_rr, x->diff_mm,
+      x->ee, x->pee, x->secondminimum, x->secondcnt,
+      x->pbinom, x->scr_cons, x->scr_ref);
+
+  return ;
+}
+
+
diff --git a/segemehl/libs/evalmethylmatchfiles.c b/segemehl/libs/evalmethylmatchfiles.c
new file mode 100644
index 0000000..86f3997
--- /dev/null
+++ b/segemehl/libs/evalmethylmatchfiles.c
@@ -0,0 +1,732 @@
+
+
+/**
+ * evalmethylmatchfiles.c
+ * evaluation and statistics of matchfiles from methylC-seq
+ *
+ * @author Christian Otto & Helene Kretzmer
+ * @email christian at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Thu May 2 10:07:27 EDT 2013
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <time.h>
+#include "basic-types.h"
+#include "info.h"
+#include "debug.h"
+#include "sort.h"
+#include "vtprogressbar.h"
+#include "mathematics.h"
+#include "biofiles.h"
+#include "container.h"
+#include "matchfilesfields.h"
+#include "matchfiles.h"
+#include "evalmatchfiles.h"
+#include "evalmethylmatchfiles.h"
+#include "matfile.h"
+
+
+/*-------------------------- bl_matchfileGetBSCross ----------------------------
+ *
+ * @brief build the bisulfite cross section of a cross section: a newly
+ *        allocated copy restricted to the reads whose bisulfite flag
+ *        fits the reference base (flag 1 at reference C, flag 2 at
+ *        reference G). starts, ends, maxrow, len and the consensus are
+ *        recalculated for the selected reads.
+ *        A zeroed cross section is returned if the reference is neither
+ *        C nor G, if the cross section is empty or if no read fits.
+ * @param cs the cross section to be filtered (not modified)
+ * @return the bisulfite cross section; callers in this file release it
+ *         with bl_matchfileDestructCross followed by FREEMEMORY
+ * @author Christian Otto
+ *
+ */
+matchfileCross_t *
+bl_matchfileGetBSCross(matchfileCross_t *cs) {
+  Uint i, j, *space=NULL, *cnt, max;
+  matchfileCross_t *bscs;
+
+  bscs = ALLOCMEMORY(space, NULL, matchfileCross_t, 1);
+  memset(bscs, 0, sizeof(matchfileCross_t));
+
+  /* no methylation site or nothing to select from: avoid the
+   * zero-sized allocations and hand back the zeroed cross section */
+  if ((cs->ref != 'C' && cs->ref != 'G') || !cs->len)
+    return bscs;
+
+  /* allocate for the worst case (all reads selected), shrink later */
+  bscs->chars = ALLOCMEMORY(space, NULL, char, cs->len);
+  bscs->quals = ALLOCMEMORY(space, NULL, char, cs->len);
+  bscs->strands = ALLOCMEMORY(space, NULL, char, cs->len);
+  bscs->feat = ALLOCMEMORY(space, NULL, char, cs->len);
+  bscs->readpos = ALLOCMEMORY(space, NULL, uint32_t, cs->len);
+  bscs->readlen = ALLOCMEMORY(space, NULL, uint32_t, cs->len);
+  bscs->row = ALLOCMEMORY(space, NULL, uint32_t, cs->len);
+  bscs->matchcnt = ALLOCMEMORY(space, NULL, uint32_t, cs->len);
+  bscs->edist = ALLOCMEMORY(space, NULL, unsigned char, cs->len);
+  bscs->bisulfite = ALLOCMEMORY(space, NULL, uint32_t, cs->len);
+
+  /*
+   * currently: no copy of dels (would be possible) &
+   *            splits (is useless) &
+   *            matelinks (would need further modifications
+   *            on input reading method)
+   */
+  for (i = 0, j = 0; j < cs->len; j++){
+    if ((cs->ref == 'C' && cs->bisulfite[j] == 1) ||
+        (cs->ref == 'G' && cs->bisulfite[j] == 2)){
+
+      bscs->chars[i] = cs->chars[j];
+      bscs->quals[i] = cs->quals[j];
+      bscs->strands[i] = cs->strands[j];
+      bscs->feat[i] = cs->feat[j];
+      /* update: starts, ends */
+      if (cs->feat[j] == '*'){
+        bscs->starts++;
+      }
+      else if (cs->feat[j] == '$'){
+        bscs->ends++;
+      }
+      bscs->readpos[i] = cs->readpos[j];
+      bscs->readlen[i] = cs->readlen[j];
+      bscs->row[i] = cs->row[j];
+      /* update: maxrow */
+      if(bscs->maxrow < cs->row[j]) {
+        bscs->maxrow = cs->row[j];
+      }
+      /* matchcnt was allocated but never filled before, leaving
+       * uninitialized values in the returned cross section */
+      bscs->matchcnt[i] = cs->matchcnt[j];
+      bscs->edist[i] = cs->edist[j];
+      bscs->bisulfite[i] = cs->bisulfite[j];
+      i++;
+    }
+  }
+  /* update: ref, len */
+  bscs->ref = cs->ref;
+  bscs->len = i;
+  if (!bscs->len){
+    /* no read selected: release the arrays and reset all fields */
+    bl_matchfileDestructCross(space, bscs, 1);
+    memset(bscs, 0, sizeof(matchfileCross_t));
+
+    return bscs;
+  }
+  /* get consensus */
+  bscs->cons = '^';
+  cnt = bl_matchfileGetNTCounts(bscs);
+
+  max = uarraymax(cnt, 256);
+  bscs->cons = (char)max;
+
+  /* realloc space: shrink the arrays to the number of selected reads */
+  bscs->chars = ALLOCMEMORY(space, bscs->chars, char, bscs->len);
+  bscs->quals = ALLOCMEMORY(space, bscs->quals, char, bscs->len);
+  bscs->strands = ALLOCMEMORY(space, bscs->strands, char, bscs->len);
+  bscs->feat = ALLOCMEMORY(space, bscs->feat, char, bscs->len);
+  bscs->readpos = ALLOCMEMORY(space, bscs->readpos, uint32_t, bscs->len);
+  bscs->readlen = ALLOCMEMORY(space, bscs->readlen, uint32_t, bscs->len);
+  bscs->row = ALLOCMEMORY(space, bscs->row, uint32_t, bscs->len);
+  bscs->matchcnt = ALLOCMEMORY(space, bscs->matchcnt, uint32_t, bscs->len);
+  bscs->edist = ALLOCMEMORY(space, bscs->edist, unsigned char, bscs->len);
+  bscs->bisulfite = ALLOCMEMORY(space, bscs->bisulfite, uint32_t, bscs->len);
+
+  FREEMEMORY(space, cnt);
+
+  return bscs;
+}
+
+/*--------------------------- bl_matchfileGetBSStrand --------------------------
+ *
+ * @brief get bisulfite-relevant strand with given reference base
+ * @param ref reference base
+ * @return PLUS_STRAND for 'C', MINUS_STRAND for 'G', 0 for any other base
+ * @author Christian Otto
+ *
+ */
+Uint
+bl_matchfileGetBSStrand(char ref){
+
+  switch (ref) {
+    case 'C':
+      return PLUS_STRAND;
+    case 'G':
+      return MINUS_STRAND;
+    default:
+      /* not a methylation site */
+      return 0;
+  }
+}
+
+
+/*--------------------------- bl_matchfileGetBSBase ----------------------------
+ *
+ * @brief get converted or unconverted bisulfite-related bases
+ *        w/r/t given strand
+ * @param strand bisulfite strand (0: none)
+ * @param conv   0 selects the unconverted, any other value the
+ *               converted base
+ * @return 'N' without strand; 'C' (unconverted) or 'T' (converted) on
+ *         PLUS_STRAND; 'G' (unconverted) or 'A' (converted) otherwise
+ * @author Christian Otto
+ *
+ */
+char
+bl_matchfileGetBSBase(Uint strand, unsigned char conv){
+  char base;
+
+  if (strand == 0) {
+    base = 'N';
+  }
+  else if (strand == PLUS_STRAND) {
+    base = (conv) ? 'T' : 'C';
+  }
+  else {
+    base = (conv) ? 'A' : 'G';
+  }
+
+  return base;
+}
+
+
+/*--------------------------- bl_matchfileGetBSCount ---------------------------
+ *
+ * @brief get count of converted or unconverted bisulfite-related bases
+ *        w/r/t given strand
+ * @param bscs   cross section the bases are counted in
+ * @param strand bisulfite strand (0: none)
+ * @param conv   0 counts the unconverted, any other value the
+ *               converted base
+ * @return number of occurrences of the selected base, 0 without strand
+ * @author Christian Otto
+ *
+ */
+Uint
+bl_matchfileGetBSCount(matchfileCross_t *bscs, Uint strand, unsigned char conv){
+  Uint *counts, n = 0;
+
+  if (strand) {
+    /* nucleotide counts indexed by character code */
+    counts = bl_matchfileGetNTCounts(bscs);
+    n = counts[(Uint) bl_matchfileGetBSBase(strand, conv)];
+    free(counts);
+  }
+
+  return n;
+}
+
+/*------------------------- bl_matchfileGetBSRateSimple ------------------------
+ *
+ * @brief get simple methylation rate estimate, i.e. the fraction of
+ *        unconverted among unconverted and converted bases. The rate
+ *        is only given if the most frequent character of the cross
+ *        section is one of these two bases or, in case of a tie for
+ *        the first place, if the tie is between exactly these two.
+ * @param bscs     bisulfite cross section
+ * @param strand   bisulfite strand (0: none)
+ * @param allowdel if 0, deletions ('-') are ignored when the most
+ *                 frequent characters are determined
+ * @return methylation rate in [0,1] or -1 if no rate is given
+ * @author Christian Otto
+ *
+ */
+double
+bl_matchfileGetBSRateSimple(matchfileCross_t *bscs, Uint strand, unsigned char allowdel){
+  Uint top, second, topcnt, secondcnt;
+  Uint meth, methch, unmeth, unmethch, *cnt;
+  int top_is_bs, pair_is_bs;
+  double rate = -1;
+
+  if (!strand) {
+    return rate;
+  }
+
+  cnt = bl_matchfileGetNTCounts(bscs);
+  if (!allowdel) {
+    cnt[(int)'-'] = 0;
+  }
+
+  /* get unconverted/methylated and converted/unmethylated base */
+  methch = (Uint) bl_matchfileGetBSBase(strand, 0);
+  meth = cnt[methch];
+  unmethch = (Uint) bl_matchfileGetBSBase(strand, 1);
+  unmeth = cnt[unmethch];
+
+  /* calculate methylation rate */
+  if (meth + unmeth > 0){
+    /* most frequent character; its count is cleared to find the runner-up */
+    top = uarraymax(cnt, 255);
+    topcnt = cnt[top];
+    cnt[top] = 0;
+    second = uarraymax(cnt, 255);
+    secondcnt = cnt[second];
+
+    /* case 1: unique maximum that is the methylated or unmethylated base */
+    top_is_bs = (top == methch || top == unmethch);
+    /* case 2: tied maximum made up of exactly these two bases */
+    pair_is_bs = ((top == methch && second == unmethch) ||
+        (top == unmethch && second == methch));
+
+    if ((topcnt > secondcnt && top_is_bs) ||
+        (topcnt == secondcnt && pair_is_bs)){
+      rate = (double) meth/((double) meth + (double) unmeth);
+    }
+  }
+
+  free(cnt);
+  return rate;
+}
+
+/*------------------------ bl_matchfileCallMethylSimple ------------------------
+ *
+ * @brief Simple methylation caller: for a reference C or G one line with
+ *        sequence context, coverages, base counts and the simple
+ *        methylation rate is written to matfile->dev, provided that a
+ *        rate could be calculated. Other positions are skipped.
+ * @param space memory space handle
+ * @param fidx  index of the match file in matfile->files
+ * @param cidx  index of the chromosome in the index of that file
+ * @param pos   position of the cross section
+ * @param cs    the cross section
+ * @param ref   reference base
+ * @param idx   not used
+ * @param show  not used
+ * @param nfo   the matfile_t with files, fasta and output device
+ * @return always 0
+ * @author Christian Otto
+ *
+ */
+Uint
+bl_matchfileCallMethylSimple (void *space, Uint fidx, Uint cidx, Uint pos, matchfileCross_t *cs,
+    char ref, matchfileindex_t *idx, unsigned char show, void *nfo )
+{
+  matfile_t *matfile = (matfile_t *) nfo;
+  matchfileCross_t *bscs;
+  char *chrom, *context;
+  char missing = '.', strandch;
+  Uint seqidx, strand, cov, methcov;
+  Uint meth = 0, unmeth = 0, other = 0;
+  double rate = -1;
+
+  /* basic settings */
+  chrom = matfile->files[fidx]->index->chromnames[cidx];
+
+  /* get bisulfite conversion strand and its symbol */
+  strand = bl_matchfileGetBSStrand(ref);
+  strandch = (strand == PLUS_STRAND) ? '+' :
+    (strand == MINUS_STRAND) ? '-' : '.';
+
+  /* do not process non-methylation sites */
+  if (!strand){
+    return 0;
+  }
+
+  /* get bisulfite cross */
+  bscs = bl_matchfileGetBSCross(cs);
+
+  /* get sequence context */
+  seqidx = bl_fastxFindIDIdx(chrom, matfile->fasta);
+  context = bl_matchfileGetContext(matfile->fasta, seqidx, pos, strand, 2);
+
+  /* get coverage (DP) and methylation coverage (MDP) */
+  cov = bl_matchfileGetCov(cs, 0);
+  methcov = bl_matchfileGetCov(bscs, 0);
+
+  if (methcov > 0){
+    assert(strand);
+
+    /* count unconverted/methylated and converted/unmethylated bases */
+    meth = bl_matchfileGetBSCount(bscs, strand, 0);
+    unmeth = bl_matchfileGetBSCount(bscs, strand, 1);
+    other = methcov - meth - unmeth;
+
+    /* calculate methylation rate */
+    rate = bl_matchfileGetBSRateSimple(bscs, strand, 0);
+  }
+
+  /* report output only if a rate is available */
+  if (rate != -1){
+    /* output first fields */
+    fprintf(matfile->dev, "%s\t%d\t%c\t%c\t%c\t%c\t%c", chrom, pos, missing, ref, missing, missing, missing);
+    fprintf(matfile->dev, "\t");
+    /* info field */
+    fprintf(matfile->dev, "CS=%c;CC=%s;NS=1;MMR=%.2f;DMR=.", strandch, context, rate);
+    fprintf(matfile->dev, "\t");
+    /* format field */
+    fprintf(matfile->dev, "DP:MDP:MDP3:MRDP:CM:CU:MR");
+    fprintf(matfile->dev, "\t");
+    /* data field */
+    fprintf(matfile->dev, "%d:%d:%d,%d,%d:%d:%d:%d:%.2f", cov, methcov, meth, unmeth, other,
+        meth + unmeth, meth, unmeth, rate);
+    fprintf(matfile->dev, "\n");
+  }
+
+  /* destruct everything */
+  bl_matchfileDestructCross(space, bscs, 1);
+  FREEMEMORY(space, bscs);
+  FREEMEMORY(space, context);
+
+  return 0;
+}
+
+/*------------------------- bl_matchfileCalcMethylBias -------------------------
+ *
+ * @brief for a reference C or G in "CG" context: write one line per read
+ *        of the bisulfite cross section (deletions excluded) to
+ *        matfile->dev. The line holds the read position and three 0/1
+ *        flags telling whether the read shows the unconverted base, the
+ *        converted base or any other base.
+ * @param space memory space handle
+ * @param fidx  index of the match file in matfile->files
+ * @param cidx  index of the chromosome in the index of that file
+ * @param pos   position of the cross section
+ * @param cs    the cross section
+ * @param ref   reference base
+ * @param idx   not used
+ * @param show  not used
+ * @param nfo   the matfile_t with files, fasta and output device
+ * @return always 0
+ *
+ */
+Uint
+bl_matchfileCalcMethylBias (void *space, Uint fidx, Uint cidx, Uint pos, matchfileCross_t *cs,
+    char ref, matchfileindex_t *idx, unsigned char show, void *nfo )
+{
+  matfile_t *matfile = (matfile_t *) nfo;
+  matchfileCross_t *bscs;
+  char *chrom, *context;
+  Uint k, strand, methcov, methbase, unmethbase, base;
+  int is_meth, is_unmeth;
+
+  /* get bisulfite conversion strand */
+  strand = bl_matchfileGetBSStrand(ref);
+
+  /* do not process non-methylation sites */
+  if (!strand){
+    return 0;
+  }
+
+  /* get bisulfite cross and its coverage (MDP) */
+  bscs = bl_matchfileGetBSCross(cs);
+  methcov = bl_matchfileGetCov(bscs, 0);
+
+  if (methcov > 0){
+
+    /* get sequence context */
+    chrom = matfile->files[fidx]->index->chromnames[cidx];
+    k = bl_fastxFindIDIdx(chrom, matfile->fasta);
+    context = bl_matchfileGetContext(matfile->fasta, k, pos, strand, 2);
+
+    /* TODO: parameter for sequence context filtering (given via nfo) */
+    /* TODO: coverage filter */
+    /* do not process nonCpG sites */
+    if (strcmp(context, "CG") == 0){
+
+      methbase = bl_matchfileGetBSBase(strand, 0);
+      unmethbase = bl_matchfileGetBSBase(strand, 1);
+
+      for (k = 0; k < bscs->len; k++){
+
+        base = bscs->chars[k];
+
+        /* exclude deleted chars (TODO: same as in BSmooth?) */
+        if (base == '-'){
+          continue;
+        }
+
+        is_meth = (base == methbase) ? 1 : 0;
+        is_unmeth = (base == unmethbase) ? 1 : 0;
+
+        /* report output */
+        fprintf(matfile->dev, "%u\t%u\t%u\t%u\n", //pos, ref,
+            bscs->readpos[k],
+            is_meth,
+            is_unmeth,
+            (!is_meth && !is_unmeth) ? 1 : 0);
+      }
+    }
+    FREEMEMORY(space, context);
+  }
+
+  /* destruct everything */
+  bl_matchfileDestructCross(space, bscs, 1);
+  FREEMEMORY(space, bscs);
+
+  return 0;
+}
+
+/*-------------------- bl_matchfileSampleCrossSectionsBS ---------------------
+ *
+ * @brief sample up to n CG positions (C followed by G on the plus
+ *        strand of the reference sequences) and execute f on each of
+ *        them. A randomly drawn position is only accepted if the small
+ *        map of its chromosome has a value above 200 in its bin. At
+ *        most 10000000 draws are made; the sampling is given up if
+ *        less than 80 percent of n positions were accepted.
+ *        Frames of width 20000 are loaded to evaluate the positions in
+ *        sorted order.
+ * @param space memory space handle
+ * @param file  the match file (its index is read)
+ * @param set   the reference sequences
+ * @param n     number of positions to be sampled
+ * @param f     function called with (space, frame, position relative to
+ *              the start of the current frame request, NULL, info)
+ * @param info  passed through to f
+ * @return 1 if the sample was evaluated, 0 if too few positions were found
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+bl_matchfileSampleCrossSectionsBS(void *space,
+    matchfile_t *file, fasta_t *set, Uint n,
+    void (*f)(void *, matchfileFrame_t*, Uint,
+      matchfileFrameStats_t *, void *), void *info)
+{
+  PairUint *samplepos;
+  Uint i=0, r=0, j=0, k, *cumchrlen,
+       *order, prev=0, nchr, curchrom=0, curstart=0,
+       *mapsize = NULL, npos;
+  matchfileFrame_t *frame = NULL;
+  matchfileFrameStats_t *stats = NULL;
+  Uint maxcover = 20000;
+  Uint setsize = 10000000;
+  unsigned char **maps;
+
+  char *sequence;
+  char *pch;
+  Container *positions;
+  Lint pos;
+
+
+  //init random number generator
+  srand((unsigned)(time(0)));
+  nchr = file->index->noofchroms;
+
+  samplepos = ALLOCMEMORY(space, NULL, PairUint, n+1);
+  /* clear all n+1 entries (the size was sizeof(PairUint)*n+1 bytes
+   * before, which left most of the last entry uninitialized) */
+  memset(samplepos, 0, sizeof(PairUint)*(n+1));
+  cumchrlen = ALLOCMEMORY(space, NULL, Uint, nchr);
+  memset(cumchrlen, 0, sizeof(Uint)*nchr);
+
+  MSG("generating small map\n");
+  //sum up the length of the references (i.e. chromosomes)
+  maps = bl_matchfileSmallMap (space, file, &mapsize);
+  MSG("map generated\n");
+
+  cumchrlen[0] = file->index->matchend[0] - file->index->matchstart[0] + 1;
+  fprintf(stderr, "%u: cumchrlen %u\n", 0, cumchrlen[0]);
+  for(i=1; i < nchr; i++) {
+    assert(file->index->matchend[i] >= file->index->matchstart[i]);
+    cumchrlen[i] = (file->index->matchend[i] -
+        file->index->matchstart[i]) + cumchrlen[i-1];
+    fprintf(stderr, "%u: cumchrlen %u\n", i, cumchrlen[i]);
+  }
+
+  /* collect the cumulative positions of all CG dinucleotides */
+  positions = ALLOCMEMORY(space, NULL, Container, 1);
+  bl_containerInit(positions, 10000, sizeof(Lint));
+
+  for (i=0; i<nchr; i++){
+    sequence = bl_fastaGetSequence(set, i);
+    printf("%s\t%d\n", bl_fastaGetDescription(set, i), bl_fastaGetSequenceLength(set, i));
+
+    //position C on plus
+    pch = strchr(sequence, 'C');
+    while (pch!=NULL){
+
+      /* pch[1] is at worst the terminating NUL of the sequence */
+      if (pch[1] == 'G'){
+        pos = pch-sequence;
+        if (i > 0){
+          pos += cumchrlen[i-1];
+        }
+        bl_containerAdd(positions, &pos);
+      }
+      pch=strchr(pch+1,'C');
+    }
+  }
+
+  //randomize n positions across the genome and determine their
+  //chromosomes
+  npos = bl_containerSize(positions);
+  i = 0;
+  j = 0;
+
+  /* without any CG position there is nothing to draw from */
+  while(npos > 0 && i < n && j < setsize) {
+    k=0;
+
+    pos = RANDINT(npos-1);
+    /* the container stores Lint elements: read them as such */
+    samplepos[i].b = (Uint) *((Lint *) bl_containerGet(positions, pos));
+
+    while(samplepos[i].b > cumchrlen[k] && k+1 < nchr) k++;
+    samplepos[i].a = k;
+
+    prev = (k == 0) ? 0 : cumchrlen[k-1];
+
+    /* accept the draw only if its bin of the small map is above 200 */
+    if(maps[samplepos[i].a]
+        && mapsize[samplepos[i].a] > (samplepos[i].b - prev)/255
+        && maps[samplepos[i].a][(samplepos[i].b - prev)/255] > 200) {
+      i++;
+      r++;
+    }
+
+    j++;
+  }
+
+  NFO("\n selected %d positions for sampling %d %d\n", i, j, n);
+
+  for(i=0; i < nchr; i++) {
+    if(maps[i]) {
+      FREEMEMORY(space, maps[i]);
+    }
+  }
+
+  bl_containerDestruct(positions, NULL);
+  FREEMEMORY(space, positions);
+  FREEMEMORY(space, maps);
+  FREEMEMORY(space, mapsize);
+
+  /* the drawing loop only ends early when n positions were accepted, so
+   * testing r alone is sufficient and also covers the case of no CG */
+  if(r < (int)(0.8*((double)n))) {
+    DBG("current sample size %d is below the minimum\n", r);
+    FREEMEMORY(space, samplepos);
+    FREEMEMORY(space, cumchrlen);
+    return 0;
+  }
+
+  /* NOTE(review): if fewer than n positions were accepted, the remaining
+   * zero-initialized entries are sorted and evaluated as well - confirm
+   * whether r should be used instead of n below */
+
+  //sort
+  order = quickSort(space, samplepos, n, bl_matchfileSampleCmp, NULL);
+
+  initProgressBarVT();
+
+  //evaluate
+  //to increase speed a frame of size FRAMESIZE is loaded
+  for(i=0; i < n; i++) {
+
+
+    progressBarVT("positions sampled.", n, i, 25);
+    //is position on a new chromosome or in a new frame?
+    if(!frame || samplepos[order[i]].a != curchrom ||
+        samplepos[order[i]].b-prev+1 >= frame->start+frame->width) {
+
+      if(frame) {
+        bl_matchfileDestructFrame(space, frame);
+        frame = NULL;
+        //bl_matchfileDestructFrameStats(space, stats);
+      }
+
+      curchrom = samplepos[order[i]].a;
+      curstart = samplepos[order[i]].b;
+      prev = (samplepos[order[i]].a == 0) ? 0 : cumchrlen[samplepos[order[i]].a-1];
+
+      fprintf(stderr, "getting frame for '%s', curstart '%d', prev '%d'\n",
+          file->index->chromnames[samplepos[order[i]].a], curstart, prev);
+
+      frame = bl_matchfileGetFrame(space, file,
+          file->index->chromnames[samplepos[order[i]].a],
+          curstart-prev+1, 20000, set, maxcover, NULL);
+
+      fprintf(stderr, "getting consensus\n" );
+      bl_matchfileGetConsensus(frame);
+      // stats = bl_matchfileFrameStats (space, frame);
+    }
+
+    fprintf(stderr, "evaluation of %d\n", samplepos[order[i]].b-curstart);
+    f(space, frame, samplepos[order[i]].b-curstart, stats, info);
+  }
+
+  NFO("\n %d positions sampled.\n", n);
+
+  if(frame) {
+    bl_matchfileDestructFrame(space,frame);
+    frame = NULL;
+    // bl_matchfileDestructFrameStats(space, stats);
+  }
+
+  FREEMEMORY(space, order);
+  FREEMEMORY(space, samplepos);
+  FREEMEMORY(space, cumchrlen);
+  return 1;
+}
+
+
+/*------------------------- bl_matchfileSampleStatsBS --------------------------
+ *
+ * @brief get bisulfite sample statistics: prints the methylation
+ *        coverage of a frame position to stdout and, if the coverage is
+ *        at least 15, its simple methylation rate
+ * @param space      memory space handle
+ * @param frame      frame holding the cross sections
+ * @param pos        position within the frame
+ * @param framestats not used
+ * @param nfo        not used
+ * @author Helene
+ *
+ */
+
+void
+bl_matchfileSampleStatsBS(void *space, matchfileFrame_t *frame,
+    Uint pos, matchfileFrameStats_t *framestats, void *nfo)
+{
+  double rate;
+  Uint methcov, strand;
+  matchfileCross_t *bscs;
+
+  /* get methylation coverage */
+  bscs = bl_matchfileGetBSCross(&frame->cs[pos]);
+  methcov = bl_matchfileGetCov(bscs, 0);
+
+  printf("BS_cov\t%d\n", methcov);
+
+  if (methcov >= 15) {
+    /* calculate methylation rate */
+    strand = bl_matchfileGetBSStrand(bscs->ref);
+    rate = bl_matchfileGetBSRateSimple(bscs, strand, 0);
+    printf("BS_rate\t%lf\n", rate);
+  }
+
+  /* TODO: report read positions and qualities of the bisulfite reads
+   * (BS_pos, BS_quals) as well as strand bias and entropy statistics
+   * once the corresponding helper functions are rewritten */
+
+  /* release the bisulfite cross section on every path; it was leaked
+   * before, both on the low coverage return and on the regular one */
+  bl_matchfileDestructCross(space, bscs, 1);
+  FREEMEMORY(space, bscs);
+
+  return;
+}
+
+/*---------------------- bl_matchfileGetReadPosBS -----------------------
+ *
+ * @brief get the positions of a read that cover a cross section
+ * (not reference)
+ * @author Helene
+ *
+ */
+/*
+double*
+bl_matchfileGetReadPosBS (void *space, matchfileFrame_t *frame, Uint pos)
+{
+ Uint j;
+ Uint i=0;
+ double *r;
+ matchfileCross_t *bscs, *save;
+ //TODO: rewrite function with use of following function
+ bscs = bl_matchfileGetBSCross(&frame->cs[pos]);
+
+
+ positions = ALLOCMEMORY(space, NULL, int, frame->cs[pos].len);
+
+ if (frame->cs[pos].len){
+ for(j=0; j < frame->cs[pos].len; j++) {
+ positions[i] = -1;
+ if (frame->cs[pos].ref == 'C'){
+ if (frame->cs[pos].bisulfite[j] == 1){
+ if (frame->cs[pos].chars[j] == 'C' || frame->cs[pos].chars[j] == 'T'){
+ positions[i] = frame->cs[pos].readpos[j];
+ i++;
+ }
+ }
+ }
+ else if (frame->cs[pos].ref == 'G'){
+ if (frame->cs[pos].bisulfite[j] == 2){
+ if (frame->cs[pos].chars[j] == 'G' || frame->cs[pos].chars[j] == 'A'){
+ positions[i] = frame->cs[pos].readpos[j];
+ i++;
+ }
+ }
+ }
+ }
+ }
+
+ return positions;
+}
+*/
+
+/*---------------------- bl_matchfileGetReadQualBS ----------------------
+ *
+ * @brief get the qualities of the reads that cover a cross section
+ * (not reference)
+ * @author Helene
+ *
+ */
+/*
+int *
+bl_matchfileGetReadQualBS (void *space, matchfileFrame_t *frame, Uint pos)
+{
+ Uint j;
+ Uint i=0;
+ int *quality;
+ quality = ALLOCMEMORY(space, NULL, int, frame->cs[pos].len);
+
+ //TODO: rewrite function with use of following function
+ bscs = bl_matchfileGetBSCross(&frame->cs[pos]);
+
+ if (frame->cs[pos].len){
+ for(j=0; j < frame->cs[pos].len; j++) {
+ quality[i] = -1;
+ if (frame->cs[pos].ref == 'C'){
+ if (frame->cs[pos].bisulfite[j] == 1){
+ if (frame->cs[pos].chars[j] == 'C' || frame->cs[pos].chars[j] == 'T'){
+ quality[i] = frame->cs[pos].quals[j];
+ i++;
+ }
+ }
+ }
+ else if (frame->cs[pos].ref == 'G'){
+ if (frame->cs[pos].bisulfite[j] == 2){
+ if (frame->cs[pos].chars[j] == 'G' || frame->cs[pos].chars[j] == 'A'){
+ quality[i] = frame->cs[pos].quals[j];
+ i++;
+ }
+ }
+ }
+ }
+ }
+
+ return quality;
+}
+*/
diff --git a/segemehl/libs/evalmethylmatchfiles.h b/segemehl/libs/evalmethylmatchfiles.h
new file mode 100644
index 0000000..e5bc1a3
--- /dev/null
+++ b/segemehl/libs/evalmethylmatchfiles.h
@@ -0,0 +1,39 @@
+#ifndef EVALMETHYLMATCHFILES_H
+#define EVALMETHYLMATCHFILES_H
+
+/**
+ * evalmethylmatchfiles.h
+ * evaluation and statistics of matchfiles from methylC-seq
+ *
+ * Interface overview:
+ *  bl_matchfileGetBSCross            copy of a cross section restricted to
+ *                                    the bisulfite reads fitting the
+ *                                    reference base (C or G)
+ *  bl_matchfileGetBSStrand           strand relevant for a reference base
+ *                                    (0 if the base is neither C nor G)
+ *  bl_matchfileGetBSBase             unconverted (conv == 0) or converted
+ *                                    base of a strand, 'N' without strand
+ *  bl_matchfileGetBSCount            count of that base in a cross section
+ *  bl_matchfileGetBSRateSimple       simple methylation rate, -1 if none
+ *  bl_matchfileCallMethylSimple      per-position methylation caller
+ *  bl_matchfileCalcMethylBias        per-read output for CG positions
+ *  bl_matchfileSampleStatsBS         prints coverage and rate of a position
+ *  bl_matchfileSampleCrossSectionsBS samples CG positions and calls f
+ *
+ * NOTE(review): bl_matchfileGetReadQualBS and bl_matchfileGetReadPosBS
+ * are declared below, but their definitions in evalmethylmatchfiles.c
+ * are commented out (the commented-out bl_matchfileGetReadPosBS is
+ * written with a double* return type) - confirm whether these
+ * declarations are still needed.
+ *
+ * @author Christian Otto & Helene Kretzmer
+ * @email christian at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Thu May 2 10:07:27 EDT 2013
+ *
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 408 $
+ * Author: $Author: steve $
+ * Date: $Date: 2014-06-12 07:10:00 -0400 (Thu, 12 Jun 2014) $
+ * Id: $Id: evalmethylmatchfiles.h 408 2014-06-12 11:10:00Z steve $
+ * Url: $URL: http://www2.bioinf.uni-leipzig.de/svn5/segemehl/libs/evalmethylmatchfiles.h $
+ */
+
+matchfileCross_t* bl_matchfileGetBSCross(matchfileCross_t *cs);
+Uint bl_matchfileGetBSStrand(char ref);
+char bl_matchfileGetBSBase(Uint strand, unsigned char conv);
+Uint bl_matchfileGetBSCount(matchfileCross_t *bscs, Uint strand, unsigned char conv);
+double bl_matchfileGetBSRateSimple(matchfileCross_t *bscs, Uint strand, unsigned char allowdel);
+Uint bl_matchfileCallMethylSimple ( void *space, Uint fidx, Uint cidx, Uint pos, matchfileCross_t *cs,
+ char ref, matchfileindex_t *idx, unsigned char show, void *nfo);
+Uint bl_matchfileCalcMethylBias ( void *space, Uint fidx, Uint cidx, Uint pos, matchfileCross_t *cs,
+ char ref, matchfileindex_t *stats, unsigned char show, void *nfo);
+void bl_matchfileSampleStatsBS(void *space, matchfileFrame_t *frame, Uint pos, matchfileFrameStats_t *, void *nfo);
+int bl_matchfileSampleCrossSectionsBS(void *space, matchfile_t *file, fasta_t *set, Uint n,
+ void (*f)(void *, matchfileFrame_t*, Uint, matchfileFrameStats_t *, void *), void *info);
+int *bl_matchfileGetReadQualBS (void *space, matchfileFrame_t *frame, Uint pos);
+int *bl_matchfileGetReadPosBS (void *space, matchfileFrame_t *frame, Uint pos);
+
+#endif
diff --git a/segemehl/libs/fileBins.c b/segemehl/libs/fileBins.c
new file mode 100644
index 0000000..4481f51
--- /dev/null
+++ b/segemehl/libs/fileBins.c
@@ -0,0 +1,956 @@
+/*
+ * fileBins.c
+ * segemehl
+ *
+ * Created by Steve Hoffmann on 09.02.10.
+ * Copyright 2010 University Leipzig.
+ * All rights reserved.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+//#include <malloc/malloc.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <pthread.h>
+
+#include "memory.h"
+#include "fileio.h"
+#include "basic-types.h"
+#include "radixsort.h"
+#include "fileBins.h"
+#include "info.h"
+#include "debug.h"
+
+
+/* NOTE(review): this macro is defined after all system headers above have
+ * already been included. Feature-test macros such as _FILE_OFFSET_BITS
+ * normally have to be defined before the first system header to take
+ * effect, so this definition presumably changes nothing in this file -
+ * confirm whether the build defines it globally instead. */
+#define _FILE_OFFSET_BITS 64
+
+
+/*---------------------------- bl_fileBinsGetInfo ----------------------------
+ *
+ * @brief  log the number of bins and, for every bin, its file name, class
+ *         name and covered range
+ * @param  fb  bin container to describe; NULL is reported and ignored
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fileBinsGetInfo (bl_fileBins_t *fb)
+{
+  Uint i;
+  bl_fileBinClass_t *id;
+  char *classname;
+  LLint start, end;
+
+  if(!fb) {
+    DBG("fileBins not initialized:\n", NULL);
+    return;
+  }
+
+  NFO("total number of filebins: %d\n", fb->noofbins);
+  for(i=0; i < fb->noofbins; i++) {
+    /* a bin may carry no class at all (id == NULL) or a class without a
+     * name; neither may be dereferenced or handed to %s */
+    id = fb->b[i].id;
+    classname = (id && id->classname) ? id->classname : "(null)";
+    start = (id) ? id->start : 0;
+    end = (id) ? id->end : 0;
+    NFO("Bin[%d] %s (classname: %s, range:%lld-%lld)\n", i, fb->b[i].fname,
+        classname, start, end);
+  }
+  return ;
+}
+
+
+/*------------------------- bl_fileBinsDomainGetInfo -------------------------
+ *
+ * @brief  log the number of domains, then name, size and bin details of
+ *         every domain
+ * @param  dms  domain container to describe (dereferenced without a check)
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fileBinsDomainGetInfo (bl_fileBinDomains_t *dms) {
+  Uint d = 0;
+  bl_fileBinDomain_t *cur;
+
+  NFO("total number of domains: %d\n", dms->noofdomains);
+  while(d < dms->noofdomains) {
+    cur = &dms->domain[d];
+    NFO("Domain[%d] %s, domainsize: %d\n",
+        d, cur->domainname, cur->domainsize);
+    /* per-bin details are delegated to the bin level reporter */
+    bl_fileBinsGetInfo(&cur->bins);
+    d++;
+  }
+}
+
+
+
+/*---------------------------- bl_fileBinsWriteLn ----------------------------
+ *
+ * @brief  append one line plus a newline to a bin and count it
+ * @param  space  not used
+ * @param  fx     bin whose stream fx->fp receives the line
+ * @param  line   text to write; the newline is added here
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fileBinsWriteLn (void *space, bl_fileBin_t *fx, char *line)
+{
+  FILE *out = fx->fp;
+
+  fprintf(out, "%s\n", line);
+  fx->lines += 1;
+}
+
+
+/*---------------------------- bl_fileBinsIsOpen -----------------------------
+ *
+ * @brief  tell whether a bin currently has a stream attached
+ * @param  fx  bin to inspect
+ * @return 1 if fx->fp is set, 0 otherwise
+ * @author Steve Hoffmann
+ *
+ */
+
+unsigned char
+bl_fileBinsIsOpen (bl_fileBin_t *fx)
+{
+  if (fx->fp == NULL) {
+    return 0;
+  }
+  return 1;
+}
+
+/*----------------------------- bl_fileBinsClose ------------------------------
+ *
+ * @brief  close the stream of a bin and mark the bin as closed
+ * @param  fx  bin with an open stream (asserted)
+ * @return the result of fclose
+ * @author Steve Hoffmann
+ *
+ */
+
+
+int
+bl_fileBinsClose(bl_fileBin_t *fx){
+  FILE *stream = fx->fp;
+
+  assert(stream);
+  /* forget the stream first; the local copy is what gets closed */
+  fx->fp = NULL;
+
+  return fclose(stream);
+}
+
+
+
+/*----------------------------- bl_fileBinsUnlock -----------------------------
+ *
+ * @brief  release the mutex of a bin
+ * @param  bin  bin whose mutex is released; the mutex is expected to be
+ *              locked already (asserted via a trylock probe)
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fileBinsUnlock (bl_fileBin_t *bin)
+{
+  int state;
+
+  assert(bin);
+  /* probe outside of assert() so the call also happens with NDEBUG */
+  state = pthread_mutex_trylock(bin->mtx);
+  assert(state == EBUSY);
+  pthread_mutex_unlock(bin->mtx);
+}
+
+
+/*------------------------------ bl_fileBinsLock -----------------------------
+ *
+ * @brief  acquire the mutex of a bin, blocking until it is available
+ * @param  bin  bin to lock (asserted to be non-NULL)
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fileBinsLock (bl_fileBin_t *bin)
+{
+  pthread_mutex_t *guard;
+
+  assert(bin);
+  guard = bin->mtx;
+  pthread_mutex_lock(guard);
+}
+
+/*----------------------------- bl_fileBinsOpen ------------------------------
+ *
+ * @brief  open the file of a bin unless it is open already
+ * @param  space  not used
+ * @param  bin    bin to open; bin->fname names the file
+ * @param  mode   fopen mode, ignored if the bin already has a stream
+ * @return the stream of the bin; the process exits if the file cannot be
+ *         opened
+ * @author Steve Hoffmann
+ *
+ */
+
+
+FILE*
+bl_fileBinsOpen(void *space, bl_fileBin_t* bin, const char *mode){
+  int err;
+
+  if (!bin->fp) {
+    bin->fp = fopen(bin->fname, mode);
+    /* keep the reason now: the logging below may overwrite errno */
+    err = errno;
+
+    if(!bin->fp) {
+      DBG("filebins couldnt open file %s in mode '%s'. Exit forced.\n", bin->fname, mode);
+      printf( "Error opening file: %s\n", strerror( err ) );
+      exit(-1);
+    }
+  }
+
+  return bin->fp;
+}
+
+/*----------------------------- bl_fileBinsFind -----------------------------
+ *
+ * @brief  return the first bin accepted by a selector
+ * @param  space     not used
+ * @param  bins      bin container to search
+ * @param  selector  called with (bin id, nfo); non-zero means "take it"
+ * @param  nfo       passed through to the selector
+ * @return pointer to the first accepted bin or NULL if none matches
+ * @author Steve Hoffmann
+ *
+ */
+
+
+bl_fileBin_t *
+bl_fileBinsFind (void *space, bl_fileBins_t* bins,
+    int (*selector)(void *id, void *nfo), void *nfo)
+{
+  Uint k = 0;
+  bl_fileBin_t *hit = NULL;
+
+  while(hit == NULL && k < bins->noofbins) {
+    if(selector(bins->b[k].id, nfo)) {
+      hit = &bins->b[k];
+    }
+    k++;
+  }
+
+  return hit;
+}
+
+
+/*----------------------------- bl_fileBinsInit ------------------------------
+ *
+ * @brief  reset a bin container to the empty state (no bins, no array)
+ * @param  space  not used
+ * @param  bins   container to reset; nothing it held before is released
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fileBinsInit(void *space, bl_fileBins_t *bins) {
+
+  bins->noofbins = 0;
+  bins->b = NULL;
+}
+
+
+
+/*----------------- bl_fileBinsDomainsGetNames(space, desc) ------------------
+ *
+ * @brief pass list of k domain names and their length
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_fileBinsDomainsGetList(void *space, bl_fileBinDomains_t *domains,
+ char **domainnames[], Uint **domainsizes)
+{
+ Uint i=0, k=0;
+ char **list=NULL;
+ Uint *ll = NULL;
+
+ for(i=0; i < domains->noofdomains; i++) {
+ if(list == NULL ||
+ list[k-1] != domains->domain[i].domainname) {
+ list = ALLOCMEMORY(space, list, char*, k+1);
+ ll = ALLOCMEMORY(space, ll, Uint, k+1);
+ list[k] = domains->domain[i].domainname;
+ ll[k] = domains->domain[i].domainsize;
+ k++;
+ }
+ }
+
+ *domainnames = list;
+ *domainsizes = ll;
+ return k;
+}
+
+/*--------------------------- bl_fileBinsCloseAll ----------------------------
+ *
+ * @brief  close every bin of a container that is currently open
+ * @param  bins  bin container (asserted to be non-NULL)
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fileBinsCloseAll (bl_fileBins_t *bins)
+{
+  Uint k;
+  bl_fileBin_t *cur;
+
+  assert(bins);
+  for(k=0; k < bins->noofbins; k++) {
+    cur = &bins->b[k];
+    if(!bl_fileBinsIsOpen(cur)) {
+      continue;
+    }
+    bl_fileBinsClose(cur);
+  }
+}
+
+/*------------------------------ bl_fileBinsAdd ------------------------------
+ *
+ * @brief  append file bins to a bin container
+ * @param  space      handed to ALLOCMEMORY and to the assigner
+ * @param  bins       container to grow; must have been initialized, its
+ *                    existing bins are kept untouched
+ * @param  add        number of bins to append
+ * @param  assigner   optional; called with (space, bin index, nfo) to
+ *                    create the class of each new bin, NULL leaves id NULL
+ * @param  nfo        passed through to the assigner
+ * @param  filenames  optional; file name of bin i is filenames[i] (index
+ *                    within the whole container). If NULL a temporary file
+ *                    is created per bin from template
+ * @param  template   prefix for temporary file names
+ * @param  tmplen     length of template
+ * @author Steve Hoffmann
+ *
+ */
+
+
+void
+bl_fileBinsAdd (void *space, bl_fileBins_t *bins, Uint add,
+    bl_fileBinClass_t* (*assigner)(void *, int, void *), void *nfo, char **filenames,
+    char *template, Uint tmplen) {
+  Uint i, first;
+  bl_fileBin_t *bin;
+
+  /* grow the existing array; the bins already present keep their state */
+  first = bins->noofbins;
+  bins->b = ALLOCMEMORY(space, bins->b, bl_fileBin_t, first+add);
+  bins->noofbins = first+add;
+
+  /* only the appended bins are set up */
+  for(i=first; i < bins->noofbins; i++) {
+    bin = &bins->b[i];
+
+    if(filenames == NULL) {
+      bin->fname = bl_getTempFile(template, tmplen);
+    } else {
+      bin->fname = filenames[i];
+    }
+
+    bin->unlinked = 0;
+
+    if (assigner)
+      bin->id = assigner(space, i, nfo);
+    else
+      bin->id = NULL;
+
+    bin->lines = 0;
+    bin->sorted = 0;
+    bin->maxsize = 0;
+    bin->fp = NULL;
+    bin->mtx = ALLOCMEMORY(space, NULL, pthread_mutex_t, 1);
+    pthread_mutex_init(bin->mtx, NULL);
+  }
+
+  return;
+}
+
+
+/*--------------------------- bl_fileBinsDestruct ----------------------------
+ *
+ * @brief  release all bins of a container and reset it to empty
+ * @param  space  handed to FREEMEMORY
+ * @param  bins   container to clear
+ * @note   file name, class and mutex of every bin are freed here, so file
+ *         names handed to bl_fileBinsAdd end up being freed as well.
+ *         Streams are not closed and class names are not freed.
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fileBinsDestruct (void *space, bl_fileBins_t *bins)
+{
+  Uint i;
+
+  for(i=0; i < bins->noofbins; i++) {
+    FREEMEMORY(space, bins->b[i].fname);
+    FREEMEMORY(space, bins->b[i].id);
+    /* a mutex set up with pthread_mutex_init has to be destroyed before
+     * its storage is given back */
+    if(bins->b[i].mtx) {
+      pthread_mutex_destroy(bins->b[i].mtx);
+    }
+    FREEMEMORY(space, bins->b[i].mtx);
+  }
+
+  FREEMEMORY(space, bins->b);
+  bins->noofbins = 0;
+  bins->b = NULL;
+  return;
+}
+
+
+/*------------------------ bl_fileBinDomainsDestruct -------------------------
+ *
+ * @brief  release the bins and the name of every domain, then the domain
+ *         array itself; the container struct is left empty, not freed
+ * @param  space  handed to FREEMEMORY and bl_fileBinsDestruct
+ * @param  dms    domain container to clear
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fileBinDomainsDestruct (void *space, bl_fileBinDomains_t *dms)
+{
+  Uint d;
+  bl_fileBinDomain_t *cur;
+
+  for(d=0; d < dms->noofdomains; d++) {
+    cur = &dms->domain[d];
+    bl_fileBinsDestruct(space, &cur->bins);
+    FREEMEMORY(space, cur->domainname);
+  }
+
+  FREEMEMORY(space, dms->domain);
+
+  dms->domain = NULL;
+  dms->noofdomains = 0;
+}
+
+/*------------------------ bl_fileBinDomainsCloseAll -------------------------
+ *
+ * @brief  close all open file bins of all domains
+ * @param  dms  domain container; its domain array is asserted to exist
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fileBinDomainsCloseAll (bl_fileBinDomains_t *dms)
+{
+  Uint d = 0;
+
+  assert(dms->domain);
+
+  while(d < dms->noofdomains) {
+    bl_fileBinsCloseAll(&dms->domain[d].bins);
+    d++;
+  }
+}
+
+
+/*-------------------------- bl_fileBinsDomainsInit --------------------------
+ *
+ * @brief  initialize file bin domains: every domain is cut into bins of
+ *         equal size, the bin size being a power of two (2^exp)
+ * @param  space             handed to the allocation helpers
+ * @param  domainnames       one name per domain; the names are copied
+ * @param  domainsizes       one size per domain
+ * @param  noofdomains       number of domains, at most maxbins
+ * @param  totalsize         sum of all domain sizes
+ * @param  avgbins           desired number of bins, 1..maxbins
+ * @param  maxbins           upper limit used to enlarge the bin size
+ * @param  filenametemplate  prefix of the temporary bin files
+ * @param  tmplen            length of filenametemplate
+ * @return the new domain container or NULL if the arguments are rejected
+ *         or the bin size would reach 2^31
+ * @author Steve Hoffmann
+ *
+ */
+
+bl_fileBinDomains_t*
+bl_fileBinsDomainsInit(void *space, char **domainnames, Uint *domainsizes,
+    Uint noofdomains, Uint totalsize, Uint avgbins, Uint maxbins,
+    char *filenametemplate, Uint tmplen){
+
+  Uint i, j, noofbins, e=0, binsize, maxbinperdomain,
+       maxdomainsize=0, est=0;
+  const Uint maxshift = sizeof(Uint)*CHAR_BIT;
+  size_t namelen;
+  bl_fileBinDomains_t* dms;
+  bl_fileBinClass_t *ptr;
+
+
+  if(noofdomains > maxbins || maxbins == 0 || avgbins > maxbins) {
+    DBG("bl_fileBinDomainsInit: maxbins=%u < %u=noofdomains\n", maxbins, noofdomains);
+    return NULL;
+  }
+
+  /* avgbins is used as a divisor below */
+  if(avgbins == 0) {
+    DBG("bl_fileBinDomainsInit: avgbins=%u must be positive\n", avgbins);
+    return NULL;
+  }
+
+  for (i=0; i < noofdomains; i++) {
+    if (domainsizes[i] > maxdomainsize) maxdomainsize = domainsizes[i];
+  }
+
+  /* integer division as before; a result of 0 would make binsize-1 wrap */
+  binsize = totalsize/avgbins;
+  if (binsize == 0) binsize = 1;
+
+  /* smallest e >= 1 with 2^e >= binsize; never shift by the type width */
+  do {
+    e++;
+  } while (e < maxshift && ((binsize-1) >> e) >= 1);
+
+  if (e >= 31) {
+    DBG("bl_fileBinDomainsInit: binsize 2^%u is out of range.\n", e);
+    return NULL;
+  }
+
+  binsize = ((Uint)1 << e);
+
+  for(i=0; i < noofdomains; i++) {
+    est += ceil((double)domainsizes[i]/binsize);
+  }
+
+  if(est >= maxbins) {
+    /* too many bins: size the bins by the largest domain instead. e is
+     * carried over from above, so the bin size at least doubles */
+    maxbinperdomain = maxbins/noofdomains;
+    binsize = maxdomainsize/maxbinperdomain + 1;
+    do {
+      e++;
+    } while (e < maxshift && ((binsize-1) >> e) >= 1);
+
+    /* check the exponent before shifting */
+    if (e >= 31) {
+      DBG("bl_fileBinDomainsInit: binsize 2^%u is out of range.\n", e);
+      return NULL;
+    }
+    binsize = ((Uint)1 << e);
+  }
+
+  /* allocate only after all checks passed, no error path can leak */
+  dms = ALLOCMEMORY(space, NULL, bl_fileBinDomains_t, 1);
+  dms->noofdomains = noofdomains;
+  dms->exp = e;
+  dms->domain = ALLOCMEMORY(space, NULL, bl_fileBinDomain_t, noofdomains);
+
+  for (i=0; i < noofdomains; i++) {
+    namelen = strlen(domainnames[i]);
+    dms->domain[i].domainsize = domainsizes[i];
+    dms->domain[i].domainname = ALLOCMEMORY(space, NULL, char, namelen+1);
+    memmove(dms->domain[i].domainname, domainnames[i], namelen);
+    dms->domain[i].domainname[namelen] = '\0';
+
+    /* full bins plus one more for a remainder */
+    noofbins = (domainsizes[i] >> e) + ((domainsizes[i] & (binsize-1)) > 0);
+
+    bl_fileBinsInit(space, &dms->domain[i].bins);
+    bl_fileBinsAdd (space, &dms->domain[i].bins, noofbins, NULL, NULL, NULL,
+        filenametemplate, tmplen);
+
+    dms->domain[i].bins.noofbins = noofbins;
+
+    for (j=0; j < noofbins; j++) {
+      ptr = ALLOCMEMORY(space, NULL, bl_fileBinClass_t, 1);
+      ptr->start = (LLint)j*binsize;
+      ptr->end = (LLint)(j+1)*binsize -1;
+      ptr->classname = NULL;
+      ptr->classno = 0;
+      dms->domain[i].bins.b[j].id = ptr;
+    }
+  }
+
+  return dms;
+}
+
+
+/*------------------------- bl_fileBinsDomainGetBin --------------------------
+ *
+ * @brief  find a domain by name and return the bin covering a position
+ * @param  dms         domain container
+ * @param  domainname  name of the domain, compared with strcmp
+ * @param  pos         position within the domain
+ * @return the bin with index pos >> exp, or NULL if the domain is unknown
+ *         or the position lies beyond the last bin of the domain
+ * @author Steve Hoffmann
+ *
+ */
+
+bl_fileBin_t *
+bl_fileBinsDomainGetBin (bl_fileBinDomains_t *dms, char *domainname, Uint pos)
+{
+  Uint i, slot;
+
+  for(i=0; i < dms->noofdomains; i++) {
+    if(strcmp(dms->domain[i].domainname, domainname) == 0) break;
+  }
+
+  if (i == dms->noofdomains)
+    return NULL;
+
+  /* never index past the bins that exist for this domain */
+  slot = pos >> dms->exp;
+  if (slot >= dms->domain[i].bins.noofbins)
+    return NULL;
+
+  return &dms->domain[i].bins.b[slot];
+}
+
+
+
+/*--------------------------- bl_fileBinsUnixSort ----------------------------
+ *
+ * @brief  run bl_UnixSort on the file of every bin, one after another
+ * @param  space   handed to bl_UnixSort
+ * @param  fb      bin container
+ * @param  fldstr  passed to bl_UnixSort for every file
+ * @param  delim   passed to bl_UnixSort for every file
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fileBinsUnixSort (void *space, bl_fileBins_t *fb, const char *fldstr, const char delim)
+{
+  Uint k = 0;
+
+  while(k < fb->noofbins) {
+    NFO("sorting file '%s'.\n", fb->b[k].fname);
+    /* the result of bl_UnixSort is not evaluated */
+    bl_UnixSort(space, fb->b[k].fname, fldstr, delim);
+    k++;
+  }
+}
+
+
+
+
+/*------------------------ bl_fileBinDomainsUnixSort -------------------------
+ *
+ * @brief  run bl_UnixSort on every bin file of every domain
+ * @param  space   handed to bl_UnixSort
+ * @param  dms     domain container
+ * @param  fldstr  passed to bl_UnixSort for every file
+ * @param  delim   passed to bl_UnixSort for every file
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fileBinDomainsUnixSort (void *space, bl_fileBinDomains_t *dms, const char *fldstr, const char delim)
+{
+  Uint d, k;
+  bl_fileBins_t *bins;
+
+  for(d=0; d < dms->noofdomains; d++) {
+    NFO("sorting domain %d.\n", d);
+    bins = &dms->domain[d].bins;
+    k = 0;
+    while(k < bins->noofbins) {
+      /* the result of bl_UnixSort is not evaluated */
+      bl_UnixSort(space, bins->b[k].fname, fldstr, delim);
+      k++;
+    }
+  }
+}
+
+/*
+ * Merge the bin files of every domain into one file per domain by means of
+ * bl_UnixSortMerge. The output file is named "<bname>_<domain>.<suf>",
+ * where <domain> is the domain name after bl_replacenonalphanum.
+ *
+ * space    handed to the allocation helpers and bl_UnixSortMerge
+ * dms      domain container
+ * bname    prefix of the output names, bnamelen its length
+ * suf      suffix of the output names, suflen its length
+ * fldstr   passed to bl_UnixSortMerge
+ * delim    passed to bl_UnixSortMerge
+ * remove   not used: the bin files are left in place
+ */
+void
+bl_fileBinDomainsSortMerge(void *space, bl_fileBinDomains_t *dms,
+    char *bname, Uint bnamelen,
+    char *suf, Uint suflen,
+    const char *fldstr, const char delim,
+    unsigned char remove)
+{
+
+  char *cname, *ccname, *newname;
+  char **filenames;
+  Uint i,j,cnamelen;
+
+  for(i=0; i < dms->noofdomains; i++) {
+
+    filenames = ALLOCMEMORY(space, NULL, char*, dms->domain[i].bins.noofbins);
+    cname = dms->domain[i].domainname;
+    cnamelen = strlen(cname);
+    ccname = bl_replacenonalphanum(cname, cnamelen);
+    /* +4: '_', '.', terminator and one spare byte */
+    newname = ALLOCMEMORY(space, NULL, char, cnamelen + bnamelen + suflen + 4);
+    sprintf(newname, "%s_%s.%s", bname, ccname, suf);
+
+    for(j=0; j < dms->domain[i].bins.noofbins; j++) {
+      filenames[j] = dms->domain[i].bins.b[j].fname;
+    }
+
+    bl_UnixSortMerge(space, filenames, dms->domain[i].bins.noofbins, fldstr, delim, newname);
+
+    /* only the pointer list is released, the names belong to the bins */
+    FREEMEMORY(space, filenames);
+    FREEMEMORY(space, ccname);
+    FREEMEMORY(space, newname);
+  }
+
+}
+
+
+/*-------------------------- bl_fileBinDomainsMerge --------------------------
+ *
+ * @brief  concatenate the bin files of every domain, in bin order, into
+ *         one file per domain named "<bname>_<domain>.<suf>"
+ * @param  space     handed to the allocation helpers and bl_rm
+ * @param  dms       domain container
+ * @param  bname     prefix of the output names, bnamelen its length
+ * @param  suf       suffix of the output names, suflen its length
+ * @param  header    optional; header[i] is written at the top of the file
+ *                   of domain i, which is truncated first. Without a
+ *                   header the output file is opened in append mode only
+ * @param  remove    if set, every bin file is removed after it was copied
+ *                   and the bin is marked as unlinked
+ * @note   the process exits if a file cannot be opened or written
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fileBinDomainsMerge (void *space, bl_fileBinDomains_t *dms,
+    char *bname, Uint bnamelen,
+    char *suf, Uint suflen, char **header,
+    unsigned char remove)
+{
+
+  FILE *outfile=NULL, *fp;
+  size_t buffersize = 1024, len;
+  off_t fs;
+  char *buffer, *cname, *ccname, *newname;
+  Uint i,j,cnamelen;
+
+  buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+
+  for(i=0; i < dms->noofdomains; i++) {
+
+    cname = dms->domain[i].domainname;
+    cnamelen = strlen(cname);
+    ccname = bl_replacenonalphanum(cname, cnamelen);
+    newname = ALLOCMEMORY(space, NULL, char, cnamelen + bnamelen + suflen + 4);
+    sprintf(newname, "%s_%s.%s", bname, ccname, suf);
+
+    if(header && header[i]) {
+      outfile = fopen(newname,"w");
+      if (!outfile) {
+        DBG("Opening of file %s failed. Exit forced.\n", newname);
+        exit(EXIT_FAILURE);
+      }
+      fprintf(outfile, "%s", header[i]);
+      fclose(outfile);
+    }
+
+    outfile = fopen(newname, "ab");
+    if (!outfile) {
+      DBG("Opening of file %s failed. Exit forced.\n", newname);
+      exit(EXIT_FAILURE);
+    }
+
+    for(j=0; j < dms->domain[i].bins.noofbins; j++) {
+
+      fp = fopen(dms->domain[i].bins.b[j].fname, "rb");
+
+      if (fp == NULL){
+        DBG("Opening of file %s failed. Exit forced.\n",
+            dms->domain[i].bins.b[j].fname);
+        exit(EXIT_FAILURE);
+      }
+
+      /* remember the size to detect an incomplete copy afterwards */
+      fseek (fp , 0 , SEEK_END);
+      fs = ftello(fp);
+      rewind (fp);
+
+      while((len = fread(buffer, 1, buffersize, fp)) > 0) {
+        if (fwrite(buffer, 1, len, outfile) != len) {
+          DBG("Writing to file %s failed. Exit forced.\n", newname);
+          exit(EXIT_FAILURE);
+        }
+        fs -= (off_t) len;
+      }
+
+      if(fs > 0) {
+        DBG("Could not read %s entirely (fs:%lld)\n",
+            dms->domain[i].bins.b[j].fname, (long long) fs);
+      }
+
+      fclose(fp);
+      if (remove) {
+        bl_rm(space, dms->domain[i].bins.b[j].fname);
+        dms->domain[i].bins.b[j].unlinked = 1;
+      }
+    }
+
+    /* every domain has its own output stream, close it with the domain */
+    fclose(outfile);
+    outfile = NULL;
+
+    FREEMEMORY(space, newname);
+    FREEMEMORY(space, ccname);
+  }
+
+  FREEMEMORY(space, buffer);
+  return ;
+}
+
+
+
+
+
+/* helpers
+ *
+ *
+ *
+ *
+ *
+ */
+
+
+/*--------------------------- bl_fileBinsSortLine ----------------------------
+ * 
+ * @brief sort the lines of every bin file by a numeric key
+ * @param space     handed to the allocation helpers, bl_fgets and the sort
+ * @param bins      bin container; bins->b[i].lines sizes the key table
+ * @param fileSort  requests re-reading lines from the file instead of
+ *                  keeping them in memory; see the note in the loop
+ * @param filename  if given, the sorted lines of all bins are written to
+ *                  this one file; if NULL each bin file is replaced by its
+ *                  sorted version
+ * @param remove    with a filename: unlink each bin file after sorting
+ * @param keyaccess returns the sort key of a line
+ * @param nfo       passed through to keyaccess
+ * @author Steve Hoffmann 
+ * 
+ */
+
+void
+bl_fileBinsSortLine(void *space, bl_fileBins_t* bins, 
+ unsigned char fileSort, char *filename, unsigned char remove,
+ LLint (*keyaccess)(char *, void*), void* nfo) {
+
+ Uint i, k, j=0, len;
+ bl_fileSort_t *data;
+ LLint res;
+ char *line, *tmpname=NULL;
+ off_t fs;
+ int fseekres;
+ size_t fwriteres, freadres;
+ FILE *fp;
+ FILE *outfile;
+ struct stat st;
+ char *template = "filebinsort";
+
+ /* output goes to the given file or to a fresh temporary file */
+ if (filename) {
+ outfile = fopen(filename, "w");
+ } else {
+ tmpname = bl_getTempFile(template, 11);
+ outfile = fopen(tmpname, "w");
+ }
+
+ if (outfile == NULL){
+ fprintf(stderr, "Opening temp file failed. Exit forced.\n");
+ exit(EXIT_FAILURE);
+ }
+
+ for(i=0; i < bins->noofbins; i++) {
+
+ /* one table entry per line written to this bin */
+ data = malloc(sizeof(bl_fileSort_t)*bins->b[i].lines);
+ if(!data) {
+ bins->b[i].sorted = 0;
+ fprintf(stderr,"warning: not enough memory for fileBins. Try unix sort.");
+ continue;
+ } else {
+ /* NOTE(review): fileSort is forced on as soon as one table could be
+ * allocated, so the in-memory branches below are never taken after
+ * this point - confirm that this is intended */
+ fileSort=1;
+ }
+
+ /* probe whether a buffer of the file's size could be allocated; the
+ * buffer itself is released right away */
+ if (stat(bins->b[i].fname, &st) == 0) {
+ line = malloc(st.st_size);
+ if(!line) {
+ bins->b[i].sorted = 0;
+ fprintf(stderr,"warning: not enough memory for fileBins. Try sort.");
+ /* NOTE(review): data is not released on this path */
+ continue;
+ }
+ free(line);
+ } else {
+ /* NOTE(review): data is not released on this path */
+ continue;
+ }
+
+ fp = fopen(bins->b[i].fname, "r");
+ if (fp == NULL){
+ fprintf(stderr, "Opening file %s failed. Exit forced.\n", 
+ bins->b[i].fname);
+ exit(EXIT_FAILURE);
+ }
+
+ /* fs tracks the file offset at which the next line starts */
+ fs = ftello(fp);
+ if (fs == -1) {
+ fprintf(stderr,"File access error for %s. Exit forced.\n", 
+ bins->b[i].fname);
+ exit(EXIT_FAILURE);
+ }
+ j = 0;
+
+ /* pass 1: record key, start offset and length of every line */
+ while((len = bl_fgets(space, fp, &line)) != EOF) {
+ res = keyaccess(line, nfo);
+
+ data[j].key = res;
+ data[j].ptr = fs;
+ data[j].len = len;
+
+ if (fileSort) {
+ FREEMEMORY(space, line);
+ data[j].line = NULL;
+ fs = ftello(fp);
+
+ if (fs == -1) {
+ fprintf(stderr,"File access error for %s. Exit forced.\n", 
+ bins->b[i].fname);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ data[j].line = line;
+ }
+ j += 1;
+ }
+
+ /* NOTE(review): line was already released above (free or FREEMEMORY);
+ * whether this second release is harmless depends on FREEMEMORY
+ * resetting the pointer - verify against its definition */
+ if(fileSort) FREEMEMORY(space, line);
+ /* presumably orders the j entries by their key field - confirm against
+ * the radix sort implementation */
+ bl_radixSortKeyFirst(space, data, sizeof(bl_fileSort_t), j, 16);
+
+ /* pass 2: emit the lines in table order */
+ for(k=0; k < j; k++) {
+ if(fileSort) {
+ /* copy len+1 bytes from the recorded offset: the line plus the
+ * character that terminated it */
+ line = ALLOCMEMORY(space, NULL, sizeof(char), data[k].len+1);
+ fseekres = fseeko(fp, data[k].ptr, 0);
+ if (fseekres == -1) {
+ fprintf(stderr,"File access error for %s. Exit forced.\n", 
+ bins->b[i].fname);
+ exit(EXIT_FAILURE);
+ }
+ freadres = fread(line, sizeof(char), data[k].len+1, fp);
+ if (freadres != data[k].len+1) {
+ fprintf(stderr,"File access error for %s. Exit forced.\n", 
+ bins->b[i].fname);
+ exit(EXIT_FAILURE);
+ }
+ fwriteres = fwrite(line, sizeof(char), data[k].len+1, outfile);
+ if (fwriteres != data[k].len+1) {
+ fprintf(stderr,"File access error for %s. Exit forced.\n", tmpname);
+ exit(EXIT_FAILURE);
+ }
+ FREEMEMORY(space, line);
+ } else {
+ fprintf(outfile, "%s\n", data[k].line);
+ FREEMEMORY(space, data[k].line);
+ }
+ }
+
+ fclose(fp);
+
+ if (filename == NULL) {
+ /* per-bin mode: the temporary file takes the place of the bin file,
+ * then the temporary name is opened again for the next bin */
+ fclose(outfile);
+ unlink(bins->b[i].fname);
+ rename(tmpname, bins->b[i].fname);
+ outfile = fopen(tmpname, "w");
+ if (outfile == NULL){
+ fprintf(stderr, "Opening temp file failed. Exit forced.\n");
+ exit(EXIT_FAILURE);
+ }
+ } else if(remove) {
+ unlink(bins->b[i].fname);
+ bins->b[i].unlinked = 1;
+ }
+
+ FREEMEMORY(space, data);
+ }
+
+ /* NOTE(review): tmpname is not released and, in per-bin mode, the file
+ * opened last under that name is left behind - confirm */
+ fclose(outfile);
+ return;
+}
+
+
+/*----------------------------- bl_fileBinsMerge -----------------------------
+ *
+ * @brief  write the lines of all bin files, in bin order, to one file
+ * @param  space     handed to bl_fgets and FREEMEMORY
+ * @param  filename  output file, created or truncated
+ * @param  bins      bin container whose files are read
+ * @param  delete    not evaluated: the bin files are never removed here
+ * @note   the process exits if a file cannot be opened. Only lines that
+ *         bl_fgets returns are copied.
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fileBinsMerge(void *space, char *filename, bl_fileBins_t* bins,
+    unsigned char delete) {
+
+  FILE *outfile, *fp;
+  char *line;
+  Uint i, len;
+
+  outfile = fopen(filename, "w");
+  if (outfile == NULL){
+    fprintf(stderr, "Opening of file %s failed. Exit forced.\n", filename);
+    exit(EXIT_FAILURE);
+  }
+
+  for(i=0; i < bins->noofbins; i++) {
+    fp = fopen(bins->b[i].fname, "r");
+    if (fp == NULL){
+      fprintf(stderr, "Opening of file %s failed. Exit forced.\n",
+          bins->b[i].fname);
+      exit(EXIT_FAILURE);
+    }
+    fprintf(stderr, "start file\n");
+
+    while((len = bl_fgets(space, fp, &line)) != EOF) {
+      fprintf(outfile, "%s\n", line);
+      /* every returned line is a fresh buffer owned by the caller */
+      FREEMEMORY(space, line);
+    }
+
+    fclose(fp);
+  }
+
+  /* closing flushes the merged output */
+  fclose(outfile);
+}
+
+
+/*------------------------- bl_fileBinsCClassSelect --------------------------
+ *
+ * @brief  selector for bl_fileBinsFind that matches bins by class name
+ * @param  id   class of a bin (bl_fileBinClass_t*), may be NULL
+ * @param  nfo  class name to look for (char*)
+ * @return 1 if the class name equals nfo or starts with '*', else 0.
+ *         A missing class, class name or selection string never matches.
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+bl_fileBinsCClassSelect (void *id, void *nfo)
+{
+  bl_fileBinClass_t *elem;
+  char *toSelect;
+
+  elem = (bl_fileBinClass_t*) id;
+  toSelect = (char*) nfo;
+
+  /* bins without class or class name exist and must not reach strcmp */
+  if (elem == NULL || elem->classname == NULL) {
+    return 0;
+  }
+
+  /* a leading '*' accepts every selection */
+  if (elem->classname[0]=='*') {
+    return 1;
+  }
+
+  if (toSelect != NULL && strcmp(elem->classname, toSelect) == 0) {
+    return 1;
+  }
+
+  return 0;
+}
+
+
+/*------------------------- bl_fileBinsCClassRename --------------------------
+ *
+ * @brief  rename every bin file to "<bname>_<class>.<suf>", where <class>
+ *         is the class name of the bin after bl_replacenonalphanum
+ * @param  space     handed to the allocation helpers
+ * @param  fb        bin container; every bin needs a class with a name
+ * @param  bname     prefix of the new names, bnamelen its length
+ * @param  suf       suffix of the new names, suflen its length
+ * @note   the bins keep their old fname entries; a failing rename is only
+ *         caught by an assertion
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fileBinsCClassRename (void *space, bl_fileBins_t *fb,
+    char *bname, Uint bnamelen, char *suf, Uint suflen)
+{
+  char *target, *classname, *clean;
+  Uint k = 0, classlen;
+  int status;
+
+  while(k < fb->noofbins) {
+    classname = fb->b[k].id->classname;
+    classlen = strlen(classname);
+    clean = bl_replacenonalphanum(classname, classlen);
+    target = ALLOCMEMORY(space, NULL, char, classlen + bnamelen + suflen + 4);
+    sprintf(target, "%s_%s.%s", bname, clean, suf);
+
+    status = rename(fb->b[k].fname, target);
+    assert(status != -1);
+
+    FREEMEMORY(space, clean);
+    FREEMEMORY(space, target);
+    k++;
+  }
+}
+
+
+
+/*-------------------------- bl_fileBinCClassAssign --------------------------
+ *
+ * @brief  assigner for bl_fileBinsAdd: create a class named after entry id
+ *         of a name list
+ * @param  space  handed to ALLOCMEMORY
+ * @param  id     index into the name list
+ * @param  nfo    name list (char**); the name is referenced, not copied
+ * @return a new class with only classname set; the other fields are left
+ *         as allocated
+ * @author Steve Hoffmann
+ *
+ */
+
+bl_fileBinClass_t*
+bl_fileBinCClassAssign (void *space, int id, void *nfo)
+{
+  char **names = (char**) nfo;
+  bl_fileBinClass_t *cls;
+
+  cls = ALLOCMEMORY(space, NULL, bl_fileBinClass_t, 1);
+  cls->classname = names[id];
+
+  return cls;
+}
+
+
+
diff --git a/segemehl/libs/fileBins.h b/segemehl/libs/fileBins.h
new file mode 100644
index 0000000..a6cea05
--- /dev/null
+++ b/segemehl/libs/fileBins.h
@@ -0,0 +1,179 @@
+#ifndef FILEBINS_H
+#define FILEBINS_H
+
+/*
+ * fileBins.h
+ * segemehl
+ *
+ * Created by Steve Hoffmann on 09.02.10.
+ * Copyright 2010 University Leipzig.
+ * All rights reserved.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/types.h>
+#include "basic-types.h"
+#include <pthread.h>
+
+
+#ifndef HAVE_FSEEKO
+ int fseeko(FILE *stream, off_t offset, int whence);
+#endif
+
+#ifndef HAVE_FTELLO
+ off_t ftello(FILE *stream);
+#endif
+
+/* Class descriptor attached to a bin through bl_fileBin_t.id. */
+typedef struct bl_fileBinClass_s{
+ /* name of the class; bl_fileBinCClassAssign stores the caller's pointer,
+ * bl_fileBinsDomainsInit sets it to NULL */
+ char *classname;
+ /* NOTE(review): not used by the functions of fileBins.c shown with this
+ * header - purpose to be confirmed */
+ int classno;
+ /* first and last position covered by the bin; bl_fileBinsDomainsInit
+ * sets them to j*binsize and (j+1)*binsize-1 for bin j */
+ LLint start;
+ LLint end;
+} bl_fileBinClass_t;
+
+/* One file bin: a file plus the bookkeeping needed to fill and sort it. */
+typedef struct fileBin_s{
+ /* stream of the bin file; NULL while the bin is closed */
+ FILE *fp;
+ /* class of the bin, may be NULL */
+ bl_fileBinClass_t *id;
+ /* name of the bin file; freed by bl_fileBinsDestruct */
+ char *fname;
+ /* set to 1 once a merge or sort routine has removed the file */
+ unsigned char unlinked;
+ /* NOTE(review): not referenced by the functions of fileBins.c shown with
+ * this header - purpose to be confirmed */
+ off_t maxsize;
+ /* mutex used by bl_fileBinsLock and bl_fileBinsUnlock; allocated and
+ * initialized by bl_fileBinsAdd */
+ pthread_mutex_t* mtx;
+ /* cleared by bl_fileBinsAdd and when bl_fileBinsSortLine runs out of
+ * memory */
+ unsigned char sorted;
+ /* number of lines written with bl_fileBinsWriteLn */
+ unsigned long long int lines;
+} bl_fileBin_t;
+
+/* Container of file bins. */
+typedef struct fileBins_s {
+ /* number of entries in b */
+ Uint noofbins;
+ /* array of bins; NULL after bl_fileBinsInit and bl_fileBinsDestruct */
+ bl_fileBin_t *b;
+} bl_fileBins_t;
+
+/* One domain: a named range that is split into bins. */
+typedef struct fileBinDomain_s {
+ /* copy of the name passed to bl_fileBinsDomainsInit */
+ char *domainname;
+ /* size of the domain as passed to bl_fileBinsDomainsInit */
+ Uint domainsize;
+ /* the bins of this domain */
+ bl_fileBins_t bins;
+} bl_fileBinDomain_t;
+
+/* Container of domains that share one bin size. */
+typedef struct fileBinDomains_t {
+ /* number of entries in domain */
+ Uint noofdomains;
+ /* bin size as an exponent: a position pos belongs to bin pos >> exp
+ * (see bl_fileBinsDomainGetBin) */
+ Uint exp; //to the base of two
+ /* array of domains */
+ bl_fileBinDomain_t *domain;
+} bl_fileBinDomains_t;
+
+/* Sort table entry of bl_fileBinsSortLine, one per line of a bin file. */
+typedef struct fileSort_s {
+ /* sort key returned by the keyaccess callback for the line */
+ LLint key;
+ /* length of the line as returned by bl_fgets */
+ Uint len;
+ /* the line itself when it is kept in memory, NULL otherwise */
+ char *line;
+ /* offset in the bin file at which the line starts */
+ off_t ptr;
+} bl_fileSort_t;
+
+
+void
+bl_fileBinsUnlock (bl_fileBin_t *bin);
+
+void
+bl_fileBinsLock (bl_fileBin_t *bin);
+
+void
+bl_fileBinsCloseAll (bl_fileBins_t *bins);
+
+void
+bl_fileBinsAdd(void *space, bl_fileBins_t* bins, Uint add,
+ bl_fileBinClass_t* (*assigner)(void *, int, void *), void *nfo, char** names,
+ char *template, Uint tmplen);
+
+void
+bl_fileBinsSortLine(void *space, bl_fileBins_t* bins,
+ unsigned char fileSort, char *filename, unsigned char ulink,
+ LLint (*key)(char *, void*), void* nfo);
+
+int
+bl_fileBinsCClassSelect (void *id, void *nfo);
+
+bl_fileBinClass_t*
+bl_fileBinCClassAssign (void *space, int id, void *nfo);
+
+int
+bl_fileBinsClose(bl_fileBin_t *fx);
+
+void
+bl_fileBinsDestruct (void *space, bl_fileBins_t *bins);
+
+void
+bl_fileBinsInit(void *space, bl_fileBins_t *bins);
+
+bl_fileBin_t *
+bl_fileBinsFind (void *space, bl_fileBins_t* bins,
+ int (*selector)(void *id, void *nfo), void *nfo);
+
+FILE*
+bl_fileBinsOpen(void *space, bl_fileBin_t *bin, const char *mode);
+
+void
+bl_fileBinsWriteLn(void *space, bl_fileBin_t *fx, char *line);
+
+unsigned char
+bl_fileBinsIsOpen (bl_fileBin_t *fx);
+
+char *
+bl_fileBinsGetTemp(char *tmp, Uint tmplen);
+
+void
+bl_fileBinsGetInfo(bl_fileBins_t *);
+
+void
+bl_fileBinsCClassRename (void *space, bl_fileBins_t *fb,
+ char *bname, Uint bnamelen, char *suf, Uint suflen);
+
+void
+bl_fileBinsUnixSort (void *space, bl_fileBins_t *fb, const char *fieldstring, const char delim);
+
+bl_fileBinDomains_t*
+bl_fileBinsDomainsInit(void *space, char **domainnames, Uint *domainsizes,
+ Uint noofdomains, Uint total, Uint avgbins, Uint maxbins, char *filenametemplate, Uint tmplen);
+
+bl_fileBin_t *
+bl_fileBinsDomainGetBin (bl_fileBinDomains_t *dms, char *domainname, Uint pos);
+
+void
+bl_fileBinDomainsDestruct (void *space, bl_fileBinDomains_t *dms);
+
+
+void
+bl_fileBinsDomainGetInfo (bl_fileBinDomains_t *dms);
+
+void
+bl_fileBinDomainsCloseAll (bl_fileBinDomains_t *dms);
+
+void
+bl_fileBinDomainsUnixSort (void *space, bl_fileBinDomains_t *dms, const char *fldstr, const char delim);
+
+
+void
+bl_fileBinDomainsMerge (void *space, bl_fileBinDomains_t *dms,
+ char *bname, Uint bnamelen,
+ char *suf, Uint suflen, char **header,
+ unsigned char remove);
+
+void
+bl_fileBinDomainsSortMerge(void *space, bl_fileBinDomains_t *dms,
+ char *bname, Uint bnamelen,
+ char *suf, Uint suflen,
+ const char *fldstr, const char delim,
+ unsigned char remove);
+
+Uint
+bl_fileBinsDomainsGetList(void *space, bl_fileBinDomains_t *domains,
+ char **domainnames[], Uint **domainsizes);
+
+void
+bl_fileBinDomainsDestruct (void *space, bl_fileBinDomains_t *dms);
+
+
+#endif
diff --git a/segemehl/libs/fileio.c b/segemehl/libs/fileio.c
new file mode 100644
index 0000000..240c70c
--- /dev/null
+++ b/segemehl/libs/fileio.c
@@ -0,0 +1,580 @@
+/*
+ * fileio.c
+ * functions to manipulate and read files
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: fileio.c 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/fileio.c $
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "stringutils.h"
+#include "basic-types.h"
+#include "fileio.h"
+
+#ifndef DIR_SEPARATOR
+#define DIR_SEPARATOR '/'
+#endif
+
+#if defined (_WIN32) || defined (__MSDOS__) || defined (__DJGPP__) || \
+ defined (__OS2__)
+#define HAVE_DOS_BASED_FILE_SYSTEM
+#ifndef DIR_SEPARATOR_2
+#define DIR_SEPARATOR_2 '\\'
+#endif
+#endif
+
+/* Define IS_DIR_SEPARATOR. */
+#ifndef DIR_SEPARATOR_2
+# define IS_DIR_SEPARATOR(ch) ((ch) == DIR_SEPARATOR)
+#else /* DIR_SEPARATOR_2 */
+# define IS_DIR_SEPARATOR(ch) \
+ (((ch) == DIR_SEPARATOR) || ((ch) == DIR_SEPARATOR_2))
+#endif /* DIR_SEPARATOR_2 */
+
+
+
+/*
+ * Put a header line in front of the contents of a file.
+ *
+ * A temporary file receives the header followed by a newline and then a
+ * copy of the file; finally the temporary file is renamed to filename.
+ * The process exits if one of the two files cannot be opened.
+ *
+ * filename  file to receive the header
+ * header    header text, written without its own trailing newline
+ */
+void
+bl_writeFileHeader(char *filename, char *header) {
+  char *tmpfilename, *buffer;
+  FILE *out, *in;
+  size_t buffersize= 1024, len;
+
+  tmpfilename = bl_getTempFile("headerwrite",11);
+
+  out = fopen(tmpfilename, "w");
+
+  if(!out) {
+    fprintf(stderr, "Couldnt open file %s for writing. Exit forced.", tmpfilename);
+    exit(-1);
+  }
+
+  fprintf(out, "%s\n", header);
+
+  in = fopen(filename, "r");
+
+  if(!in) {
+    fprintf(stderr, "Couldnt open file %s for reading. Exit forced.", filename);
+    exit(-1);
+  }
+
+  /* no memory space is available in this function */
+  buffer = ALLOCMEMORY(NULL, NULL, char, buffersize);
+
+  /* the stream stays open, so the copy simply continues after the header */
+  while((len = fread(buffer, 1, buffersize, in)) > 0) {
+    fwrite(buffer, 1, len, out);
+  }
+
+  fclose(out);
+  fclose(in);
+  rename(tmpfilename, filename);
+
+  free(buffer);
+  free(tmpfilename);
+  return;
+}
+
+
+/*
+ * Return a newly allocated copy of s in which every character that is not
+ * an ASCII letter or digit is replaced by '_'.
+ *
+ * Leading replaced characters are dropped and the copy ends right after
+ * the last letter or digit, so the result neither starts nor ends with
+ * '_'. A string without any letter or digit yields the empty string.
+ *
+ * s    input characters
+ * len  number of characters of s to look at
+ */
+char*
+bl_replacenonalphanum(char *s, Uint len) {
+  Uint i, u=0, end=0;
+  int ch;
+  char *new = ALLOCMEMORY(NULL, NULL, char, len+1);
+
+  for(i=0; i < len; i++) {
+    ch = s[i];
+    if(((Uint)((ch | 0x20) - 'a') < 26u)|| ((Uint)(ch-'0') < 10u)) {
+      new[u++] = ch;
+      /* the result ends behind the most recent letter or digit */
+      end = u;
+    } else {
+      if (u > 0)
+        new[u++] = '_';
+    }
+  }
+
+  new[end] ='\0';
+  return new;
+}
+
+
+
+/*------------------------------ bl_getTempFile ------------------------------
+ *
+ * @brief  create an empty temporary file with mkstemp and return its name
+ * @param  tmp     prefix of the file name (NUL terminated); only used if
+ *                 tmplen is greater than 0
+ * @param  tmplen  length of the prefix
+ * @return newly allocated file name "<tmp>XXXXXX" with the X's replaced
+ *         by mkstemp; the process exits if the file cannot be created or
+ *         closed
+ * @author Steve Hoffmann
+ *
+ */
+
+char *
+bl_getTempFile(char *tmp, Uint tmplen)
+{
+
+  int res;
+  size_t prefixlen = 0;
+  char *fname=NULL;
+
+  /* the whole prefix is printed below, so size the buffer by its real
+   * length in case tmplen understates it */
+  if(tmplen > 0) {
+    prefixlen = strlen(tmp);
+    if(prefixlen < tmplen) prefixlen = tmplen;
+  }
+
+  fname = ALLOCMEMORY(NULL, NULL, char, prefixlen+11);
+
+  if(tmplen > 0)
+    sprintf(fname, "%sXXXXXX", tmp);
+  else
+    sprintf(fname,"XXXXXX");
+
+  if ((res = mkstemp(fname)) == -1) {
+    fprintf(stderr, "Error in creating temporary file '%s'. Exit forced.\n",
+        fname);
+    exit(-1);
+  }
+  if (close(res) == -1){
+    fprintf(stderr, "Error in closing temporary file '%s'. Exit forced.\n",
+        fname);
+    exit(-1);
+  }
+  return fname;
+}
+
+/*
+ * Merge already sorted files with the unix sort tool ("sort -m") into
+ * outfile.
+ *
+ * space        handed to the allocation helpers
+ * filenames    files to merge
+ * nooffiles    number of entries in filenames
+ * fieldstring  inserted verbatim into the command line
+ * delim        field delimiter, passed as -t '<delim>'
+ * outfile      file receiving the output via shell redirection
+ *
+ * Returns the result of system(), or -1 if there is nothing to merge.
+ *
+ * NOTE(review): file names are pasted unquoted into a shell command;
+ * names with blanks or shell meta characters will break it.
+ */
+int
+bl_UnixSortMerge(void *space, char **filenames, Uint nooffiles,
+    const char *fieldstring, const char delim, char *outfile) {
+  int ret;
+  Uint i;
+  size_t pos = 0, namelen, cmdlen;
+  char *prg = "LC_COLLATE=C sort";
+  char *cmd;
+  char *filenamestring = NULL;
+
+  if(nooffiles == 0) {
+    return -1;
+  }
+
+  /* join the names, each followed by a blank */
+  for(i = 0; i < nooffiles; i++) {
+    namelen = strlen(filenames[i]);
+    filenamestring =
+      ALLOCMEMORY(space, filenamestring, char, pos+namelen+2);
+
+    memmove(&filenamestring[pos], filenames[i], namelen);
+    pos += namelen;
+    filenamestring[pos++] = ' ';
+    filenamestring[pos] = 0;
+  }
+
+  /* the format adds 15 literal characters plus the terminator */
+  cmdlen = strlen(prg) + strlen(fieldstring) + pos + strlen(outfile) + 16;
+  cmd = ALLOCMEMORY(space, NULL, char, cmdlen);
+  snprintf(cmd, cmdlen, "%s -m -t '%c' %s %s > %s", prg, delim, fieldstring,
+      filenamestring, outfile);
+  ret = system(cmd);
+
+  FREEMEMORY(space, cmd);
+  FREEMEMORY(space, filenamestring);
+
+  return ret;
+}
+
+/*
+ * Remove a file by running "rm -f <filename>" through system().
+ *
+ * space     not used
+ * filename  file to remove; pasted unquoted into the shell command
+ *
+ * Returns the result of system(), or -1 if the command could not be
+ * allocated.
+ */
+int
+bl_rm(void *space, char *filename) {
+  int ret;
+  char *prg = "rm";
+  char *cmd;
+
+  cmd = calloc(strlen(prg) + strlen(filename) + 10, 1);
+  if(cmd == NULL) {
+    return -1;
+  }
+
+  sprintf(cmd, "%s -f %s", prg, filename);
+  ret = system(cmd);
+
+  free(cmd);
+  return ret;
+}
+
+/*
+ * Sort a file in place with the unix sort tool.
+ *
+ * The sorted output is written to a temporary file first; it replaces
+ * filename only if sort succeeded. Otherwise the temporary file is removed
+ * and filename is left as it was.
+ *
+ * space        not used
+ * filename     file to sort; pasted unquoted into the shell command
+ * fieldstring  inserted verbatim into the command line
+ * delim        field delimiter, passed as -t '<delim>'
+ *
+ * Returns 0 on success and a non-zero value otherwise.
+ */
+int
+bl_UnixSort(void *space, char *filename, const char *fieldstring, const char delim) {
+  int ret;
+  char *prg = "LC_COLLATE=C sort";
+  char *cmd;
+  char *tempfilename;
+
+  tempfilename = bl_getTempFile("sort", 4);
+
+  cmd = calloc(strlen(prg) + strlen(fieldstring)
+      + strlen(filename) + strlen(tempfilename) + 15, 1);
+
+  if(cmd == NULL) {
+    unlink(tempfilename);
+    free(tempfilename);
+    return -1;
+  }
+
+  sprintf(cmd, "%s -o %s -t '%c' %s %s", prg, tempfilename, delim, fieldstring, filename);
+  ret = system(cmd);
+
+  if(ret == 0) {
+    /* rename replaces the old file, no separate removal is needed */
+    if(rename(tempfilename, filename) != 0) {
+      ret = -1;
+    }
+  } else {
+    /* keep the unsorted input rather than a broken result */
+    unlink(tempfilename);
+  }
+
+  free(cmd);
+  free(tempfilename);
+  return ret;
+}
+
+/*
+ * Cut a path at its last '/'.
+ *
+ * The separator is overwritten with '\0', so the buffer behind filename
+ * is modified and afterwards holds the directory part. The return value
+ * points at the position of the former separator (i.e. at an empty
+ * string), or is NULL if the path contains no '/'; in that case the
+ * buffer is left untouched.
+ */
+char* dirname(const char *filename) {
+  char *sep = strrchr(filename, (int)'/');
+
+  if(sep == NULL) {
+    return NULL;
+  }
+
+  if(*sep != '\0') {
+    *sep = '\0';
+  }
+
+  return sep;
+}
+
+/*
+ * Return the length of a file name without its last suffix: the index of
+ * the last '.' found at index 1 or later, or the full length of the name
+ * if there is none. A '.' at index 0 is not looked at.
+ */
+int
+bl_fileprefixlen(char *filename) {
+  Uint total, cut, k;
+
+  total = strlen(filename);
+  cut = total;
+
+  /* walk backwards; the first dot met is the last one in the name */
+  for(k=total; k > 1; k--) {
+    if(filename[k-1] == '.') {
+      cut = k-1;
+      break;
+    }
+  }
+  return cut;
+}
+
+/*
+ * Return a pointer to the last component of a path: the character behind
+ * the last directory separator, or the start of the name if it contains
+ * no separator. The result points into name, nothing is copied.
+ */
+char *
+bl_basename (const char *name)
+{
+  const char *base;
+  const char *cursor;
+
+#if defined (HAVE_DOS_BASED_FILE_SYSTEM)
+  /* Skip over the disk name in MSDOS pathnames. */
+  if (ISALPHA (name[0]) && name[1] == ':')
+    name += 2;
+#endif
+
+  base = name;
+  cursor = name;
+  while (*cursor)
+  {
+    if (IS_DIR_SEPARATOR (*cursor))
+    {
+      base = cursor + 1;
+    }
+    cursor++;
+  }
+  return (char *) base;
+}
+
+
+/*
+ * Replace occurrences of oldchar by newchar directly in a file.
+ *
+ * The file is scanned from the start; scanning stops as soon as the
+ * number of replacements done so far equals nreplace. This is checked
+ * after every character read.
+ * The process exits if the file cannot be opened.
+ */
+void
+bl_fnreplace(char *filename, char oldchar, char newchar, Uint nreplace) {
+  int ch;
+  Uint n=0;
+  FILE *fp;
+
+  fp = fopen(filename, "rb+");
+  if(!fp) {
+    fprintf(stderr,"Couldnt open file '%s'. Exit forced!\n", filename);
+    exit(-1);
+  }
+
+  while((ch = fgetc(fp)) != EOF) {
+    /* fgetc yields the byte as unsigned char, compare like with like */
+    if(ch == (unsigned char) oldchar) {
+      fseek(fp, -1, SEEK_CUR);
+      fputc(newchar, fp);
+      /* a positioning call is required before reading again after a
+       * write on an update stream */
+      fseek(fp, 0, SEEK_CUR);
+      n++;
+    }
+    if(nreplace == n) break;
+  }
+
+
+  fclose(fp);
+  return;
+}
+
+/*
+ * Replace characters directly in a file according to a translation table.
+ *
+ * Every character equal to oldchars[i] is overwritten by newchars[i]
+ * (first match wins). The first occurrence of stop is overwritten by a
+ * blank and ends the scan.
+ * The process exits if the file cannot be opened.
+ *
+ * len  number of entries in oldchars and newchars
+ */
+void
+bl_freplacearr(char *filename, char* oldchars, char* newchars, Uint len, char stop) {
+  int ch;
+  Uint i;
+  FILE *fp;
+
+  fp = fopen(filename, "rb+");
+  if(!fp) {
+    fprintf(stderr,"Couldnt open file '%s'. Exit forced!\n", filename);
+    exit(-1);
+  }
+
+  while((ch = fgetc(fp)) != EOF) {
+    for(i=0; i < len; i++) {
+      /* fgetc yields the byte as unsigned char, compare like with like */
+      if(ch == (unsigned char) oldchars[i]) {
+        fseek(fp, -1, SEEK_CUR);
+        fputc(newchars[i], fp);
+        /* a positioning call is required before reading again after a
+         * write on an update stream */
+        fseek(fp, 0, SEEK_CUR);
+        break;
+      }
+    }
+
+    if(ch == (unsigned char) stop) {
+      fseek(fp, -1, SEEK_CUR);
+      fputc(' ', fp);
+      break;
+    }
+  }
+
+  fclose(fp);
+  return;
+}
+
+/*
+ * Replace every oldchar by newchar directly in a file, up to the first
+ * occurrence of stop. That stop character is overwritten by a newline
+ * and ends the scan.
+ * The process exits if the file cannot be opened.
+ */
+void
+bl_freplace(char *filename, char oldchar, char newchar, char stop) {
+  int ch;
+  FILE *fp;
+
+  fp = fopen(filename, "rb+");
+  if(!fp) {
+    fprintf(stderr,"Couldnt open file '%s'. Exit forced!\n", filename);
+    exit(-1);
+  }
+
+  while((ch = fgetc(fp)) != EOF) {
+    /* fgetc yields the byte as unsigned char, compare like with like */
+    if(ch == (unsigned char) oldchar) {
+      fseek(fp, -1, SEEK_CUR);
+      fputc(newchar, fp);
+      /* a positioning call is required before reading again after a
+       * write on an update stream */
+      fseek(fp, 0, SEEK_CUR);
+    }
+    if(ch == (unsigned char) stop) {
+      fseek(fp, -1, SEEK_CUR);
+      fputc('\n', fp);
+      break;
+    }
+  }
+
+  fclose(fp);
+  return;
+}
+
+/*
+ * Overwrite the beginning of a file, up to but not including the first
+ * occurrence of stop, with the characters of str repeated cyclically.
+ * The process exits if the file cannot be opened.
+ *
+ * str  replacement characters
+ * len  number of characters in str; nothing is changed if it is 0
+ */
+void
+bl_freplacestr(char *filename, char *str, Uint len, char stop){
+  Uint i = 0;
+  int ch;
+  FILE *fp;
+
+  fp = fopen(filename, "rb+");
+  if (!fp) {
+    fprintf(stderr, "Couldn't open file '%s'. Exit forced.\n", filename);
+    exit(EXIT_FAILURE);
+  }
+
+  /* len is used as a modulus below */
+  if (len == 0) {
+    fclose(fp);
+    return;
+  }
+
+  /* ch has to be an int, otherwise EOF cannot be told from a data byte */
+  while((ch = fgetc(fp)) != EOF){
+    if (ch == (unsigned char) stop){
+      break;
+    }
+    fseek(fp, -1, SEEK_CUR);
+    fputc(str[i%len], fp);
+    /* a positioning call is required before reading again after a
+     * write on an update stream */
+    fseek(fp, 0, SEEK_CUR);
+    i++;
+  }
+
+  fclose(fp);
+  return;
+}
+
+/*
+ * Read one line of arbitrary length from a stream.
+ *
+ * space  handed to the allocation helpers
+ * fp     stream to read from
+ * str    receives a newly allocated, NUL terminated buffer holding the
+ *        line without its newline; the caller has to release it
+ *
+ * Returns the length of the line, or EOF if the end of the stream is hit
+ * before a newline. In that case *str is left untouched and characters
+ * read since the last newline are dropped, i.e. a final line without
+ * newline is not returned.
+ */
+int
+bl_fgets(void *space, FILE *fp, char **str) {
+  int ch;
+  char *buffer;
+  Uint buffersize = 100;
+  Uint len = 0;
+
+  buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+
+  /* ch has to be an int, otherwise EOF cannot be told from a data byte */
+  while((ch=getc(fp)) != EOF && ch != '\n') {
+    if(len == buffersize - 1) {
+      buffersize = 2 * buffersize + 1;
+      buffer = ALLOCMEMORY(space, buffer, char, buffersize);
+    }
+    buffer[len++] = (char) ch;
+  }
+
+  if(ch == EOF) {
+    /* nothing is handed out, so the buffer must not be left behind */
+    FREEMEMORY(space, buffer);
+    return EOF;
+  }
+
+  buffer[len] = '\0';
+  *str = buffer;
+
+  return len;
+}
+
+
+/*
+ * readfile
+ * Reads the complete file into one buffer obtained through ALLOCMEMORY.
+ *
+ * @param space    passed through to ALLOCMEMORY
+ * @param filename file to read (opened in text mode)
+ * @param strlen   receives the number of bytes read
+ * @return the NUL-terminated content; ownership passes to the caller.
+ *         The program exits if the file cannot be opened.
+ */
+char*
+readfile(void* space, char* filename, Uint* strlen) {
+
+  int ch;   /* int, not char: must be able to hold EOF next to every byte */
+  char *buffer;
+  FILE *fp;
+  Uint buffersize = MAXBUFFERSIZE;
+  Uint len=0;
+
+  fp = fopen(filename, "r");
+  if (fp == NULL){
+    fprintf(stderr, "Opening of file %s failed. Exit forced.\n", filename);
+    exit(EXIT_FAILURE);
+  }
+
+  buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+
+  while((ch=getc(fp)) != EOF) {
+    /* keep one byte in reserve for the terminator */
+    if(len == buffersize-1) {
+      buffersize = 2*buffersize+1;
+      buffer = ALLOCMEMORY(space, buffer, char, buffersize);
+    }
+    buffer[len++]=(char)ch;
+  }
+  buffer[len]='\0';
+  fclose(fp);
+
+  *strlen = len;
+  return buffer;
+}
+
+
+
+/*
+ * readcsv
+ * Loads filename with readfile, tokenises the content into lines
+ * ("\n", or "\r\n" when _CRLF_ is defined) and tokenises every line again
+ * with delim, both through tokensToStringset.
+ *
+ * @param linecount receives the number of line tokens
+ * @return array of *linecount stringsets, one per line; the array and the
+ *         stringsets belong to the caller
+ */
+stringset_t **
+readcsv(void *space,
+    char* filename,
+    char *delim,
+    Uint *linecount) {
+
+  Uint row, nbytes;
+  char *text;
+  stringset_t *rows, **table;
+
+  text = readfile(space, filename, &nbytes);
+#ifndef _CRLF_
+  rows = tokensToStringset(space, "\n", text, nbytes);
+#else
+  rows = tokensToStringset(space, "\r\n", text, nbytes);
+#endif
+  /* the raw text is released as soon as the line set exists */
+  FREEMEMORY(space, text);
+  *linecount=rows->noofstrings;
+  table=ALLOCMEMORY(space, NULL, stringset_t *, rows->noofstrings);
+
+  row = 0;
+  while(row < rows->noofstrings) {
+    table[row] = tokensToStringset(space, delim, rows->strings[row].str, rows->strings[row].len);
+    row++;
+  }
+
+  destructStringset(space, rows);
+  return table;
+}
+
+/*
+ * readX
+ * Reads one number per line from filename: the first blank- or
+ * tab-separated field of every non-empty line is converted with atof and
+ * kept unless it is infinite.
+ *
+ * @param nvals receives the number of values kept
+ * @return array of *nvals doubles, owned by the caller
+ */
+double*
+readX(void *space, char *filename, Uint *nvals) {
+
+  double *values, parsed;
+  Uint nrows, row, kept = 0;
+  stringset_t **table;
+
+  table = readcsv(space, filename, "\t ", &nrows);
+  values = ALLOCMEMORY(space, NULL, double, nrows);
+
+  for(row=0; row < nrows; row++) {
+    /* lines without any field contribute nothing */
+    if(table[row]->noofstrings != 0) {
+      parsed = atof(table[row]->strings[0].str);
+      if(!isinf(parsed)) {
+        values[kept++] = parsed;
+      }
+    }
+    destructStringset(space, table[row]);
+  }
+
+  FREEMEMORY(space, table);
+  /* shrink the array to the values actually kept */
+  values = ALLOCMEMORY(space, values, double, kept);
+  *nvals = kept;
+  return values;
+}
+
+/*
+ * writeY
+ * Writes the lines "<i+xoff>\t<Y[i]>" for i = yoff .. len-1 to filename,
+ * replacing any previous content. Exits if the file cannot be opened.
+ * The index is printed with %u (Uint is assumed to be unsigned int -
+ * confirm in basic-types.h).
+ */
+void
+writeY(char *filename, double *Y, Uint len, Uint xoff, Uint yoff) {
+  FILE *file;
+  Uint i;
+
+  file = fopen(filename, "w");
+  if (file == NULL) {
+    fprintf(stderr, "couldn't open %s - exit forced", filename);
+    exit(-1);
+  }
+
+  for(i=yoff; i < len; i++) {
+    fprintf(file,"%u\t%f\n", i+xoff, Y[i]);
+  }
+
+  fclose(file);
+  return;
+}
+
+/*
+ * writeYUint
+ * Writes the lines "<i+xoff>\t<Y[i]>" for i = yoff .. len-1 to filename,
+ * replacing any previous content. Exits if the file cannot be opened.
+ * Both columns are printed with %u (Uint is assumed to be unsigned int -
+ * confirm in basic-types.h).
+ */
+void
+writeYUint(char *filename, Uint *Y, Uint len, Uint xoff, Uint yoff) {
+  FILE *file;
+  Uint i;
+
+  file = fopen(filename, "w");
+  if (file == NULL) {
+    fprintf(stderr, "couldn't open %s - exit forced", filename);
+    exit(-1);
+  }
+
+  for(i=yoff; i < len; i++) {
+    fprintf(file,"%u\t%u\n", i+xoff, Y[i]);
+  }
+
+  fclose(file);
+  return;
+}
+
+/*
+ * writeYUintNorm
+ * Writes the lines "<i>\t<Y[i]/sum>" for i = off .. len-1 to filename,
+ * where sum is the total of all len entries of Y (including those before
+ * off). Exits if the file cannot be opened.
+ * The total is accumulated as a double so that it cannot wrap around the
+ * range of Uint; an all-zero Y still yields non-finite quotients.
+ */
+void
+writeYUintNorm(char *filename, Uint *Y, Uint len, Uint off) {
+  FILE *file;
+  Uint i;
+  double norm = 0;
+
+  file = fopen(filename, "w");
+  if (file == NULL) {
+    fprintf(stderr, "couldn't open %s - exit forced", filename);
+    exit(-1);
+  }
+  for(i=0; i < len; i++) {
+    norm += Y[i];
+  }
+
+  for(i=off; i < len; i++) {
+    fprintf(file,"%u\t%f\n", i, (double)Y[i]/norm);
+  }
+
+  fclose(file);
+  return;
+}
+
+
+/*
+ * writeXYUint
+ * Writes the lines "<i>\t<X[i]>\t<Y[i]>" for i = 0 .. len-1 to filename,
+ * replacing any previous content. Exits if the file cannot be opened.
+ * All columns are printed with %u (Uint is assumed to be unsigned int -
+ * confirm in basic-types.h).
+ */
+void
+writeXYUint(char *filename, Uint *X, Uint *Y, Uint len) {
+  FILE *file;
+  Uint i;
+
+  file = fopen(filename, "w");
+  if (file == NULL) {
+    fprintf(stderr, "couldn't open %s - exit forced", filename);
+    exit(-1);
+  }
+
+  for(i=0; i < len; i++) {
+    fprintf(file,"%u\t%u\t%u\n", i, X[i], Y[i]);
+  }
+
+  fclose(file);
+}
+
+/*
+ * writeXYZ
+ * Writes len lines "<X[i]>\t<Y[i]>\t<Z[i]>" to filename, replacing any
+ * previous content. Exits if the file cannot be opened.
+ */
+void
+writeXYZ(char *filename, double *X, double *Y, double *Z, Uint len) {
+  Uint row = 0;
+  FILE *out = fopen(filename, "w");
+
+  if (!out) {
+    fprintf(stderr, "couldn't open %s - exit forced", filename);
+    exit(-1);
+  }
+
+  while(row < len) {
+    fprintf(out,"%f\t%f\t%f\n", X[row], Y[row], Z[row]);
+    row++;
+  }
+
+  fclose(out);
+}
+
diff --git a/segemehl/libs/fileio.h b/segemehl/libs/fileio.h
new file mode 100644
index 0000000..7b1a379
--- /dev/null
+++ b/segemehl/libs/fileio.h
@@ -0,0 +1,47 @@
+#ifndef FILEIO_H
+#define FILEIO_H
+
+/*
+ * fileio.h
+ * declarations for file io
+ *
+ * @author Steve Hoffmann
+ * @date Sat 25 Nov 2006
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: fileio.h 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/fileio.h $
+ */
+
+#ifndef ALLOCMEMORY
+ #include "memory.h"
+#endif
+#include <math.h>
+#include "stringutils.h"
+
+/*
+ * NOTE(review): FILE is used below although <stdio.h> is not included by
+ * this header; presumably it arrives through memory.h or stringutils.h -
+ * confirm.
+ */
+char * bl_getTempFile(char *tmp, Uint tmplen);
+int bl_UnixSort(void *space, char *filename, const char *fieldstring, const char delim);
+/* whole file as one NUL-terminated buffer; byte count stored via the Uint* */
+char* readfile(void *, char *, Uint*);
+/* one stringset per line of the file, tokenised with the delimiter string;
+ * the number of lines is stored via the Uint* */
+stringset_t **readcsv(void *, char *, char*, Uint *);
+/* write "index<TAB>value" lines; the last two arguments are an offset added
+ * to the printed index and the first array index written */
+void writeY(char *, double *, Uint, Uint, Uint);
+/* write "index<TAB>X<TAB>Y" lines */
+void writeXYUint(char *filename, Uint *X, Uint *Y, Uint len);
+/* read one line (newline excluded) into a new buffer stored in *str;
+ * returns its length or EOF */
+int bl_fgets(void *space, FILE *fp, char **str);
+char * bl_basename (const char *name);
+char* bl_replacealphanum(char *s, Uint len);
+char* bl_replacenonalphanum(char *s, Uint len);
+int bl_fileprefixlen(char *filename);
+int bl_UnixSortMerge(void *space, char **filenames, Uint nooffiles, const char *fieldstring, const char delim, char *outfile);
+void bl_writeFileHeader(char *filename, char *header) ;
+/* in-place byte replacement in a file, scanning from its start until the
+ * stop character or end of file */
+void bl_freplace(char *filename, char oldchar, char newchar, char stop);
+void bl_freplacearr(char *filename, char* oldchars, char *newchars, Uint len, char stop);
+void bl_freplacestr(char *filename, char* str, Uint len, char stop);
+/* write "X<TAB>Y<TAB>Z" lines */
+void writeXYZ(char *filename, double *X, double *Y, double *Z, Uint len);
+int bl_rm(void *space, char *filename);
+void writeYUint(char *filename, Uint *Y, Uint len, Uint xoff, Uint yoff);
+/* like writeYUint, but each value is divided by the sum over the whole array */
+void writeYUintNorm(char *filename, Uint *Y, Uint len, Uint yoff);
+/* first numeric field of every line of the file; count stored in *nvals */
+double* readX(void *space, char *filename, Uint *nvals);
+#endif
diff --git a/segemehl/libs/fqueue.c b/segemehl/libs/fqueue.c
new file mode 100644
index 0000000..de7fba5
--- /dev/null
+++ b/segemehl/libs/fqueue.c
@@ -0,0 +1,183 @@
+
+/*
+ * fqueue.c
+ * implementation for a flexible circular queue that
+ * takes care of memory management
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 07/07/08 09:31:37 CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "memory.h"
+#include "fqueue.h"
+
+/* returns 1 if the queue currently holds no element, 0 otherwise */
+unsigned char
+fqueueisempty (fqueue *queue) {
+  if (queue->noofelems == 0) {
+    return (unsigned char) 1;
+  }
+  return (unsigned char) 0;
+}
+
+/*
+ * initfqueue
+ * Prepares an empty queue with room for alloc elements of size bytes each.
+ *
+ * @return 0 on success, -1 if the element buffer could not be allocated
+ */
+int
+initfqueue(fqueue *queue, size_t size, int alloc) {
+
+  queue->elems = malloc(size*alloc);
+  if (!queue->elems) {
+    return -1;
+  }
+
+  queue->dequeueidx = 0;
+  queue->enqueueidx = 0;
+  queue->noofelems = 0;
+  queue->alloc = alloc;
+  queue->size = size;
+
+  return 0;
+}
+
+
+/*
+ * resizefqueue
+ * Doubles the capacity of the queue. If the stored elements wrap around
+ * the end of the old buffer, the part from the front index to the old end
+ * is moved up by the old capacity so that the elements are contiguous
+ * (modulo the new capacity) again.
+ *
+ * @return 0 on success, -1 if the buffer could not be enlarged; in that
+ *         case the queue is left unchanged and remains usable
+ */
+int
+resizefqueue(fqueue *queue) {
+  char *q, *src, *dst;
+  void *grown;
+
+  grown = realloc(queue->elems, queue->size*queue->alloc*2);
+  if (grown == NULL) return -1;
+  queue->elems = grown;
+
+  /*
+   * equal indices mean "full" only when elements are stored; an empty
+   * queue has nothing to move and must keep its indices
+   */
+  if(queue->noofelems > 0 && queue->dequeueidx >= queue->enqueueidx) {
+    q=(char*) queue->elems;
+    src = q+(queue->dequeueidx*queue->size);
+    dst = q+((queue->dequeueidx+queue->alloc)*queue->size);
+    memmove(dst, src, (queue->alloc-queue->dequeueidx)*queue->size);
+    queue->dequeueidx += queue->alloc;
+  }
+
+  queue->alloc = queue->alloc*2;
+  return 0;
+}
+
+
+/*
+ * fenqueue
+ * Appends a copy of the element at elem (queue->size bytes) to the back of
+ * the queue, enlarging the queue first when it is full.
+ *
+ * @return 0 on success, -1 if enlarging failed, -2 if the copy reported
+ *         a null destination
+ */
+int
+fenqueue(fqueue *queue, void *elem) {
+  char *slot;
+
+  if(queue->noofelems == queue->alloc && resizefqueue(queue) != 0) {
+    return -1;
+  }
+
+  slot = (char*)queue->elems + (queue->enqueueidx*queue->size);
+  if (memmove(slot, elem, queue->size) == NULL) return -2;
+
+  queue->noofelems++;
+  /* advance the write position, wrapping at the end of the buffer */
+  queue->enqueueidx = (queue->enqueueidx == queue->alloc-1) ? 0 : queue->enqueueidx+1;
+
+  return 0;
+}
+
+
+/*
+ * fqueuejump
+ * Inserts a copy of the element at elem (queue->size bytes) at the FRONT
+ * of the queue, so that it is the next one returned by fdequeue. The queue
+ * is enlarged first when it is full.
+ *
+ * @return 0 on success, -1 if enlarging failed
+ */
+int
+fqueuejump(fqueue *queue, void *elem) {
+  int r;
+  char *ptr;
+
+  if(queue->noofelems == queue->alloc) {
+    r = resizefqueue(queue);
+    if (r != 0) return -1;
+  }
+
+  /*
+   * at least one slot is free here and the free slots end right before the
+   * front, so the slot preceding dequeueidx can be claimed. When this
+   * insertion fills the queue, front and back index coincide legitimately,
+   * hence the check is on the element count and not on the indices.
+   */
+  assert(queue->noofelems < queue->alloc);
+
+  if(queue->dequeueidx == 0) {
+    queue->dequeueidx = queue->alloc-1;
+  } else {
+    queue->dequeueidx--;
+  }
+
+  ptr = (char*)queue->elems;
+  ptr = ptr+(queue->dequeueidx*queue->size);
+  memmove(ptr, elem, queue->size);
+  queue->noofelems++;
+
+  return 0;
+}
+
+
+
+/*
+ * fdequeue
+ * Removes the front element.
+ *
+ * @return pointer to the removed element inside the queue's own buffer
+ *         (nothing is copied), or NULL if the queue is empty
+ */
+void*
+fdequeue (fqueue *queue) {
+  char *front;
+
+  if(fqueueisempty(queue)) return NULL;
+
+  front = (char*) queue->elems + (queue->dequeueidx*queue->size);
+  queue->noofelems--;
+
+  /* advance the read position, wrapping at the end of the buffer */
+  queue->dequeueidx = (queue->dequeueidx == queue->alloc-1) ? 0 : queue->dequeueidx+1;
+
+  return front;
+}
+
+
+/*
+ * fqueuefront
+ * Returns a pointer to the front element inside the queue's own buffer
+ * without removing it, or NULL if the queue is empty.
+ */
+void*
+fqueuefront(fqueue *queue) {
+  char *base;
+
+  if(fqueueisempty(queue)) return NULL;
+
+  base = (char*) queue->elems;
+  return base + (queue->dequeueidx*queue->size);
+}
+
+/*
+ * fqueueget
+ * Returns a pointer to the k-th element counted from the front (k = 0 is
+ * the front itself) inside the queue's own buffer, or NULL if the queue
+ * holds k or fewer elements.
+ */
+void*
+fqueueget(fqueue *queue, unsigned int k) {
+  unsigned int idx;
+
+  if(fqueueisempty(queue) || k >= queue->noofelems) {
+    return NULL;
+  }
+
+  /* k steps behind the front, wrapping at the end of the buffer */
+  idx = ((unsigned int) queue->dequeueidx + k) % (unsigned int) queue->alloc;
+
+  return (char*) queue->elems + (idx*queue->size);
+}
+
+/* releases the element buffer; the fqueue structure itself is not freed */
+void
+wrapfqueue(fqueue *queue) {
+  void *buffer = queue->elems;
+
+  free(buffer);
+}
+
+
+
+
+
diff --git a/segemehl/libs/fqueue.h b/segemehl/libs/fqueue.h
new file mode 100644
index 0000000..22f7ff1
--- /dev/null
+++ b/segemehl/libs/fqueue.h
@@ -0,0 +1,35 @@
+#ifndef FQUEUE_H
+#define FQUEUE_H
+
+/*
+ *
+ * fqueue.h
+ * declaration for flexible queues
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 07/07/08 09:30:29 CEST
+ *
+ *
+ */
+
+/* circular queue of fixed-size elements stored by value in one buffer */
+typedef struct {
+
+ size_t size; /* bytes per element */
+ int alloc; /* capacity of elems, counted in elements */
+ int enqueueidx; /* slot the next enqueued element is written to */
+ int dequeueidx; /* slot of the front element */
+ int noofelems; /* number of elements currently stored */
+ void *elems; /* element buffer, released by wrapfqueue */
+} fqueue;
+
+/* release the element buffer */
+void wrapfqueue(fqueue *queue);
+/* remove the front element; pointer into the buffer, NULL if empty */
+void* fdequeue (fqueue *queue);
+/* append a copy of *elem at the back; 0 on success */
+int fenqueue(fqueue *queue, void *elem);
+/* double the capacity; 0 on success */
+int resizefqueue(fqueue *queue);
+/* set up an empty queue for alloc elements of size bytes; 0 on success */
+int initfqueue(fqueue *queue, size_t size, int alloc);
+/* 1 if no element is stored, 0 otherwise */
+unsigned char fqueueisempty (fqueue *queue);
+/* front element without removing it; NULL if empty */
+void *fqueuefront(fqueue *queue);
+/* i-th element counted from the front; NULL if there is none */
+void* fqueueget(fqueue *queue, unsigned int i);
+/* insert a copy of *elem at the front; 0 on success */
+int fqueuejump(fqueue *queue, void *elem);
+#endif
diff --git a/segemehl/libs/fstack.c b/segemehl/libs/fstack.c
new file mode 100644
index 0000000..8da6f8d
--- /dev/null
+++ b/segemehl/libs/fstack.c
@@ -0,0 +1,92 @@
+
+/*
+ * fstack.c
+ * flexible stack implementation that takes care
+ * of all the allocation work
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 07/07/08 08:35:38 CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "memory.h"
+#include "fstack.h"
+
+/*
+ * initfstack
+ * Prepares an empty stack with room for alloc elements of size bytes each.
+ *
+ * @return 0 on success, -1 if the element buffer could not be allocated
+ */
+int
+initfstack(fstack* stack, size_t size, int alloc) {
+
+  stack->elems = malloc(size*alloc);
+  if (!stack->elems) {
+    return -1;
+  }
+
+  /* top == -1 marks the empty stack */
+  stack->top = -1;
+  stack->alloc = alloc;
+  stack->size = size;
+
+  return 0;
+}
+
+/* returns 1 if the stack holds no element (top below zero), 0 otherwise */
+unsigned char fstackisempty(fstack *stack){
+  if (stack->top < 0) {
+    return (unsigned char) 1;
+  }
+  return (unsigned char) 0;
+}
+
+/*
+ * fstackpush
+ * Pushes a copy of the element at elem (stack->size bytes). When the
+ * buffer is full it is enlarged by FSTACKINC elements first; the program
+ * exits if that fails.
+ *
+ * @return always 0
+ */
+int
+fstackpush(fstack *stack, void* elem) {
+  char *ptr;
+  void *grown;
+
+  if(stack->top >= stack->alloc-1) {
+    /* fstack has no per-stack increment member; FSTACKINC is the step */
+    grown = realloc(stack->elems, stack->size*(stack->alloc+FSTACKINC));
+    if (grown == NULL) exit(-1);
+    stack->elems = grown;
+    stack->alloc += FSTACKINC;
+  }
+
+  stack->top++;
+  ptr = (char*) stack->elems;
+  memmove((ptr+(stack->top*stack->size)), elem, stack->size);
+
+  return 0;
+}
+
+/*
+ * fstackpop
+ * Removes the top element.
+ *
+ * @return pointer to the removed element inside the stack's own buffer
+ *         (nothing is copied), or NULL if the stack is empty or the
+ *         buffer could not be shrunk
+ */
+void*
+fstackpop(fstack *stack) {
+  void *elem;
+  char *ptr;
+
+  if(fstackisempty(stack)) return NULL;
+
+  /*
+   * cleanup: once more than FSTACKINC slots are unused, shrink the buffer
+   * to exactly the elements still stored (the one being popped included).
+   * fstack has no per-stack increment member; FSTACKINC is the step.
+   */
+  if(stack->top < stack->alloc-FSTACKINC) {
+    ptr = realloc(stack->elems, stack->size*(stack->top+1));
+    if(ptr == NULL) return NULL;
+    stack->elems = ptr;
+    stack->alloc = stack->top+1;
+  }
+
+  ptr = (char*) stack->elems;
+  elem= (ptr+(stack->size*stack->top));
+
+  stack->top--;
+  return elem;
+}
+
+/*
+ * fstacktop
+ * Returns a pointer to the top element inside the stack's own buffer
+ * without removing it, or NULL if the stack is empty.
+ */
+void*
+fstacktop(fstack *stack) {
+  char *base;
+
+  if(fstackisempty(stack)) {
+    return NULL;
+  }
+
+  base = (char*) stack->elems;
+  return base + (stack->size*stack->top);
+}
+
+/*
+ * releases the element buffer; the fstack structure itself is not freed
+ * and the space argument is not used
+ */
+void
+destructfstack(void *space, fstack *stack){
+  void *buffer = stack->elems;
+
+  free(buffer);
+}
+
diff --git a/segemehl/libs/fstack.h b/segemehl/libs/fstack.h
new file mode 100644
index 0000000..9f24da1
--- /dev/null
+++ b/segemehl/libs/fstack.h
@@ -0,0 +1,36 @@
+#ifndef FSTACK_H
+#define FSTACK_H
+
+/*
+ *
+ * fstack.h
+ * flexible stack declarations
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 07/07/08 08:36:50 CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#define FSTACKINC 1000
+
+/* stack of fixed-size elements stored by value in one growable buffer */
+typedef struct {
+ size_t size; /* bytes per element */
+ int alloc; /* capacity of elems, counted in elements */
+ int top; /* index of the top element, -1 when the stack is empty */
+ void *elems; /* element buffer */
+} fstack;
+
+
+/* set up an empty stack for alloc elements of size bytes; 0 on success */
+int initfstack(fstack *stack, size_t size, int alloc);
+/* 1 if no element is stored, 0 otherwise */
+unsigned char fstackisempty(fstack *stack);
+/* push a copy of *elem, growing the buffer if required */
+int fstackpush(fstack *stack, void* elem);
+/* remove the top element; pointer into the buffer, NULL if empty */
+void* fstackpop(fstack *stack);
+/* top element without removing it; NULL if empty */
+void* fstacktop(fstack *stack);
+/* NOTE(review): no definition with this name and signature is visible in
+ * fstack.c; kept for callers that may rely on the declaration */
+void destructstack(fstack *stack);
+/* release the element buffer (this is the function fstack.c defines) */
+void destructfstack(void *space, fstack *stack);
+
+#endif
+
diff --git a/segemehl/libs/hash.c b/segemehl/libs/hash.c
new file mode 100644
index 0000000..421897a
--- /dev/null
+++ b/segemehl/libs/hash.c
@@ -0,0 +1,346 @@
+/**
+ * hash.c
+ * implementation of simple hashing
+ *
+ * @author Christian Otto
+ * @email christian at bioinf.uni-leipzig.de
+ * @date Wed Nov 24 15:48:15 CET 2010
+ *
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 284 $
+ * Author: $Author: steve $
+ * Date: $Date: 2011-05-03 07:41:30 -0400 (Tue, 03 May 2011) $
+ * Id: $Id: hash.c 284 2011-05-03 11:41:30Z steve $
+ * Url: $URL: http://www2.bioinf.uni-leipzig.de/svn5/segemehl/libs/hash.c $
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "basic-types.h"
+#include "debug.h"
+#include "info.h"
+#include "biofiles.h"
+#include "hash.h"
+
+#ifdef HASHING
+
+/*
+ * DJBHash
+ * Hash of the NUL-terminated string msg: starting from 5381, every
+ * character updates the value as hash * 33 + character (modulo the range
+ * of hash_t).
+ */
+hash_t DJBHash(char* msg){
+  hash_t value = 5381;
+  Uint pos, n;
+
+  n = strlen(msg);
+  pos = 0;
+  while (pos < n){
+    value = value * 33 + msg[pos];
+    pos++;
+  }
+
+  return value;
+}
+
+/*
+ * DJBHash2
+ * Computes masksize hash values over the first len characters of msg.
+ * hash[0] is updated with every character; hash[j] for j > 0 only with
+ * the characters at positions i whose bit (j-1) is set. Each update is
+ * hash * 33 + character, starting from 5381.
+ *
+ * @return array of masksize values allocated with malloc and owned by the
+ *         caller, or NULL if the allocation failed.
+ * NOTE(review): masksize above 33 would shift by the full width of Uint
+ * or more, which is undefined - callers presumably stay below that.
+ */
+hash_t* DJBHash2(char* msg, Uint len, Uint masksize){
+  Uint i, j;
+  hash_t *hash;
+
+  hash = malloc(sizeof(hash_t) * masksize);
+  if (hash == NULL){
+    return NULL;
+  }
+  for (j = 0; j < masksize; j++){
+    hash[j] = 5381;
+  }
+
+  for(i = 0; i < len; msg++, i++){
+    for (j = 0; j < masksize; j++){
+      /* unsigned one: keeps the shift defined up to the top bit */
+      if (j == 0 || i & ((Uint) 1 << (j-1))){
+        hash[j] = ((hash[j] << 5) + hash[j]) + (*msg);
+      }
+    }
+  }
+  return hash;
+}
+
+/*
+ * bl_hashInit
+ * Initialises h as an empty table with PRIMES[hashbitsize - 1] slots of
+ * sizeofelem bytes each, all zeroed and flagged EMPTY, using hashfunc to
+ * hash keys. Exits the program on invalid arguments or if memory cannot
+ * be allocated.
+ */
+void bl_hashInit(Hash *h, Uint hashbitsize, size_t sizeofelem,
+    hash_t (*hashfunc)(char *)){
+  /*
+   * hashbitsize - 1 indexes PRIMES, so 0 has to be rejected before the
+   * lookup; a table with fewer than two slots would make
+   * bl_hashGetHashval and bl_hashGetHashinc divide by zero
+   */
+  if (hashbitsize < 1 || hashbitsize >= PRIMES_SIZE ||
+      PRIMES[hashbitsize - 1] < 2){
+    DBG("hash.c: Attempt to initialize a simple hash of bit size %d but \
+must be between 1 and 32. Exit forced.\n", (int) hashbitsize);
+    exit(-1);
+  }
+  if (sizeofelem == 0){
+    DBG("hash.c: Attempt to initialize a simple hash with data of size %d.\
+Exit forced.\n", (int) sizeofelem);
+    exit(-1);
+  }
+  h->allocelem = PRIMES[hashbitsize - 1];
+  h->sizeofelem = sizeofelem;
+  h->numofelem = 0;
+  h->hashfunc = hashfunc;
+
+  /* calloc zeroes the slots and guards the size multiplication */
+  h->hashspace = calloc(h->allocelem, h->sizeofelem);
+  h->flag = malloc(h->allocelem * sizeof(char));
+  if (h->hashspace == NULL || h->flag == NULL){
+    DBG("hash.c: Memory allocation failed. Exit forced.\n", NULL);
+    exit(-1);
+  }
+  memset(h->flag, EMPTY, h->allocelem * sizeof(char));
+}
+
+/*
+ * bl_hashDestruct
+ * Releases the storage of the table. If rmv is given it is called once
+ * with the address of every stored element before the storage is freed.
+ * The Hash structure itself is not freed.
+ */
+void bl_hashDestruct(Hash *h, void (*rmv)(void *)){
+  hash_t i;
+  char *p;
+
+  if (rmv != NULL){
+    p = (char *) h->hashspace;
+    /*
+     * elements sit at their hash position, not in the first numofelem
+     * slots, so every slot is inspected and only used ones are passed on
+     */
+    for (i = 0; i < h->allocelem; i++){
+      if (h->flag[i] != EMPTY){
+        rmv(p + (i * h->sizeofelem));
+      }
+    }
+  }
+  free(h->hashspace);
+  free(h->flag);
+  h->hashspace = NULL;
+  h->flag = NULL;
+  h->allocelem = 0;
+  h->sizeofelem = 0;
+  h->numofelem = 0;
+}
+
+/* slot index of key: its hash value modulo the number of slots */
+hash_t bl_hashGetHashval(Hash *h, char *key){
+  assert(key != NULL);
+
+  return (h->hashfunc)(key) % h->allocelem;
+}
+
+/*
+ * probing step for key, used for double hashing by bl_fastxGetTags:
+ * a value between 1 and allocelem - 1
+ */
+hash_t bl_hashGetHashinc(Hash *h, char *key){
+  hash_t step;
+
+  assert(key != NULL);
+
+  step = (h->hashfunc)(key) % (h->allocelem - 1);
+  return step + 1;
+}
+
+/* flag of the slot that key hashes to */
+unsigned char bl_hashGetFlagFromKey(Hash *h, char *key){
+  hash_t slot = bl_hashGetHashval(h, key);
+
+  return bl_hashGetFlag(h, slot);
+}
+
+/* flag of slot hashval; hashval is not range-checked */
+unsigned char bl_hashGetFlag(Hash *h, hash_t hashval){
+  unsigned char *flags = h->flag;
+
+  return flags[hashval];
+}
+
+/* element stored in the slot that key hashes to, or NULL if none */
+void *bl_hashGetDataFromKey(Hash *h, char *key){
+  hash_t slot = bl_hashGetHashval(h, key);
+
+  return bl_hashGetData(h, slot);
+}
+
+/*
+ * address of the element stored in slot hashval inside the table's own
+ * storage, or NULL if the table is empty or the slot is flagged EMPTY
+ */
+void *bl_hashGetData(Hash *h, hash_t hashval){
+  char *base;
+
+  if (h->numofelem == 0){
+    return NULL;
+  }
+  if (bl_hashGetFlag(h, hashval) == EMPTY){
+    return NULL;
+  }
+
+  base = (char *) h->hashspace;
+  return base + (hashval * h->sizeofelem);
+}
+
+/* stores a copy of *data in the slot that key hashes to; 1 if stored, 0 if taken */
+unsigned char bl_hashInsertFromKey(Hash *h, char *key, void *data){
+  hash_t slot = bl_hashGetHashval(h, key);
+
+  return bl_hashInsert(h, slot, data);
+}
+
+/*
+ * bl_hashInsert
+ * Copies sizeofelem bytes from data into slot hashval and marks the slot
+ * OCCUPIED, provided the slot is still EMPTY.
+ *
+ * @return 1 if the element was stored, 0 if the slot was already in use
+ */
+unsigned char bl_hashInsert(Hash *h, hash_t hashval, void *data){
+  char *slot;
+
+  if (bl_hashGetFlag(h, hashval) == EMPTY){
+    slot = (char *) h->hashspace + (hashval * h->sizeofelem);
+    memmove(slot, data, h->sizeofelem);
+    h->flag[hashval] = OCCUPIED;
+    h->numofelem++;
+    return 1;
+  }
+  return 0;
+}
+
+/*
+ * used for non-indexed fasta files:
+ *
+ * --> fast and with little overhead (only the hash table)
+ */
+/*
+ * bl_fastxGetTags
+ * Counts unique and duplicate read sequences of f with a hash table of
+ * read indices (double hashing on collisions) and reports the counts via
+ * NFO. Nothing is stored in f. The space argument is not used.
+ * Exits the program if memory cannot be allocated or an insert fails.
+ */
+void bl_fastxGetTags(void *space, fasta_t *f){
+  Uint i, hashbitsize = 28, readlen, *data;
+  hash_t dupcnt = 0, collcnt = 0, cnt = 0;
+  Hash *hash;
+  hash_t hashval, hashinc;
+  char *read;
+
+  hash = malloc(sizeof(Hash));
+  if (hash == NULL){
+    DBG("hash.c: Memory allocation failed. Exit forced.\n", NULL);
+    exit(-1);
+  }
+  bl_hashInit(hash, hashbitsize, sizeof(Uint), DJBHash);
+
+  for (i = 0; i < f->noofseqs; i++){
+    /* work on a private copy of the read (terminator included) */
+    readlen = bl_fastaGetSequenceLength(f, i);
+    read = malloc(readlen + 1);
+    if (read == NULL){
+      DBG("hash.c: Memory allocation failed. Exit forced.\n", NULL);
+      exit(-1);
+    }
+    memmove(read, bl_fastaGetSequence(f, i), readlen+1);
+    hashval = bl_hashGetHashval(hash, read);
+    hashinc = bl_hashGetHashinc(hash, read);
+
+    /* collision handling */
+    while(bl_hashGetFlag(hash, hashval) != EMPTY){
+      /*
+       * no double hashing if too full
+       * --> notice user? abort?
+       * but: too full is known before since each read is
+       * inserted at most at one position
+       * but: amount of double hashing is not known
+       */
+      collcnt++;
+      data = (Uint *) bl_hashGetData(hash, hashval);
+      /* duplicate */
+      if (strcmp(bl_fastaGetSequence(f, *data), read) == 0){
+        break;
+      }
+      /* double hashing: step by hashinc, wrapping at the table size */
+      if (hashval + hashinc >= hash->allocelem){
+        hashval = hashval + hashinc - hash->allocelem;
+      }
+      else {
+        hashval = hashval + hashinc;
+      }
+    }
+    /* empty position --> insert into hash */
+    if (bl_hashGetFlag(hash, hashval) == EMPTY){
+      if (!bl_hashInsert(hash, hashval, &i)){
+        DBG("Insert in hash failed. Exit forced.\n", NULL);
+        exit(-1);
+      }
+      /* read is unique up to now */
+      cnt++;
+    }
+    /* duplicate handling */
+    else {
+      dupcnt++;
+    }
+    free(read);
+  }
+  NFO("%u/%u unique reads and %u duplicates and %u collisions\n", hash->numofelem, cnt, dupcnt, collcnt);
+  bl_hashDestruct(hash, NULL);
+  free(hash);
+}
+
+/*
+ * used for indexed fasta files:
+ * stores collisions with read indices and one read sequence for post-processing,
+ * afterwards sorting to access indexed fasta files only block-wise and
+ * resolve collisions by one linear scan
+ * NOTE: correct number of collisions may differ to non-indexed variant
+ */
+/*
+ * bl_fastxGetTags3
+ * Counts unique and duplicate read sequences of f. Reads whose slot is
+ * already taken are not probed further but recorded (index of the stored
+ * read, own index, copy of the own sequence), sorted and resolved in one
+ * linear scan afterwards. Sets the quantity of every read to 2 first and
+ * increments the quantity of a stored read for each exact duplicate found.
+ * The space argument is not used. Exits the program if memory cannot be
+ * allocated or an insert fails.
+ */
+void bl_fastxGetTags3(void *space, fasta_t *f){
+  Uint i, j, hashbitsize = 28, readlen, *data, lastcoll;
+  hash_t dupcnt = 0, cnt = 0;
+  Hash *hash;
+  hash_t hashval;
+  char *read;
+  collision_t *coll;
+
+  hash = malloc(sizeof(Hash));
+  if (hash == NULL){
+    DBG("hash.c: Memory allocation failed. Exit forced.\n", NULL);
+    exit(-1);
+  }
+  bl_hashInit(hash, hashbitsize, sizeof(Uint), DJBHash);
+  lastcoll = 0;
+
+  /* TODO: more efficiently */
+  coll = malloc(sizeof(collision_t) * f->noofseqs);
+  if (coll == NULL && f->noofseqs > 0){
+    DBG("hash.c: Memory allocation failed. Exit forced.\n", NULL);
+    exit(-1);
+  }
+
+  MSG("Hashing\n");
+  for (i = 0; i < f->noofseqs; i++){
+    bl_fastaSetQuantity(f, i, 2);
+    /* private copy of the read (terminator included) */
+    readlen = bl_fastaGetSequenceLength(f, i);
+    read = malloc(readlen + 1);
+    if (read == NULL){
+      DBG("hash.c: Memory allocation failed. Exit forced.\n", NULL);
+      exit(-1);
+    }
+    memmove(read, bl_fastaGetSequence(f, i), readlen+1);
+    hashval = bl_hashGetHashval(hash, read);
+
+    /* collision handling: the copy is kept and owned by the record */
+    if(bl_hashGetFlag(hash, hashval) != EMPTY){
+      data = (Uint *) bl_hashGetData(hash, hashval);
+      coll[lastcoll].a = *data;
+      coll[lastcoll].b = i;
+      coll[lastcoll].readb = read;
+      lastcoll++;
+    }
+    /* empty position --> insert into hash */
+    else {
+      cnt++;
+      /* init entry object */
+      if (!bl_hashInsert(hash, hashval, &i)){
+        DBG("Insert in hash failed. Exit forced.\n", NULL);
+        exit(-1);
+      }
+      free(read);
+    }
+  }
+  /* sort collisions by a idx and readb sequence */
+  MSG("Sorting\n");
+  qsort(coll, lastcoll, sizeof(collision_t), cmp_collision_qsort);
+  NFO("Resolving %u collisions\n", lastcoll);
+  /* resolving collisions */
+  for (i = 0; i < lastcoll; i++){
+    read = bl_fastaGetSequence(f, coll[i].a);
+    readlen = bl_fastaGetSequenceLength(f, coll[i].a);
+    /*
+     * duplicate with element in hash table; the terminator is compared
+     * as well so that a longer read sharing the prefix is not a duplicate
+     */
+    if (strncmp(read, coll[i].readb, readlen + 1) == 0){
+      bl_fastaSetQuantity(f, coll[i].a, bl_fastaGetQuantity(f, coll[i].a) + 1);
+      /* TODO: slow if this needs sequence data reloading */
+      //bl_fastaSetQuantity(f, coll[i].b, 0);
+      dupcnt++;
+      free(coll[i].readb);
+    }
+    else {
+      cnt++;
+      /*
+       * equal read sequences have
+       * collisions with same element
+       * in hash table but element has
+       * different sequence, e.g.,
+       * hashtable[i] = element a with hash(a)=i
+       * hash(b) = i but a.sequence != b.sequence
+       * hash(c) = i but a.sequence != c.sequence
+       * BUT: b.sequence == c.sequence
+       * --> process all such cases and jump over
+       */
+      j = 1;
+      while(i + j < lastcoll && coll[i].a == coll[i+j].a &&
+            strcmp(coll[i].readb, coll[i+j].readb) == 0){
+        dupcnt++;
+        /* TODO: slow since it requires sequence data reloading */
+        //bl_fastaSetQuantity(f, coll[i].b, bl_fastaGetQuantity(f, coll[i].b) + 1);
+        //bl_fastaSetQuantity(f, coll[i+j].b, 0);
+        free(coll[i+j].readb);
+        j++;
+      }
+      free(coll[i].readb);
+
+      /* jump over already processed entries */
+      i += j - 1;
+    }
+  }
+  free(coll);
+
+  NFO("%u unique reads and %u duplicates and %u collisions\n", cnt, dupcnt, lastcoll);
+  bl_hashDestruct(hash, NULL);
+  free(hash);
+}
+
+/* frees the read string of the entry_t at data and resets its index */
+void bl_entryDestruct(void *data){
+  entry_t *e = data;
+
+  free(e->read);
+  e->idx = 0;
+}
+
+/*
+ * qsort comparator for collision_t: ascending by index a, ties broken
+ * by strcmp on the readb sequences
+ */
+int cmp_collision_qsort(const void *a, const void *b){
+  const collision_t *lhs = a;
+  const collision_t *rhs = b;
+
+  if (lhs->a != rhs->a){
+    return (lhs->a > rhs->a) ? 1 : -1;
+  }
+  return strcmp(lhs->readb, rhs->readb);
+}
+
+#endif /* only required if hashing is defined */
diff --git a/segemehl/libs/hash.h b/segemehl/libs/hash.h
new file mode 100644
index 0000000..ecc8c11
--- /dev/null
+++ b/segemehl/libs/hash.h
@@ -0,0 +1,87 @@
+#ifndef HASH_H
+#define HASH_H
+
+/**
+ * hash.h
+ * implementation of simple hashing
+ *
+ * @author Christian Otto
+ * @email christian at bioinf.uni-leipzig.de
+ * @date Wed Nov 24 15:48:15 CET 2010
+ *
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 278 $
+ * Author: $Author: steve $
+ * Date: $Date: 2011-04-04 11:06:15 -0400 (Mon, 04 Apr 2011) $
+ * Id: $Id: hash.h 278 2011-04-04 15:06:15Z steve $
+ * Url: $URL: http://www2.bioinf.uni-leipzig.de/svn5/segemehl/libs/hash.h $
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "basic-types.h"
+
+typedef uint32_t hash_t;
+
+#define MAXFILL 0.77
+
+#define EMPTY 0
+#define OCCUPIED 1
+#define LOADED 2
+
+/*
+ * table sizes for the hash: bl_hashInit uses PRIMES[hashbitsize - 1] as
+ * the number of slots. The first entry is 0 and the last entries approach
+ * the upper end of the 32 bit range of hash_t.
+ */
+#define PRIMES_SIZE 32
+static const hash_t PRIMES[PRIMES_SIZE] =
+{
+ 0ul, 3ul, 11ul, 23ul, 53ul,
+ 97ul, 193ul, 389ul, 769ul, 1543ul,
+ 3079ul, 6151ul, 12289ul, 24593ul, 49157ul,
+ 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,
+ 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,
+ 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,
+ 3221225473ul, 4294967291ul
+};
+
+/* open hash table of fixed-size elements stored by value */
+typedef struct
+{
+ void *hashspace; /* allocelem slots of sizeofelem bytes each */
+ unsigned char *flag; /* one state byte per slot (EMPTY, OCCUPIED, ...) */
+ hash_t allocelem; /* number of slots */
+ hash_t numofelem; /* number of slots in use */
+ size_t sizeofelem; /* bytes per element */
+ hash_t (*hashfunc) (char *); /* maps a key string to its hash value */
+} Hash;
+
+/* one hash collision as recorded by bl_fastxGetTags3 */
+typedef struct
+{
+ Uint a; /* index of the read already stored in the slot */
+ Uint b; /* index of the read that hit the occupied slot */
+ char *readb; /* malloc'ed copy of the sequence of read b */
+} collision_t;
+
+/* index together with an owned read string; see bl_entryDestruct */
+typedef struct
+{
+ Uint idx; /* reset to 0 by bl_entryDestruct */
+ char *read; /* freed by bl_entryDestruct */
+} entry_t;
+
+/* destructor for entry_t elements, usable as rmv in bl_hashDestruct */
+void bl_entryDestruct(void *data);
+/* string hash functions */
+hash_t DJBHash(char* msg);
+hash_t* DJBHash2(char* msg, Uint len, Uint masksize);
+/* qsort comparator for collision_t records */
+int cmp_collision_qsort(const void *a, const void *b);
+/* set-up and tear-down of a table */
+void bl_hashInit(Hash *h, Uint hashbitsize, size_t sizeofelem, hash_t (*hashfunc)(char*));
+void bl_hashDestruct(Hash *h, void (*rmv)(void*));
+/* slot index and probing step of a key */
+hash_t bl_hashGetHashval(Hash *h, char *key);
+hash_t bl_hashGetHashinc(Hash *h, char *key);
+/* slot state, by key or by slot index */
+unsigned char bl_hashGetFlagFromKey(Hash *h, char *key);
+unsigned char bl_hashGetFlag(Hash *h, hash_t hashval);
+/* stored element (NULL if none), by key or by slot index */
+void *bl_hashGetDataFromKey(Hash *h, char *key);
+void *bl_hashGetData(Hash *h, hash_t hashval);
+/* store a copy of *data; 1 if stored, 0 if the slot is in use */
+unsigned char bl_hashInsertFromKey(Hash *h, char *key, void *data);
+unsigned char bl_hashInsert(Hash *h, hash_t hashval, void *data);
+/* duplicate statistics over the reads of a fasta set */
+void bl_fastxGetTags(void *space, fasta_t *f);
+void bl_fastxGetTags3(void *space, fasta_t *f);
+
+#endif /* HASH_H */
diff --git a/segemehl/libs/info.c b/segemehl/libs/info.c
new file mode 100644
index 0000000..7618e81
--- /dev/null
+++ b/segemehl/libs/info.c
@@ -0,0 +1,128 @@
+
+/*
+ * info.c
+ * nfo messages
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 08/26/2007 06:49:02 PM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: info.c 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/info.c $
+ *
+ */
+
+ #include <stdarg.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <time.h>
+ #include "info.h"
+ #include "debug.h"
+
+ FILE *nfodevice = NULL;
+
+
+ /*
+  * timestr_r
+  * Formats *timeptr as "Www Mmm dd hh:mm:ss yyyy" (no trailing newline).
+  *
+  * @return pointer to a static buffer that is overwritten by the next
+  *         call; despite the name the function is therefore not
+  *         reentrant. tm_wday and tm_mon are used as table indices
+  *         without a range check.
+  */
+ char *timestr_r(const struct tm *timeptr) {
+   static const char wday_name[7][3] = {
+     "Sun", "Mon", "Tue", "Wed",
+     "Thu", "Fri", "Sat"
+   };
+
+   static const char mon_name[12][3] = {
+     "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
+   };
+
+   static char result[26];
+
+   /* the names carry no terminator, hence %.3s; output is bounded to the buffer */
+   snprintf(result, sizeof(result), "%.3s %.3s%3d %.2d:%.2d:%.2d %d",
+       wday_name[timeptr->tm_wday], mon_name[timeptr->tm_mon],
+       timeptr->tm_mday, timeptr->tm_hour, timeptr->tm_min,
+       timeptr->tm_sec, 1900 + timeptr->tm_year);
+
+   return result;
+ }
+
+ /*
+  * infomsg
+  * Prints a printf-style message to the info device (NFODEVICE until
+  * another one has been set) unless mute is set. With PROGNFO defined the
+  * message is preceded by a "[SEGEMEHL] <time>: " prefix. The file and
+  * line arguments are not used.
+  *
+  * @return 0 when muted, otherwise the result of vfprintf
+  */
+ int
+ infomsg( char *file,
+     int line,
+     const char *fmt, ...) {
+
+   va_list args;
+   time_t now;
+   struct tm *local;
+   int written;
+
+   if (mute) return 0;
+
+   time(&now);
+   local = localtime (&now);
+
+   if (!nfodevice) {
+     nfodevice = NFODEVICE;
+   }
+
+   va_start(args, fmt);
+#ifdef PROGNFO
+   fprintf(nfodevice, "[%s] %s: ", "SEGEMEHL", timestr_r(local));
+#endif
+   written = vfprintf(nfodevice, fmt, args);
+   va_end(args);
+
+   return written;
+ }
+
+
+/*
+ * setnfodevice
+ * Redirects info messages to filename, which is created or truncated.
+ * Exits the program if the file cannot be opened. A device set earlier
+ * is not closed.
+ */
+void
+setnfodevice(char *filename) {
+  FILE *out = fopen(filename, "w");
+
+  if (!out) {
+    DBG("Couldn't open file '%s'. Exit forced.\n", filename);
+    exit(-1);
+  }
+
+  nfodevice = out;
+}
+
+/*
+ * nfolevel
+ * Like infomsg, but the message is only printed if level does not exceed
+ * the compile-time NFOLEVEL. The file and line arguments are not used.
+ *
+ * @return 0 when muted or filtered out, otherwise the result of vfprintf
+ */
+int
+nfolevel( char *file,
+    int line,
+    int level,
+    const char *fmt, ... ) {
+
+  va_list args;
+  time_t now;
+  struct tm *local;
+  int written;
+
+  if (mute) return 0;
+
+  time(&now);
+  local = localtime (&now);
+
+  if (!nfodevice) {
+    nfodevice = NFODEVICE;
+  }
+
+  /* messages above the configured verbosity are dropped */
+  if (NFOLEVEL < level) {
+    return 0;
+  }
+
+  va_start(args, fmt);
+#ifdef PROGNFO
+  fprintf(nfodevice, "[%s] %s: ", "SEGEMEHL", timestr_r(local));
+#endif
+  written = vfprintf(nfodevice, fmt, args);
+  va_end(args);
+
+  return written;
+}
+
+
diff --git a/segemehl/libs/info.h b/segemehl/libs/info.h
new file mode 100644
index 0000000..a1f3c67
--- /dev/null
+++ b/segemehl/libs/info.h
@@ -0,0 +1,44 @@
+ #ifndef INFO_H
+ #define INFO_H
+
+/*
+ *
+ * info.h
+ * nfo messages
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 08/26/2007 07:17:44 PM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: info.h 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/info.h $
+ */
+
+ #include <stdarg.h>
+ #include <stdio.h>
+ #include <string.h>
+
+#ifndef NFOLEVEL
+#define NFOLEVEL 0
+#endif
+
+#ifndef NFODEVICE
+#define NFODEVICE stderr
+#endif
+
+/*
+ * NFO and INFO need at least one argument after the format string because
+ * __VA_ARGS__ must not be empty; use MSG for plain strings.
+ * NOTE(review): NFOL expands to debuglevel, which is not declared in this
+ * header (presumably debug.h) - confirm that this, and not nfolevel, is
+ * intended.
+ */
+#define NFOL(L, X, ... ) debuglevel (__FILE__, __LINE__, L, X, __VA_ARGS__)
+#define NFO(X, ...) infomsg(__FILE__, __LINE__, X, __VA_ARGS__)
+#define INFO(X, ...) infomsg(__FILE__, __LINE__, X, __VA_ARGS__)
+#define MSG(X) infomsg(__FILE__, __LINE__, X)
+
+/* non-zero silences infomsg and nfolevel; defined outside info.c */
+extern unsigned char mute;
+
+int infomsg(char *, int, const char *fmt, ...);
+int infolevel(char *, int, int, const char *fmt, ...);
+/* defined in info.c */
+int nfolevel(char *, int, int, const char *fmt, ...);
+void setnfodevice(char *filename);
+
+ #endif
diff --git a/segemehl/libs/iupac.c b/segemehl/libs/iupac.c
new file mode 100644
index 0000000..82b624b
--- /dev/null
+++ b/segemehl/libs/iupac.c
@@ -0,0 +1,291 @@
+/**
+ * iupac.c
+ * declarations for IUPAC nucleotide code
+ *
+ * @author Christian Otto
+ * @email christian at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Fri Jul 23 15:03:08 CEST 2010
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 149 $
+ * Author: $Author: steve $
+ * Date: $Date: 2010-09-14 05:45:04 -0400 (Tue, 14 Sep 2010) $
+ * Id: $Id: iupac.c 149 2010-09-14 09:45:04Z steve $
+ * Url: $URL: http://www2.bioinf.uni-leipzig.de/svn5/segemehl/libs/iupac.c $
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+#include "basic-types.h"
+#include "debug.h"
+#include "iupac.h"
+
+/* maximal allowed symbol ambiguity on the query sequence */
+static Uint maxqryamb;
+
+/* maximal allowed symbol ambiguity on the subject sequence */
+static Uint maxseqamb;
+
+/* indicates whether iupac matching is enabled */
+static BOOL iupac;
+
+/*
+ * size of the IUPAC lookup tables: the tables are indexed by character
+ * value, so one entry is needed for every value of an unsigned char
+ * (0..255); with 255 entries the index 255 would be out of bounds
+ */
+#define IUPACTABSIZE 256
+
+/* number of bits used by the IUPAC bit vectors (4 upper + 4 lower case) */
+#define IUPACMAXBIT 8
+
+/*
+ * iupac symbols as bit vectors, non-iupac symbols as zeros,
+ * two symbols match if the AND of their bit vectors is non-zero
+ */
+static Uint IUPACTAB[IUPACTABSIZE];
+
+/*
+ * ambiguity of each iupac symbol, i.e. the number of ones
+ * in its bit vector
+ */
+static Uint IUPACAMB[IUPACTABSIZE];
+
+/*--------------------------------- getAmb -------------------------------------
+ *
+ * @brief counts the bits set in num, i.e. the degree of ambiguity of a
+ *        symbol given as IUPAC bit vector
+ * @author Christian Otto
+ *
+ */
+Uint getAmb(Uint num){
+  Uint ones;
+
+  /* inspect the lowest bit and shift until no bit is left */
+  for (ones = 0; num != 0; num >>= 1){
+    ones += (num & 1);
+  }
+  return ones;
+}
+
+/*-------------------------------- initIUPAC -----------------------------------
+ *
+ * @brief stores the ambiguity limits and fills the IUPAC tables using one
+ *        bit per nucleotide. Upper case symbols use the bits 0-3, lower
+ *        case symbols the same pattern shifted by four bits, hence upper
+ *        and lower case symbols never share a bit. The table content does
+ *        not depend on the parameters (only qryamb and seqamb are stored).
+ * @author Christian Otto
+ *
+ */
+void initIUPAC(Uint qryamb, Uint seqamb){
+  enum { A = 1 << 0, C = 1 << 1, G = 1 << 2, T = 1 << 3 };
+  static const struct {
+    char upper;
+    char lower;
+    Uint bits;
+  } symbols[] = {
+    /* nucleotides */
+    { 'A', 'a', A },
+    { 'C', 'c', C },
+    { 'G', 'g', G },
+    { 'T', 't', T },
+    { 'U', 'u', T },
+    /* symbols of ambiguity 2 */
+    { 'R', 'r', A | G },
+    { 'Y', 'y', C | T },
+    { 'S', 's', G | C },
+    { 'W', 'w', A | T },
+    { 'K', 'k', G | T },
+    { 'M', 'm', A | C },
+    /* symbols of ambiguity 3 */
+    { 'B', 'b', C | G | T },
+    { 'D', 'd', A | G | T },
+    { 'H', 'h', A | C | T },
+    { 'V', 'v', A | C | G },
+    /* symbol of ambiguity 4 */
+    { 'N', 'n', A | C | G | T }
+  };
+  Uint s, c;
+
+  memset(IUPACTAB, 0, IUPACTABSIZE * sizeof(Uint));
+
+  maxqryamb = qryamb;
+  maxseqamb = seqamb;
+  iupac = (maxseqamb > 1 || maxqryamb > 1);
+
+  for (s = 0; s < sizeof(symbols) / sizeof(symbols[0]); s++){
+    IUPACTAB[(Uint) symbols[s].upper] = symbols[s].bits;
+    IUPACTAB[(Uint) symbols[s].lower] = symbols[s].bits << 4;
+  }
+
+  /* ambiguity = number of bits set in the bit vector */
+  for (c = 0; c < IUPACTABSIZE; c++){
+    IUPACAMB[c] = getAmb(IUPACTAB[c]);
+  }
+}
+
+/*------------------------------ couldMatchIUPAC -------------------------------
+ *
+ * @brief returns 0 if no ambiguity is allowed on the subject sequence
+ *        (maxseqamb == 1) and the query character has either ambiguity 1
+ *        or an ambiguity above maxqryamb; returns 1 otherwise.
+ *        Characters outside the table are treated as non-IUPAC symbols
+ *        (ambiguity 0).
+ *
+ */
+BOOL couldMatchIUPAC(char qrych){
+  /*
+   * go through unsigned char: a plain cast of a negative char to Uint
+   * yields a huge index far beyond the table
+   */
+  Uint idx = (unsigned char) qrych;
+  Uint amb = (idx < IUPACTABSIZE) ? IUPACAMB[idx] : 0;
+
+  if (maxseqamb == 1 && (amb == 1 || amb > maxqryamb)){
+    return 0;
+  }
+  return 1;
+}
+
+/*------------------------------- countAmbChars --------------------------------
+ *
+ * @brief counts the characters among the first len characters of seq
+ *        that are ambiguous (ambiguity > 1) but still within the
+ *        allowed query ambiguity maxqryamb.
+ *        Characters outside the table are treated as non-IUPAC symbols.
+ *
+ */
+Uint countAmbChars(char *seq, Uint len){
+  Uint i, idx, cur, amb=0;
+
+  for (i = 0; i < len; i++){
+    /* unsigned char first: negative chars must not become huge indices */
+    idx = (unsigned char) seq[i];
+    cur = (idx < IUPACTABSIZE) ? IUPACAMB[idx] : 0;
+    if (cur > 1 && cur <= maxqryamb){
+      amb++;
+    }
+  }
+  return amb;
+}
+
+/*--------------------------- countNonMatchingChars ----------------------------
+ *
+ * @brief counts the characters among the first len characters of seq
+ *        that are either no IUPAC symbol (ambiguity 0) or exceed the
+ *        allowed query ambiguity maxqryamb.
+ *        Characters outside the table are treated as non-IUPAC symbols.
+ *
+ */
+Uint countNonMatchingChars(char *seq, Uint len){
+  Uint i, idx, cur, cnt=len;
+
+  for (i = 0; i < len; i++){
+    /* unsigned char first: negative chars must not become huge indices */
+    idx = (unsigned char) seq[i];
+    cur = (idx < IUPACTABSIZE) ? IUPACAMB[idx] : 0;
+    if (cur > 0 && cur <= maxqryamb){
+      cnt--;
+    }
+  }
+  return cnt;
+}
+
+/*-------------------------------- matchIUPAC ----------------------------------
+ *
+ * @brief indicates whether a query character matches the subject sequence
+ *        character under the initialized maximal ambiguity parameters:
+ *        both ambiguities must be within their limits and the bit vectors
+ *        must share at least one bit. Characters outside the table are
+ *        treated as non-IUPAC symbols and never match.
+ * @author Christian Otto
+ *
+ */
+BOOL matchIUPAC(char qrych, char seqch){
+  /* unsigned char first: negative chars must not become huge indices */
+  Uint q = (unsigned char) qrych;
+  Uint s = (unsigned char) seqch;
+
+  if (q >= IUPACTABSIZE || s >= IUPACTABSIZE){
+    return 0;
+  }
+  if (IUPACAMB[s] <= maxseqamb && IUPACAMB[q] <= maxqryamb){
+    return ((IUPACTAB[s] & IUPACTAB[q]) > 0);
+  }
+  return 0;
+}
+
+/*------------------------------ isallowedIUPAC --------------------------------
+ *
+ * @brief reports the flag set by initIUPAC, i.e. whether any ambiguous
+ *        IUPAC symbol is allowed in matching
+ * @author Christian Otto
+ *
+ */
+BOOL isallowedIUPAC(){
+  BOOL enabled = iupac;
+
+  return enabled;
+}
+
+/*---------------------------- minshannonentropy -------------------------------
+ *
+ * @brief minimal zero order sequence entropy for strings containing
+ *        symbols of the IUPAC nucleotide code by maximizing nucleotide
+ *        counts using ambiguous characters.
+ *        Without ambiguous symbols the entropy is computed over all
+ *        distinct characters of the sequence. With ambiguous symbols
+ *        the symbols are greedily assigned to the nucleotide bit with
+ *        the highest count (lower case differs from upper case!);
+ *        non-IUPAC characters are ignored in that case.
+ *        Returns 0 for an empty sequence.
+ * @author Christian Otto
+ *
+ */
+double minshannonentropy(char *seq, Uint len) {
+  enum { NCHARS = 256 };            /* one counter per unsigned char value */
+  Uint chcnt[NCHARS];
+  Uint bitcnt[IUPACMAXBIT];
+  double p[IUPACMAXBIT];
+  Uint i, j, k, max, sum = 0, isamb = 0;
+  Uint tabsize = (IUPACTABSIZE < NCHARS) ? IUPACTABSIZE : NCHARS;
+  double H = 0, freq;
+
+  if (seq == NULL || len == 0){
+    return 0;
+  }
+
+  for (j = 0; j < NCHARS; j++){
+    chcnt[j] = 0;
+  }
+  for (k = 0; k < IUPACMAXBIT; k++){
+    p[k] = 0;
+  }
+
+  /* set symbol counts (and set isamb if ambiguous symbols occur) */
+  for (i = 0; i < len; i++){
+    /* unsigned char first: negative chars must not become huge indices */
+    j = (unsigned char) seq[i];
+    chcnt[j]++;
+    if (!isamb && j < tabsize && IUPACAMB[j] > 1){
+      isamb = 1;
+    }
+  }
+
+  if (isamb){
+    /*
+     * maximize counts for nucleotides -> minimize entropy.
+     * Every round with sum > 0 clears all symbols of one bit, hence at
+     * most IUPACMAXBIT such rounds exist and one additional round is
+     * needed to observe sum == 0.
+     */
+    for (i = 0; i <= IUPACMAXBIT; i++){
+      /* count bits at each position */
+      for (k = 0; k < IUPACMAXBIT; k++){
+        bitcnt[k] = 0;
+      }
+      for (j = 0; j < tabsize; j++){
+        if (IUPACTAB[j] > 0 && chcnt[j] > 0){
+          for (k = 0; k < IUPACMAXBIT; k++){
+            bitcnt[k] += chcnt[j] * (1 & (IUPACTAB[j] >> k));
+          }
+        }
+      }
+      /* get max and sum */
+      max = 0; sum = 0;
+      for (k = 0; k < IUPACMAXBIT; k++){
+        if (bitcnt[k] > bitcnt[max]){
+          max = k;
+        }
+        sum += bitcnt[k];
+      }
+      if (sum == 0) break;
+
+      /* assign all symbols with max-th bit set and clear their counts */
+      for (j = 0; j < tabsize; j++){
+        if (1 & (IUPACTAB[j] >> max)){
+          p[max] += chcnt[j];
+          chcnt[j] = 0;
+        }
+      }
+    }
+    /* all IUPAC symbols must have been assigned */
+    assert(sum == 0);
+
+    for (k = 0; k < IUPACMAXBIT; k++){
+      if (p[k] > 0){
+        freq = p[k] / len;
+        H += freq * log2(freq);
+      }
+    }
+  }
+  else {
+    /*
+     * otherwise simply use the character counts; summing directly over
+     * all characters imposes no limit on the number of distinct symbols
+     */
+    for (j = 0; j < NCHARS; j++){
+      if (chcnt[j] > 0){
+        freq = (double) chcnt[j] / len;
+        H += freq * log2(freq);
+      }
+    }
+  }
+
+  return -1 * H;
+}
diff --git a/segemehl/libs/iupac.h b/segemehl/libs/iupac.h
new file mode 100644
index 0000000..247c613
--- /dev/null
+++ b/segemehl/libs/iupac.h
@@ -0,0 +1,34 @@
+#ifndef IUPAC_H
+#define IUPAC_H
+
+/**
+ * iupac.h
+ * declarations for IUPAC nucleotide code
+ *
+ * @author Christian Otto
+ * @email christian at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Fri Jul 23 15:03:08 CEST 2010
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 144 $
+ * Author: $Author: steve $
+ * Date: $Date: 2010-09-02 05:58:04 -0400 (Thu, 02 Sep 2010) $
+ * Id: $Id: iupac.h 144 2010-09-02 09:58:04Z steve $
+ * Url: $URL: http://www2.bioinf.uni-leipzig.de/svn5/segemehl/libs/iupac.h $
+ */
+
+#include "basic-types.h"
+
+/* stores the ambiguity limits and fills the IUPAC lookup tables */
+void initIUPAC(Uint maxqryamb, Uint maxseqamb);
+/* returns whether one of the ambiguity limits exceeds 1 */
+BOOL isallowedIUPAC();
+/* returns whether qrych matches seqch under the ambiguity limits */
+BOOL matchIUPAC(char qrych, char seqch);
+/* returns 0 if qrych is ruled out while no subject ambiguity is allowed */
+BOOL couldMatchIUPAC(char qrych);
+/* counts ambiguous characters of seq within the query ambiguity limit */
+Uint countAmbChars(char *seq, Uint len);
+/* counts characters of seq that are non-IUPAC or exceed the query limit */
+Uint countNonMatchingChars(char *seq, Uint len);
+/* minimal zero order entropy of the first len characters of seq */
+double minshannonentropy(char *seq, Uint len);
+
+#endif /* IUPAC_H */
+
diff --git a/segemehl/libs/karlin.c b/segemehl/libs/karlin.c
new file mode 100644
index 0000000..d3b9d86
--- /dev/null
+++ b/segemehl/libs/karlin.c
@@ -0,0 +1,271 @@
+/*
+ Copyright by Stefan Kurtz (C) 1999-2003
+ =====================================
+ You may use, copy and distribute this file freely as long as you
+ - do not change the file,
+ - leave this copyright notice in the file,
+ - do not make any profit with the distribution of this file
+ - give credit where credit is due
+ You are not allowed to copy or distribute this file otherwise
+ The commercial usage and distribution of this file is prohibited
+ Please report bugs and suggestions to <kurtz at zbh.uni-hamburg.de>
+*/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "debug.h"
+#include "info.h"
+#include "basic-types.h"
+#include "memory.h"
+
+#define MAXIT 150 /* Maximum number of iterations used in calculating K */
+
+/*
+ * Euclidean algorithm on a and |b|; the larger of both values is used as
+ * the first operand. A negative a is not adjusted.
+ */
+static int gcd (int a, int b)
+{
+  int rem;
+
+  if (b < 0)
+  {
+    b = -b;
+  }
+  if (a < b)
+  {
+    rem = a;
+    a = b;
+    b = rem;
+  }
+  while (b > 0)
+  {
+    rem = a % b;
+    a = b;
+    b = rem;
+  }
+  return a;
+}
+
+/*
+ * karlinpp
+ * Calculates the parameters lambda and K for the score range [low, high],
+ * where pr[i] is the probability of score low+i (0 <= i <= high-low).
+ * The probabilities are normalized by their sum before use.
+ *
+ * Returns 0 on success and a negative value otherwise:
+ *   -1 lowest score not negative       -2 no positive score possible
+ *   -3 negative probability            -4 allocation of p failed
+ *   -5 expected score not negative     -6 allocation of P failed
+ *   -7 iteration limit MAXIT reached
+ * All buffers allocated here are released on every return path.
+ */
+static int karlinpp(void *space, int low, int high, double *pr,
+    double *lambda, double *K)
+{
+  int i, j, range, lo, hi, first, last;
+  double upval, Sumval, av, sum, *p, *P, *ptrP, *ptr1, *ptr2, *ptr1e, newval,
+         Ktmp;
+
+  /* Check that scores and their associated probabilities are valid */
+
+  if (low >= 0)
+  {
+    /* low is an int, hence %d */
+    NFO("Lowest score %d must be negative", low);
+    return -1;
+  }
+  for (i = range = high - low; i > -low && !pr[i]; --i)
+    /* Nothing */ ;
+  if (i <= -low)
+  {
+    MSG("A positive score must be possible");
+    return -2;
+  }
+  for (sum = 0.0, i = 0; i <= range; sum += pr[i++])
+  {
+    if (pr[i] < 0.0)
+    {
+      DBG("Negative probability %.2f not allowed",pr[i]);
+      return -3;
+    }
+  }
+  if (sum < 0.99995 || sum > 1.00005)
+  {
+    DBGL(3,"Probabilities sum to %.4f. Normalizing.\n", sum);
+  }
+
+  p = ALLOCMEMORY(space, NULL, double, (Uint)(range+100));
+  if(p == NULL)
+  {
+    return -4;
+  }
+  for (Sumval = (double) low, i = 0; i <= range; ++i)
+  {
+    Sumval += i * (p[i] = pr[i] / sum);
+  }
+  if(Sumval >= 0.0)
+  {
+    NFO("Invalid (non-negative) expected score: %.3f", Sumval);
+    FREEMEMORY(space, p);
+    return -5;
+  }
+
+  /* Calculate the parameter lambda */
+
+  upval = 0.5;
+  do
+  {
+    upval *= 2;
+    ptr1 = p;
+    for (sum = 0.0, i = low; i <= high; ++i)
+    {
+      sum += *ptr1++ * exp (upval * i);
+    }
+  } while (sum < 1.0);
+  /* bisection between *lambda (sum <= 1) and upval (sum > 1) */
+  for (*lambda = 0.0, j = 0; j < 40; ++j)
+  {
+    newval = (*lambda + upval) / 2.0;
+    ptr1 = p;
+    for (sum = 0.0, i = low; i <= high; ++i)
+    {
+      sum += *ptr1++ * exp (newval * i);
+    }
+    if (sum > 1.0)
+    {
+      upval = newval;
+    } else
+    {
+      *lambda = newval;
+    }
+  }
+
+  /* Calculate the parameter K */
+
+  ptr1 = p;
+  for (av = 0.0, i = low; i <= high; ++i)
+  {
+    av += *ptr1++ * i * exp (*lambda * i);
+  }
+  if (low == -1 || high == 1)
+  {
+    *K = (high == 1) ? av : Sumval * Sumval / av;
+    *K *= 1.0 - exp (-*lambda);
+    /* release with the same macro family that allocated p */
+    FREEMEMORY(space, p);
+    return 0;  /* Parameters calculated successfully */
+  }
+  Sumval = 0.0;
+  lo = 0;
+  hi = 0;
+  P = ALLOCMEMORY(space, NULL,double,(Uint) (MAXIT * range + 100));
+  if(P == NULL)
+  {
+    FREEMEMORY(space, p);
+    return -6;
+  }
+  for (*P = 1.0, sum = 1.0, j = 1;
+      j <= MAXIT && sum > 0.00001; Sumval += sum /= j++)
+  {
+    first = last = range;
+    for (ptrP = P + (hi += high) - (lo += low); ptrP >= P; *ptrP-- = sum)
+    {
+      ptr1 = ptrP - first;
+      ptr1e = ptrP - last;
+      ptr2 = p + first;
+      for (sum = 0.0; ptr1 >= ptr1e;)
+      {
+        sum += *ptr1-- * *ptr2++;
+      }
+      if (first)
+      {
+        --first;
+      }
+      if (( (ptrP - P)) <= range)
+      {
+        --last;
+      }
+    }
+    for (sum = 0.0, i = lo; i; ++i)
+    {
+      sum += *++ptrP * exp (*lambda * i);
+    }
+    for (; i <= hi; ++i)
+    {
+      sum += *++ptrP;
+    }
+  }
+  if (j > MAXIT)
+  {
+    MSG("Value for K may be too large due to insufficient iterations");
+    FREEMEMORY(space, P);
+    FREEMEMORY(space, p);
+    return -7;
+  }
+  for (i = low; !p[i - low]; ++i)
+    /* Nothing */ ;
+  for (j = -i; i < high && j > 1;)
+  {
+    if (p[++i - low] != 0.0)
+    {
+      j = gcd (j, i);
+    }
+  }
+  Ktmp = (double) (j * exp (-2 * Sumval));
+  *K = Ktmp / (av * (1.0 - exp (-*lambda * j)));
+
+  FREEMEMORY(space, P);
+  FREEMEMORY(space, p);
+  return 0;  /* Parameters calculated successfully */
+}
+
+/*
+ * karlinunitcostpp
+ * Runs karlinpp for the unit cost scheme (match = 1, mismatch = -1) with
+ * the probabilities 0.75 for a mismatch and 0.25 for a match, and derives
+ * *H from the resulting lambda. Returns the karlinpp return code; *H is
+ * only written on success.
+ */
+int karlinunitcostpp(void *space, double *lambda, double *H, double *K)
+{
+  const int match = 1;
+  const int mismatch = -1;
+  double pr[] = {0.75, 0.0, 0.25};
+  double targetid;
+  int ret;
+
+  ret = karlinpp(space, mismatch, match, pr, lambda, K);
+  if (ret == 0) {
+    targetid = 0.25 * exp(*lambda*match);
+    *H = (*lambda * match * targetid) + (*lambda * mismatch * (1-targetid));
+  }
+
+  return ret;
+}
+
+
+/* returns exp(-E) with E = K * multiplier * exp(-lambda * score) */
+double significance (double lambda,double K,double multiplier, int score)
+{
+  double expected = K * multiplier * exp (-lambda * score);
+
+  return exp (-expected);
+}
+
+
+/* returns K * multiplier * exp(-lambda * score) */
+double evalue (double lambda,double K,double multiplier, int score)
+{
+  double exponent = -lambda * score;
+
+  return K * multiplier * exp (exponent);
+}
+
+/*
+ * returns multiplier * 2^(-score); lambda and K are accepted but not used
+ */
+double bitscoreevalue (double lambda,double K,double multiplier, int score)
+{
+  double exponent = -1 * score;
+
+  return multiplier * pow(2,(exponent));
+}
+
+/* returns (lambda * score - ln K) / ln 2 */
+double bitscore(int score, double lambda, double K) {
+  double nats = (lambda*score)-log(K);
+
+  return nats/log(2);
+}
+
+/*
+ * returns ln(m * n * K) / H.
+ * The product is computed in double precision: multiplying the two Uint
+ * lengths first wraps around for large values of m * n.
+ */
+double explength(Uint m, Uint n, double H, double K) {
+  return log((double) m * (double) n * K)/H;
+}
+
+/* subject length n reduced by explength(m, n, H, K), but at least 1/K */
+double effSubjectLength(Uint m, Uint n, double H, double K) {
+  double minlen = 1/K;
+  double effl = (double)n - explength(m, n, H, K);
+
+  if (effl < minlen) {
+    return minlen;
+  }
+  return effl;
+}
+/* query length m reduced by explength(m, n, H, K), but at least 1/K */
+double effQueryLength(Uint m, Uint n, double H, double K) {
+  double minlen = 1/K;
+  double effl = (double)m - explength(m, n, H, K);
+
+  if (effl < minlen) {
+    return minlen;
+  }
+  return effl;
+}
+
+/* product of the effective subject and the effective query length */
+double spacemult(Uint m, Uint n, double H, double K) {
+  double subjectlen = effSubjectLength(m,n,H,K);
+  double querylen = effQueryLength(m,n,H,K);
+
+  return (double) subjectlen * querylen;
+}
+
+
diff --git a/segemehl/libs/karlin.h b/segemehl/libs/karlin.h
new file mode 100644
index 0000000..7cbb9ff
--- /dev/null
+++ b/segemehl/libs/karlin.h
@@ -0,0 +1,36 @@
+#ifndef KARLIN_H
+#define KARLIN_H
+/*
+ *
+ * karlin.h
+ * declaration for karlin.c
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 05/06/2008 08:55:40 PM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: karlin.h 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/karlin.h $
+ */
+
+
+/* Uint is used by the prototypes below; keep this header self-contained */
+#include "basic-types.h"
+
+/* container for the statistical parameters lambda, H and K */
+typedef struct karlin_s {
+  double lambda;
+  double H;
+  double K;
+} karlin_t;
+
+
+/* computes lambda, H and K for unit costs; returns 0 on success */
+int karlinunitcostpp(void *space, double *lambda, double*H, double *K);
+/* exp(-E) with E = K * multiplier * exp(-lambda * score) */
+double significance (double lambda,double K,double multiplier, int score);
+/* E = K * multiplier * exp(-lambda * score) */
+double evalue (double lambda,double K,double multiplier, int score);
+/* ln(m * n * K) / H */
+double explength(Uint m, Uint n, double H, double K);
+/* n - explength(m, n, H, K), but at least 1/K */
+double effSubjectLength(Uint m, Uint n, double H, double K);
+/* m - explength(m, n, H, K), but at least 1/K */
+double effQueryLength(Uint m, Uint n, double H, double K);
+/* effSubjectLength * effQueryLength */
+double spacemult(Uint m, Uint n, double H, double K);
+#endif
diff --git a/segemehl/libs/kdchain.c b/segemehl/libs/kdchain.c
new file mode 100644
index 0000000..85e3fb4
--- /dev/null
+++ b/segemehl/libs/kdchain.c
@@ -0,0 +1,1379 @@
+
+/*
+ * kdchain.c
+ * implementation of kdchain
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 04/29/2008 07:01:30 PM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 85 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-11-18 15:34:44 +0100 (Tue, 18 Nov 2008) $
+ *
+ * Id: $Id: kdchain.c 85 2008-11-18 14:34:44Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/kdchain.c $
+ *
+ */
+
+#include "manout.h"
+#include "kdchain.h"
+#include "mathematics.h"
+#include "sufarray.h"
+#include "container.h"
+#include "kdseed.h"
+#include "debug.h"
+#include "karlin.h"
+#include "bitvectoralg.h"
+#include "iupac.h"
+#include <assert.h>
+#include <limits.h>
+#include <unistd.h>
+#include <float.h>
+#include <math.h>
+
+
+/*---------------------------- minDistFragmentHits ----------------------------
+ *
+ * @brief minimum distance of fragment hits: the smallest absolute
+ *        difference between a suffix position in the interval of u and a
+ *        suffix position in the interval of v that are located on the
+ *        same sequence. Returns UINT_MAX if no such pair exists.
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+minDistFragmentHits (Suffixarray *arr, branchfragment_t *u, branchfragment_t *v)
+{
+
+  Uint i, j, d, posi, posj, d1idx, d2idx;
+  Uint mindist = UINT_MAX;
+
+  for(i=u->branch->l; i <= u->branch->r; i++) {
+    /* position and sequence of the hit of u do not depend on j */
+    posi = arr->suftab[i];
+    d1idx = getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[posi]);
+
+    for(j=v->branch->l; j <= v->branch->r; j++) {
+      posj = arr->suftab[j];
+      d2idx = getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[posj]);
+
+      if(d1idx != d2idx) {
+        continue;
+      }
+
+      d = (posi > posj) ? posi - posj : posj - posi;
+      if(d < mindist) {
+        mindist = d;
+      }
+    }
+  }
+
+  /* return the minimum, not the distance that was computed last */
+  return mindist;
+}
+
+
+/*-------------------------------- wrapChains --------------------------------
+ *
+ * @brief release the fragment pointer array of each of the noofchains
+ *        chains; the chain array itself is not released
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+wrapChains(void *space, branchChain_t *chains, Uint noofchains) {
+  Uint c = 0;
+
+  while(c < noofchains) {
+    FREEMEMORY(space, chains[c].f);
+    c++;
+  }
+}
+
+
+
+/*---------------------------------- chain -----------------------------------
+ *
+ * @brief append fragment f to the chain: the chain score is updated with
+ *        chainscore, the chain end is moved to the end of f and the
+ *        fragment pointer array grows by one element
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+chain(branchChain_t *chain, branchfragment_t *f) {
+  /* the score has to be computed before the chain end is moved */
+  chain->score = chainscore(chain, f);
+  chain->end = f->end;
+
+  chain->f = ALLOCMEMORY(space, chain->f, branchfragment_t*, chain->nooffragments + 1);
+  chain->f[chain->nooffragments] = f;
+  chain->nooffragments++;
+}
+
+
+/*------------------------------- fragmentovl --------------------------------
+ *
+ * @brief fragment overlap: f1->end - f2->start if f2 starts within f1,
+ *        0 otherwise
+ * @author Steve Hoffmann
+ *                      f1->end
+ *  f1 -----------------|
+ *              |--------------
+ *              f2->start
+ */
+
+int
+fragmentovl (branchfragment_t *f1, branchfragment_t *f2)
+{
+  /* f2 does not start within f1 */
+  if (f1->end < f2->start || f1->start > f2->start) {
+    return 0;
+  }
+
+  return f1->end - f2->start;
+}
+
+/*--------------------------------- chainovl ---------------------------------
+ *
+ * @brief overlap of a fragment with chain on the query: chain end minus
+ *        fragment start plus one; negative if there is a gap
+ * @author Steve Hoffmann
+ *
+ */
+
+
+Lint
+chainovl(branchChain_t *chain, branchfragment_t *f) {
+  Lint chainend = (Lint)chain->end;
+  Lint fragstart = (Lint)f->start;
+
+  return (chainend - fragstart)+1;
+}
+
+
+/*-------------------------------- chainscore --------------------------------
+ *
+ * @brief get score of a chain when a fragment is added to it: sum of
+ *        both scores minus the overlap; gaps (negative overlaps) are
+ *        not penalized
+ * @author Steve Hoffmann
+ *
+ */
+
+
+int
+chainscore (branchChain_t *chain, branchfragment_t *f) {
+  int score;
+  Lint ovl = chainovl(chain, f);
+
+  if(ovl < 0) {
+    ovl = 0;
+  }
+
+  score = (chain->score + f->score) - ovl;
+  return score;
+}
+
+
+
+/*------------------------------- chainscore2 --------------------------------
+ *
+ * @brief get score of chain with two fragments chained: in contrast to
+ *        chainscore the absolute value of the overlap is subtracted,
+ *        i.e. gaps are penalized as well
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+chainscore2 (branchChain_t *chain, branchfragment_t *f1, branchfragment_t *f2)
+{
+  int score;
+  Lint ovl = chainovl(chain, f1);
+  Lint ovl2 = (Lint)f1->end - (Lint)f2->start + 1;
+
+  if(ovl < 0) {
+    ovl = -ovl;
+  }
+  if(ovl2 < 0) {
+    ovl2 = -ovl2;
+  }
+
+  /* chain + first fragment */
+  score = (chain->score + f1->score) - ovl;
+  /* + second fragment */
+  score += f2->score - ovl2;
+
+  return score;
+}
+
+/*----------------------------- cmp_chainscores ------------------------------
+ *
+ * @brief compare the scores of two chains (qsort); chains with higher
+ *        scores are sorted first
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+cmp_chainscores (const void *a, const void *b) {
+  branchChain_t *first = (branchChain_t *) a;
+  branchChain_t *second = (branchChain_t *) b;
+
+  if(first->score < second->score) {
+    return 1;
+  }
+
+  return (first->score == second->score) ? 0 : -1;
+}
+
+
+/*---------------------------- cmp_chainlocality -----------------------------
+ *
+ * @brief compare locality of chains: chains with fewer locus switches
+ *        (change of subidx or substart distance > 200000 between
+ *        consecutive fragments) come first, ties are broken by the
+ *        number of strand switches
+ * @author Steve Hoffmann
+ *
+ */
+
+/* count locus and strand switches between consecutive fragments of c */
+static void
+countChainSwitches (branchChain_t *c, Uint *locus, Uint *strand)
+{
+  Uint i;
+
+  *locus = 0;
+  *strand = 0;
+
+  for(i=1; i < c->nooffragments; i++) {
+    if(c->f[i]->subidx != c->f[i-1]->subidx
+        || dist_uint(c->f[i]->substart, c->f[i-1]->substart) > 200000) {
+      (*locus)++;
+    }
+    if(c->f[i]->strand != c->f[i-1]->strand) {
+      (*strand)++;
+    }
+  }
+}
+
+int
+cmp_chainlocality (const void *a, const void *b)
+{
+  Uint swtch1, swtch2, strandswtch1, strandswtch2;
+
+  countChainSwitches((branchChain_t *) a, &swtch1, &strandswtch1);
+  countChainSwitches((branchChain_t *) b, &swtch2, &strandswtch2);
+
+  if(swtch1 != swtch2) {
+    return (swtch1 > swtch2) ? 1 : -1;
+  }
+  if(strandswtch1 != strandswtch2) {
+    return (strandswtch1 > strandswtch2) ? 1 : -1;
+  }
+
+  return 0;
+}
+
+/*--------------------------- cmp_branchfragments ----------------------------
+ *
+ * @brief compare start positions of branch fragments (qsort on an array
+ *        of fragments), ascending
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+cmp_branchfragments (const void *a, const void *b) {
+  branchfragment_t *first = (branchfragment_t*) a;
+  branchfragment_t *second = (branchfragment_t*) b;
+
+  if(first->start < second->start) {
+    return -1;
+  }
+
+  return (first->start == second->start) ? 0 : 1;
+}
+
+/*--------------------------- cmp_branchfragmentsptr ----------------------------
+ *
+ * @brief compare start positions of branch fragments (qsort on an array
+ *        of fragment pointers), ascending
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+cmp_branchfragmentsptr (const void *a, const void *b) {
+  branchfragment_t *first = *(branchfragment_t**) a;
+  branchfragment_t *second = *(branchfragment_t**) b;
+
+  if(first->start < second->start) {
+    return -1;
+  }
+
+  return (first->start == second->start) ? 0 : 1;
+}
+
+/*--------------------------- cmp_branchfragmentssub ----------------------------
+ *
+ * @brief compare substart positions of branch fragments (qsort on an
+ *        array of fragment pointers), ascending
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+cmp_branchfragmentssub (const void *a, const void *b) {
+  branchfragment_t *first = *(branchfragment_t**) a;
+  branchfragment_t *second = *(branchfragment_t**) b;
+
+  if(first->substart < second->substart) {
+    return -1;
+  }
+
+  return (first->substart == second->substart) ? 0 : 1;
+}
+
+/*-------------------------------- initChains --------------------------------
+ *
+ * @brief initialize the first k chains: no fragments, score 0 and
+ *        start = end = 0; returns the chain array
+ * @author Steve Hoffmann
+ *
+ */
+
+branchChain_t*
+initChains (branchChain_t *chains, Uint k)
+{
+  branchChain_t *cur;
+  Uint i;
+
+  for(i=0, cur=chains; i < k; i++, cur++) {
+    cur->nooffragments = 0;
+    cur->f = NULL;
+    cur->score = 0;
+    cur->start = 0;
+    cur->end = 0;
+  }
+
+  return chains;
+}
+
+
+
+
+/*------------------------------ condenseChain -------------------------------
+ *
+ * @brief merge fragments of the chain if they are too close on the reference
+ * practically [u.v][x,y] -> [u,y];
+ * if(u<x && u<y)
+ * if(x<v || x-v < 50)
+ * merge
+ * else
+ * dont merge
+ *
+ * Works in three steps: (1) for every chain select, for each possible
+ * locus of the first fragment, the closest locus of each following
+ * fragment and keep the start with the smallest summed distance,
+ * (2) merge fragments of a chain that are close on read and reference,
+ * (3) build new chains without fragments contained in later fragments.
+ * Fragments of the input chains are modified in place (start, end,
+ * substart, subidx). Returns a newly allocated array of noofchains chains.
+ *
+ * NOTE(review): chains[k].f[0] is read and nooffragments-1 is used as a
+ * loop bound, so every chain presumably needs at least one fragment --
+ * confirm against the callers.
+ * @author Steve Hoffmann
+ *
+ */
+
+ branchChain_t*
+condenseChain (branchChain_t * chains, Uint noofchains, MultiCharSeq *seq,
+ Suffixarray *arr)
+{
+ Uint i, j, u, v, x, y, k, h, w, len1, len2, strand1, strand2,
+ chr1, chr2, nochain, d1idx=0, d2idx=0, d3idx=0, l, r, ll, rr, p=0, q=0;
+ double mindist = DBL_MAX, d1=0, d2=0, di, dj;
+ branchChain_t *newchains = NULL;
+ Uint sub_start, sub_end, subidx, ***bd, *beststart=NULL, *cnt=NULL;
+
+ /*
+ * bd[k][i][j]: suffix array position selected for fragment j of chain k
+ * if the first fragment is placed at its i-th locus
+ * cnt[k]: number of loci of the first fragment of chain k
+ * beststart[k]: index i with the smallest summed distance
+ */
+ bd = ALLOCMEMORY(space, NULL, Uint**, noofchains);
+ cnt = ALLOCMEMORY(space, NULL, Uint, noofchains);
+ beststart = ALLOCMEMORY(space, NULL, Uint, noofchains);
+
+
+
+ for(k=0; k < noofchains; k++){
+
+ l = chains[k].f[0]->branch->l;
+ r = chains[k].f[0]->branch->r;
+ mindist = DBL_MAX;
+
+ bd[k] = ALLOCMEMORY(space, NULL, Uint*, r-l+1);
+ cnt[k] = r-l+1;
+
+ /******
+ * first step: minimize distance of fragment hits within chain
+ * for all possible start loci in [l,r] select
+ * a chain of closest loci
+ ******/
+ for(i=0, p=l; p <= r; p++, i++) {
+ /* zero initialized: 0 marks a fragment without selected locus */
+ bd[k][i] = calloc(chains[k].nooffragments, sizeof(Uint));
+ bd[k][i][0] = p;
+
+ for(di=0, j=1; j < chains[k].nooffragments; j++) {
+ ll = chains[k].f[j]->branch->l;
+ rr = chains[k].f[j]->branch->r;
+
+ for(dj=0, q=ll; q <= rr; q++) {
+ d1idx = getMultiCharSeqIndex(seq,
+ &seq->sequences[arr->suftab[q]]);
+ d2idx = getMultiCharSeqIndex(seq,
+ &seq->sequences[arr->suftab[bd[k][i][j-1]]]);
+
+ /* d1: distance of candidate q to the locus of the previous fragment */
+ if(d1idx != d2idx) {
+ d1 = UINT_MAX;
+ } else {
+ d1 = llabs((LLint) arr->suftab[bd[k][i][j-1]] - arr->suftab[q]);
+ }
+
+ /* d2: distance of the currently selected locus, if there is one */
+ if(bd[k][i][j]) {
+ d3idx = getMultiCharSeqIndex(seq,
+ &seq->sequences[arr->suftab[bd[k][i][j]]]);
+ d2 = llabs((LLint) arr->suftab[bd[k][i][j-1]] - arr->suftab[bd[k][i][j]]);
+ }
+
+ /* NOTE(review): d3idx keeps its previous value while bd[k][i][j] is 0 */
+ if(d3idx != d2idx) {
+ d2 = UINT_MAX;
+ }
+
+ if(!bd[k][i][j] || d1 < d2) {
+ bd[k][i][j] = q;
+ dj = d1;
+ }
+ }
+ di += dj;
+ }
+
+ if(di < mindist) {
+ beststart[k] = i;
+ mindist = di;
+ }
+ }
+ }
+
+ /*decode substarts to real coordinates*/
+ for(i=0; i < noofchains; i++) {
+ for(j=0; j < chains[i].nooffragments; j++) {
+ chains[i].f[j]->substart = arr->suftab[bd[i][beststart[i]][j]];
+ }
+ }
+
+ /******
+ *second step: merge fragments that are close
+ ******/
+
+ for(i=0; i < noofchains; i++) {
+ for(j=0; j < chains[i].nooffragments-1; j++) {
+ for(k=j+1; k < chains[i].nooffragments; k++) {
+ // for(k=j+1; k < j+2; k++) {
+
+ /* fragment j: read end h, reference interval [u,v] */
+ len1 = chains[i].f[j]->end - chains[i].f[j]->start;
+ strand1 = chains[i].f[j]->strand;
+ h = chains[i].f[j]->end;
+ u = arr->suftab[bd[i][beststart[i]][j]];
+ v = u + len1;
+ chr1 = getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[u]);
+
+ /* fragment k: read start w, reference interval [x,y] */
+ len2 = chains[i].f[k]->end - chains[i].f[k]->start;
+ strand2 = chains[i].f[k]->strand;
+ w = chains[i].f[k]->start;
+ x = arr->suftab[bd[i][beststart[i]][k]];
+ y = x + len2;
+ chr2 = getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[x]);
+
+ // read: h)--(w
+ // reference: [u,v] [x,y]
+ // if the distance of h and w equals the distance of v and x we assume
+ // that the fragments need to be merged
+ if(u < x && v < y && chr1 == chr2 && strand1 == strand2 && strand1 == 0) {
+ if(x < v || (x-v <= 20 && w-h <= 20) || (w >= h && x-v < w-h+20 && x-v+20 > w-h)) {
+ //merge
+#ifdef DEBUGTRANSALIGN
+ fprintf(stdout, "merging fragment %d with %d. [%d,%d;%d,%d] with [%d,%d;%d,%d]. u=%u : %u=y, u-y=%u \t h=%u : %u=w, w-h=%u\n, pass:%d,%d\n",
+ j, k,
+ chains[i].f[j]->start, chains[i].f[j]->end,
+ u, v,
+ chains[i].f[k]->start, chains[i].f[k]->end,
+ x, y,
+ u, y,
+ u-y,
+ h, w,
+ w-h,
+ chains[i].f[j]->pass,
+ chains[i].f[k]->pass
+ );
+#endif
+ /* both fragments get the union of their read intervals and the same locus */
+ chains[i].f[j]->start = MIN(chains[i].f[j]->start, chains[i].f[k]->start);
+ chains[i].f[j]->end = MAX(chains[i].f[j]->end, chains[i].f[k]->end);
+ chains[i].f[k]->start = chains[i].f[j]->start;
+ chains[i].f[k]->end = chains[i].f[j]->end;
+ //chains[i].f[k]->pass = chains[i].f[j]->pass;
+ bd[i][beststart[i]][j] = (u < x) ? bd[i][beststart[i]][j] : bd[i][beststart[i]][k];
+ bd[i][beststart[i]][k] = bd[i][beststart[i]][j];
+ chains[i].f[j]->substart = arr->suftab[bd[i][beststart[i]][j]];
+ chains[i].f[k]->substart = arr->suftab[bd[i][beststart[i]][j]];
+
+ } else {
+ //dont merge
+ }
+ }
+ // h)--(w
+ //[x,y] [u,v]
+ if(x < u && y < v && chr1 == chr2 && strand1 == strand2 && strand1 == 1) {
+ //if(u < y || u-y < ((w >= h) ? w-h+20 : 20)) {
+ if(u < y || (u-y <= 20 && w-h <= 20) || (w >= h && u-y < w-h+20 && u-y+20 > w-h)) {
+ //merge
+#ifdef DEBUGTRANSALIGN
+ fprintf(stdout, "merging fragment %d with %d. [%d,%d;%d,%d] with [%d,%d;%d,%d]. u=%u : %u=y, u-y=%u \t h=%u : %u=w, w-h=%u, pass:%d,%d\n",
+ j, k,
+ chains[i].f[j]->start, chains[i].f[j]->end,
+ u, v,
+ chains[i].f[k]->start, chains[i].f[k]->end,
+ x, y,
+ u, y,
+ u-y,
+ h, w,
+ w-h,
+ chains[i].f[j]->pass,
+ chains[i].f[k]->pass
+ );
+#endif
+ /* both fragments get the union of their read intervals and the same locus */
+ chains[i].f[j]->start = MIN(chains[i].f[j]->start, chains[i].f[k]->start);
+ chains[i].f[j]->end = MAX(chains[i].f[j]->end, chains[i].f[k]->end);
+ chains[i].f[k]->start = chains[i].f[j]->start;
+ chains[i].f[k]->end = chains[i].f[j]->end;
+ //chains[i].f[k]->pass = chains[i].f[j]->pass;
+ bd[i][beststart[i]][j] = (u < x) ? bd[i][beststart[i]][j] : bd[i][beststart[i]][k];
+ bd[i][beststart[i]][k] = bd[i][beststart[i]][j];
+ chains[i].f[j]->substart = arr->suftab[bd[i][beststart[i]][j]];
+ chains[i].f[k]->substart = arr->suftab[bd[i][beststart[i]][j]];
+
+ } else {
+ //dont merge
+#ifdef DEBUGTRANSALIGN
+ fprintf(stdout, "\t not merging fragment %d with %d. [%d,%d;%d,%d] with [%d,%d;%d,%d]. u=%u : %u=y, u-y=%u \t h=%u : %u=w, h-w=%u, pass:%d,%d\n",
+ j, k,
+ chains[i].f[j]->start, chains[i].f[j]->end,
+ u, v,
+ chains[i].f[k]->start, chains[i].f[k]->end,
+ x, y,
+ u, y,
+ u-y,
+ h, w,
+ h-w,
+ chains[i].f[j]->pass,
+ chains[i].f[k]->pass
+ );
+#endif
+ }
+ }
+
+ }
+ }
+ }
+
+ /******
+ * third step: build the new chains from the fragments that are not
+ * contained in a later fragment of the same chain
+ ******/
+ newchains = ALLOCMEMORY(space, NULL, branchChain_t, noofchains);
+ initChains (newchains, noofchains);
+
+ for(i=0; i < noofchains; i++) {
+ newchains[i].nooffragments = 0;
+ newchains[i].f = NULL;
+ newchains[i].score = chains[i].score;
+ newchains[i].start = chains[i].start;
+ newchains[i].end = chains[i].end;
+
+ for(j=0; j < chains[i].nooffragments; j++) {
+ nochain = 0;
+ len1 = chains[i].f[j]->end - chains[i].f[j]->start;
+ /* skip fragment j if a later fragment covers it on read and reference */
+ for (k=j+1; k < chains[i].nooffragments; k++) {
+ len2 = chains[i].f[k]->end - chains[i].f[k]->start;
+ if(chains[i].f[j]->start >= chains[i].f[k]->start &&
+ chains[i].f[j]->end <= chains[i].f[k]->end &&
+ chains[i].f[j]->strand == chains[i].f[k]->strand &&
+ chains[i].f[j]->substart >= chains[i].f[k]->substart &&
+ chains[i].f[j]->substart+len1 <= chains[i].f[k]->substart+len2) {
+ nochain = 1;
+ }
+ }
+
+ if(nochain) {
+ } else {
+ subidx = getMultiCharSeqIndex(seq, &seq->sequences[chains[i].f[j]->substart]);
+ getMultiCharSeqIdxBounds(seq, subidx, &sub_start, &sub_end);
+#ifdef DEBUGTRANSALIGN
+ fprintf(stdout, "adding element [%d,%d] to newchain -> %d (%u) (pass:%d)\n", chains[i].f[j]->start, chains[i].f[j]->end, chains[i].f[j]->substart-sub_start, chains[i].f[j]->substart, chains[i].f[j]->pass);
+#endif
+ chains[i].f[j]->subidx = subidx;
+ chain(&newchains[i], chains[i].f[j]);
+ }
+ }
+ }
+
+ /* NOTE(review): bd[k][i] was obtained from calloc but is released with FREEMEMORY */
+ for(k=0; k < noofchains; k++) {
+ l = cnt[k];
+ for (i=0; i < l; i++) {
+ FREEMEMORY(space, bd[k][i]);
+ }
+ FREEMEMORY(space, bd[k]);
+ }
+
+ FREEMEMORY(space, bd);
+ FREEMEMORY(space, cnt);
+ FREEMEMORY(space, beststart);
+
+ return newchains;
+ }
+
+
+/*----------------------------- appendFragments ------------------------------
+ *
+ * @brief append the n fragments of list b to list a (m fragments); a is
+ *        resized to m+n elements and the resized list is returned
+ * @author Steve Hoffmann
+ *
+ */
+
+ branchfragment_t *
+appendFragments (branchfragment_t *a, Uint m, branchfragment_t *b, Uint n)
+{
+  branchfragment_t *dst, *src;
+  Uint i;
+
+  a = ALLOCMEMORY(space, a, branchfragment_t, m+n);
+
+  for(i=0; i < n; i++) {
+    dst = &a[m+i];
+    src = &b[i];
+
+    /* position on the read and on the reference */
+    dst->start = src->start;
+    dst->end = src->end;
+    dst->substart = src->substart;
+    dst->subidx = src->subidx;
+    dst->strand = src->strand;
+
+    /* origin of the fragment */
+    dst->branchno = src->branchno;
+    dst->branch = src->branch;
+    dst->x = src->x;
+
+    /* scores and bookkeeping */
+    dst->score = src->score;
+    dst->evalue = src->evalue;
+    dst->pass = src->pass;
+  }
+
+  return a;
+}
+
+/*----------------------------- filterFragments ------------------------------
+ *
+ * @brief filter the fragments with respect to entropy, Evalue and maxocc:
+ * a branch becomes a fragment if its E-value is below maxevalue, the
+ * entropy of the covered read substring exceeds 1.5 and its suffix
+ * array interval [l,r] satisfies l <= r and r-l < 50.
+ * Returns the newly allocated fragment list (NULL if no branch passes)
+ * and stores its length in *nooffrags.
+ * @author Steve Hoffmann
+ *
+ */
+
+branchfragment_t *
+filterFragments (void *space, Suffixarray *arr, matchstem_t **stems, char** seqs,
+ Uint len, karlin_t* stats, double maxevalue, Uint* nooffrags)
+{
+
+ Uint i, u, start, end, substart, k=0, l, r, s, x, maxocc = 50;
+ double minentropy = 1.5, E, H;
+ branch_t *branch;
+ branchfragment_t *f = NULL;
+
+ for (i = 0; i < len; i++) {
+ /* u selects the stem set and is stored as the strand of the fragment */
+ for (u = 0; u < 2; u++) {
+ /* stems of the second set are indexed from the end */
+ x = (u == 0) ? i : len-1-i;
+
+ for (s = 0; s < stems[u][x].noofbranches; s++) {
+ branch = &stems[u][x].branches[s];
+ // for (v=branch->l; v <= branch->r; v++) {
+ l = branch->l;
+ r = branch->r;
+
+ if (u == 0) {
+ /* fragment starts at i and covers branch->mat positions */
+ start = i;
+ //CHANGED: end position one position too far
+ //before: end = i + branch->mat;
+ end = i + branch->mat - 1;
+ substart = arr->suftab[l]; //l to v
+ } else {
+ /* fragment ends at i and covers branch->mat positions */
+ start = i - branch->mat + 1;
+ end = i;
+ substart = arr->suftab[l] - branch->mat + 1; //l to v
+ }
+
+ E = kd_getBranchEvalue(stems[u], x, s, len, arr->numofsuffixes, stats);
+ /* entropy is always computed on the first sequence seqs[0] */
+ H = minshannonentropy(&seqs[0][start], end-start+1);
+
+ /* NOTE(review): arr->suftab[l] is read before l <= r is checked below */
+ Uint sub_idx, sub_start =0, sub_end=0;
+ sub_idx = getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[arr->suftab[l]]);
+ getMultiCharSeqIdxBounds(arr->seq, sub_idx, &sub_start, &sub_end);
+
+#ifdef DEBUGTRANSALIGN
+ fprintf(stdout, "%d-[%d,%d] -> chr:%d-%d\tx:%d (strand:%d)\t", k, start, end,
+ getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[arr->suftab[l]]),arr->suftab[l]-sub_start
+ , x, u);
+ fprintf(stdout, "Evalue: %f (max: %f), x:%u, s:%u, len:%u, H: %f (min %f), occ:%d, scr:%d\n", E, maxevalue, x, s, len, H, minentropy, r-l, kdscore(branch));
+#endif
+ if(E < maxevalue && H > minentropy && l <= r && r - l < maxocc) {
+
+#ifdef DEBUGTRANSALIGN
+ fprintf(stdout, "adding %d [%d,%d]\n", k, start, end);
+#endif
+ /* grow the list by one element and fill the new fragment */
+ k++;
+ f = ALLOCMEMORY(space, f, branchfragment_t, k);
+ f[k-1].start = start ;
+ f[k-1].end = end;
+ f[k-1].substart = substart;
+ f[k-1].subidx = sub_idx;
+ f[k-1].strand = (unsigned char) u;
+ f[k-1].branchno = s;
+ f[k-1].branch = branch;
+ f[k-1].score = kdscore(branch);
+ f[k-1].x = x;
+ f[k-1].evalue = E;
+ f[k-1].pass = 0;
+ }
+ }
+ }
+ }
+ *nooffrags = k;
+ return f;
+ }
+
+
+/*-------------------------- removeOverlapFragments --------------------------
+ *
+ * @brief remove fragments that are included in (or include) an already
+ *        accepted fragment of the same strand, keeping the better scoring one
+ * @author Steve Hoffmann
+ *
+ * Every input fragment is compared with the fragments accepted so far. If
+ * one interval [start,end] contains the other and the strands agree, the
+ * accepted fragment is replaced when the new one has a strictly higher
+ * kdscore; otherwise the new fragment is dropped. Fragments without such a
+ * partner are appended. The partial overlap tests only produce debug output
+ * (their break statements are disabled), so maxovl has no effect on the
+ * result.
+ *
+ * @param f            input fragments (only read)
+ * @param nooffrags    number of entries in f
+ * @param maxovl       overlap threshold, currently only used for debug output
+ * @param newnooffrags receives the number of returned fragments
+ * @return array of surviving fragments grown with ALLOCMEMORY (NULL if none)
+ *
+ * NOTE(review): `space` is not a parameter of this function; this relies on
+ * ALLOCMEMORY not evaluating its first argument (or on a visible global).
+ */
+
+branchfragment_t*
+removeOverlapFragments (branchfragment_t *f, Uint nooffrags, Uint maxovl, Uint *newnooffrags)
+{
+
+  Uint i, j, k=0;
+  branchfragment_t *g = NULL;
+
+  for(i=0; i < nooffrags; i++) {
+    /* search an accepted fragment g[j] that is in an inclusion relation */
+    for(j=0; j < k; j++){
+
+      Uint x1 = g[j].start;
+      Uint y1 = g[j].end;
+      Uint x2 = f[i].start;
+      Uint y2 = f[i].end;
+      //Uint u1 = f[i].substart;
+      //Uint u2 = g[i].substart;
+      //to remove: set s1 = s2;
+      Uint s1 = g[j].strand;
+      Uint s2 = f[i].strand;
+
+#ifdef DEBUGTRANSALIGN
+      fprintf(stdout, "[%d,%d] vs. [%d,%d]\n", x1, y1, x2, y2);
+#endif
+      //inclusion
+      if(x1 <= x2 && y2 <= y1 && s1 == s2 ) //&& dist_uint(u1,u2) < 200000)
+      {
+#ifdef DEBUGTRANSALIGN
+        fprintf(stdout, "inclusion consumption (1)\n");
+#endif
+        break;
+      }
+      if(x2 <= x1 && y1 <= y2 && s1 == s2 ) //&& dist_uint(u1,u2) < 200000)
+      {
+#ifdef DEBUGTRANSALIGN
+        fprintf(stdout, "inclusion consumption (2)\n");
+#endif
+        break;
+      }
+      //overlap
+      if(y1 > x2 && y2 > y1 && y1-x2 >= maxovl){
+#ifdef DEBUGTRANSALIGN
+        fprintf(stdout, "overlap consumption (1)\n");
+#endif
+        //break;
+      }
+      if(y2 > x1 && y1 > y2 && y2-x1 >= maxovl) {
+#ifdef DEBUGTRANSALIGN
+        fprintf(stdout, "overlap consumption (2)\n");
+#endif
+        //break;
+      }
+    }
+
+    if(j < k) {
+      /* inclusion partner found: keep the better scoring fragment */
+      if (kdscore(f[i].branch) > g[j].score) {
+#ifdef DEBUGTRANSALIGN
+        /* the replaced record is g[j]; g[i] may lie beyond the k entries */
+        fprintf(stdout, "replacing %d [%d,%d] by [%d,%d]\n", k, g[j].start, g[j].end, f[i].start, f[i].end);
+#endif
+        /* take over the complete record: substart, subidx and pass belong
+         * together and must not be mixed between two fragments */
+        g[j] = f[i];
+        g[j].score = kdscore(f[i].branch);
+      }
+    } else {
+
+#ifdef DEBUGTRANSALIGN
+      fprintf(stdout, "adding %d [%d,%d]\n", k, f[i].start, f[i].end);
+#endif
+      k++;
+      g = ALLOCMEMORY(space, g, branchfragment_t, k);
+      g[k-1] = f[i];
+      g[k-1].score = kdscore(f[i].branch);
+    }
+  }
+
+  *newnooffrags = k;
+  return g;
+}
+
+
+/*---------------------------- wrapFixinFragments ----------------------------
+ *
+ * @brief release the fragments created by fixinfragments
+ * @author Steve Hoffmann
+ *
+ * Frees the branch copy owned by each fragment and the fragment itself.
+ * The pointer array f is not released here.
+ */
+
+void
+wrapFixinFragments (void *space, branchfragment_t **f, Uint nooffragments)
+{
+  Uint idx = 0;
+
+  while(idx < nooffragments) {
+    FREEMEMORY(NULL, f[idx]->branch);
+    FREEMEMORY(NULL, f[idx]);
+    idx++;
+  }
+
+  return ;
+}
+
+/*----------------------------- fixinfragments------------------------------
+ *
+ * @brief fixin fragments to chains
+ * @author Steve Hoffmann
+ *
+ * For every chain whose fragments all lie on the same reference sequence
+ * and strand and pass the substart order test below, the part of seqs[0]
+ * in front of the chain and the part behind it are scanned with a sliding
+ * window of 10 characters. Each window is matched with kd_match; hits that
+ * lie on the reference sequence of the anchor fragment, closer than 200000
+ * to its substart and on the expected side become new fragments (score 10,
+ * evalue 1, pass 1). The hits are sorted with cmp_branchfragmentssub and
+ * the longest run of neighbouring hits (substart distance < 20, query
+ * starts in order) is added to the chain, which is then re-sorted with
+ * cmp_branchfragmentsptr.
+ *
+ * @param space         handed to the allocation macros and helpers
+ * @param chains        chains to extend (modified in place)
+ * @param noofchains    number of chains
+ * @param arr           suffix array that is searched
+ * @param seqs          seqs[0] is the sequence the windows are taken from
+ * @param len           upper bound for the window positions in seqs[0]
+ * @param nooffragments receives the number of created fragments
+ * @return array with pointers to all created fragments, including those
+ *         that were not added to a chain (NULL if none were created)
+ *
+ */
+
+branchfragment_t**
+fixinfragments (void *space, branchChain_t *chains, Uint noofchains, Suffixarray *arr, char **seqs, Uint len,
+ Uint *nooffragments)
+{
+
+ Uint u, v, i, j, t, r, k=0, p, q, n=0, trans=0, maxwidth = 5000;
+ char* fixinseq, s;
+ branchfragment_t **g = NULL;
+ matchstem_t *b = NULL;
+
+ for(i=0; i < noofchains; i++) {
+
+#ifdef FIXINSMALLDEBUG
+ fprintf(stdout, "chain %d: %d-%d (%d)\n", i, chains[i].start,
+ chains[i].end, chains[i].score);
+#endif
+ /* chains that switch reference sequence, strand or substart order
+  * between neighbouring fragments are left untouched */
+ trans =0;
+ for(j=1; j < chains[i].nooffragments; j++) {
+ if(chains[i].f[j-1]->subidx != chains[i].f[j]->subidx) trans=1;
+ if(chains[i].f[j-1]->strand != chains[i].f[j]->strand) trans=1;
+ if(chains[i].f[j-1]->strand == 1 && chains[i].f[j-1]->substart > chains[i].f[j]->substart) trans=1;
+ if(chains[i].f[j-1]->strand == 0 && chains[i].f[j-1]->substart < chains[i].f[j]->substart) trans=1;
+ }
+
+ if(trans) continue;
+
+ /* u: first position of the chain, v: first position behind the chain */
+ u = chains[i].start;
+ v = chains[i].end+1;
+
+ /* ---- part 1: windows in front of the chain, anchored at f[0] ---- */
+ if(u >= 10) {
+ k=0;
+
+ fixinseq = ALLOCMEMORY(space, NULL, char, 11);
+ fixinseq[10] =0;
+ s = chains[i].f[0]->strand;
+ t = chains[i].f[0]->substart;
+ r = getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[t]);
+
+ for(j=0; j <= u-10; j++) {
+ memmove(fixinseq, &seqs[0][j], 10);
+
+#ifdef FIXINSMALLDEBUG
+ fprintf(stdout, ">>> fixin start %d-%d: %s\n", j, j+10-1, fixinseq);
+#endif
+ /* strand != 0: search with the buffer returned by charDNAcomplement;
+  * that buffer replaces fixinseq and is reused in the next iteration */
+ if(s) {
+ char * rc = charDNAcomplement(space, fixinseq, 10);
+ FREEMEMORY(space, fixinseq);
+ fixinseq = rc;
+
+#ifdef FIXINSMALLDEBUG
+ fprintf(stdout, "using reverse complement %d-%d: %s\n", j, j+10-1, fixinseq);
+#endif
+ }
+
+ b = kd_match(space, arr, fixinseq, 10, 0, 0, 0, 0, 0, arr->numofsuffixes-1, 0, arr->numofsuffixes-1, 0, 0);
+ int width = b->branches[0].r - b->branches[0].l;
+
+#ifdef FIXINSMALLDEBUG
+ fprintf(stdout, "width: %d\n", width);
+#endif
+ /* skip windows with maxwidth or more occurrences */
+ if(width < maxwidth)
+ for(int wi=0; wi <= width; wi++) {
+
+ /* order hit and anchor so that the p > q test below depends on strand */
+ if(s) {
+ p = arr->suftab[b->branches[0].l+wi];
+ q = t;
+ } else {
+ p = t;
+ q = arr->suftab[b->branches[0].l+wi];
+ }
+
+ //simple check for distance to save computation of coords
+ if((dist_uint(t, arr->suftab[b->branches[0].l+wi]) < 200000 && p > q)
+#ifdef FIXINBACKSPLICE
+ || dist_uint(t, arr->suftab[b->branches[0].l+wi]) < 2000
+#endif
+ ) {
+ Uint refidx, refstart =0, refend = 0;
+ refidx = getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[arr->suftab[b->branches[0].l+wi]]);
+ getMultiCharSeqIdxBounds(arr->seq, refidx, &refstart, &refend);
+ /* only hits on the reference sequence of the anchor are accepted */
+ if(refidx == r) {
+
+#ifdef FIXINSMALLDEBUG
+ fprintf(stdout, "hit\n");
+ fprintf(stdout, "@chr:%d-%d (%d)\n",
+ getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[arr->suftab[b->branches[0].l+wi]]),
+ arr->suftab[b->branches[0].l+wi],
+ arr->suftab[b->branches[0].l+wi]-refstart);
+#endif
+ /* private branch copy narrowed to the single suffix of this hit */
+ branch_t *cpy = copyBranch(space, &b->branches[0]);
+ cpy->l += wi;
+ cpy->r = cpy->l;
+
+ k++;
+ g = ALLOCMEMORY(space, g, branchfragment_t*, n+k);
+ g[n+k-1] = ALLOCMEMORY(space, NULL, branchfragment_t, 1);
+
+ g[n+k-1]->start = j;
+ g[n+k-1]->end = j + 10 - 1;
+ g[n+k-1]->substart = arr->suftab[b->branches[0].l+wi];
+ g[n+k-1]->subidx = refidx;
+ g[n+k-1]->strand = (unsigned char)s;
+ g[n+k-1]->branchno = 0;
+ g[n+k-1]->branch = cpy;
+ g[n+k-1]->score = 10;
+ g[n+k-1]->x = 0;
+ g[n+k-1]->evalue = 1;
+ g[n+k-1]->pass =1;
+ }
+ }
+ }
+
+ FREEMEMORY(space, b->branches);
+ FREEMEMORY(space, b);
+ }
+
+ FREEMEMORY(space, fixinseq);
+
+ if(k > 0) {
+ /* g[n..n+k-1] holds the hits of this pass */
+ qsort(&g[n], k, sizeof(branchfragment_t*), cmp_branchfragmentssub);
+
+ /* [first,last] is the current run, [bestfirst,bestlast] the longest one */
+ Uint first = 0;
+ Uint last = 0;
+ Uint bestfirst = 0;
+ Uint bestlast = 0;
+
+ for(j=1; j < k; j++) {
+#ifdef FIXINSMALLDEBUG
+ fprintf(stdout, "sorted fragment %d: [%d,%d]->%d:%u\n", j,
+ g[n+j]->start, g[n+j]->end,
+ getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[g[n+j]->substart]),
+ g[n+j]->substart);
+#endif
+ if(s) {
+ p = g[n+j]->start;
+ q = g[n+last]->start;
+ } else {
+ p = g[n+last]->start;
+ q = g[n+j]->start;
+ }
+
+
+ /* extend the run while the hits stay close and keep their order */
+ if (dist_uint(g[n+last]->substart, g[n+j]->substart) < 20 && p < q) {
+ last = j;
+ } else {
+
+#ifdef FIXINSMALLDEBUG
+ fprintf(stdout, "condensing frags %d-[%d,%d] to %d-[%d,%d]\n", first,
+ g[n+first]->start, g[n+first]->end, last, g[n+last]->start, g[n+last]->end);
+#endif
+ if(bestlast - bestfirst < last - first) {
+ bestfirst = first;
+ bestlast = last;
+ }
+
+ first = j;
+ last = j;
+ }
+ }
+
+ /* the final run is not closed inside the loop */
+ if(bestlast - bestfirst < last - first) {
+ bestfirst = first;
+ bestlast = last;
+ }
+
+ /* append the best run to the chain; each fragment adds 10 to the score
+  * minus a penalty for overlapping its predecessor in the run */
+ Uint c=chains[i].nooffragments;
+ chains[i].f = ALLOCMEMORY(space, chains[i].f, branchfragment_t*, c+(bestlast-bestfirst+1));
+ Uint newstart = chains[i].start;
+ Uint newend = chains[i].end;
+ int score = 0;
+ for(j=bestfirst; j <= bestlast; j++) {
+ score += 10;
+ if(j > bestfirst) {
+ if(g[n+j-1]->start < g[n+j]->start)
+ {
+ if(g[n+j-1]->end > g[n+j]->start) score -= g[n+j-1]->end - g[n+j]->start +1;
+ } else {
+ /* NOTE(review): this subtracts the span of fragment j itself rather
+  * than its overlap with fragment j-1 - confirm this is intended */
+ if(g[n+j]->end > g[n+j-1]->start) score -= g[n+j]->end - g[n+j]->start;
+ }
+ }
+
+ chains[i].f[c] = g[n+j];
+ c++;
+ newstart = MIN(newstart, g[n+j]->start);
+ newend = MAX(newend, g[n+j]->end);
+
+#ifdef FIXINSMALLDEBUG
+ if(j>bestfirst) fprintf(stdout, "selecting fragment %d with score: %d, ovl:%d\n", j, score,
+ g[n+j-1]->end - g[n+j]->start);
+#endif
+ }
+
+
+ chains[i].nooffragments += (bestlast-bestfirst+1);
+ chains[i].score += score;
+ chains[i].end = MAX(newend, chains[i].end);
+ chains[i].start = MIN(newstart, chains[i].start);
+
+
+ qsort(chains[i].f, chains[i].nooffragments, sizeof(branchfragment_t*), cmp_branchfragmentsptr);
+ }
+
+ /* all hits of this pass stay in g, selected or not */
+ n += k;
+ }
+
+ /* ---- part 2: windows behind the chain, anchored at the last fragment ---- */
+ if(len > 10 && v <= len-10) {
+ k=0;
+ fixinseq = ALLOCMEMORY(space, NULL, char, 11);
+ fixinseq[10] =0;
+ s = chains[i].f[chains[i].nooffragments-1]->strand;
+ t = chains[i].f[chains[i].nooffragments-1]->substart;
+ r = getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[t]);
+
+
+ for(j=v; j <= len-10; j++) {
+ memmove(fixinseq, &seqs[0][j], 10);
+
+#ifdef FIXINSMALLDEBUG
+ fprintf(stdout, "<<< fixin end %d-%d: %s - target:%u ( strand:%d )\n", j, j+10-1, fixinseq, t, s);
+#endif
+ if(s) {
+ char * rc = charDNAcomplement(space, fixinseq, 10);
+ FREEMEMORY(space, fixinseq);
+ fixinseq = rc;
+
+#ifdef FIXINSMALLDEBUG
+ fprintf(stdout, "using reverse complement %d-%d: %s\n", j, j+10-1, fixinseq);
+#endif
+ }
+
+ b = kd_match(space, arr, fixinseq, 10, 0, 0, 0, 0, 0, arr->numofsuffixes-1, 0, arr->numofsuffixes-1, 0, 0);
+ int width = b->branches[0].r - b->branches[0].l;
+
+#ifdef FIXINSMALLDEBUG
+ fprintf(stdout, "width: %d\n", width);
+#endif
+ if(width < maxwidth)
+ for(int wi=0; wi <= width; wi++) {
+ //simple check for distance to save computation of coords
+
+ /* roles of p and q are swapped compared to part 1 */
+ if(s) {
+ q = arr->suftab[b->branches[0].l+wi];
+ p = t;
+ } else {
+ p = arr->suftab[b->branches[0].l+wi];
+ q = t;
+ }
+//number of chains changes drastically with this switch
+ if((dist_uint(t, arr->suftab[b->branches[0].l+wi]) < 200000 && p > q)
+#ifdef FIXINBACKSPLICE
+ || dist_uint(t, arr->suftab[b->branches[0].l+wi]) < 2000
+#endif
+ ) {
+// if(dist_uint(t, arr->suftab[b->branches[0].l+wi]) < 20000 && arr->suftab[b->branches[0].l+wi] > t) {
+ Uint refidx, refstart =0, refend = 0;
+ refidx = getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[arr->suftab[b->branches[0].l+wi]]);
+ getMultiCharSeqIdxBounds(arr->seq, refidx, &refstart, &refend);
+ if(refidx == r) {
+
+#ifdef FIXINSMALLDEBUG
+ fprintf(stdout, "hit: \n");
+ fprintf(stdout, "@chr:%d-%d (%d)\n",
+ getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[arr->suftab[b->branches[0].l+wi]]),
+ arr->suftab[b->branches[0].l+wi],
+ arr->suftab[b->branches[0].l+wi]-refstart);
+#endif
+ branch_t *cpy = copyBranch(space, &b->branches[0]);
+ cpy->l += wi;
+ cpy->r = cpy->l;
+
+ k++;
+ g = ALLOCMEMORY(space, g, branchfragment_t*, n+k);
+ g[n+k-1] = ALLOCMEMORY(space, NULL, branchfragment_t, 1);
+ g[n+k-1]->start = j;
+ g[n+k-1]->end = j + 10 - 1;
+ g[n+k-1]->substart = arr->suftab[b->branches[0].l+wi];
+ g[n+k-1]->subidx = refidx;
+ g[n+k-1]->strand = (unsigned char)s;
+ g[n+k-1]->branchno = 0;
+ g[n+k-1]->branch = cpy;
+ g[n+k-1]->score = 10;
+ g[n+k-1]->x = 0;
+ g[n+k-1]->evalue = 1;
+ g[n+k-1]->pass =1;
+ }
+ }
+ }
+
+ FREEMEMORY(space, b->branches);
+ FREEMEMORY(space, b);
+ }
+
+ FREEMEMORY(space, fixinseq);
+
+ if(k > 0) {
+ qsort(&g[n], k, sizeof(branchfragment_t*), cmp_branchfragmentssub);
+
+ /* same run selection as in part 1 */
+ Uint first = 0;
+ Uint last = 0;
+ Uint bestfirst = 0;
+ Uint bestlast = 0;
+
+ for(j=1; j < k; j++) {
+#ifdef FIXINSMALLDEBUG
+ fprintf(stdout, "[%d,%d]->%d:%u\n",
+ g[n+j]->start, g[n+j]->end,
+ getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[g[n+j]->substart]),
+ g[n+j]->substart);
+#endif
+ if(s) {
+ p = g[n+j]->start;
+ q = g[n+last]->start;
+ } else {
+ p = g[n+last]->start;
+ q = g[n+j]->start;
+ }
+
+
+ if(dist_uint(g[n+last]->substart, g[n+j]->substart) < 20 && p < q) {
+ last = j;
+ } else {
+#ifdef FIXINSMALLDEBUG
+ fprintf(stdout, "condensing frags %d-[%d,%d] to %d-[%d,%d]\n", first,
+ g[n+first]->start, g[n+first]->end,
+ last, g[n+last]->start, g[n+last]->end);
+#endif
+ if(bestlast - bestfirst < last - first) {
+ bestfirst = first;
+ bestlast = last;
+ }
+
+ first = j;
+ last = j;
+ }
+
+ }
+
+ if(bestlast - bestfirst < last - first) {
+ bestfirst = first;
+ bestlast = last;
+ }
+
+ Uint c=chains[i].nooffragments;
+ chains[i].f = ALLOCMEMORY(space, chains[i].f, branchfragment_t*, c+(bestlast-bestfirst+1));
+ int score = 0;
+ Uint newstart = chains[i].start;
+ Uint newend = chains[i].end;
+ for(j=bestfirst; j <= bestlast; j++) {
+ score += 10;
+ if(j > bestfirst) {
+ if(g[n+j-1]->start < g[n+j]->start)
+ {
+ if(g[n+j-1]->end > g[n+j]->start) score -= g[n+j-1]->end - g[n+j]->start + 1;
+ } else {
+ /* NOTE(review): same questionable penalty as in part 1 - confirm */
+ if(g[n+j]->end > g[n+j-1]->start) score -= g[n+j]->end - g[n+j]->start;
+ }
+ }
+ chains[i].f[c] = g[n+j];
+
+ c++;
+ newstart = MIN(newstart, g[n+j]->start);
+ newend = MAX(newend, g[n+j]->end);
+
+#ifdef FIXINSMALLDEBUG
+ if(j>bestfirst) fprintf(stdout, "score: %d, ovl:%d\n", score, g[n+j-1]->end - g[n+j]->start);
+#endif
+ }
+
+ chains[i].nooffragments += (bestlast-bestfirst+1);
+ chains[i].score += score;
+ chains[i].end = MAX(newend, chains[i].end);
+ chains[i].start = MIN(newstart, chains[i].start);
+
+ qsort(chains[i].f, chains[i].nooffragments, sizeof(branchfragment_t*), cmp_branchfragmentsptr);
+
+ }
+ n += k;
+ }
+ }
+
+ *nooffragments = n;
+ return g;
+}
+
+
+/*------------------------------- branchChain --------------------------------
+ *
+ * @brief find chain of branches
+ * @author Steve Hoffmann
+ *
+ * The branches are filtered (filterFragments), reduced
+ * (removeOverlapFragments) and sorted with cmp_branchfragments. For every
+ * fragment f[i] the best predecessor chain among chains[0..i-1] is searched;
+ * chains[i] is that chain extended by f[i], or a chain consisting of f[i]
+ * alone if no predecessor qualifies. If the last two fragments of an
+ * extended chain disagree in strand, reference sequence or substart order,
+ * f[i] is additionally emitted as a chain of its own; these extra chains
+ * are appended behind the k regular ones.
+ *
+ * @param space      handed to the allocation macros and helpers
+ * @param arr        suffix array the branches refer to
+ * @param stems      matchstems, see filterFragments
+ * @param seqs       sequences, see filterFragments
+ * @param len        length, see filterFragments
+ * @param stats      statistics for the E-value computation
+ * @param noofchains receives the number of returned chains
+ * @param fragments  receives the fragment array the chains point into; it
+ *                   has to stay allocated as long as the chains are used
+ * @param maxevalue  E-value cutoff handed to filterFragments
+ * @return array of chains
+ *
+ */
+
+branchChain_t *
+branchChain(void *space, Suffixarray *arr, matchstem_t **stems, char **seqs,
+    Uint len, karlin_t *stats, Uint *noofchains, branchfragment_t **fragments,
+    double maxevalue) {
+
+  Uint i, j, k=0, l=0, c_prime=0, a=0;
+  int maxovl = 12, bestscr;
+  branchChain_t *chains = NULL, *extra = NULL;
+  branchfragment_t *f = NULL, *g = NULL;
+
+  g = filterFragments (space, arr, stems, seqs, len, stats, maxevalue, &l);
+  f = removeOverlapFragments (g, l, maxovl, &k);
+  FREEMEMORY(space, g);
+
+  qsort(f, k, sizeof(branchfragment_t), cmp_branchfragments);
+  chains = ALLOCMEMORY(space, chains, branchChain_t, k);
+  initChains(chains, k);
+
+  for(i = 0; i < k; i++) {
+
+    /* c_prime == i means: no predecessor found (yet) */
+    c_prime = i;
+    bestscr = 0;
+
+    // search for best predecessor chain
+    for (j = 0; j < i; j++){
+
+      // only allow short overlap
+      if (chainovl(&chains[j], &f[i]) < maxovl ||
+          (dist_uint(chains[j].f[0]->substart, f[i].substart) < 1000 && //allow local template switches explicitly
+           chains[j].nooffragments == 1 &&
+           chains[j].f[0]->strand != f[i].strand)) {
+
+        // update best predecessor
+        if (bestscr < chainscore(&chains[j], &f[i])){
+          bestscr = chainscore(&chains[j], &f[i]);
+          c_prime = j;
+        }
+        /* on equal score prefer the chain whose last fragment is closer to
+         * f[i] according to minDistFragmentHits */
+        if (bestscr == chainscore(&chains[j], &f[i]) && (c_prime == i ||
+              minDistFragmentHits(arr, chains[j].f[chains[j].nooffragments-1], &f[i]) <
+              minDistFragmentHits(arr, chains[c_prime].f[chains[c_prime].nooffragments-1], &f[i]))) {
+          bestscr = chainscore(&chains[j], &f[i]);
+          c_prime = j;
+        }
+      }
+    }
+
+    if (c_prime != i){
+      // TODO: add function to do the following
+      /* chains[i] = predecessor chain + f[i]; the pointer list is copied */
+      chains[i].nooffragments = chains[c_prime].nooffragments + 1;
+      chains[i].f = ALLOCMEMORY(space, NULL, branchfragment_t*, chains[i].nooffragments);
+      memmove(chains[i].f, chains[c_prime].f, sizeof(branchfragment_t*) * chains[c_prime].nooffragments);
+      chains[i].f[chains[i].nooffragments-1] = &f[i];
+      chains[i].score = bestscr;
+      chains[i].end = f[i].end;
+      chains[i].start = chains[c_prime].start;
+    }
+    else {
+      chains[i].nooffragments = 1;
+      chains[i].f =
+        ALLOCMEMORY(space, NULL, branchfragment_t*, chains[i].nooffragments);
+      chains[i].f[0] = &f[i];
+      chains[i].score = f[i].score;
+      chains[i].start = f[i].start;
+      chains[i].end = f[i].end;
+    }
+
+    if(c_prime != i && chains[i].nooffragments > 1) {
+
+      /* compare the new fragment (u) with its predecessor in the chain (v) */
+      Uint u = chains[i].nooffragments-1;
+      Uint v = chains[i].nooffragments-2;
+      Uint frag1 = chains[i].f[u]->branch->l;
+      Uint frag2 = chains[i].f[v]->branch->l;
+      Uint idx1 = getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[arr->suftab[frag1]]);
+      Uint idx2 = getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[arr->suftab[frag2]]);
+
+      if(
+          (chains[i].f[u]->strand == 0 && chains[i].f[u]->substart < chains[i].f[v]->substart)
+          ||
+          (chains[i].f[u]->strand == 1 && chains[i].f[u]->substart > chains[i].f[v]->substart)
+          ||
+          (chains[i].f[u]->strand != chains[i].f[v]->strand)
+          ||
+          (idx1 != idx2)
+        )
+
+      {
+
+        /* keep f[i] additionally as a chain of its own */
+        extra = ALLOCMEMORY(space, extra, branchChain_t, a+1);
+        extra[a].start = f[i].start;
+        extra[a].end = f[i].end;
+        extra[a].f = ALLOCMEMORY(space, NULL, branchfragment_t*, 1);
+        extra[a].score= f[i].score;
+        extra[a].nooffragments=1;
+        extra[a].f[0] = &f[i];
+        a++;
+      }
+    }
+  }
+
+
+  if(a > 0) {
+    /* the extra chains are moved behind the regular ones; only the array
+     * `extra` itself is released below, its chains live on in `chains` */
+    chains = ALLOCMEMORY(space, chains, branchChain_t, a+k);
+    memmove(&chains[k], extra, sizeof(branchChain_t)*a);
+    k += a;
+  }
+
+  FREEMEMORY(space, extra);
+
+  (*noofchains) = k;
+  (*fragments) = f;
+
+  return chains;
+}
+
+
+
+/*-------------------------------- showChains --------------------------------
+ *
+ * @brief dump the chains
+ * @author Steve Hoffmann
+ *
+ * Writes every chain with its fragments to dev. For each fragment all
+ * suffixes of its branch interval [l,r] are listed with absolute position,
+ * reference sequence index and position relative to the start of that
+ * reference sequence, followed by the entropy of the fragment in seq.
+ * The parameter len is not used.
+ */
+
+void
+showChains(branchChain_t *chains, Uint noofchains, Suffixarray *arr,
+    FILE *dev, char *seq, Uint len) {
+
+  Uint c, fi, sufidx, refidx, refstart, refend;
+  double entropy;
+  branchChain_t *cur;
+  branchfragment_t *frag;
+
+  for(c=0; c < noofchains; c++) {
+    cur = &chains[c];
+    fprintf(dev, "chain %d: %d-%d (%d)\n", c, cur->start,
+        cur->end, cur->score);
+
+    for(fi=0; fi < cur->nooffragments; fi++) {
+      frag = cur->f[fi];
+
+      fprintf(dev, "fragment %d: %d-%d (%d) (%d:%f) substart:", fi,
+          frag->start, frag->end,
+          frag->strand, frag->score,
+          frag->evalue);
+
+      for(sufidx=frag->branch->l; sufidx <= frag->branch->r; sufidx++) {
+        refidx = getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[arr->suftab[sufidx]]);
+        getMultiCharSeqIdxBounds(arr->seq, refidx, &refstart, &refend);
+
+        fprintf(dev,"%u (chr:%d) -> %u, ",arr->suftab[sufidx],
+            getMultiCharSeqIndex(arr->seq, &arr->seq->sequences[arr->suftab[sufidx]]),arr->suftab[sufidx]-refstart);
+      }
+      /* fragment intervals are inclusive, hence the +1 for the length */
+      entropy = minshannonentropy(&seq[frag->start], frag->end - frag->start + 1);
+
+      fprintf(dev, "entropy: %f\n", entropy);
+      fprintf(dev, "substart selected: %u\n", frag->substart);
+    }
+    fprintf(dev, "\n");
+  }
+}
diff --git a/segemehl/libs/kdchain.h b/segemehl/libs/kdchain.h
new file mode 100644
index 0000000..5badeb3
--- /dev/null
+++ b/segemehl/libs/kdchain.h
@@ -0,0 +1,100 @@
+/* include guard: the macro that is tested must be the one that is defined,
+ * otherwise a second inclusion repeats all typedefs below */
+#ifndef KDCHAIN_H
+#define KDCHAIN_H
+
+/*
+ *
+ * kdchain.h
+ * declarations and macros for kd chaining
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 03/11/2008 06:40:32 PM CET
+ *
+ * SVN
+ * Revision of last commit: $Rev: 72 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-10-28 18:14:42 +0100 (Tue, 28 Oct 2008) $
+ *
+ * Id: $Id: kdchain.h 72 2008-10-28 17:14:42Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/kdchain.h $
+ */
+
+
+#include "manout.h"
+#include "container.h"
+#include "sufarray.h"
+#include "kdseed.h"
+
+/* start (p) and inclusive end (p+mat+mis+del-1) of match a; the _S suffix
+ * presumably stands for the subject/reference side - TODO confirm */
+#define FSTART_S(a) ((a)->p)
+#define FEND_S(a) (int)((a)->p+(a)->mat+(a)->mis+(a)->del-1)
+/* NOTE(review): no use of RINTERVALSIZE is visible in this part of the code */
+#define RINTERVALSIZE 100
+/* start (i) and inclusive end (i+mat+mis+ins-1) of match a; the _Q suffix
+ * presumably stands for the query side - TODO confirm */
+#define FSTART_Q(a) ((a)->i)
+#define FEND_Q(a) (int)((a)->i+(a)->mat+(a)->mis+(a)->ins-1)
+/* copy i, p, scr, mat, mis, ins and del from *s to *d.
+ * NOTE(review): expands to several statements and does not parenthesize its
+ * arguments, so it must not be used as the unbraced body of an if/else/loop
+ * and d and s should be plain identifiers */
+#define FASSIGN(d, s) d->i = s->i;\
+ d->p = s->p;\
+ d->scr = s->scr;\
+ d->mat = s->mat;\
+ d->mis = s->mis;\
+ d->ins = s->ins;\
+ d->del = s->del
+
+/* set of gmatch_t chains together with a best score and its position
+ * (field meaning taken from the names - TODO confirm against greedyfchain) */
+typedef struct {
+ Uint size;
+ Uint bestscr;
+ Uint bestscrpos;
+ gmatch_t* chains;
+} gchains_t;
+
+/* one fragment of a branch chain */
+typedef struct {
+ Uint start; /* first position of the fragment (inclusive) */
+ Uint end; /* last position of the fragment (inclusive) */
+ Uint x; /* index of the matchstem the branch was taken from */
+ unsigned char strand; /* 0 or 1, the stem direction the branch came from */
+ Uint branchno; /* index of the branch within its matchstem */
+ Uint substart; /* position derived from the suffix table entry of the hit */
+ Uint subidx; /* index of the reference sequence (getMultiCharSeqIndex) */
+ int score; /* kdscore of the branch, or 10 for fixin fragments */
+ double evalue; /* E-value of the branch, or 1 for fixin fragments */
+ branch_t *branch; /* underlying branch; a private copy for fixin fragments */
+ char pass; /* 0: set by filterFragments, 1: set by fixinfragments */
+} branchfragment_t;
+
+/* chain of fragments; f holds pointers, the fragments live elsewhere */
+typedef struct {
+ Uint nooffragments; /* number of entries in f */
+ Uint start; /* start of the chain */
+ Uint end; /* end of the chain */
+ int score; /* score of the chain */
+ branchfragment_t **f; /* the chained fragments */
+} branchChain_t;
+
+void joinFragments(gmatch_t*, gmatch_t*, gmatch_t*, int err);
+gmatch_t* greedyfchain(void *space, gmatch_t* F, Uint n, Uint *scr,
+ Uint *pos, Uint *m);
+void wrapChains(void *space, branchChain_t *chains, Uint noofchains);
+branchChain_t * branchChain(void *space, Suffixarray *arr, matchstem_t **stems,
+ char **seqs, Uint len, karlin_t *stats, Uint *noofchains,
+ branchfragment_t **, double maxevalue);
+void showChains(branchChain_t *chains, Uint noofchains, Suffixarray *arr,
+ FILE *dev, char *, Uint);
+int cmp_chainscores(const void *a, const void *b);
+extern void reportfchain(void *space, gchains_t *pi, gmatch_t *e);
+void branch2match(Suffixarray *s, Container *C, branch_t* b,
+ Uint noofbranches);
+extern void reportchaincoords(gmatch_t *dest, gmatch_t *a, gmatch_t *b,
+ int ovq, int dmis, int dins, int ddel);
+Container* findfchains(void *space, Suffixarray *s, matchstem_t* M,
+ Uint m, Uint t, unsigned char strict, int sigmatch, double lambda,
+ double H, double K, double maxevalue);
+extern int misfrag (gmatch_t *a, gmatch_t *b);
+extern int delfrag (gmatch_t *a, gmatch_t *b);
+extern int insfrag (gmatch_t *a, gmatch_t *b);
+extern void joinfrag(gmatch_t *dest, gmatch_t *a, gmatch_t *b);
+int chainscore(branchChain_t *chain, branchfragment_t *f);
+Lint chainovl(branchChain_t *chain, branchfragment_t *f);
+void chain(branchChain_t *chain, branchfragment_t *f);
+branchChain_t* condenseChain (branchChain_t * chains, Uint noofchains, MultiCharSeq *seq, Suffixarray *arr);
+branchfragment_t** fixinfragments (void *space, branchChain_t *chains, Uint noofchains, Suffixarray *arr, char **seqs, Uint len, Uint *nooffragments);
+void wrapFixinFragments (void *space, branchfragment_t **f, Uint nooffragments);
+int cmp_chainlocality (const void *a, const void *b);
+
+#endif
diff --git a/segemehl/libs/kdseed.c b/segemehl/libs/kdseed.c
new file mode 100644
index 0000000..cfc645f
--- /dev/null
+++ b/segemehl/libs/kdseed.c
@@ -0,0 +1,1079 @@
+
+/*
+ * kdseed.c
+ * getting k-diff seeds
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 04/11/2008 10:41:11 PM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 91 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-12-07 16:44:27 +0100 (Sun, 07 Dec 2008) $
+ *
+ * Id: $Id: kdseed.c 91 2008-12-07 15:44:27Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/kdseed.c $
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "memory.h"
+#include "fileio.h"
+#include "stringutils.h"
+#include "charsequence.h"
+#include "multicharseq.h"
+#include "sufarray.h"
+#include "mmchar.h"
+#include "mathematics.h"
+#include "biofiles.h"
+#include "vtprogressbar.h"
+#include "sort.h"
+#include "bitArray.h"
+#include "vqueue.h"
+#include "vstack.h"
+#include "container.h"
+#include <pthread.h>
+#include "kdseed.h"
+#include "info.h"
+#include "debug.h"
+#include <assert.h>
+#include "iupac.h"
+/*#include "uedist.h"*/
+
+/* file-level counters with external linkage.
+ * NOTE(review): no use is visible in this part of the file - presumably
+ * debugging/statistics counters, confirm before removing */
+unsigned int rekCounter=0;
+unsigned int update=0;
+
+/* score of branch b: matches minus (insertions + mismatches + deletions),
+ * clamped to 0 when the edit operations outnumber the matches */
+inline int
+kdscore(branch_t *b) {
+  int enoughmatches = !(b->mat < b->mis + b->ins + b->del);
+
+  return enoughmatches
+    ? (int)b->mat - ((int)b->ins + (int)b->mis + (int)b->del)
+    : 0;
+}
+
+
+/*--------------------------- se_kdGetBranchScore ----------------------------
+ *
+ * @brief getting the score of a matchstems branch
+ * @author Steve Hoffmann
+ *
+ * Returns mat - (mis + ins + del) of branch q of matchstem M[i]. Unlike
+ * kdscore the result is not clamped to 0.
+ */
+
+int
+kd_getBranchScore(matchstem_t *M, Uint i, Uint q) {
+  branch_t *br = &M[i].branches[q];
+  int scr = br->mat;
+
+  scr -= br->mis+br->ins+br->del;
+  return scr;
+}
+
+
+/* E-value of branch q of matchstem stem[i]: the branch score
+ * (kd_getBranchScore) is evaluated against the search space that spacemult
+ * derives from the lengths m and n and the parameters in stats */
+double
+kd_getBranchEvalue(matchstem_t *stem, Uint i, Uint q, Uint m, Uint n,
+    karlin_t *stats) {
+  const int branchscore = kd_getBranchScore(stem, i, q);
+  const double searchspace = spacemult(m, n, stats->H, stats->K);
+
+  return evalue(stats->lambda, stats->K, searchspace, branchscore);
+}
+
+/* qsort comparator for branch_t: l ascending, ties broken by r descending.
+ * The values are compared directly instead of subtracting them after a cast
+ * to int: interval borders above INT_MAX would otherwise turn negative or
+ * overflow and yield an inconsistent order. */
+int cmp_branch_qsort (const void * a, const void * b){
+  const branch_t *x = (const branch_t *) a;
+  const branch_t *y = (const branch_t *) b;
+
+  // sort l ascending and r descending
+  if (x->l != y->l){
+    return (x->l > y->l) - (x->l < y->l);
+  }
+  return (y->r > x->r) - (y->r < x->r);
+}
+
+
+/* append one branch with the given values to matchstem a; the branch array
+ * is grown by one element with realloc (result not checked) and space is
+ * not used */
+inline void
+matchstemAddBranch(void *space,
+    matchstem_t *a,
+    Uint mat, Uint q,
+    Uint mis, Uint ins, Uint del,
+    Uint l, Uint r, Uint u, Uint v) {
+  branch_t *slot;
+
+  a->branches = realloc(a->branches, sizeof(branch_t)*(a->noofbranches+1));
+  a->noofbranches++;
+
+  /* the new element is the last one */
+  slot = &a->branches[a->noofbranches-1];
+  slot->mat = mat;
+  slot->q = q;
+  slot->mis = mis;
+  slot->ins = ins;
+  slot->del = del;
+  slot->l = l;
+  slot->r = r;
+  slot->u = u;
+  slot->v = v;
+
+  return;
+}
+
+/* return a newly allocated branch holding the values mat, q, mis, ins, del,
+ * l, r, u and v of branch a; the caller owns the copy */
+branch_t *
+copyBranch(void *space, branch_t *a) {
+  branch_t *dup;
+
+  dup = ALLOCMEMORY(space, NULL, branch_t, 1);
+
+  /* alignment statistics */
+  dup->mat = a->mat;
+  dup->q = a->q;
+  dup->mis = a->mis;
+  dup->ins = a->ins;
+  dup->del = a->del;
+
+  /* intervals */
+  dup->l = a->l;
+  dup->r = a->r;
+  dup->u = a->u;
+  dup->v = a->v;
+
+  return dup;
+}
+
+/* overwrite branch k of matchstem a with the given values; space is unused.
+ * NOTE(review): the assertion also admits k == noofbranches, which is one
+ * past the last counted branch - confirm whether > was intended */
+inline void
+matchstemModifyBranch(void *space,
+    matchstem_t *a, Uint k,
+    Uint mat, Uint q,
+    Uint mis, Uint ins, Uint del,
+    Uint l, Uint r, Uint u, Uint v) {
+  branch_t *slot;
+
+  assert(a->noofbranches >= k);
+
+  slot = &a->branches[k];
+  slot->mat = mat;
+  slot->q = q;
+  slot->mis = mis;
+  slot->ins = ins;
+  slot->del = del;
+  slot->l = l;
+  slot->r = r;
+  slot->u = u;
+  slot->v = v;
+
+  return;
+}
+
+/* release the branch array of matchstem M; M itself is not freed */
+void
+matchstemDestruct(void *space, matchstem_t *M) {
+ FREEMEMORY(space, M->branches);
+}
+
+/*-------------------------------- pushkdlcp ---------------------------------
+ *
+ * @brief helper function to push lcps and singletons to a stack
+ * @author Steve Hoffmann
+ *
+ * Pushes copies of the state data that stay in the current interval (the
+ * parent interval is set to the child interval): one state with a deletion
+ * (sptr, kcnt and del advanced by one) and, for i = 1, 2, ..., one state
+ * with i insertions (qptr, kcnt and ins advanced by i) as long as
+ * i + data->kcnt <= kp and data->qptr + i < m. The parameters space and p
+ * are not used.
+ */
+
+
+inline void
+pushkdlcp ( void *space,
+    VStack *vstack,
+    char *p,
+    Uint m,
+    Uint kp,
+    kdiffm_t *data) {
+  Uint noofins = 1;
+  kdiffm_t *delstate;
+  kdiffm_t *insstate;
+
+  /* deletion */
+  bl_vstackPush(vstack, data);
+  delstate = (kdiffm_t *) bl_vstackTop(vstack);
+  delstate->parent.a = delstate->child.a;
+  delstate->parent.b = delstate->child.b;
+  delstate->sptr += 1;
+  delstate->kcnt += 1;
+  delstate->del += 1;
+
+  /* insertions of increasing length */
+  while(noofins + data->kcnt <= kp && data->qptr + noofins < m) {
+    bl_vstackPush(vstack, data);
+    insstate = (kdiffm_t *) bl_vstackTop(vstack);
+    insstate->parent.a = insstate->child.a;
+    insstate->parent.b = insstate->child.b;
+    insstate->qptr += noofins;
+    insstate->kcnt += noofins;
+    insstate->ins += noofins;
+    noofins++;
+  }
+
+  return;
+}
+
+/* Push the successor states of data for the interval pr onto vstack, using
+ * the child intervals returned by getChildintervalsArr (stored pairwise as
+ * c[2*i], c[2*i+1]).
+ *
+ * For every non-empty child interval other than the one whose next
+ * character equals p[data->qptr] a state is pushed as a match (if IUPAC
+ * matching is allowed and the characters match) or else as a mismatch (if
+ * data->kcnt < kp). A deletion state is pushed for every non-empty child
+ * interval and insertion states for matching children, both only if
+ * data->qptr > 1 and within the error bound kp. The state for the exactly
+ * matching child itself is not pushed here.
+ */
+inline void
+pushkdbranchesArr ( void *space,
+ Suffixarray *s,
+ VStack *vstack,
+ char *p,
+ Uint m,
+ Uint kp,
+ kdiffm_t *data,
+ PairUint pr) {
+
+ Uint l,r,v,i;
+ PairUint child;
+ Lint *c;
+ Uint count=0, lcp=0, j=0;
+ char cur;
+
+ child.a = 0;
+ child.b = 0;
+
+ c = getChildintervalsArr(space, s, pr.a, pr.b, &count, 1);
+ lcp = getlcpval(s, pr.a, pr.b);
+
+ /* find the child interval that continues with the current query character;
+  * child stays [0,0] if there is none */
+ for(i=0; i < count; i++) {
+ if(s->seq->sequences[ s->suftab[c[i*2]] + lcp] == p[data->qptr]){
+ child.a = c[i*2];
+ child.b = c[i*2+1];
+ break;
+ }
+ }
+
+ for (v=0; v < count; v++) {
+ l = c[v*2];
+ r = c[v*2+1];
+ if (l <= r) {
+ if (l != child.a || r != child.b) {
+ /* matches due to IUPAC nucleotide code */
+ if (isallowedIUPAC() && matchIUPAC(p[data->qptr],
+ s->seq->sequences[s->suftab[l] + lcp])){
+ bl_vstackPush(vstack, data);
+ kdiffm_t * tmp = (kdiffm_t *) bl_vstackTop(vstack);
+ tmp->qptr++; tmp->sptr++; tmp->mat++;
+ tmp->parent.a = pr.a; tmp->parent.b = pr.b;
+ tmp->child.a = l; tmp->child.b = r;
+ }
+ /* mismatches */
+ else if (data->kcnt < kp){
+ bl_vstackPush(vstack, data);
+ kdiffm_t * tmp = (kdiffm_t *) bl_vstackTop(vstack);
+ tmp->qptr++; tmp->sptr++; tmp->kcnt++; tmp->mis++;
+ tmp->parent.a = pr.a; tmp->parent.b = pr.b;
+ tmp->child.a = l; tmp->child.b = r;
+ }
+ }
+ /* deletions */
+ if (data->qptr > 1 && data->kcnt < kp) {
+ bl_vstackPush(vstack, data);
+ kdiffm_t * tmp = (kdiffm_t *) bl_vstackTop(vstack);
+ tmp->sptr++; tmp->kcnt++; tmp->del++;
+ tmp->parent.a = pr.a; tmp->parent.b = pr.b;
+ tmp->child.a = l; tmp->child.b = r;
+ }
+ }
+ }
+
+ /* insertions */
+ if (data->qptr > 1) {
+ /* skip i query characters and continue in every child interval whose
+  * next character matches the query character behind the skipped ones */
+ for(i = 1; i + data->kcnt <= kp && data->qptr + i < m; i++){
+ for(j=0; j < count; j++) {
+ cur = s->seq->sequences[ s->suftab[c[j*2]] + lcp];
+ if(!isallowedIUPAC() ? cur == p[data->qptr+i] :
+ matchIUPAC(p[data->qptr+i], cur)){
+ child.a = c[j*2];
+ child.b = c[j*2+1];
+
+ if (child.a <= child.b) {
+ bl_vstackPush(vstack, data);
+ kdiffm_t * tmp = (kdiffm_t *) bl_vstackTop(vstack);
+ tmp->qptr += i; tmp->kcnt += i; tmp->ins += i;
+ tmp->parent.a = pr.a; tmp->parent.b = pr.b;
+ tmp->child.a = child.a; tmp->child.b = child.b;
+ }
+ }
+ }
+ }
+ }
+
+ free(c);
+ return;
+}
+
+
+/* Container based variant of pushkdbranchesArr: push the successor states
+ * of data for the interval pr onto vstack.
+ *
+ * The exactly matching child interval is obtained with getCharInterval, all
+ * child intervals with getChildintervals. For every other non-empty child
+ * interval a match state (IUPAC only) or a mismatch state (if
+ * data->kcnt < kp) is pushed; deletion and insertion states are only pushed
+ * if data->qptr > 1 and within the error bound kp. lcp is only computed,
+ * and only read, when IUPAC matching is allowed.
+ */
+inline void
+pushkdbranches ( void *space,
+ Suffixarray *s,
+ VStack *vstack,
+ char *p,
+ Uint m,
+ Uint kp,
+ kdiffm_t *data,
+ PairUint pr) {
+ Uint l,r,v,i, lcp=0;
+ char cur;
+ Container *c;
+ PairUint child;
+ child = getCharInterval(space, s, pr.a, pr.b, 0, p[data->qptr]);
+ c = getChildintervals(space, s, pr.a, pr.b, 1);
+ if (isallowedIUPAC()){
+ lcp = getlcpval(s, pr.a, pr.b);
+ }
+
+ for (v=0; v < bl_containerSize(c); v++) {
+ l = ((PairUint*)bl_containerGet(c,v))->a;
+ r = ((PairUint*)bl_containerGet(c,v))->b;
+ if (l <= r) {
+ if (l != child.a || r != child.b) {
+ /* matches due to IUPAC nucleotide code */
+ if (isallowedIUPAC() && matchIUPAC(p[data->qptr],
+ s->seq->sequences[s->suftab[l] + lcp])){
+ bl_vstackPush(vstack, data);
+ kdiffm_t * tmp = (kdiffm_t *) bl_vstackTop(vstack);
+ tmp->qptr++; tmp->sptr++; tmp->mat++;
+ tmp->parent.a = pr.a; tmp->parent.b = pr.b;
+ tmp->child.a = l; tmp->child.b = r;
+ }
+ /* mismatches */
+ else if (data->kcnt < kp) {
+ bl_vstackPush(vstack, data);
+ kdiffm_t * tmp = (kdiffm_t *) bl_vstackTop(vstack);
+ tmp->qptr++; tmp->sptr++; tmp->kcnt++; tmp->mis++;
+ tmp->parent.a = pr.a; tmp->parent.b = pr.b;
+ tmp->child.a = l; tmp->child.b = r;
+ }
+ }
+ /* deletions */
+ if (data->qptr > 1 && data->kcnt < kp) {
+ bl_vstackPush(vstack, data);
+ kdiffm_t * tmp = (kdiffm_t *) bl_vstackTop(vstack);
+ tmp->sptr++; tmp->kcnt++; tmp->del++;
+ tmp->parent.a = pr.a; tmp->parent.b = pr.b;
+ tmp->child.a = l; tmp->child.b = r;
+ }
+ }
+ }
+
+ /* insertions */
+ if (data->qptr > 1) {
+ for(i = 1; i + data->kcnt <= kp && data->qptr + i < m; i++){
+ /* on exact char interval */
+ child = getCharInterval(space, s, pr.a, pr.b, 0, p[data->qptr+i]);
+ if (child.a <= child.b) {
+ bl_vstackPush(vstack, data);
+ kdiffm_t * tmp = (kdiffm_t *) bl_vstackTop(vstack);
+ tmp->qptr += i; tmp->kcnt += i; tmp->ins += i;
+ tmp->parent.a = pr.a; tmp->parent.b = pr.b;
+ tmp->child.a = child.a; tmp->child.b = child.b;
+ }
+ /* on other matching IUPAC nucleotides */
+ if (isallowedIUPAC()){
+ for (v=0; v < bl_containerSize(c); v++){
+ l = ((PairUint*)bl_containerGet(c,v))->a;
+ r = ((PairUint*)bl_containerGet(c,v))->b;
+ if (l <= r && (l != child.a || r != child.b)){
+ cur = s->seq->sequences[s->suftab[l] + lcp];
+ if (matchIUPAC(p[data->qptr+i], cur)){
+ bl_vstackPush(vstack, data);
+ kdiffm_t * tmp = (kdiffm_t *) bl_vstackTop(vstack);
+ tmp->qptr += i; tmp->kcnt += i; tmp->ins += i;
+ tmp->parent.a = pr.a; tmp->parent.b = pr.b;
+ tmp->child.a = l; tmp->child.b = r;
+ }
+ }
+ }
+ }
+ }
+ }
+ /* release the container contents, then the container itself */
+ bl_containerDestruct(c, NULL);
+ free(c);
+ return;
+}
+
+/*
+ * kd_updatebranches: enters the alignment state in data into the stem b.
+ *
+ * A state without mismatches, insertions and deletions and with at most
+ * seedlen matches overwrites branch 0.
+ * A state with more matches than errors is compared with branch 1 by its
+ * score (mat - ins - mis - del): if the stem holds exactly one branch or the
+ * score is better, the stem is cut down to two branches and branch 1 is
+ * overwritten; if the score is equal, the state is appended as a further
+ * branch. A test for branches that are already present (same intervals and
+ * same mat) stays disabled because its cost against the repeated
+ * evaluation of equal branches has not been measured.
+ */
+inline void
+kd_updatebranches(matchstem_t* b, kdiffm_t *data, unsigned int seedlen) {
+  const Uint errors = data->mis + data->ins + data->del;
+  int score;
+
+  if (errors == 0 && data->mat <= seedlen) {
+    matchstemModifyBranch(NULL, b, 0, data->mat, data->mat,
+        data->mis, data->ins, data->del,
+        data->child.a, data->child.b, data->parent.a, data->parent.b);
+  }
+
+  /* only states with more matches than errors can become a branch */
+  if (data->mat <= errors) {
+    return;
+  }
+  score = (int)data->mat-(int)data->ins-(int)data->mis-(int)data->del;
+
+  /* branch 1 is only inspected if the stem holds more than one branch */
+  if (b->noofbranches == 1 || score > kdscore(&b->branches[1])) {
+    b->branches = realloc(b->branches, sizeof(branch_t)*2);
+    b->noofbranches = 2;
+    matchstemModifyBranch(NULL, b, 1, data->mat, data->mat,
+        data->mis, data->ins, data->del,
+        data->child.a, data->child.b, data->parent.a, data->parent.b);
+    return;
+  }
+
+  if (score == kdscore(&b->branches[1])) {
+    matchstemAddBranch(NULL, b, data->mat, data->mat,
+        data->mis, data->ins, data->del,
+        data->child.a, data->child.b, data->parent.a, data->parent.b);
+  }
+}
+/*-------------------------------- kd_matchlcp --------------------------------
+ *
+ * @brief compares the pattern p (from data->qptr on) with the suffix that
+ * starts at cursufpos (from data->sptr on) for at most lcplen
+ * characters. The scan also ends at the end of the pattern (m), at
+ * the end of the sequence, at a sequence delimiter or as soon as the
+ * penalty exceeds xoff. A mismatch adds pmis to the penalty, a match
+ * lowers it by sext (not below zero).
+ * While data->kcnt < maxdiff, pushkdlcp is called for every scanned
+ * position (presumably to put alternative moves on vstack).
+ * States without errors are entered into b by kd_updatebranches
+ * after each match; states with errors are entered after the scan.
+ * If the penalty exceeded xoff, data->mat and data->mis are reset to
+ * the values of the last penalty-free match and qptr/sptr are not
+ * advanced; otherwise qptr and sptr are advanced by the scanned
+ * length.
+ * @return 1 if the scan was accepted and did not stop at a delimiter,
+ * 0 otherwise
+ *
+ */
+inline int
+kd_matchlcp(void *space,
+ Suffixarray *s, char *p, unsigned int m,
+ kdiffm_t *data, unsigned int cursufpos,
+ unsigned int lcplen, unsigned int seedlen, VStack *vstack,
+ unsigned int maxdiff, int sext, int pmis, int xoff, matchstem_t *b) {
+
+ unsigned int i;
+ unsigned int fallbackmat=data->mat, fallbackmis=data->mis;
+ unsigned int origmat = data->mat;
+ unsigned int pnlty = 0;
+ unsigned int seqlen = s->numofsuffixes;
+ unsigned int lastmat=0, lastmis=0;
+ char *cursuf;
+ unsigned char delimbreak=0;
+
+ /*to match inbetween the lcp*/
+ cursuf = &s->seq->sequences[cursufpos];
+
+ /*
+ * we have to consider two indices:
+ * i iters through the lcp
+ * sptr shows the suffix pos (relative to cursuf) we start in the lcp
+ * qptr shows the pattern pos we start with
+ * we use "stacking" if alignment to the end of the query is
+ * not possible
+ */
+
+ for(i=0; i < lcplen
+ && cursufpos+data->sptr+i < seqlen
+ && data->qptr+i < m; i++) {
+
+ if(cursuf[data->sptr+i] == s->seq->delim) break;
+
+ if(data->kcnt < maxdiff) {
+ data->qptr += i; data->sptr+=i; /* pushkdlcp sees the state at position i */
+ pushkdlcp(space, vstack, p, m, maxdiff, data);
+ data->qptr -= i; data->sptr-=i;
+ }
+ /* exact match or IUPAC match if allowed */
+ if(!isallowedIUPAC() ? cursuf[data->sptr+i] == p[data->qptr+i] :
+ cursuf[data->sptr+i] == p[data->qptr+i] ||
+ matchIUPAC(p[data->qptr+i], cursuf[data->sptr+i])){
+ data->mat++;
+ lastmat = data->mat;
+ lastmis = data->mis;
+ /*
+ * we reduce the overall penalty by the number of extensions
+ * and keep track of the last pnlty free hit
+ */
+ pnlty = MAX(0, (int)(pnlty-sext));
+
+ if(!pnlty) {
+ fallbackmat = data->mat;
+ fallbackmis = data->mis;
+ }
+ if (data->mis+data->ins+data->del == 0){
+ kd_updatebranches(b, data, seedlen);
+ }
+ } else {
+ /*
+ * check if mismatch violates the xoff constraint
+ */
+ data->mis++; /* data->kcnt is not changed for mismatches found here */
+
+ if((pnlty+=pmis) > xoff) {
+ break;
+ }
+ }
+ }
+ /*
+ * we need a clean suffix if lcp extension failed
+ * and we accept the move otherwise
+ */
+
+ if(cursuf[data->sptr+i] == s->seq->delim) {
+ i -= (i > 0) ? 1 : 0;
+ delimbreak =1;
+ }
+
+ if(pnlty > xoff) {
+ data->mat = fallbackmat;
+ data->mis = fallbackmis;
+ } else {
+ data->qptr+=i;
+ data->sptr+=i;
+ //CHANGED: only update if mismatches occured
+ //otherwise already updated during lcp matching
+ //and at least one more character was matched
+ //compared to begin of function
+ //if (data->mis+data->ins+data->del == 0) {
+ // kd_updatebranches(b, data, seedlen);
+ //}
+ if (lastmis+data->ins+data->del != 0) {
+ if (lastmat > lastmis + data->ins + data->del &&
+ lastmat > origmat){
+ if (b->noofbranches == 1 ||
+ (int)lastmat-(int)lastmis-(int)data->ins-(int)data->del > kdscore(&b->branches[1])) {
+ b->branches = realloc(b->branches, sizeof(branch_t) * 2);
+ b->noofbranches = 2;
+ matchstemModifyBranch(space, b, 1, lastmat, lastmat, lastmis, data->ins, data->del,
+ data->child.a, data->child.b, data->parent.a, data->parent.b);
+ } else {
+ if((int)lastmat-(int)lastmis-(int)data->ins-(int)data->del == kdscore(&b->branches[1])) {
+ //CHANGED: do not insert equal branches,
+ //can occur due to 'alignment' ambiguity
+ //--> performance needs to be tested (tradeoff
+ // between time consumption of this loop
+ // against multiple evaluation of equal branches
+ /*
+ Uint i;
+ for (i = 1; i < b->noofbranches; i++){
+ if (data->child.a == b->branches[i].l &&
+ data->child.b == b->branches[i].r &&
+ data->parent.a == b->branches[i].u &&
+ data->parent.b == b->branches[i].v &&
+ lastmat == b->branches[i].mat){
+ break;
+ }
+ }
+ if (i == b->noofbranches)*/
+ matchstemAddBranch(space, b, lastmat, lastmat, lastmis, data->ins, data->del,
+ data->child.a, data->child.b, data->parent.a, data->parent.b);
+ }
+ }
+ }
+ }
+ if (delimbreak) {
+ return 0;
+ } else {
+ return 1;
+ }
+ }
+ /* from here on pnlty > xoff holds: the else branch above always returns */
+ if (delimbreak) {
+ return 0;
+ }
+ //CHANGED: only update if mismatch occured
+ //otherwise already updated during lcp matching
+ //and at least one more character was matched
+ //compared to begin of function
+ if (data->mis+data->ins+data->del != 0 &&
+ data->mat > origmat){
+ kd_updatebranches(b, data, seedlen);
+ }
+
+ return (!(pnlty>xoff));
+}
+/*--------------------------------- kd_match ----------------------------------
+ *
+ * @brief matches the pattern p of length m against the suffix array s with
+ * at most maxdiff differences, starting in the suffix interval
+ * [iprime, jprime] at suffix offset sptr and pattern offset qptr.
+ * The exact path is followed character by character
+ * (getCharIntervalArr) or along an lcp (kd_matchlcp); alternative
+ * moves are collected on a stack and resumed one by one when the
+ * current path ends. The search ends when the stack is empty or
+ * when data.mat reaches m without any difference.
+ * kprime and lprime are not used.
+ * @return newly allocated stem; the callers in this file release
+ * stem->branches and the stem itself with FREEMEMORY
+ *
+ */
+inline matchstem_t*
+kd_match ( void *space,
+ Suffixarray *s,
+ char *p,
+ Uint m,
+ Uint sext,
+ Uint pmis,
+ Uint xoff,
+ Uint maxdiff,
+ Uint iprime,
+ Uint jprime,
+ Uint kprime,
+ Uint lprime,
+ Uint sptr,
+ Uint qptr ) {
+
+ Uint lcplen = 0,
+ cursufpos = 0,
+ seedlen=10000; /* fixed bound handed to kd_updatebranches and kd_matchlcp */
+ VStack *vstack;
+ PairUint birth;
+ matchstem_t *b;
+ kdiffm_t data, *tmp;
+
+ data.kcnt = 0; data.mis = 0; data.mat = 0; data.ins = 0; data.del = 0;
+ data.sptr = sptr; data.qptr = qptr;
+ data.child.a = 1; data.child.b = 0; /* empty interval (a > b) */
+
+ b = (matchstem_t *) malloc(sizeof(matchstem_t));
+ b->branches = malloc(sizeof(branch_t)*2);
+ b->noofbranches = 0;
+
+ vstack = (VStack *) malloc(sizeof(VStack));
+ bl_vstackInit(vstack, 1000, sizeof(kdiffm_t));
+
+ data.l = getlcpval(s, iprime, jprime);
+ data.parent.a = iprime; data.parent.b = jprime;
+
+ if (data.l == data.sptr && data.l < m && iprime != jprime){
+ data.child = getCharIntervalArr(space, s, iprime, jprime, 0, p[data.qptr]);
+
+ if (data.kcnt < maxdiff ||
+ (isallowedIUPAC() && data.kcnt == maxdiff && couldMatchIUPAC(p[data.qptr]))){
+ pushkdbranches(space, s, vstack, p, m, maxdiff, &data, data.parent);
+ }
+ }
+
+ if (data.child.a > data.child.b){
+ //CHANGED: otherwise same intervals will be pushed again and hence twice
+ //data.child.a = iprime;
+ //data.child.b = jprime;
+ matchstemAddBranch(space, b, data.mat, data.mat, data.mis, data.ins, data.del,
+ iprime, jprime, data.parent.a, data.parent.b);
+ //CHANGED: only update best branch if at least one match
+ //matchstemAddBranch(space, b, data.mat, data.mat, data.mis, data.ins, data.del,
+ // iprime, jprime, data.parent.a, data.parent.b);
+ } else {
+ data.mat++; data.sptr++; data.qptr++;
+ /* the same branch is added twice: kd_updatebranches treats branch 0 and
+ * the branches from index 1 on separately */
+ matchstemAddBranch(space, b, data.mat, data.mat, data.mis, data.ins, data.del,
+ data.child.a, data.child.b, data.parent.a, data.parent.b);
+ matchstemAddBranch(space, b, data.mat, data.mat, data.mis, data.ins, data.del,
+ data.child.a, data.child.b, data.parent.a, data.parent.b);
+ }
+
+ while(1) {
+ while(data.qptr < m && data.child.a <= data.child.b) {
+ data.l = getlcpval(s, data.child.a, data.child.b);
+ cursufpos = s->suftab[data.child.a];
+ /*
+ * intermediate longest common prefixes (l > sptr)
+ * singletons or (implicitly: lcps exceeding m)
+ */
+ if(data.l > data.sptr || data.l == 0) {
+ /* if the lcp exceeds m we align and break*/
+ if(data.l >= m + data.del - data.ins || data.l == 0) {
+ kd_matchlcp(space, s, p, m, &data, cursufpos, m-data.del, seedlen, vstack,
+ maxdiff, sext, pmis, xoff, b);
+ /*
+ * full match triggers return
+ */
+ if(data.mat==m && data.mis + data.ins + data.del == 0) {
+ bl_vstackDestruct(vstack, NULL);
+ free(vstack);
+ return b;
+ } else {
+ break;
+ }
+ } else {
+ lcplen = data.l - data.sptr;
+ if(!kd_matchlcp(space, s, p, m, &data, cursufpos, lcplen, seedlen, vstack,
+ maxdiff, sext, pmis, xoff, b) ){
+ break;
+ }
+ }
+ } else {
+
+ birth = getCharIntervalArr(space, s, data.child.a,
+ data.child.b, 0, p[data.qptr]);
+ if(data.kcnt < maxdiff ||
+ (isallowedIUPAC() && data.kcnt == maxdiff && couldMatchIUPAC(p[data.qptr]))) {
+ pushkdbranches(space, s, vstack, p, m, maxdiff, &data, data.child);
+ }
+ /*this character was a dead end*/
+ if(birth.a > birth.b) {
+ break;
+ }
+ /*otherwise accept move*/
+ data.parent.a = data.child.a;
+ data.parent.b = data.child.b;
+ data.child.a = birth.a;
+ data.child.b = birth.b;
+ data.sptr++;
+ data.qptr++;
+ data.mat++;
+ kd_updatebranches(b, &data, seedlen);
+ }
+ }
+ if(bl_vstackIsEmpty(vstack)) break;
+ tmp = (kdiffm_t*) bl_vstackPop(vstack, NULL); /* continue with the next stored alternative */
+ memcpy(&data, tmp, sizeof(kdiffm_t));
+ free(tmp);
+ }
+ bl_vstackDestruct(vstack, NULL);
+ free(vstack);
+ return(b);
+}
+/*---------------------------------- kdbest -----------------------------------
+ *
+ * @brief looks for a full match of the two sequences seqs[0] and seqs[1]
+ * (both of length m) with increasing error bound i = 0..kp.
+ * For every sequence j whose a[j] is still NULL, kd_match is run on
+ * the whole suffix array with at most i differences. If one of the
+ * branches from index 1 on covers the sequence with at most i errors
+ * (unaligned remainder included), a[j] is allocated as an array of
+ * m stems and stem 0 receives these branches; the search stops after
+ * the round in which this happened for a sequence that was NULL on
+ * entry.
+ * If a full match was stored, every a[j] that is still NULL is set
+ * to m empty stems and kd_match results that were kept are released.
+ * Otherwise b0[j] receives the kd_match result of the last round
+ * (i == kp), or NULL for a sequence whose a[j] was set on entry.
+ *
+ */
+inline void
+kdbest ( void *space,
+ Suffixarray *s,
+ char *seqs[],
+ Uint m,
+ Uint sext,
+ Uint pmis,
+ Uint xoff,
+ Uint kp,
+ matchstem_t *a[],
+ matchstem_t *b0[]) {
+
+ Uint i, j, k, l, remainder,
+ iprime,
+ jprime;
+ matchstem_t *b[2] = {NULL, NULL};
+ unsigned char a0[2] = {a[0] == NULL, a[1] == NULL}; /* 1 if a[j] was NULL on entry */
+
+ iprime = 0;
+ jprime = s->numofsuffixes-1;
+
+ for (i = 0; i <= kp; i++){
+ for (j = 0; j < 2; j++){
+ // ignore initialized
+ if (a[j] != NULL){
+ continue;
+ }
+ b[j] = kd_match(space, s, seqs[j], m, sext, pmis, xoff, i, iprime, jprime, iprime, jprime, 0, 0);
+ for (k = 1; k < b[j]->noofbranches; k++){
+ // full match with correct error bounds
+ // --> store
+ remainder = m - b[j]->branches[k].mat - b[j]->branches[k].mis - b[j]->branches[k].ins;
+ //fprintf(stderr, "branch: i=%d, j=%d, k=%d, mat=%d, mis=%d, ins=%d, del=%d\n",
+ // i, j, k, b[j]->branches[k].mat, b[j]->branches[k].mis, b[j]->branches[k].ins, b[j]->branches[k].del);
+ if (b[j]->branches[k].mis + b[j]->branches[k].ins + b[j]->branches[k].del + remainder <= i){
+ a[j] = ALLOCMEMORY(space, NULL, matchstem_t, m);
+ a[j][0].branches = ALLOCMEMORY(space, NULL, branch_t, b[j]->noofbranches-1);
+ memmove(a[j][0].branches, &b[j]->branches[1], sizeof(branch_t) * (b[j]->noofbranches-1));
+ a[j][0].noofbranches = b[j]->noofbranches-1;
+ for (l = 1; l < m; l++){
+ a[j][l].branches = NULL;
+ a[j][l].noofbranches = 0;
+ }
+ break;
+ }
+ }
+ // clean stem except for last loop
+ if (i != kp){
+ FREEMEMORY(space, b[j]->branches);
+ FREEMEMORY(space, b[j]);
+ b[j] = NULL;
+ }
+ }
+ // break if at least one full match
+ // was found with the seqs
+ if ((a0[0] && a[0] != NULL) ||
+ (a0[1] && a[1] != NULL)){
+ break;
+ }
+ }
+ // full match found
+ if ((a0[0] && a[0] != NULL) || (a0[1] && a[1] != NULL)){
+ for (j = 0; j < 2; j++){
+ // init stems accordingly
+ if (a[j] == NULL){
+ a[j] = ALLOCMEMORY(space, NULL, matchstem_t, m);
+ for (l = 0; l < m; l++){
+ a[j][l].branches = NULL;
+ a[j][l].noofbranches = 0;
+ }
+ }
+ // clean stem
+ if (b[j] != NULL){
+ FREEMEMORY(space, b[j]->branches);
+ FREEMEMORY(space, b[j]);
+ }
+ }
+ }
+ else {
+ b0[0] = b[0];
+ b0[1] = b[1];
+ }
+}
+/*---------------------------------- kdseeds ----------------------------------
+ *
+ * @brief computes the matching stems of the pattern p of length m at the
+ * pattern positions 0, jump, 2*jump, ... with kd_match (at most kp
+ * differences each). All positions in between get empty stems.
+ * If b0 is not NULL it is used as the kd_match result for position 0
+ * and released here.
+ * The start interval for the next position is derived from branch 0
+ * of the previous result by following jump suffix links; c is the
+ * number of characters taken as already matched. It is handed to
+ * kd_match as sptr and qptr and added to mat of all returned
+ * branches.
+ * NOTE(review): jump has to be > 0, otherwise i never advances in
+ * the main loop.
+ * @return array of m stems; stem i holds the branches 1.. of the kd_match
+ * result for position i (see bl_kdMatchstemDestruct for release)
+ *
+ */
+inline matchstem_t*
+kdseeds ( void *space,
+ Suffixarray *s,
+ char *p,
+ Uint m,
+ Uint jump,
+ Uint sext,
+ Uint pmis,
+ Uint xoff,
+ Uint kp,
+ matchstem_t *b0) {
+ Uint iprime,
+ jprime,
+ kprime,
+ lprime,
+ l,
+ j,
+ r,
+ u,
+ v,
+ i,
+ c=0,
+ q,
+ ll;
+ PairUint suflink;
+ matchstem_t *a,
+ *b;
+
+ iprime = 0;
+ jprime = s->numofsuffixes-1;
+ kprime = 0;
+ lprime = s->numofsuffixes-1;
+
+ if (b0 == NULL){
+ b = kd_match(space, s, p, m, sext, pmis, xoff, kp, iprime, jprime, iprime, jprime, 0, 0);
+ }
+ else {
+ b = b0;
+ }
+
+ a = ALLOCMEMORY(space, NULL, matchstem_t, m);
+ if (b->noofbranches > 1){
+ a[0].branches = ALLOCMEMORY(space, NULL, branch_t, b->noofbranches-1);
+ memmove(a[0].branches, &b->branches[1], sizeof(branch_t)*(b->noofbranches-1));
+ a[0].noofbranches = b->noofbranches-1;
+ }
+ else {
+ a[0].branches = NULL;
+ a[0].noofbranches = 0;
+ }
+ /* branch 0 is not copied to a[], it only steers the next start interval */
+ q = b->branches[0].q;
+ l = b->branches[0].l; r = b->branches[0].r;
+ u = b->branches[0].u; v = b->branches[0].v;
+
+ FREEMEMORY(space, b->branches);
+ FREEMEMORY(space, b);
+
+ for(i=jump; i < m; i+=jump) {
+
+ iprime = l;
+ jprime = r;
+ ll = getlcpval(s, iprime, jprime);
+ if(q == ll && q > jump) {
+ //c is new lcp val -> ll-jump
+ for (j = 0; j < jump; j++){
+ suflink = getSuflink(s, iprime, jprime);
+ kprime = iprime;
+ lprime = jprime;
+ iprime = suflink.a;
+ jprime = suflink.b;
+ }
+ c = ll-jump;
+ } else {
+ // jump was further than last matching stem
+ // or suflinks on last completed will be whole array
+ if(q <= jump || getlcpval(s, u, v) <= jump) {
+ //whole suffix array -> c is 0
+ kprime = 0;
+ iprime = 0;
+ lprime = s->numofsuffixes-1;
+ jprime = s->numofsuffixes-1;
+ c = 0;
+ } else {
+ //match was stuck within interval -> take last completed interval and jump
+ //iprime & jprime new interval
+ //kprime & lprime last completed
+ iprime = u;
+ jprime = v;
+ for (j = 0; j < jump; j++){
+ suflink = getSuflink(s, iprime, jprime);
+ kprime = iprime;
+ lprime = jprime;
+ iprime = suflink.a;
+ jprime = suflink.b;
+ }
+ c = getlcpval(s, iprime, jprime);
+ //not true due to wrong suflinks
+ //assert(c > 0);
+ }
+ }
+ // fill matching stems in between jumps
+ // accordingly (no further evaluation)
+ for (j = i - jump + 1; j < i; j++){
+ a[j].branches = NULL;
+ a[j].noofbranches = 0;
+ }
+ assert(i < m);
+ b = kd_match(space, s, &p[i], m-i, sext, pmis, xoff, kp,
+ iprime, jprime, kprime, lprime, c, c);
+
+ for(j=0; j < b->noofbranches; j++) {
+ //add all the previous matches not seen by last kd_match in the
+ //new interval
+ b->branches[j].mat += c;
+ }
+ if (b->noofbranches > 1){
+ a[i].branches = ALLOCMEMORY(space, NULL, branch_t, b->noofbranches-1);
+ memmove(a[i].branches, &b->branches[1], sizeof(branch_t)*(b->noofbranches-1));
+ a[i].noofbranches = b->noofbranches-1;
+ }
+ else {
+ a[i].branches = NULL;
+ a[i].noofbranches = 0;
+ }
+
+ q = b->branches[0].q + c;
+ l = b->branches[0].l; r = b->branches[0].r;
+ u = b->branches[0].u; v = b->branches[0].v;
+
+ FREEMEMORY(space, b->branches);
+ FREEMEMORY(space, b);
+ }
+
+ // fill matching stems in between jumps
+ // accordingly (no further evaluation)
+ for (j = i - jump + 1; j < m; j++){
+ a[j].branches = NULL;
+ a[j].noofbranches = 0;
+ }
+ return a;
+}
+
+/*
+ * dumpkdseeds: prints one line per pattern position 0..m-1 to stdout.
+ * For every branch of the stem at that position the number of matches and
+ * the strand character are printed; if the branch has at least one match
+ * and its interval r-l is not larger than T, the suffix array entries
+ * suftab[l..r] follow.
+ */
+inline void
+dumpkdseeds(Suffixarray *s, matchstem_t *M, Uint m, char strand, Uint T) {
+  Uint pos, bno, suf;
+  branch_t *br;
+
+  for (pos = 0; pos < m; pos++) {
+    for (bno = 0; bno < M[pos].noofbranches; bno++) {
+      br = &M[pos].branches[bno];
+      printf("%d %c ", br->mat, strand);
+
+      /* no positions for empty branches or intervals wider than T */
+      if (br->mat == 0 || br->r - br->l > T) {
+        continue;
+      }
+      for (suf = br->l; suf <= br->r; suf++) {
+        printf("%d ", s->suftab[suf]);
+      }
+    }
+    printf("\n");
+  }
+}
+
+/*
+ * kdcompare: compares the first m branches in b with the triples in a
+ * (mat against c, l against a, r against b) and reports every position
+ * that differs with NFO.
+ */
+void kdcompare(TripleSint *a, branch_t *b, Uint m) {
+  Uint pos;
+  int same;
+
+  for (pos = 0; pos < m; pos++) {
+    same = b[pos].mat == a[pos].c &&
+           b[pos].l == a[pos].a &&
+           b[pos].r == a[pos].b;
+    if (!same) {
+      NFO("failure at %d of %d", pos, m);
+    }
+  }
+}
+
+/*----------------------------------- kmis ------------------------------------
+ *
+ * @brief enumerates all matches in the suffix array with $k$ mismatches for a
+ * given pattern $P$ of length $m$.
+ * The suffix intervals are processed with a queue that starts with
+ * the whole suffix array. For each dequeued interval the child
+ * interval for the next pattern character is followed; all other
+ * non-empty child intervals are enqueued with one more mismatch as
+ * long as the mismatch count stays <= k. Singleton intervals are
+ * compared with the rest of the pattern directly.
+ * @return array of hits written with KMSTOREBRANCH and grown by realloc,
+ * NULL if there is no hit; the number of hits is stored in
+ * *noofmatches
+ * @author Steve Hoffmann
+ *
+ */
+
+inline branch_t*
+kmis (void *space,
+ Suffixarray *s,
+ char *P,
+ Uint m,
+ Uint k,
+ Uint *noofmatches) {
+
+ branch_t *matches=NULL;
+ VQueue vqueue;
+ Uint i,
+ matchno=0;
+ char *cursuf;
+ PairUint child;
+ kmis_t data;
+ Container *c;
+ int lcp=0, llcp=0;
+
+ data.l = 0; data.r = s->numofsuffixes-1;
+ child.a = data.l;
+ child.b = data.r;
+ data.u = 0; data.v = s->numofsuffixes-1;
+ data.p = 0; data.mis = 0;
+
+ bl_vqueueInit(&vqueue, 1000, sizeof(kmis_t));
+ bl_vqueueEnqueue(&vqueue, &data);
+
+ while (!bl_vqueueIsEmpty(&vqueue)) {
+ kmis_t *tmp = bl_vqueueDequeue(&vqueue, NULL);
+ memcpy(&data, tmp, sizeof(kmis_t));
+ free(tmp);
+ llcp = data.p; /* pattern position this interval was enqueued with */
+
+ /* if not singleton scan the string and enqueue alternatives */
+ if (data.l < data.r) {
+ while(1){
+
+ lcp = getlcpval(s, data.l, data.r);
+ if (lcp > llcp+1) {
+ for(i = llcp; i < lcp && i < m; i++) {
+ cursuf = &s->seq->sequences[s->suftab[data.l]];
+ if(P[i] != cursuf[i]) {
+ data.mis++;
+ }
+ }
+ }
+ if (lcp > m-1) {
+ child.a = data.l;
+ child.b = data.r;
+ break;
+ }
+ c = getChildintervals(space, s, data.l, data.r, 1);
+ child = getCharInterval(space, s, data.l, data.r, 0, P[lcp]);
+
+ data.u = data.l;
+ data.v = data.r;
+
+ for (i = 0; i < bl_containerSize(c); i++) {
+
+ data.l = ((PairUint*)bl_containerGet(c,i))->a;
+ data.r = ((PairUint*)bl_containerGet(c,i))->b;
+
+ if (data.l <= data.r) {
+ if ((data.l != child.a || data.r != child.b) &&
+ data.mis + 1 <= k) {
+ bl_vqueueEnqueue(&vqueue, &data);
+ kmis_t *tmp = bl_vqueueFrontN(&vqueue, bl_vqueueSize(&vqueue)-1); /* presumably the element enqueued just now - TODO confirm */
+ tmp->p = lcp + 1;
+ tmp->mis++;
+ }
+ }
+ }
+ bl_containerDestruct(c, NULL);
+ free(c);
+
+ if (child.a >= child.b) {
+ break;
+ }
+
+ llcp = lcp;
+ data.l = child.a;
+ data.r = child.b;
+ }
+ } else {
+ child.a = data.l;
+ child.b = data.r;
+ lcp = data.p;
+ }
+
+ if (child.a == child.b) {
+ for(i = lcp; i < m; i++) {
+ cursuf = &s->seq->sequences[s->suftab[child.a]];
+ if(i+s->suftab[child.a] > s->numofsuffixes || P[i] != cursuf[i]) {
+ data.mis++;
+ }
+ if (data.mis > k) break;
+ }
+ data.r = child.a;
+ data.l = child.a;
+ }
+ if(data.mis <= k && child.a <= child.b) {
+ cursuf = &s->seq->sequences[s->suftab[child.a]];
+ matches = (branch_t *) realloc(matches, sizeof(branch_t) * (matchno+1));
+ KMSTOREBRANCH(matches, matchno); /* reads m and data of this scope */
+ matchno++;
+ }
+ }
+ bl_vqueueDestruct(&vqueue, NULL);
+
+ *noofmatches = matchno;
+ return matches;
+}
+
+
+/*
+ * bl_kdMatchstemDestruct: releases an array of len stems. The branch array
+ * of a stem is only released if the stem holds at least one branch;
+ * finally the stem array itself is released.
+ */
+void
+bl_kdMatchstemDestruct(void *space, matchstem_t* stem, Uint len) {
+  Uint pos = 0;
+
+  while (pos < len) {
+    if (stem[pos].noofbranches != 0) {
+      FREEMEMORY(space, stem[pos].branches);
+    }
+    pos++;
+  }
+  FREEMEMORY(space, stem);
+}
+
diff --git a/segemehl/libs/kdseed.h b/segemehl/libs/kdseed.h
new file mode 100644
index 0000000..31b2034
--- /dev/null
+++ b/segemehl/libs/kdseed.h
@@ -0,0 +1,185 @@
+#ifndef KDSEEDS_H
+#define KDSEEDS_H
+
+/*
+ *
+ * kdseed.h
+ * getting k-diff seeds
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 04/13/2008 12:05:48 AM CEST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 77 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-11-17 13:16:59 +0100 (Mon, 17 Nov 2008) $
+ *
+ * Id: $Id: kdseed.h 77 2008-11-17 12:16:59Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/kdseed.h $
+ */
+
+#include "basic-types.h"
+#include "vstack.h"
+#include "sufarray.h"
+#include "karlin.h"
+
+/*
+ * KMSTOREBRANCH(B,P) stores the hit described by the kmis_t variable data as
+ * element P of the branch_t array B. Besides its arguments the macro reads
+ * the variables m and data of the scope it is expanded in (see kmis).
+ * All pattern positions that are not mismatches count as matches
+ * (mat = q = m - data.mis); insertions and deletions are always zero.
+ * The body is wrapped in do { } while(0) so that the expansion is one
+ * statement and stays correct inside an unbraced if/else; it has to be
+ * terminated with a semicolon by the caller.
+ */
+#define KMSTOREBRANCH(B,P)\
+  do {\
+    (B)[(P)].mat = m-data.mis;\
+    (B)[(P)].q = m-data.mis;\
+    (B)[(P)].mis = data.mis;\
+    (B)[(P)].ins = 0;\
+    (B)[(P)].del = 0;\
+    (B)[(P)].l = data.l;\
+    (B)[(P)].r = data.r;\
+    (B)[(P)].u = data.u;\
+    (B)[(P)].v = data.v;\
+  } while(0)
+
+
+typedef struct branch_s { /* one alignment candidate (branch) of a matching stem */
+ Uint mis; /* number of mismatches */
+ Uint mat; /* number of matches */
+ Uint q; /* NOTE(review): written with the same value as mat by kdseed.c and KMSTOREBRANCH - exact meaning to be confirmed */
+ Uint ins; /* number of insertions */
+ Uint del; /* number of deletions */
+ Uint l; /* left border of the suffix array interval of the branch */
+ Uint r; /* right border of the suffix array interval of the branch */
+ Uint u; /* left border of the parent interval */
+ Uint v; /* right border of the parent interval */
+} branch_t;
+
+typedef struct matchstem_s { /* set of branches found for one pattern position */
+
+ branch_t *branches; /* array of noofbranches branches, NULL for an empty stem */
+ Uint noofbranches; /* number of elements in branches */
+} matchstem_t;
+
+
+
+typedef struct { /* state of one alignment path during the k-difference matching */
+ Uint sptr; /* position in the suffix, relative to its start */
+ Uint qptr; /* position in the pattern */
+ Uint l; /* lcp value of the current interval (set from getlcpval) */
+ Uint kcnt; /* differences spent so far: +1 per pushed mismatch or deletion, +i per insertion of length i */
+ Uint mat; /* number of matches */
+ Uint mis; /* number of mismatches */
+ Uint ins; /* number of inserted pattern characters */
+ Uint del; /* number of deletions */
+ PairUint child; /* current suffix array interval, empty if a > b */
+ PairUint parent; /* interval the child interval was derived from */
+} kdiffm_t;
+
+typedef struct { /* queue element of the k-mismatch enumeration (kmis) */
+ Uint l; /* left border of the current suffix array interval */
+ Uint r; /* right border of the current suffix array interval */
+ Uint u; /* left border of the interval l..r was taken from */
+ Uint v; /* right border of the interval l..r was taken from */
+ Uint p; /* pattern position from which the scan of this interval continues */
+ Uint mis; /* mismatches counted so far */
+} kmis_t;
+
+
+extern int kdscore(branch_t *b);
+
+extern void
+dumpkdseeds(Suffixarray *s, matchstem_t *M, Uint m, char strand, Uint T);
+
+double
+kd_getBranchEvalue(matchstem_t *stem, Uint i, Uint q, Uint m, Uint n,
+ karlin_t *stats);
+
+int
+kd_getBranchScore(matchstem_t *M, Uint i, Uint q);
+
+extern int
+kd_matchlcp(void *space,
+ Suffixarray *s, char *p, unsigned int m,
+ kdiffm_t *data, unsigned int cursufpos,
+ unsigned int lcplen, unsigned int seedlen, VStack *vstack,
+ unsigned int maxdiff, int sext, int pmis, int xoff, matchstem_t *b);
+extern void
+pushkdbranches ( void *space,
+ Suffixarray *s,
+ VStack *vstack,
+ char *p,
+ Uint m,
+ Uint kp,
+ kdiffm_t *data,
+ PairUint pr);
+
+extern void
+pushkdlcp ( void *space,
+ VStack *vstack,
+ char *p,
+ Uint m,
+ Uint kp,
+ kdiffm_t *data);
+
+extern void
+kdbest ( void *space,
+ Suffixarray *s,
+ char *seqs[],
+ Uint m,
+ Uint sext,
+ Uint pmis,
+ Uint xoff,
+ Uint kp,
+ matchstem_t *a[],
+ matchstem_t *b0[]);
+
+extern matchstem_t*
+kdseeds ( void *space,
+ Suffixarray *s,
+ char *p,
+ Uint m,
+ Uint jump,
+ Uint sext,
+ Uint pmis,
+ Uint xoff,
+ Uint kp,
+ matchstem_t *b0);
+
+extern matchstem_t*
+kd_match ( void *space,
+ Suffixarray *s,
+ char *p,
+ Uint m,
+ Uint sext,
+ Uint pmis,
+ Uint xoff,
+ Uint maxdiff,
+ Uint iprime,
+ Uint jprime,
+ Uint kprime,
+ Uint lprime,
+ Uint sptr,
+ Uint qptr );
+
+extern void
+matchstemModifyBranch(void *space,
+ matchstem_t *a, Uint k,
+ Uint mat, Uint q,
+ Uint mis, Uint ins, Uint del,
+ Uint l, Uint r, Uint u, Uint v);
+
+extern void
+matchstemAddBranch(void *space,
+ matchstem_t *a,
+ Uint mat, Uint q,
+ Uint mis, Uint ins, Uint del,
+ Uint l, Uint r, Uint u, Uint v);
+
+extern void
+kd_updatebranches(matchstem_t* b, kdiffm_t *data, unsigned int seedlen);
+
+void matchstemDestruct(void *space, matchstem_t *M);
+
+ branch_t* kmis (void *space, Suffixarray *s, char *P, Uint m, Uint k, Uint *noofmatches);
+void kdcompare(TripleSint *a, branch_t *b, Uint m);
+
+void
+bl_kdMatchstemDestruct(void *space, matchstem_t* stem, Uint len);
+
+ branch_t *
+copyBranch(void *space, branch_t *a);
+#endif
diff --git a/segemehl/libs/list.c b/segemehl/libs/list.c
new file mode 100644
index 0000000..f0f031c
--- /dev/null
+++ b/segemehl/libs/list.c
@@ -0,0 +1,474 @@
+/**
+ * list.c
+ * implementation of a simple linearly linked list for object pointers
+ *
+ * @author Christian Otto
+ * @email christian at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Wed Oct 15 11:39:42 CEST 2008
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 73 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-10-29 10:03:28 +0100 (Wed, 29 Oct 2008) $
+ * Id: $Id$
+ * Url: $URL$
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <assert.h>
+#include "debug.h"
+#include "basic-types.h"
+#include "sort.h"
+#include "list.h"
+
+/*------------------------------ bl_listInit -----------------------------------
+ *
+ * @brief init list: allocates the node array and the data array for
+ *        allocelem elements of sizeofelem bytes each and marks the list
+ *        as empty (first == last == -1, nextfree == 0).
+ *        The program is terminated with exit(-1) if allocelem is not
+ *        positive, if sizeofelem is zero or if an allocation fails.
+ * @author Christian Otto
+ *
+ */
+void bl_listInit(List *l, int allocelem, size_t sizeofelem){
+  if (allocelem <= 0){
+    DBG("list.c: Attempt to initialize a list of size %d.\
+ Exit forced.\n", allocelem);
+    exit(-1);
+  }
+  /* sizeofelem is unsigned, zero is its only invalid value; the cast
+   * makes the argument fit the %d of the message */
+  if (sizeofelem == 0){
+    DBG("list.c: Attempt to initialize a list with sizeofelem %d.\
+ Exit forced.\n", (int) sizeofelem);
+    exit(-1);
+  }
+  l->nodes = (Listelem *) malloc(allocelem * sizeof(Listelem));
+  if (l->nodes == NULL){
+    DBG("list.c: Memory allocation for nodes failed. Exit forced.\n", NULL);
+    exit(-1);
+  }
+  l->data = malloc(allocelem * sizeofelem);
+  /* the data array has to be tested here, not the node array again */
+  if (l->data == NULL){
+    DBG("list.c: Memory allocation for data failed. Exit forced.\n", NULL);
+    exit(-1);
+  }
+  l->first = -1;
+  l->last = -1;
+  l->nextfree = 0;
+  l->numofelem = 0;
+  l->allocelem = allocelem;
+  l->sizeofelem = sizeofelem;
+}
+
+/*----------------------------- bl_listDestruct --------------------------------
+ *
+ * @brief destruct list: calls rmv (if not NULL) for every linked element
+ *        in list order, releases the node and data arrays and resets
+ *        all fields. The List structure itself is not released.
+ *        The array pointers are set to NULL so that a destructed list
+ *        does not keep dangling pointers.
+ * @author Christian Otto
+ *
+ */
+void bl_listDestruct(List *l, void (*rmv)(void*)){
+  int cur;    /* signed, since -1 marks the end of the list */
+  char *p;
+
+  if (rmv != NULL && l->numofelem > 0){
+    p = l->data;
+    for (cur = l->first; cur != -1; cur = l->nodes[cur].next){
+      rmv(p + ((size_t) cur * l->sizeofelem));
+    }
+  }
+  free(l->nodes);
+  free(l->data);
+  l->nodes = NULL;
+  l->data = NULL;
+  l->first = 0;
+  l->last = 0;
+  l->nextfree = 0;
+  l->numofelem = 0;
+  l->allocelem = 0;
+  l->sizeofelem = 0;
+}
+
+/*----------------------------- bl_listIsEmpty ---------------------------------
+ *
+ * @brief tells whether the list holds no linked element
+ * @return 1 if numofelem is zero, 0 otherwise
+ * @author Christian Otto
+ *
+ */
+BOOL bl_listIsEmpty(List *l){
+  if (l->numofelem > 0){
+    return 0;
+  }
+  return 1;
+}
+
+/*----------------------------- bl_listInsert ----------------------------------
+ *
+ * @brief adds a copy of elem (sizeofelem bytes) after a given element of
+ *        the list (at beginning for cur == -1, at end for cur == l->last).
+ *        cur is the storage index of the predecessor. Nothing is inserted
+ *        if cur is below -1 or does not refer to a slot that has been
+ *        written before (cur >= l->nextfree).
+ *        Both arrays grow by BASEINC slots when they are full; the
+ *        program is terminated with exit(-1) if this fails. Because of
+ *        the possible reallocation elem must not point into the list's
+ *        own data array.
+ * @author Christian Otto
+ *
+ */
+void bl_listInsert(List *l, int cur, void *elem){
+  char *p;
+  int slot;
+
+  /* slots from nextfree on have never been linked, their node entries are
+   * uninitialized; this also covers cur == allocelem, which is outside
+   * of the arrays */
+  if (cur < -1 || cur >= l->nextfree){
+    return;
+  }
+  /* reallocation */
+  if (l->nextfree >= l->allocelem){
+    l->nodes = (Listelem *) realloc(l->nodes, sizeof(Listelem) *
+                                    (l->allocelem + BASEINC));
+    if (l->nodes == NULL){
+      DBG("list.c: Memory reallocation of nodes failed. Exit forced.\n", NULL);
+      exit(-1);
+    }
+    l->data = realloc(l->data, l->sizeofelem * (l->allocelem + BASEINC));
+    if (l->data == NULL){
+      DBG("list.c: Memory reallocation of data failed. Exit forced.\n", NULL);
+      exit(-1);
+    }
+    l->allocelem += BASEINC;
+  }
+  slot = l->nextfree;
+  p = (char *) l->data;
+  /* insert data */
+  memmove(p + ((size_t) slot * l->sizeofelem), elem, l->sizeofelem);
+  /* insert at begin (or in empty list) */
+  if (cur == -1){
+    l->nodes[slot].next = l->first;
+    l->nodes[slot].prev = -1;
+    if (l->first != -1){
+      l->nodes[l->first].prev = slot;
+    } else {
+      l->last = slot;
+    }
+    l->first = slot;
+  }
+  /* insert after elem cur */
+  else {
+    l->nodes[slot].prev = cur;
+    l->nodes[slot].next = l->nodes[cur].next;
+    /* new elem is last one */
+    if (cur == l->last){
+      l->last = slot;
+    }
+    /* otherwise the old successor gets the new elem as predecessor */
+    else {
+      l->nodes[l->nodes[slot].next].prev = slot;
+    }
+    l->nodes[cur].next = slot;
+  }
+  l->numofelem++;
+  l->nextfree++;
+}
+
+/*-------------------------------- bl_listGetCur ------------------------------
+ *
+ * @brief get the storage index (cursor) of the nth element in list order,
+ *        counted from 0
+ * @return cursor of the element or -1 if the list has no nth element
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+bl_listGetCur (List *l, Uint n)
+{
+  int pos;
+  Uint steps;
+
+  /* follow the next links n times, stop early at the end of the list */
+  for (pos = l->first, steps = 0; steps < n && pos != -1; steps++) {
+    pos = l->nodes[pos].next;
+  }
+
+  return pos;
+}
+
+
+/*------------------------------ bl_listGetElem ------------------------------
+ *
+ * @brief getting the element at cursor position cur (a storage index as
+ *        returned by bl_listGetCur)
+ * @return pointer into the data array of the list, or NULL if cur is
+ *        negative or not a slot that has been written (cur >= nextfree).
+ *        The pointer becomes invalid when the list is reallocated by
+ *        bl_listInsert, swept or destructed.
+ * @author Steve Hoffmann
+ *
+ */
+
+void*
+bl_listGetElem (List *l, int cur)
+{
+  char *p;
+
+  /* cur == allocelem is already outside of the array, and slots from
+   * nextfree on hold no element yet */
+  if(cur < 0 || cur >= l->nextfree)
+    return NULL;
+
+  p = l->data;
+  p += ((size_t) cur * l->sizeofelem);
+
+  return p;
+}
+
+/*------------------------------ bl_listUnlink ---------------------------------
+ *
+ * @brief removes the element stored at index cur from the list order.
+ *        The storage slot is not released or reused (see bl_listSweep).
+ *        rmv, if not NULL, is applied to the stored element after the
+ *        copy for the caller has been taken, so memory released by rmv
+ *        is still referenced by the returned copy.
+ * @return heap copy of the element (to be freed by the caller), or NULL if
+ *        cur is not a written slot or the element is already unlinked
+ * @author Christian Otto
+ *
+ */
+
+void* bl_listUnlink(List *l, Uint cur, void (*rmv)(void*)){
+
+  char *p, *elem;
+  Listelem *node;
+
+  p = (char *) l->data;
+  /* cur is unsigned: only the upper bound has to be tested; slots from
+   * nextfree on (including cur == allocelem) hold no element */
+  if (l->nextfree <= 0 || cur >= (Uint) l->nextfree){
+    return NULL;
+  }
+  node = &l->nodes[cur];
+
+  if((int) cur != l->first && node->next == -1 && node->prev == -1) {
+    /*previously unlinked element*/
+    return NULL;
+  }
+
+  elem = (char *) malloc(l->sizeofelem);
+  if (elem == NULL){
+    DBG("list.c: Memory allocation for unlinked elem failed. Exit forced.\n",
+        NULL);
+    exit(-1);
+  }
+  memmove(elem, p + (cur * l->sizeofelem), l->sizeofelem);
+
+  if (rmv != NULL){
+    rmv(p + (cur * l->sizeofelem));
+  }
+
+  /* close the gap on the predecessor side */
+  if (node->prev != -1){
+    l->nodes[node->prev].next = node->next;
+  } else {
+    l->first = node->next;
+  }
+
+  /* close the gap on the successor side */
+  if (node->next != -1){
+    l->nodes[node->next].prev = node->prev;
+  } else {
+    l->last = node->prev;
+  }
+
+  /* both links -1 mark the slot as unlinked */
+  node->prev = -1;
+  node->next = -1;
+  l->numofelem--;
+
+  return elem;
+}
+
+/*------------------------------ bl_listSweep ----------------------------------
+ *
+ * @brief cleans the list of all unlinked elements,
+ *        implicitly sorts the nodes: the linked elements are copied in
+ *        list order into new arrays of numofelem + BASEINC slots, so
+ *        that the kth element of the list is stored at index k
+ *        afterwards. Storage indices obtained before the sweep are
+ *        invalid afterwards. The program is terminated with exit(-1)
+ *        if the new arrays cannot be allocated.
+ * @author Christian Otto
+ *
+ */
+void bl_listSweep(List *l){
+  int cur, last = 0;    /* signed, since -1 is the end marker of the links */
+  Listelem *bufnodes;
+  char *bufdata, *p;
+
+  p = (char *) l->data;
+  bufnodes = (Listelem *) malloc(sizeof(Listelem) * (l->numofelem + BASEINC));
+  if (bufnodes == NULL){
+    DBG("list.c: Memory allocation for nodes in sweep failed. Exit forced.\n",
+        NULL);
+    exit(-1);
+  }
+  bufdata = (char *) malloc(l->sizeofelem * (l->numofelem + BASEINC));
+  if (bufdata == NULL){
+    DBG("list.c: Memory allocation for data in sweep failed. Exit forced.\n",
+        NULL);
+    exit(-1);
+  }
+  for(cur = l->first; cur != -1; cur = l->nodes[cur].next){
+    /* the first copied node gets prev == -1, the last one next == -1 */
+    bufnodes[last].prev = last - 1;
+    bufnodes[last].next = (l->nodes[cur].next != -1) ? last + 1 : -1;
+    memmove(bufdata + ((size_t) last * l->sizeofelem),
+            p + ((size_t) cur * l->sizeofelem), l->sizeofelem);
+    last++;
+  }
+  free(l->nodes);
+  free(l->data);
+  l->nodes = bufnodes;
+  l->data = bufdata;
+  l->first = (l->numofelem > 0) ? 0 : -1;
+  l->last = (int) l->numofelem - 1;
+  l->allocelem = (int) l->numofelem + BASEINC;
+  l->nextfree = (int) l->numofelem;
+}
+
+/*------------------------------ bl_listSize -----------------------------------
+ *
+ * @brief returns the number of linked elements (numofelem); unlinked
+ *        elements that still occupy a slot are not counted
+ * @author Christian Otto
+ *
+ */
+Uint bl_listSize(List *l){
+  const Uint linked = l->numofelem;
+
+  return linked;
+}
+
+
+/*------------------------ bl_listBinarySearchInsert -------------------------
+ *
+ * @brief inserts elem into a list that is kept sorted. The position is
+ * searched with binarySearch_left and the comparison function
+ * cmp(rank, list, elem, nfo). As used here (and implemented by
+ * cmp_listtestobj) cmp returns 0 if the list element at the given
+ * rank equals elem, 1 if it is greater and 2 if it is smaller.
+ * An empty list simply receives elem as first element.
+ * @return 0 if elem was inserted at the beginning, (Uint) -1 if none of the
+ * insert conditions applied and elem was NOT inserted.
+ * NOTE(review): the other return values (l->last+1, cur+1) are
+ * computed from storage indices after the insertion and do not
+ * name the slot of the new element - confirm what callers expect.
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_listBinarySearchInsert (List *l, void *elem,
+ Uint (*cmp)(Uint, void *, void*, void*), void *nfo)
+{
+
+ Uint cur, left, i=0;
+
+ if(l->numofelem ==0) {
+ bl_listInsert(l, -1, elem);
+ return 0;
+ }
+
+ left = binarySearch_left(l, l->numofelem, elem, cmp, nfo);
+ left = (left > 0) ? left - 1: 0;
+ cur = l->first;
+ /* walk to the storage index of the element with rank left */
+ while(i < left && cur != l->last) {
+ cur = l->nodes[cur].next;
+ i++;
+ }
+
+ /* element at rank left is not smaller than elem and is the first one */
+ if(cmp(left, l, elem, nfo) < 2 && cur == l->first){
+ bl_listInsert(l, -1, elem);
+ return 0;
+ }
+ /* element at rank left is smaller than elem and is the last one */
+ if(cmp(left, l, elem, nfo) == 2 && cur == l->last){
+ bl_listInsert(l, l->last, elem);
+ return l->last+1;
+ }
+
+ /* elem belongs between the elements with rank left and left+1 */
+ if(cmp(left, l, elem, nfo) == 2 &&
+ (cmp(left+1, l, elem, nfo) == 1 || cmp(left+1, l, elem, nfo) == 0)){
+ bl_listInsert(l, cur, elem);
+ return cur+1;
+ }
+
+
+ return -1;
+}
+
+#ifdef LISTTEST
+
+/*
+ * cmp_listtestobj: comparison function of the list test. Compares the
+ * list element with rank no (in list order) with elem, first by unsigned1
+ * and then by unsigned2.
+ * Returns 1 if the list element is greater, 2 if it is smaller and 0 if
+ * both keys are equal. nfo is not used.
+ */
+Uint
+cmp_listtestobj(Uint no, void *list, void *elem, void *nfo) {
+  List *l = (List*) list;
+  listtestobj_t *stored = (listtestobj_t*) bl_listGetElem (l, bl_listGetCur(l,no));
+  listtestobj_t *probe = (listtestobj_t*) elem;
+
+  if(stored->unsigned1 != probe->unsigned1) {
+    return (stored->unsigned1 > probe->unsigned1) ? 1 : 2;
+  }
+  if(stored->unsigned2 != probe->unsigned2) {
+    return (stored->unsigned2 > probe->unsigned2) ? 1 : 2;
+  }
+
+  return 0;
+}
+
+/*
+ * rmv_listtestobj: remove function of the list test, releases the string
+ * buffer of a test object. The object itself belongs to the list.
+ */
+void
+rmv_listtestobj(void *elem) {
+  free(((listtestobj_t*) elem)->string);
+}
+
+/*
+ * test driver (LISTTEST only): fills a first list by always inserting
+ * behind the last element and prints it, fills a second list with
+ * bl_listBinarySearchInsert, prints it and asserts that it is traversed
+ * in non-decreasing order of unsigned2.
+ */
+int
+main(int argv, char **argc) {
+  static const Uint appended[] = {1, 100, 1000, 10000, 100000, 100000};
+  static const Uint sorted[] = {4603315, 4604837, 1822274};
+  List l, sl;
+  listtestobj_t obj, *ret, *ret2;
+  int i;
+  Uint last=0;
+
+  srand(time(NULL));
+
+  /* plain list: every element goes behind the current last one */
+  bl_listInit(&l, 1000, sizeof(listtestobj_t));
+  for(i=0; i < 6; i++) {
+    obj.unsigned1 = 1; obj.unsigned2 = appended[i];
+    bl_listInsert(&l, l.last, &obj);
+  }
+
+  for(i=0; i < l.numofelem; i++) {
+    ret = (listtestobj_t*) bl_listGetElem (&l, bl_listGetCur(&l, i));
+    printf("elem: %d, value:%d\n", i, ret->unsigned2);
+  }
+
+  /* sorted list: integer records the order of insertion (1, 2, 3) */
+  bl_listInit(&sl, 1000, sizeof(listtestobj_t));
+  for(i=0; i < 3; i++) {
+    obj.unsigned1 = 4; obj.unsigned2 = sorted[i]; obj.integer=i+1;
+    obj.string = malloc(sizeof(char)*10);
+    bl_listBinarySearchInsert (&sl, &obj, cmp_listtestobj, NULL);
+  }
+
+/*  for(i=0; i < 5000; i++) {
+    if(i % 1000 == 0) printf("%d\n",i);
+    obj.unsigned1 = 0; obj.unsigned2 = rand() % 5000; obj.integer=i;
+    obj.string = malloc(sizeof(char)*10);
+    bl_listBinarySearchInsert (&sl, &obj, cmp_listtestobj, NULL);
+  }
+*/
+  fprintf(stdout, "sl has %d elems\n", sl.numofelem);
+/*
+  for(i=0; i < 100; i++) {
+    last = rand() % 3000;
+    printf("delete elem i= %d\n", last);
+    ret = bl_listUnlink(&sl, last, NULL);
+    if(ret) {
+      free(ret->string);
+      free(ret);
+    }
+  }
+
+  last = 0;
+  bl_listSweep(&sl);
+  fprintf(stdout, "sl has %d elems\n", sl.numofelem);
+*/
+
+  /* ret follows the list order, ret2 the storage order */
+  for(i=0; i < sl.numofelem; i++) {
+    ret = (listtestobj_t*) bl_listGetElem (&sl, bl_listGetCur(&sl, i));
+    ret2 = (listtestobj_t*) bl_listGetElem (&sl, i);
+
+    if(!ret) {
+      printf("elem %d is empty\n",i);
+      continue;
+    }
+    assert(last <= ret->unsigned2);
+    printf("elem: %d, value:%d, rank:%d\n", i, ret->unsigned2, ret->integer);
+    printf("elem: %d, value:%d, rank:%d\n", i, ret2->unsigned2, ret2->integer);
+    last = ret->unsigned2;
+  }
+
+  bl_listDestruct(&sl, rmv_listtestobj);
+  bl_listDestruct(&l, NULL);
+}
+
+#endif
diff --git a/segemehl/libs/list.h b/segemehl/libs/list.h
new file mode 100644
index 0000000..7233a9a
--- /dev/null
+++ b/segemehl/libs/list.h
@@ -0,0 +1,73 @@
+/**
+ * list.h
+ * implementation of a simple linear linked list for object pointers
+ *
+ * @author Christian Otto
+ * @email christian at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Wed Oct 15 11:39:42 CEST 2008
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 73 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-10-29 10:03:28 +0100 (Wed, 29 Oct 2008) $
+ * Id: $Id$
+ * Url: $URL$
+ */
+
+#ifndef LIST_H
+#define LIST_H
+
+#include <stdlib.h>
+#include "basic-types.h"
+
+#define LISTINC 1000
+#ifndef BASEINC
+#define BASEINC LISTINC
+#endif
+
+/* Link record of one list node: two int fields naming its neighbours
+ * (presumably positions in the node array of List -- TODO confirm). */
+typedef struct {
+ int next;
+ int prev;
+} Listelem;
+
+
+/* List container: link records plus an untyped element buffer.
+ * The field roles noted below are inferred from the names and from the
+ * bl_list* prototypes; confirm against the implementation. */
+typedef struct {
+ Listelem *nodes; /* link records (next/prev) */
+ void *data; /* element storage; presumably sizeofelem bytes per element */
+ int first; /* presumably the position of the first element */
+ int last; /* presumably the position of the last element */
+ int nextfree; /* presumably the next unused slot */
+ Uint numofelem; /* number of elements */
+ int allocelem; /* presumably the number of allocated slots (cf. bl_listInit) */
+ size_t sizeofelem; /* size of one element (cf. bl_listInit) */
+} List;
+
+/* List interface. The definitions are not part of this header.
+ * - bl_listGetCur yields an int that callers hand to bl_listGetElem as cur.
+ * - bl_listDestruct and bl_listUnlink take a callback rmv for elements;
+ *   NULL is passed where no per-element cleanup is wanted.
+ * - bl_listBinarySearchInsert takes a comparison callback cmp and an opaque
+ *   pointer nfo (presumably forwarded to cmp -- TODO confirm). */
+int bl_listGetCur (List *l, Uint n);
+void bl_listInit(List *l, int allocelem, size_t sizeofelem);
+void bl_listDestruct(List *l, void (*rmv)(void*));
+BOOL bl_listIsEmpty(List *l);
+void bl_listResize(List *l);
+void bl_listInsert(List *l, int cur, void *elem);
+void* bl_listUnlink(List *l, Uint cur, void (*rmv)(void*));
+void bl_listSweep(List *l);
+Uint bl_listSize(List *l);
+Uint bl_listBinarySearchInsert (List *l, void *elem,
+ Uint (*cmp)(Uint, void *, void*, void*), void *nfo);
+void* bl_listGetElem (List *l, int cur);
+
+#ifdef LISTTEST
+
+/* Element type of the LISTTEST self-test; only compiled with LISTTEST. */
+typedef struct {
+ Uint unsigned1;
+ Uint unsigned2;
+ int integer;
+ char *string; /* heap string, allocated by the test for each object */
+
+} listtestobj_t;
+
+#endif
+
+#endif /* LIST_H */
diff --git a/segemehl/libs/manopt.c b/segemehl/libs/manopt.c
new file mode 100644
index 0000000..9518185
--- /dev/null
+++ b/segemehl/libs/manopt.c
@@ -0,0 +1,1125 @@
+
+/*
+ * manopt.c
+ * implementations for the option manager
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 09/01/2008 11:12:56 AM CEST
+ *
+ * Revision of last commit:
+ * $Rev: 74 $1
+ * $Author: steve $
+ * $Date: 2008-10-29 15:03:04 +0100 (Wed, 29 Oct 2008) $
+ *
+ *
+ * $Id: manopt.c 74 2008-10-29 14:03:04Z steve $
+ * $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/manopt.c $
+ *
+ */
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <limits.h>
+#include <assert.h>
+#include <ctype.h>
+#include "manopt.h"
+#include <sys/ioctl.h>
+
+
+/*
+ * getNiceSVNVersion
+ * Returns a freshly allocated copy of version from which every occurrence
+ * of the SVN keyword decorations "$Rev: ", "$Date: " and " $" has been
+ * removed, e.g. "$Rev: 74 $" becomes "74".
+ *
+ * @param version NUL-terminated version string; it is not modified
+ * @return heap string owned by the caller (release with free), or NULL if
+ *         the allocation fails
+ */
+char*
+getNiceSVNVersion(const char *version) {
+  static const char *const subs[] = {"$Rev: ", "$Date: ", " $"};
+  size_t i, sublen;
+  char *p, *src;
+
+  src = malloc(strlen(version)+1);
+  if (src == NULL) {
+    return NULL;
+  }
+  strcpy(src, version);
+
+  for(i=0; i < sizeof(subs)/sizeof(subs[0]); i++) {
+    sublen = strlen(subs[i]);
+    while((p=strstr(src, subs[i])) != NULL) {
+      /* close the gap; the +1 carries the terminating NUL along */
+      memmove(p, p+sublen, strlen(p+sublen)+1);
+    }
+  }
+  return src;
+}
+
+/*
+ * detectTerminalwidth
+ * Asks for the window size of the terminal on file descriptor 0.
+ *
+ * @return the number of columns, or 0 if the request fails (for instance
+ *         because descriptor 0 is not a terminal); 0 means "unknown"
+ */
+int
+detectTerminalwidth(void) {
+  struct winsize termwinsize;
+
+  /* on failure the struct stays unset, so it must not be read */
+  if (ioctl(0, TIOCGWINSZ, &termwinsize) == -1) {
+    return 0;
+  }
+  return termwinsize.ws_col;
+}
+
+/*
+ * isfloat
+ * Checks whether s is a plain decimal number: an optional sign, then
+ * digits with at most one decimal point. At least one digit is required,
+ * so "", "+" and "." are rejected. Exponent notation is not accepted.
+ *
+ * @param s NUL-terminated string to check; it is not modified
+ * @return 1 if s has the described form, 0 otherwise
+ */
+unsigned char
+isfloat(char *s) {
+  int i = 0;
+  int digits = 0;
+  unsigned char dpt = 0;
+
+  if (s[i] == '+' || s[i] == '-') i++;
+
+  for(; s[i] != '\0'; i++) {
+    if (s[i] >= '0' && s[i] <= '9') {
+      digits++;
+    } else if (s[i] == '.' && !dpt) {
+      dpt = 1;
+    } else {
+      return 0;
+    }
+  }
+
+  return (unsigned char) (digits > 0);
+}
+
+/*
+ * isint
+ * Checks whether s is a decimal integer: an optional sign followed by one
+ * or more digits. The empty string and a lone sign are rejected.
+ *
+ * @param s NUL-terminated string to check; it is not modified
+ * @return 1 if s has the described form, 0 otherwise
+ */
+unsigned char
+isint(char *s) {
+  int i = 0;
+  int first;
+
+  if (s[i] == '+' || s[i] == '-') i++;
+  first = i;
+  while(s[i] >= '0' && s[i] <= '9') i++;
+
+  /* everything consumed and at least one digit seen */
+  return (unsigned char) (s[i] == '\0' && i > first);
+}
+
+/*
+ * manopt_usage
+ * Prints the usage screen of an option set to stderr: the call line with a
+ * generated synopsis, the description, one line per option (argument
+ * column, help text and default value) and the version, bugs and
+ * references sections. Synopsis and help texts are wrapped to the terminal
+ * width, which is taken to be at least 70 columns.
+ *
+ * All texts are assembled with strcat in buffers of MANOPT_MAXSYNOPSIS
+ * bytes without length checks, and the allocations are not checked.
+ *
+ * @param set option set to describe
+ */
+void
+manopt_usage(manopt_optionset *set) {
+ unsigned int i=0,j=0,k,l,
+ aptr = 0,
+ msglen = 0,
+ restlen = 0,
+ lastspace = 0,
+ offset =0,
+ synopsislen=0,
+ calllen=0;
+ char shortopt = 0;
+ char *longopt = NULL;
+ unsigned char flags = 0;
+ unsigned int maxarglen = 0;
+
+ char string[2];
+ char *synopsis;
+ char **arg;
+ char **msg;
+ char *fill=NULL;
+ char *call;
+ int width = detectTerminalwidth();
+
+ width = (width < 70) ? 70 : width;
+ synopsis = malloc(sizeof(char)*MANOPT_MAXSYNOPSIS);
+ call = malloc(sizeof(char)*MANOPT_MAXSYNOPSIS);
+ call[0] = '\0',
+ synopsis[0] = '\0' ;
+
+ /* one argument column and one help text per option */
+ arg = malloc(sizeof(char*)*set->noofopts);
+ msg = malloc(sizeof(char*)*set->noofopts);
+
+ for(i=0;i<set->noofopts;i++) {
+ arg[i] = malloc(sizeof(char)*MANOPT_MAXSYNOPSIS);
+ arg[i][0] = 0;
+ msg[i] = malloc(sizeof(char)*MANOPT_MAXSYNOPSIS);
+ msg[i][0] = 0;
+ }
+
+ strcat(call, "usage: ");
+ strcat(call, set->call);
+ strcat(call, " ");
+ calllen = strlen(call);
+ /* a long call line moves the synopsis to the next line, indented by 20 */
+ if (calllen > 40) {
+ strcat(synopsis, "\n");
+ calllen = 20;
+ }
+
+ /* first pass: collect all flags with a short name into one "[-abc]" group */
+ for(i=0; i < set->noofopts; i++) {
+ shortopt = set->opts[i].shortopt;
+ longopt = set->opts[i].longopt;
+ if (set->opts[i].type == FLAG && shortopt) {
+ if (!flags) {
+ strcat(synopsis, "[-");
+ flags = 1;
+ }
+ string[0] = set->opts[i].shortopt;
+ string[1] = 0;
+ strcat(synopsis, string);
+ }
+ }
+
+ /* tabs in the synopsis mark the positions where it may be wrapped later */
+ if(flags) {
+ strcat(synopsis, "]\t");
+ }
+
+ /* second pass: fill arg[]/msg[] for every option and extend the synopsis
+ * with all options that are not short flags */
+ flags = 0;
+ for(i=0; i < set->noofopts; i++) {
+ shortopt = set->opts[i].shortopt;
+ longopt = set->opts[i].longopt;
+ if (set->opts[i].type == MANOPT_BLOCKSEPARATOR) {
+ strcat(arg[aptr], " [");
+ strcat(arg[aptr], set->opts[i].longopt);
+ strcat(arg[aptr], "]");
+ msg[aptr][0]=0;
+ aptr++;
+ }
+ else if (set->opts[i].type != FLAG ||
+ (set->opts[i].type == FLAG && !shortopt)) {
+ /* optional options are bracketed in the synopsis */
+ if (!set->opts[i].required) {
+ strcat(synopsis, "[");
+ flags = 1;
+ }
+ if(shortopt) {
+ strcat(arg[aptr], " ");
+ strcat(synopsis, "-");
+ strcat(arg[aptr], "-");
+ string[0] = set->opts[i].shortopt;
+ string[1] = 0;
+ strcat(synopsis, string);
+ strcat(arg[aptr],string);
+ if(longopt) {
+ strcat(arg[aptr], ",");
+ }
+ }
+ if(longopt) {
+ strcat(arg[aptr]," --");
+ strcat(arg[aptr], longopt);
+ if(!shortopt) {
+ strcat(synopsis,"--");
+ strcat(synopsis, longopt);
+ }
+ }
+
+ if(set->opts[i].argdesc) {
+ strcat(arg[aptr], " ");
+ strcat(arg[aptr], set->opts[i].argdesc);
+ strcat(synopsis, " ");
+ strcat(synopsis, set->opts[i].argdesc);
+ }
+ strcat(arg[aptr], " ");
+
+ strcat(msg[aptr], set->opts[i].helpmsg);
+ if (set->opts[i].defaultval) {
+ strcat(msg[aptr], " (default:");
+ strcat(msg[aptr], set->opts[i].defaultval);
+ strcat(msg[aptr], ")");
+ }
+ aptr++;
+
+ if (!set->opts[i].required) {
+ strcat(synopsis, "]\t");
+ flags = 0;
+ } else {
+ strcat(synopsis, "\t");
+ }
+ } else {
+ /* flag with a short name: already in the synopsis, only listed here */
+ string[0] = set->opts[i].shortopt;
+ string[1] = 0;
+ strcat(arg[aptr], " ");
+ strcat(arg[aptr], "-");
+ strcat(arg[aptr], string);
+ if(longopt) {
+ strcat(arg[aptr], ", ");
+ strcat(arg[aptr], "--");
+ strcat(arg[aptr], longopt);
+ }
+ strcat(arg[aptr], " ");
+ strcat(msg[aptr], set->opts[i].helpmsg);
+ aptr++;
+ }
+ }
+
+ if(set->unflagged) {
+ strcat(synopsis, set->unflagged);
+ strcat(synopsis, "\t");
+ }
+
+ /* fill holds calllen blanks (not NUL-terminated) used to indent wrapped
+ * synopsis lines */
+ fill = realloc(fill, calllen*sizeof(char));
+ for(i=0; i < calllen; i++) {
+ fill[i]=' ';
+ }
+
+ synopsislen = strlen(synopsis);
+ if(calllen+synopsislen > width) {
+ /* wrap the synopsis: for each output line remember the last tab inside
+ * the window and insert a newline right after it */
+ l = (synopsislen)/(width-calllen)+1;
+ offset =0;
+ for(k=0; k < l; k++) {
+ for(j=(k*(width-calllen));
+ j < ((k+1)*(width-calllen))-1-offset && j < strlen(synopsis); j++) {
+ if(synopsis[j]=='\t') {
+ lastspace = j;
+ }
+ }
+ offset = (k+1)*(width-calllen)-lastspace;
+ restlen = strlen(&synopsis[lastspace+1]);
+ memmove(&synopsis[lastspace+2], &synopsis[lastspace+1],
+ (restlen)*sizeof(char));
+ synopsis[lastspace+2+restlen] = 0;
+ synopsis[lastspace+1] = '\n';
+ }
+ /* turn the remaining tabs into blanks and indent every continuation
+ * line by calllen blanks */
+ for(j=0; j < strlen(synopsis);j++) {
+ if(synopsis[j] == '\t') synopsis[j] = ' ';
+ if(synopsis[j] == '\n') {
+ restlen = strlen(&synopsis[j+1]);
+ memmove(&synopsis[j+1+calllen], &synopsis[j+1], (restlen)*sizeof(char));
+ synopsis[j+1+calllen+restlen] = 0;
+ memmove(&synopsis[j+1], fill, calllen*sizeof(char));
+ }
+ }
+ } else {
+
+ for(k=0; k < strlen(synopsis); k++) {
+ if(synopsis[k] == '\t') synopsis[k] = ' ';
+ }
+ }
+
+ /* width of the argument column; the assert aborts when it reaches 60 */
+ for(i=0; i < set->noofopts; i++) {
+ maxarglen = strlen(arg[i]) > maxarglen ? strlen(arg[i]) : maxarglen;
+ }
+ maxarglen++;
+ assert(maxarglen < 60);
+
+ fill = realloc(fill, maxarglen*sizeof(char));
+ for(i=0; i < maxarglen; i++) {
+ fill[i]=' ';
+ }
+
+ /* wrap every help text that does not fit next to the argument column,
+ * breaking at white space and indenting continuation lines */
+ for(i=0; i < set->noofopts; i++) {
+ if((msglen=strlen(msg[i])) > width-maxarglen) {
+ l = (msglen)/(width-maxarglen)+1;
+ offset =0;
+ for(k=0; k < l; k++) {
+ for(j=(k*(width-maxarglen));
+ j < ((k+1)*(width-maxarglen))-1-offset && j < strlen(msg[i])
+ ; j++) {
+ if(isspace((int)msg[i][j])) {
+ lastspace = j;
+ }
+ }
+ /* only break if the scan stopped at the window end, not at the end
+ * of the text */
+ if (j >= (k+1)*(width-maxarglen)-1-offset) {
+ offset = (k+1)*(width-maxarglen)-lastspace;
+ restlen = strlen(&msg[i][lastspace+1]);
+ memmove(&msg[i][lastspace+2],&msg[i][lastspace+1],
+ restlen*sizeof(char));
+ msg[i][lastspace+2+restlen] = 0;
+ msg[i][lastspace+1] = '\n';
+ }
+ }
+ for(j=0; j < strlen(msg[i]); j++) {
+ if(msg[i][j] == '\n') {
+ restlen = strlen(&msg[i][j+1]);
+ memmove(&msg[i][j+1+maxarglen],&msg[i][j+1], restlen*sizeof(char));
+ msg[i][j+1+maxarglen+restlen] = 0;
+ memmove(&msg[i][j+1], fill, maxarglen*sizeof(char));
+ }
+ }
+ }
+ }
+
+ /* output: call line, synopsis, description, option table, trailer */
+ fprintf(stderr, "%s", call);
+ fprintf(stderr, "%s\n" ,synopsis);
+ fprintf(stderr, " %s\n", set->description);
+ for(i=0; i < set->noofopts; i++) {
+ fprintf(stderr, "%s", arg[i]);
+ for(j=0; j < maxarglen-strlen(arg[i]); j++) {
+ fprintf(stderr, " ");
+ }
+ fprintf(stderr, "%s\n", msg[i]);
+ }
+ fprintf(stderr, " [VERSION]\n %s\n", set->version);
+ fprintf(stderr, " [BUGS]\n %s\n", set->bugs);
+ fprintf(stderr, " [REFERENCES]\n %s\n", set->references);
+
+
+ for(i=0;i<set->noofopts;i++) {
+ free(arg[i]);
+ free(msg[i]);
+ }
+
+ free(fill);
+ free(call);
+ free(synopsis);
+ free(arg);
+ free(msg);
+
+}
+
+/*
+ * manopt_help
+ * Reports a command line error and terminates the program: writes the
+ * call name followed by the formatted message to stderr, prints the usage
+ * screen and exits with status -1. This function does not return.
+ *
+ * @param set option set whose call name and usage are printed
+ * @param fmt printf-style format of the message, followed by its arguments
+ */
+void
+manopt_help(manopt_optionset *set, const char *fmt, ...) {
+  va_list args;
+
+  fprintf(stderr, "%s: ", set->call);
+
+  va_start(args, fmt);
+  (void) vfprintf(stderr, fmt, args);
+  va_end(args);
+
+  manopt_usage(set);
+  exit(-1);
+}
+
+/*
+ * manopt_initoptionset
+ * Initialises an option set with its descriptive texts and an empty option
+ * table. The strings are stored by pointer, not copied, so they must stay
+ * valid for as long as the set is used.
+ *
+ * @param set         option set to initialise
+ * @param call        program call name shown in the usage line
+ * @param unflagged   synopsis text for arguments given without a flag
+ * @param description text printed below the synopsis
+ * @param references  text of the references section
+ * @param version     text of the version section
+ * @param bugs        text of the bugs section
+ */
+void
+manopt_initoptionset(manopt_optionset *set,
+    char *call, char *unflagged,
+    char *description,
+    char *references,
+    char *version,
+    char *bugs) {
+
+  set->call = call;
+  set->unflagged = unflagged;
+  set->references = references;
+  set->bugs = bugs;
+  set->version = version;
+  set->description = description;
+  set->noofopts = 0;
+  set->opts = NULL;
+  /* previously left uninitialised */
+  set->mutually_exclusive_opts = NULL;
+
+  return;
+}
+
+/*
+ * manopt_initarg
+ * Resets an argument record: no flag name and an empty value list.
+ *
+ * @param arg record to reset
+ */
+void
+manopt_initarg(manopt_arg* arg) {
+  arg->values = NULL;
+  arg->noofvalues = 0;
+  arg->flagname = NULL;
+}
+
+/*
+ * manopt_initoption
+ * Resets every field of an option record to its empty state, including the
+ * embedded argument record.
+ *
+ * @param opt option record to reset
+ */
+void
+manopt_initoption(manopt_option *opt) {
+  opt->shortopt = 0;
+  opt->longopt = NULL;
+  /* previously left uninitialised; block separators never assign it */
+  opt->argdesc = NULL;
+  opt->helpmsg = NULL;
+  opt->defaultval = NULL;
+  opt->set = (unsigned char) 0;
+  opt->required = 0;
+  opt->type = 0;
+  opt->constraint = NULL;
+  opt->reg_var = NULL;
+  manopt_initarg(&(opt->arg));
+
+  return;
+}
+
+
+/*
+ * manopt_destructarg
+ * Releases the value array of an argument record and marks the record as
+ * empty, so that a repeated call is harmless. The value strings themselves
+ * are not freed.
+ *
+ * @param arg record whose value array is released
+ */
+void
+manopt_destructarg(manopt_arg *arg) {
+  free(arg->values);
+  arg->values = NULL;
+  arg->noofvalues = 0;
+  return;
+}
+
+/*
+ * manopt_destructoptionset
+ * Releases everything the option set owns: the value array and the default
+ * value text of every option, and the option table itself. Afterwards the
+ * set holds no options.
+ *
+ * @param set option set to tear down
+ */
+void
+manopt_destructoptionset(manopt_optionset *set) {
+  int i;
+
+  for(i=0; i < set->noofopts; i++) {
+    /* a value array can exist while the count is 0 (all values were
+     * unflagged), so release it unconditionally */
+    manopt_destructarg(&set->opts[i].arg);
+    free(set->opts[i].defaultval);
+    set->opts[i].defaultval = NULL;
+  }
+  if (set->noofopts > 0) {
+    free(set->opts);
+    set->opts = NULL;
+    set->noofopts = 0;
+  }
+  return;
+}
+
+/*
+ * manopt_parse_commandline
+ * Splits the command line into argument records. A word starting with '-'
+ * that is not followed by a digit opens a new record whose flag name is
+ * the word without its leading "-" or "--". Every other word is appended
+ * as a value to the current record. Words seen before the first flag
+ * (including argv[0]) go into a leading record without a flag name.
+ * Flag names and values point into argv; nothing is copied.
+ *
+ * @param argset receives the records and their number on success
+ * @param argc   number of entries in argv
+ * @param argv   command line words
+ * @return 1 on success; 0 if a flag has an empty name ("-" or "--") or
+ *         memory runs out, in which case argset is left untouched
+ */
+int
+manopt_parse_commandline(manopt_argset* argset, int argc, char **argv) {
+  int i,
+      cnt=0,
+      offset=0;
+  manopt_arg *arg=NULL,
+             *newarg;
+  char **newvalues;
+
+  for (i=0; i < argc; i++) {
+    /*if a number follows '-' expression is considered argument*/
+    if(argv[i][0] == '-' && (argv[i][1] < 48 || argv[i][1] > 57)) {
+      offset = (argv[i][1] == '-') ? 1 : 0;
+      /* the former test used strlen+1 and could never fire */
+      if (argv[i][offset+1] == '\0') {
+        fprintf(stderr, "flaglen <= 0!");
+        goto fail;
+      }
+      newarg = realloc(arg, sizeof(manopt_arg)*(cnt+1));
+      if (newarg == NULL) goto fail;
+      arg = newarg;
+      manopt_initarg(&arg[cnt]);
+      arg[cnt].flagname=&argv[i][offset+1];
+      cnt++;
+    } else {
+      if(cnt == 0) {
+        newarg = realloc(arg, sizeof(manopt_arg)*(cnt+1));
+        if (newarg == NULL) goto fail;
+        arg = newarg;
+        manopt_initarg(&arg[cnt]);
+        cnt++;
+      }
+      newvalues = realloc(arg[cnt-1].values,
+          sizeof(char*)*(arg[cnt-1].noofvalues+1));
+      if (newvalues == NULL) goto fail;
+      arg[cnt-1].values = newvalues;
+      arg[cnt-1].values[arg[cnt-1].noofvalues] = (char*) argv[i];
+      arg[cnt-1].noofvalues++;
+    }
+  }
+  argset->noofargs = cnt;
+  argset->args= arg;
+
+  return 1;
+
+fail:
+  /* release what has been collected so far */
+  for (i=0; i < cnt; i++) {
+    free(arg[i].values);
+  }
+  free(arg);
+  return 0;
+}
+
+/*
+ * manopt_blockseparator
+ * Appends a separator entry to the option table. The entry carries the
+ * block name in its long option field and has the type
+ * MANOPT_BLOCKSEPARATOR. The name is stored by pointer, not copied.
+ *
+ * @param set       option set to extend
+ * @param blockname heading of the block
+ */
+void
+manopt_blockseparator(manopt_optionset *set, char *blockname) {
+  manopt_option *slot;
+
+  set->opts = realloc(set->opts, sizeof(manopt_option)*(set->noofopts+1));
+  slot = &set->opts[set->noofopts];
+  manopt_initoption(slot);
+  slot->type = MANOPT_BLOCKSEPARATOR;
+  slot->longopt = blockname;
+  set->noofopts += 1;
+}
+
+/*
+ * manopt
+ * Registers one option in the option set. If a variable is registered with
+ * the option, its current content is rendered as the default value text
+ * for the types that have such a rendering. All strings are stored by
+ * pointer, not copied.
+ *
+ * Terminates the program with status -1 if the short or long name is
+ * already in use or the option table cannot be enlarged.
+ *
+ * @param set        option set to extend
+ * @param type       kind of option
+ * @param required   non-zero if the option must be given
+ * @param shortopt   one-letter name, or 0 for none
+ * @param longopt    long name, or NULL for none
+ * @param helpmsg    help text
+ * @param argdesc    description of the argument, or NULL
+ * @param constraint constraint record matching the type, or NULL
+ * @param reg_var    variable that receives the parsed value, or NULL; its
+ *                   pointee type must match the option type
+ */
+void
+manopt(manopt_optionset *set,
+    manopt_type type,
+    unsigned char required,
+    char shortopt,
+    char *longopt,
+    char *helpmsg,
+    char *argdesc,
+    void *constraint,
+    void *reg_var) {
+  int i;
+  manopt_option *opt,
+                *newopts;
+  char *dflt,
+       *strval;
+
+  for(i=0; i < set->noofopts; i++) {
+    if (shortopt && shortopt == set->opts[i].shortopt) {
+      fprintf(stderr, "shortopt %c already defined", shortopt);
+      exit(-1);
+    }
+    /* options registered without a long name store NULL here */
+    if (longopt && set->opts[i].longopt &&
+        !strcmp(longopt,set->opts[i].longopt)) {
+      fprintf(stderr, "longopt %s already defined", longopt);
+      exit(-1);
+    }
+  }
+
+  newopts = realloc(set->opts, sizeof(manopt_option)*(set->noofopts+1));
+  if (newopts == NULL) {
+    fprintf(stderr, "out of memory while registering option");
+    exit(-1);
+  }
+  set->opts = newopts;
+
+  opt = &set->opts[set->noofopts];
+  manopt_initoption(opt);
+  opt->argdesc = argdesc;
+  opt->shortopt = shortopt;
+  opt->longopt = longopt;
+  opt->helpmsg = helpmsg;
+  opt->type = type;
+  opt->required = required;
+  opt->constraint = constraint;
+  opt->reg_var = reg_var;
+  set->noofopts++;
+
+  if(!reg_var) {
+    return;
+  }
+
+  /* render the current content of the registered variable */
+  dflt = malloc(sizeof(char)*MANOPT_MAXSYNOPSIS);
+  if (dflt == NULL) {
+    return;
+  }
+  dflt[0]=0;
+
+  switch(type) {
+    case CHAROPT:
+      snprintf(dflt, MANOPT_MAXSYNOPSIS, "%c", ((char*) reg_var)[0]);
+      break;
+    case REQUINTOPT:
+    case UINTOPT:
+      snprintf(dflt, MANOPT_MAXSYNOPSIS, "%u", ((unsigned int*) reg_var)[0]);
+      break;
+    case REQINTOPT:
+    case INTOPT:
+      snprintf(dflt, MANOPT_MAXSYNOPSIS, "%d", ((int*) reg_var)[0]);
+      break;
+    case REQDBLOPT:
+    case DBLOPT:
+      snprintf(dflt, MANOPT_MAXSYNOPSIS, "%f", ((double*) reg_var)[0]);
+      break;
+    case REQSTRINGOPT:
+    case STRINGOPT:
+      strval = ((char**) reg_var)[0];
+      if(strval) {
+        /* bounded: the default string may be longer than the buffer */
+        snprintf(dflt, MANOPT_MAXSYNOPSIS, "\"%s\"", strval);
+      } else {
+        snprintf(dflt, MANOPT_MAXSYNOPSIS, "none");
+      }
+      break;
+    case INTRANGEOPT:
+      snprintf(dflt, MANOPT_MAXSYNOPSIS, "[%d,%d]",
+          ((int*) reg_var)[0], ((int*) reg_var)[1]);
+      break;
+    case UINTRANGEOPT:
+      snprintf(dflt, MANOPT_MAXSYNOPSIS, "[%u,%u]",
+          ((unsigned int*) reg_var)[0], ((unsigned int*) reg_var)[1]);
+      break;
+    case DBLRANGEOPT:
+      snprintf(dflt, MANOPT_MAXSYNOPSIS, "[%f,%f]",
+          ((double*) reg_var)[0], ((double*) reg_var)[1]);
+      break;
+    default:
+      /* no default text for the remaining types */
+      free(dflt);
+      dflt = NULL;
+      break;
+  }
+  opt->defaultval = dflt;
+
+  return;
+}
+
+/*
+ * manopt_unflag
+ * Detaches the surplus values of an argument record: the values from
+ * position offset onwards are appended to the first record of the
+ * argument set, and the record keeps only its first offset values.
+ *
+ * Nothing happens if there are no surplus values or if the first record
+ * cannot be enlarged.
+ *
+ * @param argset argument set
+ * @param arg    index of the record to trim
+ * @param offset number of values the record keeps
+ */
+void
+manopt_unflag(manopt_argset *argset,
+    int arg,
+    int offset) {
+  int size = argset->args[arg].noofvalues-offset;
+  char **tmp;
+
+  if (size <= 0) {
+    return;
+  }
+
+  tmp = realloc(argset->args[0].values,
+      (argset->args[0].noofvalues+size)*sizeof(char*));
+  if (tmp == NULL) {
+    return;
+  }
+  argset->args[0].values = tmp;
+  memmove(&argset->args[0].values[argset->args[0].noofvalues],
+      &argset->args[arg].values[offset], size*sizeof(char*));
+  argset->args[0].noofvalues += size;
+
+  /* keep exactly the first offset values; the former code shrank the array
+   * to size entries, which could cut off retained values */
+  if (offset == 0) {
+    free(argset->args[arg].values);
+    argset->args[arg].values = NULL;
+  } else {
+    tmp = realloc(argset->args[arg].values, offset*sizeof(char*));
+    if (tmp != NULL) {
+      argset->args[arg].values = tmp;
+    }
+  }
+  argset->args[arg].noofvalues = offset;
+
+  return;
+}
+
+
+/*
+ * manopt_toint
+ * Converts a decimal string to int. Values at or beyond INT_MIN/INT_MAX
+ * count as out of range.
+ *
+ * @param s   string that has already been checked with isint
+ * @param val receives the value on success
+ * @return 1 on success, 0 if the value is out of range
+ */
+static int
+manopt_toint(const char *s, int *val) {
+  long v = strtol(s, NULL, 10);
+
+  if (v <= (long) INT_MIN || v >= (long) INT_MAX) {
+    return 0;
+  }
+  *val = (int) v;
+  return 1;
+}
+
+/*
+ * manopt_checkconstraint
+ * Validates the values given for one option against the option type and
+ * its constraint record. Any violation is reported through manopt_help,
+ * which prints the usage screen and terminates the program.
+ *
+ * If the option is the last one on the command line, values beyond the
+ * number the type accepts are not an error; they are moved to the
+ * unflagged arguments and the remaining values are validated as usual.
+ *
+ * @param optset option set
+ * @param opt    index of the option within optset
+ * @param argset parsed command line
+ * @param arg    index of the argument record that selected the option
+ * @return always 1
+ */
+unsigned char
+manopt_checkconstraint(manopt_optionset* optset,
+    int opt,
+    manopt_argset *argset,
+    int arg) {
+
+  unsigned char lastarg = (unsigned char) (arg == argset->noofargs-1);
+  int noofvalues = argset->args[arg].noofvalues;
+  int i,
+      j,
+      keep,
+      rintval = 0,
+      lintval = 0;
+  void *constraint = optset->opts[opt].constraint;
+  double ldblval = .0,
+         rdblval = .0;
+  unsigned char valid_select = 0;
+  char shortopt = optset->opts[opt].shortopt;
+  char *longopt = optset->opts[opt].longopt;
+  /* the record itself never moves; only its value array may be resized */
+  manopt_arg *a = &argset->args[arg];
+
+  manopt_dblconstraint *dblconstraint = NULL;
+  manopt_intconstraint *intconstraint = NULL;
+  manopt_listconstraint *listconstraint = NULL;
+
+  switch(optset->opts[opt].type) {
+    case FLAG:
+      if(noofvalues > 0) {
+        if(!lastarg) {
+          manopt_help(optset, "flag %c (%s) with argument given\n",
+              shortopt, longopt);
+        } else {
+          manopt_unflag(argset, arg, 0);
+        }
+      }
+      break;
+    case REQCHAROPT:
+      if(noofvalues < 1) {
+        manopt_help(optset, "option %c (%s) without required argument\n",
+            shortopt, longopt);
+      }
+      /* fallthrough */
+    case CHAROPT:
+      if(noofvalues > 1) {
+        if(!lastarg) {
+          manopt_help(optset, "option %c (%s) with multiple arguments\n",
+              shortopt, longopt);
+        } else {
+          manopt_unflag(argset, arg, 1);
+          noofvalues = 1;
+        }
+      }
+      /* an optional char option may come without any value */
+      if(noofvalues == 1 && strlen(a->values[0]) > 1) {
+        manopt_help(optset, "a char for option %c (%s) argument required\n",
+            shortopt, longopt);
+      }
+      break;
+    case REQSTRINGOPT:
+      if (noofvalues < 1) {
+        manopt_help(optset, "option %c (%s) without required argument\n",
+            shortopt, longopt);
+      }
+      /* fallthrough */
+    case FILEOPT:
+    case STRINGOPT:
+      if (noofvalues > 1) {
+        if(!lastarg) {
+          manopt_help(optset,"option %c (%s) with multiple arguments\n",
+              shortopt, longopt);
+        } else {
+          manopt_unflag(argset, arg, 1);
+        }
+      }
+      break;
+    case REQDBLOPT:
+      if (noofvalues < 1) {
+        manopt_help(optset, "option %c (%s) without required argument\n",
+            shortopt, longopt);
+      }
+      /* fallthrough */
+    case DBLOPT:
+      if (noofvalues > 1) {
+        if(!lastarg) {
+          manopt_help(optset, "option %c (%s) with multiple arguments\n",
+              shortopt, longopt);
+        } else {
+          manopt_unflag(argset, arg, 1);
+          noofvalues = 1;
+        }
+      }
+      if (noofvalues) {
+        if (!isfloat(a->values[0]) ||
+            (ldblval=atof(a->values[0])) == HUGE_VAL) {
+          manopt_help(optset, "double '%s' argument for option %c (%s) out of range\n",
+              a->values[0], shortopt, longopt);
+        } else {
+          dblconstraint = (manopt_dblconstraint*) constraint;
+          if (dblconstraint &&
+              (ldblval > dblconstraint->max || ldblval < dblconstraint->min)) {
+            manopt_help(optset, "double '%s' argument for option %c (%s) out of bounds\n",
+                a->values[0], shortopt, longopt);
+          }
+        }
+      }
+      break;
+    case REQINTOPT:
+      if (noofvalues < 1) {
+        manopt_help(optset, "option %c (%s) without required argument\n",
+            shortopt, longopt);
+      }
+      /* fallthrough */
+    case INTOPT:
+      if (noofvalues > 1) {
+        if(!lastarg) {
+          manopt_help(optset, "option %c (%s) with multiple arguments\n",
+              shortopt, longopt);
+        } else {
+          manopt_unflag(argset, arg, 1);
+          noofvalues = 1;
+        }
+      }
+      if (noofvalues) {
+        if (!isint(a->values[0]) || !manopt_toint(a->values[0], &lintval)) {
+          manopt_help(optset, "int argument '%s' for option %c (%s) out of range\n",
+              a->values[0], shortopt, longopt);
+        } else {
+          intconstraint = (manopt_intconstraint*) constraint;
+          if (intconstraint &&
+              (lintval > intconstraint->max || lintval < intconstraint->min)) {
+            manopt_help(optset, "int argument '%s' for option %c (%s) out of bounds\n",
+                a->values[0], shortopt, longopt);
+          }
+        }
+      }
+      break;
+    case REQUINTOPT:
+      if (noofvalues < 1) {
+        manopt_help(optset, "option %c (%s) without required argument\n",
+            shortopt, longopt);
+      }
+      /* fallthrough */
+    case UINTOPT:
+      if (noofvalues > 1) {
+        if(!lastarg) {
+          manopt_help(optset, "option %c (%s) with multiple arguments\n",
+              shortopt, longopt);
+        } else {
+          manopt_unflag(argset, arg, 1);
+          noofvalues = 1;
+        }
+      }
+      if (noofvalues) {
+        if (!isint(a->values[0]) || !manopt_toint(a->values[0], &lintval) ||
+            lintval < 0) {
+          manopt_help(optset, "unsigned int argument '%s' for option %c (%s) out of range\n",
+              a->values[0], shortopt, longopt);
+        } else {
+          intconstraint = (manopt_intconstraint*) constraint;
+          if (intconstraint &&
+              (lintval > intconstraint->max || lintval < intconstraint->min)) {
+            manopt_help(optset, "unsigned int argument '%s' for option %c (%s) out of bounds\n",
+                a->values[0], shortopt, longopt);
+          }
+        }
+      }
+      break;
+    case INTRANGEOPT:
+      if (noofvalues < 2) {
+        manopt_help(optset, "range option %c (%s) requires at least two values",
+            shortopt, longopt);
+      }
+      if (noofvalues > 2) {
+        if(!lastarg) {
+          manopt_help(optset, "range option %c (%s) requires exactly two values",
+              shortopt, longopt);
+        } else {
+          manopt_unflag(argset, arg, 2);
+        }
+      }
+      if (!isint(a->values[0]) || !isint(a->values[1]) ||
+          !manopt_toint(a->values[0], &lintval) ||
+          !manopt_toint(a->values[1], &rintval)) {
+        manopt_help(optset, "'%s'-'%s' for option %c (%s) out of range\n",
+            a->values[0], a->values[1], shortopt, longopt);
+      } else if (lintval > rintval) {
+        manopt_help(optset, "'%s' > '%s' for option %c (%s)\n",
+            a->values[0], a->values[1], shortopt, longopt);
+      } else {
+        intconstraint = (manopt_intconstraint*) constraint;
+        if (intconstraint &&
+            (rintval > intconstraint->max || lintval < intconstraint->min)) {
+          manopt_help(optset, "'%s'-'%s' for option %c (%s) out of range\n",
+              a->values[0], a->values[1], shortopt, longopt);
+        }
+      }
+      break;
+    case UINTRANGEOPT:
+      if (noofvalues < 2) {
+        manopt_help(optset, "range option %c (%s) requires at least two values\n",
+            shortopt, longopt);
+      }
+      if (noofvalues > 2) {
+        if(!lastarg) {
+          manopt_help(optset,"range option %c (%s) requires exactly two values\n",
+              shortopt, longopt);
+        } else {
+          manopt_unflag(argset, arg, 2);
+        }
+      }
+      if (!isint(a->values[0]) || !isint(a->values[1]) ||
+          !manopt_toint(a->values[0], &lintval) ||
+          !manopt_toint(a->values[1], &rintval) ||
+          lintval < 0 || rintval < 0) {
+        manopt_help(optset, "'%s'-'%s' for option %c (%s) out of range\n",
+            a->values[0], a->values[1], shortopt, longopt);
+      } else if (lintval > rintval) {
+        manopt_help(optset, "'%s'>'%s' for option %c (%s)\n",
+            a->values[0], a->values[1], shortopt, longopt);
+      } else {
+        intconstraint = (manopt_intconstraint*) constraint;
+        if (intconstraint &&
+            (rintval > intconstraint->max || lintval < intconstraint->min)) {
+          manopt_help(optset, "'%s'-'%s' for option %c (%s) out of range\n",
+              a->values[0], a->values[1], shortopt, longopt);
+        }
+      }
+      break;
+
+    case DBLRANGEOPT:
+      if (noofvalues < 2) {
+        manopt_help(optset, "range option %c (%s) requires at least two values\n",
+            shortopt, longopt);
+
+      } else if (noofvalues > 2) {
+        if(!lastarg) {
+          manopt_help(optset,"range option %c (%s) requires exactly two values\n",
+              shortopt, longopt);
+        } else {
+          manopt_unflag(argset, arg, 2);
+        }
+      }
+      if (!isfloat(a->values[0]) || !isfloat(a->values[1]) ||
+          (ldblval=atof(a->values[0])) == HUGE_VAL ||
+          (rdblval=atof(a->values[1])) == HUGE_VAL) {
+        manopt_help(optset, "'%s'-'%s' for option %c (%s) out of range\n",
+            a->values[0], a->values[1], shortopt, longopt);
+      } else if (ldblval > rdblval) {
+        manopt_help(optset, "'%s'>'%s' for option %c (%s)\n",
+            a->values[0], a->values[1], shortopt, longopt);
+      } else {
+        dblconstraint = (manopt_dblconstraint*) constraint;
+        if (dblconstraint &&
+            (rdblval > dblconstraint->max || ldblval < dblconstraint->min)) {
+          manopt_help(optset, "'%s'-'%s' for option %c (%s) out of range\n",
+              a->values[0], a->values[1], shortopt, longopt);
+        }
+      }
+      break;
+    case LISTOPT:
+      if (noofvalues < 1) {
+        manopt_help(optset, "list option %c (%s) requires at least one argument\n",
+            shortopt, longopt);
+      } else {
+        listconstraint = (manopt_listconstraint*) constraint;
+        if (listconstraint) {
+          if(noofvalues > listconstraint->maxlength) {
+            if(!lastarg) {
+              manopt_help(optset, "list option %c (%s) too long!\n",
+                  shortopt, longopt);
+            } else {
+              manopt_unflag(argset, arg, listconstraint->maxlength);
+            }
+          }
+          if(noofvalues < listconstraint->minlength) {
+            manopt_help(optset, "list option %c (%s) too short!\n",
+                shortopt, longopt);
+          }
+        }
+      }
+      break;
+    case SELECTOPT:
+      if (noofvalues < 1) {
+        manopt_help(optset, "list option %c (%s) requires at least one argument\n",
+            shortopt, longopt);
+      } else {
+        listconstraint = (manopt_listconstraint*) constraint;
+        if(listconstraint) {
+          keep = noofvalues;
+          if(noofvalues > listconstraint->maxlength) {
+            if(!lastarg) {
+              manopt_help(optset, "list option %c (%s) too long!",
+                  shortopt, longopt);
+            }
+            keep = listconstraint->maxlength;
+          } else if(noofvalues < listconstraint->minlength) {
+            /* too few values cannot be repaired by unflagging */
+            manopt_help(optset, "list option %c (%s) too short!",
+                shortopt, longopt);
+          }
+          /* check the values that stay with the option before the surplus
+           * is moved away */
+          for(i=0; i < keep; i++) {
+            valid_select = (unsigned char) 0;
+            for(j=0; j < listconstraint->noofitems; j++) {
+              if(!strcmp(listconstraint->items[j], a->values[i])) {
+                valid_select = (unsigned char) 1;
+              }
+            }
+            if(!valid_select) {
+              manopt_help(optset, "unknown value %s for select option %c (%s)",
+                  a->values[i], shortopt, longopt);
+            }
+          }
+          if(keep < noofvalues) {
+            manopt_unflag(argset, arg, listconstraint->maxlength);
+          }
+        }
+      }
+      break;
+    case MANOPT_BLOCKSEPARATOR:
+      break;
+    default:
+      manopt_help(optset, "unkown option %s type\n", a->flagname);
+      break;
+  }
+
+  return 1;
+}
+
+
+/*
+ * manopt_getopts
+ * Parses the command line against the option set. Every flag on the
+ * command line is matched by long or short name, validated with
+ * manopt_checkconstraint, marked as set and given its argument record. If
+ * a variable is registered with the option, the parsed value is stored in
+ * it; an optional option given without a value leaves its variable
+ * untouched.
+ *
+ * Errors (unknown or repeated options, missing required options, invalid
+ * values) are reported through manopt_help, which terminates the program.
+ * The flags -h and --help print the usage screen and exit.
+ *
+ * @param set  option set; its call name is replaced by argv[0]
+ * @param argc number of entries in argv
+ * @param argv command line words
+ * @return heap-allocated record holding the values given without a flag;
+ *         the caller owns the record and its value array
+ */
+manopt_arg*
+manopt_getopts(manopt_optionset* set, int argc, char **argv) {
+  unsigned char optionfound = 0;
+  int i, j, nvals;
+  char **vals;
+  char *flagname;
+  manopt_option *opt;
+  manopt_argset argset;
+
+  argset.noofargs = 0;
+  argset.args = NULL;
+
+  if(!manopt_parse_commandline(&argset, argc, argv)) {
+    manopt_help(set, "error while parsing commandline.\n");
+  }
+
+  set->call = argv[0];
+
+  for(j=0; j < argset.noofargs; j++) {
+    flagname = argset.args[j].flagname;
+    if(!flagname) {
+      continue;
+    }
+    optionfound = (unsigned char) 0;
+    for(i=0; i < set->noofopts; i++) {
+      opt = &set->opts[i];
+      if (!((opt->longopt && !strcmp(opt->longopt, flagname))
+            || (opt->shortopt && strlen(flagname)==1 &&
+              opt->shortopt == flagname[0]))) {
+        continue;
+      }
+      if (opt->set) {
+        manopt_help(set, "option %s (%c) multiply selected\n",
+            opt->longopt, opt->shortopt);
+      }
+      opt->set = (unsigned char) 1;
+      optionfound = 1;
+      manopt_checkconstraint(set, i, &argset, j);
+      /* the option takes over the record, including its value array */
+      opt->arg = argset.args[j];
+      if(!opt->reg_var) {
+        continue;
+      }
+      /* read the count after validation: surplus values may be gone */
+      nvals = opt->arg.noofvalues;
+      vals = opt->arg.values;
+      switch(opt->type) {
+        case FLAG:
+          *((unsigned char*) opt->reg_var) = 1;
+          break;
+        case REQCHAROPT:
+        case CHAROPT:
+          if (nvals > 0) {
+            *((char*) opt->reg_var) = vals[0][0];
+          }
+          break;
+        case REQUINTOPT:
+        case UINTOPT:
+          if (nvals > 0) {
+            *((unsigned int*) opt->reg_var) = atoi(vals[0]);
+          }
+          break;
+        case REQINTOPT:
+        case INTOPT:
+          if (nvals > 0) {
+            *((int*) opt->reg_var) = atoi(vals[0]);
+          }
+          break;
+        case REQDBLOPT:
+        case DBLOPT:
+          if (nvals > 0) {
+            *((double*) opt->reg_var) = atof(vals[0]);
+          }
+          break;
+        case REQSTRINGOPT:
+        case STRINGOPT:
+          if (nvals > 0) {
+            ((char**) opt->reg_var)[0] = vals[0];
+          }
+          break;
+        case INTRANGEOPT:
+          if (nvals > 1) {
+            ((int*) opt->reg_var)[0] = atoi(vals[0]);
+            ((int*) opt->reg_var)[1] = atoi(vals[1]);
+          }
+          break;
+        case UINTRANGEOPT:
+          if (nvals > 1) {
+            ((unsigned int*) opt->reg_var)[0] = atoi(vals[0]);
+            ((unsigned int*) opt->reg_var)[1] = atoi(vals[1]);
+          }
+          break;
+        case DBLRANGEOPT:
+          if (nvals > 1) {
+            ((double*) opt->reg_var)[0] = atof(vals[0]);
+            ((double*) opt->reg_var)[1] = atof(vals[1]);
+          }
+          break;
+        default:
+          break;
+      }
+    }
+    if (!strcmp(flagname,"h")||
+        !strcmp(flagname,"help")) {
+      manopt_usage(set);
+      exit(EXIT_FAILURE);
+    }
+
+    else if(!optionfound) {
+      manopt_help(set, "option '%s' unknown\n", flagname);
+    }
+  }
+  /* only the leading record with the unflagged values is handed back; the
+   * other records have been copied into their options */
+  argset.args = realloc(argset.args, sizeof(manopt_arg));
+
+  for(i=0; i < set->noofopts; i++){
+    if (set->opts[i].required && !set->opts[i].set) {
+      manopt_help(set, "required option '%s' (%c) missing\n",
+          set->opts[i].longopt, set->opts[i].shortopt);
+    }
+  }
+
+
+  return argset.args;
+}
+
+/*
+ * manopt_isset
+ * Tells whether an option was given on the command line. The option is
+ * identified by its short name, its long name, or both.
+ *
+ * @param set      option set
+ * @param shortopt one-letter name, or 0 to search by long name only
+ * @param longopt  long name, or NULL to search by short name only
+ * @return 1 if a matching option is marked as set, 0 otherwise
+ */
+unsigned char
+manopt_isset(manopt_optionset *set, char shortopt, char *longopt) {
+  int i;
+
+  for(i=0; i < set->noofopts; i++) {
+    if(!set->opts[i].set) {
+      continue;
+    }
+    /* 0 means "no short name" and must not match options without one */
+    if((shortopt && set->opts[i].shortopt == shortopt) ||
+        (set->opts[i].longopt && longopt &&
+         !strcmp(set->opts[i].longopt,longopt))) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/*
+ * manopt_getarg
+ * Returns the argument record of an option that was given on the command
+ * line. The option is identified by its short name, its long name, or
+ * both.
+ *
+ * @param set      option set
+ * @param shortopt one-letter name, or 0 to search by long name only
+ * @param longopt  long name, or NULL to search by short name only
+ * @return the record stored in the option set, or NULL if no matching
+ *         option is marked as set
+ */
+manopt_arg*
+manopt_getarg(manopt_optionset *set, char shortopt, char *longopt) {
+  int i;
+
+  for(i=0; i < set->noofopts; i++) {
+    if(!set->opts[i].set) {
+      continue;
+    }
+    /* 0 means "no short name" and must not match options without one */
+    if((shortopt && set->opts[i].shortopt == shortopt) ||
+        (set->opts[i].longopt && longopt &&
+         !strcmp(set->opts[i].longopt,longopt))) {
+      return &set->opts[i].arg;
+    }
+  }
+  return NULL;
+}
+
+
+
+/*
+ * manopt_longopt
+ * Looks up an option by its long name.
+ *
+ * @param set     option set
+ * @param longopt long name to look for
+ * @return the matching option record, or NULL if there is none or longopt
+ *         is NULL
+ */
+manopt_option*
+manopt_longopt(manopt_optionset *set, char *longopt) {
+  int i;
+
+  if (longopt == NULL) {
+    return NULL;
+  }
+  for(i=0; i < set->noofopts; i++) {
+    /* options registered without a long name store NULL */
+    if(set->opts[i].longopt && !strcmp(set->opts[i].longopt,longopt)) {
+      return &set->opts[i];
+    }
+  }
+  return NULL;
+}
+
+/*
+ * manopt_shortopt
+ * Looks up an option by its one-letter name.
+ *
+ * @param set      option set
+ * @param shortopt one-letter name to look for
+ * @return the matching option record, or NULL if there is none or
+ *         shortopt is 0 (which denotes "no short name")
+ */
+manopt_option*
+manopt_shortopt(manopt_optionset *set, char shortopt){
+  int i;
+
+  if (shortopt == 0) {
+    return NULL;
+  }
+  for(i=0; i < set->noofopts; i++) {
+    if(set->opts[i].shortopt == shortopt) {
+      return &set->opts[i];
+    }
+  }
+  return NULL;
+}
+
+
+/*
+ * manopt_dumpoptionset
+ * Debug helper: writes every option of the set with its long and short
+ * name to stdout, followed by the values stored for it.
+ *
+ * @param set option set to dump
+ */
+void
+manopt_dumpoptionset(manopt_optionset *set) {
+  int optidx, validx;
+  manopt_option *cur;
+
+  for(optidx=0; optidx < set->noofopts; optidx++) {
+    cur = &set->opts[optidx];
+    printf("option: %s (%c)\n", cur->longopt, cur->shortopt);
+    /* an empty value list simply skips the loop */
+    for(validx=0; validx < cur->arg.noofvalues; validx++) {
+      printf("arg\n");
+      printf("\t%s\n", cur->arg.values[validx]);
+    }
+  }
+}
+
diff --git a/segemehl/libs/manopt.h b/segemehl/libs/manopt.h
new file mode 100644
index 0000000..3c90cd5
--- /dev/null
+++ b/segemehl/libs/manopt.h
@@ -0,0 +1,170 @@
+#ifndef MANOPT_H
+#define MANOPT_H
+/*
+ *
+ * manopt.h
+ * declarations for the option manager
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 09/01/2008 11:13:03 AM CEST
+ *
+ * Revision of last commit:
+ * $Rev: 74 $
+ * $Author: steve $
+ * $Date: 2008-10-29 15:03:04 +0100 (Wed, 29 Oct 2008) $
+ *
+ *
+ * $Id: manopt.h 74 2008-10-29 14:03:04Z steve $
+ * $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/manopt.h $
+ *
+ */
+
+#define MANOPT_MAXSYNOPSIS 10000
+
+typedef enum{ /* option kinds passed as `type` to manopt(); two entries are documented as position markers */
+ FLAG,
+ REQSTRINGOPT,
+ REQCHAROPT,
+ REQINTOPT,
+ REQUINTOPT,
+ REQDBLOPT,
+ MANOPT_ENUMREQUIRED, /*marker to distinguish required and optional*/
+ FILEOPT,
+ STRINGOPT,
+ CHAROPT,
+ INTOPT,
+ UINTOPT,
+ DBLOPT,
+ INTRANGEOPT,
+ UINTRANGEOPT,
+ DBLRANGEOPT,
+ LISTOPT,
+ SELECTOPT,
+ MANOPT_ENUMSIZE, /*end of enumeration*/
+ MANOPT_BLOCKSEPARATOR /* listed after the end marker; presumably the pseudo-type behind manopt_blockseparator() - TODO confirm */
+} manopt_type;
+
+
+typedef struct { /* one flag together with the value strings attached to it */
+ char *flagname; /* flag name; manopt_getopts prints it in its "option '%s' unknown" message */
+ int noofvalues; /* number of entries in values */
+ char **values; /* value strings; manopt_dumpoptionset prints each one with %s */
+} manopt_arg;
+
+typedef struct { /* array of manopt_arg records; filled by manopt_parse_commandline() judging by its prototype - TODO confirm */
+ int noofargs; /* presumably the number of entries in args - TODO confirm */
+ manopt_arg* args; /* array of parsed flags */
+} manopt_argset;
+
+typedef struct { /* description and state of a single option */
+ char shortopt; /* single-character name; compared by manopt_shortopt/manopt_isset/manopt_getarg */
+ char *longopt; /* long name; manopt_isset and manopt_getarg test it for NULL before comparing */
+ char *argdesc; /* presumably the argument description shown in the help text - TODO confirm */
+ char *helpmsg; /* presumably the help text of the option - TODO confirm */
+ char *defaultval; /* presumably the default value as a string - TODO confirm */
+ unsigned char set; /* non-zero once the option counts as set; tested by manopt_isset/manopt_getarg */
+ unsigned char required; /* if non-zero and `set` is zero, manopt_getopts reports the option as missing */
+ manopt_type type; /* kind of option */
+ void *constraint; /* presumably points to one of the manopt_*constraint structs - TODO confirm */
+ manopt_arg arg; /* argument record; manopt_getarg returns its address */
+ void *reg_var; /* NOTE(review): looks like a caller variable to be filled from the argument - confirm */
+} manopt_option;
+
+typedef struct { /* complete option table of a program plus descriptive strings (see manopt_initoptionset prototype) */
+ char *call; /* presumably the program call shown in the synopsis - TODO confirm */
+ char *unflagged; /* presumably a description of unflagged arguments - TODO confirm */
+ char *references; /* presumably literature references for the help text - TODO confirm */
+ char *bugs; /* presumably the bug report contact - TODO confirm */
+ char *version; /* presumably the version string - TODO confirm */
+ char *description; /* presumably the program description - TODO confirm */
+ int noofopts; /* number of entries in opts; upper bound of every lookup loop */
+ manopt_option *opts; /* array of registered options */
+ void *mutually_exclusive_opts; /* NOTE(review): purpose not visible in this part of the file - confirm */
+} manopt_optionset;
+
+typedef struct { /* constraint record for list-like options; presumably attached via manopt_option.constraint - TODO confirm */
+ int maxlength; /* presumably the maximal accepted list length - TODO confirm */
+ int minlength; /* presumably the minimal accepted list length - TODO confirm */
+ int noofitems; /* presumably the number of entries in items - TODO confirm */
+ char** items; /* presumably the admissible item strings - TODO confirm */
+} manopt_listconstraint;
+
+typedef struct { /* constraint record with signed int bounds; usage is outside this part of the file */
+ int max; /* presumably the upper bound - TODO confirm */
+ int min; /* presumably the lower bound - TODO confirm */
+ int diff; /* NOTE(review): meaning not visible here, possibly a required range width - confirm */
+} manopt_intconstraint;
+
+typedef struct { /* constraint record with unsigned int bounds; usage is outside this part of the file */
+ unsigned int max; /* presumably the upper bound - TODO confirm */
+ unsigned int min; /* presumably the lower bound - TODO confirm */
+ unsigned int diff; /* NOTE(review): meaning not visible here, possibly a required range width - confirm */
+} manopt_uintconstraint;
+
+typedef struct { /* constraint record with double bounds; usage is outside this part of the file */
+ double max; /* presumably the upper bound - TODO confirm */
+ double min; /* presumably the lower bound - TODO confirm */
+ double diff; /* NOTE(review): meaning not visible here, possibly a required range width - confirm */
+} manopt_dblconstraint;
+
+int
+manopt_parse_commandline(manopt_argset* argset,
+ int argc,
+ char **argv);
+
+void
+manopt(manopt_optionset* set,
+ manopt_type type,
+ unsigned char required,
+ char shortopt,
+ char *longopt,
+ char *helpmsg,
+ char *argdesc,
+ void *constraints,
+ void *reg_var);
+
+manopt_arg*
+manopt_getopts(manopt_optionset* set,
+ int argc,
+ char **argv);
+
+char*
+getNiceSVNVersion(const char *version);
+
+void
+manopt_destructarg(manopt_arg *arg);
+
+void
+manopt_destructoptionset(manopt_optionset *set);
+
+void
+manopt_dumpoptionset(manopt_optionset *set);
+
+void
+manopt_helpmsg(manopt_optionset *set);
+
+void
+manopt_help(manopt_optionset *set, const char *fmt, ...);
+
+void
+manopt_initoptionset(manopt_optionset *set,
+ char *call, char *unflagged, char *description, char *references, char *version, char *bugs);
+
+manopt_option*
+manopt_longopt(manopt_optionset *set, char *longopt);
+
+manopt_option*
+manopt_shortopt(manopt_optionset *set, char shortopt);
+
+unsigned char
+manopt_isset(manopt_optionset *set, char shortopt, char *longopt);
+
+void
+manopt_blockseparator(manopt_optionset *set, char *blockname);
+
+manopt_arg*
+manopt_getarg(manopt_optionset *set, char shortopt, char *longopt);
+
+#endif
+
diff --git a/segemehl/libs/manout.c b/segemehl/libs/manout.c
new file mode 100644
index 0000000..f497519
--- /dev/null
+++ b/segemehl/libs/manout.c
@@ -0,0 +1,1866 @@
+/*
+ * manout.c
+ * attempt for flexible output of genome mapping w/ SEGEMEHL
+ *
+ * @author Christian Otto
+ * @email christan at bioinf.uni-leipzig.de
+ * @date Wed Sep 24 10:56:23 CEST 2008
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <string.h>
+#include "basic-types.h"
+#include "bitArray.h"
+#include "memory.h"
+#include "mathematics.h"
+#include "sort.h"
+#include "info.h"
+#include "biofiles.h"
+#include "fileio.h"
+#include "vtprogressbar.h"
+#include "debug.h"
+#include "charsequence.h"
+#include "manout.h"
+#include <assert.h>
+#include <pthread.h>
+#include "alignment.h"
+#include "manoutformats.h"
+#include "kdseed.h"
+#include "fileBins.h"
+#include "segemehl.h"
+
+/*
+ * se_kdMatchListhasMatches
+ *
+ * Returns 1 if at least one of the two per-strand counters of the
+ * list (list->n[0], list->n[1]) is greater than zero, 0 otherwise.
+ */
+unsigned char
+se_kdMatchListhasMatches(gmatchlist_t *list) {
+  if(list->n[0] > 0) {
+    return 1;
+  }
+  return (list->n[1] > 0);
+}
+
+/*
+ * se_kdMatchListhasMates
+ *
+ * Returns 1 as soon as any match of the list has a mate slot (0..3)
+ * with a non-NULL alignment, 0 if there is none. Note that this tests
+ * the alignment pointer, not the `isset` flag of the mate.
+ */
+unsigned char
+se_kdMatchListhasMates(gmatchlist_t *list) {
+  Uint strand, idx, slot;
+  gmatch_t *cur;
+
+  for(strand = 0; strand < 2; strand++) {
+    for(idx = 0; idx < list->n[strand]; idx++) {
+      cur = &list->matches[strand][idx];
+      for(slot = 0; slot < 4; slot++) {
+        if(cur->mates[slot].al) {
+          return 1;
+        }
+      }
+    }
+  }
+
+  return 0;
+}
+
+/*
+ * se_kdMatchHasMates
+ *
+ * Returns 1 if any of the four mate slots of the match has its
+ * `isset` flag raised, 0 otherwise.
+ */
+unsigned char
+se_kdMatchHasMates(gmatch_t *match) {
+  Uint slot = 0;
+
+  while(slot < 4) {
+    if(match->mates[slot].isset) {
+      return 1;
+    }
+    slot++;
+  }
+
+  return 0;
+}
+
+/*
+ * se_destructMatches
+ *
+ * Releases the two per-strand match arrays of a read. Only the arrays
+ * themselves are released here, not what their elements point to.
+ */
+void
+se_destructMatches(void *space, gread_t *read) {
+  Uint strand;
+
+  for(strand = 0; strand < 2; strand++) {
+    FREEMEMORY(space, read->matches[strand]);
+  }
+}
+
+/*
+ * se_setMatches
+ *
+ * Copies the matches of `list` that pass the edit distance filter into
+ * freshly grown per-strand arrays of `read` (shallow copies via memmove)
+ * and updates read->n[], read->noofmatches and read->noofmatepairs.
+ *
+ * A match is a candidate if it is not flagged `skip` and either its
+ * edist is at most `maxedist` or it has mates and
+ * mateminedist+edist <= list->pairminedist. A candidate with mates is
+ * only kept if the pair condition holds or nfo->bestonly is off; each
+ * such kept match also counts as a mate pair.
+ *
+ * `rep` is not used by the active code.
+ *
+ * @return number of matches copied over both strands
+ */
+Uint
+se_setMatches (void *space, gread_t *read,
+    gmatchlist_t *list, Uint maxedist, segemehl_t *nfo, char rep) {
+
+  Uint strand, idx, kept, pairs = 0, total = 0;
+  unsigned char hasmates, pairok;
+  gmatch_t *copies, *cand;
+
+  read->noofmatepairs = 0;
+  read->noofmatches = 0;
+
+  for(strand = 0; strand <= 1; strand++) {
+    copies = NULL;
+    kept = 0;
+
+    for(idx = 0; idx < list->n[strand]; idx++) {
+      cand = &list->matches[strand][idx];
+
+      if(cand->skip) {
+        continue;
+      }
+
+      hasmates = se_kdMatchHasMates(cand);
+      pairok = hasmates &&
+        cand->mateminedist+cand->edist <= list->pairminedist;
+
+      /* neither good enough on its own nor as part of a pair */
+      if(cand->edist > maxedist && !pairok) {
+        continue;
+      }
+
+      if(hasmates) {
+        /* in best-only mode a paired match must satisfy the pair bound */
+        if(!pairok && nfo->bestonly) {
+          continue;
+        }
+        pairs++;
+      }
+
+      copies = ALLOCMEMORY(space, copies, gmatch_t, kept+1);
+      memmove(&copies[kept], cand, sizeof(gmatch_t));
+      kept++;
+      total++;
+    }
+
+    read->noofmatches += kept;
+    read->n[strand] = kept;
+    read->matches[strand] = copies;
+  }
+
+  read->noofmatepairs = pairs;
+  return total;
+}
+
+
+/*
+ * se_kdMatchListAdd
+ *
+ * Appends a new match to slot `u` of the list and fills it from the
+ * arguments. The edit distance stored in the match (and used to update
+ * list->minedist) is recomputed from the alignment as mis+ins+del; the
+ * `edist` argument itself is not used. The alignment pointer `al` is
+ * stored as is, not copied.
+ *
+ * The array is grown through a temporary pointer and the result is
+ * checked; on allocation failure the function prints a message and
+ * terminates the program instead of dereferencing NULL.
+ *
+ * @return the list passed in
+ */
+gmatchlist_t*
+se_kdMatchListAdd(gmatchlist_t *list,
+    Uint chr_idx,
+    Uint chr_start,
+    Uint chr_end,
+    Uint edist,
+    int scr,
+    Uint start,
+    Uint end,
+    double evalue, Alignment *al, Uint u,
+    Uint previdx, Uint prevpos, char prevstrand,
+    Uint nextidx, Uint nextpos, char nextstrand, Uint fragno) {
+
+  Uint n, mat, mis, del, ins, i;
+  gmatch_t *match, *grown;
+
+  n = list->n[u];
+  grown = realloc(list->matches[u], sizeof(gmatch_t)*(n+1));
+  if(grown == NULL) {
+    fprintf(stderr, "se_kdMatchListAdd: out of memory\n");
+    exit(EXIT_FAILURE);
+  }
+  list->matches[u] = grown;
+  match = &grown[n];
+
+  initMatch(match);
+  countEops(al, &mat, &mis, &ins, &del);
+  //changed edist to mis + ins + del
+  list->minedist = MIN(list->minedist, mis+ins+del);
+  match->scr = scr;
+  match->evalue = evalue;
+  //changed edist to mis + ins + del
+  match->edist = mis+ins+del;
+  match->p = chr_start;
+  match->q = chr_end;
+  match->i = start;
+  match->j = end;
+  match->mat = mat;
+  match->mis = mis;
+  match->ins = ins;
+  match->del = del;
+  match->subject = chr_idx;
+  match->al = al;
+  match->noofmatematches = 0;
+  match->skip = 0;
+  match->mateminedist = -1;
+
+  for(i=0; i < 4; i++) {
+    match->mates[i].scr = 0;
+    match->mates[i].al = NULL;
+    match->mates[i].isset = 0;
+    match->mates[i].materefdesc = NULL;
+    match->mates[i].materefseq = NULL;
+    match->mates[i].materefdesclen = 0;
+  }
+
+  match->previdx = previdx;
+  match->prevpos = prevpos;
+  match->nextidx = nextidx;
+  match->nextpos = nextpos;
+  match->fragno = fragno;
+  match->prevflags = 0;
+  match->nextflags = 0;
+
+  match->prevseqstart = 0;
+  match->prevseqrefdesc = NULL;
+  match->nextseqstart = 0;
+  match->nextseqrefdesc = NULL;
+  match->refdesc = NULL;
+  match->refdesclen = 0;
+  match->refseq = NULL;
+
+  /* only the '+' strand is recorded as a flag bit */
+  if(prevstrand == '+') {
+    match->prevflags |= SPLIT_PREV_PLUS;
+  }
+  if(nextstrand == '+') {
+    match->nextflags |= SPLIT_NEXT_PLUS;
+  }
+
+  list->n[u]++;
+  return list;
+}
+
+
+/*
+ * se_kdMatchListSet
+ *
+ * Overwrites the existing match `n` in slot `u` of the list with new
+ * coordinates and a new alignment. Does nothing if `n` is not a valid
+ * index. The stored edit distance is recomputed from the alignment as
+ * mis+ins+del; the `edist` argument itself is not used.
+ *
+ * The previous alignment of the match and the alignments of its four
+ * mate slots are wrapped and released before the pointers are reset.
+ * Previously the mate alignments were only wrapped and then dropped,
+ * which leaked the Alignment objects; bl_gmatchlistDestruct releases
+ * the same objects with wrapAlignment + FREEMEMORY.
+ *
+ * @return the list passed in
+ */
+gmatchlist_t*
+se_kdMatchListSet(void *space,
+    gmatchlist_t *list,
+    Uint chr_idx,
+    Uint chr_start,
+    Uint chr_end,
+    Uint edist,
+    int scr,
+    Uint start,
+    Uint end,
+    double evalue, Alignment *al, Uint u, Uint n) {
+
+  Uint mat, mis, del, ins, i;
+  gmatch_t *match;
+
+  if (n < 0 || n >= list->n[u]){
+    return list;
+  }
+
+  match = &list->matches[u][n];
+  countEops(al, &mat, &mis, &ins, &del);
+  list->minedist = MIN(list->minedist, (mis+ins+del));
+  match->scr = scr;
+  match->evalue = evalue;
+  //changed edist to mis + ins + del
+  match->edist = mis+ins+del;
+  match->p = chr_start;
+  match->q = chr_end;
+  match->i = start;
+  match->j = end;
+  match->mat = mat;
+  match->mis = mis;
+  match->ins = ins;
+  match->del = del;
+  match->subject = chr_idx;
+
+  if (match->al){
+    wrapAlignment(match->al);
+    FREEMEMORY(space, match->al);
+  }
+
+  match->al = al;
+  match->noofmatematches = 0;
+  match->skip = 0;
+  match->mateminedist = -1;
+
+  for(i=0; i < 4; i++) {
+    if(match->mates[i].al != NULL) {
+      wrapAlignment(match->mates[i].al);
+      /* release the object itself, not just its contents */
+      FREEMEMORY(space, match->mates[i].al);
+    }
+    match->mates[i].scr = 0;
+    match->mates[i].al = NULL;
+    match->mates[i].isset = 0;
+  }
+
+  match->fragno = 0;
+  match->previdx = -1;
+  match->prevpos = -1;
+  match->nextidx = -1;
+  match->nextpos = -1;
+
+  return list;
+}
+
+
+/*------------------------------- se_kdSetMate -------------------------------
+ *
+ * @brief store a mate alignment in one of the four mate slots of a match
+ * @author Steve Hoffmann
+ *
+ * The slot is (downstream << 1) | rc; both arguments must be 0 or 1.
+ * The mate score is mat - (mis+ins+del) as counted from the alignment;
+ * the `edist` argument itself is not used.
+ *
+ * If the slot already holds an alignment with a non-zero score, the new
+ * mate only replaces it when its score is strictly higher; otherwise the
+ * new alignment `al` is wrapped and released and nothing changes. When
+ * the old alignment is replaced it is now released as well (it used to
+ * be wrapped only, which leaked the Alignment object).
+ *
+ * @return match->mateminedist after the update
+ */
+
+Uint
+se_kdSetMate(void *space, gmatch_t *match,
+    Uint chr_idx, Uint chr_start, Uint chr_end, Uint edist,
+    Alignment *al, unsigned char downstream, unsigned char rc)
+{
+  int scr=0;
+  unsigned char pos=0;
+  gmate_t *mates;
+  Uint mat, mis, del, ins;
+
+
+  assert (downstream < 2 && rc < 2);
+  pos = (downstream << 1) | rc;
+  countEops(al, &mat, &mis, &ins, &del);
+
+  scr = mat;
+  scr -= mis+ins+del;
+
+  mates = match->mates;
+
+  if(mates[pos].isset && mates[pos].al != NULL && mates[pos].scr != 0) {
+    if(scr > mates[pos].scr) {
+      /* the old mate loses: drop it completely before overwriting */
+      wrapAlignment(mates[pos].al);
+      FREEMEMORY(space, mates[pos].al);
+    } else {
+      /* the old mate wins: the new alignment is discarded */
+      wrapAlignment(al);
+      FREEMEMORY(space, al);
+      return match->mateminedist;
+    }
+  }
+
+  mates[pos].isset = 1;
+  mates[pos].p = chr_start;
+  mates[pos].q = chr_end;
+  mates[pos].scr = scr;
+  mates[pos].mat = mat;
+  mates[pos].mis = mis;
+  mates[pos].ins = ins;
+  mates[pos].del = del;
+  mates[pos].subject = chr_idx;
+  //changed edist to mis + ins + del
+  mates[pos].edist = mis+ins+del;
+  mates[pos].al = al;
+
+  /* NOTE(review): noofmatematches is not changed in this function;
+   * while it stays 0 the minimum is overwritten on every call - confirm
+   * that the caller maintains the counter */
+  if((mis+ins+del) < match->mateminedist || !match->noofmatematches) {
+    match->mateminedist = mis+ins+del;
+  }
+
+  return match->mateminedist;
+}
+
+
+/*--------------------------------- initMatch ---------------------------------
+ *
+ * @brief reset a match record to a defined empty state
+ * @author Steve Hoffmann
+ *
+ * All counters and coordinates are set to 0, all pointers to NULL, the
+ * four mate slots are zeroed and the neighbour indices/positions as
+ * well as mateminedist are set to -1. The fields j, evalue,
+ * noofmatematches, mateminedist and the ref/desc fields used to be left
+ * uninitialised here; they now get the same values se_kdMatchListAdd
+ * assigns to a fresh match.
+ */
+
+ inline void
+initMatch (gmatch_t* m)
+{
+  m->i = 0;
+  m->j = 0;
+  m->p = 0;
+  m->q = 0;
+  m->scr = 0;
+  m->evalue = 0;
+  m->mat = 0;
+  m->mis = 0;
+  m->del = 0;
+  m->ins = 0;
+  m->subject = 0;
+  m->edist = 0;
+  m->al = NULL;
+  m->skip = 0;
+  m->noofmatematches = 0;
+  m->mateminedist = -1;
+
+  memset(&m->mates, 0, sizeof(gmate_t)*4);
+
+  m->fragno = 0;
+  m->previdx = -1;
+  m->prevpos = -1;
+  m->prevflags = 0;
+  m->nextidx = -1;
+  m->nextpos = -1;
+  m->nextflags = 0;
+
+  m->prevseqstart = 0;
+  m->prevseqrefdesc = NULL;
+  m->nextseqstart = 0;
+  m->nextseqrefdesc = NULL;
+  m->refdesc = NULL;
+  m->refdesclen = 0;
+  m->refseq = NULL;
+
+  return ;
+}
+
+/*
+ * initRead
+ *
+ * Puts a read record into an empty state: stores the read id, clears
+ * the per-strand match arrays and counters and - new - also zeroes the
+ * totals noofmatches and noofmatepairs, which were left uninitialised
+ * until se_setMatches ran on the read.
+ */
+ void
+initRead(gread_t *r, Uint readid)
+{
+  r->id = readid;
+  r->n[MINUSSTRAND] = 0;
+  r->n[PLUSSTRAND] = 0;
+  r->matches[MINUSSTRAND] = NULL;
+  r->matches[PLUSSTRAND] = NULL;
+  r->noofmatches = 0;
+  r->noofmatepairs = 0;
+
+  return;
+}
+
+/*
+ * initGmap
+ *
+ * Initialises a map: no reads attached yet, with the given sequence
+ * collection and position offset stored for later use.
+ */
+ void
+initGmap(Gmap *map, MultiCharSeq *mseq, Uint offset)
+{
+  /* start without any reads */
+  map->reads = NULL;
+  map->noofreads = 0;
+
+  map->mapoffset = offset;
+  map->mseq = mseq;
+}
+
+
+
+/*
+ * setReads
+ *
+ * Attaches an array of `noofreads` reads to the map. The pointer is
+ * stored as is; nothing is copied.
+ */
+inline void
+setReads(Gmap *map, gread_t *r, Uint noofreads){
+  map->noofreads = noofreads;
+  map->reads = r;
+}
+
+
+
+/*---------------------------- bl_gmatchlistInit -----------------------------
+ *
+ * @brief allocate an empty match list with two (per-strand) slots
+ * @author Steve Hoffmann
+ *
+ * minedist and mateminedist start at the given maxima, pairminedist at
+ * their sum. The list is released with bl_gmatchlistDestruct.
+ */
+
+gmatchlist_t*
+bl_gmatchlistInit(void *space, int maxedist, int matemaxedist) {
+
+  gmatchlist_t *fresh;
+  Uint strand;
+
+  fresh = ALLOCMEMORY(space, NULL, gmatchlist_t, 1);
+  fresh->matches = ALLOCMEMORY(space, NULL, gmatch_t*, 2);
+  fresh->n = ALLOCMEMORY(space, NULL, Uint, 2);
+
+  for(strand = 0; strand < 2; strand++) {
+    fresh->matches[strand] = NULL;
+    fresh->n[strand] = 0;
+  }
+
+  fresh->minedist = maxedist;
+  fresh->mateminedist = matemaxedist;
+  fresh->pairminedist = maxedist+matemaxedist;
+
+  return fresh;
+}
+
+/*-------------------------- bl_gmatchlistDestruct ---------------------------
+ *
+ * @brief release a match list including all alignments it refers to
+ * @author Steve Hoffmann
+ *
+ * For every match of both slots the mate alignments and the match
+ * alignment are wrapped and released, then the match arrays, the
+ * counter array and the list itself.
+ */
+
+void
+bl_gmatchlistDestruct(void *space, gmatchlist_t *list){
+  Uint strand, idx, slot;
+  gmatch_t *cur;
+
+  for(strand = 0; strand < 2; strand++) {
+    /* nothing was ever added to this slot */
+    if(!list->matches[strand]) {
+      continue;
+    }
+
+    for(idx = 0; idx < list->n[strand]; idx++) {
+      cur = &list->matches[strand][idx];
+
+      for(slot = 0; slot < 4; slot++) {
+        if(cur->mates[slot].al) {
+          wrapAlignment(cur->mates[slot].al);
+          FREEMEMORY(space, cur->mates[slot].al);
+        }
+      }
+
+      if(cur->al) {
+        wrapAlignment(cur->al);
+        FREEMEMORY(space, cur->al);
+        cur->al = NULL;
+      }
+    }
+
+    FREEMEMORY(space, list->matches[strand]);
+  }
+
+  FREEMEMORY(space, list->matches);
+  FREEMEMORY(space, list->n);
+  FREEMEMORY(space, list);
+}
+
+
+/*----------------------------- genericOutput --------------------------------
+ *
+ * @brief write the fields of a string list in the order given by a format
+ * @author Christian Otto
+ *
+ * FORMAT[rep_type] is read as a list of indices into `list` that ends
+ * at the first entry not greater than -1. The selected strings are
+ * written to `dev` separated by SEPARATOR and followed by the single
+ * character `lf`.
+ */
+
+void
+genericOutput (FILE *dev, char **list, Uint rep_type, char lf){
+  Uint col;
+
+  for(col = 0; FORMAT[rep_type][col] > -1; col++) {
+    /* separator goes between fields, not in front of the first one */
+    if(col != 0) {
+      fprintf(dev, "%s", SEPARATOR);
+    }
+    fprintf(dev, "%s", list[FORMAT[rep_type][col]]);
+  }
+
+  fprintf(dev, "%c", lf);
+}
+
+
+/*---------------------------- reportSplicedMatch ----------------------------
+ *
+ * @brief reporting spliced matches
+ * @author Steve Hoffmann
+ *
+ * Writes one line per partial alignment j. Each line holds the query
+ * description, the overall edist/coverage/number of alignments, the
+ * coordinates of alignment j and then the coordinates of all partial
+ * alignments. If nfo->align is set the alignments themselves follow.
+ * `score` is not used.
+ *
+ * Output goes to nfo->splitdev if it is set (guarded by nfo->mtx3 when
+ * more than one thread runs), otherwise to the file bin selected from
+ * nfo->splitbins. What was locked in an iteration is remembered, so the
+ * unlock always mirrors the lock; before, the bin unlock depended only
+ * on nfo->splitbins and was called with a NULL bin whenever splitdev
+ * and splitbins were both set. Uint values are printed with %u.
+ */
+
+void
+reportSplicedMatch(void *space, char* qrydesc, MultiCharSeqAlignment *mcsa,
+    Uint noofaligns, Uint coverage, Uint edist, int score, segemehl_t *nfo) {
+
+  Uint i, j, ulen, vlen, ustart, vstart;
+  FILE *dev = NULL;
+  bl_fileBin_t *fx = NULL;
+  unsigned char mtxlocked = 0;
+  char strands[]= {'+','-'};
+  char alignlf= '\n';
+
+  for(j=0; j < noofaligns; j++) {
+
+    /* nothing is held at the start of an iteration */
+    fx = NULL;
+    mtxlocked = 0;
+
+    ulen = getUalignlen(mcsa[j].al);
+    vlen = getValignlen(mcsa[j].al);
+    ustart = mcsa[j].al->uoff;
+    vstart = mcsa[j].al->voff;
+
+    if (nfo->splitdev != NULL) {
+      dev = nfo->splitdev;
+      if (nfo->threadno > 1) {
+        pthread_mutex_lock(nfo->mtx3);
+        mtxlocked = 1;
+      }
+    } else {
+      fx = bl_fileBinsDomainGetBin(nfo->splitbins, mcsa[j].refdesc,
+          mcsa[j].refstart - mcsa[j].substart + vstart + 1);
+      bl_fileBinsLock(fx);
+      dev=bl_fileBinsOpen(space, fx, "w");
+    }
+
+    fprintf(dev, "%s\t%u\t%u\t%u\t", qrydesc, edist, coverage, noofaligns);
+    fprintf(dev, "%u\t%u\t%u\t%u\t%u\t%u\t%c\t%s\t",
+        j,
+        getEdist(mcsa[j].al),
+        ustart, ustart+ulen,
+        mcsa[j].refstart - mcsa[j].substart + vstart +1,
+        mcsa[j].refstart - mcsa[j].substart + vstart +vlen,
+        strands[mcsa[j].strand],
+        mcsa[j].refdesc);
+
+    for (i = 0; i < noofaligns; i++) {
+      ulen = getUalignlen(mcsa[i].al);
+      vlen = getValignlen(mcsa[i].al);
+      ustart = mcsa[i].al->uoff;
+      vstart = mcsa[i].al->voff;
+
+      /* on strand index 1 the query start is counted from the other end */
+      if (mcsa[i].strand == 1) {
+        ustart = mcsa[i].qrylen-ustart-ulen;
+      }
+
+      fprintf(dev, "%u\t%u\t%u\t%u\t%u\t%c\t%s\t",
+          getEdist(mcsa[i].al),
+          ustart, ustart+ulen,
+          mcsa[i].refstart - mcsa[i].substart + vstart +1,
+          mcsa[i].refstart - mcsa[i].substart + vstart +vlen,
+          strands[mcsa[i].strand],
+          mcsa[i].refdesc);
+    }
+
+
+    if (nfo->align){
+      if(nfo->order) {
+        /* character 7 replaces the line feed inside the record */
+        alignlf = 7;
+        fprintf(dev, "%c", alignlf);
+      } else {
+        fprintf(dev, "\n");
+      }
+      for(i = 0; i < noofaligns; i++) {
+        showAlignLF(mcsa[i].al, dev, alignlf);
+        fprintf(dev, "%c", alignlf);
+      }
+    }
+
+    fprintf(dev,"\n");
+    fflush(dev);
+
+    if (mtxlocked) {
+      pthread_mutex_unlock(nfo->mtx3);
+    }
+
+    if (fx != NULL) {
+      bl_fileBinsUnlock(fx);
+    }
+  }
+
+  return ;
+}
+
+
+/*-------------------------------- getDevice ---------------------------------
+ *
+ * @brief get the device
+ * @author Steve Hoffmann
+ *
+ * Without file bins (nfo->bins not set) nfo->dev is returned and *fx is
+ * left untouched. Otherwise a bin is selected, stored in *fx, locked and
+ * opened in "w" mode; the bin is still locked when the function returns.
+ * The bin is selected by (chr, pos), or - when nfo->bisulfitemerging is
+ * set - by the decimal string of (nfo->bisulfite + 1) % 2 together with
+ * nfo->threadid.
+ *
+ * The key string is now formatted with snprintf into a local buffer;
+ * the former heap buffer was used without checking the calloc result.
+ */
+
+FILE*
+getDevice (void *space, char *chr, Uint pos, bl_fileBin_t **fx, segemehl_t *nfo)
+{
+  char bisulfite[MAX_INT_LENGTH+1];
+
+  if(!nfo->bins) {
+    return nfo->dev;
+  }
+
+  if (!nfo->bisulfitemerging){
+    *fx = bl_fileBinsDomainGetBin(nfo->bins, chr, pos);
+  } else {
+    snprintf(bisulfite, sizeof(bisulfite), "%u", (nfo->bisulfite + 1) % 2);
+    *fx = bl_fileBinsDomainGetBin(nfo->bins, bisulfite, nfo->threadid);
+  }
+
+  bl_fileBinsLock(*fx);
+  return bl_fileBinsOpen(space, *fx, "w");
+}
+
+
+/*-------------------------------- getSAMTags --------------------------------
+ *
+ * @brief build the tab-separated optional SAM tag string (NM, MD, NH, XI, ...) for one alignment; reportMatch releases the returned buffer after use
+ * @author Steve Hoffmann
+ *
+ */
+
+
+char*
+getSAMTags(Gmap *map, Uint ustart, Uint uend, Uint uno, Alignment *al,
+ Uint edist, Uint previdx, Uint prevpos, char prevflags,
+ Uint nextidx, Uint nextpos, char nextflags, Uint matchid,
+ Uint noofmatches, Uint noofsplits, char pStatChr, char *meopstr, segemehl_t *nfo, gmatch_t *match) {
+
+ char *tag,
+ *md,
+ *refdesc=NULL;
+ Uint ptr=0, /* current length of tag, i.e. the offset at which the next piece is written */
+ len, mis,
+ prevseqstart = 0,
+ nextseqstart = 0;
+
+ md = mdstring(al, 0); /* released again right before returning */
+
+
+ if(previdx != -1 || nextidx != -1 ) { /* -1 is the "no neighbour" value initMatch stores in previdx/nextidx */
+ //only one split alignment per read
+ len = snprintf(NULL, 0, "NM:i:%d\tMD:Z:%s\tNH:i:1\tXI:i:%d\tXL:i:%d", edist, md, matchid, noofsplits); /* first pass only measures the length */
+ tag = ALLOCMEMORY(space, NULL, char, len+1); /* NOTE(review): `space` is neither a parameter nor a local here - this only compiles if ALLOCMEMORY ignores its first argument; confirm */
+ snprintf(tag, len+1, "NM:i:%d\tMD:Z:%s\tNH:i:1\tXI:i:%d\tXL:i:%d", edist, md, matchid, noofsplits);
+ ptr += len;
+
+ } else {
+
+ len = snprintf(NULL, 0, "NM:i:%d\tMD:Z:%s\tNH:i:%d\tXI:i:%d", edist, md, noofmatches, matchid);
+ tag = ALLOCMEMORY(space, NULL, char, len+1);
+ snprintf(tag, len+1, "NM:i:%d\tMD:Z:%s\tNH:i:%d\tXI:i:%d", edist, md, noofmatches, matchid);
+ ptr += len;
+ }
+
+ if (nfo->SAMmeop) { /* optional XE tag carrying meopstr */
+ len = snprintf(NULL, 0, "\tXE:Z:%s", meopstr);
+ tag = ALLOCMEMORY(space, tag, char, ptr+len+1); /* grow the buffer for the appended piece plus NUL */
+ snprintf(&tag[ptr], len+1, "\tXE:Z:%s", meopstr);
+ ptr += len;
+ }
+
+ if(nfo->SAMpairstat) { /* optional XA tag carrying the pair status character */
+ tag = ALLOCMEMORY(space, tag, char, ptr+8); /* 7 characters plus the terminating NUL */
+ snprintf(&tag[ptr], 8, "\tXA:Z:%c", pStatChr);
+ ptr += 7;
+ }
+
+ if (nfo->bisulfiterun == 1){ /* XB tag: run 1 is labelled CT, run 2 is labelled GA */
+ len = snprintf(NULL, 0, "\tXB:Z:F%u/CT", nfo->bisulfiteprotocol);
+ tag = ALLOCMEMORY(space, tag, char, ptr+len+1);
+ snprintf(&tag[ptr], len+1, "\tXB:Z:F%u/CT", nfo->bisulfiteprotocol);
+ ptr += len;
+ } else if (nfo->bisulfiterun == 2){
+ len = snprintf(NULL, 0, "\tXB:Z:F%u/GA", nfo->bisulfiteprotocol);
+ tag = ALLOCMEMORY(space, tag, char, ptr+len+1);
+ snprintf(&tag[ptr], len+1, "\tXB:Z:F%u/GA", nfo->bisulfiteprotocol);
+ ptr += len;
+ }
+
+ if (nfo->bisulfite){
+ /*
+ * get bisulfite mismatches (hence ones that could
+ * be explained by the bisulfite treatment
+ */
+ mis = getBisulfiteMismatches(al, nfo->bisulfite);
+ len = snprintf(NULL, 0, "\tXD:i:%u", mis);
+ tag = ALLOCMEMORY(space, tag, char, ptr+len+1);
+ snprintf(&tag[ptr], len+1, "\tXD:i:%u", mis);
+ ptr += len;
+
+ /*
+ * get wrong-strand bisulfite mismatches (hence ones
+ * that could be explained by the bisulfite treatment
+ * but ONLY on the other genomic strand (i.e. G/A
+ * mismatches in C/T matching run or vice versa)
+ * => used to identify wrong strand matches
+ */
+ mis = getWrongStrandBisulfiteMismatches(al, nfo->bisulfite);
+ len = snprintf(NULL, 0, "\tXF:i:%u", mis);
+ tag = ALLOCMEMORY(space, tag, char, ptr+len+1);
+ snprintf(&tag[ptr], len+1, "\tXF:i:%u", mis);
+ ptr += len;
+ }
+
+ if(previdx != -1 || nextidx != -1) { /* split alignments additionally get the XX, XY and XQ tags */
+ len = snprintf(NULL, 0, "\tXX:i:%d\tXY:i:%d\tXQ:i:%d", ustart, uend, uno);
+ tag = ALLOCMEMORY(space, tag, char, ptr+len+1);
+ snprintf(&tag[ptr], len+1, "\tXX:i:%d\tXY:i:%d\tXQ:i:%d", ustart, uend, uno);
+ ptr += len;
+ }
+
+ if(previdx != -1) { /* describe the previous fragment: XP, XU, XS */
+ if(map && map->mseq) {
+ if(previdx > 0)
+ prevseqstart = map->mseq->markpos[previdx-1]+1; /* only this statement is guarded by the if above */
+ refdesc = ((CharSequence*)map->mseq->ref[previdx].ref)->description;
+ } else { /* no sequence collection available: fall back to the values stored in the match */
+ prevseqstart = match->prevseqstart;
+ refdesc = match->prevseqrefdesc;
+ }
+ len = snprintf(NULL, 0, "\tXP:Z:%s\tXU:i:%d\tXS:i:%d", refdesc, prevpos-prevseqstart, prevflags);
+ tag = ALLOCMEMORY(space, tag, char, ptr+len+1);
+ snprintf(&tag[ptr], len+1,"\tXP:Z:%s\tXU:i:%d\tXS:i:%d", refdesc, prevpos-prevseqstart, prevflags);
+ ptr += len;
+ }
+
+ if(nextidx != -1) { /* describe the next fragment: XC, XV, XT */
+ if(map && map->mseq) {
+ if(nextidx > 0)
+ nextseqstart = map->mseq->markpos[nextidx-1]+1; /* only this statement is guarded by the if above */
+ refdesc = ((CharSequence*) map->mseq->ref[nextidx].ref)->description;
+ } else {
+ nextseqstart = match->nextseqstart;
+ refdesc = match->nextseqrefdesc;
+ }
+ len = snprintf(NULL, 0, "\tXC:Z:%s\tXV:i:%d\tXT:i:%d", refdesc, nextpos-nextseqstart, nextflags);
+ tag = ALLOCMEMORY(space, tag, char, ptr+len+1);
+ snprintf(&tag[ptr], len+1,"\tXC:Z:%s\tXV:i:%d\tXT:i:%d", refdesc, nextpos-nextseqstart, nextflags);
+ ptr += len;
+ }
+
+ tag[ptr] = 0; /* every snprintf above already terminated the string at this offset */
+ FREEMEMORY(space, md);
+ return tag;
+}
+
+
+/*--------------------------- reportMatchSetFlags ----------------------------
+ *
+ * @brief set the flags
+ * @author Steve Hoffmann
+ *
+ * ORs the flag bits that follow from the pair status into *flag (record
+ * being reported) and *mateflag (its partner record) and stores the
+ * one-letter status code in *pStatChr. Bit 0 ("paired in sequencing")
+ * is always set in both. *noofmatches is overwritten with noofmatepairs
+ * for the three PAIR* states and with 1 for records flagged as split
+ * hit; in all other cases it is left alone. A status not listed below
+ * leaves the status character at ' '.
+ *
+ * NOTE(review): bit 8 is called "split hit" in this file; the SAM
+ * specification names 0x100 "secondary alignment" - confirm intent.
+ */
+
+void
+reportMatchSetFlags(matchstatus_t pStat, char isMate, Uint noofmatepairs,
+    Uint *flag, Uint *mateflag, char *pStatChr, Uint *noofmatches) {
+
+  enum {
+    RMSF_PAIRED   = (1 << 0),  /* is paired in sequencing */
+    RMSF_PROPER   = (1 << 1),  /* set for all PAIR* states */
+    RMSF_UNMAPPED = (1 << 3),  /* partner is unmapped */
+    RMSF_FIRST    = (1 << 6),  /* first in pair */
+    RMSF_SECOND   = (1 << 7),  /* second in pair */
+    RMSF_SPLIT    = (1 << 8)   /* split hit */
+  };
+
+  *flag |= RMSF_PAIRED;
+  *mateflag |= RMSF_PAIRED;
+  *pStatChr = ' ';
+
+  switch(pStat) {
+
+    /* only the mate is reported */
+    case MATE:
+      *flag |= (RMSF_UNMAPPED | RMSF_SECOND);
+      *pStatChr = 'M';
+      break;
+
+    /* only the query is reported */
+    case QUERY:
+      *flag |= (RMSF_UNMAPPED | RMSF_FIRST);
+      *pStatChr = 'Q';
+      break;
+
+    /* both mapped: properly (P) or separately (p) */
+    case PAIR:
+    case PAIR_INS:
+      *flag |= (RMSF_FIRST | RMSF_PROPER);
+      *mateflag |= (RMSF_SECOND | RMSF_PROPER);
+      *pStatChr = (pStat == PAIR) ? 'P' : 'p';
+      *noofmatches = noofmatepairs;
+      break;
+
+    /* both mapped in reverse order */
+    case PAIR_REV:
+      *flag |= (RMSF_SECOND | RMSF_PROPER);
+      *mateflag |= (RMSF_FIRST | RMSF_PROPER);
+      *pStatChr = 'R';
+      *noofmatches = noofmatepairs;
+      break;
+
+    /* query spliced, mate full */
+    case QUERY_SPL_FULL_MATE:
+      if (isMate) {
+        *flag |= RMSF_SECOND;
+      } else {
+        *flag |= (RMSF_FIRST | RMSF_SPLIT);
+        *noofmatches = 1;
+      }
+      *pStatChr = 'S';
+      break;
+
+    /* query spliced, no mate */
+    case QUERY_SPL_NO_MATE:
+      *flag |= (RMSF_UNMAPPED | RMSF_FIRST | RMSF_SPLIT);
+      *noofmatches = 1;
+      *pStatChr = 'T';
+      break;
+
+    /* mate spliced, query full */
+    case MATE_SPL_FULL_QUERY:
+      if (isMate) {
+        *flag |= (RMSF_SECOND | RMSF_SPLIT);
+        *noofmatches = 1;
+      } else {
+        *flag |= RMSF_FIRST;
+      }
+      *pStatChr = 'U';
+      break;
+
+    /* mate spliced, no query */
+    case MATE_SPL_NO_QUERY:
+      *flag |= (RMSF_UNMAPPED | RMSF_SECOND | RMSF_SPLIT);
+      *noofmatches = 1;
+      *pStatChr = 'V';
+      break;
+
+    /* both spliced */
+    case PAIR_SPL:
+      *flag |= (isMate) ? RMSF_SECOND : RMSF_FIRST;
+      *flag |= RMSF_SPLIT;
+      *noofmatches = 1;
+      *pStatChr = 'X';
+      break;
+
+    default:
+      break;
+  }
+}
+
+
+/*------------------------------- reportMatch --------------------------------
+ *
+ * @brief reports a match to device with different output formats
+ * @author Steve Hoffmann
+ *
+ */
+
+inline Uint
+reportMatch (void *space, Gmap *map, fasta_t *queries,
+ segemehl_t *nfo, matchstatus_t pStat,
+ unsigned char isMate){
+
+ int i, k, j, l, u, seqlen, qrylen, matedesclen, matelen, matchid,
+ mateseqlen, desclen;
+ Uint off, clipoff=0, mateclipoff=0, seqstart=0, mateseqstart=0, flag=0,
+ mateflag=0, noofmatepairs=0, noofmatches=0,
+ lclip=0, rclip=0, lclipf=0, rclipf=0, matelclip=0, materclip=0, refdesclen=0, materefdesclen=0,
+ fraglen=0, quallen = 0;
+
+ gread_t *reads, *read;
+ gmatch_t *match;
+ gmate_t *mates;
+ Alignment *al;
+
+ FILE *dev=NULL, *matedev=NULL;
+ char *qry,
+ *qual,
+ *description,
+ *refdesc,
+ *refseq,
+ *materefdesc,
+ *matedesc,
+ *materefseq,
+ *mate,
+ *matequal,
+ *meopstr,
+ *matemeopstr,
+ *cigar,
+ *matecigar,
+ *tmp,
+ *tag,
+ strands[2] = {'+','-'},
+ strandchr,
+ matestrandchr,
+ pStatChr = 'Q',
+ alignlf = '\n',
+ lf = '\n',
+ **list;
+ unsigned char mateReported=0;
+ Uint matereport=0; Uint report=0;
+ bl_fileBin_t *fx = NULL, *mfx = NULL;
+
+
+ if (!nfo->bins && (nfo->threadno > 1)) {
+ pthread_mutex_lock(nfo->mtx);
+ }
+
+ if((nfo->order || nfo->bisulfitemerging) && nfo->align) {
+ lf = 7;
+ }
+
+ off = map->mapoffset;
+ reads = map->reads;
+ dev = nfo->dev;
+
+
+ for(i = 0; i < map->noofreads; i++) {
+ read = &reads[i];
+
+ noofmatches = read->noofmatches;
+ noofmatepairs = read->noofmatepairs;
+ matchid = 0;
+ //fprintf(stderr, "\nread noofmatches:%d, noofmatepairs:%d\n", noofmatches, noofmatepairs);
+
+ for (k = 0; k <= 1; k++) {
+
+ //fprintf(stdout, "k:%d, read->n[k]=%d\n", k, read->n[k]);
+ for(j = 0; j < read->n[k]; j++) {
+
+ if(nfo->maxout > 0 && (report >= nfo->maxout ||
+ matereport >= nfo->maxout))
+ continue;
+
+ mateReported = 0;
+ tmp = NULL;
+ flag = 0;
+ mateflag = 0;
+ seqstart = 0;
+
+ match = &read->matches[k][j];
+
+ if(match->subject > 0 && map->mseq) {
+ seqstart = map->mseq->markpos[match->subject-1]+1;
+ }
+
+ if (!isMate){
+ bl_fastaGetClipPos(queries, read->id, &lclip, &rclip);
+ // if soft-clipping
+ if (!nfo->hardclip) bl_fastaSetClip(queries, read->id, 0, 0);
+
+ qry = bl_fastaGetSequence(queries, read->id);
+ qual = bl_fastaGetQuality(queries, read->id);
+ qrylen = bl_fastaGetSequenceLength(queries, read->id);
+ }
+ else {
+ bl_fastaGetMateClipPos(queries, read->id, &lclip, &rclip);
+ // if soft-clipping
+ if (!nfo->hardclip) bl_fastaSetMateClip(queries, read->id, 0, 0);
+
+ qry = bl_fastaGetMate(queries, read->id);
+ qual = bl_fastaGetMateQuality(queries, read->id);
+ qrylen = bl_fastaGetMateLength(queries, read->id);
+ }
+
+ rclipf = rclip;
+ lclipf = lclip;
+
+ //fragment clipping
+ if(match->nextpos != -1 || match->prevpos != -1) {
+ rclipf = 0;
+ lclipf = 0;
+ }
+
+
+ seqlen = off+match->q - match->p;
+ al = match->al;
+
+ list = (char **) calloc(OUTLENGTH+1, sizeof(char *));
+
+ if(!isMate) {
+ desclen = bl_fastaGetDescriptionLength(queries, read->id)+1;
+ description = bl_fastaGetDescription(queries, read->id);
+ } else {
+ desclen = bl_fastaGetMateDescriptionLength(queries, read->id)+1;
+ description = bl_fastaGetMateDescription(queries, read->id);
+ }
+
+ sprintstr(&list[QRY_DESC], description, desclen);
+ //fprintf(stdout, "CHECKPOINT 1\n");
+ //use alignment instead
+ if (seqstart > off+match->p) continue;
+ strandchr = strands[k];
+
+ /*default quality string is * */
+ if (qual == NULL){
+ list[QUAL] = calloc(2, 1);
+ list[QUAL][0] = '*';
+ } else {
+ if(match->previdx != -1 || match->nextidx != -1 ) // || qrylen != match->j - match->i + 1)
+ {
+ quallen = match->j - match->i + 1;
+ if(lclip+ match->i + quallen > qrylen) {
+ DBG("warning: wrong fragment alignment for '%s'! skipping alignment.", description);
+ continue;
+ }
+
+ sprintstr(&list[QUAL], &qual[lclip+match->i], quallen);
+ } else {
+ quallen = qrylen;
+ sprintstr(&list[QUAL], qual , quallen);
+ }
+ }
+
+ if (strandchr == '-') {
+ meopstr = multieopstring(al, rclipf, lclipf, 0);
+ cigar = cigarstring(al, rclipf, lclipf, (nfo->hardclip) ? 'H':'S', 0);
+
+ if(match->previdx != -1 || match->nextidx != -1) {
+
+ fraglen =match->j - match->i + 1;
+ if(lclip+ match->i + fraglen > qrylen) {
+ DBG("warning: wrong fragment alignment for '%s'! skipping alignment.", description);
+ continue;
+ }
+
+ tmp = charIUPACcomplement(space, &qry[lclip+match->i], fraglen);
+ } else {
+ fraglen = qrylen;
+ tmp = charIUPACcomplement(space, qry, fraglen);
+ }
+
+ sprintstr(&list[QRY_SEQ], tmp, fraglen);
+ free(tmp);
+
+ if (qual != NULL){
+ list[QUAL] = strrev(list[QUAL], quallen);
+ }
+
+ flag |= (1 << 4);
+ mateflag |= (1 << 5);
+
+
+ clipoff = (nfo->hardclip) ? rclipf : 0;
+
+ } else {
+
+ meopstr = multieopstring(al, lclipf, rclipf, 0);
+ cigar = cigarstring(al, lclipf, rclipf, (nfo->hardclip) ? 'H':'S', 0);
+
+ if(match->previdx != -1 || match->nextidx != -1 ) //|| qrylen != match->j - match->i +1)
+ {
+
+ fraglen = match->j - match->i + 1;
+ if(lclip+ match->i + fraglen > qrylen) {
+ DBG("warning: wrong fragment alignment for '%s'! skipping alignment.", description);
+ continue;
+ }
+
+ tmp = ALLOCMEMORY(space, NULL, char, fraglen + 1);
+ memmove(tmp, &qry[lclip+match->i], fraglen);
+ tmp[fraglen] = 0;
+ sprintstr(&list[QRY_SEQ], tmp, fraglen);
+ FREEMEMORY(space, tmp);
+ } else {
+ sprintstr(&list[QRY_SEQ], qry, qrylen);
+ }
+
+ clipoff = (nfo->hardclip) ? lclipf : 0;
+ }
+
+ /* restore clipping positions for next query match */
+ if (!isMate) {
+ if (!nfo->hardclip) bl_fastaSetClip(queries, read->id, lclip, rclip);
+ }
+ else {
+ if (!nfo->hardclip) bl_fastaSetMateClip(queries, read->id, lclip, rclip);
+ }
+
+ if(map->mseq) {
+ refdesc = ((CharSequence*)
+ map->mseq->ref[match->subject].ref)->description;
+ refdesclen = ((CharSequence*)
+ map->mseq->ref[match->subject].ref)->descrlen;
+ refseq = map->mseq->sequences+match->p;
+ } else {
+ refdesc = match->refdesc;
+ refdesclen = match->refdesclen;
+ refseq = match->refseq;
+ }
+
+ list[MEOP_STR] = meopstr;
+ list[SAM_CIGAR] = cigar;
+ sprintUint(&list[QRY_LEN], qrylen-1);
+ sprintUint(&list[SCR], match->scr);
+ sprintflt (&list[EVALUE], match->evalue);
+ sprintUint(&list[EDIST], match->edist);
+ sprintUint(&list[QRY_S], off+match->i);
+ sprintUint(&list[QRY_E], off+match->j);
+ sprintUint(&list[SEQ_S], off+match->p-seqstart+clipoff);
+ sprintUint(&list[SEQ_E], off+match->q-seqstart+clipoff);
+ sprintchar(&list[STRAND], strandchr);
+ sprintUint(&list[MAT], match->mat);
+ sprintUint(&list[MIS], match->mis);
+ sprintUint(&list[INS], match->ins);
+ sprintUint(&list[DEL], match->del);
+ sprintstr(&list[REF_SEQ], refseq, seqlen);
+ sprintUint(&list[NOOFMATCHES], noofmatches);
+ sprintstr (&list[SEQ_DESC], refdesc, refdesclen);
+ sprintstr(&list[SAM_QRY], list[QRY_DESC], desclen);
+ //strtok(list[SAM_QRY], "/");
+ sprintUint(&list[SAM_MAPQ], 255);
+ sprintchar(&list[SAM_MATE_REF], '*');
+ sprintUint(&list[MATE_SEQ_S], 0);
+ sprintUint(&list[SAM_ISIZE], 0);
+
+ //like here ...
+ if(bl_fastaHasMate(queries)) {
+ reportMatchSetFlags(pStat, isMate, noofmatepairs,
+ &flag, &mateflag, &pStatChr, &noofmatches);
+ }
+
+ sprintchar(&list[PAIR_STATUS], pStatChr);
+
+ /* if(noofmatepairs) {
+ //...nofmatches -> noofmatepairs here
+ tag = getSAMTags(map, off+match->i, off+match->j, match->fragno, match->al,
+ match->edist,
+ match->previdx, match->prevpos+off, match->prevflags,
+ match->nextidx, match->nextpos+off, match->nextflags,
+ noofmatepairs, pStatChr, meopstr, nfo);
+ } else {*/
+ tag = getSAMTags(map, off+match->i, off+match->j, match->fragno, match->al,
+ match->edist,
+ match->previdx, match->prevpos+off, match->prevflags,
+ match->nextidx, match->nextpos+off, match->nextflags, matchid,
+ noofmatches, read->noofmatches, pStatChr, meopstr, nfo, match);
+ //}
+
+ sprintstr(&list[TAG], tag, strlen(tag));
+ FREEMEMORY(space, tag);
+
+
+ if(bl_fastaHasMate(queries) &&
+ (pStat == PAIR || pStat == PAIR_REV || pStat == PAIR_INS)) {
+
+ if (!isMate){
+ bl_fastaGetMateClipPos(queries, read->id, &matelclip, &materclip);
+
+ //if soft-clipping
+ if (!nfo->hardclip){
+ bl_fastaSetMateClip(queries, read->id, 0, 0);
+ }
+ }
+ else {
+ bl_fastaGetClipPos(queries, read->id, &matelclip, &materclip);
+
+ //if soft-clipping
+ if (!nfo->hardclip){
+ bl_fastaSetClip(queries, read->id, 0, 0);
+ }
+ }
+
+ mates = match->mates;
+ if (!isMate){
+ matelen = bl_fastaGetMateLength(queries, read->id);
+ }
+ else {
+ matelen = bl_fastaGetSequenceLength(queries, read->id);
+ }
+
+
+ if(se_kdMatchHasMates(&read->matches[k][j])) {
+
+ for (u=0; u < 4 && !mateReported; u++) {
+
+
+ if(!mates[u].isset || mates[u].edist >
+ match->mateminedist) continue;
+
+ if(map->mseq) {
+ if (mates[u].subject > 0){
+ mateseqstart = map->mseq->markpos[mates[u].subject-1]+1;
+ }
+ else {
+ mateseqstart = 0;
+ }
+ materefseq = map->mseq->sequences + mates[u].p;
+ } else {
+ mateseqstart = 0;
+ materefseq = mates[u].materefseq;
+ }
+
+ if (!isMate){
+ matedesclen =
+ bl_fastaGetMateDescriptionLength(queries, read->id)+1;
+ matedesc = bl_fastaGetMateDescription(queries, read->id);
+ mate = bl_fastaGetMate(queries, read->id);
+ matequal = bl_fastaGetMateQuality(queries, read->id);
+ }
+ else {
+ matedesclen =
+ bl_fastaGetDescriptionLength(queries, read->id)+1;
+ matedesc = bl_fastaGetDescription(queries, read->id);
+ mate = bl_fastaGetSequence(queries, read->id);
+ matequal = bl_fastaGetQuality(queries, read->id);
+ }
+ mateseqlen = off+mates[u].q - mates[u].p;
+
+ sprintstr(&list[MATE_REF_SEQ], materefseq, mateseqlen);
+ //default quality string is *
+ if (matequal == NULL){
+ list[MATE_QUAL] = calloc(2, 1);
+ list[MATE_QUAL][0] = '*';
+ } else {
+ sprintstr(&list[MATE_QUAL], matequal, matelen);
+ }
+
+ /*use alignment instead*/
+ if (mateseqstart > off+mates[u].p) continue;
+
+ matestrandchr = strands[k];
+ if (u & 1) matestrandchr = strands[(~k)&1];
+
+ if (matestrandchr == '-') {
+ matemeopstr = multieopstring(mates[u].al, materclip, matelclip, 0);
+ matecigar = cigarstring(mates[u].al, materclip, matelclip, (nfo->hardclip)?'H':'S', 0);
+ tmp = charIUPACcomplement(space, mate, matelen);
+ sprintstr(&list[MATE_QRY_SEQ], tmp, matelen);
+ free(tmp);
+ if (matequal != NULL){
+ list[MATE_QUAL] = strrev(list[MATE_QUAL], matelen);
+ }
+ flag |= (1 << 5);
+ mateflag |= (1 << 4);
+ mateclipoff = (nfo->hardclip) ? rclip : 0;
+ } else {
+ matemeopstr = multieopstring(mates[u].al, matelclip, materclip, 0);
+ matecigar = cigarstring(mates[u].al, matelclip, materclip, (nfo->hardclip)?'H':'S', 0);
+ sprintstr(&list[MATE_QRY_SEQ], mate, matelen);
+ mateclipoff = (nfo->hardclip) ? lclip : 0;
+ }
+
+ list[MATE_MEOP] = matemeopstr;
+ list[SAM_MATE_CIGAR] = matecigar;
+
+ sprintUint(&list[MATE_LEN], matelen);
+ sprintUint(&list[MATE_SCR], mates[u].mat - mates[u].edist);
+ sprintflt (&list[MATE_EVALUE], 0.0);
+ sprintUint(&list[MATE_QRY_S], 1);
+ sprintUint(&list[MATE_QRY_E], matelen);
+
+ free(list[MATE_SEQ_S]);
+
+ if(map->mseq) {
+ materefdesc = ((CharSequence*)
+ map->mseq->ref[mates[u].subject].ref)->description;
+ materefdesclen = ((CharSequence*)
+ map->mseq->ref[mates[u].subject].ref)->descrlen;
+ } else {
+ materefdesc = mates[u].materefdesc;
+ materefdesclen = mates[u].materefdesclen;
+ }
+
+ sprintUint(&list[MATE_SEQ_S], off+mates[u].p - mateseqstart + mateclipoff);
+ sprintUint(&list[MATE_SEQ_E], off+mates[u].q - mateseqstart + mateclipoff);
+ sprintUint(&list[MATE_MAT], mates[u].mat);
+ sprintUint(&list[MATE_MIS], mates[u].mis);
+ sprintUint(&list[MATE_INS], mates[u].ins);
+ sprintUint(&list[MATE_DEL], mates[u].del);
+ sprintUint(&list[MATE_EDIST], mates[u].edist);
+ sprintchar(&list[MATE_STRAND], matestrandchr);
+ sprintstr (&list[MATE_SEQ_DESC], materefdesc, materefdesclen);
+ sprintUint(&list[MATE_NOOFMATCHES], noofmatepairs);
+ sprintstr (&list[MATE_DESC], matedesc, matedesclen-1);
+
+ sprintstr(&list[SAM_MATE_QRY], list[MATE_DESC], desclen);
+ //strtok(list[SAM_MATE_QRY], "/");
+
+ sprintUint(&list[SAM_FLAG], flag);
+ sprintUint(&list[SAM_MATE_FLAG], mateflag);
+ sprintUint(&list[SAM_MATE_MAPQ], 255);
+
+ if(strcmp(list[MATE_SEQ_DESC],list[SEQ_DESC])) {
+ free(list[SAM_MATE_REF]);
+
+ sprintstr(&list[SAM_MATE_REF], list[MATE_SEQ_DESC],
+ strlen(list[MATE_SEQ_DESC]));
+
+ sprintstr(&list[SAM_QRY_REF], list[SEQ_DESC],
+ strlen(list[SEQ_DESC]));
+
+ sprintint(&list[SAM_MATE_ISIZE], 0);
+
+ } else {
+
+ sprintf(list[SAM_MATE_REF],"=");
+ sprintchar(&list[SAM_QRY_REF], '=');
+ free(list[SAM_ISIZE]);
+
+ sprintint(&list[SAM_ISIZE],
+ (off+mates[u].p - mateseqstart + matelen)-
+ (off+match->p-seqstart));
+
+ sprintint(&list[SAM_MATE_ISIZE],
+ (off+match->p-seqstart)-
+ (off+mates[u].p - mateseqstart + matelen));
+ }
+
+ //noofmatematches -> noofmatepairs
+ //the number of splits should be 0 because
+ //split alignments are always single
+ tag = getSAMTags(map, 1, matelen, 1,
+ mates[u].al, mates[u].edist,
+ -1, -1, 0, -1, -1, 0, matchid, noofmatepairs, 0, pStatChr, matemeopstr, nfo, NULL);
+
+ sprintstr(&list[MATE_TAG], tag, strlen(tag));
+ FREEMEMORY(space, tag);
+
+
+ if(mfx) {
+ bl_fileBinsUnlock(mfx);
+ mfx = NULL;
+ }
+
+ if(nfo->rep_type == 5) {
+
+ dev = getDevice(space, refdesc, off+match->p-seqstart, &fx, nfo);
+ genericOutput(dev, list, 5, lf);
+ report++;
+
+ if(nfo->align) {
+ if(nfo->order || nfo->bisulfitemerging) {
+ alignlf = 7;
+ }
+ showAlignLF(match->al, dev, alignlf);
+ fprintf(dev, "\n");
+ }
+
+ if(nfo->bins && !nfo->bisulfitemerging) {
+ /*TODO: avoid lock!*/
+
+ mfx = bl_fileBinsDomainGetBin(nfo->bins, materefdesc,
+ off+mates[u].p - mateseqstart);
+
+ if(mfx != fx) {
+ bl_fileBinsUnlock(fx);
+ fx = NULL;
+ matedev = getDevice(space, materefdesc,
+ off+mates[u].p - mateseqstart, &mfx, nfo);
+ } else {
+ mfx = NULL;
+ matedev = dev;
+ }
+ } else {
+ matedev = dev;
+ }
+
+ genericOutput(matedev, list, 6, lf);
+ matereport++;
+
+ if(nfo->align) {
+ if(nfo->order || nfo->bisulfitemerging) {
+ alignlf = 7;
+ }
+ showAlignLF(mates[u].al, dev, alignlf);
+ fprintf(dev, "\n");
+ }
+ }
+
+ if(nfo->rep_type == 12) {
+
+ dev = getDevice(space, refdesc, off+match->p-seqstart, &fx, nfo);
+ genericOutput(dev, list, 11, lf);
+ report++;
+
+ if(nfo->align) {
+ if(nfo->order || nfo->bisulfitemerging) {
+ alignlf = 7;
+ }
+ showAlignLF(match->al, dev, alignlf);
+ fprintf(dev, "\n");
+ }
+
+ if(nfo->bins && !nfo->bisulfitemerging) {
+ /*TODO: avoid lock!*/
+
+ mfx = bl_fileBinsDomainGetBin(nfo->bins, materefdesc,
+ off+mates[u].p - mateseqstart);
+
+ if(mfx != fx) {
+ bl_fileBinsUnlock(fx);
+ fx = NULL;
+ matedev = getDevice(space, materefdesc,
+ off+mates[u].p - mateseqstart, &mfx, nfo);
+ } else {
+ mfx = NULL;
+ matedev = dev;
+ }
+ } else {
+ matedev = dev;
+ }
+
+ genericOutput(matedev, list, 13, lf);
+ matereport++;
+
+ if(nfo->align) {
+ if(nfo->order || nfo->bisulfitemerging) {
+ alignlf = 7;
+ }
+ showAlignLF(mates[u].al, dev, alignlf);
+ fprintf(dev, "\n");
+ }
+ }
+
+ if(nfo->rep_type == 15) {
+
+ dev = getDevice(space, refdesc, off+match->p-seqstart, &fx, nfo);
+ genericOutput(dev, list, 15, lf);
+ report++;
+
+ if(nfo->align) {
+ if(nfo->order || nfo->bisulfitemerging) {
+ alignlf = 7;
+ }
+ showAlignLF(match->al, dev, alignlf);
+ fprintf(dev, "\n");
+ }
+
+ if(nfo->bins && !nfo->bisulfitemerging) {
+ /*TODO: avoid lock!*/
+
+ mfx = bl_fileBinsDomainGetBin(nfo->bins, materefdesc,
+ off+mates[u].p - mateseqstart);
+
+ if(mfx != fx) {
+ bl_fileBinsUnlock(fx);
+ fx = NULL;
+ matedev = getDevice(space, materefdesc,
+ off+mates[u].p - mateseqstart, &mfx, nfo);
+ } else {
+ mfx = NULL;
+ matedev = dev;
+ }
+ } else {
+ matedev = dev;
+ }
+
+ genericOutput(matedev, list, 16, lf);
+ matereport++;
+
+ if(nfo->align) {
+ if(nfo->order || nfo->bisulfitemerging) {
+ alignlf = 7;
+ }
+ showAlignLF(mates[u].al, dev, alignlf);
+ fprintf(dev, "\n");
+ }
+ }
+ if (match->previdx == -1 && match->nextidx == -1){
+ matchid++;
+ }
+
+ mateReported = 1;
+
+ for (l=MATE_LEN; l <= MATE_QUAL; l++) {
+ if(list[l]) free(list[l]);
+ list[l] = NULL;
+ }
+
+ for(l=SAM_MATE_FLAG; l <= SAM_MATE_ISIZE; l++) {
+ if(list[l]) free(list[l]);
+ list[l] = NULL;
+ }
+ }
+ assert(mateReported);
+ }
+ /* restore clipping positions for next query match */
+ if (!isMate){
+ if (!nfo->hardclip){
+ bl_fastaSetMateClip(queries, read->id, matelclip, materclip);
+ }
+ }
+ else {
+ if (!nfo->hardclip){
+ bl_fastaSetClip(queries, read->id, matelclip, materclip);
+ }
+
+ }
+ } else {
+
+
+ dev = getDevice(space, refdesc, off+match->p-seqstart, &fx, nfo);
+ sprintUint(&list[SAM_FLAG], flag);
+ genericOutput(dev, list, nfo->rep_type, lf);
+ report++;
+ if (match->previdx == -1 && match->nextidx == -1){
+ matchid++;
+ }
+
+ if(match->al) {
+ if (nfo->align){
+ if (nfo->order || nfo->bisulfitemerging){
+ alignlf = 7;
+ }
+ showAlignLF(match->al, dev, alignlf);
+ fprintf(dev, "\n");
+ }
+ }
+ }
+
+ for (l = 0; l < OUTLENGTH; l++){
+ if(list[l]) free(list[l]);
+ }
+ free(list);
+
+ if(nfo->bins) {
+ if(fx) {
+ bl_fileBinsUnlock(fx);
+ fx = NULL;
+ }
+ if(mfx) {
+ bl_fileBinsUnlock(mfx);
+ mfx = NULL;
+ }
+ }
+ }
+ }
+ }
+
+ if (!nfo->bins && nfo->threadno > 1){
+ pthread_mutex_unlock(nfo->mtx);
+ }
+
+ if(report && matereport) return 3;
+ if(!report && matereport) return 2;
+ if(report && !matereport) return 1;
+
+ return 0;
+}
+
+
+
+/*------------------------------- se_SAMHeader -------------------------------
+ *
+ * @brief builds the SAM header: one @HD line, one @SQ line per reference
+ *        sequence and one @PG line; fields are joined by sep, lines end in lf
+ * @return calloc'ed string owned by the caller, NULL if allocation fails
+ * @author Steve Hoffmann
+ */
+
+char*
+se_SAMHeader (void *space, char **seq, Uint *seqlen,
+    Uint size, char *cmdline, char sep, char lf,
+    unsigned char sorted)
+{
+  Uint i, len=1000, curlen=0;
+  char *header;
+
+  /* 1000 bytes of slack cover the fixed @HD/@PG text and the terminator */
+  len += strlen(VERSION);
+  if(cmdline)
+    len += strlen(cmdline);
+
+  /* dry run: snprintf(NULL, 0, ...) returns the length of each @SQ line */
+  for(i=0; i < size; i++) {
+    len += snprintf(NULL, 0, "@SQ%cSN:%s%cLN:%u%c", sep, seq[i], sep, seqlen[i], lf);
+  }
+
+  header = calloc(len, sizeof(char));
+  if(!header) return NULL;
+
+  /* curlen tracks the write position, so the buffer is never rescanned
+   * (assumes sep and lf are not NUL, as in the calls visible in this file) */
+  curlen += sprintf(&header[curlen], "@HD%cVN:1.0", sep);
+
+  if(sorted)
+    curlen += sprintf(&header[curlen], "%cSO:coordinate", sep);
+
+  curlen += sprintf(&header[curlen], "%c", lf);
+
+  /* seqlen holds Uint values, which this file prints with %u elsewhere */
+  for(i=0; i < size; i++) {
+    curlen += sprintf(&header[curlen], "@SQ%cSN:%s%cLN:%u%c",
+        sep, seq[i], sep, seqlen[i], lf);
+  }
+
+  curlen += sprintf(&header[curlen], "@PG%cID:segemehl", sep);
+  curlen += sprintf(&header[curlen], "%cVN:%s", sep, VERSION);
+
+  if(cmdline)
+    curlen += sprintf(&header[curlen], "%cCL:%s", sep, cmdline);
+
+  /* the final lf ends the @PG line */
+  sprintf(&header[curlen], "%c", lf);
+
+  return header;
+}
+
+/*--------------------------- se_createChromDomains --------------------------
+ *
+ * @brief set up file bin domains from the descriptions and lengths of the
+ *        sequences in f; yields NULL if f has more than maxbins sequences
+ * @author Steve Hoffmann
+ */
+
+bl_fileBinDomains_t*
+se_createChromDomains (void *space, fasta_t *f, Uint avgbins, Uint maxbins,
+    char *filetemplate, Uint tmplen)
+{
+  Uint k = 0, noofseqs = f->noofseqs, sum = 0;
+  Uint *lengths;
+  char **names;
+  bl_fileBinDomains_t *res;
+
+  if(noofseqs > maxbins) {
+    return NULL;
+  }
+
+  names = ALLOCMEMORY(space, NULL, char*, noofseqs);
+  lengths = ALLOCMEMORY(space, NULL, Uint, noofseqs);
+
+  /* collect name and length of every sequence, summing up the lengths */
+  while(k < noofseqs) {
+    names[k] = bl_fastaGetDescription(f, k);
+    lengths[k] = bl_fastaGetSequenceLength(f, k);
+    sum += lengths[k];
+    k++;
+  }
+  res = bl_fileBinsDomainsInit(space, names, lengths, noofseqs, sum,
+      avgbins, maxbins, filetemplate, tmplen);
+  FREEMEMORY(space, names);
+  FREEMEMORY(space, lengths);
+  return res;
+}
+
+/*---------------------------- se_createChromBins ----------------------------
+ *
+ * @brief set up file bins for the sequences of f: their descriptions are
+ *        handed to bl_fileBinsAdd together with bl_fileBinCClassAssign;
+ *        yields NULL if f has more sequences than maxbins
+ * @author Steve Hoffmann
+ */
+
+bl_fileBins_t*
+se_createChromBins (void *space, fasta_t *f, int maxbins, char *template,
+    Uint tmplen)
+{
+  Uint k, noofseqs = f->noofseqs;
+  char **names;
+  bl_fileBins_t *res;
+
+  /* NOTE(review): Uint vs. int comparison, maxbins is converted to unsigned */
+  if(noofseqs > maxbins) {
+    return NULL;
+  }
+
+  res = ALLOCMEMORY(space, NULL, bl_fileBins_t, 1);
+  names = ALLOCMEMORY(space, NULL, char*, noofseqs);
+  bl_fileBinsInit(space, res);
+  for(k=0; k < noofseqs; k++) {
+    names[k] = bl_fastaGetDescription(f, k);
+  }
+  bl_fileBinsAdd(space, res, noofseqs, bl_fileBinCClassAssign, names, NULL,
+      template, tmplen);
+  FREEMEMORY(space, names);
+  return res;
+}
+
+/*-------------------------- se_createBisulfiteBins --------------------------
+ *
+ * @brief set up bin domains for matching runs and threads,
+ *        domain names are simply 0...(noofdomains-1) as strings;
+ *        each domain gets threadno bins, bin j is given the class [j,j]
+ * @author Christian Otto
+ */
+
+bl_fileBinDomains_t*
+se_createBisulfiteBins (void *space, Uint noofdomains,
+    Uint threadno, char *filetemplate, Uint tmplen){
+  Uint i, j, namelen;
+  bl_fileBinDomains_t *domains;
+  bl_fileBinClass_t *class;
+
+  domains = ALLOCMEMORY(space, NULL, bl_fileBinDomains_t, 1);
+  domains->noofdomains = noofdomains;
+  domains->exp = 0;
+  domains->domain = ALLOCMEMORY(space, NULL, bl_fileBinDomain_t, noofdomains);
+
+  for (i = 0; i < noofdomains; i++){
+    domains->domain[i].domainsize = threadno;
+    /* name is the decimal index i, sized exactly: digits plus '\0' */
+    namelen = snprintf(NULL, 0, "%u", i) + 1;
+    domains->domain[i].domainname = ALLOCMEMORY(space, NULL, char, namelen);
+    snprintf(domains->domain[i].domainname, namelen, "%u", i);
+
+    bl_fileBinsInit(space, &domains->domain[i].bins);
+    bl_fileBinsAdd(space, &domains->domain[i].bins, threadno, NULL, NULL, NULL,
+        filetemplate, tmplen);
+    domains->domain[i].bins.noofbins = threadno;
+
+    for (j = 0; j < domains->domain[i].bins.noofbins; j++){
+      class = ALLOCMEMORY(space, NULL, bl_fileBinClass_t, 1);
+      class->start = j;
+      class->end = j;
+      class->classname = NULL;
+      domains->domain[i].bins.b[j].id = class;
+    }
+  }
+  /*
+  DBG("domains: noofdomains=%u\texp=%u\n", domains->noofdomains, domains->exp);
+  for (i = 0; i < domains->noofdomains; i++){
+    DBG("domain %u: domainname=%s\tdomainsize=%u\tnoofbins=%u\n", i,
+        domains->domain[i].domainname, domains->domain[i].domainsize,
+        domains->domain[i].bins.noofbins);
+    for (j = 0; j < domains->domain[i].bins.noofbins; j++){
+      DBG("bin %u: filename=%s\tstart=%llu\tend=%llu\n", j, domains->domain[i].bins.b[j].fname,
+          domains->domain[i].bins.b[j].id->start, domains->domain[i].bins.b[j].id->end);
+    }
+  }*/
+  return domains;
+}
+
+/*----------------------------- se_defaultHeader ------------------------------
+ * @brief get default header: a newly allocated string with time stamp, file
+ *        names, search parameters and HEAD[info->rep_type]; the parts are
+ *        ended by lf, sep is not used
+ * @author Steve Hoffmann
+ */
+
+ char *
+se_defaultHeader (void *space, segemehl_t *info, char sep, char lf)
+{
+ char *buffer, *timestr;
+ struct tm *timeinfo;
+ time_t rawtime;
+
+ time(&rawtime);
+ timeinfo = localtime (&rawtime);
+ timestr = asctime(timeinfo);
+ timestr[strlen(timestr)-1] = 0; /* cut off the '\n' that ends asctime()'s string */
+ /* the header is limited to the 5000 bytes of buffer, snprintf cuts the rest */
+ buffer = ALLOCMEMORY(space, NULL, char, 5000);
+ memset(buffer, 0, 5000);
+ snprintf(buffer, 5000, "#segemehl %s%c#query: %s (%u seqs)%c#mate: %s%c#subject: %s\
+ %c#minsize=%d, diff_seed=%d, jump=%d, acc=%d, maxEvalue: %.5f, hitstrategy: %d\
+ %c#splitreads: %s%c%s%c",
+ timestr, lf, info->queryfilename, info->reads->noofseqs, lf, info->matefilename, lf,
+ info->idxfilename, lf, info->minsize, info->k_p, info->jump,
+ info->accuracy, info->maxevalue, info->bestonly, lf, info->splitfilebasename, lf,
+ HEAD[info->rep_type], lf);
+ return buffer; /* NOTE(review): %s gets matefilename/splitfilebasename as is - confirm they are never NULL */
+}
+
+/*------------------------- se_registerOutputDevice --------------------------
+ * @brief select the output device for matches and print header: opens
+ *        info->outfile (if set) as info->dev, then prints the SAM header
+ *        (rep_type 15) or the default header unless info->nohead is set
+ * @author Steve Hoffmann
+ */
+
+void
+se_registerOutputDevice(void *space, segemehl_t *info) {
+  Uint *dmlen, dmno, i;
+  char *buffer, **dms;
+
+  /* without an outfile info->dev is not touched here */
+  if(info->outfile) {
+    info->dev=fopen(info->outfile, "w");
+    if (info->dev == NULL) {
+      fprintf(stderr, "Couldn't open file '%s'. Exit forced.\n", info->outfile);
+      exit(EXIT_FAILURE);
+    }
+  }
+  if (info->nohead){
+    return;
+  }
+  /* with info->order: lines end in char 7, the header in char 29 plus '\n' */
+  if (info->rep_type != 15) {
+    if(info->order) {
+      buffer = se_defaultHeader(space, info, 8 ,7);
+      buffer[strlen(buffer)-1] = 29;
+    } else {
+      buffer = se_defaultHeader(space, info, '\t', '\n');
+    }
+  } else {
+    /* SAM: one description (char*) and one length (Uint) per sequence */
+    dmno = info->fasta->noofseqs;
+    dms = ALLOCMEMORY(space, NULL, char*, dmno);
+    dmlen = ALLOCMEMORY(space, NULL, Uint, dmno);
+
+    for(i=0; i < dmno; i++) {
+      dms[i] = bl_fastaGetDescription(info->fasta, i);
+      dmlen[i] = bl_fastaGetSequenceLength(info->fasta, i);
+    }
+
+    if(info->order){
+      buffer = se_SAMHeader(space, dms, dmlen, dmno, info->cmdline, 8, 7, info->order);
+      buffer[strlen(buffer)-1] = 29;
+    } else {
+      buffer = se_SAMHeader(space, dms, dmlen, dmno, info->cmdline, '\t', '\n', info->order);
+    }
+
+    /* only the two arrays are released here, not the descriptions */
+    FREEMEMORY(space, dms);
+    FREEMEMORY(space, dmlen);
+  }
+
+  fprintf(info->dev, "%s", buffer);
+  if (info->order){
+    fprintf(info->dev, "\n");
+  }
+  FREEMEMORY(space, buffer);
+}
+
+
+
+
+/*------------------------------- se_storeHeader ------------------------------
+ *
+ * @brief read and store header (delimited by '\n') from file: bl_fgets()
+ *        fills *header from the start of the file, its return value becomes
+ *        *headerlen. Exits if the file cannot be opened or nothing is read
+ * @author Steve Hoffmann
+ */
+void
+se_storeHeader(void *space, char *filename, char **header, Uint *headerlen){
+  FILE *fp;
+  int ret;
+
+  /* the file is only read here: "rb" also works for read-only files */
+  fp = fopen(filename, "rb");
+  if(!fp) {
+    fprintf(stderr,"Couldnt open file '%s'. Exit forced!\n", filename);
+    exit(-1);
+  }
+
+  ret = bl_fgets(space, fp, header);
+  if (ret == EOF || ret < 0){
+    fprintf(stderr, "Couldn't retrieve header information. End of file reached.\n");
+    exit(-1);
+  }
+  fclose(fp);
+
+  *headerlen = (Uint) ret;
+  return;
+}
diff --git a/segemehl/libs/manout.h b/segemehl/libs/manout.h
new file mode 100644
index 0000000..f931023
--- /dev/null
+++ b/segemehl/libs/manout.h
@@ -0,0 +1,215 @@
+#ifndef MANOUT_H
+#define MANOUT_H
+
+/*
+ * manout.h
+ * attempt for flexible output of genome mapping w/ SEGEMEHL
+ *
+ * @author Christian Otto
+ * @email christian at bioinf.uni-leipzig.de
+ * @date Wed Sep 24 10:56:23 CEST 2008
+ *
+ */
+
+#include <pthread.h>
+#include "basic-types.h"
+#include "bitArray.h"
+#include "biofiles.h"
+#include "charsequence.h"
+#include "multicharseq.h"
+#include "alignment.h"
+#include "kdseed.h"
+#include "segemehl.h"
+
+#define MINUSSTRAND 0
+#define PLUSSTRAND 1
+#define SPLIT_NEXT_PLUS ((unsigned char) (1 << 5)) /* flag bit tested by bl_matchfileAddSplit() for edge type 'D' */
+#define SPLIT_PREV_PLUS ((unsigned char) (1 << 6)) /* flag bit tested by bl_matchfileAddSplit() for edge type 'A' */
+
+
+typedef enum matchstatus_e { /* pair status handed to reportMatch() */
+ QUERY,
+ MATE,
+ PAIR, /* for PAIR, PAIR_REV and PAIR_INS reportMatch() reports the mate along with the match */
+ PAIR_REV,
+ PAIR_INS,
+ QUERY_SPL_NO_MATE, /* NOTE(review): the *_SPL_* values presumably mark split alignments - confirm in manout.c */
+ QUERY_SPL_FULL_MATE,
+ MATE_SPL_NO_QUERY,
+ MATE_SPL_FULL_QUERY,
+ PAIR_SPL
+} matchstatus_t;
+
+typedef struct gmate_s { /* one mate alignment attached to a gmatch_t */
+
+ unsigned char isset; /* reportMatch() skips mates with isset == 0 */
+ Uint p; /* start on the reference, reported as MATE_SEQ_S */
+ Uint q; /* end on the reference, reported as MATE_SEQ_E */
+ int scr;
+ int mat; /* mat, mis, ins, del, edist are reported as MATE_MAT ... MATE_EDIST */
+ int mis;
+ int ins;
+ int del;
+ int edist;
+ char *materefdesc; /* reference description, used by reportMatch() if the map has no mseq */
+ Uint materefdesclen;
+ char *materefseq; /* reference sequence, used by reportMatch() if the map has no mseq */
+
+ Alignment *al; /* alignment, source of the mate's CIGAR and meop strings */
+ Uint subject; /* index of the reference in the map's mseq */
+
+} gmate_t;
+
+typedef struct gmatch_s{ /* one alignment of a read */
+ Uint subject;
+ unsigned char rc;
+ Uint i; /* start on the query, reported as QRY_S */
+ Uint j; /* end on the query, reported as QRY_E */
+ Uint p; /* start on the reference, reported as SEQ_S */
+ Uint q; /* end on the reference, reported as SEQ_E */
+ int scr; /* reported as SCR */
+ int mat; /* mat, mis, ins, del, edist are reported as MAT ... EDIST */
+ int mis;
+ int ins;
+ int del;
+ int edist;
+ Alignment *al;
+ double evalue; /* reported as EVALUE */
+
+ Uint noofmatematches;
+ Uint mateminedist; /* reportMatch() skips mates whose edist exceeds this value */
+ gmate_t mates[4]; /* reportMatch() derives the mate strand from the lowest bit of the slot index */
+
+ Uint fragno; /* fragno and the prev/next fields below are handed to getSAMTags() */
+ Uint previdx; /* reportMatch() compares previdx and nextidx with -1, i.e. (Uint)-1 */
+ Uint prevpos;
+ char prevflags;
+ Uint nextidx;
+ Uint nextpos;
+ char nextflags;
+
+ Uint prevseqstart;
+ char *prevseqrefdesc;
+ Uint nextseqstart;
+ char *nextseqrefdesc;
+ char *refdesc;
+ Uint refdesclen;
+ char *refseq;
+
+
+ unsigned char skip;
+} gmatch_t;
+
+
+typedef struct gmatchlist_s{ /* list of matches, built by se_kdMatchListAdd()/se_kdMatchListSet() */
+
+ Uint minedist; /* NOTE(review): presumably the smallest edit distances seen so far - confirm in manout.c */
+ Uint mateminedist;
+ Uint pairminedist;
+
+ Uint *n; /* NOTE(review): n and matches are presumably indexed by strand - confirm */
+ gmatch_t **matches;
+
+} gmatchlist_t;
+
+typedef struct gread_s{ /* all matches of one read */
+ Uint id; /* index of the read, used with the bl_fasta* accessors */
+ Uint noofmatepairs;
+ Uint noofmatches;
+
+ Uint n[2]; /* NOTE(review): presumably the number of entries in matches[k] - confirm */
+ gmatch_t* matches[2]; /* matches[k][j]: reportMatch() takes the strand character from strands[k] */
+
+} gread_t;
+
+
+typedef struct Gmap_s{ /* container handed to reportMatch() */
+
+ MultiCharSeq *mseq; /* references; if NULL reportMatch() takes sequence and description from the match itself */
+ Uint mapoffset;
+ Uint noofreads; /* NOTE(review): presumably the length of reads - confirm */
+ gread_t *reads;
+
+} Gmap;
+
+void
+se_destructMatches(void *space, gread_t *read);
+
+unsigned char
+se_hasMatches(gmatchlist_t *list);
+
+unsigned char
+se_hasMateMatches(gmatchlist_t *list);
+
+Uint
+se_kdSetMate(void *space, gmatch_t *match,
+ Uint chr_idx, Uint chr_start, Uint chr_end, Uint edist,
+ Alignment *al, unsigned char downstream, unsigned char rc);
+
+gmatchlist_t*
+se_kdMatchListAdd(gmatchlist_t *list,
+ Uint chr_idx,
+ Uint chr_start,
+ Uint chr_end,
+ Uint edist,
+ int scr,
+ Uint start,
+ Uint end,
+ double evalue, Alignment *al, Uint u,
+ Uint previdx, Uint prevpos, char prevstrand,
+ Uint nextidx, Uint nextpos, char nextstrand, Uint fragno);
+
+gmatchlist_t*
+se_kdMatchListSet(void *space,
+ gmatchlist_t *list,
+ Uint chr_idx,
+ Uint chr_start,
+ Uint chr_end,
+ Uint edist,
+ int scr,
+ Uint start,
+ Uint end,
+ double evalue, Alignment *al, Uint u, Uint n);
+
+void reportSplicedMatch(void *space, char *qrydesc,
+ MultiCharSeqAlignment *mcsa, Uint noofaligns,
+ Uint coverage, Uint edist, int score, segemehl_t *nfo);
+Uint se_kdMatchListLength(gmatchlist_t *list, unsigned char strand);
+unsigned char se_kdMatchListhasMatches(gmatchlist_t *list);
+unsigned char se_kdMatchListhasMates(gmatchlist_t *list);
+Uint se_kdMatchListLength(gmatchlist_t *list, unsigned char strand); /* repeats the declaration three lines above */
+Uint se_kdMatchListScore(gmatchlist_t *list);
+gmatch_t* se_kdMatchListGet(gmatchlist_t *list, unsigned char strand,
+ Uint elem);
+gmate_t* se_kdMatchGetMates(gmatch_t *match);
+Uint se_kdMatchGetSubject(gmatch_t *match);
+Uint se_kdMatchGetRefStart(gmatch_t *match);
+extern void reportMap(FILE*, Gmap *map, Uint level);
+extern void initMatch(gmatch_t *);
+void initRead(gread_t *, Uint);
+void initGmap(Gmap *, MultiCharSeq *, Uint);
+extern void setMatches(gread_t*, gmatch_t *, Uint, unsigned char,
+ unsigned char);
+extern void setReads(Gmap *, gread_t *, Uint);
+extern Uint reportMatch (void *, Gmap *, fasta_t *, segemehl_t *,
+ matchstatus_t pairStatus, unsigned char mate); /* returns 3: match and mate reported, 2: only mate, 1: only match, 0: nothing */
+Uint se_setMatches(void *space, gread_t *read, gmatchlist_t *list, Uint maxedist, segemehl_t *nfo, char rep);
+void matchHeader(FILE* dev, Uint level);
+void genericOutput (FILE *dev, char **list, Uint rep_type, char);
+void bl_gmatchlistDestruct(void *space, gmatchlist_t *list);
+gmatchlist_t* bl_gmatchlistInit(void *space, int maxedist, int matemaxedist);
+void se_registerOutputDevice(void *space, segemehl_t *info); /* opens info->outfile if set and prints the header unless info->nohead */
+bl_fileBins_t* se_createChromBins (void *space, fasta_t *f, int maxbins, char
+ *template, Uint tmplen); /* NULL if f has more than maxbins sequences */
+bl_fileBinDomains_t* se_createChromDomains (void *space, fasta_t *f,
+ Uint minbins, Uint maxbins, char *filetemplate, Uint tmplen); /* the definition names the third parameter avgbins */
+bl_fileBinDomains_t*
+se_createBisulfiteBins (void *space, Uint noofdomains, Uint threadno, char *filetemplate, Uint tmplen);
+char* se_SAMHeader (void *space, char **seq, Uint *seqlen,
+ Uint size, char *cmdline, char sep, char lf,
+ unsigned char sorted); /* returns a calloc'ed string */
+char * se_defaultHeader (void *space, segemehl_t *info, char, char);
+void se_storeHeader(void *space, char *filename, char **header, Uint *headerlen);
+
+#endif /* MANOUT_H */
+
diff --git a/segemehl/libs/manoutformats.h b/segemehl/libs/manoutformats.h
new file mode 100644
index 0000000..0bc67cd
--- /dev/null
+++ b/segemehl/libs/manoutformats.h
@@ -0,0 +1,153 @@
+#ifndef MANOUTFORMATS_H
+#define MANOUTFORMATS_H
+
+/*
+ * manoutformats.h
+ * definition of used symbols and output formats
+ *
+ * @author Christian Otto
+ * @email christian at bioinf.uni-leipzig.de
+ * @date Thu Oct 2 09:59:57 CEST 2008
+ */
+
+#define SEPARATOR "\t"
+#define OUTLENGTH 56 /* number of slots of an output list; slot 40 has no symbol */
+
+
+/* definition of used symbols: each one is an index into the output list */
+
+#define QRY_LEN 0 /* length of query */
+
+#define SCR 1 /* score of match */
+#define EVALUE 2 /* evalue of match */
+#define QRY_S 3 /* start position on query */
+#define QRY_E 4 /* end position on query */
+
+#define SEQ_S 5 /* start position on sequence (absolute) */
+#define SEQ_E 6 /* end position on sequence (absolute) */
+#define MAT 7 /* number of matching symbols */
+#define MIS 8 /* number of mismatching symbols */
+#define INS 9 /* number of insertions */
+#define DEL 10 /* number of deletions */
+#define EDIST 11 /* alignment edist */
+#define REF_SEQ 12 /* sequence of match */
+#define QRY_SEQ 13 /* the query sequence*/
+#define STRAND 14 /* strand of match */
+
+#define QRY_DESC 15 /* description of query */
+#define NOOFMATCHES 16 /* no of matches */
+#define PAIR_STATUS 17 /* pair status */
+#define SEQ_DESC 18 /* sequence description in multi fasta file */
+#define MEOP_STR 19 /* meop string*/
+
+/*mate information: slots MATE_LEN ... MATE_QUAL are freed together after a mate was reported*/
+
+#define MATE_LEN 20
+#define MATE_SCR 21
+#define MATE_EVALUE 22
+#define MATE_QRY_S 23
+#define MATE_QRY_E 24
+
+#define MATE_SEQ_S 25
+#define MATE_SEQ_E 26
+#define MATE_MAT 27
+#define MATE_MIS 28
+#define MATE_INS 29
+#define MATE_DEL 30
+#define MATE_EDIST 31
+#define MATE_REF_SEQ 32
+#define MATE_QRY_SEQ 33
+#define MATE_STRAND 34
+#define MATE_SEQ_DESC 35
+#define MATE_MEOP 36
+#define MATE_NOOFMATCHES 37
+#define MATE_DESC 38
+#define MATE_QUAL 39
+
+#define QUAL 41 /*quality string*/
+#define SAM_QRY 42 /*SAM query name*/
+#define SAM_FLAG 43 /*SAM FLAGS*/
+#define SAM_MAPQ 44 /*SAM mapping quality*/
+#define SAM_CIGAR 45 /*SAM CIGAR*/
+#define SAM_QRY_REF 46 /*reference of the query as printed in the mate's record ('=' if identical)*/
+#define SAM_ISIZE 47 /*insert size*/
+
+#define SAM_MATE_FLAG 48 /*SAM FLAGS of mate*/
+#define SAM_MATE_MAPQ 49 /*SAM mapping quality of mate*/
+#define SAM_MATE_CIGAR 50 /*SAM CIGAR of mate*/
+#define SAM_MATE_REF 51 /*SAM reference of mate*/
+#define SAM_MATE_ISIZE 52 /*insert size as printed in the mate's record*/
+#define TAG 53 /*SAM tags of the match*/
+#define MATE_TAG 54 /*SAM tags of the mate*/
+#define SAM_MATE_QRY 55 /*SAM query name of the mate*/
+
+ const char EMPTY[] = " "; /* NOTE(review): these objects are defined, not only declared, in a header - confirm it is included by a single .c file */
+ const char HEAD0[] = "#descr;score;Evalue;qstart;qend;matches;mismatches;insertions;deletions;strand;sstart;send;sequence;sequence descr\n";
+ const int FORMAT0[] = {QRY_DESC, SCR, EVALUE, QRY_S, QRY_E, MAT, MIS, INS, DEL, STRAND, SEQ_S, SEQ_E, REF_SEQ, SEQ_DESC, -1};
+ const char HEAD1[] = "#descr;score;qstart;qend;matches;mismatches;insertions;deletions;strand;sstart;send;sequence\n";
+ const int FORMAT1[] = {QRY_DESC, SCR, QRY_S, QRY_E, MAT, MIS, INS, DEL, STRAND, SEQ_S, SEQ_E, REF_SEQ, -1};
+ const char HEAD2[] = "#gff-format\n";
+ const int FORMAT2[] = {SEQ_S, SEQ_E, SCR, STRAND, QRY_S, QRY_E, MAT, MIS, INS, DEL, QRY_DESC, QRY_LEN, REF_SEQ, -1};
+ const char HEAD3[] = "#descr;score;Evalue;qstart;qend;matches;mismatches;insertions;deletions;strand;sstart;send;sequence descr";
+ const int FORMAT3[] = {QRY_DESC, SCR, EVALUE, QRY_S, QRY_E, MAT, MIS, INS, DEL, STRAND, SEQ_S, SEQ_E, SEQ_DESC, -1};
+ const char HEAD4[] = "#descr;full alignment edist;fragment score;fragment Evalue;fragment qstart;fragment qend;fragment matches;fragment mismatches;fragment insertions;fragment deletions;strand;sstart;send;sequence descr";
+ const int FORMAT4[] = {QRY_DESC, EDIST, SCR, EVALUE, QRY_S, QRY_E, MAT, MIS, INS, DEL, STRAND, SEQ_S, SEQ_E, SEQ_DESC, -1};
+ const char HEAD5[] = "#descr;sstart;send;strand;edist;sequence descr";
+ const int FORMAT5[] = {QRY_DESC, SEQ_S, SEQ_E, STRAND, EDIST, SEQ_DESC, -1};
+ const char SORTBIN5[] = "-k2,2n";
+ const char SORT5[] = "-k5,5 -k2,2n";
+ /* format 6: line of the mate for rep_type 5 (reportMatch prints the match with format 5, its mate with format 6) */
+ const char HEAD6[] = "#descr;sstart;send;strand;edist;sequence descr\n";
+ const int FORMAT6[] = {QRY_DESC, MATE_SEQ_S, MATE_SEQ_E, MATE_STRAND, MATE_EDIST, MATE_SEQ_DESC, -1};
+
+// const char HEAD6[] = "#descr;full alignment edist;fragment score;fragment Evalue;fragment qstart;fragment qend;fragment matches;fragment mismatches;fragment insertions;fragment deletions;strand;sstart;send;subject;query;sequence descr\n";
+// const int FORMAT6[] = {QRY_DESC, EDIST, SCR, EVALUE, QRY_S, QRY_E, MAT, MIS, INS, DEL, STRAND, SEQ_S, SEQ_E, REF_SEQ, QRY_SEQ, SEQ_DESC, -1};
+ const char HEAD7[] = "#descr;seed score;seed Evalue;seed qstart;seed qend;semi global alignment matches;semi global alignment mismatches;semi global alignment insertions;semi global alginment deletions;strand;start of semi global alignment in subject(reference) sequence;end of semi global alignment in subject sequence;sequence descr;meop string";
+ const int FORMAT7[] = {QRY_DESC, SCR, EVALUE, QRY_S, QRY_E, MAT, MIS, INS, DEL, STRAND, SEQ_S, SEQ_E, SEQ_DESC, MEOP_STR, -1};
+ const char HEAD8[] = "#pair status;descr;semi global alignment distance;seed score;seed Evalue;seed qstart;seed qend;semi global alignment matches;semi global alignment mismatches;semi global alignment insertions;semi global alginment deletions;strand;start of semi global alignment in subject(reference) sequence;end of semi global alignment in subject sequence;sequence descr;meop string;number of matches";
+ const int FORMAT8[] = {PAIR_STATUS, QRY_DESC, EDIST, SCR, EVALUE, QRY_S, QRY_E, MAT, MIS, INS, DEL, STRAND, SEQ_S, SEQ_E, SEQ_DESC, MEOP_STR, NOOFMATCHES, -1};
+ const char HEAD9[] = "#descr;semi global alignment distance;seed score;seed Evalue;seed qstart;seed qend;semi global alignment matches;semi global alignment mismatches;semi global alignment insertions;semi global alginment deletions;strand;start of semi global alignment in subject(reference) sequence;end of semi global alignment in subject sequence;sequence descr;meop stringi;query";
+ const int FORMAT9[] = {QRY_DESC, EDIST, SCR, EVALUE, QRY_S, QRY_E, MAT, MIS, INS, DEL, STRAND, SEQ_S, SEQ_E, SEQ_DESC, MEOP_STR, QRY_SEQ, -1};
+ /* HEAD10/FORMAT10 repeat HEAD9/FORMAT9 */
+ const char HEAD10[] = "#descr;semi global alignment distance;seed score;seed Evalue;seed qstart;seed qend;semi global alignment matches;semi global alignment mismatches;semi global alignment insertions;semi global alginment deletions;strand;start of semi global alignment in subject(reference) sequence;end of semi global alignment in subject sequence;sequence descr;meop stringi;query";
+ const int FORMAT10[] = {QRY_DESC, EDIST, SCR, EVALUE, QRY_S, QRY_E, MAT, MIS, INS, DEL, STRAND, SEQ_S, SEQ_E, SEQ_DESC, MEOP_STR, QRY_SEQ, -1};
+
+/*query and mate: format 11 is the line reportMatch prints for the match when rep_type is 12 and a mate is reported*/
+ const char HEAD11[] = "#pair status;descr;semi global alignment distance;seed score;seed Evalue;seed qstart;seed qend;semi global alignment matches;semi global alignment mismatches;semi global alignment insertions;semi global alginment deletions;strand;start of semi global alignment in subject(reference) sequence;end of semi global alignment in subject sequence;sequence descr;meop string;number of matches";
+ const int FORMAT11[] = {PAIR_STATUS, QRY_DESC, EDIST, QRY_S, QRY_E, MAT, MIS, INS, DEL, STRAND, SEQ_S, SEQ_E, SEQ_DESC, MEOP_STR, MATE_EDIST, MATE_STRAND, MATE_SEQ_S, MATE_SEQ_E, NOOFMATCHES, -1};
+
+/*unpaired query or mate*/
+ const char HEAD12[] = "#pair status;descr;semi global alignment distance;seed score;seed qstart;seed qend;semi global alignment matches;semi global alignment mismatches;semi global alignment insertions;semi global alginment deletions;strand;start of semi global alignment in subject(reference) sequence;end of semi global alignment in subject sequence;sequence descr;meop string;number of matches;number of mate matches";
+ const int FORMAT12[] = {PAIR_STATUS, QRY_DESC, EDIST, QRY_S, QRY_E, MAT, MIS, INS, DEL, STRAND, SEQ_S, SEQ_E, SEQ_DESC, MEOP_STR, NOOFMATCHES, -1};
+
+ const char SORTBIN12[] = "-k11,11n";
+ const char SORT12[] = "-k13,13 -k11,11n";
+
+/*mate and query: format 13 is the line reportMatch prints for the mate when rep_type is 12*/
+ const char HEAD13[] = "#pair status;descr;semi global alignment distance;seed score;seed Evalue;seed qstart;seed qend;semi global alignment matches;semi global alignment mismatches;semi global alignment insertions;semi global alignment deletions;strand;start of semi global alignment in subject(reference) sequence;end of semi global alignment in subject sequence;sequence descr;meop string;number of matches;number of mate matches";
+ const int FORMAT13[] = {PAIR_STATUS, MATE_DESC, MATE_EDIST, MATE_QRY_S, MATE_QRY_E, MATE_MAT, MATE_MIS, MATE_INS, MATE_DEL, MATE_STRAND, MATE_SEQ_S, MATE_SEQ_E, MATE_SEQ_DESC, MATE_MEOP, EDIST, STRAND, SEQ_S, SEQ_E, NOOFMATCHES, -1};
+
+ const char HEAD14[] = "#descr;semi global alignment distance;seed score;seed Evalue;seed qstart;seed qend;semi global alignment matches;semi global alignment mismatches;semi global alignment insertions;semi global alignment deletions;strand;start of semi global alignment in subject(reference) sequence;end of semi global alignment in subject sequence;sequence descr;meop string;query";
+ const int FORMAT14[] = {QRY_DESC, EDIST, SCR, EVALUE, QRY_S, QRY_E, MAT, MIS, INS, DEL, STRAND, SEQ_S, SEQ_E, SEQ_DESC, MEOP_STR, QRY_SEQ, MATE_EDIST, MATE_MAT, MATE_MIS, MATE_INS, MATE_DEL, MATE_STRAND, MATE_SEQ_S, MATE_SEQ_E, MATE_MEOP, MATE_QRY_SEQ, -1};
+
+/*SAM: format 15 is the record of the match, format 16 the record of its mate*/
+
+ const char HEAD15[] = "SAM";
+ const int FORMAT15[] = {SAM_QRY, SAM_FLAG, SEQ_DESC, SEQ_S, SAM_MAPQ, SAM_CIGAR, SAM_MATE_REF, MATE_SEQ_S, SAM_ISIZE, QRY_SEQ, QUAL, TAG, -1};
+ const char SORTBIN15[] = "-k4,4n";
+ const char SORT15[] = "-k3,3 -k4,4n";
+
+ const char HEAD16[] = "SAM";
+ const int FORMAT16[] = {SAM_MATE_QRY, SAM_MATE_FLAG, MATE_SEQ_DESC, MATE_SEQ_S, SAM_MATE_MAPQ, SAM_MATE_CIGAR, SAM_QRY_REF, SEQ_S, SAM_MATE_ISIZE, MATE_QRY_SEQ, MATE_QUAL, MATE_TAG, -1};
+
+/* definition of constant arrays; every FORMATn list is ended by -1. NOTE(review): HEAD, SORT and SORTBIN hold 16 entries (0..15), FORMAT holds 17 - index 16 is only valid for FORMAT */
+ const char* HEAD[] = {HEAD0, HEAD1, HEAD2, HEAD3, HEAD4, HEAD5, HEAD6, HEAD7, HEAD8, HEAD9, HEAD10, HEAD11, HEAD12, HEAD13, HEAD14, HEAD15};
+ const int* FORMAT[] = {FORMAT0, FORMAT1, FORMAT2, FORMAT3, FORMAT4, FORMAT5, FORMAT6, FORMAT7, FORMAT8, FORMAT9, FORMAT10, FORMAT11, FORMAT12, FORMAT13, FORMAT14, FORMAT15, FORMAT16};
+
+ const char* SORT[] = {EMPTY, EMPTY, EMPTY, EMPTY, EMPTY, SORT5, EMPTY, EMPTY, EMPTY, EMPTY, EMPTY, EMPTY, SORT12, EMPTY, EMPTY, SORT15};
+ const char* SORTBIN[] = {EMPTY, EMPTY, EMPTY, EMPTY, EMPTY, SORTBIN5, EMPTY, EMPTY, EMPTY, EMPTY, EMPTY, EMPTY, SORTBIN12, EMPTY, EMPTY, SORTBIN15};
+ const char SORTDELIM = 9; /* 9 is the tab character */
+
+
+
+#endif
diff --git a/segemehl/libs/matchfiles.c b/segemehl/libs/matchfiles.c
new file mode 100644
index 0000000..09c6659
--- /dev/null
+++ b/segemehl/libs/matchfiles.c
@@ -0,0 +1,2536 @@
+
+/*
+ * matchfiles.c (formerly readmatchfiles.c)
+ * read alignment files
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 06.10.2010 01:02:08 CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <ctype.h>
+#include "alignment.h"
+#include "debug.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "sort.h"
+#include "matfile.h"
+#include "bitVector.h"
+#include "info.h"
+#include "zran.h"
+#include "nw.h"
+#include "matchfiles.h"
+#include "evalmatchfiles.h"
+#include "manout.h"
+#include "matchfilesfields.h"
+#include "matepairs.h"
+
+/*--------------------------- bl_matchfileAddSplit ---------------------------
+ *    
+ * @brief  append one split record to the split list of cross section cs[v]
+ *
+ *         The list cs[v].splits is grown by one element (ALLOCMEMORY) and
+ *         the new element is filled from the arguments; cs[v].noofsplits
+ *         is incremented.
+ *
+ *         The trans flag of the new record is set to 1 when
+ *           - edgetype is 'A' and (flags & SPLIT_PREV_PLUS) disagrees
+ *             with strand, or
+ *           - edgetype is 'D' and (flags & SPLIT_NEXT_PLUS) disagrees
+ *             with strand,
+ *         where "disagrees" means: bit set and strand == '-', or bit clear
+ *         and strand == '+'. In every other case trans is 0.
+ *
+ * @param  space       passed through to ALLOCMEMORY
+ * @param  cs          array of cross sections
+ * @param  v           index into cs of the cross section to extend
+ * @param  strand      '+' or '-' (other values never set trans)
+ * @param  edgetype    'A' or 'D' (other values never set trans)
+ * @param  edgechridx  stored as split->edgechridx
+ * @param  edge        stored as split->edge
+ * @param  adjoint     stored as split->adjoint
+ * @param  xstart      stored as split->xstart
+ * @param  xend        stored as split->xend
+ * @param  row         accepted but not stored by this function
+ * @param  xno         stored as split->xno
+ * @param  flags       bit field tested against SPLIT_PREV_PLUS and
+ *                     SPLIT_NEXT_PLUS
+ * @author Steve Hoffmann 
+ *   
+ */
+
+void
+bl_matchfileAddSplit (void *space, matchfileCross_t *cs, Uint v, 
+    char strand, char edgetype, Uint edgechridx, Uint edge, Uint adjoint, 
+    Uint xstart, Uint xend, Uint row, Uint xno, Uint flags)
+{
+  matchfileCross_t *site = &cs[v];
+  matchfileSplit_t *entry;
+  Uint partnerplus = 0;
+  char checked = 0;
+
+  /* make room for one more record and address the new last slot */
+  site->splits = ALLOCMEMORY(space, site->splits, matchfileSplit_t, 
+      site->noofsplits+1);
+  entry = &site->splits[site->noofsplits];
+  site->noofsplits++;
+
+  /* plain copies of the caller supplied values */
+  entry->edgechridx = edgechridx;
+  entry->edge = edge;
+  entry->adjoint = adjoint;
+  entry->xno = xno;
+  entry->xstart = xstart;
+  entry->xend = xend;
+  entry->edgetype = edgetype;
+  entry->strand = strand;
+
+  /* pick the flag bit that belongs to this edge type */
+  if(edgetype == 'A') {
+    partnerplus = (flags & SPLIT_PREV_PLUS);
+    checked = 1;
+  } else if(edgetype == 'D') {
+    partnerplus = (flags & SPLIT_NEXT_PLUS);
+    checked = 1;
+  }
+
+  /* trans: the flag bit and the strand of this record disagree */
+  entry->trans = 0;
+  if(checked) {
+    if((partnerplus && strand == '-') || (!partnerplus && strand == '+')) {
+      entry->trans = 1;
+    }
+  }
+
+  return ;
+}
+
+/*---------------------- bl_matchfileCmpDeletionLength -----------------------
+ *    
+ * @brief  three-way comparison of two deletions by their len field, used
+ *         as the compare callback handed to quickSort in
+ *         bl_matchfileGapAlign
+ *
+ * @param  a       index of the first element in tosort
+ * @param  b       index of the second element in tosort
+ * @param  tosort  array of matchfileDeletion_t
+ * @param  info    unused
+ * @return 2 if element a is longer than element b, 1 if element b is
+ *         longer than element a, 0 if both have the same length
+ * @author Steve Hoffmann 
+ *   
+ */
+
+
+Uint
+bl_matchfileCmpDeletionLength(Uint a, Uint b, void *tosort, void *info) {
+  matchfileDeletion_t *dels = (matchfileDeletion_t*) tosort;
+
+  if(dels[a].len == dels[b].len) {
+    return 0;
+  }
+
+  return (dels[a].len > dels[b].len) ? 2 : 1;
+}
+
+
+/*-------------------------- bl_matchfileGapAdjust ---------------------------
+ *    
+ * @brief  helper for bl_matchfileGapAlign: expand sequence u of alignment
+ *         al into a newly malloc'ed, 0-terminated string
+ *
+ *         Layout of the result:
+ *          1. the first al->uoff characters of al->u are copied,
+ *          2. positions from there up to al->voff (if any) are filled
+ *             with the character of al->v at that position when template
+ *             is non-zero, otherwise with '^',
+ *          3. for each multi edit operation: Replacement and Insertion
+ *             copy `steps` characters of al->u (continuing after uoff),
+ *             Deletion writes `steps` times '^'.
+ *
+ * @param  al        alignment to expand
+ * @param  len       out: length of the returned string (without the 0)
+ * @param  template  non-zero: take leading fill characters from al->v
+ * @return malloc'ed string of size al->vlen+al->ulen+1; the caller owns
+ *         it. The result of malloc is not checked here.
+ * @author Steve Hoffmann 
+ *   
+ */
+
+char*
+bl_matchfileGapAdjust(Alignment *al, Uint *len, unsigned char template) {
+  Uint op, step, nsteps, upos = 0, out = 0;
+  char *res;
+
+  res = malloc(sizeof(char)*(al->vlen+al->ulen+1));
+
+  /* leading part of u in front of the aligned region */
+  while(out < al->uoff) {
+    res[out] = al->u[out];
+    out++;
+  }
+
+  /* pad up to the offset of v */
+  while(out < al->voff) {
+    res[out] = (template) ? al->v[out] : '^';
+    out++;
+  }
+
+  for(op=0; op < al->numofmeops; op++) {
+    nsteps = al->meops[op].steps;
+
+    if(al->meops[op].eop == Replacement || al->meops[op].eop == Insertion) {
+      /* both operations consume characters of u */
+      for(step=0; step < nsteps; step++) {
+        res[out++] = al->u[al->uoff + upos++];
+      }
+    } else if(al->meops[op].eop == Deletion) {
+      /* u has nothing here: emit gap symbols */
+      for(step=0; step < nsteps; step++) {
+        res[out++] = '^';
+      }
+    }
+  }
+
+  res[out] = 0;
+  *len = out;
+
+  return res;
+}
+
+
+/*--------------------------- bl_matchfileGapAlign ---------------------------
+ *    
+ * @brief  progressive alignment of the strings of a set of deletions
+ *         (no guide tree)
+ *
+ *         Pass 1 builds a template: starting with the string of the first
+ *         element in the order returned by quickSort (compare function
+ *         bl_matchfileCmpDeletionLength), every further string is aligned
+ *         to the current template (nwmatrix/nwtraceback, scores {1,-1},
+ *         indel -1) and the template is replaced by the output of
+ *         bl_matchfileGapAdjust(..., 1).
+ *         Pass 2 aligns every string to the final template and replaces
+ *         dels[x].string and dels[x].len by the output of
+ *         bl_matchfileGapAdjust(..., 0); the old string is freed.
+ *
+ *         An empty input (dels == NULL or noofdels == 0) is a no-op; the
+ *         template would otherwise be taken from dels[sidx[0]], which does
+ *         not exist.
+ *
+ * @param  dels      array of deletions; strings and lengths are replaced
+ * @param  noofdels  number of elements in dels
+ * @author Steve Hoffmann 
+ *   
+ */
+
+
+void
+bl_matchfileGapAlign(matchfileDeletion_t *dels, Uint noofdels) {
+  Uint i, *sidx, templatelen=0;
+  int scores[] = {1,-1};
+  int *matrix;
+  char *template, *temp;
+  Alignment al;
+
+  /* nothing to align */
+  if(dels == NULL || noofdels == 0) {
+    return;
+  }
+
+  sidx = quickSort(NULL, dels, noofdels, bl_matchfileCmpDeletionLength, NULL);
+  templatelen = dels[sidx[0]].len;
+  template = ALLOCMEMORY(NULL, NULL, char, templatelen+1);
+  memmove(template, dels[sidx[0]].string, templatelen);
+  template[templatelen] = 0;
+
+  /*progressive multiple alignment w/o guide tree. Generate template*/
+  for(i=1; i < noofdels; i++) {
+
+    matrix = nwmatrix(NULL, template, templatelen, 
+        dels[sidx[i]].string, dels[sidx[i]].len, -1, constscr, scores);
+
+    initAlignment(&al, template, templatelen, 0, 
+        dels[sidx[i]].string, dels[sidx[i]].len, 0);
+
+    nwtraceback(NULL, matrix, template, templatelen, dels[sidx[i]].string, 
+        dels[sidx[i]].len, -1, constscr, scores, &al);
+
+    /* the adjusted string becomes the new template */
+    temp = bl_matchfileGapAdjust(&al, &templatelen, 1);
+
+    /* this function has no `space` handle: use NULL as everywhere else */
+    FREEMEMORY(NULL, template);
+    template = temp;
+    FREEMEMORY(NULL, matrix);
+    wrapAlignment(&al);
+  }
+
+  /*progressive multiple alignment w/o guide tree. fit to template*/
+  for(i=0; i < noofdels; i++) {
+    matrix = nwmatrix(NULL, dels[sidx[i]].string, dels[sidx[i]].len, 
+        template, templatelen, -1, constscr, scores);
+
+    initAlignment(&al, dels[sidx[i]].string, dels[sidx[i]].len, 0, 
+        template, templatelen, 0);
+
+    nwtraceback(NULL, matrix, dels[sidx[i]].string, dels[sidx[i]].len, 
+        template, templatelen, -1, constscr, scores, &al);
+
+    /* also updates dels[sidx[i]].len to the length of the new string */
+    temp = bl_matchfileGapAdjust(&al, &(dels[sidx[i]].len), 0);
+
+    FREEMEMORY(NULL, dels[sidx[i]].string);
+    dels[sidx[i]].string = temp;
+    FREEMEMORY(NULL, matrix);
+    wrapAlignment(&al);
+  }
+
+  FREEMEMORY(NULL, template);
+  FREEMEMORY(NULL, sidx);
+}
+
+/*-------------------------- bl_matchfileInitIndex ---------------------------
+ *    
+ * @brief  allocate a matchfileindex_t and set the fields listed below to
+ *         their empty state
+ *
+ *         exp is set to 15; bl_matchfileIndex and bl_matchfileRead map a
+ *         position to its bin with (pos >> exp). All counters are 0 and
+ *         all pointers NULL. Fields not assigned here (e.g. noofreads,
+ *         minqual, maxqual, which bl_matchfileIndex writes later) are left
+ *         as returned by ALLOCMEMORY.
+ *
+ * @param  space  passed through to ALLOCMEMORY
+ * @return the new index; release with bl_matchfileDestructIndex
+ * @author Steve Hoffmann 
+ *   
+ */
+
+matchfileindex_t* bl_matchfileInitIndex(void *space) {
+
+  matchfileindex_t *idx = ALLOCMEMORY(space, NULL, matchfileindex_t, 1);
+
+  /* scalar settings and counters */
+  idx->exp = 15;
+  idx->md5 = 0;
+  idx->noofchroms = 0;
+  idx->maxreadlen = 0;
+  idx->mean_coverage = 0;
+  idx->mean_qual = 0;
+
+  /* per chromosome tables, filled by bl_matchfileIndexAddChrom */
+  idx->chromnames = NULL;
+  idx->noofbins = NULL;
+  idx->bins = NULL;
+  idx->matchstart = NULL;
+  idx->matchend = NULL;
+
+  /* optional attachments */
+  idx->gzindex = NULL;
+  idx->stats = NULL;
+  idx->submatrix = NULL;
+  idx->Q_ERR = NULL;
+  idx->Q_N = NULL;
+  idx->P_ERR = NULL;
+
+  return idx;
+}
+
+
+/*------------------------ bl_matchfileDestructIndex -------------------------
+ *    
+ * @brief  release everything owned by an index
+ *
+ *         Frees the gz access list and gz index, the bin array and name
+ *         of every chromosome, the per chromosome tables, the error
+ *         statistics, the substitution matrix and the sample statistics
+ *         (via bl_matchfileDestructSampleStats). The index structure
+ *         itself is not freed here.
+ *
+ * @param  space  passed through to FREEMEMORY
+ * @param  index  the index to clean up
+ * @author Steve Hoffmann 
+ *   
+ */
+
+
+void
+bl_matchfileDestructIndex(void *space, matchfileindex_t *index) {
+  Uint chr;
+
+  /* random access information of gzipped files */
+  if(index->gzindex) {
+    if(index->gzindex->list) {
+      FREEMEMORY(space, index->gzindex->list);
+      index->gzindex->list = NULL;
+    }
+    FREEMEMORY(space, index->gzindex);
+    index->gzindex = NULL;
+  }
+
+  /* per chromosome payload first, the tables holding it afterwards */
+  for(chr=0; chr < index->noofchroms; chr++) {
+    FREEMEMORY(space, index->bins[chr]);
+    FREEMEMORY(space, index->chromnames[chr]);
+  }
+
+  if(index->bins) FREEMEMORY(space, index->bins);
+  if(index->noofbins) FREEMEMORY(space, index->noofbins);
+  if(index->chromnames) FREEMEMORY(space, index->chromnames);
+  if(index->matchstart) FREEMEMORY(space, index->matchstart);
+  if(index->matchend) FREEMEMORY(space, index->matchend);
+
+  /* quality and position error statistics */
+  if(index->Q_ERR) FREEMEMORY(space, index->Q_ERR);
+  if(index->Q_N) FREEMEMORY(space, index->Q_N);
+  if(index->P_ERR) FREEMEMORY(space, index->P_ERR);
+  if(index->submatrix) FREEMEMORY(space, index->submatrix);
+
+  if(index->stats) {
+    bl_matchfileDestructSampleStats(space, index->stats);
+    FREEMEMORY(space, index->stats);
+  }
+}
+
+
+/*--------------------- bl_matchfileGetChromIndexNumber ----------------------
+ *    
+ * @brief  get the number of the chromosome chromname in the file index
+ *
+ *         A stored name matches if it is identical to chromname, or if
+ *         chromname is identical to the part of the stored name in front
+ *         of its first whitespace character (the whole stored name if it
+ *         contains no whitespace). The first match wins.
+ *
+ * @param  index      index whose chromnames are searched
+ * @param  chromname  0-terminated name to look up
+ * @return position of the match in index->chromnames, or
+ *         index->noofchroms if there is no match
+ * @author Steve Hoffmann 
+ *   
+ */
+
+Uint
+bl_matchfileGetChromIndexNumber(matchfileindex_t *index, char *chromname) {
+  Uint k, j, desclen;
+  size_t namelen;
+  char *desc;
+
+  /* invariant for all candidates: compute once */
+  namelen = strlen(chromname);
+
+  for(k=0; k < index->noofchroms; k++) {
+
+    desc = index->chromnames[k];
+
+    if(strcmp(chromname, desc) == 0) {
+      break;
+    }
+
+    /* j := length of the first whitespace delimited token of desc.
+     * The cast is required: passing a negative char to isspace() is
+     * undefined behavior. */
+    desclen = strlen(desc);
+    for(j=0; j < desclen; j++) {
+      if (isspace((unsigned char)desc[j])) break;
+    }
+
+    /* a shorter chromname fails in strncmp at its terminating 0, so this
+     * only succeeds if chromname equals the token */
+    if(namelen <= j && strncmp(chromname, desc, j) == 0) 
+      break;
+  }
+
+  return k;
+}
+
+
+/*-------------------- bl_matchfileGetChromIndexNumberDB ---------------------
+ *    
+ * @brief  get index number of chromosome in DB this is necessary
+ *         because order in DB is not necessarily the same in index
+ *
+ *         chr is first resolved in the index with
+ *         bl_matchfileGetChromIndexNumber. If a sequence set is given, the
+ *         number of the sequence whose description equals the index name
+ *         is returned; without a set the index number itself is returned
+ *         (which is index->noofchroms if chr is unknown to the index).
+ *
+ *         With a set, an unknown chr or a name missing in the set is a
+ *         failed assertion. The lookup reports "not found" as
+ *         index->noofchroms, never as -1, so the bound is checked before
+ *         index->chromnames is dereferenced.
+ *
+ * @param  set    sequence database or NULL
+ * @param  index  the match file index
+ * @param  chr    chromosome name to resolve
+ * @return sequence number in set, or index number if set is NULL
+ * @author Steve Hoffmann 
+ *   
+ */
+
+ Uint
+bl_matchfileGetChromIndexNumberDB (fasta_t *set, matchfileindex_t *index, 
+    char *chr)
+{
+  Uint idxchrid;
+  Uint dbchrid;
+
+  idxchrid = bl_matchfileGetChromIndexNumber(index, chr);
+  assert(idxchrid != (Uint)-1);
+
+  if(!set) {
+    return idxchrid;
+  }
+
+  /* chromnames[idxchrid] is read below: it has to exist */
+  assert(idxchrid < index->noofchroms);
+
+  for(dbchrid=0; dbchrid < set->noofseqs; dbchrid++) {
+    if(strcmp(bl_fastaGetDescription(set,dbchrid), 
+          index->chromnames[idxchrid])==0) {
+      break;
+    }
+  }
+
+  assert(dbchrid < set->noofseqs);
+  return dbchrid;
+}
+
+
+
+
+/*--------------------------- bl_matchfileInitBin ----------------------------
+ *    
+ * @brief  make sure that bin number `bin` exists on chromosome number k
+ *
+ *         If the chromosome already has more than `bin` bins nothing
+ *         happens. Otherwise the bin array is enlarged to bin+1 elements
+ *         and every new bin (old count .. bin) is initialized with the
+ *         interval [start,end], offset 0, endoff 0 and 0 matches.
+ *
+ * @param  space  passed through to ALLOCMEMORY
+ * @param  index  the index to modify
+ * @param  k      chromosome number within the index
+ * @param  bin    number of the bin that has to exist afterwards
+ * @param  start  start assigned to all newly created bins
+ * @param  end    end assigned to all newly created bins
+ * @author Steve Hoffmann 
+ *   
+ */
+
+void
+bl_matchfileInitBin(void *space, matchfileindex_t *index, Uint k, Uint bin, 
+    Uint start, Uint end) {
+
+  Uint cur;
+  matchfileBin_t *fresh;
+
+  /* bin is already there */
+  if(bin < index->noofbins[k]) {
+    return;
+  }
+
+  index->bins[k] = ALLOCMEMORY(space, index->bins[k], 
+      matchfileBin_t, bin+1);
+
+  for(cur=index->noofbins[k]; cur <= bin; cur++) {
+    fresh = &index->bins[k][cur];
+    fresh->start = start;
+    fresh->end = end;
+    fresh->offset = 0;
+    fresh->endoff = 0;
+    fresh->matches = 0;
+  }
+
+  index->noofbins[k] = bin + 1;
+
+  return;
+}
+
+/*------------------------ bl_matchfileIndexAddChrom -------------------------
+ *    
+ * @brief  add a chromosome name to the index and initialize the bins
+ *
+ *         The name is looked up with bl_matchfileGetChromIndexNumber. If
+ *         it is already known nothing is changed. Otherwise all per
+ *         chromosome tables (chromnames, matchstart, matchend, noofbins,
+ *         bins) grow by one element: the name is copied, matchstart,
+ *         matchend and noofbins start at 0, the bin array starts as NULL.
+ *
+ *         NOTE(review): matchstart starts at 0 and the visible callers
+ *         only ever lower it (min with the match start), so it appears to
+ *         stay 0 -- confirm whether this is intended.
+ *
+ * @param  index      the index to extend
+ * @param  chromname  0-terminated chromosome name
+ * @return number of the chromosome within the index
+ * @author Steve Hoffmann 
+ *   
+ */
+
+
+Uint
+bl_matchfileIndexAddChrom(matchfileindex_t *index, char *chromname) {
+
+  Uint k;
+  size_t namelen;
+
+  k = bl_matchfileGetChromIndexNumber(index, chromname);
+
+  /* already registered */
+  if(k != index->noofchroms) {
+    return k;
+  }
+
+  /* this function has no `space` handle; NULL is handed to the
+   * allocation macro instead of an undeclared identifier */
+  namelen = strlen(chromname);
+
+  index->chromnames = 
+    ALLOCMEMORY(NULL, index->chromnames, char*, index->noofchroms+1);
+  index->chromnames[k] = ALLOCMEMORY(NULL, NULL, char, namelen+1);
+  /* namelen+1: include the terminating 0 */
+  memcpy(index->chromnames[k], chromname, namelen+1);
+
+  index->matchstart = 
+    ALLOCMEMORY(NULL, index->matchstart, Uint, index->noofchroms+1);
+  index->matchend = 
+    ALLOCMEMORY(NULL, index->matchend, Uint, index->noofchroms+1);
+  index->matchstart[k] = 0;
+  index->matchend[k] = 0;
+
+  index->noofbins = 
+    ALLOCMEMORY(NULL, index->noofbins, Uint, index->noofchroms+1);
+  index->noofbins[k] = 0;
+
+  index->bins = 
+    ALLOCMEMORY(NULL, index->bins, matchfileBin_t*, index->noofchroms+1);
+  index->bins[k] = NULL;
+
+  index->noofchroms++;
+
+  return k;
+}
+
+/*--------------------- bl_matchfileIndexAddAccessPoint ----------------------
+ *
+ * @brief add an access point to the matchfileindex_t structure for
+ * given chromname, bin, start and end
+ *
+ * Steps, in this order:
+ *  1. register chromname (bl_matchfileIndexAddChrom) and widen the
+ *     matchstart/matchend range of that chromosome by [start,end]
+ *  2. create the bin if needed (bl_matchfileInitBin), store off as its
+ *     file offset if it has none yet (offset 0 counts as "none") and
+ *     widen the bin interval by [start,end]
+ *  3. close the previously open bin bins[lastchrom][lastbin]: endoff is
+ *     set to off-1 (only if off > 0), matches and end are overwritten
+ *     with `matches` and `lastend`, and matchend of lastchrom is raised
+ *     to lastend
+ *  4. give all bins of chromosome k between the previous bin (exclusive;
+ *     from bin 0 if the chromosome changed) and `bin` (exclusive) the
+ *     offset off
+ *
+ * NOTE(review): step 3 dereferences bins[lastchrom][lastbin] without any
+ * check, so the caller has to pass a chromosome/bin pair that exists.
+ *
+ * @param space      passed through to bl_matchfileInitBin
+ * @param index      the index to modify
+ * @param chromname  chromosome of the new access point
+ * @param bin        bin number of the new access point
+ * @param start      start of the match that opens the bin
+ * @param end        end of the match that opens the bin
+ * @param off        file offset of the access point
+ * @param matches    match count stored in the previously open bin
+ * @param lastchrom  chromosome number of the previously open bin
+ * @param lastbin    number of the previously open bin
+ * @param lastend    end stored in the previously open bin
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileIndexAddAccessPoint(void *space, matchfileindex_t *index,
+ char *chromname, Uint bin, Uint start, Uint end, off_t off, Uint matches,
+ Uint lastchrom, Uint lastbin, Uint lastend) {
+
+ Uint k,i;
+
+ k = bl_matchfileIndexAddChrom(index, chromname);
+/* widen the covered range of chromosome k */
+ index->matchstart[k] = (index->matchstart[k] > start) ?
+ start : index->matchstart[k];
+ index->matchend[k] = (index->matchend[k] < end) ?
+ end : index->matchend[k];
+
+
+ bl_matchfileInitBin(space, index, k, bin, start, end);
+
+// fprintf(stderr, "adding access point for %s:%d, %d-%d (bin:%d): %llu\n",
+// chromname, k, start, end, bin, off);
+/* the first offset registered for a bin is kept */
+ if(!index->bins[k][bin].offset) {
+ index->bins[k][bin].offset = off;
+ }
+
+ if(index->bins[k][bin].start > start) {
+ index->bins[k][bin].start = start;
+ }
+
+ if(index->bins[k][bin].end < end) {
+ index->bins[k][bin].end = end;
+ }
+
+/* close the previously open bin */
+ // if(bin > 0) {
+ //should never be 0
+ if(off > 0){
+ index->bins[lastchrom][lastbin].endoff = off-1;
+ }
+ index->bins[lastchrom][lastbin].matches = matches;
+ index->bins[lastchrom][lastbin].end = lastend;
+// fprintf(stderr, "resetting lastchrom:%d, original matchend:%d, new matchend:%d\n", lastchrom, index->matchend[lastchrom], lastend);
+ index->matchend[lastchrom] = (index->matchend[lastchrom] < lastend) ?
+ lastend : index->matchend[lastchrom];
+// }
+/* first bin to back-fill: 0 on a new chromosome, else the one after lastbin */
+ if(lastchrom != k) {
+ lastbin = 0;
+ } else {
+ lastbin +=1;
+ }
+/* skipped bins in front of `bin` get the offset of this access point */
+ for(i=lastbin; i < bin; i++) {
+ index->bins[k][i].offset = off;
+ }
+}
+
+
+/*------------------- bl_matchfileIndexAccessPointOverlap --------------------
+ *    
+ * @brief  register a match that reaches from startbin into later bins
+ *
+ *         The chromosome is registered if necessary and its
+ *         matchstart/matchend range is widened by [start,end]. Bins up to
+ *         endbin are created if missing. Every bin after startbin up to
+ *         and including endbin gets ovlstart as offset unless it already
+ *         has a non-zero offset, and its interval is widened by
+ *         [start,end]. The bin startbin itself is not touched.
+ *
+ * @param  space      passed through to bl_matchfileInitBin
+ * @param  index      the index to modify
+ * @param  chromname  chromosome of the match
+ * @param  startbin   bin of the match start
+ * @param  endbin     bin of the match end
+ * @param  start      start position of the match
+ * @param  end        end position of the match
+ * @param  ovlstart   file offset stored for overlapped bins
+ * @author Steve Hoffmann 
+ *   
+ */
+
+
+void
+bl_matchfileIndexAccessPointOverlap (void *space, 
+    matchfileindex_t *index, char *chromname, Uint startbin, Uint endbin, 
+    Uint start, Uint end, off_t ovlstart) {
+
+  Uint b, chr;
+  matchfileBin_t *slot;
+
+  chr = bl_matchfileIndexAddChrom(index, chromname);
+
+  /* widen the range covered on this chromosome */
+  if(index->matchstart[chr] > start) {
+    index->matchstart[chr] = start;
+  }
+  if(index->matchend[chr] < end) {
+    index->matchend[chr] = end;
+  }
+
+  bl_matchfileInitBin(space, index, chr, endbin, start, end);
+
+  for(b=startbin+1; b <= endbin; b++) {
+    slot = &index->bins[chr][b];
+
+    /* an offset that is already set is kept */
+    if(!slot->offset) {
+      slot->offset = ovlstart;
+    }
+    if(slot->end < end) {
+      slot->end = end;
+    }
+    if(slot->start > start) {
+      slot->start = start;
+    }
+  }
+}
+
+
+
+/*---------------------------- bl_matchfileIndex -----------------------------
+ *    
+ * @brief  build an index for matchfile_t
+ *
+ *         The file (plain or gzipped, see file->gzip) is read character
+ *         by character into a growing line buffer. Leading header lines
+ *         (bl_matchfileIsHeader) are skipped; every other line is split
+ *         at tabs and evaluated:
+ *          - read length, number of reads and min/max quality are tracked
+ *          - the alignment string is walked to fill the substitution
+ *            matrix (index->submatrix) and the error counters Q_N, Q_ERR
+ *            (per quality) and P_ERR (per read position)
+ *          - access points are added: a position belongs to bin
+ *            (pos >> index->exp); a new access point is registered when a
+ *            match starts in a later bin or on another chromosome than
+ *            the previous one, and bins a match reaches into are updated
+ *            with bl_matchfileIndexAccessPointOverlap
+ *         After the last line the open bin is closed with the final file
+ *         offset. The finished index is stored in file->index.
+ *
+ *         All chromosomes of `set` are registered in the index up front,
+ *         in the order of the set.
+ *
+ *         Errors (file cannot be opened, unknown reference sequence,
+ *         ftello failure) are reported through DBGEXIT / exit().
+ *
+ *         NOTE(review): the test `strand != -1` is evaluated before
+ *         strand is read from the current line, i.e. it sees the value
+ *         of the previous record -- confirm intent.
+ *         NOTE(review): for mismatches inside 'M' the '+' strand counts
+ *         P_ERR[q], for all non-'M' operations the '-' strand does --
+ *         confirm that this asymmetry is intended.
+ *
+ * @param  space  passed through to the memory macros and helpers
+ * @param  file   match file; filename, gzip and fmt are read, index is
+ *                written
+ * @param  set    reference sequences; used to resolve chromosome names
+ *                and to compare read and reference characters
+ * @author Steve Hoffmann 
+ *   
+ */
+
+
+void bl_matchfileIndex (void *space, matchfile_t *file, fasta_t *set) {
+
+  FILE *fp = NULL;
+  stringset_t *fields = NULL;
+  char *buffer = NULL, first=1;
+  /* int, not char: EOF has to be distinguishable from every byte value.
+   * With a char, a 0xFF byte ends the loop early where char is signed
+   * and EOF is never seen where char is unsigned. */
+  int ch;
+  char *curchrom = NULL, *filename, *diff, *read, *qual, *aln, *ref, strand=0,
+       *acceptorchr, *donorchr;
+  Uint buffersize = 1024, len = 0, curstart = 0, 
+       curend = 0, matches = 0, curbin = 0, lastchromidx = 0, 
+       startbin = 0, endbin = 0, lastbin = 0, k = 0, 
+       lastend = 0, p, u, s, q, id, allen=0, 
+       readlen=0, noofreads=0;
+  unsigned char minqual = 255, maxqual = 0;
+
+  matchfileindex_t *index;
+  unsigned char mdcigarcheck = 1, header = 1;
+  char *code;
+  int gzlen;
+  unsigned char gzip, fmt, getsubmatrix=1;
+  struct gzidxfile *gzf = NULL;
+  off_t off=0;
+  /* defined values even if the statistics block below is switched off */
+  Uint *Q_ERR = NULL;
+  Uint *P_ERR = NULL;
+  Uint *Q_N = NULL;
+
+  code = getNTcodekey(space);
+  filename = file->filename;
+  gzip = file->gzip;
+  fmt = file->fmt;
+  index = bl_matchfileInitIndex(space);
+
+  if (gzip) {
+    index->gzindex = bl_zranGetIndex(filename, &gzlen);
+    fp = fopen(filename, "rb");
+  } else {
+    fp = fopen(filename, "r");
+  }
+
+  /* check the stream before anything uses it */
+  if(fp == NULL) {
+    DBGEXIT("Couldn't open file %s. Exit forced!\n", filename);
+  }
+
+  if (gzip) {
+    gzf = bl_initgzidxfile(fp, index->gzindex, 0, CHUNK);
+  }
+
+  buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+
+  if(getsubmatrix) {
+    index->submatrix = ALLOCMEMORY(space, NULL, double, 
+        6*6*QRNGE*MAXREADLENGTH+1);
+    memset(index->submatrix, 0, sizeof(double)*(6*6*QRNGE*MAXREADLENGTH+1));
+
+    P_ERR = ALLOCMEMORY(space, NULL, Uint, MAXREADLENGTH);
+    memset(P_ERR, 0, sizeof(Uint)*MAXREADLENGTH);
+    Q_ERR = ALLOCMEMORY(space, NULL, Uint, QRNGE);
+    memset(Q_ERR, 0, sizeof(Uint)*QRNGE);
+    Q_N = ALLOCMEMORY(space, NULL, Uint, QRNGE);
+    memset(Q_N, 0, sizeof(Uint)*QRNGE);
+  }
+
+
+  /* index chromosome numbers follow the order of the sequence set */
+  for(u=0; u < set->noofseqs; u++) { 
+    bl_matchfileIndexAddChrom(index, bl_fastaGetDescription(set, u));
+  }
+
+  while((ch = (gzip) ? bl_getgzidxc(gzf) : getc(fp)) != EOF) {
+
+    /* keep room for one more character plus terminator */
+    if(len == buffersize-1) {
+      buffersize = 2*buffersize+1;
+      buffer = ALLOCMEMORY(space, buffer, char, buffersize);
+    }
+
+    if(ch == '\n' && len > 0) {
+
+      buffer = ALLOCMEMORY(space, buffer, char, len+1);
+      buffer[len] = '\0';
+      /* once a non-header line was seen no line is tested again */
+      header = (header) ? bl_matchfileIsHeader(buffer, len, fmt) : header;
+
+      if (!header) { 
+        fields = tokensToStringset(space, "\t", buffer, len);
+        curstart = bl_matchfileGetStartPos(fields, fmt);
+        curend = bl_matchfileGetEndPos(fields, fmt);
+
+        if(curstart != curend && strand != -1) {
+
+          curchrom = bl_matchfileGetChrom(fields, fmt);
+          strand = bl_matchfileGetStrand(fields, fmt);
+          read = bl_matchfileGetRead(fields, fmt);
+          qual = bl_matchfileGetQual(fields, fmt);
+          diff = bl_matchfileGetDiffString(fields, fmt);
+          aln = bl_matchfileGetAln(fields, fmt);
+          acceptorchr = bl_matchfileGetNextChr(fields, fmt);
+          donorchr = bl_matchfileGetPrevChr(fields, fmt);
+
+          if(read) {
+            readlen = strlen(read);
+            if (readlen > index->maxreadlen) {
+              index->maxreadlen = readlen;
+            }
+            noofreads++;
+          }
+
+          if(curend+1 > 0) {
+            if(aln && curchrom) {
+
+              /* chromosomes of split partners have to be known as well */
+              if(acceptorchr) { 
+                id = bl_fastxFindIDIdx(acceptorchr, set);
+                if (id == -1) {
+                  DBGEXIT("reference sequence '%s' not found\n", acceptorchr);
+                }
+                acceptorchr = bl_fastaGetDescription(set, id);
+                bl_matchfileIndexAddChrom(index, acceptorchr);
+              }
+
+              if(donorchr) { 
+                id = bl_fastxFindIDIdx(donorchr, set);
+                if (id == -1) {
+                  DBGEXIT("reference sequence '%s' not found\n", donorchr);
+                }
+                donorchr = bl_fastaGetDescription(set, id);
+                bl_matchfileIndexAddChrom(index, donorchr);
+              }
+
+              id = bl_fastxFindIDIdx(curchrom, set);
+
+              if (id == -1) {
+                DBGEXIT("reference sequence '%s' not found\n", curchrom);
+              }
+
+              curchrom = bl_fastaGetDescription(set, id);
+              ref = &bl_fastaGetSequence(set, id)[curstart-1];
+              allen =strlen(aln);
+
+              /* u: alignment column, q: read position, p: reference
+               * position relative to curstart */
+              for(u=0, q=0, p=0; u < allen; u++) {
+
+                //skip soft clipped part here
+                if(aln[u] == 'S') { 
+                  q++;
+                  continue;
+                }
+
+                if(qual[q] < minqual) minqual = qual[q];
+                if(qual[q] > maxqual) maxqual = qual[q];
+
+
+                Q_N[(int)qual[q]-MINQUAL]++;
+
+                if(aln[u] == 'M') {
+                  MATRIX4D(index->submatrix, 6, QRNGE, MAXREADLENGTH, 
+                      (int)code[(int)read[q]], 
+                      (int)code[(int)ref[p]], 
+                      ((int)qual[q])-MINQUAL, q)+=1;
+
+                  if(read[q] != ref[p]) { 
+                    Q_ERR[(int)qual[q]-MINQUAL]++;
+                    if(strand == '+')
+                      P_ERR[q]++;
+                    else
+                      P_ERR[readlen-q-1]++;
+
+
+                    if(mdcigarcheck && ref[p] != diff[p]) { 
+                      NFO("MD doesnt match CIGAR in '%s'\n", buffer);
+                      NFO("Further MD errors ignored in '%s' !\n", filename);
+                      mdcigarcheck = 0;
+                    }
+                  }
+                } else { 
+                  Q_ERR[(int)qual[q]-MINQUAL]++;
+                  if(strand == '-')
+                    P_ERR[q]++;
+                  else
+                    P_ERR[readlen-q-1]++;
+                }
+
+                if(aln[u] == 'I') {
+                  MATRIX4D(index->submatrix, 6, QRNGE, MAXREADLENGTH, 
+                      (int)code[(int)read[q]], 
+                      (int)code[(int)'-'], 
+                      ((int)qual[q])-MINQUAL, q)+=1;
+                }
+
+                if(aln[u] == 'D') {
+                  /* s keeps q-s from running below 0 at the read start */
+                  if(q > 0) s = 1; else s = 0;
+                  MATRIX4D(index->submatrix, 6, QRNGE, MAXREADLENGTH, 
+                      (int)code[(int)'-'], 
+                      (int)code[(int)ref[p]], 
+                      MIN(((int)qual[q]),((int)qual[q-s]))-MINQUAL, q)+=1;
+                }
+
+                if(aln[u] != 'I' ) {
+                  p++;
+                } 
+                if(aln[u] != 'D') {
+                  q++;
+                }
+              }
+              FREEMEMORY(space, aln);
+            }
+
+            if(diff) {
+              FREEMEMORY(space, diff);
+            }
+
+            if(curchrom) {
+
+              k = bl_matchfileGetChromIndexNumber(index, curchrom);
+
+              if(first) {
+                lastchromidx = k;
+                first = 0;
+              }
+
+              startbin = (curstart >> index->exp);
+              endbin = (curend >> index->exp);
+
+              if(k == index->noofchroms) {
+                curbin = 0;
+                startbin = 0;
+                lastbin = 0;
+              }
+
+              /* new access point: unknown chromosome, chromosome without
+               * bins, later bin or change of chromosome */
+              if (k == index->noofchroms || !index->noofbins[k] || startbin > curbin || lastchromidx != k) {
+
+#ifdef DBGIDX
+                DBG("AP add: curchrom:%s curstart:%d, startbin:%d (exp:%d), off:%llu, lastchromidx:%d, lastbin:%d, lastend:%d\n", 
+                    curchrom, curstart, startbin, index->exp, off, lastchromidx, lastbin, lastend);
+#endif
+                bl_matchfileIndexAddAccessPoint (space, index, curchrom, 
+                    startbin, curstart, curend, off, matches, lastchromidx, lastbin, lastend);
+
+                curbin = startbin;
+                lastbin = startbin;
+                lastchromidx = k;
+                lastend = curend;
+                matches = 0;
+              }
+
+              if(curend > lastend) {
+                lastend = curend;
+              }
+
+              if (endbin > curbin) {
+
+                bl_matchfileIndexAccessPointOverlap (space, index, curchrom, 
+                    startbin, endbin, curstart, curend, off);
+              }
+
+              /* off is the offset behind this line, i.e. the start of
+               * the next record */
+              if(gzip) {
+                off = bl_ftellgzidx(gzf);
+              } else {
+                off = ftello(fp);
+                if (off == -1) {
+                  DBGEXIT("ftello for '%s' failed. Exit forced.\n", filename);
+                }
+              }
+
+            }
+            matches++;
+          }
+        }
+        destructStringset(space, fields);
+      }
+
+      /* back to full capacity for the next line */
+      buffer = ALLOCMEMORY(space, buffer, char, buffersize);
+      len = 0;
+
+    } else {
+      if(ch != '\n') buffer[len++] = (char) ch;
+    }
+  }
+
+
+  if(matches && index->noofbins[k] == 0) { 
+
+    bl_matchfileIndexAddAccessPoint (space, index, curchrom, 
+        startbin, curstart, curend, off, matches, lastchromidx, 
+        lastbin, lastend);
+
+    lastend = curend;
+  }
+
+
+  if(gzip) {
+    off = bl_ftellgzidx(gzf);
+  } else {
+    off = ftello(fp);
+    if (off == -1) {
+      MSG("ftello failed. Exit forced.\n");
+      exit(-1);
+    }
+  }
+
+  /* close the bin that is still open */
+  if(matches) { 
+    index->bins[k][curbin].endoff = off;
+    index->bins[k][curbin].matches = matches;
+    index->bins[k][curbin].end = curend;
+    index->matchend[k] = (curend > index->matchend[k]) ? 
+      curend : index->matchend[k];
+#ifdef DBGIDX
+    DBG("setting matchend[%d]=%d (off:%llu) after %d matches, curend %d\n", 
+        k, index->matchend[k], off, matches, curend);
+#endif
+  }
+
+  if(gzip) {
+    bl_destructgzidxfile(gzf);
+    FREEMEMORY(space, gzf);
+  }
+
+  file->index = index;
+  FREEMEMORY(space, buffer);
+  FREEMEMORY(space, code);
+  fclose(fp);
+
+  file->index->Q_ERR = Q_ERR;
+  file->index->P_ERR = P_ERR;
+  file->index->Q_N = Q_N;
+  file->index->noofreads = noofreads;
+  file->index->minqual = minqual;
+  file->index->maxqual = maxqual;
+
+  return;
+}
+
+
+
+/*-------------------------- bl_matchfileNextRowGet --------------------------
+ *    
+ * @brief  assign a row to the read for visual representation on screen
+ *
+ *         Returns the smallest row number that is used neither in cross
+ *         section v nor in its left neighbour v-1. The neighbour is only
+ *         consulted if v-1 < len; for v == 0 the unsigned v-1 wraps around
+ *         and the test fails.
+ *
+ * @param  space  passed through to initbitvector and FREEMEMORY
+ * @param  cs     array of cross sections (row and maxrow must be set)
+ * @param  v      index of the cross section the read starts in
+ * @param  len    number of elements in cs
+ * @return the lowest free row; at most MAX(maxrow of v-1, maxrow of v)+2
+ * @author Steve Hoffmann 
+ *   
+ */
+
+Uint
+bl_matchfileNextRowGet(void *space, matchfileCross_t *cs, Uint v, Uint len) {
+  Uint i, prevcnt=0, prevmax=0, curcnt, curmax, nrows;
+  bitvector used;
+
+  /* left neighbour, if there is one */
+  if(v-1 < len) {
+    prevcnt = cs[v-1].len;
+    prevmax = cs[v-1].maxrow;
+  }
+
+  curcnt = cs[v].len;
+  curmax = cs[v].maxrow;
+  nrows = MAX(prevmax, curmax) + 2;
+
+  /* mark every row that is taken at one of the two positions */
+  used = initbitvector(space, nrows);
+
+  for(i=0; i < prevcnt; i++) {
+    bitvector_setbit(used, cs[v-1].row[i], 1);
+  }
+
+  for(i=0; i < curcnt; i++) {
+    bitvector_setbit(used, cs[v].row[i], 1);
+  }
+
+  /* first row without a mark */
+  i = 0;
+  while(i < nrows) {
+    if (bitvector_getbit(used,i) == 0) {
+      break;
+    }
+    i++;
+  }
+
+  FREEMEMORY(space, used);
+  return i;
+}
+
+
+/*----------------------------- bl_matchfileRead -----------------------------
+ *    
+ * @brief  read all matches from start to end on chromname
+ *
+ *         The index of the file is used to jump to the offset of the bin
+ *         that contains `start` (start >> index->exp). From there records
+ *         are read until the file ends, a record starts behind `end` or a
+ *         record of another chromosome is seen. Every record overlapping
+ *         [start,end] is spread over the cross sections: element
+ *         cs[pos-start] collects, for each read covering the reference
+ *         position pos, the read character ('-' for a 'D' operation),
+ *         strand, edit distance and -- depending on `fields` -- quality,
+ *         read position, read length, match count, row, feature marks,
+ *         splits and the characters of 'I' operations (stored as `dels`
+ *         of the following reference position).
+ *
+ *         v, the index into cs, is computed as r->curstart - start in
+ *         unsigned arithmetic. For a record starting in front of `start`
+ *         it is therefore outside the window at first (all stores are
+ *         guarded by v < end-start+1) and reaches 0 by wrap-around while
+ *         the alignment is walked.
+ *
+ *         A record is skipped if the coverage at its start position
+ *         (or the number of skipped-in records for starts in front of the
+ *         window) has reached maxcover.
+ *
+ *         Errors (file cannot be opened, fseeko failure) are reported
+ *         through DBGEXIT.
+ *
+ *         NOTE(review): for an 'I' operation the read position u is only
+ *         advanced if MFREAD_DELS is requested; without that flag the
+ *         following read characters appear to be shifted -- confirm.
+ *
+ * @param  space      passed through to the memory macros and helpers
+ * @param  file       match file with a valid index (file->index)
+ * @param  chromname  chromosome to read from
+ * @param  start      first reference position of the window
+ * @param  end        last reference position of the window
+ * @param  maxcover   coverage limit, see above
+ * @param  set        sequence database used to translate chromosome
+ *                    names of split partners and mates, may be NULL
+ *                    (see bl_matchfileGetChromIndexNumberDB)
+ * @param  fields     bit mask of MFREAD_* flags selecting what is stored
+ * @param  input      cross sections to extend, or NULL to allocate a
+ *                    zeroed array of end-start+1 elements
+ * @return the cross section array (input if given). It is returned
+ *         unchanged if the chromosome or the start bin is not indexed.
+ * @author Steve Hoffmann 
+ *   
+ */
+
+matchfileCross_t*
+bl_matchfileRead(void *space, matchfile_t *file, char *chromname, 
+    Uint start, Uint end, Uint maxcover, fasta_t *set, unsigned char fields, 
+    matchfileCross_t *input) {
+
+  stringset_t *token;
+
+  Uint buffersize=1024, row=0, startbin,
+       startsite, endsite, len=0, i=0, u=0, v=0, k=0, 
+       curalnlen, 
+       dellen = 0,
+       startcover=0,
+       curcover=0, 
+       acceptorchridx, donorchridx, adjoint, 
+       rnextidx=0,
+       curchromidx;
+  char *buffer = NULL, *delstring=NULL, *delquals=NULL, *filename;
+  /* int, not char: EOF has to be distinguishable from every byte value */
+  int ch;
+  unsigned char header = 1;
+  int ret=0,
+      readlen=0;
+  matchfileCross_t *cs = NULL;
+  matchfileindex_t *index;
+  matchfileRec_t *r;
+  unsigned char startsiteset=0, curedist=0, gzip, fmt;
+  char eop;
+  FILE *fp = NULL;
+  struct gzidxfile *gzf = NULL;
+
+#ifdef DBGIDX
+  Uint acceptorcnt = 0, donorcnt = 0;
+#endif
+
+  gzip = file->gzip;
+  fmt = file->fmt;
+  index = file->index;
+  filename = file->filename;
+
+  r = ALLOCMEMORY(space, NULL, matchfileRec_t, 1);
+  memset(r, 0, sizeof(matchfileRec_t));
+
+
+
+  startbin = (start >> index->exp);
+  k = bl_matchfileGetChromIndexNumber(index, chromname);
+
+  if(!input) { 
+    cs = ALLOCMEMORY(space, NULL, matchfileCross_t, (end-start+1));
+    memset(cs, 0, sizeof(matchfileCross_t)*(end-start+1));
+  } else {
+    cs = input;
+  }
+
+
+  /* nothing indexed for this request */
+  if(k >= index->noofchroms || startbin >= index->noofbins[k]) {
+    /* the record buffer is not needed any more */
+    FREEMEMORY(space, r);
+    return cs;
+  }
+
+  fp = fopen(filename, (gzip) ? "rb" : "r");
+
+  /* check the stream before seeking or wrapping it */
+  if(fp == NULL) {
+    DBGEXIT("Couldn't open file %s. Exit forced!\n", filename);
+  }
+
+  if (gzip) {
+    gzf = bl_initgzidxfile(fp, index->gzindex, index->bins[k][startbin].offset, MEDIUMCHUNK);
+  } else {
+    ret = fseeko (fp, index->bins[k][startbin].offset, SEEK_SET);
+
+    if(ret == -1) {
+      DBGEXIT("fseeko failed for '%s' Exit forced!\n", filename);
+    }
+  }
+
+  curchromidx = k;
+  r->curend = start;
+
+  buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+
+  while((ch = (gzip) ? bl_getgzidxc(gzf) : getc(fp)) != EOF && 
+      r->curstart <= end && curchromidx == k) {
+
+
+    /* keep room for one more character plus terminator */
+    if(len == buffersize-1) {
+      buffersize = 2*buffersize+1;
+      buffer = ALLOCMEMORY(space, buffer, char, buffersize);
+    }
+
+    if(ch == '\n' && len > 0) {
+
+      buffer = ALLOCMEMORY(space, buffer, char, len+1);
+      buffer[len] = '\0';
+
+#ifdef DBGIDX
+      DBG("buffer: %s\n", buffer);
+#endif
+
+      if(header) header = bl_matchfileIsHeader(buffer, len, fmt);
+
+      if(!header) { 
+        token = tokensToStringset(space, "\t", buffer, len);
+
+        /* NEWFUNCT is a legacy path: the locals it assigns are not
+         * declared in this function, so it does not build if defined */
+#ifdef NEWFUNCT
+        curchrom = bl_matchfileGetChrom(token, fmt);
+        curstart = bl_matchfileGetStartPos(token, fmt);
+        curend   = bl_matchfileGetEndPos(token, fmt);
+#endif
+
+        r = bl_matchfileGetMatchFileRec(r, fields, token, fmt);
+        curchromidx = bl_matchfileGetChromIndexNumber(index, r->curchrom);
+
+
+        //fprintf(stderr, "reading %s: %d - %d\n", curchrom, curstart, curend);
+        /*last condition to avoid inclusion of 0-alignments in BAM files*/
+        if (r->curstart != r->curend && 
+            r->curend >= start && r->curstart <= end && 
+            r->curend+1 > 0 && curchromidx == k) {
+
+          readlen = strlen(r->curseq);
+#ifdef NEWFUNCT
+          curseq   = bl_matchfileGetRead(token, fmt);
+
+
+          if(fields & MFREAD_QUAL) 
+            curqual  = bl_matchfileGetQual(token, fmt);
+
+          if(fields & MFREAD_MCNT) 
+            curcnt   = bl_matchfileGetMatchCnt(token, fmt);
+
+          // if (fields & MFREAD_BISULFITE)
+          bisulfite = bl_matchfileGetBisulfite(token, fmt);
+
+          curaln   = bl_matchfileGetAln(token, fmt);
+          edist    = bl_matchfileGetEdist(token, fmt);
+          strand   = bl_matchfileGetStrand(token, fmt);
+          rnext    = bl_matchfileGetRNext(token, fmt);
+          pnext    = bl_matchfileGetPNext(token, fmt);
+
+          if(fields & MFREAD_SPLITS) { 
+            acceptorpos = bl_matchfileGetNextPos(token, fmt);
+            acceptorchr = bl_matchfileGetNextChr(token, fmt);
+            acceptorflg = bl_matchfileGetNextFlag(token, fmt);
+            donorpos = bl_matchfileGetPrevPos(token, fmt);
+            donorchr = bl_matchfileGetPrevChr(token, fmt);
+            donorflg = bl_matchfileGetPrevFlag(token, fmt);
+            xstart = bl_matchfileGetSplitStart(token, fmt);
+            xend = bl_matchfileGetSplitEnd(token, fmt);
+            xno = bl_matchfileGetSplitNumber(token, fmt);
+          }
+#endif
+          /* the edit distance is stored in one byte: saturate */
+          if(r->edist > 255) {
+            curedist = 255; 
+          } else curedist = r->edist;
+
+
+
+          curalnlen = strlen(r->curaln);
+          /* may wrap around for records starting in front of start */
+          v = r->curstart - start;
+          curcover = (v < end-start+1) ? cs[v].len : startcover;
+
+          if(curcover < maxcover) {
+            startsite = 0;
+            endsite = curalnlen-1;
+
+            /* first and last 'M' of the alignment, if it is not framed
+             * by matches anyway */
+            if (r->curaln[startsite] != 'M' || r->curaln[endsite] != 'M') {
+              startsiteset = 0;
+
+              for(i=0; i < curalnlen; i++) {
+                eop = r->curaln[i];
+                if(!startsiteset && eop == 'M') {
+                  startsite = i;
+                  startsiteset = 1;
+                }
+                if(eop == 'M') {
+                  endsite = i;
+                }
+              }
+            }
+
+            if(fields & (MFREAD_ROW | MFREAD_SPLITS)) {
+              if(v < end-start+1) { 
+                row = bl_matchfileNextRowGet(space, cs, v, end-start+1);
+              } else {
+                startcover++;
+                row = bl_matchfileNextRowGet(space, cs, 0, end-start+1);
+              }
+            }
+
+            donorchridx = -1;
+            acceptorchridx = -1;
+
+            if(r->donorchr) {
+#ifdef DBGIDX
+              DBG("setting donor chr:%d\n", r->donorchr);
+              if(i==0) donorcnt++;
+#endif
+              donorchridx=bl_matchfileGetChromIndexNumberDB(set, index, r->donorchr);
+            }
+
+            if(r->acceptorchr) {
+#ifdef DBGIDX
+              DBG("setting acceptor %d\n", r->acceptorchr);
+              if(i==0) acceptorcnt++;
+#endif
+              acceptorchridx=bl_matchfileGetChromIndexNumberDB(set, index, r->acceptorchr);
+            }
+
+            /* '=': mate on the same chromosome, '*': no mate */
+            if(r->rnext && r->rnext[0] == '=') { 
+              rnextidx = k;
+            } else if(r->rnext && r->rnext[0] != '*') {
+              rnextidx = bl_matchfileGetChromIndexNumberDB(set, index, r->rnext);
+            }
+
+
+            /* i: alignment column, u: read position, v: cross section */
+            for(i=0, u=0; i < curalnlen; i++) {
+
+              if(r->curaln[i] == 'S') { 
+                u++;
+                continue;
+              }
+
+              if(fields & MFREAD_SPLITS) {
+                if(r->donorchr && v < end-start+1 &&
+                    ((i == 0 && r->strand == '+') || 
+                     (i == curalnlen-1 && r->strand == '-'))) {
+                  adjoint = -1;
+
+                  if (r->acceptorchr) {
+                    if (r->strand == '+') 
+                      adjoint = r->curend;
+                    else 
+                      adjoint = r->curstart;
+                  }
+
+                  bl_matchfileAddSplit(space, cs, v, r->strand, 'A', 
+                      donorchridx, r->donorpos, adjoint, 
+                      r->xstart, r->xend, row, r->xno, r->donorflg);
+                }
+
+                if(r->acceptorchr && v < end-start+1 && 
+                    (( i == 0 && r->strand == '-') || 
+                     (i == curalnlen-1 && r->strand == '+'))) { 
+                  adjoint = -1;
+
+                  if (r->donorchr) {
+                    if(r->strand == '+') 
+                      adjoint = r->curstart;
+                    else 
+                      adjoint = r->curend;
+                  }
+
+                  bl_matchfileAddSplit(space, cs, v, r->strand, 'D', 
+                      acceptorchridx, r->acceptorpos, adjoint, 
+                      r->xstart, r->xend, row, r->xno, r->acceptorflg);
+
+                }
+              }
+
+              eop = r->curaln[i];
+
+
+              if(startsite == i && v < end-start+1) {
+                cs[v].starts++;
+                if(r->rnext && r->rnext[0] != '*') {
+                  bl_matchfileAddMate (space, &cs[v], rnextidx, r->pnext);
+                }
+              }
+
+              if(endsite == i && v < end-start+1) {
+                cs[v].ends++;
+              }
+
+              if (eop != 'I') {
+
+                /* characters collected from preceding 'I' operations are
+                 * attached to this reference position */
+                if (delstring && fields & MFREAD_DELS) {
+                  if (v < end-start+1) { 
+                    cs[v].dels = ALLOCMEMORY(space, cs[v].dels, 
+                        matchfileDeletion_t, cs[v].noofdels+1);
+                    cs[v].dels[cs[v].noofdels].len = dellen;
+                    cs[v].dels[cs[v].noofdels].string = delstring;
+
+                    if(fields & MFREAD_QUAL)
+                      cs[v].dels[cs[v].noofdels].quals = delquals;
+
+                    if(fields & MFREAD_RPOS)
+                      cs[v].dels[cs[v].noofdels].readpos = u - dellen;
+
+                    if (fields & MFREAD_MCNT)
+                      cs[v].dels[cs[v].noofdels].matchcnt = r->curcnt;
+
+                    //if (fields & MFREAD_BISULFITE)
+                    cs[v].dels[cs[v].noofdels].bisulfite = r->bisulfite;
+
+                    if (fields & MFREAD_ROW)
+                      cs[v].dels[cs[v].noofdels].row = row;
+
+                    cs[v].dels[cs[v].noofdels].edist = curedist;
+                    cs[v].noofdels++;
+                  } else {
+
+                    /* outside the window: drop the collected string */
+                    FREEMEMORY(space, delstring);
+                    if(fields & MFREAD_QUAL)
+                      FREEMEMORY(space, delquals);
+                  }
+                  delstring = NULL;
+                  delquals = NULL;
+                  dellen = 0;
+                }
+
+                /* one more entry in all arrays of this cross section */
+                if(v < end-start+1) {
+                  cs[v].len++;
+
+                  cs[v].chars = ALLOCMEMORY(space, cs[v].chars, 
+                      char, cs[v].len+1);
+
+                  if(fields & MFREAD_FEAT) { 
+                    cs[v].feat = ALLOCMEMORY(space, cs[v].feat, 
+                        char, cs[v].len+1);
+                    if(i==startsite) {
+                      cs[v].feat[cs[v].len-1] = '*';
+                    } else if(i==endsite) {
+                      cs[v].feat[cs[v].len-1] = '$';
+                    } else {
+                      cs[v].feat[cs[v].len-1] = 0;
+                    }
+                  }
+
+                  if(fields & MFREAD_QUAL)
+                    cs[v].quals = ALLOCMEMORY(space, cs[v].quals, 
+                        char, cs[v].len+1);
+
+                  cs[v].strands = ALLOCMEMORY(space, cs[v].strands, 
+                      char, cs[v].len+1);
+
+                  if(fields & MFREAD_RPOS)
+                    cs[v].readpos = ALLOCMEMORY(space, cs[v].readpos, 
+                        uint32_t, cs[v].len+1);
+
+                  if(fields & MFREAD_RLEN)
+                    cs[v].readlen = ALLOCMEMORY(space, cs[v].readlen, 
+                        uint32_t, cs[v].len+1);
+
+                  if(fields & MFREAD_MCNT)
+                    cs[v].matchcnt = ALLOCMEMORY(space, cs[v].matchcnt, 
+                        uint32_t, cs[v].len+1);
+
+                  //if (fields & MFREAD_BISULFITE)
+                  cs[v].bisulfite = ALLOCMEMORY(space, cs[v].bisulfite,
+                      uint32_t, cs[v].len+1);
+
+
+                  if(fields & MFREAD_ROW)
+                    cs[v].row = ALLOCMEMORY(space, cs[v].row, 
+                        uint32_t, cs[v].len+1);
+
+                  cs[v].edist = ALLOCMEMORY(space, cs[v].edist, 
+                      unsigned char, cs[v].len+1);
+                }
+
+                if (eop != 'D') {
+                  /* the read has a character at this position */
+                  if(v < end-start+1) {
+
+                    cs[v].chars[cs[v].len-1] = r->curseq[u];
+                    cs[v].chars[cs[v].len] = 0 ;
+
+
+                    if(fields & MFREAD_QUAL) { 
+                      cs[v].quals[cs[v].len-1] = r->curqual[u];
+                      cs[v].quals[cs[v].len] = 0;
+                    }
+
+                    if(fields & MFREAD_ROW) {
+                      cs[v].row[cs[v].len-1] = row;
+                      if(cs[v].maxrow < row) {
+                        cs[v].maxrow = row;
+                      }
+                    }
+
+                    if(fields & MFREAD_RLEN)
+                      cs[v].readlen[cs[v].len-1] = readlen;
+
+                    cs[v].edist[cs[v].len-1] = curedist;
+                    cs[v].strands[cs[v].len-1] = r->strand;
+
+                  }
+                  u++;
+                } else {
+                  /* 'D': gap in the read, u is not advanced */
+                  if(v < end-start+1) {
+                    cs[v].chars[cs[v].len-1] = '-';
+                    cs[v].chars[cs[v].len] = 0 ;
+
+                    if(fields & MFREAD_QUAL) { 
+                      cs[v].quals[cs[v].len-1] = r->curqual[u];
+                      cs[v].quals[cs[v].len] = 0;
+                    }
+
+                    if(fields & MFREAD_ROW) {
+                      cs[v].row[cs[v].len-1] = row;
+                      if(cs[v].maxrow < row) {
+                        cs[v].maxrow = row;
+                      }
+                    }
+
+
+                    if(fields & MFREAD_RLEN)
+                      cs[v].readlen[cs[v].len-1] = readlen;
+
+                    cs[v].edist[cs[v].len-1] = curedist;
+                    cs[v].strands[cs[v].len-1] = r->strand;
+
+                  }
+                }
+
+                if(v < end-start+1) {
+
+                  if(fields & MFREAD_RPOS)
+                    cs[v].readpos[cs[v].len-1] = u-1;
+
+                  if(fields & MFREAD_RLEN)
+                    cs[v].readlen[cs[v].len-1] = readlen;
+
+                  if(fields & MFREAD_MCNT)
+                    cs[v].matchcnt[cs[v].len-1] = r->curcnt;
+
+                  //if (fields & MFREAD_BISULFITE)
+                  cs[v].bisulfite[cs[v].len-1] = r->bisulfite;
+
+                }
+                v++;
+              } else {
+                /* 'I': collect the read character, v is not advanced */
+                if(fields & MFREAD_DELS) {
+                  delstring = ALLOCMEMORY(space, delstring, char, dellen+2);
+                  if(fields & MFREAD_QUAL) { 
+                    delquals = ALLOCMEMORY(space, delquals, char, dellen+2);
+                    delquals[dellen] = r->curqual[u];
+                    delquals[dellen+1] = 0;
+                  }
+                  delstring[dellen] = r->curseq[u];
+                  delstring[dellen+1] = 0;
+
+                  dellen++;
+                  u++;
+                }
+              }
+            }
+
+            /* 'I' operations at the very end were not attached */
+            if(delstring) {
+              FREEMEMORY(space, delstring);
+              if(fields & MFREAD_QUAL)
+                FREEMEMORY(space, delquals);
+              dellen = 0;
+              delstring = NULL;
+              delquals = NULL;
+            }
+          }
+        }
+        FREEMEMORY(space, r->curaln);
+        FREEMEMORY(space, r->diff);
+
+        destructStringset(space, token);
+      }
+      /* back to full capacity for the next line */
+      buffer = ALLOCMEMORY(space, buffer, char, buffersize);
+      len = 0;
+    } else {
+      if(ch != '\n') buffer[len++] = (char) ch;
+    }
+  }
+
+  fclose(fp);
+
+  if(gzip) {
+    bl_destructgzidxfile(gzf);
+    FREEMEMORY(space, gzf);
+  }
+
+  FREEMEMORY(space, r);
+  FREEMEMORY(space, buffer);
+  return cs;
+}
+
+/*-------------------------- bl_matchfileIndexShow ---------------------------
+ *    
+ * @brief dump the index for debugging. For every bin the match file is
+ *        positioned at the bin's offset and one line is read. Bins with a
+ *        non-zero offset are tokenized and the end position of that line
+ *        is asserted to be >= the bin start (i * 2^exp). Bins with
+ *        offset 0 are printed via DBG.
+ * @param space    handle passed through to the allocation macros
+ * @param index    index to be checked/shown
+ * @param filename match file the bin offsets refer to
+ * @author Steve Hoffmann 
+ *   
+ */
+
+
+void
+bl_matchfileIndexShow(void *space, matchfileindex_t *index, char *filename) {
+  const int maxlinelen = 1000;
+  Uint i, k;
+  stringset_t *fields;
+  char *line, *got;
+  FILE *fp;
+  Uint curend; //, curstart;
+
+  fp = fopen(filename, "r");
+  if (fp == NULL) {
+    DBG("Couldn't open file '%s'.\n", filename);
+    return;
+  }
+
+  line = ALLOCMEMORY(space, NULL, char, maxlinelen);
+
+  for(k=0; k < index->noofchroms; k++) {
+
+    for(i=0; i < index->noofbins[k]; i++) {
+      fseeko (fp, index->bins[k][i].offset, SEEK_SET);
+      /* fgets returns NULL on EOF/error: keep its result apart from the
+       * buffer pointer so the buffer is neither lost nor dereferenced */
+      got = fgets(line, maxlinelen, fp);
+
+      if(index->bins[k][i].offset) {
+        if(got == NULL) {
+          /* nothing could be read at this offset, nothing to verify */
+          continue;
+        }
+
+        fields = tokensToStringset(space, "\t", line, strlen(line));
+        //not used: curstart = bl_matchfileGetStartPos(fields, 0);
+        curend = bl_matchfileGetEndPos(fields, 0);
+
+        assert(curend >= i *(1 << index->exp));
+
+        destructStringset(space, fields);
+      } else {
+
+        DBG( "binstart:%d start:%d, end:%d, matches:%d, offset:%lu\n", 
+            i*(1 << index->exp), 
+            index->bins[k][i].start, index->bins[k][i].end, 
+            index->bins[k][i].matches, index->bins[k][i].offset);
+
+      }
+    }
+  }
+
+  /* release what was acquired above */
+  fclose(fp);
+  FREEMEMORY(space, line);
+}
+
+
+
+
+/*-------------------------- bl_matchfileCopyCross ---------------------------
+ *    
+ * @brief deep copy of the x-section cs into xs. The struct itself is copied
+ *        bytewise first; afterwards every array member is replaced by a
+ *        freshly allocated copy so that xs and cs share no heap memory.
+ *        Source arrays that are NULL (they are only allocated on request
+ *        by the reader) are not read; the corresponding array in xs is
+ *        still allocated and zero-filled.
+ * @param space handle passed through to the allocation macros
+ * @param xs    destination, must point to storage for one matchfileCross_t
+ * @param cs    source x-section
+ * @return xs
+ * @author Steve Hoffmann 
+ *   
+ */
+
+
+matchfileCross_t*
+bl_matchfileCopyCross(void *space, matchfileCross_t *xs, matchfileCross_t *cs) {
+
+  uint32_t len;
+  Uint j;
+  size_t dlen;
+
+  memmove(xs, cs, sizeof(matchfileCross_t));
+  len = cs->len;
+
+  xs->noofsplits = cs->noofsplits;
+  xs->noofdels = cs->noofdels;
+
+  /* character arrays carry one extra, zeroed byte as terminator */
+  xs->chars = ALLOCMEMORY(space, NULL, char, len+1);
+  memset(xs->chars, 0, sizeof(char)*(len+1));
+  xs->feat = ALLOCMEMORY(space, NULL, char, len+1);
+  memset(xs->feat, 0, sizeof(char)*(len+1));
+  xs->quals = ALLOCMEMORY(space, NULL, char, len+1);
+  memset(xs->quals, 0, sizeof(char)*(len+1));
+  xs->strands = ALLOCMEMORY(space, NULL, char, len+1);
+  memset(xs->strands, 0, sizeof(char)*(len+1));
+
+  xs->bisulfite = ALLOCMEMORY(space, NULL, uint32_t, len);
+  xs->readpos = ALLOCMEMORY(space, NULL, uint32_t, len);
+  xs->readlen = ALLOCMEMORY(space, NULL, uint32_t, len);
+  xs->matchcnt = ALLOCMEMORY(space, NULL, uint32_t, len);
+  xs->row = ALLOCMEMORY(space, NULL, uint32_t, len);
+  xs->edist = ALLOCMEMORY(space, NULL, unsigned char, len);
+
+  if(len > 0) {
+    /* char arrays are already zeroed, so a missing source needs no action */
+    if(cs->chars) memmove(xs->chars, cs->chars, sizeof(char)*len);
+    if(cs->feat) memmove(xs->feat, cs->feat, sizeof(char)*len);
+    if(cs->quals) memmove(xs->quals, cs->quals, sizeof(char)*len);
+    if(cs->strands) memmove(xs->strands, cs->strands, sizeof(char)*len);
+
+    if(cs->bisulfite) {
+      memmove(xs->bisulfite, cs->bisulfite, sizeof(uint32_t)*len);
+    } else {
+      memset(xs->bisulfite, 0, sizeof(uint32_t)*len);
+    }
+
+    if(cs->readpos) {
+      memmove(xs->readpos, cs->readpos, sizeof(uint32_t)*len);
+    } else {
+      memset(xs->readpos, 0, sizeof(uint32_t)*len);
+    }
+
+    if(cs->readlen) {
+      memmove(xs->readlen, cs->readlen, sizeof(uint32_t)*len);
+    } else {
+      memset(xs->readlen, 0, sizeof(uint32_t)*len);
+    }
+
+    if(cs->matchcnt) {
+      memmove(xs->matchcnt, cs->matchcnt, sizeof(uint32_t)*len);
+    } else {
+      memset(xs->matchcnt, 0, sizeof(uint32_t)*len);
+    }
+
+    if(cs->row) {
+      memmove(xs->row, cs->row, sizeof(uint32_t)*len);
+    } else {
+      memset(xs->row, 0, sizeof(uint32_t)*len);
+    }
+
+    if(cs->edist) {
+      memmove(xs->edist, cs->edist, sizeof(unsigned char)*len);
+    } else {
+      memset(xs->edist, 0, sizeof(unsigned char)*len);
+    }
+  }
+
+  if(cs->dels) {
+    xs->dels = ALLOCMEMORY(space, NULL, matchfileDeletion_t, cs->noofdels);
+    memmove(xs->dels, cs->dels, sizeof(matchfileDeletion_t)*cs->noofdels);
+
+    for(j=0; j < cs->noofdels; j++) {
+      /* the bytewise copy above left string/quals pointing into cs;
+       * replace them by private, NUL-terminated duplicates. A NULL
+       * pointer stays NULL. */
+      if(cs->dels[j].string) {
+        dlen = strlen(cs->dels[j].string);
+        xs->dels[j].string = ALLOCMEMORY(space, NULL, char, dlen+1);
+        memmove(xs->dels[j].string, cs->dels[j].string, dlen);
+        xs->dels[j].string[dlen] = 0;
+      }
+
+      if(cs->dels[j].quals) {
+        dlen = strlen(cs->dels[j].quals);
+        xs->dels[j].quals = ALLOCMEMORY(space, NULL, char, dlen+1);
+        /* copy the qualities from quals (not from the deleted string) */
+        memmove(xs->dels[j].quals, cs->dels[j].quals, dlen);
+        xs->dels[j].quals[dlen] = 0;
+      }
+    }
+  }
+
+  if(cs->matelinks) {
+    xs->matelinks = ALLOCMEMORY(space, NULL, matelink_t, cs->noofmatelinks);
+    memmove(xs->matelinks, cs->matelinks, sizeof(matelink_t)*cs->noofmatelinks);
+    xs->noofmatelinks = cs->noofmatelinks;
+  }
+
+  if(cs->splits) {
+    xs->splits = ALLOCMEMORY(space, NULL, matchfileSplit_t, cs->noofsplits);
+    memmove(xs->splits, cs->splits, sizeof(matchfileSplit_t)*cs->noofsplits);
+  }
+
+  return xs;
+}
+
+
+
+/*------------------------ bl_matchfileDestructCross -------------------------
+ *    
+ * @brief release the heap members of len consecutive x-sections. The
+ *        array cs itself is not freed here.
+ * @param space handle passed through to the allocation macros
+ * @param cs    array of x-sections
+ * @param len   number of elements in cs
+ * @author Steve Hoffmann 
+ *   
+ */
+
+void
+bl_matchfileDestructCross(void *space, matchfileCross_t *cs, Uint len) {
+  Uint pos, d;
+  matchfileCross_t *cur;
+
+  for(pos=0; pos < len; pos++) {
+    cur = &cs[pos];
+
+    /* per-read arrays, each of them may be absent */
+    if(cur->chars)     { FREEMEMORY(space, cur->chars); }
+    if(cur->feat)      { FREEMEMORY(space, cur->feat); }
+    if(cur->quals)     { FREEMEMORY(space, cur->quals); }
+    if(cur->strands)   { FREEMEMORY(space, cur->strands); }
+    if(cur->readpos)   { FREEMEMORY(space, cur->readpos); }
+    if(cur->matchcnt)  { FREEMEMORY(space, cur->matchcnt); }
+    if(cur->bisulfite) { FREEMEMORY(space, cur->bisulfite); }
+    if(cur->readlen)   { FREEMEMORY(space, cur->readlen); }
+    if(cur->row)       { FREEMEMORY(space, cur->row); }
+    if(cur->edist)     { FREEMEMORY(space, cur->edist); }
+    if(cur->matelinks) { FREEMEMORY(space, cur->matelinks); }
+
+    /* deletions own their string and quality buffers */
+    if(cur->dels) {
+      for(d=0; d < cur->noofdels; d++) {
+        FREEMEMORY(space, cur->dels[d].string);
+        FREEMEMORY(space, cur->dels[d].quals);
+      }
+      FREEMEMORY(space, cur->dels);
+    }
+
+    if(cur->splits) {
+      FREEMEMORY(space, cur->splits);
+    }
+  }
+}
+
+
+/*-------------------------- bl_matchfileReadIndex ---------------------------
+ *    
+ * @brief read the index (including stats if avail) from disk. The layout
+ *        is the one produced by bl_matchfileWriteIndex: one flag byte,
+ *        the optional gzip access points, the bin table, the error
+ *        profiles and - depending on the flag bits - the sample
+ *        statistics. Exits with -1 if the file cannot be opened.
+ * @param space    handle passed through to the allocation macros
+ * @param filename name of the index file
+ * @return the newly allocated index
+ * @author Steve Hoffmann 
+ *   
+ * NOTE(review): the return values of fread are not checked, a truncated
+ * index file is not detected here.
+ */
+
+matchfileindex_t * bl_matchfileReadIndex (void *space, char *filename)
+{
+
+  FILE *fp;
+  unsigned char flags = 0;
+  struct point *list;
+  Uint i, j, len;
+  matchfileindex_t *idx;
+
+  idx = bl_matchfileInitIndex(space);
+
+  /* the index is a binary file */
+  fp = fopen(filename, "rb");
+  if (fp == NULL) {
+    DBG("Couldn't open file '%s'. Exit forced.\n", filename);
+    exit(-1);
+  }
+
+  /* first byte: bit set of the *_STORED flags */
+  fread(&flags, sizeof(unsigned char), 1, fp);
+
+  if(flags & GZ_IDX_STORED) {
+    idx->gzindex = ALLOCMEMORY(space, NULL, struct access, 1);
+    fread(&idx->gzindex->have, sizeof(int), 1, fp);
+    fread(&idx->gzindex->size, sizeof(int), 1, fp);
+
+    list = ALLOCMEMORY(space, NULL, struct point, idx->gzindex->size);
+
+    for(i=0; i < idx->gzindex->size; i++) {
+      fread(&list[i].out, sizeof(off_t), 1, fp);
+      fread(&list[i].in, sizeof(off_t), 1, fp);
+      fread(&list[i].bits, sizeof(int), 1, fp);
+      fread(&list[i].window, sizeof(char), WINSIZE, fp);
+    }
+
+    idx->gzindex->list = list;
+  }
+
+  fread(&idx->exp, sizeof(Uint), 1, fp);
+  fread(&idx->noofchroms, sizeof(Uint), 1, fp);
+  fread(&idx->noofreads, sizeof(Uint), 1, fp);
+
+
+  idx->chromnames = ALLOCMEMORY(space, NULL, char*, idx->noofchroms);
+
+  /* chromosome names are stored as length + characters (no terminator) */
+  for(i=0; i < idx->noofchroms; i++) {
+    len = 0; /* defined value in case the read falls short */
+    fread(&len, sizeof(Uint), 1, fp);
+    idx->chromnames[i] = ALLOCMEMORY(space, NULL, char, len+1);
+    memset(idx->chromnames[i], 0, len+1);
+    fread(idx->chromnames[i], sizeof(char), len, fp);
+  }
+
+  idx->matchstart = ALLOCMEMORY(space, NULL, Uint, idx->noofchroms);
+  fread(idx->matchstart, sizeof(Uint), idx->noofchroms, fp);
+  idx->matchend = ALLOCMEMORY(space, NULL, Uint, idx->noofchroms);
+  fread(idx->matchend, sizeof(Uint), idx->noofchroms, fp);
+  idx->noofbins= ALLOCMEMORY(space, NULL, Uint, idx->noofchroms);
+  fread(idx->noofbins, sizeof(Uint), idx->noofchroms, fp);
+
+  idx->bins = ALLOCMEMORY(space, NULL, matchfileBin_t*, idx->noofchroms);
+
+  /* bins are stored member by member, not as raw structs */
+  for(i=0; i < idx->noofchroms; i++) {
+    idx->bins[i] = NULL;
+    if(idx->noofbins[i]) {
+      idx->bins[i]=ALLOCMEMORY(space, NULL, matchfileBin_t, idx->noofbins[i]);
+      for(j=0; j < idx->noofbins[i]; j++) {
+        fread(&idx->bins[i][j].start, sizeof(Uint), 1, fp);
+        fread(&idx->bins[i][j].end, sizeof(Uint), 1, fp);
+        fread(&idx->bins[i][j].matches, sizeof(Uint), 1, fp);
+        fread(&idx->bins[i][j].offset, sizeof(off_t), 1, fp);
+        fread(&idx->bins[i][j].endoff, sizeof(off_t), 1, fp);
+      }
+    }
+  }
+
+  fread(&idx->maxreadlen, sizeof(Uint), 1, fp);
+  idx->submatrix = ALLOCMEMORY(space, NULL, double, 6*6*QRNGE*MAXREADLENGTH);
+  fread(idx->submatrix, sizeof(double), 6*6*QRNGE*MAXREADLENGTH, fp);
+  fread(&idx->mean_coverage, sizeof(double), 1, fp);
+  fread(&idx->mean_qual, sizeof(double), 1, fp);
+
+  idx->P_ERR = ALLOCMEMORY(space, NULL, Uint, MAXREADLENGTH);
+  idx->Q_ERR = ALLOCMEMORY(space, NULL, Uint, QRNGE);
+  idx->Q_N = ALLOCMEMORY(space, NULL, Uint, QRNGE);
+
+  fread(idx->P_ERR, sizeof(Uint), MAXREADLENGTH, fp);
+  fread(idx->Q_ERR, sizeof(Uint), QRNGE, fp);
+  fread(idx->Q_N, sizeof(Uint), QRNGE, fp);
+
+  if(flags & STATS_TAB_STORED) {
+
+    idx->stats = ALLOCMEMORY(space, NULL, matchfileSampleStats_t, 1);
+    memset(idx->stats, 0, sizeof(matchfileSampleStats_t));
+
+    fread(&idx->stats->n, sizeof(Uint), 1, fp);
+    fread(&idx->stats->px, sizeof(double), 1, fp);
+    if(flags & PXX_STORED) {
+      fread(&idx->stats->pxx, sizeof(double), 1, fp);
+    }
+    fread(&idx->stats->maxcover, sizeof(Uint), 1, fp);
+    fread(&idx->stats->mincover, sizeof(Uint), 1, fp);
+
+    /*errordenstiy*/
+    fread(&idx->stats->e_N, sizeof(Uint), 1, fp);
+    NFO("reading %d e-samples from index.\n", idx->stats->e_N);
+
+    fread(&idx->stats->entropydensitystep, sizeof(double), 1, fp);
+    fread(&idx->stats->entropydensitylen, sizeof(Uint), 1, fp);
+
+    idx->stats->entropydensity = 
+      ALLOCMEMORY(space, NULL, double, idx->stats->entropydensitylen);
+    fread(idx->stats->entropydensity, sizeof(double), idx->stats->entropydensitylen, fp);
+
+    idx->stats->entropy = ALLOCMEMORY(space, NULL, double, idx->stats->e_N);
+    fread(idx->stats->entropy, sizeof(double), idx->stats->e_N, fp);
+
+    if(flags & STRAND_STORED) {
+      idx->stats->b = ALLOCMEMORY(space, NULL, double, idx->stats->e_N);
+      fread(idx->stats->b, sizeof(double), idx->stats->e_N, fp);
+      fread(&idx->stats->b_mu, sizeof(double), 1, fp);
+      fread(&idx->stats->b_sd, sizeof(double), 1, fp);
+      fread(&idx->stats->b_ll, sizeof(double), 1, fp);
+    }
+
+    idx->stats->eraw = ALLOCMEMORY(space, NULL, double, idx->stats->e_N);
+    fread(idx->stats->eraw, sizeof(double), idx->stats->e_N, fp);
+
+    idx->stats->e = ALLOCMEMORY(space, NULL, double, idx->stats->e_N);
+    fread(idx->stats->e, sizeof(double), idx->stats->e_N, fp);
+
+    idx->stats->e_mu = ALLOCMEMORY(space, NULL, double, 2);
+    idx->stats->e_sd = ALLOCMEMORY(space, NULL, double, 2);
+
+    fread(&idx->stats->e_mu[0], sizeof(double), 1, fp);
+    fread(&idx->stats->e_mu[1], sizeof(double), 1, fp);
+    fread(&idx->stats->e_sd[0], sizeof(double), 1, fp);
+    fread(&idx->stats->e_sd[1], sizeof(double), 1, fp);
+    fread(&idx->stats->e_ll, sizeof(double), 1, fp);
+
+    /* e_mu and e_sd must not be allocated again at this point: that
+     * would leak the arrays above and discard the values just read */
+    idx->stats->gev_mu = ALLOCMEMORY(space, NULL, double, 2);
+    idx->stats->gev_si = ALLOCMEMORY(space, NULL, double, 2);
+    idx->stats->gev_xi = ALLOCMEMORY(space, NULL, double, 2);
+    idx->stats->gev_ll = ALLOCMEMORY(space, NULL, double, 2);
+
+
+    if(flags & STATS_GEV_STORED) {
+
+      NFO("reading gev %d.\n", flags);
+      fread(&idx->stats->gev_mu[0], sizeof(double), 1, fp);
+      fread(&idx->stats->gev_mu[1], sizeof(double), 1, fp);
+      fread(&idx->stats->gev_si[0], sizeof(double), 1, fp);
+      fread(&idx->stats->gev_si[1], sizeof(double), 1, fp);
+      fread(&idx->stats->gev_xi[0], sizeof(double), 1, fp);
+      fread(&idx->stats->gev_xi[1], sizeof(double), 1, fp);
+      fread(&idx->stats->gev_ll[0], sizeof(double), 1, fp);
+      fread(&idx->stats->gev_ll[1], sizeof(double), 1, fp);
+    } else {
+      /* no gev parameters in the file: fall back to built-in values */
+      NFO("not reading gev %d.\n", flags);
+      idx->stats->gev_mu[0] = 0.044763;
+      idx->stats->gev_mu[1] = 0.020171;
+      idx->stats->gev_si[0] = 0.022864;
+      idx->stats->gev_si[1] = 0.031077;
+      idx->stats->gev_xi[0] = 0.212219;
+      idx->stats->gev_xi[1] = -0.041355;
+      idx->stats->gev_ll[0] = 6291.208397;
+      idx->stats->gev_ll[1] = 5908.074411;
+    }
+
+    fread(&idx->stats->P, sizeof(Uint), 1, fp);
+    fread(&idx->stats->X, sizeof(Uint), 1, fp);
+    fread(&idx->stats->N, sizeof(Uint), 1, fp);
+
+    fread(&idx->stats->RR_N, sizeof(Uint), 1, fp);
+    idx->stats->RR = ALLOCMEMORY(space, NULL, Uint, 11);
+    fread(idx->stats->RR, sizeof(Uint), 11, fp);
+
+    fread(&idx->stats->MM_N, sizeof(Uint), 1, fp);
+    idx->stats->MM = ALLOCMEMORY(space, NULL, Uint, 51);
+    fread(idx->stats->MM, sizeof(Uint), 51, fp);
+
+    fread(&idx->stats->areasize, sizeof(Uint), 1, fp);
+    fread(&idx->stats->maxareae, sizeof(double), 1, fp);
+
+    /*substition*/
+    idx->stats->S_N = ALLOCMEMORY(space, NULL, Uint, 6);
+    fread(idx->stats->S_N, sizeof(Uint), 6, fp);
+    idx->stats->S = ALLOCMEMORY(space, NULL, double, 6*6);
+    fread(idx->stats->S, sizeof(double), 6*6, fp);
+
+    idx->stats->Sx_N = ALLOCMEMORY(space, NULL, Uint, 6);
+    fread(idx->stats->Sx_N, sizeof(Uint), 6, fp);
+    idx->stats->Sx = ALLOCMEMORY(space, NULL, double, 6*6);
+    fread(idx->stats->Sx, sizeof(double), 6*6, fp);
+
+    /*noise*/
+    idx->stats->R_N = ALLOCMEMORY(space, NULL, Uint, 100*255);
+    fread(idx->stats->R_N, sizeof(Uint), 100*255, fp);
+    idx->stats->R = ALLOCMEMORY(space, NULL, Uint, 100*255);
+    fread(idx->stats->R, sizeof(Uint), 100*255, fp);
+
+    idx->stats->RP_N = ALLOCMEMORY(space, NULL, Uint, 100);
+    fread(idx->stats->RP_N, sizeof(Uint), 100, fp);
+    idx->stats->RP = ALLOCMEMORY(space, NULL, Uint, 100);
+    fread(idx->stats->RP, sizeof(Uint), 100, fp);
+
+    idx->stats->RQ_N = ALLOCMEMORY(space, NULL, Uint, 255);
+    fread(idx->stats->RQ_N, sizeof(Uint), 255, fp);
+    idx->stats->RQ = ALLOCMEMORY(space, NULL, Uint, 255);
+    fread(idx->stats->RQ, sizeof(Uint), 255, fp);
+
+    /*read variance*/
+    fread(&idx->stats->V_N, sizeof(Uint), 1, fp);
+    idx->stats->V = ALLOCMEMORY(space, NULL, double, idx->stats->V_N);
+    fread(idx->stats->V, sizeof(double), idx->stats->V_N, fp);
+    fread(&idx->stats->V_mu, sizeof(double), 1, fp);
+    fread(&idx->stats->V_sd, sizeof(double), 1, fp);
+    fread(&idx->stats->V_ll, sizeof(double), 1, fp);
+
+    fread(&idx->stats->Vx_N, sizeof(Uint), 1, fp);
+    idx->stats->Vx = ALLOCMEMORY(space, NULL, double, idx->stats->Vx_N);
+    fread(idx->stats->Vx, sizeof(double), idx->stats->Vx_N, fp);
+    fread(&idx->stats->Vx_mu, sizeof(double), 1, fp);
+    fread(&idx->stats->Vx_sd, sizeof(double), 1, fp);
+    fread(&idx->stats->Vx_ll, sizeof(double), 1, fp);
+
+    if(flags & MOTIF_STORED) {
+      idx->stats->MO_N = ALLOCMEMORY(space, NULL, Uint, 1024);
+      fread(idx->stats->MO_N, sizeof(Uint), 1024, fp);
+      idx->stats->MO = ALLOCMEMORY(space, NULL, Uint, 1024);
+      fread(idx->stats->MO, sizeof(Uint), 1024, fp);
+    }
+
+    fread(&idx->stats->s_N, sizeof(Uint), 1, fp);
+    NFO("reading %d scores from index.\n", idx->stats->s_N);
+    idx->stats->s = ALLOCMEMORY(space, NULL, double, idx->stats->s_N);
+    fread(idx->stats->s, sizeof(double), idx->stats->s_N, fp);
+
+  }
+
+
+  fclose(fp);
+  return idx ;
+}
+
+/*-------------------------- bl_matchfileWriteIndex --------------------------
+ *    
+ * @brief write the index (including statistics) to disk. The file starts
+ *        with one flag byte describing which optional sections follow;
+ *        the order of the fwrite calls below defines the file layout.
+ *        Exits with -1 if the file cannot be opened.
+ * @param idx      index to be written
+ * @param filename name of the index file (created or truncated)
+ * @author Steve Hoffmann 
+ *   
+ * NOTE(review): the return values of fwrite/fclose are not checked.
+ */
+
+void bl_matchfileWriteIndex(matchfileindex_t *idx, char *filename) {
+
+ FILE *fp;
+ unsigned char flags = 0;
+ Uint i, j, nchrom, nreads, exp, len;
+ matchfileBin_t *bin;
+
+ fp = fopen(filename, "w");
+ if (fp == NULL) {
+ DBG("Couldn't open file %s. Exit forced.\n", filename);
+ exit(-1);
+ }
+
+ /* derive the flag byte from the members that are present */
+ if(idx->gzindex) {
+ flags |= GZ_IDX_STORED;
+ }
+
+ if(idx->stats) {
+ flags |= STATS_TAB_STORED;
+ /* pxx is always written together with the stats (see below) */
+ flags |= PXX_STORED;
+ if(idx->stats->gev_mu) {
+ flags |= STATS_GEV_STORED;
+ }
+ if(idx->stats->MO) {
+ flags |= MOTIF_STORED;
+ }
+ if(idx->stats->b) {
+ flags |= STRAND_STORED;
+ }
+ }
+
+ /* NOTE(review): the flag is set, but no md5 data is written by this
+ * function */
+ if(idx->md5) {
+ flags |= IDXMD5_STORED;
+ }
+
+ fwrite(&flags, sizeof(unsigned char), 1, fp);
+
+ /* optional section: gzip access points */
+ if(idx->gzindex) {
+ fwrite(&idx->gzindex->have, sizeof(int), 1, fp);
+ fwrite(&idx->gzindex->size, sizeof(int), 1, fp);
+
+ struct point *list = idx->gzindex->list;
+ for(i=0; i < idx->gzindex->size; i++) {
+
+ fwrite(&list[i].out, sizeof(off_t), 1, fp);
+ fwrite(&list[i].in, sizeof(off_t), 1, fp);
+ fwrite(&list[i].bits, sizeof(int), 1, fp);
+ fwrite(list[i].window, sizeof(unsigned char), WINSIZE, fp);
+ }
+ }
+
+ nchrom = idx->noofchroms;
+ nreads = idx->noofreads;
+ exp = idx->exp;
+
+ fwrite(&exp, sizeof(Uint), 1, fp);
+ fwrite(&nchrom, sizeof(Uint), 1, fp);
+ fwrite(&nreads, sizeof(Uint), 1, fp);
+
+ /* chromosome names: length followed by the characters, no terminator */
+ for(i=0; i < nchrom; i++) {
+ len = strlen(idx->chromnames[i]);
+ fwrite(&len, sizeof(Uint), 1, fp);
+ fwrite(idx->chromnames[i], sizeof(char), len, fp);
+ }
+
+ fwrite(idx->matchstart, sizeof(Uint), nchrom, fp);
+ fwrite(idx->matchend, sizeof(Uint), nchrom, fp);
+ fwrite(idx->noofbins, sizeof(Uint), nchrom, fp);
+
+ /* bins are written member by member instead of as raw structs */
+ for(i=0; i < nchrom; i++) {
+
+ //fwrite(idx->bins[i], sizeof(matchfileBin_t), idx->noofbins[i], fp);
+ for(j=0; j < idx->noofbins[i]; j++) {
+ bin = idx->bins[i];
+ fwrite(&bin[j].start, sizeof(Uint), 1, fp);
+ fwrite(&bin[j].end, sizeof(Uint), 1, fp);
+ fwrite(&bin[j].matches, sizeof(Uint), 1, fp);
+ fwrite(&bin[j].offset, sizeof(off_t), 1, fp);
+ fwrite(&bin[j].endoff, sizeof(off_t), 1, fp);
+ }
+ }
+
+ fwrite(&idx->maxreadlen, sizeof(Uint), 1, fp);
+ fwrite(idx->submatrix, sizeof(double), (6*6*QRNGE*MAXREADLENGTH), fp);
+ fwrite(&idx->mean_coverage, sizeof(double), 1, fp);
+ fwrite(&idx->mean_qual, sizeof(double), 1, fp);
+
+ fwrite(idx->P_ERR, sizeof(Uint), MAXREADLENGTH, fp);
+ fwrite(idx->Q_ERR, sizeof(Uint), QRNGE, fp);
+ fwrite(idx->Q_N, sizeof(Uint), QRNGE, fp);
+
+ /* optional section: sample statistics */
+ if(idx->stats) {
+
+ fwrite(&idx->stats->n, sizeof(Uint), 1, fp);
+ fwrite(&idx->stats->px, sizeof(double), 1, fp);
+ fwrite(&idx->stats->pxx, sizeof(double), 1, fp);
+ fwrite(&idx->stats->maxcover, sizeof(Uint), 1, fp);
+ fwrite(&idx->stats->mincover, sizeof(Uint), 1, fp);
+
+ /*errordensity*/
+ NFO("writing %d e-samples to index.\n", idx->stats->e_N);
+ fwrite(&idx->stats->e_N, sizeof(Uint), 1, fp);
+
+ fwrite(&idx->stats->entropydensitystep, sizeof(double), 1, fp);
+ fwrite(&idx->stats->entropydensitylen, sizeof(Uint), 1, fp);
+
+ fwrite(idx->stats->entropydensity, sizeof(double), idx->stats->entropydensitylen, fp);
+ fwrite(idx->stats->entropy, sizeof(double), idx->stats->e_N, fp);
+
+ /* written only if present, announced by STRAND_STORED */
+ if(idx->stats->b) {
+ fwrite(idx->stats->b, sizeof(double), idx->stats->e_N, fp);
+ fwrite(&idx->stats->b_mu, sizeof(double), 1, fp);
+ fwrite(&idx->stats->b_sd, sizeof(double), 1, fp);
+ fwrite(&idx->stats->b_ll, sizeof(double), 1, fp);
+ }
+
+ fwrite(idx->stats->eraw, sizeof(double), idx->stats->e_N, fp);
+ fwrite(idx->stats->e, sizeof(double), idx->stats->e_N, fp);
+ fwrite(&idx->stats->e_mu[0], sizeof(double), 1, fp);
+ fwrite(&idx->stats->e_mu[1], sizeof(double), 1, fp);
+ fwrite(&idx->stats->e_sd[0], sizeof(double), 1, fp);
+ fwrite(&idx->stats->e_sd[1], sizeof(double), 1, fp);
+ fwrite(&idx->stats->e_ll, sizeof(double), 1, fp);
+
+ /* written only if present, announced by STATS_GEV_STORED */
+ if(idx->stats->gev_mu) {
+
+ fwrite(&idx->stats->gev_mu[0], sizeof(double), 1, fp);
+ fwrite(&idx->stats->gev_mu[1], sizeof(double), 1, fp);
+ fwrite(&idx->stats->gev_si[0], sizeof(double), 1, fp);
+ fwrite(&idx->stats->gev_si[1], sizeof(double), 1, fp);
+ fwrite(&idx->stats->gev_xi[0], sizeof(double), 1, fp);
+ fwrite(&idx->stats->gev_xi[1], sizeof(double), 1, fp);
+ fwrite(&idx->stats->gev_ll[0], sizeof(double), 1, fp);
+ fwrite(&idx->stats->gev_ll[1], sizeof(double), 1, fp);
+
+ }
+
+ fwrite(&idx->stats->P, sizeof(Uint), 1, fp);
+ fwrite(&idx->stats->X, sizeof(Uint), 1, fp);
+ fwrite(&idx->stats->N, sizeof(Uint), 1, fp);
+
+ fwrite(&idx->stats->RR_N, sizeof(Uint), 1, fp);
+ fwrite(idx->stats->RR, sizeof(Uint), 11, fp);
+ fwrite(&idx->stats->MM_N, sizeof(Uint), 1, fp);
+ fwrite(idx->stats->MM, sizeof(Uint), 51, fp);
+
+ fwrite(&idx->stats->areasize, sizeof(Uint), 1, fp);
+ fwrite(&idx->stats->maxareae, sizeof(double), 1, fp);
+
+ /*substitution*/
+ fwrite(idx->stats->S_N, sizeof(Uint), 6, fp);
+ fwrite(idx->stats->S, sizeof(double), 6*6, fp);
+ fwrite(idx->stats->Sx_N, sizeof(Uint), 6, fp);
+ fwrite(idx->stats->Sx, sizeof(double), 6*6, fp);
+
+ /*noise*/
+ fwrite(idx->stats->R_N, sizeof(Uint), 100*255, fp);
+ fwrite(idx->stats->R, sizeof(Uint), 100*255, fp);
+ fwrite(idx->stats->RP_N, sizeof(Uint), 100, fp);
+ fwrite(idx->stats->RP, sizeof(Uint), 100, fp);
+ fwrite(idx->stats->RQ_N, sizeof(Uint), 255, fp);
+ fwrite(idx->stats->RQ, sizeof(Uint), 255, fp);
+
+ /*readvariance*/
+ fwrite(&idx->stats->V_N, sizeof(Uint), 1, fp);
+ fwrite(idx->stats->V, sizeof(double), idx->stats->V_N, fp);
+ fwrite(&idx->stats->V_mu, sizeof(double), 1, fp);
+ fwrite(&idx->stats->V_sd, sizeof(double), 1, fp);
+ fwrite(&idx->stats->V_ll, sizeof(double), 1, fp);
+
+ fwrite(&idx->stats->Vx_N, sizeof(Uint), 1, fp);
+ fwrite(idx->stats->Vx, sizeof(double), idx->stats->Vx_N, fp);
+ fwrite(&idx->stats->Vx_mu, sizeof(double), 1, fp);
+ fwrite(&idx->stats->Vx_sd, sizeof(double), 1, fp);
+ fwrite(&idx->stats->Vx_ll, sizeof(double), 1, fp);
+
+ /*motif*/
+ /* written only if present, announced by MOTIF_STORED */
+ if(idx->stats->MO) {
+ fwrite(idx->stats->MO_N, sizeof(Uint), 1024, fp);
+ fwrite(idx->stats->MO, sizeof(Uint), 1024, fp);
+ }
+
+ NFO("writing %d scores to index.\n", idx->stats->s_N);
+ fwrite(&idx->stats->s_N, sizeof(Uint), 1, fp);
+ fwrite(idx->stats->s, sizeof(double), idx->stats->s_N, fp);
+
+ }
+
+ fclose(fp);
+ return;
+}
+
+/*---------------------------- bl_compareIndices -----------------------------
+ *    
+ * @brief compare two indices member by member
+ * @param i1 first index
+ * @param i2 second index
+ * @return 0 if no difference was found, otherwise a positive code that
+ *         identifies the first differing member (some codes, e.g. 24, 42
+ *         and 43, are used for more than one member)
+ *   
+ * NOTE(review): if neither index carries a gzip index the function
+ * returns 0 right away without comparing anything else. This is kept
+ * as found - confirm whether it is intended.
+ */
+
+Uint
+bl_compareIndices(matchfileindex_t *i1, matchfileindex_t *i2){
+
+  Uint i, j;
+
+  if(i1->gzindex == NULL && i2->gzindex == NULL) return 0;
+
+  /* only one side has a gzip index: different, and must not be
+   * dereferenced below */
+  if(i1->gzindex == NULL || i2->gzindex == NULL) return 1;
+
+  if(i1->gzindex->have != i2->gzindex->have) return 1;
+  if(i1->gzindex->size != i2->gzindex->size) return 2;
+
+  /* access points: compare i1 against i2 */
+  for(i=0; i < i1->gzindex->size; i++) {
+    if(i1->gzindex->list[i].out != i2->gzindex->list[i].out) return 3;
+    if(i1->gzindex->list[i].in != i2->gzindex->list[i].in) return 4;
+    if(i1->gzindex->list[i].bits != i2->gzindex->list[i].bits) return 5;
+
+    for(j=0; j < WINSIZE; j++) {
+      if(i1->gzindex->list[i].window[j] != 
+          i2->gzindex->list[i].window[j]) return 6;
+    }
+  }
+
+  if(i1->exp != i2->exp) return 7;
+  if(i1->noofchroms != i2->noofchroms) return 8;
+  if(i1->noofreads != i2->noofreads) return 9;
+
+  for(i=0; i < i1->noofchroms; i++) {
+    if(strcmp(i1->chromnames[i], i2->chromnames[i]) != 0) return 10;
+    if(i1->matchstart[i] != i2->matchstart[i]) return 11;
+    if(i1->matchend[i] != i2->matchend[i]) return 12;
+    if(i1->noofbins[i] != i2->noofbins[i]) return 13;
+
+    for(j=0; j < i1->noofbins[i]; j++) {
+      if(i1->bins[i][j].start != i2->bins[i][j].start) return 14;
+      if(i1->bins[i][j].end != i2->bins[i][j].end) return 15;
+      if(i1->bins[i][j].matches != i2->bins[i][j].matches) return 16;
+      if(i1->bins[i][j].offset != i2->bins[i][j].offset) return 17;
+      if(i1->bins[i][j].endoff != i2->bins[i][j].endoff) return 18;
+    }
+  }
+
+
+  if(i1->maxreadlen != i2->maxreadlen) return 19;
+  if(i1->mean_coverage != i2->mean_coverage) return 20;
+  if(i1->mean_qual != i2->mean_qual) {
+    return 21;
+  }
+
+  for(i=0; i < MAXREADLENGTH; i++) {
+    if(i1->P_ERR[i] != i2->P_ERR[i]) return 22;
+  }
+
+  for(i=0; i < QRNGE; i++) {
+    if(i1->Q_ERR[i] != i2->Q_ERR[i]) return 23;
+    if(i1->Q_N[i] != i2->Q_N[i]) return 24;
+  }
+
+  for(i=0; i < QRNGE*MAXREADLENGTH*6*6; i++) {
+    if(i1->submatrix[i] != i2->submatrix[i]) return 24;
+  }
+
+
+  /* statistics are only compared if both indices have them */
+  if(i1->stats && i2->stats) {
+
+    if(i1->stats->n != i2->stats->n) return 25;
+    if(i1->stats->px != i2->stats->px) return 26;
+    if(i1->stats->maxcover != i2->stats->maxcover) return 27;
+    if(i1->stats->mincover != i2->stats->mincover) return 28;
+    if(i1->stats->e_N != i2->stats->e_N) return 29;
+
+    for(i=0; i < i1->stats->e_N; i++) {
+      if(i1->stats->e[i] != i2->stats->e[i]) return 30;
+    }
+
+    if(i1->stats->e_mu[0] != i2->stats->e_mu[0]) return 31;
+    if(i1->stats->e_mu[1] != i2->stats->e_mu[1]) return 32;
+    if(i1->stats->e_sd[0] != i2->stats->e_sd[0]) return 33;
+    if(i1->stats->e_sd[1] != i2->stats->e_sd[1]) return 34;
+
+    if(i1->stats->e_ll != i2->stats->e_ll) return 35;
+    if(i1->stats->areasize != i2->stats->areasize) return 36;
+    if(i1->stats->maxareae != i2->stats->maxareae) return 37;
+
+    for(i=0; i < 6*6; i++) {
+      if(i1->stats->S[i] != i2->stats->S[i]) return 38;
+      if(i1->stats->Sx[i] != i2->stats->Sx[i]) return 39;
+    }
+
+    for(i=0; i < 6; i++) {
+      if(i1->stats->S_N[i] != i2->stats->S_N[i]) return 40;
+      if(i1->stats->Sx_N[i] != i2->stats->Sx_N[i]) return 41;
+    }
+
+    for(i=0; i < 100*255; i++) {
+      if(i1->stats->R_N[i] != i2->stats->R_N[i]) return 42;
+      if(i1->stats->R[i] != i2->stats->R[i]) return 43;
+    }
+
+    for(i=0; i < 100; i++) {
+      if(i1->stats->RP_N[i] != i2->stats->RP_N[i]) return 42;
+      if(i1->stats->RP[i] != i2->stats->RP[i]) return 43;
+    }
+
+    for(i=0; i < 255; i++) {
+      if(i1->stats->RQ_N[i] != i2->stats->RQ_N[i]) return 42;
+      if(i1->stats->RQ[i] != i2->stats->RQ[i]) return 43;
+    }
+
+
+    if (i1->stats->V_N != i2->stats->V_N) return 45;
+
+    for(i=0; i < i1->stats->V_N; i++) {
+      if(i1->stats->V[i] != i2->stats->V[i]) return 46;
+    }
+
+    if (i1->stats->V_mu != i2->stats->V_mu) return 47;
+    if (i1->stats->V_sd != i2->stats->V_sd) return 48;
+    if (i1->stats->V_ll != i2->stats->V_ll) return 49;
+
+    if (i1->stats->Vx_N != i2->stats->Vx_N) return 50;
+
+    for(i=0; i < i1->stats->Vx_N; i++) {
+      if(i1->stats->Vx[i] != i2->stats->Vx[i]) return 51;
+    }
+
+    if (i1->stats->Vx_mu != i2->stats->Vx_mu) {
+      fprintf(stderr, "vx_mu :%f!=%f:vx_mu\n", i1->stats->Vx_mu, i2->stats->Vx_mu);
+      fprintf(stderr, "vx_sd :%f!=%f:vx_sd\n", i1->stats->Vx_sd, i2->stats->Vx_sd);
+      fprintf(stderr, "vx_ll :%f!=%f:vx_ll\n", i1->stats->Vx_ll, i2->stats->Vx_ll);
+      return 52;
+    }
+    if (i1->stats->Vx_sd != i2->stats->Vx_sd) return 53;
+    if (i1->stats->Vx_ll != i2->stats->Vx_ll) return 54;
+
+  }
+
+  return 0;
+}
+
+/*------------------------ bl_matchfileDumpFileStats -------------------------
+ *    
+ * @brief dump the match file stats to stderr: the quality values, the
+ *        fraction of observations per quality, the correct and erroneous
+ *        counts per quality and the error rate per read position
+ * @param space not used
+ * @param file  match file whose index holds Q_N, Q_ERR and P_ERR
+ * @author Steve Hoffmann 
+ *   
+ */
+
+void
+bl_matchfileDumpFileStats (void *space, matchfile_t *file)
+{
+
+  Uint q, p, total = 0;
+  Uint *qcount = file->index->Q_N;
+  Uint *qerr = file->index->Q_ERR;
+  Uint *perr = file->index->P_ERR;
+
+  /* overall number of quality observations, denominator of the Q_N row */
+  for(q=0; q < QRNGE; q++) {
+    total += qcount[q];
+  }
+
+  fprintf(stderr, "Q " );
+  for(q=0; q < QRNGE; q++){
+    fprintf(stderr, "%d\t\t\t", MINQUAL+q);
+  }
+
+  fprintf(stderr, "Q_N \t");
+  for(q=0; q < QRNGE; q++) {
+    fprintf(stderr, "%.1f\t\t\t", (double)qcount[q]/total);
+  }
+
+  fprintf(stderr, "\nQ_COR\t");
+  for(q=0; q < QRNGE; q++) {
+    fprintf(stderr, "%d (%.3f)\t\t\t", qcount[q]-qerr[q], 
+        (double)((double)qcount[q]-qerr[q])/qcount[q]);
+  }
+
+  fprintf(stderr, "\nQ_ERR\t");
+  for(q=0; q < QRNGE; q++) {
+    fprintf(stderr, "%d (%.3f)\t\t\t", qerr[q], (double)qerr[q]/qcount[q]);
+  }
+
+  fprintf(stderr, "\n\nP_ERR\t");
+  for(p=0; p < MAXREADLENGTH; p++) {
+    fprintf(stderr, "%.6f\t\t\t", (double)perr[p]/file->index->noofreads);
+  }
+
+  fprintf(stderr,"\n");
+}
+
+
+/*----------------------- bl_matchfileDumpCrossSection -----------------------
+ *    
+ * @brief print one x-section to stdout as a tab separated line:
+ *        consensus character, number of entries, characters, qualities
+ * @param cs  the x-section
+ * @param pos not used
+ * @author Steve Hoffmann 
+ *   
+ */
+
+void
+bl_matchfileDumpCrossSection (matchfileCross_t *cs, Uint pos)
+{
+  const char *bases = cs->chars;
+  const char *quals = cs->quals;
+
+  (void) pos;
+
+  printf("%c\t", cs->cons);
+  printf("%d\t", cs->len);
+  printf("%s\t", bases);
+  printf("%s\t", quals);
+  printf("\n");
+}
+
+/*------------------------- bl_matchfileAdjustBounds -------------------------
+ *    
+ * @brief check and adjust frame bounds if violated. The requested frame
+ *        [start, start+width] is checked against the end of the last bin
+ *        of chromosome matid and, if a reference sequence is available,
+ *        against the length of sequence setid. The reference length takes
+ *        precedence for the start position.
+ * @param space    not used
+ * @param file     indexed match file
+ * @param set      reference sequences, may be NULL
+ * @param setid    index of the chromosome in set
+ * @param matid    index of the chromosome in the match file index
+ * @param start    requested start
+ * @param width    requested width
+ * @param newstart receives the adjusted start
+ * @param newwidth receives the adjusted width
+ * @return the reference length if available, otherwise the end of the
+ *         last bin (0 if neither is known)
+ * @author Steve Hoffmann 
+ *   
+ */
+
+
+Uint
+bl_matchfileAdjustBounds(void *space, matchfile_t *file, fasta_t *set, 
+    Uint setid, Uint matid, Uint start, Uint width, 
+    Uint *newstart, Uint *newwidth) {
+
+  Uint binend = 0;  /* end of the last bin, 0 if there is none */
+  Uint seqlen = 0;  /* length of the reference, 0 if there is none */
+  Uint lastbin;
+
+  *newstart = start;
+  *newwidth = width;
+
+  if(set != NULL && setid < set->noofseqs) {
+    seqlen = bl_fastaGetSequenceLength(set, setid);
+  }
+
+  if(matid < file->index->noofchroms && 
+      file->index->noofbins[matid] != 0 && file->index->bins[matid] != NULL) {
+    lastbin = file->index->noofbins[matid]-1;
+    binend = file->index->bins[matid][lastbin].end;
+  }
+
+  /* first pass: clip against the matches */
+  if(binend != 0 && start+width > binend) {
+    if(binend > width) {
+      *newstart = binend-width;
+    } else { 
+      *newstart = 1;
+      *newwidth = binend;
+    }
+  }
+
+  if(seqlen == 0) {
+    return binend;
+  }
+
+  /* second pass: the reference overrides the start found above */
+  *newstart = start;
+  if(start+width > seqlen) {
+    if(seqlen > width) {
+      *newstart = seqlen-width;
+    } else {
+      *newstart = 1;
+      *newwidth = seqlen;
+    }
+  }
+
+  return seqlen;
+}
+
+/*--------------------------- bl_matchfileGetFrame ---------------------------
+ *    
+ * @brief get a frame of specified width from indexed matchfile
+ *        ATTENTION: start is 1-offset
+ * @param space    handle passed through to the allocation macros
+ * @param file     indexed match file
+ * @param chrname  name of the chromosome
+ * @param start    requested start (1-offset)
+ * @param width    requested width
+ * @param set      reference sequences, may be NULL; without a reference
+ *                 ref and chrseq of the frame stay NULL
+ * @param maxcover passed on to bl_matchfileRead
+ * @param inputcs  passed on to bl_matchfileRead
+ * @return newly allocated frame with bounds adjusted by
+ *         bl_matchfileAdjustBounds
+ * @author Steve Hoffmann 
+ *   
+ */
+
+matchfileFrame_t *
+bl_matchfileGetFrame(void *space, matchfile_t *file, char *chrname, 
+    Uint start, Uint width, fasta_t *set, Uint maxcover, matchfileCross_t *inputcs) {
+
+  Uint i=0, k=0, len, adjstart=0, adjwidth=0;
+  matchfileFrame_t *frame;
+
+  frame = ALLOCMEMORY(space, NULL, matchfileFrame_t, 1);
+  frame->chrname = chrname;
+  frame->start = start;
+  frame->width = width;
+  frame->ref = NULL;
+  frame->chrseq = NULL;
+
+  k = bl_matchfileGetChromIndexNumber(file->index, chrname);
+
+
+  if (set) i = bl_fastxFindIDIdx(chrname, set);
+
+
+  len = bl_matchfileAdjustBounds(space, file, set, i, k, start, width, 
+      &adjstart, &adjwidth);
+
+
+  frame->start = adjstart;
+  frame->width = adjwidth;
+  frame->chrlen = len;
+
+#ifdef DBGIDX
+  DBG("frame [%d,%d] (chrlen:%d)\n", frame->start, 
+      frame->start+frame->width, frame->chrlen);
+#endif
+
+  /* set is optional (see above): do not dereference a NULL set */
+  if (set && i < set->noofseqs){
+    frame->chrseq = bl_fastaGetSequence(set, i);
+    /* adjstart is 1-offset, the sequence is 0-offset */
+    frame->ref = &bl_fastaGetSequence(set, i)[adjstart-1];
+    frame->chrname = bl_fastaGetDescription(set, i);
+  }
+
+  frame->cs = bl_matchfileRead(space, file, chrname, frame->start, 
+      frame->start+adjwidth-1, maxcover, set, 255, inputcs);
+
+
+  return frame;
+}
+
+
+/*------------------------- bl_matchfileMergeFrames --------------------------
+ *    
+ * @brief merge two frames, ie. attach frame g to frame f. The x-sections
+ *        of g are copied bytewise behind those of f, afterwards f->width
+ *        covers both. Both frames must carry the same chromosome name.
+ * @param space not used
+ * @param f     frame that is extended
+ * @param g     frame that is attached; g itself is not modified
+ * @author Steve Hoffmann 
+ *   
+ */
+
+void
+bl_matchfileMergeFrames (void *space, matchfileFrame_t* f, matchfileFrame_t *g)
+{
+  matchfileCross_t *tail;
+
+  assert(strcmp(f->chrname, g->chrname) == 0);
+
+  /* grow f's array so that it can take the x-sections of g as well */
+  f->cs = ALLOCMEMORY(NULL, f->cs, matchfileCross_t, f->width+g->width);
+
+  tail = f->cs + f->width;
+  memmove(tail, g->cs, g->width * sizeof(matchfileCross_t));
+
+  f->width = f->width + g->width;
+}
+
+/*---------------------------- bl_writeexpression ----------------------------
+ *    
+ * @brief write the strand specific coverage of a match file as wiggle
+ *        tracks. The (+) and (-) tracks are first written to two
+ *        temporary files and then concatenated (minus first) into
+ *        filename; the temporary files are removed afterwards. Exits
+ *        with -1 if a chromosome is not found in set or if one of the
+ *        files cannot be opened.
+ * @param space       handle passed through to the allocation macros
+ * @param filename    name of the output file
+ * @param filenamelen not used
+ * @param file        indexed match file
+ * @param set         reference sequences
+ * @param width       number of positions read per interval
+ * @param maxcover    passed on to bl_matchfileRead
+ * @return 0
+ * @author Steve Hoffmann 
+ *   
+ * NOTE(review): the string returned by bl_basename is not released here -
+ * confirm whether the caller of bl_basename owns it.
+ */
+
+Uint
+bl_writeexpression (void *space, char *filename, Uint filenamelen, 
+    matchfile_t *file, fasta_t *set, Uint width, Uint maxcover)
+{
+
+  char *chromname, *minusfilename, *plusfilename, *basename, *buffer;
+  FILE *plus, *minus, *outfile;
+  size_t buffersize = 1024, len;
+  matchfileCross_t *cs;
+  char mheader, pheader;
+  Uint i, j, k, s, idx, mstrands, pstrands, basenamelen;
+
+
+  basename = bl_basename(filename);
+  basenamelen = strlen(basename);
+
+  minusfilename = bl_getTempFile(basename, basenamelen);
+  plusfilename = bl_getTempFile(basename, basenamelen);
+
+  minus = fopen(minusfilename, "w");
+  if(minus == NULL) {
+    NFO("Couldn't open file '%s'. Exit forced!\n", minusfilename);
+    exit(-1);
+  }
+
+  plus = fopen(plusfilename, "w");
+  if(plus == NULL) {
+    NFO("Couldn't open file '%s'. Exit forced!\n", plusfilename);
+    exit(-1);
+  }
+
+
+  for(k=0; k < file->index->noofchroms; k++) { 
+    chromname = file->index->chromnames[k];
+    /* the track header is written once per chromosome and strand */
+    mheader = 0;
+    pheader = 0;
+
+    for(j=file->index->matchstart[k]; 
+        j < file->index->matchend[k]; j+= width) {
+
+      idx = bl_fastxFindIDIdx(chromname, set);
+
+      if (idx < set->noofseqs){  
+        chromname = bl_fastaGetDescription(set, idx);
+      } else {
+        NFO("the chromsome %s was not found in the db. Exit forced!\n", chromname);
+        exit(-1);
+      }
+
+      NFO("reading interval [%d,%d] on '%s'\n", j, j+width-1, chromname);
+
+      cs = bl_matchfileRead(space, file, chromname, 
+          j, j+width-1, maxcover, set, 0, NULL);
+
+      NFO("dumping interval [%d,%d] to device\n", j, j+width-1);
+
+      for(i=0; i < width; i++) {
+        /* count the reads covering position j+i per strand */
+        mstrands = 0;
+        pstrands = 0;
+        for(s=0; s < cs[i].len; s++) { 
+          if (cs[i].strands[s] == '+') 
+            pstrands++;
+          else 
+            mstrands++;
+        }
+        assert(mstrands+pstrands == cs[i].len);
+        if(pstrands) {
+          if(!pheader) {
+            fprintf(plus, "track type=wiggle_0 name=\"%s_(+)\" description=\"%s exp (+)\"\n", 
+                basename, basename);
+            fprintf(plus, "variableStep chrom=%s\n", chromname);
+          }
+          fprintf(plus, "%d\t%d\n", j+i, pstrands);
+          pheader = 1;
+        }
+
+        if(mstrands){ 
+          if(!mheader) {
+            fprintf(minus,"track type=wiggle_0 name=\"%s_(-)\" description=\"%s exp (-)\"\n",  
+                basename, basename);
+            fprintf(minus, "variableStep chrom=%s\n", chromname);
+          }
+          fprintf(minus, "%d\t%d\n", j+i, mstrands);
+          mheader = 1;
+        }
+      }
+
+      bl_matchfileDestructCross(space, cs, width);
+      FREEMEMORY(space, cs);
+    }
+  }
+
+  fclose(minus);
+  fclose(plus);
+
+  /* concatenate both temporary files into the output file */
+  minus = fopen(minusfilename, "rb");
+  if(minus == NULL) {
+    NFO("Couldn't open file '%s'. Exit forced!\n", minusfilename);
+    exit(-1);
+  }
+
+  outfile = fopen(filename, "wb");
+  if(outfile == NULL) {
+    NFO("Couldn't open file '%s'. Exit forced!\n", filename);
+    exit(-1);
+  }
+
+  buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+
+  while((len = fread(buffer, 1, buffersize, minus)) > 0) {
+    fwrite(buffer, 1, len, outfile);
+  }
+
+  fclose(minus);
+
+  plus = fopen(plusfilename, "rb");
+  if(plus == NULL) {
+    NFO("Couldn't open file '%s'. Exit forced!\n", plusfilename);
+    exit(-1);
+  }
+
+  while((len = fread(buffer, 1, buffersize, plus)) > 0) {
+    fwrite(buffer, 1, len, outfile);
+  }
+
+  fclose(plus);
+  fclose(outfile);
+  bl_rm(space, minusfilename);
+  bl_rm(space, plusfilename);
+
+  FREEMEMORY(space, buffer);
+  FREEMEMORY(space, minusfilename);
+  FREEMEMORY(space, plusfilename);
+
+  return 0;
+}
+
+
+/*----------------------------- bl_writeVars2vix -----------------------------
+ *    
+ * @brief write variants to internal format. Placeholder: the function
+ *        has no body yet and does nothing.
+ * @author Steve Hoffmann 
+ *   
+ */
+
+void
+bl_writeVars2vix ( )
+{
+  /* intentionally empty */
+}
+
diff --git a/segemehl/libs/matchfiles.h b/segemehl/libs/matchfiles.h
new file mode 100644
index 0000000..04f33b5
--- /dev/null
+++ b/segemehl/libs/matchfiles.h
@@ -0,0 +1,475 @@
+#ifndef MATCHFILES_H
+#define MATCHFILES_H
+
+/*
+ *
+ * matchfiles.h
+ *
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 06.10.2010 01:10:47 CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include "zran.h"
+#include "biofiles.h"
+#include "basic-types.h"
+#include "mathematics.h"
+
+#define SAM 0 /* fmt value handled by the accessors in matchfilesfields.c */
+#define SEG 1 /* NOTE(review): presumably segemehl's native format - confirm */
+#define MAXREADLENGTH 1500
+#define MINQUAL 33 /* NOTE(review): presumably lowest quality char code - confirm */
+#define MAXQUAL 104 /* NOTE(review): presumably highest quality char code - confirm */
+#define QRNGE (MAXQUAL-MINQUAL+1) /* number of values in [MINQUAL, MAXQUAL] */
+
+#define GZ_IDX_STORED ((unsigned char) (1 << 0)) /* the *_STORED macros are distinct single-bit flags (bits 0..6) */
+#define STATS_TAB_STORED ((unsigned char) (1 << 1))
+#define IDXMD5_STORED ((unsigned char ) (1 << 2))
+#define STATS_GEV_STORED ((unsigned char ) (1 << 3))
+#define MOTIF_STORED ((unsigned char ) (1 << 4))
+#define STRAND_STORED ((unsigned char ) (1 << 5))
+#define PXX_STORED ((unsigned char ) (1 << 6))
+
+#define MFREAD_FEAT ((unsigned char) (1 << 0)) /* the MFREAD_* macros are distinct single-bit flags (bits 0..7) */
+#define MFREAD_QUAL ((unsigned char) (1 << 1)) /* bl_matchfileGetMatchFileRec reads the quality only if set */
+#define MFREAD_DELS ((unsigned char) (1 << 2))
+#define MFREAD_SPLITS ((unsigned char) (1 << 3)) /* bl_matchfileGetMatchFileRec reads the split members only if set */
+#define MFREAD_RLEN ((unsigned char) (1 << 4))
+#define MFREAD_MCNT ((unsigned char) (1 << 5)) /* bl_matchfileGetMatchFileRec reads the match count only if set */
+#define MFREAD_ROW ((unsigned char) (1 << 6))
+#define MFREAD_RPOS ((unsigned char) (1 << 7))
+
+
+typedef struct { /* one deletion entry; matchfileCross_t keeps these in its dels member */
+ Uint len; /* NOTE(review): presumably the length of string/quals - confirm */
+ char *string;
+ char *quals;
+ Uint row;
+ Uint edist;
+ Uint readpos;
+ Uint curstart;
+ Uint matchcnt;
+ Uint bisulfite;
+} matchfileDeletion_t;
+
+
+typedef struct { /* one split entry; matchfileCross_t keeps these in its splits member */
+ Uint spliceid;
+ char edgetype;
+ char strand; /* strand of the exonedge */
+ char trans;
+ Uint xno;
+ Uint xstart; /* start of the split in the read */
+ Uint xend; /* end of the split in the read */
+ Uint edgechridx;
+ Uint edge;
+ Uint adjoint;
+} matchfileSplit_t;
+
+typedef struct { /* one mate link entry; matchfileCross_t keeps these in its matelinks member */
+ Uint refidx;
+ Uint refpos;
+ Uint noofmates;
+} matelink_t;
+
+typedef struct { /* cross section; bl_matchfileRead returns an array that bl_writeexpression indexes by offset from the interval start */
+ Uint len; /* bl_writeexpression uses this as loop bound over strands[] */
+ char ref;
+ char cons;
+ char *chars;
+ char *quals;
+ char *strands; /* bl_writeexpression counts '+' entries as plus strand, all others as minus */
+ char *feat;
+ uint32_t *readpos;
+ uint32_t *readlen;
+ uint32_t *row;
+ uint32_t *matchcnt;
+ uint32_t starts;
+ uint32_t ends;
+ Uint noofmatelinks; /* NOTE(review): presumably the number of entries in matelinks - confirm */
+ matelink_t *matelinks;
+ Uint maxrow;
+ uint32_t noofdels; /* NOTE(review): presumably the number of entries in dels - confirm */
+ matchfileDeletion_t *dels;
+ uint32_t noofsplits; /* NOTE(review): presumably the number of entries in splits - confirm */
+ matchfileSplit_t *splits;
+ unsigned char *edist;
+ uint32_t *bisulfite;
+ double s_ref;
+ double s_refx;
+ double p_ref;
+ double p_refx;
+ double s_cons;
+ double s_consx;
+ double p_cons;
+ double p_consx;
+ double p_hom;
+ double entropy;
+ double longentropy;
+ double pentropy;
+ double ee;
+ double scr_ref;
+ double scr_cons;
+ double scr_sample;
+ double diff_rt;
+ double diff_rq;
+ double diff_rr;
+ double diff_mm;
+ double pee;
+ double pbinom;
+ Uint secondminimum;
+ Uint secondcnt;
+} matchfileCross_t;
+
+typedef struct { /* frame type returned by bl_matchfileGetFrame */
+ Uint start;
+ Uint width;
+ Uint maxheight;
+ Uint chrlen;
+ char *chrname;
+ char *chrseq;
+ char *ref;
+ matchfileCross_t *cs; /* NOTE(review): presumably width cross sections starting at start - confirm */
+} matchfileFrame_t;
+
+
+typedef struct { /* sample statistics; matchfileindex_t refers to one via its stats member */
+ Uint n;
+ double px;
+ double pxx;
+ Uint maxcover;
+ Uint mincover;
+ double minfrac;
+ char entropyfilter;
+ /*errordensity*/
+ Uint e_N;
+ double *eraw;
+ double *entropy;
+ double *b;
+ double b_mu;
+ double b_sd;
+ double b_ll;
+ double *e;
+ double *e_mu;
+ double *e_sd;
+ double e_ll;
+ double *gev_mu;
+ double *gev_si;
+ double *gev_xi;
+ double *gev_ll;
+ Uint P;
+ Uint X;
+ Uint N;
+ /*readerror at non variant positions: 0,1,2,3,4,5,>5*/
+ Uint RR_N;
+ Uint *RR;
+ /*multiple mapping at non variant positions: 0,1,2,3,4,5,>5*/
+ Uint MM_N;
+ Uint *MM;
+ /*norm area*/
+ Uint areasize;
+ double maxareae;
+ /*substitution*/
+ Uint *S_N;
+ double *S;
+ Uint *Sx_N;
+ double *Sx;
+ /*noise*/
+ Uint *R_N;
+ Uint *R;
+ Uint *RP_N;
+ Uint *RP;
+ Uint *RQ_N;
+ Uint *RQ;
+ Uint *MO_N;
+ Uint *MO;
+ /*readvariance*/
+ Uint V_N;
+ double *V;
+ double V_mu;
+ double V_sd;
+ double V_ll;
+ Uint Vx_N;
+ double *Vx;
+ double Vx_mu;
+ double Vx_sd;
+ double Vx_ll;
+ double entropydensitystep;
+ Uint entropydensitylen;
+ double *entropydensity;
+ char usenativequal;
+ char usegev;
+ /*standardization*/
+ char standardized;
+ double maxrp;
+ double minrp;
+ double maxrq;
+ double minrq;
+ double currq;
+ ecdf_t *ecdf;
+ double *s;
+ Uint s_N;
+ double cut;
+ char strand;
+ unsigned char minqual;
+ unsigned char maxqual;
+} matchfileSampleStats_t;
+
+typedef struct { /* one bin; matchfileindex_t keeps these in its bins member */
+ Uint start;
+ Uint end;
+ Uint matches;
+ off_t offset; /* NOTE(review): presumably a file offset where the bin begins - confirm */
+ off_t endoff; /* NOTE(review): presumably a file offset where the bin ends - confirm */
+} matchfileBin_t;
+
+typedef struct { /* index of a match file; matchfile_t refers to one via its index member */
+ struct access *gzindex;
+ Uint md5;
+ Uint noofchroms; /* NOTE(review): presumably the length of the per-chromosome arrays below - confirm */
+ Uint noofreads;
+ Uint exp; //power of two
+ char **chromnames;
+ Uint *matchstart;
+ Uint *matchend;
+ Uint *noofbins;
+ matchfileBin_t **bins;
+ Uint maxreadlen;
+ double *submatrix;
+ double mean_coverage;
+ double mean_qual;
+ Uint *P_ERR;
+ Uint *Q_ERR;
+ Uint *Q_N;
+ matchfileSampleStats_t *stats;
+ unsigned char minqual;
+ unsigned char maxqual;
+} matchfileindex_t;
+
+typedef struct { /* handle of one match file: format, gzip flag, file name and index */
+ unsigned char fmt; /* NOTE(review): presumably SAM or SEG - confirm */
+ unsigned char gzip;
+ char *filename;
+ matchfileindex_t *index;
+} matchfile_t;
+
+typedef struct { /* NOTE(review): presumably statistics over one matchfileFrame_t - confirm */
+ double *mean_err;
+ double *mean_sde;
+ double *mean_pos;
+ double *mean_mul;
+ double *mean_dis;
+ Uint rss;
+ Uint *char_frq;
+ Uint *dist_rss;
+ Uint dist_rss_N;
+ Uint dist_rss_ymax;
+ Uint dist_rss_ymax_1;
+ Uint dist_rss_xmax;
+ double mean_cov;
+ double prime5;
+ double prime3;
+ Uint *ntcnt;
+} matchfileFrameStats_t;
+
+typedef struct { /* NOTE(review): presumably statistics over one matchfileCross_t - confirm */
+ Uint len;
+ double var_ee;
+ double* var_s;
+ double* var_rt;
+ double* var_rq;
+ double* var_rr;
+ double* var_rv;
+ double* var_mm;
+ double* sub;
+ double pentropy;
+ double strandpenalty;
+ double mean_rt;
+ double mean_rq;
+ double mean_rr;
+ double mean_mm;
+
+} matchfileCrossStats_t;
+
+typedef struct{ /* NOTE(review): presumably one split site with its distant partner - confirm */
+ Uint noofsplits;
+ Uint distpos;
+ Uint distcidx;
+ matchfileCross_t *cs;
+ Uint pos;
+ Uint cidx;
+ Uint splicesite;
+ Uint acceptor;
+ Uint donor;
+ Uint transsplits;
+ char seen;
+} distsplitsites_t;
+
+typedef struct{ /* one mate bin; matebinmap_t keeps these in its bins member */
+ Uint binpos;
+ Uint binref;
+ Uint noofmates; /* NOTE(review): presumably the length of the three arrays below - confirm */
+ Uint *matebinpos;
+ Uint *matebinref;
+ Uint *matebincnt;
+} matebin_t;
+
+typedef struct{ /* collection of mate bins; splitmap_t embeds one as matemap */
+ Uint noofbins; /* NOTE(review): presumably the number of entries in bins - confirm */
+ matebin_t *bins;
+} matebinmap_t;
+
+typedef struct { /* mate map; reset by bl_matchfileInitMateMap, arrays released by bl_matchfileDestructMateMap */
+ Uint noofmates; /* set to 0 by bl_matchfileInitMateMap */
+ Uint *refpos;
+ Uint *refidx;
+ Uint *materefpos;
+ Uint *materefidx;
+ Uint *matecount;
+} matemap_t;
+
+typedef struct { /* NOTE(review): presumably the split sites collected over several match files - confirm */
+ annotationtrack_t *bed;
+ matchfile_t **files;
+ fasta_t *set;
+ Uint noofsplits; /* NOTE(review): presumably the length of pos, cidx and cs - confirm */
+ Uint *pos;
+ Uint *cidx;
+ matchfileCross_t *cs;
+ matebinmap_t matemap;
+} splitmap_t;
+
+typedef struct{ /* one splice site; splicemap_t keeps these in its map member */
+ char *chromname;
+ Uint cidx; /*the idx of the chromsome*/
+ Uint start; /*the start of the interval*/
+ Uint end; /*the end of the interval*/
+ Uint median; /*median split site = splice site*/
+
+ /* *
+ *
+ * information on splitsites and their splits
+ *
+ * */
+
+ Uint noofsplitsites; /*no of splitsites*/
+ Uint *splitsites; /*splitsites contributing to this splice site*/
+ Uint *noofsplits; /*no of splits for each splitsite*/
+
+ /*the respective cross sections*/
+ matchfileCross_t **cs;
+
+ Uint totalsplits; /*total no of splits*/
+ Uint dtypes; /*no of donor splits*/
+ Uint atypes; /*no of accep splits*/
+ Uint mstrands; /*no of minus splits*/
+ Uint pstrands; /*no of plus splits*/
+ Uint transsplits; /*no of trans splits*/
+
+ char type; /*consensus type (A or D)*/
+ char strand; /*consensus strand (- or +)*/
+
+ Uint noofrightsites; /* NOTE(review): presumably the length of the right* arrays below - confirm */
+ /*pointers into this very list ( )*/
+ Uint *rightsiteidx;
+ Uint *rightedgeweight;
+ uint16_t *rightedgeacceptor;
+ uint16_t *rightedgedonor;
+ uint16_t *righttranssplits;
+ uint16_t *leftmatesupport;
+ uint16_t *rightmatesupport;
+
+ Uint noofleftsites; /* NOTE(review): presumably the length of the left* arrays below - confirm */
+ /*pointers into this very list ( )*/
+ Uint *leftsiteidx;
+ Uint *leftedgeweight;
+ uint16_t *leftedgeacceptor;
+ uint16_t *leftedgedonor;
+ uint16_t *lefttranssplits;
+
+} splicesite_t;
+
+
+typedef struct { /* list of splice sites plus statistics members */
+
+ Uint noofsplicesites; /* NOTE(review): presumably the number of entries in map - confirm */
+ splicesite_t *map;
+
+ //stats
+ Uint interval;
+ Uint *histogram;
+ Uint **charhist;
+ Uint **charhistA;
+ Uint *chrcnt;
+
+} splicemap_t;
+
+typedef struct{ /* one match record; bl_matchfileGetMatchFileRec fills the members annotated below */
+ char *curname; /* from bl_matchfileGetQname */
+ char *curchrom; /* from bl_matchfileGetChrom */
+ Uint flag; /* from bl_matchfileGetFlag */
+ char *flgs;
+ Uint curstart; /* from bl_matchfileGetStartPos */
+ Uint curend; /* from bl_matchfileGetEndPos */
+ Uint curchromidx;
+ char *curseq; /* from bl_matchfileGetRead */
+ char *curqual; /* from bl_matchfileGetQual, only with MFREAD_QUAL */
+ char *diff; /* from bl_matchfileGetDiffString */
+ Uint curcnt; /* from bl_matchfileGetMatchCnt, only with MFREAD_MCNT */
+ char *curaln; /* from bl_matchfileGetAln */
+ Uint curalnlen;
+ int edist; /* from bl_matchfileGetEdist */
+ Uint bisulfite; /* from bl_matchfileGetBisulfite */
+ char strand; /* from bl_matchfileGetStrand */
+ char *rnext; /* from bl_matchfileGetRNext */
+ Uint pnext; /* from bl_matchfileGetPNext */
+ Uint acceptorpos; /* from bl_matchfileGetNextPos, only with MFREAD_SPLITS */
+ char *acceptorchr; /* from bl_matchfileGetNextChr, only with MFREAD_SPLITS */
+ Uint acceptorflg; /* from bl_matchfileGetNextFlag, only with MFREAD_SPLITS */
+ Uint donorpos; /* from bl_matchfileGetPrevPos, only with MFREAD_SPLITS */
+ char *donorchr; /* from bl_matchfileGetPrevChr, only with MFREAD_SPLITS */
+ Uint donorflg; /* from bl_matchfileGetPrevFlag, only with MFREAD_SPLITS */
+ Uint xstart; /* from bl_matchfileGetSplitStart, only with MFREAD_SPLITS */
+ Uint xend; /* from bl_matchfileGetSplitEnd, only with MFREAD_SPLITS */
+ Uint xno; /* from bl_matchfileGetSplitNumber, only with MFREAD_SPLITS */
+ Uint noofsplits; /* from bl_matchfileGetNoOfSplits, only with MFREAD_SPLITS */
+ Uint identity; /* from bl_matchfileGetMappingID */
+} matchfileRec_t;
+
+
+Uint
+bl_matchfileGetChromIndexNumberDB (fasta_t *set, matchfileindex_t *index,
+ char *chr);
+matchfileCross_t* bl_matchfileRead(void *space, matchfile_t *file,
+ char *chromname, Uint start, Uint end, Uint maxcover, fasta_t *set, unsigned char fields, matchfileCross_t*); /* bl_writeexpression indexes the result by offset from start */
+void bl_matchfileIndex(void *space, matchfile_t *file, fasta_t *set);
+void bl_matchfileDestructCross(void *space, matchfileCross_t *cs, Uint len); /* bl_writeexpression still frees cs itself after this call */
+Uint bl_matchfileGetChromIndexNumber(matchfileindex_t *index, char *chromname);
+void bl_matchfileDestructIndex(void *space, matchfileindex_t *index);
+matchfileCross_t* bl_matchfileCopyCross(void *space, matchfileCross_t *xs, matchfileCross_t *cs);
+void bl_matchfileGapAlign(matchfileDeletion_t *dels, Uint noofdels);
+void bl_matchfileDumpFileStats (void *space, matchfile_t *file);
+void bl_matchfileDumpCrossSection (matchfileCross_t *cs, Uint len);
+matchfileindex_t * bl_matchfileReadIndex (void *space, char *filename);
+void bl_matchfileWriteIndex(matchfileindex_t *idx, char *filename);
+Uint bl_compareIndices(matchfileindex_t *i1, matchfileindex_t *i2);
+matchfileindex_t* bl_matchfileInitIndex(void *space);
+Uint bl_matchfileAdjustBounds(void *space, matchfile_t *file, fasta_t *set,
+ Uint setid, Uint matid, Uint start, Uint width, Uint *newstart, Uint *newwidth);
+matchfileFrame_t * bl_matchfileGetFrame(void *space, matchfile_t *file,
+ char *chrname, Uint start, Uint width, fasta_t *set, Uint maxcover, matchfileCross_t*);
+Uint bl_writeexpression (void *space, char *filename, Uint filenamelen,
+ matchfile_t *file, fasta_t *set, Uint width, Uint maxcover); /* writes wiggle tracks for both strands to filename */
+Uint bl_matchfileIndexAddChrom(matchfileindex_t *index, char *chromname);
+void bl_matchfileMergeFrames (void *space, matchfileFrame_t* f, matchfileFrame_t *g);
+
+
+#endif
diff --git a/segemehl/libs/matchfilesfields.c b/segemehl/libs/matchfilesfields.c
new file mode 100644
index 0000000..f3aeba7
--- /dev/null
+++ b/segemehl/libs/matchfilesfields.c
@@ -0,0 +1,867 @@
+
+/*
+ * matchfilesfields.c
+ *
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 01.03.2011 17:37:06 CET
+ *
+ */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <ctype.h>
+#include "alignment.h"
+#include "debug.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "sort.h"
+#include "matfile.h"
+#include "bitVector.h"
+#include "info.h"
+#include "zran.h"
+#include "nw.h"
+#include "matchfiles.h"
+#include "evalmatchfiles.h"
+
+
+/*------------------------- bl_matchfileGetQname ------------------------------
+ *
+ * @brief  access the query name (field 0 of a SAM record) of a hit
+ * @param  fields  tokenized record; at least 4 tokens are required
+ * @param  fmt     record format; only SAM is handled
+ * @return pointer into fields (no copy), NULL if the record is too short
+ * @author Steve Hoffmann
+ *
+ */
+
+char*
+bl_matchfileGetQname(stringset_t *fields, unsigned char fmt) {
+  if (fields->noofstrings < 4) return NULL;
+
+  switch(fmt) {
+    case SAM:
+      return fields->strings[0].str;
+    default:
+      DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+  }
+  /* only reached if DBGEXIT returns: never fall off a non-void function */
+  return NULL;
+}
+
+
+
+/*--------------------------- bl_matchfileGetRNext ---------------------------
+ *
+ * @brief  get the mate chromosome (field 6 of a SAM record)
+ * @param  fields  tokenized record; field 6 must exist, i.e. >= 7 tokens
+ * @param  fmt     record format; only SAM is handled
+ * @return pointer into fields (no copy), NULL if the record is too short
+ * @author Steve Hoffmann
+ *
+ */
+
+char*
+bl_matchfileGetRNext ( stringset_t *fields, unsigned char fmt )
+{
+  /* strings[6] is read below, so 7 tokens are the minimum (was: < 6) */
+  if (fields->noofstrings < 7) return NULL;
+
+  switch(fmt) {
+    case SAM:
+      return fields->strings[6].str;
+    default:
+      DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+  }
+  return NULL;
+}
+
+
+/*--------------------------- bl_matchfileGetPNext ---------------------------
+ *
+ * @brief  get the mate position (field 7 of a SAM record, parsed with atoi)
+ * @param  fields  tokenized record; field 7 must exist, i.e. >= 8 tokens
+ * @param  fmt     record format; only SAM is handled
+ * @return mate position, 0 if the record is too short
+ * @author Steve Hoffmann
+ */
+
+Uint
+bl_matchfileGetPNext (stringset_t *fields, unsigned char fmt)
+{
+  /* strings[7] is read below, so 8 tokens are the minimum (was: < 7) */
+  if (fields->noofstrings < 8) return 0;
+  switch(fmt) {
+    case SAM:
+      return atoi(fields->strings[7].str);
+    default:
+      DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+  }
+  return 0;
+}
+
+/*------------------------- bl_matchfileGetFlag -------------------------------
+ *
+ * @brief  access the flag (field 1 of a SAM record, parsed with atoi) of a hit
+ * @param  fields  tokenized record; at least 4 tokens are required
+ * @param  fmt     record format; only SAM is handled
+ * @return numeric flag, 0 if the record is too short
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileGetFlag(stringset_t *fields, unsigned char fmt) {
+  if (fields->noofstrings < 4) return 0;
+
+  switch(fmt) {
+    case SAM:
+      return atoi(fields->strings[1].str);
+    default:
+      DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+  }
+  /* only reached if DBGEXIT returns: never fall off a non-void function */
+  return 0;
+}
+
+
+/*-------------------------- bl_matchfileGetFlagStr --------------------------
+ *
+ * @brief  hand out the flag column (field 1 of a SAM record) as raw text
+ * @param  fields  tokenized record; fewer than 4 tokens yield NULL
+ * @param  fmt     record format; only SAM is handled
+ * @return pointer into fields (no copy) or NULL
+ * @author Steve Hoffmann
+ *
+ */
+
+char*
+bl_matchfileGetFlagStr (stringset_t *fields, unsigned char fmt)
+{
+  char *flagstr = NULL;
+
+  if (fields->noofstrings >= 4) {
+    if (fmt == SAM) {
+      flagstr = fields->strings[1].str;
+    } else {
+      DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    }
+  }
+
+  return flagstr;
+}
+
+
+
+/*--------------------------- bl_matchfileIsHeader ---------------------------
+ *
+ * @brief  check if buffer starts with a SAM header tag (3 chars compared)
+ * @param  len  not used by this check; fmt: only SAM is handled
+ * @return 1 for a header line, 0 otherwise
+ * @author Steve Hoffmann
+ */
+
+unsigned char
+bl_matchfileIsHeader (char *buffer, Uint len, unsigned char fmt)
+{
+  static const char *const samhtags[] = {"@HD","@SQ","@RG", "@PG", "@CO"};
+  /* count derived from the table so the two cannot drift apart */
+  int i, n = (int) (sizeof samhtags / sizeof samhtags[0]);
+
+  switch(fmt) {
+    case SAM:
+      for(i=0; i < n; i++) {
+        if(!strncmp(samhtags[i], buffer, 3)) return 1;
+      }
+      break;
+    default:
+      DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+  }
+  return 0;
+}
+
+/*------------------------- bl_matchfileGetStartPos --------------------------
+ *
+ * @brief  access start position (field 3 of a SAM record, via atoi) of a hit
+ * @return start position, 0 if fields holds fewer than 4 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileGetStartPos(stringset_t *fields, unsigned char fmt) {
+  if(fields->noofstrings < 4) return 0;
+
+  switch(fmt) {
+    case SAM:
+      return atoi(fields->strings[3].str);
+    default:
+      DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+  }
+  return 0; /* only reached if DBGEXIT returns; avoids falling off the end */
+}
+
+/*-------------------------- bl_matchfileGetEndPos ---------------------------
+ *
+ * @brief  access end position of a hit: start (field 3) plus the alignment
+ *         length reported by bl_cigarGetAlignLen for field 5, minus one
+ * @return end position, 0 if fields holds fewer than 6 tokens
+ * @author Steve Hoffmann
+ */
+
+Uint
+bl_matchfileGetEndPos(stringset_t *fields, unsigned char fmt) {
+  if(fields->noofstrings < 6) return 0;
+  switch(fmt) {
+    case SAM:
+      return atoi(fields->strings[3].str) +
+        bl_cigarGetAlignLen(fields->strings[5].str) - 1;
+    default:
+      DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+  }
+  return 0; /* only reached if DBGEXIT returns; avoids falling off the end */
+}
+
+/*--------------------------- bl_matchfileGetRead ----------------------------
+ *
+ * @brief  access the read sequence (field 9 of a SAM record) of a hit
+ * @return pointer into fields (no copy), NULL if fewer than 10 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+char*
+bl_matchfileGetRead(stringset_t *fields, unsigned char fmt) {
+  if(fields->noofstrings < 10) return NULL;
+
+  switch(fmt) {
+    case SAM:
+      return fields->strings[9].str;
+    default:
+      DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+  }
+  return NULL; /* only reached if DBGEXIT returns */
+}
+
+/*--------------------------- bl_matchfileGetQual ----------------------------
+ * @brief  access the quality string (field 10 of a SAM record) of a hit;
+ *         a lone '*' is replaced in place by one 'b' per char of field 9
+ * @return pointer into fields (no copy), NULL if fewer than 11 tokens
+ * @author Steve Hoffmann
+ */
+
+char*
+bl_matchfileGetQual(stringset_t *fields, unsigned char fmt) {
+  Uint slen = 0;
+  /* strings[10] is read below, so 11 tokens are the minimum (was: < 10) */
+  if(fields->noofstrings < 11) return NULL;
+  switch(fmt) {
+    case SAM:
+      if (fields->strings[10].str[0] == '*' &&
+          fields->strings[10].len == 1) {
+        slen = strlen(fields->strings[9].str);
+        fields->strings[10].str =
+          ALLOCMEMORY(space, fields->strings[10].str, char, slen+1);
+        memset(fields->strings[10].str, 'b', slen);
+        fields->strings[10].str[slen] = 0;
+        /* NOTE(review): strings[10].len still holds 1 here - confirm */
+      }
+      return fields->strings[10].str;
+    default:
+      DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+  }
+  return NULL; /* only reached if DBGEXIT returns */
+}
+
+/*-------------------------- bl_matchfileGetStrand ---------------------------
+ *
+ * @brief  report the strand of a hit: '-' if bit 0x10 of the flag
+ *         (field 1 of a SAM record, via atoi) is set, '+' otherwise
+ * @return '+' or '-'; -1 if fewer than 6 tokens; 0 after an unknown format
+ * @author Steve Hoffmann
+ *
+ */
+
+char
+bl_matchfileGetStrand(stringset_t *fields, unsigned fmt) {
+  char strand = 0;
+
+  if(fields->noofstrings < 6) return -1;
+  if(fmt == SAM) {
+    strand = (atoi(fields->strings[1].str) & 0x10) ? '-' : '+';
+  } else {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+  }
+
+  return strand;
+}
+
+
+/*--------------------------- bl_matchfileGetChrom ---------------------------
+ *
+ * @brief  hand out the chromosome column (field 2 of a SAM record) of a hit
+ * @return pointer into fields (no copy); NULL if fewer than 4 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+char*
+bl_matchfileGetChrom(stringset_t *fields, unsigned fmt) {
+  char *chrom = NULL;
+
+  if(fields->noofstrings >= 4) {
+    if(fmt == SAM) {
+      chrom = fields->strings[2].str;
+    } else {
+      DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    }
+  }
+  return chrom;
+}
+
+/*------------------------- bl_matchfileGetMatchCnt --------------------------
+ *
+ * @brief  number of parallel hits, read from the optional tags (field 11
+ *         onwards) XN:i: or NH:i: of a SAM record; the last such tag wins
+ * @return tag value via atoi; 1 if no tag is found; 0 if fewer than 12 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileGetMatchCnt(stringset_t *fields, unsigned char fmt) {
+  Uint k, cnt = 1;
+
+  if(fmt != SAM) {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    return cnt;
+  }
+  if(fields->noofstrings < 12) return 0;
+
+  for(k = 11; k < fields->noofstrings; k++) {
+    char *tag = fields->strings[k].str;
+    if(fields->strings[k].len <= 5) continue;
+    if(!strncmp(tag, "XN:i:", 5) || !strncmp(tag, "NH:i:", 5)) {
+      cnt = atoi(tag + 5);
+    }
+  }
+
+  return cnt;
+}
+
+/*------------------------- bl_matchfileGetBisulfite -------------------------
+ *
+ * @brief  bisulfite mode of a hit, taken from the first optional tag (field
+ *         11 onwards of a SAM record) that starts with XB:Z: and is longer
+ *         than 8 chars; the text from offset 8 on selects the mode:
+ *           "CT" -> 1 = C-to-T conversion
+ *           "GA" -> 2 = G-to-A conversion
+ *         any other text ends in DBGEXIT
+ * @return 0 = no bisulfite conversion (no such tag, or fewer than 12
+ *         tokens), otherwise 1 or 2
+ *
+ * @author Christian Otto
+ */
+
+Uint
+bl_matchfileGetBisulfite(stringset_t *fields, unsigned char fmt) {
+  Uint k;
+  unsigned int mode = 0;
+  char *conv;
+
+  if(fmt != SAM) {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    return mode;
+  }
+
+  if(fields->noofstrings < 12) return 0;
+
+  /* advance to the first XB:Z: tag that is long enough, if any */
+  for(k = 11; k < fields->noofstrings; k++) {
+    if(fields->strings[k].len > 8 &&
+        strncmp(fields->strings[k].str, "XB:Z:", 5) == 0) break;
+  }
+  if(k >= fields->noofstrings) return mode;
+
+  /* only the text behind the 8-char prefix decides */
+  conv = &fields->strings[k].str[8];
+  if(!strcmp(conv, "CT")) {
+    mode = 1;
+  } else if(!strcmp(conv, "GA")) {
+    mode = 2;
+  } else {
+    DBGEXIT("Unknown bisulfite flag (%s). Exit forced!\n",
+        fields->strings[k].str);
+  }
+
+  return mode;
+}
+
+/*--------------------------- bl_matchfileGetEdist ---------------------------
+ *
+ * @brief  edit distance of a hit: atoi of the last NM:i: tag (field 11 onwards)
+ * @return tag value; 0 if there is no such tag or fewer than 12 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileGetEdist(stringset_t *fields, unsigned char fmt) {
+  Uint k, dist = 0;
+
+  if(fmt != SAM) {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    return dist;
+  }
+  if(fields->noofstrings < 12) return 0;
+
+  for(k = 11; k < fields->noofstrings; k++) {
+    char *tag = fields->strings[k].str;
+    if(fields->strings[k].len > 5 && !strncmp(tag, "NM:i:", 5)) {
+      dist = atoi(tag + 5);
+    }
+  }
+
+  return dist;
+}
+
+/*--------------------------- bl_matchfileGetMappingID ---------------------------
+ *
+ * @brief  mapping ID of a hit: atoi of the last XI:i: tag (field 11 onwards)
+ * @return tag value; 0 if there is no such tag or fewer than 12 tokens
+ * @author Stephan Bernhart
+ *
+ */
+
+Uint
+bl_matchfileGetMappingID(stringset_t *fields, unsigned char fmt) {
+  Uint k, mapid = 0;
+
+  if(fmt != SAM) {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    return mapid;
+  }
+  if(fields->noofstrings < 12) return 0;
+
+  for(k = 11; k < fields->noofstrings; k++) {
+    char *tag = fields->strings[k].str;
+    if(fields->strings[k].len > 5 && !strncmp(tag, "XI:i:", 5)) {
+      mapid = atoi(tag + 5);
+    }
+  }
+
+  return mapid;
+}
+
+/*---------------------------- bl_matchfileGetAln ----------------------------
+ *
+ * @brief  alignment string of a hit, i.e. whatever bl_cigarGetAlignString
+ *         returns for field 5 of a SAM record
+ * @return that result; NULL if the record has fewer than 12 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+char*
+bl_matchfileGetAln(stringset_t *fields, unsigned char fmt) {
+
+  if(fmt != SAM) {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    return NULL;
+  }
+
+  /* NOTE(review): 12 tokens are demanded although only field 5 is read */
+  return (fields->noofstrings < 12)
+    ? NULL : bl_cigarGetAlignString(fields->strings[5].str);
+}
+
+/*------------------------ bl_matchfileGetDiffString ------------------------
+ *
+ * @brief  difference string of a hit: the first MD:Z: tag (field 11 onwards
+ *         of a SAM record) is handed to bl_mdGetDiffString
+ * @return result of bl_mdGetDiffString for the text behind "MD:Z:";
+ *         NULL if there is no such tag or fewer than 12 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+char*
+bl_matchfileGetDiffString(stringset_t *fields, unsigned char fmt) {
+  Uint k;
+
+  if(fmt != SAM) {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    return NULL;
+  }
+  if(fields->noofstrings < 12) return NULL;
+
+  for(k = 11; k < fields->noofstrings; k++) {
+    char *tag = fields->strings[k].str;
+    if(fields->strings[k].len > 5 && !strncmp(tag, "MD:Z:", 5)) {
+      return bl_mdGetDiffString(tag + 5);
+    }
+  }
+
+  return NULL;
+}
+
+/*------------------------- bl_matchfileGetPrevPos --------------------------
+ *
+ * @brief  value of the optional XU:i: tag (field 11 onwards of a SAM record),
+ *         taken via atoi; the last such tag wins;
+ *         bl_matchfileGetMatchFileRec stores the result as donorpos
+ * @return tag value; 1 if there is no such tag; 0 if fewer than 12 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileGetPrevPos(stringset_t *fields, unsigned char fmt) {
+  Uint k, pos = 1;
+
+  if(fmt != SAM) {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    return pos;
+  }
+  if(fields->noofstrings < 12) return 0;
+
+  for(k = 11; k < fields->noofstrings; k++) {
+    char *tag = fields->strings[k].str;
+    if(fields->strings[k].len > 5 && !strncmp(tag, "XU:i:", 5)) {
+      pos = atoi(tag + 5);
+    }
+  }
+
+  return pos;
+}
+
+/*------------------------- bl_matchfileGetNextPos --------------------------
+ *
+ * @brief  value of the optional XV:i: tag (field 11 onwards), last one wins;
+ *         bl_matchfileGetMatchFileRec stores the result as acceptorpos
+ * @return tag value; 1 if there is no such tag; 0 if fewer than 12 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileGetNextPos(stringset_t *fields, unsigned char fmt) {
+  Uint k, pos = 1;
+
+  if(fmt != SAM) {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    return pos;
+  }
+  if(fields->noofstrings < 12) return 0;
+
+  for(k = 11; k < fields->noofstrings; k++) {
+    char *tag = fields->strings[k].str;
+    if(fields->strings[k].len > 5 && !strncmp(tag, "XV:i:", 5)) {
+      pos = atoi(tag + 5);
+    }
+  }
+
+  return pos;
+}
+
+/*------------------------- bl_matchfileGetPrevChr --------------------------
+ *
+ * @brief  text behind the optional XP:Z: tag (field 11 onwards of a SAM
+ *         record); the last such tag wins; bl_matchfileGetMatchFileRec
+ *         stores the result as donorchr
+ * @param  fields  tokenized record
+ * @param  fmt     record format; only SAM is handled
+ * @return pointer into fields, no copy is made;
+ *         NULL if there is no such tag or fewer than 12 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+char*
+bl_matchfileGetPrevChr(stringset_t *fields, unsigned char fmt) {
+  Uint k;
+  char *name = NULL;
+
+  if(fmt != SAM) {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    return name;
+  }
+  if(fields->noofstrings < 12) return NULL;
+
+  for(k = 11; k < fields->noofstrings; k++) {
+    char *tag = fields->strings[k].str;
+    if(fields->strings[k].len > 5 && !strncmp(tag, "XP:Z:", 5)) {
+      name = tag + 5;
+    }
+  }
+
+  return name;
+}
+
+/*------------------------- bl_matchfileGetNextChr --------------------------
+ *
+ * @brief  text behind the optional XC:Z: tag (field 11 onwards of a SAM
+ *         record); the last such tag wins; bl_matchfileGetMatchFileRec
+ *         stores the result as acceptorchr
+ * @param  fmt     record format; only SAM is handled
+ * @return pointer into fields, no copy is made;
+ *         NULL if there is no such tag or fewer than 12 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+char*
+bl_matchfileGetNextChr(stringset_t *fields, unsigned char fmt) {
+  Uint k;
+  char *name = NULL;
+
+  if(fmt != SAM) {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    return name;
+  }
+  if(fields->noofstrings < 12) return NULL;
+
+  for(k = 11; k < fields->noofstrings; k++) {
+    char *tag = fields->strings[k].str;
+    if(fields->strings[k].len > 5 && !strncmp(tag, "XC:Z:", 5)) {
+      name = tag + 5;
+    }
+  }
+
+  return name;
+}
+
+/*------------------------ bl_matchfileGetSplitStart ------------------------
+ *
+ * @brief  start of the split: atoi of the last XX:i: tag (field 11 onwards);
+ *         bl_matchfileGetMatchFileRec stores the result as xstart
+ * @return tag value; 0 if there is no such tag or fewer than 12 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileGetSplitStart(stringset_t *fields, unsigned char fmt) {
+  Uint k, start = 0;
+
+  if(fmt != SAM) {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    return start;
+  }
+  if(fields->noofstrings < 12) return 0;
+
+  for(k = 11; k < fields->noofstrings; k++) {
+    char *tag = fields->strings[k].str;
+    if(fields->strings[k].len > 5 && !strncmp(tag, "XX:i:", 5)) {
+      start = atoi(tag + 5);
+    }
+  }
+
+  return start;
+}
+
+/*------------------------ bl_matchfileGetSplitEnd --------------------------
+ *
+ * @brief  end of the split: atoi of the last XY:i: tag (field 11 onwards);
+ *         bl_matchfileGetMatchFileRec stores the result as xend
+ * @return tag value; 0 if there is no such tag or fewer than 12 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileGetSplitEnd(stringset_t *fields, unsigned char fmt) {
+  Uint k, end = 0;
+
+  if(fmt != SAM) {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    return end;
+  }
+  if(fields->noofstrings < 12) return 0;
+
+  for(k = 11; k < fields->noofstrings; k++) {
+    char *tag = fields->strings[k].str;
+    if(fields->strings[k].len > 5 && !strncmp(tag, "XY:i:", 5)) {
+      end = atoi(tag + 5);
+    }
+  }
+
+  return end;
+}
+
+/*----------------------- bl_matchfileGetNoOfSplits ------------------------
+ *
+ * @brief  number of splits: atoi of the last XL:i: tag (field 11 onwards);
+ *         bl_matchfileGetMatchFileRec stores the result as noofsplits
+ * @return tag value; 0 if there is no such tag or fewer than 12 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileGetNoOfSplits(stringset_t *fields, unsigned char fmt) {
+  Uint k, splits = 0;
+
+  if(fmt != SAM) {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    return splits;
+  }
+  if(fields->noofstrings < 12) return 0;
+
+  for(k = 11; k < fields->noofstrings; k++) {
+    char *tag = fields->strings[k].str;
+    if(fields->strings[k].len > 5 && !strncmp(tag, "XL:i:", 5)) {
+      splits = atoi(tag + 5);
+    }
+  }
+
+  return splits;
+}
+
+
+/*----------------------- bl_matchfileGetSplitNumber ------------------------
+ *
+ * @brief  number of this split: atoi of the last XQ:i: tag (field 11 onwards);
+ *         bl_matchfileGetMatchFileRec stores the result as xno
+ * @return tag value; 0 if there is no such tag or fewer than 12 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileGetSplitNumber(stringset_t *fields, unsigned char fmt) {
+  Uint k, number = 0;
+
+  if(fmt != SAM) {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    return number;
+  }
+  if(fields->noofstrings < 12) return 0;
+
+  for(k = 11; k < fields->noofstrings; k++) {
+    char *tag = fields->strings[k].str;
+    if(fields->strings[k].len > 5 && !strncmp(tag, "XQ:i:", 5)) {
+      number = atoi(tag + 5);
+    }
+  }
+
+  return number;
+}
+
+/*----------------------- bl_matchfileGetPrevFlag ------------------------
+ *
+ * @brief  flags of the prev split: atoi of the last XS:i: tag (field 11
+ *         onwards); bl_matchfileGetMatchFileRec stores it as donorflg
+ * @return tag value; 0 if there is no such tag or fewer than 12 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileGetPrevFlag(stringset_t *fields, unsigned char fmt) {
+  Uint k, flags = 0;
+
+  if(fmt != SAM) {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    return flags;
+  }
+  if(fields->noofstrings < 12) return 0;
+
+  for(k = 11; k < fields->noofstrings; k++) {
+    char *tag = fields->strings[k].str;
+    if(fields->strings[k].len > 5 && !strncmp(tag, "XS:i:", 5)) {
+      flags = atoi(tag + 5);
+    }
+  }
+
+  return flags;
+}
+
+/*----------------------- bl_matchfileGetNextFlag ------------------------
+ *
+ * @brief  flags of the next split: atoi of the last XT:i: tag (field 11
+ *         onwards); bl_matchfileGetMatchFileRec stores it as acceptorflg
+ * @return tag value; 0 if there is no such tag or fewer than 12 tokens
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileGetNextFlag(stringset_t *fields, unsigned char fmt) {
+  Uint k, flags = 0;
+
+  if(fmt != SAM) {
+    DBGEXIT("Unknown format (%d). Exit forced!\n", fmt);
+    return flags;
+  }
+  if(fields->noofstrings < 12) return 0;
+
+  for(k = 11; k < fields->noofstrings; k++) {
+    char *tag = fields->strings[k].str;
+    if(fields->strings[k].len > 5 && !strncmp(tag, "XT:i:", 5)) {
+      flags = atoi(tag + 5);
+    }
+  }
+
+  return flags;
+}
+
+/*----------------------- bl_matchfileGetMatchFileRec -----------------------
+ *
+ * @brief  fill a matchfile record from one tokenized line
+ * @param  fields  bit mask: MFREAD_QUAL, MFREAD_MCNT and MFREAD_SPLITS enable
+ *                 the respective members, which are left untouched otherwise
+ * @return rec
+ * @author Steve Hoffmann
+ *
+ */
+
+matchfileRec_t *
+bl_matchfileGetMatchFileRec(matchfileRec_t *rec, Uint fields,
+    stringset_t *token, Uint fmt)
+{
+  rec->curname = bl_matchfileGetQname(token, fmt);
+  rec->flag = bl_matchfileGetFlag(token, fmt);
+  rec->curchrom = bl_matchfileGetChrom(token, fmt);
+  rec->curstart = bl_matchfileGetStartPos(token, fmt);
+  rec->curend = bl_matchfileGetEndPos(token, fmt);
+  rec->curseq = bl_matchfileGetRead(token, fmt);
+  rec->diff = bl_matchfileGetDiffString(token, fmt);
+
+  if(fields & MFREAD_QUAL) {
+    rec->curqual = bl_matchfileGetQual(token, fmt);
+  }
+  if(fields & MFREAD_MCNT) {
+    rec->curcnt = bl_matchfileGetMatchCnt(token, fmt);
+  }
+
+  /* the bisulfite mode is extracted regardless of the bit mask */
+  rec->bisulfite = bl_matchfileGetBisulfite(token, fmt);
+  rec->curaln = bl_matchfileGetAln(token, fmt);
+  rec->edist = bl_matchfileGetEdist(token, fmt);
+  rec->strand = bl_matchfileGetStrand(token, fmt);
+  rec->rnext = bl_matchfileGetRNext(token, fmt);
+  rec->pnext = bl_matchfileGetPNext(token, fmt);
+  rec->identity = bl_matchfileGetMappingID(token, fmt);
+
+  if(!(fields & MFREAD_SPLITS)) return rec;
+
+  rec->noofsplits = bl_matchfileGetNoOfSplits(token, fmt);
+  rec->acceptorpos = bl_matchfileGetNextPos(token, fmt);
+  rec->acceptorchr = bl_matchfileGetNextChr(token, fmt);
+  rec->acceptorflg = bl_matchfileGetNextFlag(token, fmt);
+  rec->donorpos = bl_matchfileGetPrevPos(token, fmt);
+  rec->donorchr = bl_matchfileGetPrevChr(token, fmt);
+  rec->donorflg = bl_matchfileGetPrevFlag(token, fmt);
+  rec->xstart = bl_matchfileGetSplitStart(token, fmt);
+  rec->xend = bl_matchfileGetSplitEnd(token, fmt);
+  rec->xno = bl_matchfileGetSplitNumber(token, fmt);
+
+  return rec;
+}
+
diff --git a/segemehl/libs/matchfilesfields.h b/segemehl/libs/matchfilesfields.h
new file mode 100644
index 0000000..6088605
--- /dev/null
+++ b/segemehl/libs/matchfilesfields.h
@@ -0,0 +1,44 @@
+#ifndef MATCHFILESFIELDS_H
+#define MATCHFILESFIELDS_H
+
+/*
+ *
+ * matchfilesfields.h
+ *
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 01.03.2011 17:40:56 CET
+ *
+ */
+#include "matchfiles.h"
+
+Uint bl_matchfileGetPNext (stringset_t *fields, unsigned char fmt); /* reads field 7 of a SAM record */
+char* bl_matchfileGetRNext ( stringset_t *fields, unsigned char fmt); /* reads field 6 of a SAM record */
+char* bl_matchfileGetQname(stringset_t *fields, unsigned char fmt); /* reads field 0 of a SAM record */
+Uint bl_matchfileGetFlag(stringset_t *fields, unsigned char fmt); /* reads field 1 of a SAM record */
+unsigned char bl_matchfileIsHeader(char *buffer, Uint len, unsigned char fmt);
+Uint bl_matchfileGetStartPos(stringset_t *fields, unsigned char fmt); /* reads field 3 of a SAM record */
+Uint bl_matchfileGetEndPos(stringset_t *fields, unsigned char fmt); /* reads fields 3 and 5 of a SAM record */
+char* bl_matchfileGetRead(stringset_t *fields, unsigned char fmt); /* reads field 9 of a SAM record */
+char* bl_matchfileGetQual(stringset_t *fields, unsigned char fmt); /* reads field 10; may rewrite it in place */
+char bl_matchfileGetStrand(stringset_t *fields, unsigned fmt);
+char* bl_matchfileGetChrom(stringset_t *fields, unsigned fmt); /* reads field 2 of a SAM record */
+Uint bl_matchfileGetMatchCnt(stringset_t *fields, unsigned char fmt); /* XN:i: or NH:i: tag */
+Uint bl_matchfileGetEdist(stringset_t *fields, unsigned char fmt); /* NM:i: tag */
+Uint bl_matchfileGetBisulfite(stringset_t *fields, unsigned char fmt); /* XB:Z: tag */
+char* bl_matchfileGetAln(stringset_t *fields, unsigned char fmt); /* reads field 5 of a SAM record */
+char* bl_matchfileGetDiffString(stringset_t *fields, unsigned char fmt); /* MD:Z: tag */
+Uint bl_matchfileGetPrevPos(stringset_t *fields, unsigned char fmt); /* XU:i: tag */
+Uint bl_matchfileGetNextPos(stringset_t *fields, unsigned char fmt); /* XV:i: tag */
+char* bl_matchfileGetNextChr(stringset_t *fields, unsigned char fmt); /* XC:Z: tag */
+char* bl_matchfileGetPrevChr(stringset_t *fields, unsigned char fmt); /* XP:Z: tag */
+Uint bl_matchfileGetSplitStart(stringset_t *fields, unsigned char fmt); /* XX:i: tag */
+Uint bl_matchfileGetSplitEnd(stringset_t *fields, unsigned char fmt); /* XY:i: tag */
+Uint bl_matchfileGetSplitNumber(stringset_t *fields, unsigned char fmt); /* XQ:i: tag */
+Uint bl_matchfileGetPrevFlag(stringset_t *fields, unsigned char fmt); /* XS:i: tag */
+Uint bl_matchfileGetNextFlag(stringset_t *fields, unsigned char fmt); /* XT:i: tag */
+Uint bl_matchfileGetMappingID(stringset_t *fields, unsigned char fmt); /* XI:i: tag */
+matchfileRec_t * bl_matchfileGetMatchFileRec(matchfileRec_t *rec, Uint fields, stringset_t *token, Uint fmt); /* fields is an MFREAD_* bit mask */
+
+#endif
diff --git a/segemehl/libs/matepairs.c b/segemehl/libs/matepairs.c
new file mode 100644
index 0000000..8198e9c
--- /dev/null
+++ b/segemehl/libs/matepairs.c
@@ -0,0 +1,371 @@
+
+/*
+ * matepairs.c
+ *
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 11/09/2011 05:16:34 PM CET
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <ctype.h>
+#include "alignment.h"
+#include "debug.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "sort.h"
+#include "matfile.h"
+#include "bitVector.h"
+#include "info.h"
+#include "zran.h"
+#include "nw.h"
+#include "matchfiles.h"
+#include "evalmatchfiles.h"
+#include "manout.h"
+#include "matchfilesfields.h"
+#include "matepairs.h"
+
+/*------------------------- bl_matchfileInitMateMap --------------------------
+ *
+ * @brief put a mate map into its empty state: zero mates, no arrays
+ * @author Steve Hoffmann
+ * @param space not used by this function
+ * @param map   mate map to initialise
+ */
+void
+bl_matchfileInitMateMap (void *space, matemap_t *map)
+{
+  /* none of the per-mate arrays is allocated yet */
+  map->matecount = NULL;
+  map->materefidx = NULL;
+  map->materefpos = NULL;
+  map->refidx = NULL;
+  map->refpos = NULL;
+
+  map->noofmates = 0;
+}
+
+/*----------------------- bl_matchfileDestructMateMap ------------------------
+ *
+ * @brief release the arrays held by a mate map
+ * @author Steve Hoffmann
+ * @param space memory space passed on to FREEMEMORY
+ * @param map   mate map; the struct itself and noofmates are left alone
+ */
+void
+bl_matchfileDestructMateMap (void *space, matemap_t *map)
+{
+  /* arrays that were never allocated are skipped */
+  if(map->matecount != NULL) { FREEMEMORY(space, map->matecount); }
+  if(map->materefidx != NULL) { FREEMEMORY(space, map->materefidx); }
+  if(map->materefpos != NULL) { FREEMEMORY(space, map->materefpos); }
+  if(map->refidx != NULL) { FREEMEMORY(space, map->refidx); }
+  if(map->refpos != NULL) { FREEMEMORY(space, map->refpos); }
+
+  return ;
+}
+
+/*--------------------------- bl_matchfileMateScan ---------------------------
+ *
+ * @brief get the mate locations for a map interval
+ *        (stub: no scan is implemented, the result is always empty)
+ * @author Steve Hoffmann
+ * @param space not used
+ * @param cs    cross section, not used
+ * @return always NULL
+ */
+matelink_t *
+bl_matchfileMateScan (void *space, matchfileCross_t *cs)
+{
+  /* placeholder body */
+
+  return NULL;
+}
+
+/*--------------------------- bl_matchfileAddMate ----------------------------
+ *
+ * @brief add a mate to the cross section: bump the counter of the link
+ *        to (refidx, refpos) or append a new link with count one
+ * @author Steve Hoffmann
+ * @param space  memory space handed to ALLOCMEMORY
+ * @param cs     cross section owning the matelinks array
+ * @param refidx reference index of the mate
+ * @param refpos reference position of the mate
+ */
+void
+bl_matchfileAddMate (void *space, matchfileCross_t *cs, Uint refidx,
+    Uint refpos)
+{
+  Uint idx, last;
+
+  /* linear search for an existing link to the same target */
+  for(idx=0; idx < cs->noofmatelinks; idx++) {
+    if(cs->matelinks[idx].refidx == refidx &&
+        cs->matelinks[idx].refpos == refpos) {
+      cs->matelinks[idx].noofmates += 1;
+      return ;
+    }
+  }
+
+  /* not seen before: grow the array by one slot and fill it */
+  last = cs->noofmatelinks;
+  cs->matelinks = ALLOCMEMORY(space, cs->matelinks, matelink_t, last+1);
+  cs->matelinks[last].refpos = refpos;
+  cs->matelinks[last].refidx = refidx;
+  cs->matelinks[last].noofmates = 1;
+  cs->noofmatelinks = last + 1;
+}
+
+
+/*----------------------- bl_matchfileAddDistMateToMap -----------------------
+ *
+ * @brief copy the distant mate links of a cross section into the mate map;
+ *        a link counts as distant when it targets another reference index
+ *        or is more than MATEPAIRDIST positions away from pos
+ * @author Steve Hoffmann
+ * @param cidx reference index of the cross section
+ * @param pos  position of the cross section
+ * @param ref  not used
+ */
+void
+bl_matchfileAddDistMateToMap (void * space, matemap_t *map,
+    matchfileCross_t *cs, Uint cidx, Uint pos, char ref)
+{
+  Uint j, n = map->noofmates;
+
+  for(j=0; j < cs->noofmatelinks; j++) {
+    /* nearby mates on the same reference are not recorded */
+    if(cs->matelinks[j].refidx == cidx &&
+        llabs(((Lint)cs->matelinks[j].refpos-(Lint)pos)) <= MATEPAIRDIST) {
+      continue;
+    }
+
+    map->refpos = ALLOCMEMORY(space, map->refpos, Uint, n+1);
+    map->refidx = ALLOCMEMORY(space, map->refidx, Uint, n+1);
+    map->materefpos = ALLOCMEMORY(space, map->materefpos, Uint, n+1);
+    map->materefidx = ALLOCMEMORY(space, map->materefidx, Uint, n+1);
+    map->matecount = ALLOCMEMORY(space, map->matecount, Uint, n+1);
+    map->refpos[n] = pos;
+    map->refidx[n] = cidx;
+    map->materefpos[n] = cs->matelinks[j].refpos;
+    map->materefidx[n] = cs->matelinks[j].refidx;
+    map->matecount[n] = cs->matelinks[j].noofmates;
+    n++;
+  }
+  map->noofmates = n;
+}
+
+
+/*------------------------- bl_matchfileInitMateBin --------------------------
+ *
+ * @brief set up an empty mate bin located at (ref, pos)
+ * @author Steve Hoffmann
+ * @param space not used
+ * @param bin   the bin to initialise
+ * @param pos   stored as bin position; ref is stored as bin reference
+ */
+void
+bl_matchfileInitMateBin (void *space, matebin_t *bin, Uint pos, Uint ref)
+{
+  /* no mate bins linked so far */
+  bin->matebincnt = NULL;
+  bin->matebinref = NULL;
+  bin->matebinpos = NULL;
+  bin->noofmates = 0;
+
+  /* where the bin itself is located */
+  bin->binref = ref;
+  bin->binpos = pos;
+}
+
+
+/*------------------------ bl_matchfileInitMateBinMap ------------------------
+ *
+ * @brief start with a mate bin map that holds no bins
+ * @author Steve Hoffmann
+ * @param space not used
+ * @param map   the bin map to initialise
+ */
+
+void
+bl_matchfileInitMateBinMap (void *space, matebinmap_t *map)
+{
+  map->bins = NULL;
+  map->noofbins = 0;
+
+  return ;
+}
+
+
+/*--------------------------- bl_matchfileScanBins ---------------------------
+ *
+ * @brief linear scan for the first bin on reference ref whose start lies
+ *        in the search window derived from pos
+ * @author Steve Hoffmann
+ * @param space not used
+ * @return index of the bin, or (Uint)-1 if there is none
+ *
+ * NOTE(review): the early exit below only makes sense if the bins are
+ * ordered by reference and position - confirm against the code filling
+ * the map.
+ */
+Uint
+bl_matchfileScanBins (void *space, matebinmap_t *map, Uint pos, Uint ref)
+{
+  Uint lo, hi, idx;
+
+  /* lo: start of the bin containing pos */
+  lo = pos - (pos % MATEBINSIZE);
+  /* hi: lo itself once lo exceeds one bin width, otherwise the next bin */
+  hi = (lo > MATEBINSIZE) ? lo : lo + MATEBINSIZE;
+
+  for(idx=0; idx < map->noofbins; idx++) {
+    if(map->bins[idx].binref == ref) {
+      if(map->bins[idx].binpos >= lo &&
+          map->bins[idx].binpos <= hi) {
+        return idx;
+      }
+      /* already past the window on this reference */
+      if(map->bins[idx].binpos > hi) {
+        break;
+      }
+    } else if(map->bins[idx].binref > ref) {
+      /* already past the reference */
+      break;
+    }
+  }
+
+  /* nothing found */
+  return (Uint) -1;
+}
+
+
+/*------------------------ bl_matchfileSearchMateLink ------------------------
+ *
+ * @brief count the mates linking the bin around frompos to the bin of topos
+ * @author Steve Hoffmann
+ * @return sum of the matching counters; 0 if no bin is found for frompos
+ *
+ * The scan begins MATEPAIRSEARCHBINMARGIN bins before the bin found for
+ * frompos (clamped to bin 0) and visits at most MATEPAIRSEARCHBINMARGIN+1 bins.
+ */
+Uint
+bl_matchfileSearchMateLink (void *space, matebinmap_t *map,
+    Uint frompos, Uint fromref, Uint topos, Uint toref)
+{
+  Uint first, b, m, target, total = 0;
+
+  first = bl_matchfileScanBins(space, map, frompos, fromref);
+  if(first == -1) {
+    return 0;
+  }
+
+  /* step back by the margin without running below the first bin */
+  first = (first > MATEPAIRSEARCHBINMARGIN) ?
+    first - MATEPAIRSEARCHBINMARGIN : 0;
+
+  target = topos - (topos % MATEBINSIZE);
+
+  for(b=first; b < map->noofbins && b < first+MATEPAIRSEARCHBINMARGIN+1; b++) {
+    for(m=0; m < map->bins[b].noofmates; m++) {
+      if(map->bins[b].matebinref[m] == toref &&
+          map->bins[b].matebinpos[m] == target) {
+        total += map->bins[b].matebincnt[m];
+      }
+    }
+  }
+  return total;
+}
+
+
+/*----------------------- bl_matchfileAddToMateBinMap ------------------------
+ *
+ * @brief register the mate links of a cross section in the mate bin map
+ * @author Steve Hoffmann
+ * @param cidx reference index of the cross section
+ * @param pos  position of the cross section
+ * @param ref  not used
+ *
+ * Positions are rounded down to multiples of MATEBINSIZE. Only the most
+ * recently added bin is compared with (cidx, pos); if it differs, a new
+ * bin is appended. Each mate link adds one to the counter of its target
+ * bin, whatever the noofmates value of the link is.
+ */
+void
+bl_matchfileAddToMateBinMap (void *space, matebinmap_t *map,
+    matchfileCross_t *cs, Uint cidx, Uint pos, char ref)
+{
+  Uint n, l, m, binpos, lastpos, lastref, mpos, mref;
+  matebin_t *bin;
+
+  n = map->noofbins;
+  binpos = pos - (pos % MATEBINSIZE);
+
+  lastpos = (n > 0) ? map->bins[n-1].binpos : 0;
+  lastref = (n > 0) ? map->bins[n-1].binref : 0;
+
+  if(n == 0 || lastpos != binpos || lastref != cidx) {
+    map->bins = ALLOCMEMORY(space, map->bins, matebin_t, n+1);
+    bl_matchfileInitMateBin(space, &map->bins[n], binpos, cidx);
+    map->noofbins++;
+    n++;
+  }
+
+  /* all links go into the last bin */
+  bin = &map->bins[n-1];
+
+  for(l=0; l < cs->noofmatelinks; l++) {
+    mpos = cs->matelinks[l].refpos - (cs->matelinks[l].refpos % MATEBINSIZE);
+    mref = cs->matelinks[l].refidx;
+
+    /* find the entry for the target bin, if there is one */
+    m = 0;
+    while(m < bin->noofmates &&
+        (bin->matebinpos[m] != mpos || bin->matebinref[m] != mref)) {
+      m++;
+    }
+
+    if(m == bin->noofmates) {
+      /* unknown target: append a zeroed entry */
+      bin->matebinpos = ALLOCMEMORY(space, bin->matebinpos, Uint, m+1);
+      bin->matebinref = ALLOCMEMORY(space, bin->matebinref, Uint, m+1);
+      bin->matebincnt = ALLOCMEMORY(space, bin->matebincnt, Uint, m+1);
+      bin->matebinpos[m] = mpos;
+      bin->matebinref[m] = mref;
+      bin->matebincnt[m] = 0;
+      bin->noofmates++;
+    }
+    bin->matebincnt[m]++;
+  }
+}
+
+
+/*---------------------- bl_matchfileDestructMateBinMap ----------------------
+ *
+ * @brief free all bins of a mate bin map together with their arrays
+ * @author Steve Hoffmann
+ * @param map bin map; noofbins is left unchanged
+ */
+void
+bl_matchfileDestructMateBinMap (void *space, matebinmap_t *map)
+{
+  Uint b;
+
+  for(b=0; b < map->noofbins; b++) {
+    /* matebinpos doubles as the marker for allocated arrays */
+    if(!map->bins[b].matebinpos) continue;
+    FREEMEMORY(space, map->bins[b].matebinpos);
+    FREEMEMORY(space, map->bins[b].matebinref);
+    FREEMEMORY(space, map->bins[b].matebincnt);
+  }
+
+  if(map->bins) { FREEMEMORY(space, map->bins); }
+  return ;
+}
+
diff --git a/segemehl/libs/matepairs.h b/segemehl/libs/matepairs.h
new file mode 100644
index 0000000..51a3ed6
--- /dev/null
+++ b/segemehl/libs/matepairs.h
@@ -0,0 +1,55 @@
+#ifndef MATEPAIRS_H
+#define MATEPAIRS_H
+
+/*
+ *
+ * matepairs.h
+ *
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 11/09/2011 05:07:25 PM CET
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <ctype.h>
+#include "alignment.h"
+#include "debug.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "sort.h"
+#include "matfile.h"
+#include "bitVector.h"
+#include "info.h"
+#include "zran.h"
+#include "nw.h"
+#include "matchfiles.h"
+#include "evalmatchfiles.h"
+#include "manout.h"
+#include "matchfilesfields.h"
+#include "matchfiles.h"
+#define MATEPAIRDIST 500 /* same-reference mates farther apart than this are recorded by bl_matchfileAddDistMateToMap */
+#define MATEBINSIZE 200 /* bin width: positions are rounded down to multiples of this */
+#define MATEPAIRSEARCHBINMARGIN 1 /* bins stepped back (and extra bins visited) by bl_matchfileSearchMateLink */
+
+Uint bl_matchfileSearchMateLink (void *space, matebinmap_t *map,
+ Uint frompos, Uint fromref, Uint topos, Uint toref);
+matelink_t * bl_matchfileMateScan (void *space, matchfileCross_t *cs);
+void bl_matchfileAddMate (void *space, matchfileCross_t *cs, Uint refidx, Uint refpos);
+void bl_matchfileInitMateMap (void *space, matemap_t *map);
+void bl_matchfileDestructMateMap (void *space, matemap_t *map);
+void bl_matchfileAddDistMateToMap (void * space, matemap_t *map, matchfileCross_t *cs,
+ Uint cidx, Uint pos, char ref);
+void bl_matchfileInitMateBinMap (void *space, matebinmap_t *map);
+void bl_matchfileDestructMateBinMap (void *space, matebinmap_t *map);
+void bl_matchfileAddToMateBinMap (void * space, matebinmap_t *map,
+ matchfileCross_t *cs, Uint cidx, Uint pos, char ref);
+
+
+
+#endif
diff --git a/segemehl/libs/matfile.c b/segemehl/libs/matfile.c
new file mode 100644
index 0000000..e7d2401
--- /dev/null
+++ b/segemehl/libs/matfile.c
@@ -0,0 +1,925 @@
+
+/*
+ * matfile.c
+ * match files
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 08/25/2010 03:42:37 PM CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include "alignment.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "bitVector.h"
+#include "matchfiles.h"
+#include "browsematchfiles.h"
+#include "matfile.h"
+#include "iupac.h"
+#include "info.h"
+#include "manopt.h"
+#include "evalmatchfiles.h"
+#include "evalmethylmatchfiles.h"
+#include "splicesites.h"
+#include "startsites.h"
+#include "snvsplines.h"
+
+unsigned char mute = 0; /* NOTE(review): not referenced in the visible part of this file - presumably read by the message macros; confirm */
+char *ntcode; /* assigned in main() from getNTcodekey() */
+
+/*------------------------------- getfullstats -------------------------------
+ *
+ * @brief compute the sample statistics from complete passes over the match
+ *        file (bl_matchfileCensus) and attach them to file->index->stats
+ * @author Steve Hoffmann
+ * @note the samplecond and samplescr arguments are overwritten with
+ *       10000000 on entry; usenativequal is not used
+ */
+void
+getfullstats (void *space, matchfile_t *file,
+    fasta_t *fasta, Uint maxframesize, Uint mincover, Uint maxcover, double minfrac,
+    char entropyfilter, char strandbias, char usenativequal, Uint samplecond, Uint samplescr)
+{
+  double gm, gs, gk;
+  matchfileSampleStats_t *st;
+
+  samplecond = 10000000;
+  samplescr = 10000000;
+
+  st = bl_matchfileInitSampleStats(space, MAX(samplescr,samplecond)+20000, mincover, maxcover, minfrac, entropyfilter, 10, .15);
+  st->strand = strandbias;
+  file->index->stats = st;
+
+  /* pass 1: error densities */
+  MSG("evaluating error distribution.\n");
+  bl_matchfileCensus(space, file, fasta, maxframesize,
+      bl_matchfileGetErrorDensity, st);
+
+  /* the sorted values go through gevLmoment and gevmle; the first
+   * parameter coming back (gm) is kept as pxx */
+  qsort(st->e, st->e_N, sizeof(double), cmp_dbl_qsort);
+  gevLmoment(st->e, st->e_N, &gm, &gs, &gk);
+  gevmle(NULL, st->e, st->e_N, &gm, &gs, &gk, 100000, st->e[0], st->e[st->e_N-1]);
+  st->pxx = gm;
+  NFO("setting maximum error for sampling to:%f.\n", st->pxx);
+
+  /* pass 2: conditionals */
+  MSG("sampling parameter.\n");
+  bl_matchfileCensus(space, file, fasta, maxframesize,
+      bl_matchfileGetConditionals, st);
+
+  /* pass 3: scores; the callback receives the index, not the stats */
+  MSG("sampling scores.\n");
+  st->ecdf = ecdf_init(st->eraw, st->e_N);
+  bl_matchfileCensus(space, file, fasta, maxframesize,
+      bl_matchfileGetScoreSample, file->index);
+}
+
+
+/*--------------------------------- getstats ---------------------------------
+ *
+ * @brief get matchfile stats from sampled cross sections
+ * @author Steve Hoffmann
+ * @note usenativequal is not used
+ *
+ * NOTE(review): in contrast to getfullstats the freshly initialised stats
+ * object is neither stored in file->index->stats nor released here -
+ * confirm that the sampling routines take care of it.
+ */
+void
+getstats (void *space, matchfile_t *file, fasta_t *fasta, Uint mincover,
+    Uint maxcover, double minfrac, char entropyfilter, char strandbias, char usenativequal, Uint samplecond, Uint samplescr)
+{
+  Uint c, nchr = file->index->noofchroms;
+  Uint *mapsize = NULL;
+  unsigned char **maps = NULL;
+  double gm, gs, gk;
+  matchfileSampleStats_t *st;
+
+  st = bl_matchfileInitSampleStats(space, MAX(samplescr,samplecond)+20000, mincover, maxcover, minfrac, entropyfilter, 10, .15);
+  st->strand = strandbias;
+
+  MSG("generating small map\n");
+  maps = bl_matchfileSmallMap (space, file, &mapsize);
+
+  /* step 1: error densities (fourth argument fixed to 100000) */
+  MSG("evaluating error distribution.\n");
+  bl_matchfileSampleCrossSections(space, file, fasta, 100000,
+      bl_matchfileGetErrorDensity, maps, mapsize, st);
+
+  /* sorted values go through gevLmoment and gevmle; gm becomes pxx */
+  qsort(st->e, st->e_N, sizeof(double), cmp_dbl_qsort);
+  gevLmoment(st->e, st->e_N, &gm, &gs, &gk);
+  gevmle(NULL, st->e, st->e_N, &gm, &gs, &gk, 100000, st->e[0], st->e[st->e_N-1]);
+  st->pxx = gm;
+  NFO("setting maximum error for sampling to:%f.\n", st->pxx);
+
+  /* step 2: conditionals, called with samplecond */
+  MSG("sampling parameter.\n");
+  bl_matchfileSampleCrossSections(space, file, fasta, samplecond,
+      bl_matchfileGetConditionals, maps, mapsize, st);
+
+  /* step 3: scores, called with samplescr; the callback gets the index */
+  MSG("sampling scores.\n");
+  st->ecdf = ecdf_init(st->eraw, st->e_N);
+  bl_matchfileSampleCrossSections(space, file, fasta, samplescr,
+      bl_matchfileGetScoreSample, maps, mapsize, file->index);
+
+  /* release the per-chromosome maps, then the containers themselves */
+  for(c=0; c < nchr; c++) {
+    if(maps[c] != NULL) {
+      FREEMEMORY(space, maps[c]);
+    }
+  }
+
+  FREEMEMORY(space, maps);
+  FREEMEMORY(space, mapsize);
+
+  return ;
+}
+
+/*----------------------------------- view -----------------------------------
+ *
+ * @brief launch bl_matchfileViewer on the given match files
+ * @author Steve Hoffmann
+ * @note 100 and 1000 are passed as the two trailing viewer arguments
+ */
+
+void
+view (void *space, matchfile_t **files, Uint nooffiles, fasta_t *fasta,
+    annotationtrack_t *bed)
+{
+  MSG("starting viewer.\n");
+
+  bl_matchfileViewer(space, files, nooffiles, fasta, bed, 100, 1000);
+  return ;
+}
+
+/*----------------------------------- eval -----------------------------------
+ *
+ * @brief evaluation of matchfiles with bl_matchfileTest
+ * @author Steve Hoffmann
+ * @param cut cutoff stored in the stats of files[0]; -1 calls getcutoff()
+ */
+void
+eval (void *space, matchfile_t **files, int *groups, Uint nooffiles, fasta_t *fasta, Uint maxframesize, double cut)
+{
+  MSG("evaluating x-sections.\n");
+
+  if(cut == -1) {
+    /* no override given */
+    MSG("calculating cutoff\n");
+    getcutoff( files[0]->index->stats , NULL, NULL, NULL, NULL, NULL);
+  } else {
+    MSG("override cutoff\n");
+    files[0]->index->stats->cut = cut;
+  }
+
+  /* the callback gets no extra info */
+  bl_matchfileEvalCrossSections(space, files, groups, nooffiles, fasta, maxframesize,
+      bl_matchfileTest, NULL);
+
+  return ;
+}
+/*-------------------------------- simpleeval --------------------------------
+ *
+ * @brief simple evaluation of matchfiles using bl_matchfileSimpleGEV
+ * @author Steve Hoffmann
+ * @param usenativequal passed to the callback by address, i.e. as a
+ *        pointer to this function's own copy of the flag
+ */
+
+void
+simpleeval (void *space, matchfile_t **files, int *groups, Uint nooffiles, fasta_t *fasta, Uint maxframesize, char usenativequal)
+{
+
+  MSG("simple evaluating x-sections.\n");
+
+  bl_matchfileEvalCrossSections(space, files, groups, nooffiles, fasta, maxframesize,
+      bl_matchfileSimpleGEV, &usenativequal);
+
+  return ;
+}
+
+/*--------------------------------- gatkeval ---------------------------------
+ *
+ * @brief evaluation of matchfiles using bl_matchfileSimpleGATK
+ * @author Steve Hoffmann
+ * @param usenativequal passed to the callback by address, i.e. as a
+ *        pointer to this function's own copy of the flag
+ */
+
+void
+gatkeval (void *space, matchfile_t **files, int *groups, Uint nooffiles, fasta_t *fasta, Uint maxframesize, char usenativequal)
+{
+
+  MSG("gatk evaluating x-sections.\n");
+
+  bl_matchfileEvalCrossSections(space, files, groups, nooffiles, fasta, maxframesize,
+      bl_matchfileSimpleGATK , &usenativequal);
+
+  return ;
+}
+
+/*------------------------------- evalconsensus -------------------------------
+ *
+ * @brief consensus evaluation of matchfiles using bl_matchfileConsensus
+ * @author Christian Otto
+ * @note the files array itself is handed to the callback as its
+ *       extra info argument
+ */
+
+void
+evalconsensus (void *space, matchfile_t **files, int *groups, Uint nooffiles, fasta_t *fasta, Uint maxframesize)
+{
+
+  MSG("consensus evaluating x-sections.\n");
+
+  bl_matchfileEvalCrossSections(space, files, groups, nooffiles, fasta, maxframesize,
+      bl_matchfileConsensus , files);
+
+  return ;
+}
+
+/*------------------------------ callMethylSimple ------------------------------
+ *
+ * @brief simple methylation calling: writes a VCF header to stdout and
+ *        runs bl_matchfileCallMethylSimple over all cross sections
+ * @author Christian Otto
+ * @note exactly one match file is supported (asserted); the sample column
+ *       is named after the file's basename without .gz/.gzip and suffix
+ */
+void
+callMethylSimple (void *space, matchfile_t **files, int *groups, Uint nooffiles, fasta_t *fasta, Uint maxframesize)
+{
+  assert(nooffiles == 1);
+
+  Uint i, len;
+  char *name, *basename;
+  matfile_t matfile;
+  matfile.dev = stdout;
+  matfile.fasta = fasta;
+  matfile.files = files;
+
+  /* prepare sample name from file name */
+  basename = bl_basename(files[0]->filename);
+  len = strlen(basename);
+  name = ALLOCMEMORY(space, NULL, char, len + 1);
+  memmove(name, basename, len + 1);
+
+  if (files[0]->gzip){
+    /* the length checks keep the suffix comparison inside the buffer */
+    if (len >= 3 && strncmp(&name[len-3], ".gz", 3) == 0){
+      name[len-3] = '\0';
+    }
+    else if (len >= 5 && strncmp(&name[len-5], ".gzip", 5) == 0) {
+      name[len-5] = '\0';
+    }
+    /* a file flagged as gzip must have lost one of the suffixes */
+    assert(strlen(name) != len);
+  }
+  len = bl_fileprefixlen(name);
+  name[len] = '\0';
+
+  MSG("evaluating x-sections.\n");
+
+  /* output VCF header */
+  fprintf(matfile.dev, "##fileformat=VCFv4.1\n");
+
+  for (i = 0; i < fasta->nooffiles; i++){
+    fprintf(matfile.dev, "##reference=file://%s\n",
+        fasta->filenames[i]);
+  }
+
+  for (i = 0; i < fasta->noofseqs; i++){
+    fprintf(matfile.dev, "##contig=<ID=%s,length=%d>\n",
+        bl_fastaGetDescription(fasta, i),
+        bl_fastaGetSequenceLength(fasta,i));
+  }
+  //fileDate? program call?
+  fprintf(matfile.dev, "##INFO=<ID=CS,Number=1,Type=Character,Description=\"Bisulfite conversion strand, i.e., strand of cytosine relative to reference\">\n");
+  fprintf(matfile.dev, "##INFO=<ID=CC,Number=1,Type=String,Description=\"Sequence context of two bases on the reference relative to bisulfite conversion strand, missing value if context is beyond reference boundaries\">\n");
+  fprintf(matfile.dev, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">\n");
+  fprintf(matfile.dev, "##INFO=<ID=MMR,Number=1,Type=Float,Description=\"Mean methylation rate among samples with data\">\n");
+  fprintf(matfile.dev, "##INFO=<ID=DMR,Number=1,Type=Float,Description=\"Difference in methylation rate (Sample1 - Sample2).\">\n");
+  fprintf(matfile.dev, "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth, not filtered by mapping quality, base quality, or bisulfite conversion information\">\n");
+  fprintf(matfile.dev, "##FORMAT=<ID=MDP,Number=1,Type=Integer,Description=\"Number of reads after filtering by bisulfite conversion strand\">\n");
+  fprintf(matfile.dev, "##FORMAT=<ID=MDP3,Number=3,Type=Integer,Description=\"Number of reads with 1) unconverted cytosines, 2) converted cytosines or thymines, and 3) other bases after filtering by bisulfite conversion strand\">\n");
+  fprintf(matfile.dev, "##FORMAT=<ID=MRDP,Number=1,Type=Integer,Description=\"Number of reads with unconverted (methylated) and converted (unmethylated) cytosines after filtering by bisulfite conversion strand\">\n");
+  fprintf(matfile.dev, "##FORMAT=<ID=CM,Number=1,Type=Integer,Description=\"Number of reads with unconverted (methylated) cytosines after filtering by bisulfite conversion strand\">\n");
+  fprintf(matfile.dev, "##FORMAT=<ID=CU,Number=1,Type=Integer,Description=\"Number of reads with converted (unmethylated) cytosines after filtering by bisulfite conversion strand\">\n");
+  fprintf(matfile.dev, "##FORMAT=<ID=MR,Number=1,Type=Float,Description=\"Estimated methylation rate\">\n");
+  fprintf(matfile.dev, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n", name);
+
+  bl_matchfileEvalCrossSections(space, files, groups, nooffiles, fasta, maxframesize,
+      bl_matchfileCallMethylSimple, &matfile);
+
+  FREEMEMORY(space, name);
+  return ;
+}
+
+/* methylation bias: run bl_matchfileCalcMethylBias over all cross sections */
+void
+calcMethylBias (void *space, matchfile_t **files, int *groups, Uint nooffiles, fasta_t *fasta, Uint maxframesize)
+{
+  matfile_t matfile;
+
+  /* context handed to the callback; its device is stdout */
+  matfile.files = files;
+  matfile.fasta = fasta;
+  matfile.dev = stdout;
+  bl_matchfileEvalCrossSections(space, files, groups, nooffiles, fasta, maxframesize,
+      bl_matchfileCalcMethylBias, &matfile);
+}
+
+
+/*-------------------------------- evalstarts --------------------------------
+ *
+ * @brief coverage evaluation: runs bl_coverage over all cross sections
+ *        (the start site pass via bl_matchfileStartSites is disabled)
+ * @author Steve Hoffmann
+ * @param space memory space used for the counter array
+ * @param bed   not used
+ */
+
+void
+evalstarts (void *space, matchfile_t **files, int *groups, Uint nooffiles,
+    fasta_t *fasta, Uint maxframesize, annotationtrack_t *bed )
+{
+
+  Uint *cntr;
+
+  /* 255 zeroed counters that are handed to the callback */
+  cntr = ALLOCMEMORY(space, NULL, Uint, 255);
+  memset(cntr, 0, sizeof(Uint)*255);
+
+  /* the message is still emitted although the start site pass and the
+   * dump of the counters are switched off */
+  MSG("start sites.\n");
+
+  MSG("coverage.\n");
+  bl_matchfileEvalCrossSections(space, files, groups, nooffiles, fasta, maxframesize,
+      bl_coverage, cntr);
+
+  /* the counters are private to this function, so release them here */
+  FREEMEMORY(space, cntr);
+
+  return ;
+}
+
+/*------------------------------ writeexpression ------------------------------
+ *
+ * @brief write one expression file per match file, named <matchfile>.wig
+ * @author Steve Hoffmann
+ * @param groups not used
+ * @param bed    not used
+ */
+void
+writeexpression (void *space, matchfile_t **files, int *groups, Uint nooffiles,
+    fasta_t *fasta, annotationtrack_t *bed)
+{
+  Uint i;
+  size_t len;
+  char *filename;
+  MSG("write expression file.\n");
+
+  for(i=0; i < nooffiles; i++) {
+    len = strlen(files[i]->filename);
+    /* room for the name, ".wig" and the terminating NUL */
+    filename = ALLOCMEMORY(space, NULL, char, len+5);
+    memmove(filename, files[i]->filename, len);
+    sprintf(&filename[len], ".wig");
+    bl_writeexpression(space, filename, strlen(filename), files[i], fasta,
+        10000000, 1000000);
+    /* release the per-file name buffer once the file is written */
+    FREEMEMORY(space, filename);
+  }
+}
+
+
+
+/*-------------------------------- evalsplice --------------------------------
+ *
+ * @brief collect split reads, condense them to splice sites and write the
+ *        sites to filename and transfilename (via printsplicebed)
+ * @author Steve Hoffmann
+ * @note if bed is given, the annotation is written to "splice.gff";
+ *       failure to open an output file terminates the program
+ */
+void
+evalsplice (void *space, matchfile_t **files, int *groups, Uint nooffiles, fasta_t *fasta, Uint maxframesize,
+    annotationtrack_t *bed, char *filename, char *transfilename, Uint minsplitno)
+{
+  Uint i, cidx, pos, len;
+  char *chr, *chr2, *ptr, *base;
+  splitmap_t map;
+  splicemap_t *sm;
+  FILE *fp1, *fp2;
+
+  bl_matchfileInitSplitMap(space, &map, bed, files, fasta);
+  MSG("eval splice sites.\n");
+  bl_matchfileEvalCrossSections(space, files, groups, nooffiles, fasta, maxframesize,
+      bl_matchfileSplit, &map);
+  MSG("condensing sites.\n");
+  sm = bl_matchfileSpliceMap (space, &map, 10, minsplitno);
+  MSG("writing splice sites to stdout.\n");
+
+  /* both output files are opened for writing */
+  fp1 = fopen(filename, "w");
+  if (fp1 == NULL) {
+    fprintf(stderr, "Couldnt open %s for writing. Exit forced.\n", filename);
+    exit(-1);
+  }
+
+  fp2 = fopen(transfilename, "w");
+  if (fp2 == NULL) {
+    fprintf(stderr, "Couldnt open %s for writing. Exit forced.\n", transfilename);
+    exit(-1);
+  }
+
+  /* the basename of the first output file is passed to printsplicebed */
+  base = bl_basename(filename);
+
+  //printsplice(space, sm, stdout);
+  printsplicebed(space, sm, minsplitno, base, fp1, fp2);
+
+  MSG("writing splice sites to gff.\n");
+  if(bed) {
+    bl_annotationtrackGetStats (space, bed);
+    bl_matchfileSpliceAnnotation(space, sm, bed);
+    bl_GFFwrite("splice.gff", bed);
+  }
+
+  /* the per-split output below is disabled; the loop only builds and
+   * frees a copy of each description */
+  for(i=0; i < map.noofsplits; i++) {
+    cidx = map.cidx[i];
+    chr = bl_fastaGetDescription(fasta, cidx);
+    pos = map.pos[i];
+
+    len = strlen(chr);
+    chr2 = ALLOCMEMORY(space, NULL, char, len+1);
+    memmove(chr2, chr, len);
+    chr2[len] = 0;
+    ptr = strtok(chr2, " ");
+    // dummy if to avoid not used warnings
+    if (0){
+      printsplits(space, ptr, pos, &map.cs[i], &map);
+    }
+    FREEMEMORY(space, chr2);
+  }
+
+  bl_matchfileDestructSplitMap(space, &map);
+  bl_matchfileDestructSpliceMap (space, sm);
+  FREEMEMORY(space, sm);
+
+  fclose(fp1);
+  fclose(fp2);
+
+  return ;
+}
+
+/*----------------------------------- main -----------------------------------
+ *
+ * @brief the main
+ * @author Steve Hoffmann
+ *
+ */
+
+
+int
+main(int argc, char **argv) {
+
+ void *space = NULL;
+
+ manopt_optionset optset;
+ manopt_arg *unflagged;
+ manopt_arg *dbfilenames;
+ manopt_arg *queries;
+ manopt_arg *indices;
+ manopt_arg *grouplist;
+ manopt_arg *bedfilenames;
+ matchfile_t **files = NULL;
+ annotationtrack_t *track = NULL;
+ fasta_t *fasta = NULL;
+// matchfileindex_t *idx2 = NULL;
+ Uint prefixlen=0;
+ Uint splicebasenamelen;
+ Uint scoreplotbasenamelen;
+ Uint minsplitno = 4;
+ Uint mincover = 6;
+ Uint maxqual = 64;
+ Uint maxcover = 200;
+ char entropyfilter = 0;
+ double minfrac = 0.1;
+ unsigned char gzip = 0, browse=0, call=0, saveindex=0, stats=0, dumpstats=0, starts=0, wiggle=0, strandbias=0, consensus=0, methylcall=0, methylbias=0;
+ char version[]="0.1";
+ char *splicebasename = NULL;
+ char *scoreplotbasename = NULL;
+ char usenativequal = 0, simplegev=0, gatk=0;
+ int *groups = NULL;
+ int i;
+ Uint maxframesize = 100000;
+
+ FILE *beddev = NULL;
+ char *filename;
+ char *transfilename;
+ double minscore = -1;
+ Uint samplescr = 200000;
+ Uint samplecond = 60000;
+
+ initIUPAC(1,1);
+ manopt_initoptionset(&optset, argv[0], NULL,
+ "Heuristic mapping of short sequences\n",
+ "SEGEMEHL is free software for non-commercial use \n (C) 2008 Bioinformatik Leipzig\n",
+ version,
+ "Please report bugs to steve at bioinf.uni-leipzig.de");
+ manopt(&optset, LISTOPT, 1, 'd', "database",
+ "list of path/filename(s) of database sequence(s)", "<file> [<file> ...]",
+ NULL, NULL);
+ manopt(&optset, LISTOPT, 1, 'q', "query",
+ "path/filename of alignment file", "<file> [<file> ...]", NULL, NULL);
+ manopt(&optset, LISTOPT, 0, 'i', "index",
+ "path/filename of db index", "[<file> ... ]", NULL, NULL);
+ manopt(&optset, LISTOPT, 0, 'a', "annotation",
+ "path/filename of bed annotation", "[<bedfile> ... ]", NULL, NULL);
+ manopt(&optset, LISTOPT, 0, 'x', "generate",
+ "generate db index and store to disk", "<file>", NULL, NULL);
+ manopt(&optset, LISTOPT, 0, 'g', "group",
+ "group number for all nput files (1-offset and at most #files groups)", "[<int> ...]", NULL, NULL);
+ manopt(&optset, REQSTRINGOPT, 0, 's', "splice",
+ "dump splice sites to <basename>", NULL, NULL, &splicebasename);
+ manopt(&optset, FLAG, 0, 'S', "starts",
+ "dump start sites", NULL, NULL, &starts);
+ manopt(&optset, FLAG, 0, 'b', "browse",
+ "start browser", NULL, NULL, &browse);
+ manopt(&optset, FLAG, 0, 'c', "call",
+ "variant caller", NULL, NULL, &call);
+ manopt(&optset, FLAG, 0, 'w', "expression",
+ "generate a expression graph file from the match files", NULL, NULL, &wiggle);
+ manopt(&optset, FLAG, 0, 'Z', "dumpstats",
+ "dump data stats", NULL, NULL, &dumpstats);
+ manopt(&optset, REQSTRINGOPT, 0, 'R', "Rplots",
+ "write files for R score plots <basename>", NULL, NULL, &scoreplotbasename);
+ manopt(&optset, FLAG, 0, 'H', "stats",
+ "stats calculation", NULL, NULL, &stats);
+ manopt(&optset, REQUINTOPT, 0, 'm', "minsplitno",
+ "minimum number of splits required to call a splice site", "<n>",
+ NULL, &minsplitno);
+ manopt(&optset, REQUINTOPT, 0, 'N', "mincover",
+ "minimum coverage to call sites", "<n>",
+ NULL, &mincover);
+ manopt(&optset, REQUINTOPT, 0, 'M', "maxcover",
+ "minimum coverage to call sites", "<n>",
+ NULL, &maxcover);
+ manopt(&optset, REQUINTOPT, 0, 'K', "maxframesize",
+ "maximum size of memory resident frame", "<n>",
+ NULL, &maxframesize);
+ manopt(&optset, REQDBLOPT, 0, 'F', "minfrac",
+ "minimum fraction of ALT alleles to call sites", "<f>",
+ NULL, &minfrac);
+ manopt(&optset, REQDBLOPT, 0, 'u', "minscore",
+ "override minimum score", "<f>", NULL, &minscore);
+ manopt(&optset, FLAG, 0, 'E', "entropyfilter",
+ "turn on entropy filter", NULL, NULL, &entropyfilter);
+ manopt(&optset, FLAG, 0, 'G', "strand",
+ "use pbinom strand", NULL, NULL, &strandbias);
+ manopt(&optset, FLAG, 0, 'Q', "nativequal",
+ "use native quals", NULL, NULL, &usenativequal);
+ manopt(&optset, FLAG, 0, 'V', "simplegev",
+ "use simple gev", NULL, NULL, &simplegev);
+ manopt(&optset, FLAG, 0, 'A', "gatk",
+ "use GATK model", NULL, NULL, &gatk);
+ manopt(&optset, FLAG, 0, 'C', "consensus",
+ "use simple consensus calling", NULL, NULL, &consensus);
+ manopt(&optset, FLAG, 0, 'B', "methylcall",
+ "use simple methylation calling", NULL, NULL, &methylcall);
+ manopt(&optset, FLAG, 0, 'Y', "methylbias",
+ "calculate methylation bias", NULL, NULL, &methylbias);
+ manopt(&optset, REQUINTOPT, 0, 'O', "qoffset",
+ "quality offset", "<n>", NULL, &maxqual);
+ manopt(&optset, REQUINTOPT, 0, 'U', "samplecond",
+ "sample size for conditionals", "<n>", NULL, &samplecond);
+ manopt(&optset, REQUINTOPT, 0, 'X', "samplescr",
+ "sample size for scores", "<n>", NULL, &samplescr);
+
+
+
+ unflagged = manopt_getopts(&optset, argc, argv);
+ saveindex = manopt_isset(&optset, 'x', NULL);
+
+ if(!(!manopt_isset(&optset, 'i', NULL) ^ !manopt_isset(&optset, 'x', NULL))) {
+ manopt_help(&optset, "please give index filename using -i XOR -x option\n");
+ } else if(unflagged->noofvalues > 1) {
+ manopt_help(&optset, "unknown argument(s)\n");
+ }
+
+ MSG("reading database sequences.\n");
+ NFO("minsplitno set to %d\n", minsplitno);
+
+ dbfilenames = manopt_getarg(&optset, 'd', "database");
+ fasta = bl_fastxGetSet(space, dbfilenames->values,
+ dbfilenames->noofvalues, 1, 0, 0, 1);
+
+ NFO("%d database sequences found.\n", fasta->noofseqs);
+ MSG("reading query files.\n");
+
+ queries = manopt_getarg(&optset, 'q', "query");
+ if(queries->noofvalues > 30) {
+ manopt_help(&optset, "currently no more than 30 query files allowed\n");
+ }
+
+ grouplist = manopt_getarg(&optset, 'g', "group");
+ if(grouplist) {
+ if(grouplist->noofvalues != queries->noofvalues)
+ manopt_help(&optset, "please provide a group name for each input file");
+
+ groups = ALLOCMEMORY(space, NULL, Uint, grouplist->noofvalues);
+
+ for(i=0; i < grouplist->noofvalues; i++) {
+ groups[i] = atoi(grouplist->values[i]);
+ if(groups[i] == 0)
+ manopt_help(&optset, "please provide group numbers (int) > 0");
+ if(groups[i] > queries->noofvalues)
+ manopt_help(&optset, "please provide groupnumbers <= number of input files");
+ NFO("found group number %d\n", groups[i]);
+ }
+ }
+
+ if (methylcall || methylbias){
+ if (queries->noofvalues > 1){
+ manopt_help(&optset, "multiple query files are not supported for methylation analyses\n");
+ }
+ }
+
+ if(saveindex) {
+ indices = manopt_getarg(&optset, 'x', "generate");
+ } else {
+ indices = manopt_getarg(&optset, 'i', "index");
+ }
+
+ if(indices->noofvalues != queries->noofvalues) {
+ manopt_help(&optset, "please provide an index file name for each query file\n");
+ }
+
+ ntcode = getNTcodekey(space);
+ files = ALLOCMEMORY(space, NULL, matchfile_t*, queries->noofvalues);
+
+ for(i=0; i < queries->noofvalues; i++) {
+
+ files[i] = ALLOCMEMORY(space, NULL, matchfile_t, 1);
+ files[i]->fmt = 0;
+ files[i]->index = NULL;
+ files[i]->filename = queries->values[i];
+
+ prefixlen = bl_fileprefixlen(files[i]->filename);
+
+ if(strncmp(&files[i]->filename[prefixlen], ".gz", 3) == 0 ||
+ strncmp(&files[i]->filename[prefixlen], ".gzip", 5) == 0) {
+ gzip = 1;
+ }
+
+ files[i]->gzip = gzip;
+
+ if(saveindex) {
+
+ bl_matchfileIndex(space, files[i], fasta);
+ bl_matchfileWriteIndex(files[i]->index, indices->values[i]);
+ /* if(stats) {
+ getstats(space, files[i], fasta, mincover, maxcover, minfrac, entropyfilter, strandbias, usenativequal);
+ }
+
+ bl_matchfileWriteIndex(files[i]->index, indices->values[i]);
+ idx2 = bl_matchfileReadIndex(space, indices->values[i]);
+
+ fprintf(stderr, "compare index (%p:%p):%d\n",
+ (void*)files[i]->index, (void *)idx2,
+ bl_compareIndices(files[i]->index, idx2));
+ bl_matchfileDestructIndex(space, idx2);
+ FREEMEMORY(space, idx2);
+ */
+ } else if(indices->values[i]) {
+
+ MSG("reading index file\n");
+ files[i]->index = bl_matchfileReadIndex(space, indices->values[i]);
+ }
+
+
+ if(stats) {
+
+ /*if(files[i]->index->stats) {
+ bl_matchfileDestructSampleStats(NULL, files[i]->index->stats);
+ }
+ */
+ if(samplecond ==0 || samplescr ==0) {
+ MSG("sampling turned off.");
+ getfullstats (space, files[i], fasta, maxframesize, mincover, maxcover, minfrac,
+ entropyfilter, strandbias, usenativequal, samplecond, samplescr);
+ } else {
+ getstats(space, files[i], fasta, mincover, maxcover, minfrac, entropyfilter, strandbias, usenativequal, samplecond, samplescr);
+
+ }
+ bl_matchfileWriteIndex(files[i]->index, indices->values[i]);
+ /*
+ idx2 = bl_matchfileReadIndex(space, indices->values[i]);
+ fprintf(stderr, "compare index (%p:%p):%d\n",
+ (void*)files[i]->index, (void *)idx2, bl_compareIndices(files[i]->index, idx2));
+ bl_matchfileDestructIndex(space, idx2);
+ FREEMEMORY(space, idx2);
+ */
+ }
+
+
+ /*
+ * typically stats should be present if stats
+ * failed however we are not allowed to set
+ * this one
+ */
+
+ if(files[i]->index->stats) {
+ if(files[i]->index->stats->mincover != mincover ||
+ files[i]->index->stats->maxcover != maxcover ||
+ files[i]->index->stats->minfrac != minfrac) {
+ MSG("WARNING: resetting minfrac, mincover or maxcover!");
+ NFO("mincover: %d, maxcover:%d, minfrac:%f\n",files[i]->index->stats->mincover, files[i]->index->stats->maxcover, files[i]->index->stats->minfrac);
+ }
+ files[i]->index->stats->mincover = mincover;
+ files[i]->index->stats->maxcover = maxcover;
+ files[i]->index->stats->minfrac = minfrac;
+ files[i]->index->stats->entropyfilter = entropyfilter;
+ files[i]->index->stats->usenativequal = usenativequal;
+ files[i]->index->stats->usegev = 0;
+ files[i]->index->stats->strand = strandbias;
+ if(stats && files[i]->index->stats->ecdf) {
+ ecdf_destruct(files[i]->index->stats->ecdf);
+ FREEMEMORY(space, files[i]->index->stats->ecdf);
+ }
+ files[i]->index->stats->ecdf =
+ ecdf_init(files[i]->index->stats->e, files[i]->index->stats->e_N);
+
+ }
+
+ //fprintf(stderr, "maxcover: %d\n",files[i]->index->stats->maxcover);
+
+ /*
+ * if stats are not present in any file
+ * neither gev, dumpstats, nor call
+ * can be called
+ */
+ if (!files[i]->index->stats &&
+ (dumpstats || scoreplotbasename || call || starts)){
+ manopt_help(&optset, "please generate an index with stats (option -H) if the options -Z, -c, -S are used\n");
+ }
+ }
+
+
+ if(manopt_isset(&optset, 'a', "annotation")) {
+ bedfilenames = manopt_getarg(&optset, 'a', "annotation");
+ for(i=0; i < bedfilenames->noofvalues; i++) {
+
+ prefixlen = bl_fileprefixlen(bedfilenames->values[i]);
+ if( strncmp(&bedfilenames->values[i][prefixlen], ".bed", 3) == 0 ||
+ strncmp(&bedfilenames->values[i][prefixlen], ".BED", 3) == 0) {
+ track = bl_BEDread(space, bedfilenames->values[i]);
+
+ beddev = fopen("sorted.bed", "w");
+ if(beddev == NULL) {
+ fprintf(stderr, "could not open file %s. Exit forced.", "sorted.bed");
+ exit(-1);
+ }
+
+ bl_BEDwrite(track, beddev);
+
+ } else if( strncmp(&bedfilenames->values[i][prefixlen], ".gff", 3) == 0 ||
+ strncmp(&bedfilenames->values[i][prefixlen], ".GFF", 3) == 0) {
+ track = bl_GFFread(space, bedfilenames->values[i]);
+ bl_BEDwrite(track, beddev);
+
+ } else {
+ manopt_help(&optset, "please provide files with .GFF or .BED extension\n");
+ }
+ }
+ }
+
+ /*
+ * TODO: stats for all files, what if stats failed!
+ */
+
+ if(gatk) {fprintf(stderr, "using GATK model\n"); gatkeval(space, files, groups, queries->noofvalues, fasta, maxframesize, usenativequal); }
+ if(consensus){ fprintf(stderr, "calling consensus\n"); evalconsensus(space, files, groups, queries->noofvalues, fasta, maxframesize); }
+ //if(GEV) { fprintf(stderr, "fitting GEV\n"); bl_matchfileFitGEV (NULL, files[0]->index->stats); }
+ if(dumpstats) bl_matchfileDumpSampleStats(files[0]->index->stats);
+ if(call) {
+
+ eval(space, files, groups, queries->noofvalues, fasta, maxframesize, minscore);
+
+ }
+ if(simplegev) simpleeval(space, files, groups, queries->noofvalues, fasta, maxframesize, usenativequal);
+ if(methylcall) callMethylSimple(space, files, groups, queries->noofvalues, fasta, maxframesize);
+ if(methylbias) calcMethylBias(space, files, groups, queries->noofvalues, fasta, maxframesize);
+
+ if(manopt_isset(&optset, 'R', "scoreplot")) {
+
+
+ if(!scoreplotbasename) {
+ scoreplotbasename = bl_basename(files[0]->filename);
+ }
+ NFO("plotting sample scores to basename: %s", scoreplotbasename);
+
+ scoreplotbasenamelen = strlen(scoreplotbasename);
+
+ char *histofilename = ALLOCMEMORY(space, NULL, char, scoreplotbasenamelen+7+4+1);
+ sprintf(histofilename, "%s.histo.dat", scoreplotbasename);
+
+ char *scorefilename = ALLOCMEMORY(space, NULL, char, scoreplotbasenamelen+7+4+1);
+ sprintf(scorefilename, "%s.score.dat", scoreplotbasename);
+
+ char *cutfilename = ALLOCMEMORY(space, NULL, char, scoreplotbasenamelen+7+4+1);
+ sprintf(cutfilename, "%s.cut.dat", scoreplotbasename);
+
+ char *splinefilename = ALLOCMEMORY(space, NULL, char, scoreplotbasenamelen+7+4+1);
+ sprintf(splinefilename, "%s.spline.dat", scoreplotbasename);
+
+ char *estimatefilename = ALLOCMEMORY(space, NULL, char, scoreplotbasenamelen+7+4+1);
+ sprintf(estimatefilename, "%s.estim.dat", scoreplotbasename);
+
+ getcutoff(files[0]->index->stats, histofilename, scorefilename, cutfilename, splinefilename, estimatefilename);
+
+ FREEMEMORY(space, histofilename);
+ FREEMEMORY(space, scorefilename);
+ FREEMEMORY(space, cutfilename);
+ FREEMEMORY(space, splinefilename);
+ FREEMEMORY(space, estimatefilename);
+
+ }
+
+
+ if(manopt_isset(&optset, 's', "splice") &&
+ manopt_isset(&optset, 'q', "query")) {
+
+ if(!splicebasename) {
+ splicebasename = bl_basename(files[0]->filename);
+ }
+
+ splicebasenamelen = strlen(splicebasename);
+
+ filename = ALLOCMEMORY(space, NULL, char, splicebasenamelen+7+4+1);
+ sprintf(filename, "%s.splice.bed", splicebasename);
+
+ transfilename = ALLOCMEMORY(space, NULL, char, splicebasenamelen+7+4+1);
+ sprintf(transfilename, "%s.trans.bed", splicebasename);
+
+ // not used: splice = 1;
+ evalsplice(space, files, groups, queries->noofvalues, fasta, maxframesize, track, filename, transfilename, minsplitno);
+
+ FREEMEMORY(space, filename);
+ FREEMEMORY(space, transfilename);
+ }
+
+
+ if(browse) view(space, files, queries->noofvalues, fasta, track);
+ if(starts) evalstarts(space, files, groups, queries->noofvalues, fasta, maxframesize, track);
+ if(wiggle) writeexpression(space, files, groups, queries->noofvalues, fasta, track);
+
+ bl_fastaDestruct(space, fasta);
+ FREEMEMORY(space, fasta);
+
+ if(files) {
+ for(i=0; i < queries->noofvalues; i++) {
+ if (files[i]->index) {
+ bl_matchfileDestructIndex(space, files[i]->index);
+ FREEMEMORY(space, files[i]->index);
+ }
+ FREEMEMORY(space, files[i]);
+ }
+ FREEMEMORY(space, files);
+ }
+
+ if(groups) {
+ FREEMEMORY(space, groups);
+ }
+
+
+ if(track) {
+ bl_annotationtrackDestruct(space, track);
+ FREEMEMORY(space, track);
+ }
+
+
+ MSG("Goodbye.\n");
+
+ manopt_destructoptionset(&optset);
+ manopt_destructarg(unflagged);
+ FREEMEMORY(space, unflagged);
+
+ FREEMEMORY(space, ntcode);
+
+ return 0;
+}
+
diff --git a/segemehl/libs/matfile.h b/segemehl/libs/matfile.h
new file mode 100644
index 0000000..a145dfc
--- /dev/null
+++ b/segemehl/libs/matfile.h
@@ -0,0 +1,25 @@
+#ifndef MATFILE_H
+#define MATFILE_H
+
+/*
+ *
+ * matfile.h
+ *
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 08/25/2010 03:47:54 PM CEST
+ *
+ */
+
+#include "biofiles.h"
+#include "matchfiles.h"
+
+/*
+ * Record bundling a stdio stream (dev), a pointer to a FASTA container
+ * (fasta) and an array of pointers to match-file descriptors (files).
+ * NOTE(review): ownership of the members and the number of entries in
+ * `files` are not visible in this header -- confirm against the code
+ * that fills the struct.
+ */
+typedef struct {
+ FILE *dev;
+ fasta_t *fasta;
+ matchfile_t **files;
+} matfile_t;
+
+
+#endif
diff --git a/segemehl/libs/mathematics.c b/segemehl/libs/mathematics.c
new file mode 100644
index 0000000..deccfb6
--- /dev/null
+++ b/segemehl/libs/mathematics.c
@@ -0,0 +1,2488 @@
+
+/*
+ * mathematics.c
+ * implementation of various mathematical functions
+ *
+ * @author Steve Hoffmann
+ * @date Wed 22 Nov 2006
+ *
+ * SVN
+ * Revision of last commit: $Rev: 54 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-09-10 22:13:30 +0200 (Wed, 10 Sep 2008) $
+ *
+ * Id: $Id: mathematics.c 54 2008-09-10 20:13:30Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/trunk/libs/mathematics.c $
+ *
+ */
+#include "mathematics.h"
+#include "sort.h"
+#include <float.h>
+#include <string.h>
+#include <limits.h>
+#include "708.h"
+#include <math.h>
+#include <complex.h>
+
+/*
+ * Reverse the first len entries of n in place; the same pointer is
+ * returned for convenience.
+ */
+int* intrev(int* n, Uint len){
+  int lo, hi, tmp;
+
+  for (lo = 0, hi = len-1; lo < hi; lo++, hi--) {
+    tmp = n[lo];
+    n[lo] = n[hi];
+    n[hi] = tmp;
+  }
+
+  return n;
+}
+
+ /*
+  * Allocate a fresh buffer of size*datatype bytes through ALLOCMEMORY.
+  * The element type handed to the macro is char, so the count is in bytes.
+  */
+ void *initArray(void *space, int size, size_t datatype) {
+  void *buf = NULL;
+
+  buf = ALLOCMEMORY(space, buf, char, size*datatype);
+
+  return buf;
+ }
+
+
+/*
+ * Append elem to v: the element array is resized to length+1 entries,
+ * elem goes into the new last slot and the length is bumped.
+ */
+void appendvector(void *space, vector_t *v, vectorelem elem) {
+  vectorelem *grown;
+
+  grown = (vectorelem*) ALLOCMEMORY(space, v->elements, vectorelem, (v->length+1));
+  grown[v->length] = elem;
+
+  v->elements = grown;
+  v->length++;
+}
+
+
+
+
+/*--------------------------------- mindist ----------------------------------
+ *
+ * @brief expects a sorted vector to find the minimum distance between
+ * vec[i] and vec[j]
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Smallest absolute difference between any element of vec[i] and any
+ * element of vec[j], found by comparing all pairs. Returns 0 if either
+ * vector is empty. space is not used.
+ */
+Uint
+minvecdist(void *space, vector_t *vec, Uint i, Uint j) {
+  Uint outer, n_i, n_j;
+  int inner, gap, best = INT_MAX;
+  vectorelem *row_i, *row_j;
+
+  n_j = LENGTHVEC(&vec[j]);
+  n_i = LENGTHVEC(&vec[i]);
+
+  if (n_i == 0 || n_j == 0)
+    return 0;
+
+  row_i = &vec[i].elements[0];
+  row_j = &vec[j].elements[0];
+
+  for (outer = 0; outer < n_j; outer++) {
+    for (inner = 0; inner < n_i; inner++) {
+      gap = abs((int)row_j[outer] - (int)row_i[inner]);
+      if (gap < best) {
+        best = gap;
+      }
+    }
+  }
+
+  return best;
+}
+
+
+/*
+ * Smallest absolute difference between any element of vec1 and any
+ * element of vec2 (all pairs are compared). Each time a new minimum is
+ * found, the index of the vec1 element is stored in *which. Returns 0
+ * without touching *which if either vector is empty. space is not used.
+ */
+Uint
+minvecdist2(void *space, vector_t *vec1, vector_t *vec2, Uint *which) {
+  Uint outer, n_1, n_2;
+  int inner, gap, best = INT_MAX;
+  vectorelem *first, *second;
+
+  n_2 = LENGTHVEC(vec2);
+  n_1 = LENGTHVEC(vec1);
+
+  if (n_1 == 0 || n_2 == 0)
+    return 0;
+
+  first = &vec1->elements[0];
+  second = &vec2->elements[0];
+
+  for (outer = 0; outer < n_2; outer++) {
+    for (inner = 0; inner < n_1; inner++) {
+      gap = abs((int)second[outer] - (int)first[inner]);
+      if (gap < best) {
+        best = gap;
+        *which = inner;
+      }
+    }
+  }
+
+  return best;
+}
+
+
+
+/* Print the m x n int matrix M to stdout, one row per line. */
+void dumpMatrix_int(int *M, int m, int n) {
+  int row, col;
+
+  for (row = 0; row < m; row++) {
+    col = 0;
+    while (col < n) {
+      printf("%d ", MATRIX2D(M,n,row,col));
+      col++;
+    }
+    printf("\n");
+  }
+}
+
+/*
+ * Index of the largest value in arr[0..l-1]; ties resolve to the lowest
+ * index. Returns 0 for an empty array.
+ */
+Uint uarraymax(Uint *arr, Uint l) {
+  Uint best = 0;
+  Uint pos = 0;
+
+  while (pos < l) {
+    if (arr[pos] > arr[best]) best = pos;
+    pos++;
+  }
+
+  return best;
+}
+
+/*
+ * Index of the largest value in arr[0..l-1] when position max is left
+ * out; ties resolve to the lowest index.
+ *
+ * arr  values to scan
+ * l    number of entries in arr
+ * max  index to exclude (typically the result of uarraymax)
+ *
+ * Returns 0 when there is no other entry (l <= 1).
+ */
+Uint uarraysecond(Uint *arr, Uint l, Uint max) {
+  Uint i;
+  /*
+   * The running candidate must not be max itself: starting at 0 while
+   * max == 0 would compare everything against the maximum and never move.
+   */
+  Uint second = (max == 0 && l > 1) ? 1 : 0;
+
+  for(i=0; i < l; i++) {
+    if (i != max && arr[i] > arr[second])
+      second = i;
+  }
+
+  return second;
+}
+
+/*
+ * Index of the largest value in arr[0..l-1]; ties resolve to the lowest
+ * index. Returns 0 when l <= 0.
+ */
+int arraymax(int *arr, int l) {
+  int best = 0;
+  int pos = 0;
+
+  while (pos < l) {
+    if (arr[pos] > arr[best]) best = pos;
+    pos++;
+  }
+
+  return best;
+}
+
+/*
+ * Print the m x n matrix M of unsigned values to stdout, one row per
+ * line, entries separated by a blank.
+ */
+void dumpMatrix_Uint(Uint *M, Uint m, Uint n) {
+  Uint i,j;
+
+  for (i=0; i < m; i++) {
+    for (j=0; j < n; j++){
+      /* %u, not %d: the entries are unsigned */
+      printf("%u ", MATRIX2D(M,n,i,j));
+    }
+    printf("\n");
+  }
+}
+
+
+/* Print the m x n double matrix M to stdout, one row per line. */
+void dumpMatrix_dbl(double *M, Uint m, Uint n) {
+  Uint row, col;
+
+  for (row = 0; row < m; row++) {
+    col = 0;
+    while (col < n) {
+      printf("%f ", MATRIX2D(M,n,row,col));
+      col++;
+    }
+    printf("\n");
+  }
+}
+
+
+ /*
+  * Print the m x n x l int matrix M to stdout: one line per first index,
+  * the l values of each cell followed by ';'.
+  */
+ void dumpMatrix3D_int(int *M, int m, int n, int l) {
+  int a, b, c;
+
+  for (a = 0; a < m; a++) {
+    for (b = 0; b < n; b++) {
+      c = 0;
+      while (c < l) {
+        printf("%d ", MATRIX3D(M,n,l,a,b,c));
+        c++;
+      }
+      printf(";");
+    }
+    printf("\n");
+  }
+ }
+
+/* Print all elements of v on one stdout line, separated by blanks. */
+void dumpVector(vector_t *v) {
+  int pos = 0;
+
+  while (pos < v->length) {
+    printf("%d ", v->elements[pos]);
+    pos++;
+  }
+
+  printf("\n");
+}
+
+
+/*
+ * Release a vector: first its element array (if any), then the vector
+ * record itself. A NULL vector is ignored.
+ */
+void destructVector(void *space, vector_t *v) {
+
+  if (v == NULL) return;
+
+  if (v->elements) FREEMEMORY(space, v->elements);
+  FREEMEMORY(space, v);
+}
+
+/*
+ * Reverse the elements of v between positions a and b (both inclusive)
+ * in place.
+ *
+ * The two cursors walk towards each other and stop when they meet, so
+ * every pair is exchanged exactly once. Counting b-a iterations instead
+ * would exchange the inner pairs a second time and undo them. Nothing
+ * happens when a >= b.
+ *
+ * NOTE(review): assumes SWAPVEC(x,y,v) exchanges elements x and y of v --
+ * confirm against the macro definition.
+ */
+void reverseVector(Uint a, Uint b, vector_t *v) {
+
+  while (a < b) {
+    SWAPVEC(a,b,v);
+    a++;
+    b--;
+  }
+}
+
+/*
+ * Rearranges the elements of v in place into a following permutation.
+ * Returns 1 after rearranging, 0 when the backward scan reaches the
+ * front without finding a pivot (v is left untouched in that case).
+ *
+ * NOTE(review): both scans use non-strict comparisons (<= and >=); the
+ * textbook next-permutation uses strict ones -- verify the behaviour for
+ * vectors with repeated values.
+ * NOTE(review): i is unsigned, so an empty vector makes (v->length)-1
+ * wrap around -- callers presumably never pass one; confirm.
+ */
+int nextPermutation(vector_t *v) {
+ Uint i,j;
+ vectorelem *e=v->elements;
+
+ /* scan from the back for the first position i with e[i-1] <= e[i] */
+ for (i=(v->length)-1; i > 0; i--)
+ if(e[i-1]<=e[i]) break;
+
+ /* no such position: nothing to advance */
+ if (i==0) return 0;
+
+ /* scan forward for the first j behind i with e[i-1] >= e[j] */
+ for (j=i+1; j < (Uint) v->length; j++ )
+ if(e[i-1]>=e[j]) break;
+
+ /* exchange the pivot with the element just before j, then reverse the tail */
+ SWAPVEC(i-1, j-1, v);
+ REVERSEVEC(i, (v->length)-1, v);
+
+ return 1;
+}
+
+/*----------------------------------- norm -----------------------------------
+ *
+ * @brief normalize
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Divide every entry of a[0..n-1] by the sum of all entries, in place.
+ * No check is made for a zero sum.
+ */
+void
+normalize (double *a, Uint n)
+{
+  double total = 0;
+  Uint k;
+
+  for (k = 0; k < n; k++)
+    total += a[k];
+
+  for (k = 0; k < n; k++)
+    a[k] /= total;
+}
+
+
+
+/*----------------------------------- gcd ------------------------------------
+ *
+ * calculate the greatest common divisor of two integer values
+ *
+ */
+
+/*
+ * Greatest common divisor of a and b by Euclid's algorithm.
+ *
+ * Both arguments are reduced to their absolute values first, so the
+ * result is never negative; gcd(x, 0) is |x| and gcd(0, 0) is 0.
+ * INT_MIN cannot be negated and must not be passed.
+ */
+int
+gcd (int a, int b)
+{
+  int val;
+
+  a = abs(a);
+  b = abs(b);
+
+  /* no pre-ordering needed: if a < b the first round swaps them */
+  while (b != 0) {
+    val = a%b;
+    a = b;
+    b = val;
+  }
+
+  return a;
+}
+
+
+/*---------------------------------- power -----------------------------------
+ *
+ * the power may be with you!
+ *
+ */
+
+/*
+ * x raised to the integer power n by repeated squaring.
+ * n == 0 gives 1; x == 0 gives 0 for positive n and MAX_DOUBLE for
+ * negative n. A negative n is handled by inverting x first.
+ */
+double
+power (double x, int n)
+{
+  double acc = 1.;
+
+  if (n == 0)
+    return 1;
+
+  if (x == 0) {
+    if (n < 0)
+      return MAX_DOUBLE;
+    return 0;
+  }
+
+  if (n < 0) {
+    x = 1./x;
+    n = -n;
+  }
+
+  /* consume the exponent bit by bit, squaring the base every round */
+  for (; n > 0; n /= 2, x *= x) {
+    if (n & 1)
+      acc *= x;
+  }
+
+  return acc;
+}
+
+
+
+
+/*----------------------------------- fak ------------------------------------
+ *
+ * @brief get the factorial (works only for n <=10!!)
+ * @author Steve Hoffmann
+ *
+ */
+
+
+/*
+ * Factorial of n, computed in the width of Uint; the product silently
+ * wraps once it no longer fits.
+ *
+ * fak(0) is 1. The loop counts down from n and stops above 1, so it
+ * cannot wrap around for n == 0.
+ */
+Uint fak(Uint n) {
+  Uint i, x = 1;
+
+  for(i=n; i > 1; i--) {
+    x *= i;
+  }
+
+  return x;
+}
+
+
+/*--------------------------------- uniroot ----------------------------------
+ *
+ * getting the zero-root of a given function
+ *
+ * according to G. Forsythe, M. Malcolm et al.
+ * Computer methods for mathematical computations, 1980
+ *
+ */
+
+/*
+ * Searches the interval [start, end] for a zero of f(x, info) and returns
+ * the abscissa found. tolx enters the stopping tolerance computed in the
+ * loop; info is passed through to f unchanged.
+ *
+ * NOTE(review): if f has the same sign at both ends only "mooep!" is
+ * printed and the iteration runs anyway (the early return is commented
+ * out).
+ * NOTE(review): the structure resembles the zeroin routine of the cited
+ * book -- confirm before relying on its convergence properties.
+ */
+double
+uniroot (double start, double end, double (*f)(double, void*),
+ double tolx, void* info)
+{
+ double a, b, c;
+ double fa, fb, fc;
+ double prev;
+ double currenttol;
+ double p, q, new_step;
+ double cb, t1, t2;
+
+ a = start; b= end; fa = (*f)(a,info); fb=(*f)(b,info);
+ c = a; fc = fa;
+
+ /* same sign at both ends: complain, but keep going */
+ if ((fa > (double) 0 && fb > (double) 0)
+ || (fa < (double)0 && fb < (double)0)) {
+ printf("mooep!\n");
+ /*return 0;*/
+ }
+
+ while(1) {
+
+ prev = b-a;
+
+ /* make b the point with the smaller |f| of b and c */
+ if (fabs(fc) < fabs(fb)) {
+ a=b; b=c; c=a;
+ fa=fb; fb=fc; fc=fa;
+ }
+ /* tolerance uses single-precision epsilon plus half of tolx */
+ currenttol = 2 * FLT_EPSILON * fabs(b) + tolx/2;
+ /* default step: halfway from b towards c */
+ new_step = (c-b)/2;
+ if (fabs(new_step) <= currenttol || fb == (double)0) {
+ return b;
+ }
+
+ /* try an interpolated step if the last step was large enough and b improved on a */
+ if ( fabs(prev) >= currenttol && fabs(fa) > fabs(fb) ) {
+ cb = c-b;
+ if(a==c) {
+ /* only two distinct points: linear interpolation */
+ t1 = fb/fa;
+ p = cb*t1;
+ q = 1.0 - t1;
+ } else {
+ /* three distinct points: interpolation through a, b and c */
+ q = fa/fc;
+ t1 = fb/fc;
+ t2 = fb/fa;
+ p = t2 * ( cb * q * (q-t1) - (b-a)*(t1-1.0) );
+ q = (q-1.0) * (t1-1.0) * (t2-1.0);
+ }
+ /* keep p non-negative, moving the sign into q */
+ if ( p > (double)0) {
+ q = -q;
+ } else {
+ p = -p;
+ }
+
+ /* accept p/q only if it passes both bounds; otherwise the halving step stays */
+ if(p < (0.75 * cb * q - fabs(currenttol*q)/2)
+ && p < fabs(prev * q/2) ) {
+ new_step = p/q;
+ }
+ }
+
+ /* never move by less than the tolerance */
+ if (fabs(new_step) < currenttol ) {
+ if(new_step > (double)0) {
+ new_step = currenttol;
+ } else {
+ new_step = -currenttol;
+ }
+ }
+
+ a=b; fa=fb;
+ b+= new_step;
+ fb = (*f)(b,info);
+ /* if b and c ended up with the same sign, reset c to the previous b */
+ if( (fb>0 && fc>0) || (fb < 0 && fc < 0) ) {
+ c=a; fc=fa;
+ }
+ }
+
+ /* not reached: the loop above only leaves via return */
+ return 0;
+}
+
+
+/*---------------------------------- coldel ----------------------------------
+ *
+ * @brief delete column for matrix
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Build a copy of the m x n matrix a without column d. The result has
+ * m rows and n-1 columns; a itself is released before returning.
+ */
+double*
+coldel (void *space, double *a, Uint m, Uint n, Uint d) {
+
+  double *t;
+  Uint row, col, out;
+
+  t = (double*) INITMATRIX2D(space, m, (n-1), sizeof(double));
+
+  for (row = 0; row < m; row++) {
+    out = 0;
+    for (col = 0; col < n; col++) {
+      if (col == d) continue;
+      MATRIX2D(t, n-1, row, out) = a[row*n + col];
+      out++;
+    }
+  }
+
+  FREEMEMORY(space, a);
+  return t;
+}
+
+
+/*---------------------------------- rowdel ----------------------------------
+ *
+ * @brief delete row from matrix
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Build a copy of the m x n matrix a without row d. The result has m-1
+ * rows and n columns; a itself is released before returning.
+ *
+ * space  allocator handle passed through to the memory macros
+ * a      row-major input matrix, m*n entries; freed here
+ * m, n   number of rows / columns of a
+ * d      index of the row to drop
+ *
+ * The result is sized (m-1) x n to match what is written; sizing it
+ * (n-1) x m is too small whenever m > n.
+ */
+double*
+rowdel (void *space, double *a, Uint m, Uint n, Uint d) {
+
+  double *t;
+  Uint row, col, out = 0;
+
+  t = (double*) INITMATRIX2D(space, (m-1), n, sizeof(double));
+
+  for (row = 0; row < m; row++) {
+    if (row == d) continue;
+    for (col = 0; col < n; col++) {
+      MATRIX2D(t, n, out, col) = a[row*n + col];
+    }
+    out++;
+  }
+
+  FREEMEMORY(space, a);
+  return t;
+}
+
+
+/*---------------------------------- xprod -----------------------------------
+ *
+ * @brief calculate the cross product of two vectors
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Outer product of x (length m) and y (length n): returns a newly
+ * allocated m x n matrix p with p[i][j] = x[i]*y[j]. The caller owns
+ * the result.
+ *
+ * The second factor is indexed by the column j; indexing it by the row
+ * would repeat one value per row and read past y whenever m > n.
+ */
+double*
+xprod (void *space, double* x, Uint m, double *y, Uint n) {
+  double *p;
+  Uint i,
+       j;
+
+  p = (double*) INITMATRIX2D(space, m, n, sizeof(double));
+
+  for (i=0; i < m; i++) {
+    for(j=0; j < n; j++) {
+      MATRIX2D(p, n, i, j) = x[i]*y[j];
+    }
+  }
+  return p;
+}
+
+
+/*-------------------------------- transpose ---------------------------------
+ *
+ * @brief transpose a matrix $a$ of dimensions $m x n$
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Transpose of the m x n matrix a as a newly allocated n x m matrix.
+ * a itself is released before returning.
+ */
+double*
+transpose (void* space, double *a, Uint m, Uint n) {
+  double *t;
+  Uint row, col;
+
+  t = (double*) INITMATRIX2D(space, n, m, sizeof(double));
+
+  for (row = 0; row < m; row++) {
+    for (col = 0; col < n; col++) {
+      MATRIX2D(t, m, col, row) = a[row*n + col];
+    }
+  }
+
+  FREEMEMORY(space, a);
+  return t;
+}
+
+
+
+/*--------------------------------- simpson ----------------------------------
+ *
+ * @brief implementation of the simpson algorithm to determine the integral
+ * of $f(x)$ in the interval [$a$,$b$]. sdiv denotes number of subdivisions
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Composite Simpson rule for the integral of f(x, info) over [a, b]
+ * using sdiv double-intervals (2*sdiv slices of width step).
+ */
+ double
+simpson( double a, double b, int sdiv,
+    double (*f) (double, void*),
+    void* info)
+{
+
+  double step,
+         even=0,
+         odd=0;
+  int i = 1;
+
+  step = ((double) b-a)/((double)2*sdiv);
+
+  while (i < sdiv) {
+    even += f(a + step*2*i, info);
+    odd += f(a + step*(2*i-1), info);
+    i++;
+  }
+
+  /* the last odd node has no even partner inside the interval */
+  odd += f(a + step*(2*i-1), info);
+
+  return ((double)(step/3)*(f(a, info)+f(b, info)+2*even+4*odd));
+}
+
+
+/*-------------------------------- simpson1D ---------------------------------
+ *
+ * @brief helper function for simpson2D
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Inner integral for simpson2D: composite Simpson rule for f(x, y, info)
+ * over y in [c(x), d(x)] at fixed x, with sdiv double-intervals.
+ */
+ double
+simpson1D(double x, int sdiv,
+    double (*f) (double, double, void*),
+    double (*c) (double, void*),
+    double (*d) (double, void*),
+    void* info)
+{
+
+  double step,
+         even=0,
+         odd=0,
+         lower,
+         upper;
+  int i = 1;
+
+  lower = c(x, info);
+  upper = d(x, info);
+
+  step = ((double) upper-lower)/((double)2*sdiv);
+
+  while (i < sdiv) {
+    even += f(x, lower + step*2*i, info);
+    odd += f(x, lower + step*(2*i-1), info);
+    i++;
+  }
+
+  /* the last odd node has no even partner inside the interval */
+  odd += f(x, lower + step*(2*i-1), info);
+
+  return ((double)(step/3)*(f(x, lower, info)+f(x, upper, info)+2*even+4*odd));
+}
+
+
+/*-------------------------------- simpson2D ---------------------------------
+ *
+ * @brief calculates the 2-dim integral of function $f$ given the interval
+ * [$a$,$b$] in the first and [$c(x)$,$d(x)$] in the second dimension
+ * sdiv, sdiv2 denote the subdivisions in the first and second dimension
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Two-dimensional composite Simpson rule: integrates f(x, y, info) for
+ * x in [a, b] and y in [c(x), d(x)]. sdiv and sdiv2 are the numbers of
+ * double-intervals in x and y; the y-integral is delegated to simpson1D.
+ */
+double
+simpson2D(double a, double b, int sdiv, int sdiv2,
+    double (*f) (double, double, void*),
+    double (*c) (double, void*),
+    double (*d) (double, void*),
+    void *info) {
+
+  double step,
+         even=0,
+         odd=0;
+  int i = 1;
+
+  step = ((double)b-a)/((double)2*sdiv);
+
+  while (i < sdiv) {
+    even += simpson1D((a+step*2*(i)), sdiv2, f, c, d, info);
+    odd += simpson1D((a+step*(2*i-1)), sdiv2, f, c, d, info);
+    i++;
+  }
+
+  /* the last odd node has no even partner inside the interval */
+  odd += simpson1D((a+step*(2*i-1)), sdiv2, f, c, d, info);
+
+  return ((double)(step/3) * (simpson1D(a, sdiv2, f, c, d, info) +
+        simpson1D(b, sdiv2, f, c, d, info) + 2*even + 4*odd ));
+}
+
+
+
+/*--------------------------------- myMinor ----------------------------------
+ *
+ * @brief helper function for the laplacian algorithm used in det()
+ * @author Steve Hoffmann
+ *
+ */
+/*
+ * Copy of the m x n matrix M with row i and column j removed. M is left
+ * untouched; the caller owns the returned matrix. The intermediate
+ * copies are handled by rowdel and coldel, which are called with a NULL
+ * space.
+ */
+double*
+myMinor(void *space, double* M, Uint m, Uint n, Uint i, Uint j) {
+
+  double *copy;
+
+  copy = (double*) ALLOCMEMORY(space, NULL, double, m*n);
+  memmove(copy, M, sizeof(double)*(m*n));
+
+  copy = rowdel(NULL, copy, m, n, i);
+
+  return coldel(NULL, copy, m-1, n, j);
+}
+
+
+/*----------------------------------- det ------------------------------------
+ *
+ * @brief calculates the determinant of a square matrix m of size n x n using
+ * Laplacian algorithm (recursive implementation)
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Determinant of the n x n matrix M by recursive expansion along the
+ * first row. Each minor is built with myMinor and released after use.
+ */
+double
+det(void *space, double *M, int n) {
+  double total=0,
+         sign=1.0,
+         *sub=NULL;
+  int col;
+
+  if (n==1) {
+    return MATRIX2D(M, n, 0, 0);
+  }
+
+  /* the sign alternates +,-,+,... across the first row */
+  for (col = 0; col < n; col++, sign = -sign) {
+    sub = myMinor(space, M, n, n, 0, col);
+    total += sign*MATRIX2D(M,n,0,col)*det(space, sub, n-1);
+    FREEMEMORY(space, sub);
+  }
+
+  return total;
+}
+
+/*---------------------------------- invert ----------------------------------
+ *
+ * @brief invert a matrix
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Inverse of the 3 x 3 matrix M.
+ *
+ * Returns NULL, leaving M untouched, when det() reports 0. Otherwise M is
+ * overwritten with its cofactors divided by the determinant and then
+ * passed to transpose(); the value returned is the matrix handed back by
+ * transpose(), which releases the buffer it was given. After a successful
+ * call the caller must therefore use the returned pointer and no longer
+ * the one passed in.
+ */
+double*
+invert3D (void *space, double *M)
+{
+
+ double a, b, c, d, e, f, g, h, k;
+ double detM;
+
+ if((detM=det(space, M, 3)) !=0) {
+
+ /* snapshot of all nine entries before M is overwritten */
+ a = MATRIX2D(M, 3, 0, 0);
+ b = MATRIX2D(M, 3, 0, 1);
+ c = MATRIX2D(M, 3, 0, 2);
+ d = MATRIX2D(M, 3, 1, 0);
+ e = MATRIX2D(M, 3, 1, 1);
+ f = MATRIX2D(M, 3, 1, 2);
+ g = MATRIX2D(M, 3, 2, 0);
+ h = MATRIX2D(M, 3, 2, 1);
+ k = MATRIX2D(M, 3, 2, 2);
+
+
+ /* signed 2x2 minors (cofactors), each scaled by 1/detM */
+ MATRIX2D(M, 3, 0, 0) = (e*k-f*h)/detM;
+ MATRIX2D(M, 3, 0, 1) = (f*g-d*k)/detM;
+ MATRIX2D(M, 3, 0, 2) = (d*h-e*g)/detM;
+
+ MATRIX2D(M, 3, 1, 0) = (c*h-b*k)/detM;
+ MATRIX2D(M, 3, 1, 1) = (a*k-c*g)/detM;
+ MATRIX2D(M, 3, 1, 2) = (g*b-a*h)/detM;
+
+ MATRIX2D(M, 3, 2, 0) = (b*f-c*e)/detM;
+ MATRIX2D(M, 3, 2, 1) = (c*d-a*f)/detM;
+ MATRIX2D(M, 3, 2, 2) = (a*e-b*d)/detM;
+
+ /* transposing the scaled cofactor matrix yields the inverse */
+ M = transpose(space, M, 3, 3);
+ return M;
+ }
+
+ return NULL;
+}
+
+
+
+/*----------------------------------- add ------------------------------------
+ *
+ * @brief componentwise addition of a to a vector of length m
+ * @author Steve Hoffmann
+ *
+ */
+
+/* Add the scalar a to each of the m entries of x in place; returns x. */
+double*
+add(double *x, Uint m, double a) {
+  Uint k = 0;
+
+  while (k < m) {
+    x[k] += a;
+    k++;
+  }
+
+  return x;
+}
+
+
+/*----------------------------------- mean -----------------------------------
+ *
+ * @brief calculate the arithmetic mean for a vector of length m
+ * @author Steve Hoffmann
+ *
+ */
+
+/* Arithmetic mean of x[0..m-1]; no guard against m == 0. */
+double
+mean (double *x, Uint m) {
+  double total=0;
+  Uint k = 0;
+
+  while (k < m) {
+    total += x[k];
+    k++;
+  }
+
+  return total / m;
+}
+
+/*----------------------------------- mean -----------------------------------
+ *
+ * @brief calculate the arithmetic mean for an int vector of length m
+ * @author Steve Hoffmann
+ *
+ */
+
+/* Arithmetic mean of the int values x[0..m-1]; no guard against m == 0. */
+double
+mean_int (int *x, Uint m) {
+  double total=0;
+  Uint k = 0;
+
+  while (k < m) {
+    total += x[k];
+    k++;
+  }
+
+  return total / m;
+}
+
+
+
+/*---------------------------------- scalar ----------------------------------
+ *
+ * @brief calculate the scalar product of two vectors of length m
+ * @author Steve Hoffmann
+ *
+ */
+
+/* Dot product of x and y, both of length m. */
+double
+scalar (double* x, double *y, Uint m) {
+  double acc=0;
+  Uint k = 0;
+
+  while (k < m) {
+    acc += x[k]*y[k];
+    k++;
+  }
+
+  return acc;
+}
+
+
+/*----------------------------------- cov ------------------------------------
+ *
+ * @brief get the covariance matrix (2x2) for two vectors of length m
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * 2 x 2 covariance matrix of the samples x and y (length m each), with
+ * m-1 in the denominator. The matrix is newly allocated and owned by
+ * the caller.
+ * Side effect: x and y are centred in place (their means are subtracted).
+ */
+double*
+cov (void *space, double *x, double *y, Uint m) {
+  double *c,
+         mean_x,
+         mean_y,
+         sxx,
+         sxy,
+         syy;
+
+  c = (double*) INITMATRIX2D(space, 2, 2, sizeof(double));
+  mean_x = mean(x, m);
+  mean_y = mean(y, m);
+
+  /* centre both samples */
+  add(x, m, (-1)*mean_x);
+  add(y, m, (-1)*mean_y);
+
+  sxx = (double) scalar(x,x,m)/(m-1);
+  sxy = (double) scalar(x,y,m)/(m-1);
+  syy = (double) scalar(y,y,m)/(m-1);
+
+  MATRIX2D(c, 2, 0, 0) = sxx;
+  MATRIX2D(c, 2, 1, 0) = sxy;
+  MATRIX2D(c, 2, 0, 1) = sxy;
+  MATRIX2D(c, 2, 1, 1) = syy;
+
+  return c;
+}
+
+
+
+
+/*----------------------------------- var ------------------------------------
+ *
+ * @brief get the sample variance
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Sum of squared deviations of x from its mean, each weighted by the
+ * matching entry of p, divided by n.
+ */
+double
+samplevar (double *x, double *p, double n)
+{
+  int k = 0;
+  double centre, dev, acc=0;
+
+  centre = mean(x, n);
+
+  while (k < n) {
+    dev = x[k]-centre;
+    acc += (dev*dev)*p[k];
+    k++;
+  }
+
+  return acc/n;
+}
+
+
+/*----------------------------------- var ------------------------------------
+ *
+ * @brief get the variance
+ * @author Steve Hoffmann
+ *
+ */
+
+/* Variance of the int values x[0..n-1], with n in the denominator. */
+double
+var_int (int *x, Uint n)
+{
+  int k = 0;
+  double centre, dev, acc=0;
+
+  centre = mean_int(x, n);
+
+  while (k < n) {
+    dev = x[k]-centre;
+    acc += (dev*dev);
+    k++;
+  }
+
+  return acc/n;
+}
+
+/*--------------------------------- poisson ----------------------------------
+ *
+ * @brief the <.><< distribution
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Poisson probability for rate lambda at x, with tgamma(x+1) standing
+ * in for the factorial. x must not be negative (asserted).
+ */
+double
+poisson(double lambda, double x) {
+  double numer, denom;
+
+  assert(x >= 0);
+
+  numer = pow(lambda,x);
+  denom = tgamma(x+1);
+
+  return (numer/denom)*exp(-lambda);
+}
+
+
+/*-------------------------------- logpoisson --------------------------------
+ *
+ * @brief poisson in log space
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Natural log of the Poisson probability, with both the rate and the
+ * count given as logarithms:
+ *   x*loglambda - log(Gamma(x+1)) - lambda,  x = exp(logx),
+ *                                            lambda = exp(loglambda).
+ *
+ * The log-gamma term is taken from lgamma() directly. Going through
+ * log(tgamma(..)) overflows to infinity once x exceeds roughly 170 and
+ * turns the whole result into -inf.
+ */
+double
+logpoisson (double loglambda, double logx)
+{
+  double x = exp(logx);
+
+  return (x*loglambda) - lgamma(x+1) - exp(loglambda);
+}
+
+/*----------------------------------- var ------------------------------------
+ *
+ * @brief get the variance
+ * @author Steve Hoffmann
+ *
+ */
+
+/* Variance of x[0..n-1], with n in the denominator. */
+double
+var (double *x, Uint n)
+{
+  int k = 0;
+  double centre, dev, acc=0;
+
+  centre = mean(x, n);
+
+  while (k < n) {
+    dev = x[k]-centre;
+    acc += (dev*dev);
+    k++;
+  }
+
+  return acc/n;
+}
+
+
+/*---------------------------------- stddev ----------------------------------
+ *
+ * @brief get the standard deviation
+ * @author Steve Hoffmann
+ *
+ */
+
+/* Standard deviation of x: the square root of var(x, n). */
+double
+stddev (double *x, double n)
+{
+  double variance = var(x, n);
+
+  return sqrt(variance);
+}
+
+
+/*----------------------------------- rho ------------------------------------
+ *
+ * @brief calculate correlation $\rho$ for two vectors of length m
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Correlation coefficient of the samples x and y (length m each):
+ * the covariance divided by the root of the product of both variances.
+ *
+ * The covariance matrix obtained from cov() is released here before
+ * returning, so nothing is left for the caller to free.
+ * Side effect inherited from cov(): x and y are centred in place.
+ */
+double
+rho (void *space, double *x, double *y, Uint m) {
+  double *cv,
+         r;
+
+  cv = cov(space, x, y, m);
+  r = (MATRIX2D(cv, 2, 0, 1)/sqrt(MATRIX2D(cv, 2, 0, 0)*MATRIX2D(cv, 2, 1, 1)));
+
+  FREEMEMORY(space, cv);
+  return r;
+}
+
+/*-------------------------------- univarnorm --------------------------------
+ *
+ * @brief pdf gaussian
+ * @author Steve Hoffmann
+ *
+ */
+
+/* Density of the normal distribution with mean mu and deviation sd at x. */
+double
+univarnorm (double x, double mu, double sd)
+{
+  double dev = (x-mu);
+  double variance = sd * sd;
+  double kernel = exp(-0.5*(dev*dev/variance));
+
+  return kernel/sqrt(2*M_PI*variance);
+}
+
+
+/*------------------------------ univarnormcdf -------------------------------
+ *
+ * @brief cdf gaussian
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Cumulative distribution of the normal distribution with mean mu and
+ * deviation sd at x, expressed through erf.
+ */
+double
+univarnormcdf (double x, double mu, double sd)
+{
+  double z = (x-mu)/(sd*M_SQRT2);
+
+  return 0.5 * (1+erf(z));
+}
+
+
+/*------------------------------ randunivarnorm ------------------------------
+ *
+ * @brief algorithm adapted from Dr. Everett (Skip) Carter, Jr.
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Draw one normally distributed value with mean mu and deviation sd.
+ * Pairs of rand() values in [-1,1] are drawn until they fall strictly
+ * inside the unit circle (origin excluded); the second coordinate is
+ * then rescaled.
+ */
+double
+randunivarnorm (double mu, double sd)
+{
+  double u, v, s;
+
+  for (;;) {
+    u = 2.0 * (((double)rand())/((double)RAND_MAX)) - 1.0;
+    v = 2.0 * (((double)rand())/((double)RAND_MAX)) - 1.0;
+    s = u * u + v * v;
+    if (!(s >= 1.0 || s == .0)) break;
+  }
+
+  s = sqrt((-2.0*log(s))/s);
+
+  /* only the deviate built from the second coordinate is used */
+  return (v * s)*sd+mu;
+}
+
+/*-------------------------------- bivarcond ---------------------------------
+ *
+ * @brief conditional bivar. norm. distrib. f(y|x) given location parameter
+ * $mu1$, $mu2$ and covariance matrix $cv$ of size (2x2)
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Conditional density f(y|x) of a bivariate normal distribution with
+ * location (mu1, mu2) and 2 x 2 covariance matrix cv.
+ */
+double
+bivarcond(double x, double y, double mu1, double mu2, double *cv) {
+  double corr,
+         sd1,
+         var1,
+         sd2,
+         var2,
+         scale,
+         resid;
+
+  var1 = MATRIX2D(cv, 2, 0, 0);
+  var2 = MATRIX2D(cv, 2, 1, 1);
+  sd1 = sqrt(var1);
+  sd2 = sqrt(var2);
+  corr = MATRIX2D(cv, 2, 0, 1)/sqrt(var1*var2);
+
+  scale = 1/sqrt((2*M_PI*var2*(1-(corr*corr))));
+
+  /* squared distance of y from its conditional mean, over the conditional variance */
+  resid = (y-mu2-corr*(sd2/sd1)*(x-mu1));
+  resid *= resid;
+  resid /= var2*(1-(corr*corr));
+
+  return(scale*exp(-0.5*resid));
+}
+
+
+/*-------------------------------- bivarnorm ---------------------------------
+ *
+ * @brief bivariate normal distribution f(x,y) given location parameter
+ * $mu1$, $mu2$ and covariance matrix $cv$ of size (2x2)
+ * @author Steve Hoffmann
+ *
+ */
+
+
+/*
+ * Joint density f(x,y) of a bivariate normal distribution with location
+ * (mu1, mu2) and 2 x 2 covariance matrix cv.
+ */
+double
+bivarnorm(double x, double y, double mu1, double mu2, double* cv) {
+  double corr,
+         sd1,
+         var1,
+         sd2,
+         var2,
+         scale,
+         factor,
+         quad;
+
+  var1 = MATRIX2D(cv, 2, 0, 0);
+  var2 = MATRIX2D(cv, 2, 1, 1);
+  sd1 = sqrt(var1);
+  sd2 = sqrt(var2);
+  corr = MATRIX2D(cv, 2, 0, 1)/sqrt(var1*var2);
+
+  scale = 1/(2*M_PI*sd1*sd2*sqrt(1-(corr*corr)));
+
+  factor = (-1)/(2*(1-(corr*corr)));
+  quad = ((x-mu1)*(x-mu1))/var1
+    - (2*corr*(x-mu1)*(y-mu2))/(sd1*sd2)
+    + ((y-mu2)*(y-mu2))/var2;
+
+  return scale*exp(factor*quad);
+}
+
+/*------------------------------ multivarnorm -------------------------------
+ *
+ * @brief n-dimensional gaussian probability density function wo correlation term
+ * i.e. orthogonal variation!
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Density of an n-dimensional normal distribution without correlation
+ * terms: pt is the point, mu the mean and sd the per-dimension
+ * deviation. The deviation in the exponent is floored at perturb,
+ * which is 0 here.
+ */
+double
+multivarnorm (double *pt, double *mu, double *sd, Uint n)
+{
+  Uint k;
+  double varprod = 1;
+  double exponent = 0;
+  double perturb = 0;
+  double norm;
+
+  for (k = 0; k < n; k++) {
+    exponent += pow((pt[k]-mu[k]),2) * (1/pow(MAX(sd[k],perturb),2));
+    varprod *= pow(sd[k],2);
+  }
+
+  norm = sqrt(pow((2*M_PI),n)*varprod);
+
+  return exp(exponent*-0.5)/norm;
+}
+
+
+/*this is ncbi intellectual property*/
+
+/*
+ * exp(x) - 1, evaluated in three regimes chosen by |x|:
+ *   |x| > .33     direct evaluation through exp()
+ *   |x| < 1.e-16  x itself is returned
+ *   otherwise     the power series of exp(x)-1 up to the x^13 term,
+ *                 written in nested form (the constants are the
+ *                 factorials 2! .. 13!)
+ */
+double BLAST_Expm1(double x)
+{
+ double absx = ABS(x);
+
+ if (absx > .33)
+ return exp(x) - 1.;
+
+ if (absx < 1.e-16)
+ return x;
+
+ return x * (1. + x *
+ (1./2. + x *
+ (1./6. + x *
+ (1./24. + x *
+ (1./120. + x *
+ (1./720. + x *
+ (1./5040. + x *
+ (1./40320. + x *
+ (1./362880. + x *
+ (1./3628800. + x *
+ (1./39916800. + x *
+ (1./479001600. +
+ x/6227020800.))))))))))));
+}
+
+
+/*---------------------------------- log10 -----------------------------------
+ *
+ * @brief logarithm to the base 10
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Base-10 logarithm as the ratio of natural logarithms.
+ * NOTE(review): this shares its name with the log10 of the C math
+ * library.
+ */
+double
+log10(double x) {
+  double numer = log(x);
+  double denom = log(10);
+
+  return (numer/denom);
+}
+
+/*----------------------------------- log2 -----------------------------------
+ *
+ * @brief logarithm to the base 2
+ * @author Steve Hoffmann
+ *
+ */
+
+
+/*
+ * Base-2 logarithm as the ratio of natural logarithms.
+ * NOTE(review): this shares its name with the log2 of the C math
+ * library.
+ */
+double
+log2(double x) {
+  double numer = log(x);
+  double denom = log(2);
+
+  return (numer/denom);
+}
+
+
+/*--------------------------------- log10add ---------------------------------
+ *
+ * @brief addition in log10 space
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * log10(10^a + 10^b) computed from a and b without leaving log space.
+ * If the smaller argument equals log10(0), the larger one is returned
+ * unchanged.
+ */
+double
+log10add(double a, double b) {
+  double hi, lo;
+
+  if (a > b) {
+    hi = a;
+    lo = b;
+  } else {
+    hi = b;
+    lo = a;
+  }
+
+  if (lo == log10(0))
+    return hi;
+
+  return hi + log10(1+pow(10.,lo-hi));
+}
+
+/*---------------------------------- logadd ----------------------------------
+ *
+ * @brief addition in logarithmus naturalis space
+ * @author Steve Hoffmann
+ *
+ */
+
+
+/*
+ * log(exp(a) + exp(b)) computed from a and b without leaving log space.
+ * If the smaller argument equals log(0), the larger one is returned
+ * unchanged.
+ */
+double
+logadd(double a, double b) {
+  double hi, lo;
+
+  if (a > b) {
+    hi = a;
+    lo = b;
+  } else {
+    hi = b;
+    lo = a;
+  }
+
+  if (lo == log(0))
+    return hi;
+
+  return hi + log(1+exp(lo-hi));
+}
+
+/*-------------------------------- seqentropy --------------------------------
+ *
+ * @brief zero order sequence entropy
+ * @author Steve Hoffmann
+ *
+ */
+
+
+/*
+ * Zero-order Shannon entropy (in bits) of the first len characters of
+ * seq.
+ *
+ * space      allocator handle passed through to the memory macros
+ * seq        characters to analyse
+ * len        number of characters to read from seq
+ * asize      alphabet size, i.e. number of distinct codes
+ * encodetab  maps a character value to its code; every code that occurs
+ *            must be smaller than asize
+ *
+ * Characters are converted through unsigned char before the table
+ * lookup, so bytes above 127 index the table at 128..255 rather than at
+ * a wrapped-around position far outside it (plain char may be signed).
+ * NOTE(review): this presumes encodetab covers all 256 byte values --
+ * confirm against getNTcodekey or whichever table is passed.
+ */
+double
+shannonentropy(void *space, char *seq, Uint len, Uint asize, Uint *encodetab) {
+  Uint i;
+  double *p, q, H=0;
+
+  p = ALLOCMEMORY(space, NULL, double, asize);
+  memset(p, 0, sizeof(double)*asize);
+
+  /* absolute symbol counts */
+  for(i=0; i < len; i++) {
+    p[encodetab[(unsigned char)seq[i]]]++;
+  }
+
+  /* relative frequencies and entropy sum; symbols that never occur are skipped */
+  for(i=0; i < asize; i++) {
+    if(p[i] > 0) {
+      q = p[i]/len;
+      H += q * log2(q);
+    }
+  }
+
+  FREEMEMORY(space, p);
+  return -1*H;
+}
+
+/*-------------------------------- smoothavg ---------------------------------
+ *
+ * @brief smoothing average
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Centred moving average of y[0..n-1] with half-width r.
+ *
+ * Returns a newly allocated array ys of n values, owned by the caller:
+ *   ys[i] = (y[i-r] + ... + y[i+r]) / (2r+1)   for r <= i < n-r
+ *   ys[i] = 0                                  where the window does
+ *                                              not fit into y
+ *
+ * This replaces a loop that added into uninitialised memory, always
+ * wrote to the same slots ys[r..3r-1] and read beyond the end of y.
+ * NOTE(review): the value to use at the two borders (0 here) is a
+ * choice -- adjust if callers expect something else there.
+ * NOTE(review): the function has no space parameter, so NULL is handed
+ * to ALLOCMEMORY, as other calls in this file do for the space argument.
+ */
+double*
+smoothavg (double *y, Uint n, Uint r)
+{
+  double *ys, acc, width;
+  Uint i, j;
+
+  ys = ALLOCMEMORY(NULL, NULL, double, n);
+  if (n > 0) {
+    memset(ys, 0, sizeof(double)*n);
+  }
+
+  /* no position has a complete window */
+  if (r >= n) {
+    return ys;
+  }
+
+  width = 2.0*r + 1.0;
+
+  for(i=r; i < n-r; i++) {
+    acc = 0;
+    for(j=0; j <= 2*r; j++) {
+      acc += y[i-r+j];
+    }
+    ys[i] = acc / width;
+  }
+
+  return ys;
+}
+
+/*----------------------------------- gmm ------------------------------------
+ *
+ * @brief gaussian mixture model for m n-dimensional data points and g gaussians
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * Fit a mixture of g axis-aligned gaussians to m points of dimension n
+ * by alternating expectation and maximisation steps.
+ *
+ * space    allocator handle passed through to the memory macros
+ * pt       m x n matrix of data points (row-major)
+ * m, n     number of points / dimensions
+ * mu, sd   g x n matrices with the means and deviations of the
+ *          components; start values on entry, estimates on return
+ * w        g component weights; start values on entry, estimates on
+ *          return
+ * g        number of components
+ * maxiter  bound on the number of iterations
+ *
+ * Returns the value of ll from the last iteration.
+ *
+ * The deviation accumulator sd_ is cleared for every component, just
+ * like the mean accumulator, so no component starts from the previous
+ * one's normalised variance.
+ *
+ * NOTE(review): ll sums log() over every single weighted component
+ * density rather than over the per-point mixture sum -- verify that
+ * this is the intended convergence measure.
+ */
+double
+gmm(void *space, double *pt, Uint m, Uint n,
+    double *mu, double *sd, double *w, Uint g, Uint maxiter) {
+
+  double *ms, *mu_, *sd_, no, dd, ll, oll, epsilon=0.0000000000000001;
+  Uint i, j, k, l=0;
+
+  ms = ALLOCMEMORY(space, NULL, double, m*g);
+  mu_ = ALLOCMEMORY(space, NULL, double, n);
+  sd_ = ALLOCMEMORY(space, NULL, double, n);
+  memset(sd_, 0, sizeof(double)*n);
+  memset(mu_, 0, sizeof(double)*n);
+  memset(ms, 0, sizeof(double)*m*g);
+
+  ll = 1;
+
+  do {
+
+    oll = ll;
+
+    /*expectation step: weighted density of every point under every component*/
+    for(ll=0, i=0; i < m; i++) {
+      for(j=0; j < g; j++) {
+        MATRIX2D(ms, g, i, j) = w[j] *
+          multivarnorm(&MATRIX2D(pt, n, i, 0), &MATRIX2D(mu, n, j, 0),
+              &MATRIX2D(sd, n, j, 0), n);
+        ll += log(MATRIX2D(ms, g, i, j));
+      }
+      /*turn the row into memberships that sum to one*/
+      normalize(&MATRIX2D(ms, g, i, 0), g);
+    }
+
+    /*maximization step - weighted normalized mean*/
+    for(j=0; j < g; j++) {
+      /*both accumulators start from zero for each component*/
+      memset(mu_, 0, sizeof(double)*n);
+      memset(sd_, 0, sizeof(double)*n);
+
+      for(i=0, no=0; i < m; i++) {
+        no += MATRIX2D(ms, g, i, j);
+        for(k=0; k < n; k++) {
+          mu_[k] += MATRIX2D(pt, n, i, k) * MATRIX2D(ms, g, i, j);
+        }
+      }
+
+      /*update mean*/
+      for(k=0; k < n; k++) {
+        mu_[k] /= no;
+        MATRIX2D(mu, n, j, k) = mu_[k];
+      }
+
+      /*membership-weighted squared deviations from the new mean*/
+      for(i=0; i < m; i++) {
+        for(k=0; k < n; k++) {
+          dd = MATRIX2D(pt, n, i, k) - mu_[k];
+          sd_[k] += (dd * dd) * MATRIX2D(ms, g, i, j);
+        }
+      }
+
+      /*update sd*/
+      for(k=0; k < n; k++) {
+        sd_[k] /= no;
+        MATRIX2D(sd, n, j, k) = sqrt(sd_[k]);
+      }
+
+      /*update weight*/
+      w[j] = no/((double)m);
+
+    }
+
+  } while(l++ < maxiter && fabs((double)ll-oll) > -1.0*ll*epsilon);
+
+
+  FREEMEMORY(space, mu_);
+  FREEMEMORY(space, sd_);
+  FREEMEMORY(space, ms);
+
+  return ll;
+}
+
+
+
+/*---------------------------------- bl_RSS ----------------------------------
+ *    
+ * @brief  calculation of cumulative residual sums of squares (intercept
+ *         only) for the elements x[u] .. x[v-1]
+ * @param  space  handed to ALLOCMEMORY and FREEMEMORY
+ * @param  x      the data vector
+ * @param  n      length of x (not used in here)
+ * @param  u      first index of the segment
+ * @param  v      index behind the last element of the segment, v > u
+ * @return newly allocated array of v-u+1 cumulative sums, element 0 is 0;
+ *         the array is not freed here
+ * @author Steve Hoffmann 
+ *   
+ */
+
+double*
+bl_RSS (void *space, double *x, Uint n, Uint u, Uint v)
+{
+  Uint i;
+  double *cum, *y;
+
+  assert(v>u);
+
+  cum = ALLOCMEMORY(space, NULL, double, v-u+1);
+  y = ALLOCMEMORY(space, NULL, double, v-u+1);
+
+  cum[0] = x[u];
+  y[0] = x[u] * sqrt(2);
+  /* the recursion below stops at v-1, so slot v-u would be read 
+   * uninitialized by the summation; it contributes nothing */
+  y[v-u] = .0;
+
+  /* running sum of x in cum, scaled deviation from the running mean in y */
+  for(i=u+1; i < v; i++) {
+    cum[i-u] = x[i] + cum[i-u-1];
+    y[i-u] = x[i] - cum[i-u]/((double)i-u+1);
+    y[i-u] *= sqrt(1.0+(1.0/((double)i-u)));
+  }
+
+  /* cum is reused for the cumulative squares, the diagonal is zero */
+  cum[0] = 0;
+
+  for(i=1; i < v-u+1; i++) {
+    cum[i] = cum[i-1] + y[i]*y[i];
+  }
+
+  FREEMEMORY(space, y);
+  return cum;
+}
+
+
+/*------------------------------- bl_RSSmatrix -------------------------------
+ *    
+ * @brief  compute the triangular RSS matrix of x and use it in a dynamic
+ *         program to place 1 .. noofbreaks breaks 
+ * @param  space       handed to ALLOCMEMORY and FREEMEMORY
+ * @param  x           the n data values
+ * @param  n           number of data values
+ * @param  h           distance used in all loop bounds between two breaks
+ *                     and between a break and the ends of x
+ * @param  noofbreaks  maximum number of breaks
+ * @return newly allocated array of noofbreaks+1 records; record m holds
+ *         the m break positions together with RSS, LL and BIC, record 0
+ *         describes the model without a break. Neither the array nor the
+ *         breaks arrays of its records are freed in here.
+ * @author Steve Hoffmann 
+ *   
+ */
+
+breakpoints_t*
+bl_RSSmatrix (void *space, double *x, Uint n, Uint h, Uint noofbreaks)
+{
+  Uint i, j, minarg, *POS, m, resc; //m breaks
+  breakpoints_t *breaks;
+  double *M, *RSS, *temp, minval;
+  double *y;
+
+  /*calculation of rss matrix: row i holds the RSS of segments starting at i*/
+  M = ALLOCMEMORY(space, NULL, double, (n-h+1)*n);
+  memset(M, 0, sizeof(double)*((n-h+1)*n));
+
+  for(i=0; i < n-h+1; i++) {
+    y = bl_RSS(space, x, n, i, n);
+    memmove(&MATRIX2D(M, n, i, i), y, sizeof(double)*(n-i));
+    FREEMEMORY(space, y);
+  }
+
+  /*DP in the rss matrix*/
+  RSS = ALLOCMEMORY(space, NULL, double, (noofbreaks+1)*(n+1));
+  POS = ALLOCMEMORY(space, NULL, Uint, (noofbreaks+1)*(n+1));
+  memset(RSS, 0, (noofbreaks+1)*(n+1)*sizeof(double));
+  memset(POS, 0, (noofbreaks+1)*(n+1)*sizeof(Uint));
+
+  /*initialization with the RSS of the segments starting at 0*/
+  for(i=0; i < n; i++) {
+    MATRIX2D(RSS, noofbreaks+1, i, 1) = MATRIX2D(M, n, 0, i);
+    MATRIX2D(POS, noofbreaks+1, i, 1) = i;
+  }
+
+  for(m=2; m <= noofbreaks; m++) {
+    for(i=m*h-1; i < n-h; i++) {
+      temp = ALLOCMEMORY(space, NULL, double, (i-h)-((m-1)*h-1)+1);
+      /*Recursion*/
+      for(j=(m-1)*h-1; j < i-h+1; j++) {
+        temp[j-(((m-1)*h)-1)] = MATRIX2D(RSS, noofbreaks+1, j, m-1) 
+          + MATRIX2D(M, n, j+1, i);
+      }
+      /*find the minimum*/
+      minarg = 0;
+      minval = temp[0];
+      for(j=1; j < (i-h)-((m-1)*h-1)+1; j++) {
+        if(temp[j] < minval) {
+          minarg = j;
+          minval = temp[j];
+        }
+      }
+      /*register*/
+      MATRIX2D(RSS, noofbreaks+1, i, m) = temp[minarg];
+      MATRIX2D(POS, noofbreaks+1, i, m) = (m-1)*h-1+minarg;
+
+      FREEMEMORY(space, temp);
+    }
+  }
+
+
+  breaks = ALLOCMEMORY(space, NULL, breakpoints_t, noofbreaks+1);
+  breaks[0].noofbreaks = 0;
+  breaks[0].RSS = MATRIX2D(RSS, noofbreaks+1, n-1, 1);
+  breaks[0].LL = -0.5 * n * (log(breaks[0].RSS) + 1 - log(n) + log(2*M_PI));
+  breaks[0].BIC = -2*breaks[0].LL + 2*log(n);
+  breaks[0].breaks = NULL;
+
+
+  for(m=noofbreaks; m >= 1; m--) {
+    /*calling breaks and backtrace*/
+    temp = ALLOCMEMORY(space, NULL, double, n);
+
+    /*total RSS for a last break at i, -1.0 marks cells without a value*/
+    for(i=h-1; i < n-h; i++) {
+      if (MATRIX2D(RSS, noofbreaks+1, i, m) == 0.0 || 
+          MATRIX2D(M, n, i+1, n-1) == 0.0) { 
+        temp[i]= -1.0;
+      } else { 
+        temp[i] = MATRIX2D(RSS, noofbreaks+1, i, m) + MATRIX2D(M, n, i+1, n-1);
+      }
+    }
+
+    minarg = 0;
+    minval = temp[h-1];
+    for(i=h-1; i < n-h; i++) {
+      if((temp[i] > 0.0 && temp[i] < minval) || minval == -1.0) {
+        minarg = i;
+        minval = temp[i];
+      }
+    }
+
+    breaks[m].noofbreaks = m;
+    breaks[m].breaks = ALLOCMEMORY(space, NULL, Uint, m);
+    breaks[m].RSS = temp[minarg];
+    breaks[m].LL = -0.5 * n * (log(temp[minarg]) + 1 - log(n) + log(2*M_PI));
+    breaks[m].BIC = -2*breaks[m].LL + (2*(m+1))*log(n);
+    breaks[m].breaks[0] = minarg;
+
+    /*follow the stored positions from the last break to the first*/
+    for(i=m, j=1; i >= 2; i--, j++) {
+      breaks[m].breaks[j] = 
+        MATRIX2D(POS, noofbreaks+1, breaks[m].breaks[j-1], i);
+    }
+
+    //reverse
+    for(i=0; i < m/2; i++) {
+      resc = breaks[m].breaks[m-i-1];
+      breaks[m].breaks[m-i-1] = breaks[m].breaks[i];
+      breaks[m].breaks[i] = resc;
+    }
+    FREEMEMORY(space, temp);
+  }
+
+  FREEMEMORY(space, RSS);
+  /*the backtrace table was formerly left allocated*/
+  FREEMEMORY(space, POS);
+  FREEMEMORY(space, M);
+
+  return breaks;
+}
+
+
+/*---------------------------------- gevcdf ----------------------------------
+ *    
+ * @brief  cumulative generalized extreme value distribution
+ * @param  x   the value the distribution is evaluated at
+ * @param  mu  location
+ * @param  si  scale
+ * @param  xi  shape; for xi == 0 the exponential form is used
+ * @return exp(-t) with t = (1 + xi*(x-mu)/si)^(-1/xi) for xi != 0 and 
+ *         t = exp(-(x-mu)/si) otherwise
+ * @author Steve Hoffmann 
+ *   
+ */
+
+double
+gevcdf (double x, double mu, double si, double xi)
+{
+  /* standardized distance from the location */
+  double z = (x-mu)/si;
+  double t = (xi != 0.0) ? pow(1.0 + xi*z, (-1.0/xi)) : exp(-z);
+
+  return exp(-t);
+}
+
+/*---------------------------------- gevll -----------------------------------
+ *    
+ * @brief  log likelihood of a gev in the Prescott Walden formulation
+ *         l(mu, sigma, kappa) = 
+ *         -\Sum log(sigma) - (1-k) \Sum x_i - \Sum exp(-x_i)
+ *         x_i = -(1/k) log[1 - k((y_i - mu)/sigma)]
+ *         NOTE: k = -\xi !!!
+ * @param  data   the n observations y_i
+ * @param  n      number of observations
+ * @param  mu     location
+ * @param  sigma  scale
+ * @param  kappa  shape
+ * @return the log likelihood; an observation with a negative argument of
+ *         the logarithm enters the sums with the fixed value x_i = 1/1e-5
+ * @author Steve Hoffmann 
+ *   
+ */
+
+double
+gevll (double *data, Uint n, double mu, double sigma, double kappa)
+{
+  Uint idx;
+  double xsum = .0, expsum = .0, x, z;
+
+  for(idx=0; idx < n; idx++) {
+    z = 1.0-(kappa*((data[idx]-mu)/sigma));
+    /* fixed large x_i for observations outside of the support */
+    x = (z >= .0) ? -1.0/kappa * log(z) : 1/1e-5;
+    xsum += x;
+    expsum += exp(-1.0*x);
+  }
+
+  return -1.0*log(sigma) * (double)n - ((1.0-kappa)*xsum) - expsum;
+}
+
+
+/*------------------------------ gevinvhessian -------------------------------
+ *
+ * @brief first derivatives of the gev log likelihood with respect to
+ * location m, scale s and shape k, and the result of invert3D for the
+ * matrix assembled from the second derivative sums
+ * @param space handed to ALLOCMEMORY and invert3D
+ * @param y the n observations
+ * @param n number of observations
+ * @param m location the derivatives are evaluated at
+ * @param s scale the derivatives are evaluated at
+ * @param k shape the derivatives are evaluated at
+ * @param dm receives the first derivative with respect to m
+ * @param ds receives the first derivative with respect to s
+ * @param dk receives the first derivative with respect to k
+ * @param excl receives the number of observations with k*(m-y)+s <= 0;
+ * these observations are left out of all sums
+ * @return the pointer returned by invert3D; it is tested against NULL
+ * below and in gevmle before it is used
+ * @author Steve Hoffmann
+ *
+ */
+
+double*
+gevinvhessian(void *space, double *y, Uint n, double m, double s, double k,
+ double* dm, double* ds, double* dk, Uint *excl)
+
+{
+ //the likelihood is given by:
+ //l(m,s,k) = \sum \log(s) - (1-k) \Sum x_i - \Sum exp (-x_i)
+ //where x_i is given by
+ //x_i = (-1/k) \log(1-k\frac{y-m}{s})
+ //we denote q = -(1-k) \Sum x_i
+ //we denote z = 1-k\frac{y-m}{s}
+ //we denote r = exp(-1*x_i)
+
+ Uint i, exclude=0;
+ double x=0, ex=0, kmys, kmys2, logs, k2, k3, s2, my, my2, logkmys;
+ double cdqm =.0, cdqs = .0, cdqk =.0;
+ double cdqmm =.0, cdqss =.0, cdqkk =.0;
+ double cdqms =.0, cdqmk =.0, cdqsk =.0;
+ double cdrm =.0, cdrs = .0, cdrk =.0;
+ double cdrmm =.0, cdrss =.0, cdrkk =.0;
+ double cdrms =.0, cdrmk =.0, cdrsk =.0;
+ double dxm =.0, dxs =.0, dxk =.0;
+ double dxmm, dxss, dxkk, dxms, dxmk, dxsk;
+ double *H;
+ double fulldm = .0;
+
+ logs = log(s);
+ k2 = k*k;
+ k3 = k*k*k;
+ s2 = s*s;
+
+ for(i = 0; i < n; i++) {
+
+ my = m - y[i];
+ my2 = my * my;
+ kmys = k*my+s; //kmys = z * s;
+ kmys2 = kmys*kmys;
+ logkmys = log(kmys); /* taken before the sign test, only used if kmys > 0 */
+
+ if(kmys > (double).0) {
+
+ x = (-1.0/k)*log(1.0-(k*(1.0/s)*((y[i]-m))));
+ ex = exp(-1.0*x);
+
+ dxm = -1.0/(kmys); /* first derivatives of x_i wrt m, s, k */
+ dxs = my/(s*kmys);
+ dxk = -(1.0/k2) * ((k*my)/kmys - logkmys + logs);
+
+ dxmm = k/kmys2; /* second derivatives of x_i */
+ dxss = -my*(k*my+2.0*s)/(s2*kmys2);
+ dxkk = (1.0/k3) * (((k2*my2)/(kmys2)) + 2.0*((k*my)/kmys -logkmys + logs));
+
+ //dxms = 1.0/((k*(-m+y[i])-s)*(k*(-m+y[i])-s));
+ dxms = 1.0/(kmys2); /* mixed second derivatives of x_i */
+ dxmk = my/kmys2;
+ dxsk = -1.0*my2/(s*kmys2);
+
+ //sums of derivatives for q direct or product rule
+ cdqm += dxm;
+ cdqs += dxs;
+ cdqk += -1.0*x + (1.0-k)*dxk; //product rule
+ cdqmm += dxmm;
+ cdqss += dxss;
+ cdqkk += (-1.0* dxk) - dxk + (1.0-k)*dxkk; //2x product rule
+ cdqms += dxms;
+ cdqmk += (m+s-y[i])/kmys2; //direct derivative
+ cdqsk += -(my*(m+s-y[i]))/(s*kmys2); //direct derivative
+
+ //sums of derivatives for r using chain, product rule
+ cdrm += ex * -1.0*dxm;
+ cdrs += ex * -1.0*dxs;
+ cdrk += ex * -1.0*dxk;
+ cdrmm += ex * ((dxm*dxm) + -1.0*dxmm);
+ cdrss += ex * ((dxs*dxs) + -1.0*dxss);
+ cdrkk += ex * ((dxk*dxk) + -1.0*dxkk);
+ cdrms += ex * ((dxm*dxs) + -1.0*dxms);
+ cdrmk += ex * ((dxm*dxk) + -1.0*dxmk);
+ cdrsk += ex * ((dxs*dxk) + -1.0*dxsk);
+
+ fulldm += -(exp((1/k)*log((k*m-k*y[i]+s)/s))+k-1) * (1/kmys); /* only read by the disabled diagnostics below */
+
+ } else {
+ exclude++; /* observation is not covered at (m, s, k) */
+ }
+
+ }
+
+ cdqm *= -(k-1.0); cdqs *= -(k-1.0); /* factor (1-k) for the sums that were added without it */
+ cdqmm *= -(k-1.0); cdqss *= -(k-1.0); cdqms *= -(k-1.0);
+
+ *dm = (- cdqm - cdrm);
+ *ds = (-1.0*n*(1/s) - cdqs - cdrs);
+ *dk = (-cdqk - cdrk);
+
+ H = ALLOCMEMORY(space, NULL, double, 9); /* 3x3, order m, s, k */
+
+ H[0] = (-cdqmm - cdrmm) * -1.0;
+ H[4] = (n*(1.0/s2) - cdqss - cdrss) * -1.0;
+ H[8] = (-cdqkk - cdrkk) * -1.0;
+ H[1] = H[3] = (-cdqms - cdrms) * -1.0;
+ H[2] = H[6] = (-cdqmk - cdrmk) * -1.0;
+ H[5] = H[7] = (-cdqsk - cdrsk) * -1.0;
+
+// fprintf(stderr, "excluded: %d\n", exclude);
+// fprintf(stderr, "du:%f, da:%f, dg:%f, fulldu:%f\n", *dm, *ds, *dk, fulldm);
+// fprintf(stderr, "duu:%f, dua:%f, daa:%f, dug:%f, dag:%f, dgg:%f\n", H[0], H[1], H[4], H[2], H[5], H[8]);
+
+ H = invert3D(space, H); /* H now holds whatever invert3D returned */
+
+ if(H) {
+// fprintf(stderr, "du:%e, da:%e, dg:%e\n", *dm, *ds, *dk);
+// fprintf(stderr, "duu:%e, dua:%e, daa:%e, dug:%e, dag:%e, dgg:%e\n", H[0], H[3], H[4], H[6], H[7], H[8]);
+ }
+
+ *excl = exclude;
+ return H;
+}
+
+/*--------------------------------- gevmle -----------------------------------
+ *    
+ * @brief  fit a generalized extreme value distribution by iteratively
+ *         maximizing the log likelihood gevll, starting at (*m, *s, *k)
+ * @param  space    handed to gevinvhessian and FREEMEMORY
+ * @param  y        the n observations
+ * @param  n        number of observations
+ * @param  m        location, start value on entry, estimate on return
+ * @param  s        scale, start value on entry, estimate on return
+ * @param  k        shape, start value on entry, estimate on return
+ * @param  maxiter  iteration limit
+ * @param  ymin     lower value used in the feasibility checks
+ * @param  ymax     upper value used in the feasibility checks
+ * @return always .0, the estimates are handed back via m, s and k
+ * @author Steve Hoffmann 
+ *   
+ */
+
+double
+gevmle(void *space, double *y, Uint n, 
+    double *m, double *s, double *k, Uint maxiter, double ymin, double ymax) {
+
+  Uint l=0, maxrcnt = 20, rcnt=0, exclude;
+  double *H, oll=-1.0*DBL_MAX, ll;
+  double vm=.0, vs=.0, vk=.0;
+  double dm, ds, dk;
+  double lm, ls, lk;
+  double sm=0.5, ss=0.25, sk=0.02, acc=1e-8, rho=0.25, smax;
+  double z;
+
+  lm = *m;
+  ls = *s;
+  lk = *k;
+
+  //adjust values to avoid failure of invHessian (ie. kmys <= 0)
+  if(fabs(lk) < 1e-4) lk = 1e-4;
+  if(ls < 0) ls = 1;
+
+  if(lk <= 0) {
+    if(ymin < lm) {
+      z = ls/(ymin-lm);
+      if(lk <= z) {
+        lk = z + 1e-4;
+        if(lk >= 0) lk = 0.5 * z;
+      }
+    }
+  } else {
+    if(ymax > lm) {
+      z = ls/(ymax-lm);
+      if(lk >= z) {
+        lk = z - 1e-4;
+        if(lk <= 0) lk = 0.5 *z;
+      }
+    }
+  }
+
+
+  do {
+
+    H = gevinvhessian(space, y, n, lm, ls, lk, &dm, &ds, &dk, &exclude);
+    ll = gevll(y, n, lm, ls, lk);
+    /* damp the score if observations had to be left out */
+    if(exclude) ll *= 0.9;
+
+    if(H && (H[0] >= 0 && H[4] >= 0 && H[8] >= 0) && ll > oll) {
+      /* likelihood has increased: step = inverse matrix * gradient */
+      oll = ll;
+
+      vm = H[0]*dm + H[3]*ds + H[6]*dk;
+      vs = H[4]*ds + H[3]*dm + H[7]*dk;
+      vk = H[8]*dk + H[6]*dm + H[7]*ds;
+
+      FREEMEMORY(space, H);
+
+      /* NOTE(review): the second candidate uses the gradient ds while the
+       * other two use the steps vm and vk - possibly vs was meant */
+      smax = (fabs(vm)/(sm*ls) > fabs(ds)/(ss*ls)) ? fabs(vm)/(sm*ls) : fabs(ds)/(ss*ls);
+      smax = (smax > fabs(vk)/sk) ? smax : fabs(vk)/sk;
+
+      if(smax < 1) {
+        smax = 1/smax;
+        vm *= smax;
+        vs *= smax;
+        vk *= smax;
+      }
+
+    } else {
+
+      /* the inverse is not used on this path; it was formerly left
+       * allocated in every iteration that ended up here */
+      if(H) {
+        FREEMEMORY(space, H);
+      }
+
+      if(ll <= oll) {
+        /* last move was wrong: take it back, it is rescaled below */
+        lm -= vm;
+        ls -= vs;
+        lk -= vk;
+      } else { 
+        /* steepest ascent */
+        double tmp1 = 1e37;
+        double tmp2 = 1e37;
+        double tmp3 = 1e37;
+
+        if(dm != 0.0) tmp1 = sm/(lm*fabs(dm));
+        if(ds != 0.0) tmp2 = ss/(ls*fabs(ds));
+        if(dk != 0.0) tmp3 = sk/(fabs(dk));
+        smax = MAX3(tmp1, tmp2, tmp3);
+
+        vm = dm * lm * lm * smax;
+        vs = ds * ls * ls * smax;
+        vk = dk * smax;
+      }
+    }
+
+    /* shrink the step by rho at least once, and further as long as the
+     * new parameters put ymin or ymax out of range (maxrcnt retries) */
+    rcnt = 0;
+    do {
+      vm *= rho;
+      vs *= rho;
+      vk *= rho;
+    } while ((1-(lk+vk)*(ymin-(lm+vm))/(ls+vs) < 0 || 
+          (1-(lk+vk)*(ymax-(lm+vm))/(ls+vs) < 0)) && rcnt++ <maxrcnt);
+
+    lm += vm;
+    ls += vs;
+    lk += vk;
+
+  } while(l++ < maxiter && fabs(vm)>acc*ls && fabs(vs)>acc*ls && fabs(vk)>acc);
+
+  *m = lm;
+  *s = ls;
+  *k = lk;
+
+  return .0;
+}
+
+
+
+/*---------------------------------- gevvar ----------------------------------
+ *    
+ * @brief  get the variance for gev
+ *         k == 0  : s^2 * pi^2 / 6
+ *         k < 1/2 : s^2 * (g2 - g1^2) / k^2 with gj = Gamma(1 - j*k)
+ *         else    : infinite
+ *         The cases formerly tested mu instead of k, divided pi^2 by k and
+ *         applied the gamma formula up to k < 1.
+ *         NOTE(review): the gamma arguments 1-k and 1-2k are kept from the
+ *         former code, i.e. k is read as the shape xi; gevll in this file
+ *         uses k = -xi, so confirm what the callers pass.
+ * @param  mu  location; kept for the callers, the variance does not 
+ *             depend on it
+ * @param  s   scale
+ * @param  k   shape
+ * @return the variance, HUGE_VAL if it is infinite
+ * @author Steve Hoffmann 
+ *   
+ */
+
+double
+gevvar (double mu, double s, double k)
+{
+  double g1, g2;
+
+  (void)mu;
+
+  if(k == .0) {
+    return s*s*((M_PI*M_PI)/6.0);
+  }
+
+  if(k < 0.5) {
+    /* both gamma arguments are positive in this range */
+    g1 = exp(lgamma(1.0-k));
+    g2 = exp(lgamma(1.0-2.0*k));
+    return (s*s)*(g2 - g1*g1)/(k*k);
+  }
+
+  return HUGE_VAL;
+}
+
+
+/*--------------------------------- gammaln ----------------------------------
+ *    
+ * @brief  logarithm of the gamma function by a six term series 
+ *         approximation (numerical recipes)
+ * @param  x  the argument
+ * @return approximation of log(gamma(x))
+ * @author Steve Hoffmann 
+ *   
+ */
+
+double
+gammaln (double x)
+{
+  static const double coef[6] = { 76.18009172947146, -86.50532032941677, 
+    24.01409824083091,   -1.231739572450155, 
+    0.1208650973866179e-2, -0.5395239384953e-5};
+  double series = 1.000000000190015;
+  double denom = x;
+  double tmp;
+  int idx;
+
+  /* the series terms are coef[j]/(x+j+1) */
+  for(idx=0; idx < 6; idx++) {
+    denom = denom + 1;
+    series += coef[idx]/denom;
+  }
+
+  tmp = (x+5.5)-(x+0.5)*log(x+5.5);
+
+  return -tmp+log(2.5066282746310005*series/x);
+}
+
+
+
+/*-------------------------------- gevLmoment --------------------------------
+ *    
+ * @brief  L moment estimators (Hosking 1985) for a generalized extreme value
+ *         distribution using probability weighted moments PWM 
+ *         (Wallis 1980, 1985; cf. Martins Stedinger 2000); 
+ *         data is ordered x1 < x2
+ * @param  data  the n ordered observations
+ * @param  n     number of observations
+ * @param  m     receives the location estimate
+ * @param  s     receives the scale estimate
+ * @param  k     receives the shape estimate
+ * @author Steve Hoffmann 
+ *   
+ */
+
+void
+gevLmoment (double *data, Uint n, double *m, double *s, double *k)
+{
+  Uint rank;
+  double b0 = .0, b1 = .0, b2 = .0;
+  double l1, l2, l3, c, kappa, alpha, xi, q, g;
+
+  /* probability weighted moments, rank runs from 1 to n */
+  for(rank = 1; rank <= n; rank++) {
+    b0 += data[rank-1];
+    q = (((double)(rank-1))/(n-1));
+    b1 += q * data[rank-1];
+    b2 += (q*(((double)(rank-2))/(n-2))) * data[rank-1];
+  }
+
+  b0 /= n;
+  b1 /= n;
+  b2 /= n;
+
+  /* the first three L moments */
+  l1 = b0;
+  l2 = 2.0*b1 - b0;
+  l3 = 6.0*b2 - 6.0*b1 + b0;
+
+  //calculation of theta3 = lambda3 / lambda2
+  c = 2.0/(3.0+(l3/l2)) - log(2)/log(3);
+  kappa = 7.8590*c + 2.9554*(c*c);
+  g = exp(gammaln(1.0+kappa));
+  alpha = (l2*kappa)/((1-pow(2.0,-1.0*kappa)) * g);
+  xi = l1 - alpha*(1.0-g)/kappa;
+
+  *m = xi;
+  *s = alpha;
+  *k = kappa;
+}
+
+
+/*-------------------------- dumprecursionmatrix2D ---------------------------
+ *    
+ * @brief  dump a 2-D matrix together with two flag matrices: a value is
+ *         enclosed in stars if its B flag is set and in dashes if its
+ *         K flag is set; the marked cell is preceded by a caret unless
+ *         it is in the last column
+ * @param  dev   stream to write to
+ * @param  M     the values, m rows with n columns
+ * @param  B     first flag matrix of the same dimensions
+ * @param  K     second flag matrix of the same dimensions
+ * @param  m     number of rows
+ * @param  n     number of columns; nothing is written for n == 0
+ * @param  mark  row (a) and column (b) of the cell to mark
+ * @author Steve Hoffmann 
+ *   
+ */
+
+/* write a single value with its flag decoration followed by term */
+static void
+dumprecursioncell (FILE *dev, int val, char b, char k, const char *term)
+{
+  if(b && k) 
+    fprintf(dev, "-*%d*-%s", val, term);
+  else if(b) 
+    fprintf(dev, " *%d* %s", val, term);
+  else if(k)
+    fprintf(dev, "- %d -%s", val, term);
+  else
+    fprintf(dev, " %d %s", val, term);
+}
+
+void
+dumprecursionmatrix2D (FILE *dev, int **M, char**B, char **K, Uint m, Uint n, 
+    PairUint *mark)
+{
+  Uint i,j;
+
+  /* n-1 below would wrap around */
+  if(n == 0) {
+    return;
+  }
+
+  /* column header; the unsigned indices are written with %u and the
+   * signed values with %d, formerly it was the other way round */
+  fprintf(dev, " \t");
+  for(j=0; j < n-1; j++) {
+    fprintf(dev, " %u \t", j);
+  }
+  fprintf(dev, "\n");
+
+  for(i=0; i < m; i++) {
+    fprintf(dev, "%u\t", i);
+    for(j=0; j < n-1; j++) {
+      if(mark->a == i && mark->b ==j) 
+        fprintf(dev, "^");
+      dumprecursioncell(dev, M[i][j], B[i][j], K[i][j], "\t");
+    }
+    /* last column (j == n-1) ends the line */
+    dumprecursioncell(dev, M[i][j], B[i][j], K[i][j], "\n");
+  }
+
+  return ;
+}
+
+
+/*-------------------------------- upperbound --------------------------------
+ *    
+ * @brief  binary search for the first element greater than x in the
+ *         sorted range l[lo] .. l[hi-1]
+ * @param  l   the sorted list
+ * @param  lo  first index of the range
+ * @param  hi  index behind the last element of the range
+ * @param  x   the value searched for
+ * @return index of the first element greater than x, hi if there is none
+ * @author Steve Hoffmann 
+ *   
+ */
+
+Uint
+upperbound (double *l, Uint lo, Uint hi, double x)
+{
+  Uint first = lo, remaining = hi-lo, half, mid;
+
+  while(remaining > 0) {
+    half = remaining/2;
+    mid = first+half;
+
+    if(l[mid] <= x) {
+      /* everything up to mid is too small, go on behind it */
+      first = mid+1;
+      remaining -= (half+1);
+    } else {
+      remaining = half;
+    }
+  }
+
+  return first;
+}
+
+
+/*-------------------------------- lowerbound --------------------------------
+ *    
+ * @brief  binary search for the first element that is not smaller than x
+ *         in the sorted range l[lo] .. l[hi-1]
+ * @param  l   the sorted list
+ * @param  lo  first index of the range
+ * @param  hi  index behind the last element of the range
+ * @param  x   the value searched for
+ * @return index of the first element >= x, hi if there is none
+ * @author Steve Hoffmann 
+ *   
+ */
+
+Uint
+lowerbound (double *l, Uint lo, Uint hi, double x)
+{
+  Uint first = lo, remaining, half, mid;
+
+  for(remaining = hi-lo; remaining > 0; ) {
+    half = remaining/2;
+    mid = first+half;
+
+    if(l[mid] < x) {
+      /* mid and everything in front of it is smaller than x */
+      first = mid+1;
+      remaining -= (half+1);
+    } else {
+      remaining = half;
+    }
+  }
+
+  return first;
+}
+
+
+/*----------------------------------- ecdf -----------------------------------
+ *    
+ * @brief  calculate the probability for a given ecdf, i.e. the fraction of
+ *         the sample values that are not greater than x
+ * @param  x  the value to look up
+ * @param  e  the ecdf with its sorted sample e->l of size e->n
+ * @return a value in [0,1]; .0 for an empty sample
+ * @author Steve Hoffmann 
+ *   
+ */
+
+double
+ecdf (double x, ecdf_t *e)
+{
+  Uint pos;
+
+  if(e->n == 0) {
+    return .0;
+  }
+
+  /* upperbound searches [lo,hi): the upper limit has to be n, with n-1
+   * the largest sample value was never counted */
+  pos = upperbound(e->l, 0, e->n, x);
+
+  return ((double)pos)/((double)e->n);
+}
+
+
+/*-------------------------------- ecdf_init ---------------------------------
+ *    
+ * @brief  initialize the ecdf with a sorted private copy of the sample
+ * @param  x  the n sample values, left untouched
+ * @param  n  number of sample values
+ * @return newly allocated ecdf
+ * @author Steve Hoffmann 
+ *   
+ */
+
+ecdf_t*
+ecdf_init (double *x, Uint n)
+{
+  ecdf_t *e = ALLOCMEMORY(NULL, NULL, ecdf_t, 1);
+
+  e->n = n;
+  e->l = ALLOCMEMORY(NULL, NULL, double, n);
+
+  /* copy first, sort the copy */
+  memcpy(e->l, x, sizeof(double)*n);
+  qsort(e->l, n, sizeof(double), cmp_dbl_qsort);
+
+  return e;
+}
+
+
+/*------------------------------ ecdf_destruct -------------------------------
+ *    
+ * @brief  destruct ecdf: the sample copy e->l is released, the structure e
+ *         itself is not freed in here
+ * @param  e  the ecdf
+ * @author Steve Hoffmann 
+ *   
+ */
+
+void
+ecdf_destruct (ecdf_t *e)
+{
+  FREEMEMORY(NULL, e->l);
+}
+
+/*--------------------------------- incbeta ---------------------------------
+ *    
+ * @brief  value computed by bratio_ for x and the parameters a and b, or
+ *         its complement
+ * @param  x      the argument; the boundary values are returned directly
+ *                for x <= 0 and x >= 1
+ * @param  a      first parameter handed to bratio_
+ * @param  b      second parameter handed to bratio_
+ * @param  lower  nonzero: return w, zero: return the complement wc
+ * @return w or wc as delivered by bratio_; for x <= 0 this is 0 (lower)
+ *         or 1 (complement), for x >= 1 it is 1 (lower) or 0 (complement)
+ * @author Steve Hoffmann 
+ *   
+ */
+
+double
+incbeta (double x, double a, double b, char lower)
+{
+  double x1, w, wc;
+  long int ierr;
+
+  /* the boundary results have to follow the requested tail as well, 
+   * formerly the lower value was returned in any case */
+  if(x <= 0) return lower ? .0 : 1.;
+  if(x >= 1) return lower ? 1. : .0;
+
+  x1 = 0.5 - x + 0.5;
+
+  /* NOTE(review): the error flag ierr is not evaluated */
+  bratio_(&a, &b, &x, &x1, &w, &wc, &ierr);
+
+  return lower ? w : wc;
+}
+
+/*---------------------------------- pbinom ----------------------------------
+ *    
+ * @brief  binomial distribution function 
+ * @param  x      number of successes, rounded down after adding 1e-7
+ * @param  n      number of trials
+ * @param  p      success probability
+ * @param  lower  nonzero: P(X <= x), zero: P(X > x)
+ * @return the requested probability; 0 for n < 0, p < 0 or p > 1
+ * @author Steve Hoffmann 
+ *   
+ */
+
+double
+pbinom (double x, double n, double p, char lower)
+{
+  if(n<0 || p<0 || p>1) return 0;
+
+  /* below the support: nothing in the lower tail, all in the upper tail
+   * (formerly 0 for both) */
+  if (x < 0) return lower ? 0.0 : 1.0;
+
+  x = floor(x + 1e-7);
+
+  /* at or above n: all in the lower tail (formerly 1 for both) */
+  if (n <= x) return lower ? 1.0 : 0.0;
+
+  return incbeta(p, x + 1, n - x, !lower);
+}
+
+
+/*-------------------------------- quantiles ---------------------------------
+ *    
+ * @brief  get k quantiles from x by linear interpolation between the
+ *         order statistics at position h = (n-1)*p + 1
+ * @param  x  the n values, n > 0; the array is sorted in place
+ * @param  n  number of values
+ * @param  p  the k probabilities; values outside of [0,1] (and NaN) are
+ *            answered with the smallest or the largest value
+ * @param  k  number of probabilities
+ * @return newly allocated array with the k quantiles (not freed here)
+ * @author Steve Hoffmann 
+ *   
+ */
+
+double*
+quantiles (double *x, Uint n, double *p, Uint k)
+{
+  Uint i, hlo;
+  double *q, h;
+
+  assert(n > 0);
+
+  //sort the array
+  qsort(x, n, sizeof(double), cmp_dbl_qsort);
+
+  q = ALLOCMEMORY(NULL, NULL, double, k);
+
+  for(i=0; i < k; i++) {
+    h = ((double)n-1)*p[i] + 1.0;
+
+    if(!(h > 1.0)) {
+      /* p <= 0 or NaN: x[hlo-1] would be in front of the array */
+      q[i] = x[0];
+    } else if(h >= (double)n) {
+      /* p >= 1: x[hlo] would be behind the array */
+      q[i] = x[n-1];
+    } else {
+      hlo = floor(h);
+      q[i] = x[hlo-1] + (h - ((double)hlo))*(x[hlo] - x[hlo-1]);
+    }
+  }
+
+  return q;
+}
+
+
+/*------------------------- choleskyTriDiagArrowFact -------------------------
+ *    
+ * @brief  cholesky tri-diagonal arrow factorization for splines; the three
+ *         arrays are overwritten with the factor
+ * @param  dia  diagonal
+ * @param  off  off-diagonal
+ * @param  bot  bottom row 
+ * @param  n    n > 3; the entries 0 .. n-2 are processed
+ * @author Steve Hoffmann 
+ *
+ * DEBUG!
+ * http://www.math.ethz.ch/education/bachelor/lectures/hs2012/math/nummath_cse/sol5.pdf
+ * NOTE(review): still flagged for debugging by the author. Only the last
+ * assertion was corrected in here, the arithmetic is unchanged and should
+ * be checked against the reference before it is relied on.
+ */
+
+void
+choleskyTriDiagArrowFact(double *dia, double *off, double *bot, Uint n) {
+
+  Uint i;
+  double sum =0;
+
+  assert(n > 3);
+  dia[0] = sqrt(dia[0]);    //L
+  off[0] = off[0]/dia[0];   //B = M
+  bot[0] = bot[0]/dia[0];   //E
+
+  for(i=1; i < n-3; i++){ 
+
+    dia[i] = dia[i] - off[i-1]*off[i-1];
+    assert(dia[i] >= 0); // pos def matrix
+    dia[i] = sqrt(dia[i]);
+    off[i] = off[i]/dia[i]; //off = M
+
+    bot[i] = bot[i] - bot[i-1]*off[i-1] / dia[i]; 
+    sum += bot[i-1]*bot[i-1]; //bot = E
+  }
+
+  dia[n-3] = dia[n-3] - off[n-4]*off[n-4];
+  assert(dia[n-3] >= 0);
+  dia[n-3] = sqrt(dia[n-3]);
+  off[n-3] = off[n-3] - bot[n-4]*off[n-4]/dia[n-3];
+  bot[n-3] = bot[n-3] - bot[n-4]*off[n-4]/dia[n-3];
+
+  dia[n-2] = dia[n-2] - off[n-3]*off[n-3] - sum;
+  /* check the element the root is taken of; formerly dia[i], i.e. the
+   * already processed dia[n-3], was tested here */
+  assert(dia[n-2] >= 0);
+  dia[n-2] = sqrt(dia[n-2]);
+
+  return;
+}
+
+
+/*----------------------------- splines_periodic -----------------------------
+ *    
+ * @brief  coefficients of a periodic spline through (x,y), adapted from
+ *         http://svn.r-project.org/R/trunk/src/library/stats/src/splines.c
+ * @param  x  the n abscissae
+ * @param  y  the n ordinates
+ * @param  n  number of points; choleskyTriDiagArrowFact requires n > 3
+ * @return newly allocated array of n coefficients c (not freed here); the
+ *         work arrays are released before returning, formerly they were
+ *         left allocated
+ * @author Steve Hoffmann 
+ *   
+ */
+
+
+double*
+splines_periodic(double *x, double *y, Uint n) {
+  int i;
+  double sum = .0;
+  double *dia, *off, *bot, *c;
+
+  dia = ALLOCMEMORY(NULL, NULL, double, n);
+  off = ALLOCMEMORY(NULL, NULL, double, n);
+  bot = ALLOCMEMORY(NULL, NULL, double, n);
+  c = ALLOCMEMORY(NULL, NULL, double, n);
+
+  /* set up the system */
+  off[0] = x[1] - x[0];
+  off[n-2]= x[n-1] - x[n-2];
+  dia[0] = 2.0 * (off[0] + off[n-2]);
+  bot[0] = x[n-1] - x[n-2];
+  c[0] = (y[1] - y[0])/off[0] - (y[n-1] - y[n-2])/off[n-2];
+
+  for(i = 1; i < n-1; i++) {
+    off[i] = x[i+1] - x[i];
+    dia[i] = 2.0 * (off[i] + off[i-1]);
+    bot[i] = 0;
+    c[i] = (y[i+1] - y[i])/off[i] - (y[i] - y[i-1])/off[i-1];
+  }
+
+  choleskyTriDiagArrowFact(dia, off, bot, n);
+
+  //do forward substitution
+  c[0] = c[0]/dia[0];
+  for(i=1; i < n-2; i++) {
+    c[i] = (c[i] - off[i-1] * c[i-1])/dia[i];
+    //for periodic boundary condition
+    sum += bot[i-1] * c[i-1];
+  }
+  c[n-2] = (c[n-2] - off[n-3] * c[n-3] - sum) / dia[n-2];
+
+  //now backward substitution
+  c[n-2] = c[n-2]/dia[n-2];
+  //for periodic boundary condition
+  c[n-3] = (c[n-3] - off[n-3] * c[n-2])/dia[n-3];
+
+  /* NOTE(review): the step uses c[i] and the plain bot[i]; a backward
+   * substitution would be expected to use c[i+1] and bot[i]*c[n-2] -
+   * compare with the R source before relying on the result */
+  for(i=n-4; i >= 0; i--) {
+    c[i] = (c[i] - off[i]*c[i] - bot[i])/dia[i];
+  }
+
+  c[n-1] = c[0];
+
+  /* dia and off receive two more coefficient sets here; they are not
+   * handed to the caller, only the scaled c is */
+  for(i=0; i < n-1; i++) {
+    sum = x[i+1] - x[i];
+    dia[i] = (y[i+1]-y[i])/sum - sum*(c[i+1]+2.0*c[i]);
+    off[i] = (c[i+1]-c[i])/sum;
+    c[i] = 3.0*c[i];
+  }
+
+  dia[n-1] = dia[0];
+  off[n-1] = off[0];
+  c[n-1] = c[0];
+
+  FREEMEMORY(NULL, dia);
+  FREEMEMORY(NULL, off);
+  FREEMEMORY(NULL, bot);
+
+  return c;
+}
+
+
+/*--------- function pretty was adapted from the R project pretty.c ----------
+ *    
+ * @brief  equally spaced values at multiples of a round unit (1, 2, 5 or
+ *         10 times a power of ten) that start at or below min and are
+ *         spaced to reach up to max or beyond
+ * @param  min  lower end of the range to cover
+ * @param  max  upper end of the range to cover
+ * @param  n    desired number of intervals
+ * @param  r    receives the number of values written
+ * @return newly allocated array holding *r ascending values, the first one
+ *         is not greater than min; not freed here
+ * @author Steve Hoffmann 
+ *   
+ */
+
+
+double *
+prettyarray(double min, double max, Uint n, Uint *r)
+{
+
+  double *arr = NULL;
+  double eps = 1e-7;
+  double cell, U, base, unit, ns, nu;
+  double shrink = 0.75;
+  double h = 1.5;
+  double h5 = .5 + 1.5*h;
+  double d = max-min;
+  double delta;
+  int min_n = n / 3;
+  char i_small = 0;
+  int k, newn;
+
+  if(max == 0 && min == 0) {
+    cell = 1;
+    i_small = 1;
+  } else {
+    cell = MAX(fabs(min),fabs(max));
+    /* upper bound on cell/unit as in R: 1 + (cond ? a : b); the
+     * parenthesis formerly enclosed "1 + cond", so that the condition
+     * was always true and the leading 1 was lost */
+    U = 1 + ((h5 >= 1.5*h+.5) ? 1/(1+h) : 1.5/(1+h5));
+    i_small = d < cell * U * MAX(1,n) * DBL_EPSILON *3;
+  }
+
+  /* cell is the raw interval width */
+  if(i_small) {
+    if(cell > 10) cell = 9 + cell/10;
+    cell *= shrink;
+    if(min_n > 1) cell /= (double)min_n;
+  } else {
+    cell = d;
+    if(n > 1) cell /= (double)n;
+  }
+
+  if(cell < 20*DBL_MIN) {
+    cell = 20*DBL_MIN;
+  } else if(cell * 10 > DBL_MAX) {
+    cell = .1*DBL_MAX;
+  }
+
+  /* choose the unit among 1, 2, 5 and 10 times the power of ten below cell */
+  base = pow(10., floor(log10(cell)));
+  unit = base;
+  if((U = 2*base)-cell < h*(cell-unit)) {
+    unit = U;
+    if((U = 5*base)-cell < h5*(cell-unit)) {
+      unit = U;
+      if((U =10*base)-cell < h*(cell-unit)) 
+        unit = U;
+    }
+  }
+
+  /* multiples of unit that enclose [min,max] */
+  ns = floor(min/unit+eps);
+  nu = ceil (max/unit-eps);
+
+  while(ns*unit > min + eps*unit) ns--;
+  while(nu*unit < max - eps*unit) nu++;
+
+  k = (int)(0.5 + nu - ns);
+
+  if(k < min_n) {
+    /* widen the range to get at least min_n intervals */
+    k = min_n - k;
+    if(ns >= 0.) {
+      nu += k/2;
+      ns -= k/2 + k%2;/* ==> nu-ns = old(nu-ns) + min_n -k = min_n */
+    } else {
+      ns -= k/2;
+      nu += k/2 + k%2;
+    }
+    newn = min_n;
+  } else {
+    newn = k;
+  }
+
+  arr = ALLOCMEMORY(NULL, NULL, double, newn+2);
+  delta = (nu-ns)*unit/((double)newn);
+
+  for(int i=0; i < newn; i++) {
+    arr[i] = ns*unit + (((double)i)*delta);
+  }
+
+  *r = newn;
+  return arr;
+}
+
+
+/*----------------------------------- bin ------------------------------------
+ *    
+ * @brief  count the values of x in bins whose lower limits are taken from
+ *         prettyarray
+ * @param  x       the n values, n > 0; the array is sorted in place
+ * @param  n       number of values
+ * @param  breaks  receives the array of lower bin limits allocated by
+ *                 prettyarray (not freed here)
+ * @param  nbins   in: desired number of bins, 0 selects ceil(log2(n)+1);
+ *                 out: number of bins actually used
+ * @return newly allocated array with the *nbins counts (not freed here)
+ *
+ */
+
+Uint *
+bin(double *x, Uint n, double **breaks, Uint *nbins) {
+
+  double *bins, min, max;
+  Uint i, newn, curbin = 0, *cnt;
+
+  assert(n > 0);
+
+  qsort(x, n, sizeof(double), cmp_dbl_qsort);
+  min = x[0];
+  max = x[n-1];
+
+  if(*nbins == 0) *nbins = ceil(log2(n) + 1);
+
+  /* the diagnostics that used to be written to stderr from here on were
+   * removed */
+  bins = prettyarray(min, max, *nbins, &newn);
+
+  /* there is no space argument in this signature, NULL is passed like in
+   * prettyarray */
+  cnt = ALLOCMEMORY(NULL, NULL, Uint, newn);
+  memset(cnt, 0, sizeof(Uint)*newn);
+
+  /* x is sorted, so the current bin only ever moves forward */
+  for(i=0; i < n; i++) {
+    while(curbin+1 < newn && x[i] >= bins[curbin+1]) {
+      curbin++;
+    }
+    cnt[curbin]++;
+  }
+
+  *nbins = newn;
+  *breaks = bins;
+
+  return cnt;
+}
+
+/*-------------------------------- dist_uint ---------------------------------
+ *    
+ * @brief  absolute distance of two unsigned values
+ * @param  a  first value
+ * @param  b  second value
+ * @return the larger value minus the smaller one
+ * @author Steve Hoffmann 
+ *   
+ */
+
+Uint
+dist_uint (Uint a, Uint b)
+{
+  /* always subtract the smaller value, the difference cannot wrap */
+  return (a > b) ? a-b : b-a;
+}
diff --git a/segemehl/libs/mathematics.h b/segemehl/libs/mathematics.h
new file mode 100644
index 0000000..5032854
--- /dev/null
+++ b/segemehl/libs/mathematics.h
@@ -0,0 +1,187 @@
+#ifndef _MATHEMATICS_H_
+#define _MATHEMATICS_H_
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "basic-types.h"
+#include <math.h>
+
+/*typedef struct
+{
+ size_t k;
+ gsl_matrix *A;
+ gsl_matrix *dB;
+} gsl_bspline_deriv_workspace;
+
+
+
+*/
+
+/* important for compiling with cygwin */
+#undef log2
+
+/* random numbers on top of rand(): RANDUNIT is in [0,1) */
+#define RANDUNIT ((rand()/(RAND_MAX + 1.0)))
+//#define RANDINT(MAX) ((Uint)round(((double)MAX) * (rand()/((double)RAND_MAX + 1.0))))
+#define RANDINT(MAX) ((Uint)round((((double)(MAX)+1) * (rand()/((double)RAND_MAX + 1.0)))-0.5))
+
+/* access to matrices stored in one flat array; all arguments are
+ * parenthesized so that expressions can be passed */
+#define MATRIX2D(X,NCOL,I,J) (X)[(I)*(NCOL)+(J)]
+#define MATRIX3D(X,DIM_M,DIM_N,I,J,K) (X)[((I)*(DIM_M)+(J))*(DIM_N)+(K)]
+#define MATRIX4D(X,DIM_M,DIM_N,DIM_Z,I,J,K,L) (X)[(((I)*(DIM_M)+(J))*(DIM_N)+(K))*(DIM_Z)+(L)]
+#define VECTOR(X,I) ((X)->elements[I])
+
+
+#define INITMATRIX(X,SIZE,TYPESIZE) initArray(X,SIZE,TYPESIZE)
+#define INITMATRIX2D(X,M,N,TYPESIZE) initArray(X, (M)*(N), TYPESIZE)
+#define INITMATRIX3D(X,M,N,L,TYPESIZE) initArray(X, (M)*(N)*(L), TYPESIZE)
+/* NOTE(review): expands to two statements, a use as the unbraced body of
+ * an if or a loop only covers the first one */
+#define INITVECTOR(V) (V)->elements=NULL;\
+  (V)->length=0
+
+#define RESIZEVEC(V,N) initArray(V,N,sizeof(vectorelem))
+#define LENGTHVEC(V) ((V)->length)
+/* set all elements to 0; the loop variable m is now used as index, the
+ * former index mi was not declared */
+#define SWEEPVEC(V) {int m;\
+  for (m=0;m<LENGTHVEC(V);m++)\
+  VECTOR(V,m)=0;}
+#define APPENDVEC(S,V,E) appendvector(S,V,E)
+#define SWAPVEC(I,J,V) { vectorelem msv; \
+  msv = (V)->elements[I];\
+  (V)->elements[I] = (V)->elements[J];\
+  (V)->elements[J] = msv; }
+#define DESTRUCTVEC(S,V) destructVector((S),(V));
+#define SWAPUINT(X,A,B) {Uint msu ;\
+  msu= (X)[A]; \
+  (X)[A] = (X)[B]; \
+  (X)[B]=msu; }
+
+/* NOTE(review): the loop runs over all B-A offsets and therefore swaps
+ * every inner pair twice; a reversal would stop at the middle - confirm
+ * the intention before using it */
+#define REVERSEVEC(A,B,V) {Uint mi;\
+  for(mi=0; mi<((B)-(A)); mi++) {\
+    SWAPVEC((A)+mi,(B)-mi,V);\
+  }}
+#define EMPTYVEC(V) ((V)->elements==NULL)
+#define MAX(A,B) (((A) >= (B)) ? (A) : (B))
+#define DMAX(A,B) (double)MAX(A,B)
+
+#define MAX3(A, B, C) MAX(MAX((A),(B)),(C))
+#define MAX4(A, B, C, D) MAX(MAX(MAX((A),(B)),(C)),(D))
+#define MAX5(A, B, C, D, E) MAX(MAX(MAX(MAX((A),(B)),(C)),(D)),(E))
+
+#define MIN(A,B) (((A) <= (B)) ? (A) : (B))
+#define DMIN(A,B) (double)MIN(A,B)
+#define MIN3(A, B, C) MIN((MIN((A),(B))),(C))
+
+#define MAX_DOUBLE 1e200
+#define ABS(X) ((X)>=0?(X):-(X))
+#define DABS(X) (double)ABS(X)
+#define OVERLAP(A,B,C,D) (((A) >= (C) && (B) <= (D)) || \
+    ((A) <= (D) && (B) >= (D)) || \
+    ((B) >= (C) && (A) <= (C)))
+/* NOTE(review): the next two macros test A >= X and A > X, i.e. X has to
+ * be below both limits; only ISELEM tests A <= X <= B. Left unchanged
+ * because the callers are not visible from this header - confirm. */
+#define CLOSEDINTERVAL(X,A,B) ((A) >= (X) && (X) <= (B))
+#define OPENINTERVAL(X,A,B) ((A) > (X) && (X) < (B))
+#define ISELEM(X,A,B) ((A) <= (X) && (X) <= (B))
+
+/* fallback values in case the float.h limits were not included; the
+ * single precision epsilon is 1.192092896e-07, formerly e-06 */
+#ifndef FLT_EPSILON
+  #define FLT_EPSILON 1.192092896e-07
+  #define DBL_EPSILON 2.2204460492503131e-016
+#endif
+
+#ifndef M_PI
+  #define M_PI 3.141592653589793238462643
+#endif
+
+#ifndef M_SQRT2
+  #define M_SQRT2 1.4142135623730950488016887
+#endif
+
+#ifndef ALLOCMEMORY
+ #include "memory.h"
+#endif
+
+#ifndef VECTORELEM
+typedef Uint vectorelem;
+#endif
+
+typedef struct { /* vector of vectorelem values used by the *VEC macros */
+ vectorelem *elements; /* the values, NULL after INITVECTOR */
+ Lint length; /* value returned by LENGTHVEC, 0 after INITVECTOR */
+} vector_t;
+
+typedef struct{ /* one break point model as filled in by bl_RSSmatrix */
+ Uint noofbreaks; /* number of breaks of this model */
+ Uint *breaks; /* the noofbreaks break positions, NULL without breaks */
+ double RSS; /* residual sum of squares of the model */
+ double LL; /* log likelihood derived from RSS */
+ double BIC; /* information criterion derived from LL */
+} breakpoints_t;
+
+typedef struct { /* empirical distribution as set up by ecdf_init */
+ double *l; /* sorted copy of the sample */
+ Uint n; /* number of values in l */
+} ecdf_t;
+
+
+
+
+Lint llabs(Lint);
+double ecdf (double x, ecdf_t *e);
+ecdf_t* ecdf_init (double *x, Uint n);
+void ecdf_destruct (ecdf_t *e);
+Uint lowerbound(double *l, Uint, Uint, double);
+Uint upperbound(double *l, Uint, Uint, double);
+void *initArray(void *, int, size_t);
+void dumpMatrix_int(int *, int, int);
+void dumpMatrix_Uint(Uint *, Uint, Uint);
+void dumpMatrix_flt(float *, int, int);
+void dumpMatrix_dbl(double *, Uint, Uint);
+void dumpMatrix3D_int(int *, int, int, int);
+double *transpose(void* space, double *, Uint, Uint);
+void appendvector(void *, vector_t *, vectorelem);
+void dumpVector(vector_t *);
+void destructVector(void *, vector_t *);
+Uint uarraymax(Uint *, Uint);
+int arraymax(int *, int);
+int gcd(int, int);
+double power(double, int);
+double uniroot(double, double, double (*f)(double, void*), double, void*);
+double BLAST_Expm1(double x);
+Uint fak(Uint);
+double* coldel (void *, double *, Uint, Uint, Uint);
+double* rowdel (void *, double *, Uint, Uint, Uint);
+Uint minvecdist(void *space, vector_t *vec, Uint i, Uint j);
+int* intrev(int *n, Uint len);
+double shannonentropy(void *space, char *seq, Uint len, Uint asize, Uint *encodetab);
+double log2(double x);
+double log10(double x);
+double logadd(double a, double b);
+double log10add(double a, double b);
+double var_int (int *x, Uint n);
+double poisson(double lambda, double x);
+double logpoisson(double lambda, double x);
+double multivarnorm (double *pt, double *mu, double *sd, Uint n);
+double bivarnorm(double x, double y, double mu1, double mu2, double* cv);
+double univarnormcdf (double x, double mu, double sd);
+double randunivarnorm(double mu, double sd);
+double gmm(void *space, double *pt, Uint m, Uint n, double *mu, double *sd, double *w, Uint g, Uint maxiter);
+double var (double *x, Uint n);
+void normalize (double *a, Uint n);
+breakpoints_t * bl_RSSmatrix (void *space, double *x, Uint n, Uint h, Uint noofbreaks);
+double* bl_RSS (void *space, double *x, Uint n, Uint u, Uint v);
+void gevLmoment (double *data, Uint n, double *m, double *s, double *k);
+double gammaln (double x);
+double gevll (double *data, Uint n, double mu, double sigma, double kappa);
+double* gevinvhessian(void *space, double *y, Uint n, double m, double s, double k,
+ double* dm, double* ds, double* dk, Uint *ex);
+double gevmle(void *space, double *y, Uint n,
+ double *m, double *s, double *k, Uint maxiter, double ymin, double ymax);
+double
+gevcdf (double x, double mu, double si, double xi);
+double gevvar (double mu, double s, double k);
+void dumprecursionmatrix2D (FILE *dev, int **M, char**B, char**K, Uint m, Uint n, PairUint *mark);
+Uint uarraysecond(Uint *arr, Uint l, Uint max);
+double pbinom (double x, double n, double p, char lower);
+Uint* bin(double *x, Uint n, double **breaks, Uint *nbins);
+double* splines_periodic(double *x, double *y, Uint n);
+double *quantiles(double *x, Uint n, double* q, Uint k);
+double scalar (double* x, double *y, Uint m);
+Uint dist_uint (Uint a, Uint b);
+
+#endif
diff --git a/segemehl/libs/md5.c b/segemehl/libs/md5.c
new file mode 100644
index 0000000..4198343
--- /dev/null
+++ b/segemehl/libs/md5.c
@@ -0,0 +1,407 @@
+/*
+ Copyright (C) 1999 Aladdin Enterprises. All rights reserved.
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ L. Peter Deutsch
+ ghost at aladdin.com
+
+ */
+/*
+ Independent implementation of MD5 (RFC 1321).
+
+ This code implements the MD5 Algorithm defined in RFC 1321.
+ It is derived directly from the text of the RFC and not from the
+ reference implementation.
+
+ The original and principal author of md5.c is L. Peter Deutsch
+ <ghost at aladdin.com>. Other authors are noted in the change history
+ that follows (in reverse chronological order):
+
+ 1999-11-04 lpd Edited comments slightly for automatic TOC extraction.
+ 1999-10-18 lpd Fixed typo in header comment (ansi2knr rather than md5).
+ 1999-05-03 lpd Original version.
+ 2009-10-03 Steve Hoffmann added handling function MD5
+ */
+
+#include "md5.h"
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#ifdef TEST
+/*
+ * Compile with -DTEST to create a self-contained executable test program.
+ * The test program should print out the same values as given in section
+ * A.5 of RFC 1321, reproduced below.
+ */
+#include <string.h>
+int
+main(void)
+{
+    /*
+     * Inputs of the RFC 1321 A.5 test suite; the comment after each
+     * input is the digest the program is expected to print for it.
+     */
+    static const char *const test[7] = {
+        "", /*d41d8cd98f00b204e9800998ecf8427e*/
+        "a", /*0cc175b9c0f1b6a831c399e269772661*/
+        "abc", /*900150983cd24fb0d6963f7d28e17f72*/
+        "message digest", /*f96b697d7cb7938d525a2f31aaf161d0*/
+        "abcdefghijklmnopqrstuvwxyz", /*c3fcd3d76192e4007dfb496cca67e13b*/
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",
+        /*d174ab98d277d9f5a5611c2c9f419d9f*/
+        "12345678901234567890123456789012345678901234567890123456789012345678901234567890" /*57edf4a22be3c955ac49da2e2107b67a*/
+    };
+    int i;
+
+    for (i = 0; i < 7; ++i) {
+        md5_state_t state;
+        md5_byte_t digest[16];
+        int di;
+
+        md5_init(&state);
+        md5_append(&state, (const md5_byte_t *)test[i],
+                   (unsigned int)strlen(test[i]));
+        md5_finish(&state, digest);
+        printf("MD5 (\"%s\") = ", test[i]);
+        /* print the 16 digest bytes as 32 lowercase hex digits */
+        for (di = 0; di < 16; ++di)
+            printf("%02x", digest[di]);
+        printf("\n");
+    }
+    return 0;
+}
+#endif /* TEST */
+
+
+/*
+ * One-shot MD5: hash the first l bytes of msg.
+ *
+ * Returns a newly calloc'ed buffer of 17 bytes. Bytes 0..15 receive the
+ * raw digest from md5_finish(); byte 16 is left at the zero calloc()
+ * wrote. The caller owns the buffer and releases it with free().
+ * Returns NULL when the buffer cannot be allocated.
+ *
+ * nfo is accepted but not used.
+ */
+unsigned char*
+MD5R(unsigned char *msg, unsigned int l, void *nfo) {
+    md5_state_t state;
+    md5_byte_t *digest;
+
+    (void) nfo;
+
+    digest = calloc(17, sizeof(md5_byte_t));
+    if (digest == NULL) {
+        /* md5_finish() would write through the NULL pointer */
+        return NULL;
+    }
+
+    md5_init(&state);
+    md5_append(&state, (const md5_byte_t *)msg, l);
+    md5_finish(&state, digest);
+
+    return digest;
+}
+/*
+ * For reference, here is the program that computed the T values.
+ */
+#if 0
+#include <math.h>
+#include <stdio.h>
+/* Prints one "#define Ti 0x........" line for each i in 1..64. */
+int
+main(void)
+{
+    int i;
+    for (i = 1; i <= 64; ++i) {
+        unsigned long v = (unsigned long)(4294967296.0 * fabs(sin((double)i)));
+        printf("#define T%d 0x%08lx\n", i, v);
+    }
+    return 0;
+}
+#endif
+/*
+ * End of T computation program.
+ */
+/*
+ * Additive constants T1..T64, one per MD5 step. They are consumed by the
+ * SET(...) invocations in md5_process(); T1-T16 are used in round 1,
+ * T17-T32 in round 2, T33-T48 in round 3 and T49-T64 in round 4.
+ */
+#define T1 0xd76aa478
+#define T2 0xe8c7b756
+#define T3 0x242070db
+#define T4 0xc1bdceee
+#define T5 0xf57c0faf
+#define T6 0x4787c62a
+#define T7 0xa8304613
+#define T8 0xfd469501
+#define T9 0x698098d8
+#define T10 0x8b44f7af
+#define T11 0xffff5bb1
+#define T12 0x895cd7be
+#define T13 0x6b901122
+#define T14 0xfd987193
+#define T15 0xa679438e
+#define T16 0x49b40821
+#define T17 0xf61e2562
+#define T18 0xc040b340
+#define T19 0x265e5a51
+#define T20 0xe9b6c7aa
+#define T21 0xd62f105d
+#define T22 0x02441453
+#define T23 0xd8a1e681
+#define T24 0xe7d3fbc8
+#define T25 0x21e1cde6
+#define T26 0xc33707d6
+#define T27 0xf4d50d87
+#define T28 0x455a14ed
+#define T29 0xa9e3e905
+#define T30 0xfcefa3f8
+#define T31 0x676f02d9
+#define T32 0x8d2a4c8a
+#define T33 0xfffa3942
+#define T34 0x8771f681
+#define T35 0x6d9d6122
+#define T36 0xfde5380c
+#define T37 0xa4beea44
+#define T38 0x4bdecfa9
+#define T39 0xf6bb4b60
+#define T40 0xbebfbc70
+#define T41 0x289b7ec6
+#define T42 0xeaa127fa
+#define T43 0xd4ef3085
+#define T44 0x04881d05
+#define T45 0xd9d4d039
+#define T46 0xe6db99e5
+#define T47 0x1fa27cf8
+#define T48 0xc4ac5665
+#define T49 0xf4292244
+#define T50 0x432aff97
+#define T51 0xab9423a7
+#define T52 0xfc93a039
+#define T53 0x655b59c3
+#define T54 0x8f0ccc92
+#define T55 0xffeff47d
+#define T56 0x85845dd1
+#define T57 0x6fa87e4f
+#define T58 0xfe2ce6e0
+#define T59 0xa3014314
+#define T60 0x4e0811a1
+#define T61 0xf7537e82
+#define T62 0xbd3af235
+#define T63 0x2ad7d2bb
+#define T64 0xeb86d391
+
+/*
+ * Run the MD5 compression function over one 64-byte block.
+ *
+ * pms  - hashing state; abcd[0..3] are read as the starting registers
+ *        and are incremented by the block's result at the end.
+ * data - the 64 input bytes of this block.
+ *
+ * The macros ROTATE_LEFT, F, G, H and I defined inside the body are not
+ * undefined again and therefore stay visible for the rest of the file.
+ */
+static void
+md5_process(md5_state_t *pms, const md5_byte_t *data /*[64]*/)
+{
+    md5_word_t
+        a = pms->abcd[0], b = pms->abcd[1],
+        c = pms->abcd[2], d = pms->abcd[3];
+    md5_word_t t;
+
+#ifndef ARCH_IS_BIG_ENDIAN
+# define ARCH_IS_BIG_ENDIAN 1 /* slower, default implementation */
+#endif
+#if ARCH_IS_BIG_ENDIAN
+
+    /*
+     * On big-endian machines, we must arrange the bytes in the right
+     * order. (This also works on machines of unknown byte order.)
+     */
+    md5_word_t X[16];
+    const md5_byte_t *xp = data;
+    unsigned int i;
+
+    /*
+     * Widen every byte to md5_word_t before shifting. A bare
+     * "xp[3] << 24" is evaluated in (signed) int after integer
+     * promotion and overflows for bytes >= 0x80.
+     */
+    for (i = 0; i < 16; ++i, xp += 4)
+        X[i] = (md5_word_t)xp[0] + ((md5_word_t)xp[1] << 8) +
+               ((md5_word_t)xp[2] << 16) + ((md5_word_t)xp[3] << 24);
+
+#else /* !ARCH_IS_BIG_ENDIAN */
+
+    /*
+     * On little-endian machines, we can process properly aligned data
+     * without copying it.
+     */
+    md5_word_t xbuf[16];
+    const md5_word_t *X;
+
+    /* NOTE(review): the alignment test subtracts a null pointer, which
+       is not portable C; this branch is only compiled when
+       ARCH_IS_BIG_ENDIAN is defined to 0 from outside. */
+    if (!((data - (const md5_byte_t *)0) & 3)) {
+        /* data are properly aligned */
+        X = (const md5_word_t *)data;
+    } else {
+        /* not aligned */
+        memcpy(xbuf, data, 64);
+        X = xbuf;
+    }
+#endif
+
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+
+    /* Round 1. */
+    /* Let [abcd k s i] denote the operation
+       a = b + ((a + F(b,c,d) + X[k] + T[i]) <<< s). */
+#define F(x, y, z) (((x) & (y)) | (~(x) & (z)))
+#define SET(a, b, c, d, k, s, Ti)\
+    t = a + F(b,c,d) + X[k] + Ti;\
+    a = ROTATE_LEFT(t, s) + b
+    /* Do the following 16 operations. */
+    SET(a, b, c, d, 0, 7, T1);
+    SET(d, a, b, c, 1, 12, T2);
+    SET(c, d, a, b, 2, 17, T3);
+    SET(b, c, d, a, 3, 22, T4);
+    SET(a, b, c, d, 4, 7, T5);
+    SET(d, a, b, c, 5, 12, T6);
+    SET(c, d, a, b, 6, 17, T7);
+    SET(b, c, d, a, 7, 22, T8);
+    SET(a, b, c, d, 8, 7, T9);
+    SET(d, a, b, c, 9, 12, T10);
+    SET(c, d, a, b, 10, 17, T11);
+    SET(b, c, d, a, 11, 22, T12);
+    SET(a, b, c, d, 12, 7, T13);
+    SET(d, a, b, c, 13, 12, T14);
+    SET(c, d, a, b, 14, 17, T15);
+    SET(b, c, d, a, 15, 22, T16);
+#undef SET
+
+    /* Round 2. */
+    /* Let [abcd k s i] denote the operation
+       a = b + ((a + G(b,c,d) + X[k] + T[i]) <<< s). */
+#define G(x, y, z) (((x) & (z)) | ((y) & ~(z)))
+#define SET(a, b, c, d, k, s, Ti)\
+    t = a + G(b,c,d) + X[k] + Ti;\
+    a = ROTATE_LEFT(t, s) + b
+    /* Do the following 16 operations. */
+    SET(a, b, c, d, 1, 5, T17);
+    SET(d, a, b, c, 6, 9, T18);
+    SET(c, d, a, b, 11, 14, T19);
+    SET(b, c, d, a, 0, 20, T20);
+    SET(a, b, c, d, 5, 5, T21);
+    SET(d, a, b, c, 10, 9, T22);
+    SET(c, d, a, b, 15, 14, T23);
+    SET(b, c, d, a, 4, 20, T24);
+    SET(a, b, c, d, 9, 5, T25);
+    SET(d, a, b, c, 14, 9, T26);
+    SET(c, d, a, b, 3, 14, T27);
+    SET(b, c, d, a, 8, 20, T28);
+    SET(a, b, c, d, 13, 5, T29);
+    SET(d, a, b, c, 2, 9, T30);
+    SET(c, d, a, b, 7, 14, T31);
+    SET(b, c, d, a, 12, 20, T32);
+#undef SET
+
+    /* Round 3. */
+    /* Let [abcd k s t] denote the operation
+       a = b + ((a + H(b,c,d) + X[k] + T[i]) <<< s). */
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define SET(a, b, c, d, k, s, Ti)\
+    t = a + H(b,c,d) + X[k] + Ti;\
+    a = ROTATE_LEFT(t, s) + b
+    /* Do the following 16 operations. */
+    SET(a, b, c, d, 5, 4, T33);
+    SET(d, a, b, c, 8, 11, T34);
+    SET(c, d, a, b, 11, 16, T35);
+    SET(b, c, d, a, 14, 23, T36);
+    SET(a, b, c, d, 1, 4, T37);
+    SET(d, a, b, c, 4, 11, T38);
+    SET(c, d, a, b, 7, 16, T39);
+    SET(b, c, d, a, 10, 23, T40);
+    SET(a, b, c, d, 13, 4, T41);
+    SET(d, a, b, c, 0, 11, T42);
+    SET(c, d, a, b, 3, 16, T43);
+    SET(b, c, d, a, 6, 23, T44);
+    SET(a, b, c, d, 9, 4, T45);
+    SET(d, a, b, c, 12, 11, T46);
+    SET(c, d, a, b, 15, 16, T47);
+    SET(b, c, d, a, 2, 23, T48);
+#undef SET
+
+    /* Round 4. */
+    /* Let [abcd k s t] denote the operation
+       a = b + ((a + I(b,c,d) + X[k] + T[i]) <<< s). */
+#define I(x, y, z) ((y) ^ ((x) | ~(z)))
+#define SET(a, b, c, d, k, s, Ti)\
+    t = a + I(b,c,d) + X[k] + Ti;\
+    a = ROTATE_LEFT(t, s) + b
+    /* Do the following 16 operations. */
+    SET(a, b, c, d, 0, 6, T49);
+    SET(d, a, b, c, 7, 10, T50);
+    SET(c, d, a, b, 14, 15, T51);
+    SET(b, c, d, a, 5, 21, T52);
+    SET(a, b, c, d, 12, 6, T53);
+    SET(d, a, b, c, 3, 10, T54);
+    SET(c, d, a, b, 10, 15, T55);
+    SET(b, c, d, a, 1, 21, T56);
+    SET(a, b, c, d, 8, 6, T57);
+    SET(d, a, b, c, 15, 10, T58);
+    SET(c, d, a, b, 6, 15, T59);
+    SET(b, c, d, a, 13, 21, T60);
+    SET(a, b, c, d, 4, 6, T61);
+    SET(d, a, b, c, 11, 10, T62);
+    SET(c, d, a, b, 2, 15, T63);
+    SET(b, c, d, a, 9, 21, T64);
+#undef SET
+
+    /* Then perform the following additions. (That is increment each
+       of the four registers by the value it had before this block
+       was started.) */
+    pms->abcd[0] += a;
+    pms->abcd[1] += b;
+    pms->abcd[2] += c;
+    pms->abcd[3] += d;
+}
+
+/*
+ * Put an MD5 state into its start configuration: the 64-bit bit
+ * counter is cleared and the four registers get their initial values.
+ */
+void
+md5_init(md5_state_t *pms)
+{
+    static const md5_word_t start[4] = {
+        0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476
+    };
+    unsigned int k;
+
+    pms->count[0] = 0;
+    pms->count[1] = 0;
+    for (k = 0; k < 4; ++k)
+        pms->abcd[k] = start[k];
+}
+
+/*
+ * Feed nbytes bytes starting at data into the running hash.
+ *
+ * Bytes that do not fill a whole 64-byte block are kept in pms->buf
+ * until a later call (or md5_finish) completes the block. The bit
+ * counter pms->count is advanced by nbytes * 8, carrying from count[0]
+ * into count[1]. A call with nbytes == 0 changes nothing.
+ */
+void
+md5_append(md5_state_t *pms, const md5_byte_t *data, unsigned int nbytes)
+{
+    const md5_byte_t *p = data;
+    unsigned int left = nbytes;
+    /* number of bytes already waiting in pms->buf */
+    unsigned int offset = (pms->count[0] >> 3) & 63;
+    md5_word_t nbits = (md5_word_t)(nbytes << 3);
+
+    if (nbytes == 0)
+        return;
+
+    /* Update the message length. */
+    pms->count[1] += nbytes >> 29;
+    pms->count[0] += nbits;
+    if (pms->count[0] < nbits)
+        pms->count[1]++;
+
+    /* Process an initial partial block. */
+    if (offset) {
+        /*
+         * Compare against the free room instead of computing
+         * offset + nbytes, which wraps around for very large nbytes
+         * and would let the memcpy run past the end of pms->buf.
+         */
+        unsigned int room = 64 - offset;
+        unsigned int copy = (nbytes > room ? room : nbytes);
+
+        memcpy(pms->buf + offset, p, copy);
+        if (offset + copy < 64)
+            return;
+        p += copy;
+        left -= copy;
+        md5_process(pms, pms->buf);
+    }
+
+    /* Process full blocks. */
+    for (; left >= 64; p += 64, left -= 64)
+        md5_process(pms, p);
+
+    /* Process a final partial block. */
+    if (left)
+        memcpy(pms->buf, p, left);
+}
+
+/*
+ * Close the message: append the padding and the 8-byte bit length, then
+ * write the four state words to digest[0..15], least significant byte
+ * of each word first.
+ */
+void
+md5_finish(md5_state_t *pms, md5_byte_t digest[16])
+{
+    /* a single 0x80 byte followed by zeros */
+    static const md5_byte_t pad[64] = { 0x80 };
+    md5_byte_t lenbytes[8];
+    unsigned int padlen;
+    unsigned int k;
+
+    /* Capture the bit count now; the appends below keep counting. */
+    for (k = 0; k < 8; ++k) {
+        md5_word_t w = pms->count[k >> 2];
+
+        lenbytes[k] = (md5_byte_t)(w >> ((k & 3) << 3));
+    }
+
+    /* Pad so that the buffered length becomes 56 bytes mod 64. */
+    padlen = ((55 - (pms->count[0] >> 3)) & 63) + 1;
+    md5_append(pms, pad, padlen);
+
+    /* The length bytes complete the last block. */
+    md5_append(pms, lenbytes, 8);
+
+    for (k = 0; k < 16; ++k) {
+        md5_word_t w = pms->abcd[k >> 2];
+
+        digest[k] = (md5_byte_t)(w >> ((k & 3) << 3));
+    }
+}
diff --git a/segemehl/libs/md5.h b/segemehl/libs/md5.h
new file mode 100644
index 0000000..c5c8db8
--- /dev/null
+++ b/segemehl/libs/md5.h
@@ -0,0 +1,96 @@
+/*
+ Copyright (C) 1999 Aladdin Enterprises. All rights reserved.
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ L. Peter Deutsch
+ ghost at aladdin.com
+
+ */
+/*
+ Independent implementation of MD5 (RFC 1321).
+
+ This code implements the MD5 Algorithm defined in RFC 1321.
+ It is derived directly from the text of the RFC and not from the
+ reference implementation.
+
+ The original and principal author of md5.h is L. Peter Deutsch
+ <ghost at aladdin.com>. Other authors are noted in the change history
+ that follows (in reverse chronological order):
+
+ 1999-11-04 lpd Edited comments slightly for automatic TOC extraction.
+ 1999-10-18 lpd Fixed typo in header comment (ansi2knr rather than md5);
+ added conditionalization for C++ compilation from Martin
+ Purschke <purschke at bnl.gov>.
+ 1999-05-03 lpd Original version.
+ */
+
+#ifndef md5_INCLUDED
+# define md5_INCLUDED
+
+/*
+ * This code has some adaptations for the Ghostscript environment, but it
+ * will compile and run correctly in any environment with 8-bit chars and
+ * 32-bit ints. Specifically, it assumes that if the following are
+ * defined, they have the same meaning as in Ghostscript: P1, P2, P3,
+ * ARCH_IS_BIG_ENDIAN.
+ */
+
+/*
+ * Basic integer types used by the implementation. The widths in the
+ * comments are what the code relies on (see the note above about 8-bit
+ * chars and 32-bit ints); nothing here checks them at compile time.
+ */
+typedef unsigned char md5_byte_t; /* 8-bit byte */
+typedef unsigned int md5_word_t; /* 32-bit word */
+
+/* Define the state of the MD5 Algorithm. */
+typedef struct md5_state_s {
+ md5_word_t count[2]; /* message length in bits, lsw first */
+ md5_word_t abcd[4]; /* digest buffer */
+ md5_byte_t buf[64]; /* accumulate block */
+} md5_state_t;
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/*
+ * Hash the first l bytes of msg in one call. The result is a buffer
+ * obtained from calloc() whose first 16 bytes hold the digest; the
+ * caller releases it with free(). Declared inside the extern "C" block
+ * like the other entry points so that C++ callers link against the C
+ * definition.
+ */
+unsigned char*
+MD5R(unsigned char *msg, unsigned int l, void *nfo);
+
+/* Initialize the algorithm. */
+#ifdef P1
+void md5_init(P1(md5_state_t *pms));
+#else
+void md5_init(md5_state_t *pms);
+#endif
+
+/* Append a string to the message. */
+#ifdef P3
+void md5_append(P3(md5_state_t *pms, const md5_byte_t *data, unsigned int nbytes));
+#else
+void md5_append(md5_state_t *pms, const md5_byte_t *data, unsigned int nbytes);
+#endif
+
+/* Finish the message and return the digest. */
+#ifdef P2
+void md5_finish(P2(md5_state_t *pms, md5_byte_t digest[16]));
+#else
+void md5_finish(md5_state_t *pms, md5_byte_t digest[16]);
+#endif
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif
+
+#endif /* md5_INCLUDED */
diff --git a/segemehl/libs/memmac.h b/segemehl/libs/memmac.h
new file mode 100644
index 0000000..0104d2b
--- /dev/null
+++ b/segemehl/libs/memmac.h
@@ -0,0 +1,7 @@
+#ifndef MEMMAC_H
+#define MEMMAC_H
+
+/*
+ * Convenience wrappers around allocmemory()/freememory() that pass the
+ * call site (__FILE__, __LINE__) along.
+ *   X - forwarded as the Spacetable argument
+ *   S - pointer to resize, forwarded as the ptr argument
+ *   T - element type; sizeof(T) becomes the cell size
+ *   N - number of cells
+ *   P - pointer to release
+ */
+#define ALLOCMEMORY(X,S,T,N) allocmemory(__FILE__,__LINE__, X, S, sizeof(T), N)
+#define FREEMEMORY(X,P) freememory(__FILE__, __LINE__, X, P)
+
+#endif
diff --git a/segemehl/libs/memman.c b/segemehl/libs/memman.c
new file mode 100644
index 0000000..e8f55a5
--- /dev/null
+++ b/segemehl/libs/memman.c
@@ -0,0 +1,190 @@
+/*
+ This file is part of gdub.
+ (C) 2006 Steve Hoffmann
+
+ gdub is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2, or (at your
+ option) any later version.
+
+ gdub is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with gdub; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+
+ */
+
+/**
+ * @file memman.c
+ * @author Steve Hoffmann
+ * @brief functions for memory management
+*/
+
+/*
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: memman.c 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/memman.c $
+ *
+ */
+
+#include "memman.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+/*
+ * Prepare a caller-provided Spacetable for use.
+ *
+ * Allocates the slot array with numberofblocks entries and marks every
+ * slot as unused (no pointer, no call site, zero cells). Both search
+ * cursors start at 0.
+ */
+void initmemoryblocks(Spacetable *st, int numberofblocks) {
+
+    int i;
+
+    st->numberofblocks = numberofblocks;
+    st->lastalloced = 0;
+    st->lastfreed = 0;
+    st->blocks = (Spaceblock*) malloc(st->numberofblocks*sizeof(Spaceblock));
+    /* the loop below writes through st->blocks whenever there are slots */
+    assert(st->blocks != NULL || st->numberofblocks <= 0);
+
+    /*init blocks*/
+    for (i = 0; i < st->numberofblocks; i++) {
+        st->blocks[i].spaceptr = NULL;
+        st->blocks[i].fileallocated = NULL;
+        st->blocks[i].lineallocated = 0;
+        st->blocks[i].sizeofcell = 0;
+        st->blocks[i].numberofcells = 0;
+    }
+
+}
+
+/*
+ * Allocate or resize a tracked memory block.
+ *
+ * file, line - call site, stored in the slot for later reports
+ * st         - table that tracks the block
+ * ptr        - NULL to allocate a new block, otherwise a pointer that
+ *              was returned earlier and is to be resized
+ * size       - size of one cell in bytes
+ * number     - number of cells
+ *
+ * Returns the (possibly moved) block of number * size bytes.
+ *
+ * New blocks go into the first slot with numberofcells == 0 found at or
+ * after the search cursor; if there is none the table grows by one
+ * slot. A resize looks the slot up by pointer and asserts it exists.
+ */
+void *allocmemory(char *file, int line, Spacetable *st, void *ptr, int size, int number) {
+
+    int i;
+    /* multiply in size_t so that large requests do not overflow int */
+    size_t nbytes = (size_t) number * (size_t) size;
+    Spaceblock *table;
+    void *mem;
+
+    /*alloc new block*/
+    if (ptr == NULL) {
+
+        /* a recorded freed slot moves the search cursor */
+        if (st->lastfreed != 0)
+            st->lastalloced = st->lastfreed;
+
+        /*find free block*/
+        for (i = st->lastalloced; i < st->numberofblocks; i++) {
+            if (st->blocks[i].numberofcells == 0) break;
+        }
+
+        /* no free slot found: append one to the table */
+        if (i == st->numberofblocks) {
+            table = (Spaceblock*) realloc(st->blocks,
+                ((size_t) st->numberofblocks + 1) * sizeof(Spaceblock));
+            assert(table != NULL);
+            st->blocks = table;
+            st->numberofblocks++;
+        }
+
+        mem = malloc(nbytes);
+        /* a zero-byte request may legitimately yield NULL */
+        assert(mem != NULL || nbytes == 0);
+
+        st->blocks[i].sizeofcell = size;
+        st->blocks[i].numberofcells = number;
+        st->blocks[i].fileallocated = file;
+        st->blocks[i].lineallocated = line;
+        st->blocks[i].spaceptr = mem;
+        st->lastalloced = i;
+        st->lastfreed = 0;
+
+        return mem;
+    }
+
+    /*resize block*/
+
+    /*get blockno*/
+    for (i = 0; i < st->numberofblocks; i++) {
+        if (st->blocks[i].spaceptr == ptr) break;
+    }
+    assert(i < st->numberofblocks);
+
+    mem = realloc(st->blocks[i].spaceptr, nbytes);
+    assert(mem != NULL);
+
+    st->blocks[i].sizeofcell = size;
+    st->blocks[i].numberofcells = number;
+    st->blocks[i].lineallocated = line;
+    st->blocks[i].fileallocated = file;
+    st->blocks[i].spaceptr = mem;
+
+    return mem;
+}
+
+/*
+ * Release a tracked block.
+ *
+ * Looks for the slot whose pointer equals ptr and that still has cells.
+ * If there is none, a message naming the call site (line, file) is
+ * printed and the program exits with status -1. Otherwise the memory is
+ * freed and the slot is marked unused.
+ */
+void freememory(char* file, int line, Spacetable *st, void *ptr) {
+    int i;
+
+    for (i = 0; i < st->numberofblocks; i++) {
+        if (st->blocks[i].spaceptr == ptr && st->blocks[i].numberofcells > 0) break;
+    }
+
+    if (i >= st->numberofblocks) {
+        printf("Attempt to free unallocated spaceblock in line %d, %s \n",line,file);
+        exit(-1);
+    }
+
+    st->blocks[i].numberofcells = 0;
+    st->blocks[i].sizeofcell = 0;
+    free(st->blocks[i].spaceptr);
+    st->blocks[i].spaceptr = NULL;
+
+    /*
+     * Record the lowest freed slot. allocmemory() starts its search at
+     * lastfreed when it is non-zero; without this the field is never
+     * set and slots below the allocation cursor are not reused.
+     * (0 doubles as "nothing recorded", so slot 0 gives no hint.)
+     */
+    if (st->lastfreed == 0 || i < st->lastfreed)
+        st->lastfreed = i;
+
+    return;
+}
+
+/*
+ * Print one line to stdout for every slot that currently holds cells:
+ * slot index, cell count and the call site stored in the slot.
+ */
+void activeblocks(Spacetable *st) {
+    int k;
+
+    for (k = 0; k < st->numberofblocks; k++) {
+        Spaceblock *blk = &st->blocks[k];
+
+        if (blk->numberofcells <= 0) {
+            continue;
+        }
+        printf("# active block %d: allocated with with %d cells in file \"%s\", line %d \n", k, blk->numberofcells, blk->fileallocated, blk->lineallocated);
+    }
+}
+
+/*
+ * Report the first slot that still holds cells, if any: its index, cell
+ * count, cell size and stored call site are printed to stdout. Later
+ * slots are not examined once one has been reported.
+ */
+void checkspaceleak(Spacetable *st) {
+    int k = 0;
+
+    while (k < st->numberofblocks) {
+        Spaceblock *blk = &st->blocks[k];
+
+        if (blk->numberofcells > 0) {
+            printf("space leak: memory for block.%d not freed \n %d cells of size %d \n", k, blk->numberofcells, blk->sizeofcell);
+            printf("allocated in file \"%s\", line %d \n", blk->fileallocated, blk->lineallocated);
+            return;
+        }
+        k++;
+    }
+}
+
diff --git a/segemehl/libs/memman.h b/segemehl/libs/memman.h
new file mode 100644
index 0000000..bbf0f71
--- /dev/null
+++ b/segemehl/libs/memman.h
@@ -0,0 +1,67 @@
+/*
+ This file is part of gdub.
+ (C) 2006 Steve Hoffmann
+
+ gdub is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2, or (at your
+ option) any later version.
+
+ gdub is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with gdub; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+
+ */
+
+/**
+* @file memman.h
+ * @author Steve Hoffmann
+ * @brief declarations of functions for memory management
+ */
+
+/*
+ * $Log$
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: memman.h 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/memman.h $
+ */
+
+#ifndef MEMMAN_H
+#define MEMMAN_H
+
+/* One slot of the allocation table: a tracked block and its call site. */
+typedef struct
+{
+ void *spaceptr; /* the tracked memory; NULL after initialisation or free */
+ int sizeofcell, numberofcells; /* cell size in bytes and cell count; a count of 0 marks the slot as unused */
+ char* fileallocated; /* file name given at the last (re)allocation */
+ int lineallocated; /* line number given at the last (re)allocation */
+
+} Spaceblock;
+
+/* The allocation table itself. */
+typedef struct
+{
+ int numberofblocks, /* number of slots in blocks */
+ lastalloced, /* slot index where allocmemory starts looking for an unused slot */
+ lastfreed; /* if non-zero, allocmemory copies it into lastalloced before searching, then resets it to 0 */
+ Spaceblock *blocks; /* slot array, grown by allocmemory when no unused slot is found */
+
+} Spacetable;
+
+/* Fill in a caller-provided table with numberofblocks unused slots. */
+void initmemoryblocks(Spacetable *st, int numberofblocks);
+/* Allocate (ptr == NULL) or resize (ptr != NULL) a tracked block of number cells of size bytes. */
+void *allocmemory(char* file, int line, Spacetable *st, void* ptr, int size, int number);
+/* Free a tracked block; exits the program if ptr is not a tracked, live block. */
+void freememory(char* file, int line, Spacetable *st, void *ptr);
+/* Print every slot that still holds cells. */
+void activeblocks(Spacetable *st);
+/* Print the first slot that still holds cells, if any. */
+void checkspaceleak(Spacetable *st);
+
+#endif
diff --git a/segemehl/libs/memory.c b/segemehl/libs/memory.c
new file mode 100644
index 0000000..52fdb53
--- /dev/null
+++ b/segemehl/libs/memory.c
@@ -0,0 +1,27 @@
+
+/*
+ * memory.c
+ *
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 02.01.2010 00:11:46 CET
+ *
+ */
+
+#include "memory.h"
+#include <stdlib.h>
+
+/*
+ * realloc() wrapper that does not return on failure.
+ *
+ * ptr - block to resize, or NULL to allocate a new one
+ * sz  - requested size in bytes
+ *
+ * Returns the pointer handed back by realloc(). A NULL result trips the
+ * assert; if asserts are compiled out (NDEBUG) the program is aborted
+ * explicitly instead of passing NULL on to the caller.
+ */
+void*
+bl_realloc(void *ptr, size_t sz) {
+    void *grown = realloc(ptr, sz);
+
+    assert(grown != NULL);
+    if (grown == NULL) {
+        abort();
+    }
+    return grown;
+}
+
+/*
+ * calloc() wrapper that does not return on failure.
+ *
+ * ptr   - ignored; the incoming value is never read
+ * nelem - number of elements
+ * sz    - size of one element in bytes
+ *
+ * Returns the zero-filled block from calloc(). A NULL result trips the
+ * assert; if asserts are compiled out (NDEBUG) the program is aborted
+ * explicitly instead of passing NULL on to the caller.
+ */
+void*
+bl_calloc(void *ptr, size_t nelem, size_t sz) {
+    void *zeroed = calloc(nelem, sz);
+
+    (void) ptr;
+
+    assert(zeroed != NULL);
+    if (zeroed == NULL) {
+        abort();
+    }
+    return zeroed;
+}
diff --git a/segemehl/libs/memory.h b/segemehl/libs/memory.h
new file mode 100644
index 0000000..d8a6072
--- /dev/null
+++ b/segemehl/libs/memory.h
@@ -0,0 +1,19 @@
+#ifndef MEMORY_H
+#define MEMORY_H
+/*
+#include "memman.h"
+#include "memmac.h"
+*/
+#include <assert.h>
+#include <stdlib.h>
+
+/*
+ * Allocation macros. The first argument X is accepted and ignored.
+ *
+ *   ALLOCMEMORY(X, PTR, TYPE, SIZE)  resize PTR (or allocate if NULL) to
+ *                                    SIZE elements of TYPE
+ *   CALLOCMEMORY(X, TYPE, SIZE)      allocate SIZE zeroed elements of TYPE
+ *   FREEMEMORY(X, PTR)               free PTR and set it to NULL
+ *
+ * FREEMEMORY is wrapped in do { } while (0) so that it behaves as one
+ * statement, e.g. as the body of an unbraced if. PTR is evaluated twice.
+ */
+#define ALLOCMEMORY(X,PTR,TYPE,SIZE) bl_realloc((PTR),sizeof(TYPE)*(SIZE))
+#define CALLOCMEMORY(X,TYPE,SIZE) bl_calloc(NULL,(SIZE),sizeof(TYPE))
+
+#define FREEMEMORY(X,PTR) do { free(PTR); (PTR) = NULL; } while (0)
+
+void* bl_realloc(void *, size_t);
+void* bl_calloc(void *, size_t, size_t);
+
+#endif
+
diff --git a/segemehl/libs/merge.c b/segemehl/libs/merge.c
new file mode 100644
index 0000000..890f4d3
--- /dev/null
+++ b/segemehl/libs/merge.c
@@ -0,0 +1,626 @@
+/*
+ * merge.c
+ * functions to merge matches
+ *
+ * SVN
+ * Revision of last commit: $Rev: 348 $
+ * Author: $Author: steve $
+ * Date: $Date: 2012-08-24 08:46:52 -0400 (Fri, 24 Aug 2012) $
+ *
+ * Id: $Id: merge.c 348 2012-08-24 12:46:52Z steve $
+ * Url: $URL: http://www2.bioinf.uni-leipzig.de/svn5/segemehl/libs/merge.c $
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "debug.h"
+#include "info.h"
+#include "basic-types.h"
+#include "stringutils.h"
+#include "mathematics.h"
+#include "biofiles.h"
+#include "fileBins.h"
+#include "matchfilesfields.h"
+#include "merge.h"
+
+
+
+/*------------------------- bl_mergefilesInit ---------------------------------
+ *
+ * @brief init container for multiple merge files
+ * @author Christian Otto
+ *
+ */
+void bl_mergefilesInit(void *space, bl_mergefiles_t *files, Uint nooffiles){
+    /* record the capacity, then reserve one bl_mergefile_t per file */
+    files->nooffiles = nooffiles;
+    files->f = ALLOCMEMORY(space, NULL, bl_mergefile_t, nooffiles);
+}
+
+
+/*----------------------- bl_mergefilesDestruct -------------------------------
+ *
+ * @brief destruct container for multiple merge files
+ * @author Christian Otto
+ *
+ */
+void bl_mergefilesDestruct(void *space, bl_mergefiles_t *files){
+    /* the container is empty afterwards in either case */
+    files->nooffiles = 0;
+
+    if (files->f == NULL){
+        return;
+    }
+    /* release the array of merge files */
+    FREEMEMORY(space, files->f);
+    files->f = NULL;
+}
+
+/*--------------------------- bl_mergefileInit --------------------------------
+ *
+ * @brief init merge file container
+ * @author Christian Otto
+ *
+ */
+void bl_mergefileInit(void *space, bl_mergefile_t *file, FILE *fp){
+    /* one empty entry that will hold the current match (and mate) */
+    file->entry = ALLOCMEMORY(space, NULL, bl_mergefilematch_t, 1);
+    bl_mergefilematchInit(space, file->entry);
+
+    /* nothing read yet from the given stream */
+    file->complete = 0;
+    file->eof = 0;
+    file->fp = fp;
+}
+
+
+/*------------------------ bl_mergefileDestruct -------------------------------
+ *
+ * @brief destruct merge file container
+ * @author Christian Otto
+ *
+ */
+void bl_mergefileDestruct(void *space, bl_mergefile_t *file){
+    /* release what the entry holds, then the entry itself */
+    bl_mergefilematchDestruct(space, file->entry);
+    FREEMEMORY(space, file->entry);
+    file->entry = NULL;
+
+    /* the stream is only forgotten here, not closed */
+    file->complete = 0;
+    file->eof = 0;
+    file->fp = NULL;
+}
+
+
+/*------------------------ bl_mergefilematchInit ------------------------------
+ *
+ * @brief init merge file entry
+ * @author Christian Otto
+ *
+ */
+void bl_mergefilematchInit(void *space, bl_mergefilematch_t *entry){
+    /* no strings attached yet */
+    entry->match = NULL;
+    entry->matematch = NULL;
+    entry->key = NULL;
+    entry->rname = NULL;
+
+    /* lengths of the strings above */
+    entry->matchlen = 0;
+    entry->matematchlen = 0;
+    entry->keylen = 0;
+
+    /* per-fragment values, query first, mate second */
+    entry->flag = 0;
+    entry->mateflag = 0;
+    entry->edist = 0;
+    entry->mateedist = 0;
+    entry->noofmatches = 0;
+    entry->noofmatematches = 0;
+
+    entry->rstart = 0;
+}
+
+
+/*---------------------- bl_mergefilematchDestruct ----------------------------
+ *
+ * @brief destruct merge file entry
+ * @author Christian Otto
+ *
+ */
+void bl_mergefilematchDestruct(void *space, bl_mergefilematch_t *entry){
+    /* release every string the entry holds; unset ones are skipped */
+    if (entry->match != NULL){
+        FREEMEMORY(space, entry->match);
+        entry->match = NULL;
+    }
+    if (entry->matematch != NULL){
+        FREEMEMORY(space, entry->matematch);
+        entry->matematch = NULL;
+    }
+    if (entry->key != NULL){
+        FREEMEMORY(space, entry->key);
+        entry->key = NULL;
+    }
+    if (entry->rname != NULL){
+        FREEMEMORY(space, entry->rname);
+        entry->rname = NULL;
+    }
+
+    /* put all remaining fields back to their initial values */
+    entry->matchlen = 0;
+    entry->matematchlen = 0;
+    entry->keylen = 0;
+    entry->flag = 0;
+    entry->mateflag = 0;
+    entry->edist = 0;
+    entry->mateedist = 0;
+    entry->noofmatches = 0;
+    entry->noofmatematches = 0;
+    entry->rstart = 0;
+}
+
+/*--------------------- bl_mergefilematchCompareFlag --------------------------
+ *
+ * @brief compare two merge file entries regarding SAM flag in case of
+ * paired-end reads (i.e. best pairing state), returns -1 if
+ * first is better, 1 if second is better, 0 otherwise
+ * @author Christian Otto
+ *
+ */
+int bl_mergefilematchCompareFlag(bl_mergefilematch_t *i, bl_mergefilematch_t *j){
+    int mateunmapped_i, mateunmapped_j;
+    int proper_i, proper_j;
+
+    /* only entries flagged as paired (bit 0 of i) are ranked */
+    if (!(i->flag & 1)) {
+        return 0;
+    }
+
+    /* bit 3: the entry with the bit clear ranks first */
+    mateunmapped_i = (i->flag >> 3) & 1;
+    mateunmapped_j = (j->flag >> 3) & 1;
+    if (mateunmapped_i != mateunmapped_j){
+        return mateunmapped_i - mateunmapped_j;
+    }
+
+    /* bit 1: the entry with the bit set ranks first */
+    proper_i = (i->flag >> 1) & 1;
+    proper_j = (j->flag >> 1) & 1;
+    if (proper_i != proper_j){
+        return proper_j - proper_i;
+    }
+
+    return 0;
+}
+
+/*-------------------- bl_mergefilematchCompareEdist --------------------------
+ *
+ * @brief compare two merge file entries regarding edit distance (or
+ * pair edit distance in case of paired-end reads), returns -1
+ * if first is better, 1 if second is better, 0 otherwise
+ * @author Christian Otto
+ *
+ */
+int bl_mergefilematchCompareEdist(bl_mergefilematch_t *i, bl_mergefilematch_t *j){
+    /* combined edit distance of fragment and mate for each entry */
+    int total_i = i->edist + i->mateedist;
+    int total_j = j->edist + j->mateedist;
+
+    if (total_i < total_j){
+        return -1;
+    }
+    if (total_i > total_j){
+        return 1;
+    }
+    return 0;
+}
+
+
+/*
+ * Helper for bl_mergefileFastaIDCompare: compares the first token of a
+ * and b, cut the way strtok() cuts it (leading delimiter characters are
+ * skipped, the token ends at the next delimiter or at the end of the
+ * string). Two empty tokens compare equal. Returns 1 on equality, 0
+ * otherwise. Neither string is modified.
+ */
+static unsigned char
+bl_mergefileFirstTokenEqual(const char *a, const char *b, const char *delim) {
+    size_t alen, blen;
+
+    a += strspn(a, delim);
+    b += strspn(b, delim);
+    alen = strcspn(a, delim);
+    blen = strcspn(b, delim);
+
+    return (unsigned char) (alen == blen && memcmp(a, b, alen) == 0);
+}
+
+/*------------------------- bl_mergefileFastaIDCompare -------------------------------
+ *
+ * @brief compare two fasta descriptions if they contain the same fasta ID,
+ * in case of paired-end data, it disregards /1 or /2 at the end or
+ * any differences after the first white space
+ * returns 1 if both descriptions contain the same ID, 0 otherwise
+ *
+ * The descriptions are first compared up to the first '/', and only
+ * if that fails up to the first blank. Both must be NUL-terminated;
+ * desc1len and desc2len are not needed for the comparison and are
+ * kept for interface compatibility. The comparison works directly on
+ * the inputs: no copies are made and no strtok() state is touched,
+ * and descriptions without any token no longer crash.
+ * @author Christian Otto
+ *
+ */
+unsigned char
+bl_mergefileFastaIDCompare(char *desc1, Uint desc1len, char *desc2, Uint desc2len) {
+
+    unsigned char res;
+
+    (void) desc1len;
+    (void) desc2len;
+
+    res = bl_mergefileFirstTokenEqual(desc1, desc2, "/");
+    if (!res) {
+        res = bl_mergefileFirstTokenEqual(desc1, desc2, " ");
+    }
+    return res;
+}
+
+
+/*------------------------- bl_mergeParseLine ---------------------------------
+ *
+ * @brief parses a SAM-formatted line (single or paired-end) and
+ * inserts the data in the given container
+ * NOTE: split reads not supported up to now
+ * @author Christian Otto
+ *
+ */
+/*
+ * space - passed through to the allocation and stringset helpers
+ * entry - entry to fill; the first mapped line goes into the match
+ *         fields, the next one into the mate fields
+ * line  - the SAM line; the pointer itself is stored in entry->match or
+ *         entry->matematch (no copy is made)
+ * len   - length of line
+ *
+ * Returns 1 once the entry is complete (unpaired read, paired read whose
+ * other fragment is flagged unmapped, or mate just read), 0 otherwise.
+ * Lines flagged as unmapped are ignored and leave the entry untouched.
+ * Malformed input terminates the program via exit(-1).
+ *
+ * NOTE(review): on the ignored-line path `line` is neither stored nor
+ * freed here - confirm that the caller releases it.
+ */
+unsigned char bl_mergeParseLine(void *space, bl_mergefilematch_t *entry, char *line, Uint len){
+    unsigned char complete = 0;
+    char *tmp;
+    Uint tmplen, flag;
+    stringset_t *fields = NULL;
+
+    fields = tokensToStringset(space, "\t\007", line, len);
+    flag = bl_matchfileGetFlag(fields, SAM);
+
+    if ((flag >> 8) & 1){
+        DBG("Split reads not supported yet. Exit forced.\n", NULL);
+        exit(-1);
+    }
+    /* ignore unmapped fragments */
+    if ((flag >> 2) & 1){
+        destructStringset(space, fields);
+        return complete;
+    }
+
+    /*
+     * reading match
+     * (simply first fragment in
+     * output, not necessarily
+     * the read match)
+     */
+    if (entry->match == NULL){
+        entry->match = line;
+        entry->matchlen = len;
+
+        /* the query name becomes the key of the entry */
+        tmp = bl_matchfileGetQname(fields, SAM);
+        if (tmp == NULL){
+            DBG("Error in parsing line. Exit forced.\n", NULL);
+            exit(-1);
+        }
+        entry->keylen = strlen(tmp);
+        entry->key = ALLOCMEMORY(space, NULL, char, entry->keylen + 1);
+        memmove(entry->key, tmp, entry->keylen);
+        entry->key[entry->keylen] = '\0';
+
+        entry->flag = flag;
+        entry->edist = bl_matchfileGetEdist(fields, SAM);
+        entry->noofmatches = bl_matchfileGetMatchCnt(fields, SAM);
+
+        /* same guard as for the query name: strlen(NULL) must not happen */
+        tmp = bl_matchfileGetChrom(fields, SAM);
+        if (tmp == NULL){
+            DBG("Error in parsing line. Exit forced.\n", NULL);
+            exit(-1);
+        }
+        tmplen = strlen(tmp);
+        entry->rname = ALLOCMEMORY(space, NULL, char, tmplen + 1);
+        memmove(entry->rname, tmp, tmplen);
+        entry->rname[tmplen] = '\0';
+        entry->rstart = bl_matchfileGetStartPos(fields, SAM);
+
+        /* abort if non-valid flags if paired (either first or second in pair) */
+        if ((entry->flag & 1) && !(((entry->flag >> 6) & 1) ^
+                                   ((entry->flag >> 7) & 1))){
+            DBG("Incorrect flag information in paired-end match. Exit forced.\n", NULL);
+            exit(-1);
+        }
+
+        /* match complete if unpaired or unmapped other fragment */
+        if (!(entry->flag & 1) ||
+            ((entry->flag >> 3) & 1)){
+            complete = 1;
+        }
+    }
+    /*
+     * reading mate match
+     * (simply second fragment in
+     * output, not necessarily
+     * the mate match)
+     */
+    else {
+        entry->matematch = line;
+        entry->matematchlen = len;
+
+
+        /* abort if mateflag already set */
+        if (entry->mateflag & 1){
+            DBG("Error in reading paired-end matches. Exit forced.\n", NULL);
+            exit(-1);
+        }
+        entry->mateflag = flag;
+
+        /* abort if not equal query name */
+        tmp = bl_matchfileGetQname(fields, SAM);
+        if (tmp == NULL){
+            DBG("Error in parsing line. Exit forced.\n", NULL);
+            exit(-1);
+        }
+        if (! bl_mergefileFastaIDCompare(entry->key, entry->keylen, tmp, strlen(tmp))){
+            DBG("Error in reading paired-end matches. Exit forced.\n", NULL);
+            exit(-1);
+        }
+
+        /*
+         * abort with non-valid flags
+         * (mate unpaired, both/none first/second in pair)
+         */
+        if (!(entry->mateflag & 1) ||
+            !(((entry->flag >> 6) & 1) ^ ((entry->mateflag >> 6) & 1)) ||
+            !(((entry->flag >> 7) & 1) ^ ((entry->mateflag >> 7) & 1))) {
+            DBG("Incorrect flag information in paired-end matches. Exit forced.\n", NULL);
+            exit(-1);
+        }
+        entry->mateedist = bl_matchfileGetEdist(fields, SAM);
+        entry->noofmatematches = bl_matchfileGetMatchCnt(fields, SAM);
+
+        complete = 1;
+    }
+    destructStringset(space, fields);
+    return complete;
+}
+
+
+/*-------------------------- bl_mergeReadNext ---------------------------------
+ *
+ * @brief  read next match (and mate match) entry in mergefile:
+ *         lines are read from file->fp and passed to bl_mergeParseLine
+ *         until it reports a complete entry or EOF is reached.
+ *         On EOF file->eof is set; a non-empty partial line together with
+ *         an incomplete entry terminates the program.
+ *         Nothing is done if the current entry is already complete or
+ *         EOF was seen before.
+ * @param  space  handle passed through to the allocation macros
+ * @param  file   merge file; its entry, complete and eof fields are updated
+ * @author Christian Otto
+ *
+ */
+void bl_mergeReadNext(void *space, bl_mergefile_t *file){
+  Uint buffersize = 1024, len = 0;
+  char *buffer = NULL;
+  /*
+   * getc returns an int: only an int can tell EOF apart from every valid
+   * byte (a plain char either never equals EOF or mistakes byte 0xFF
+   * for it, depending on the signedness of char)
+   */
+  int ch = 0;
+
+  if (!file->complete && !file->eof){
+    buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+    len = 0;
+    while((ch = getc(file->fp)) != EOF) {
+      /* extend buffer */
+      if(len == buffersize-1) {
+        buffersize = 2*buffersize+1;
+        buffer = ALLOCMEMORY(space, buffer, char, buffersize);
+      }
+      /* process buffer (empty lines are skipped) */
+      if(ch == '\n' && len > 0) {
+        buffer[len++] = (char) ch;
+        buffer = ALLOCMEMORY(space, buffer, char, len+1);
+        buffer[len] = '\0';
+
+        file->complete = bl_mergeParseLine(space, file->entry, buffer, len);
+
+        if (file->complete){
+          break;
+        }
+        else {
+          /*
+           * the parser keeps the line pointer in the entry, hence a
+           * fresh buffer is started for the next line
+           * NOTE(review): the parser's early return for unmapped
+           * fragments does not store the line - looks like that
+           * buffer is leaked here; confirm
+           */
+          buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+          len = 0;
+          continue;
+        }
+      } else {
+        if(ch != '\n') buffer[len++] = (char) ch;
+      }
+    }
+    /* set end of file */
+    if (!file->eof && ch == EOF){
+      file->eof = 1;
+
+      if (len > 0 && !file->complete){
+        DBG("%u:%s\n", len, buffer);
+        DBG("Incomplete read matching entry at end of file. Exit forced.\n", NULL);
+        exit(-1);
+      }
+    }
+    /* a complete entry owns its line(s); otherwise drop the scratch buffer */
+    if (!file->complete){
+      FREEMEMORY(space, buffer);
+    }
+  }
+}
+
+
+/*------------------------- bl_mergeUpdateTag ---------------------------------
+ *
+ * @brief  replaces noofmatches in SAM tag in given input line,
+ *         input line and length is given as references.
+ *         The old line is freed and *line / *len are set to the newly
+ *         allocated line and its length. The program is terminated if
+ *         the line contains no "NH:i:" tag.
+ * @param  space        handle passed through to the allocation macros
+ * @param  line         in/out: line containing the NH:i: tag
+ * @param  len          out: length of the rewritten line
+ * @param  noofmatches  value written into the NH:i: tag
+ * @author Christian Otto
+ *
+ */
+void bl_mergeUpdateTag(void *space, char **line, Uint *len, Uint noofmatches){
+  Uint i, totallen;
+  char *pch, *res;
+
+  /* search for NH:i: tag in read match */
+  pch = strstr(*line, "NH:i:");
+  if (pch == NULL){
+    DBG("Error in updating NH TAG. Exit forced.\n", NULL);
+    exit(-1);
+  }
+
+  /*
+   * find end of tag; stop at the terminator instead of
+   * re-evaluating strlen in every iteration
+   */
+  for (i = 1; pch[i] != '\0'; i++){
+    if (ISWHITESPACE(pch[i]) || pch[i] == '\007'){
+      break;
+    }
+  }
+  /* cut the line at the tag: *line is now the prefix, pch+i the suffix */
+  pch[0] = '\0';
+  totallen = snprintf(NULL, 0, "%sNH:i:%u%s", *line, noofmatches, pch+i);
+  res = ALLOCMEMORY(space, NULL, char, totallen + 1);
+  snprintf(res, totallen + 1, "%sNH:i:%u%s", *line, noofmatches, pch+i);
+  res[totallen] = '\0';
+  FREEMEMORY(space, *line);
+  *len = totallen;
+  *line = res;
+}
+
+
+/*------------------------- se_mergeBisulfiteBins -----------------------------
+ *
+ * @brief merging of bisulfite bins according to given read order between
+ * different bisulfite matching runs for each bin separately
+ * NOTE: this may be threaded if necessary later
+ * @param space handle passed through to the allocation/free macros
+ * @param bsdomains one domain per matching run; all must have the same noofbins
+ * @param reads per-bin read sets; the descriptions give the merge order
+ * @param dev output stream used when chrdomains is NULL
+ * @param chrdomains optional output bins, selected by reference name and start
+ * @param remove if non-zero, every input bin file is removed after merging
+ * @param bestonly if non-zero, equal pairing states are ranked by edit distance
+ * @author Christian Otto
+ *
+ */
+void
+se_mergeBisulfiteBins (void *space, bl_fileBinDomains_t *bsdomains, fasta_t **reads,
+ FILE *dev, bl_fileBinDomains_t *chrdomains, unsigned char remove,
+ Uint bestonly){
+ Uint i, j, k, l, curlen, noofbins,
+ noofbest, allocbest = 1000;
+ int cmp;
+ char *curkey;
+ FILE *fp;
+ bl_mergefiles_t files;
+ bl_mergefilematch_t **best;
+
+ /* every domain must provide the same number of bins */
+ assert(bsdomains->noofdomains > 0);
+ noofbins = bsdomains->domain[0].bins.noofbins;
+ for (i = 1; i < bsdomains->noofdomains; i++){
+ if (bsdomains->domain[i].bins.noofbins != noofbins){
+ DBG("Inconsistent noofbins in domains. Exit forced.\n", NULL);
+ exit(-1);
+ }
+ }
+
+ /*
+ * NOTE(review): the elements stored in best are bl_mergefilematch_t*,
+ * the element type given here is bl_mergefilematch_t** - presumably
+ * harmless as both are pointers; confirm
+ */
+ best = ALLOCMEMORY(space, NULL, bl_mergefilematch_t **, allocbest);
+
+ for (i = 0; i < noofbins; i++){
+ NFO("Merging bisulfite bin %d.\n", i);
+ /* init and open files */
+ bl_mergefilesInit(space, &files, bsdomains->noofdomains);
+ for (k = 0; k < files.nooffiles; k++){
+ bl_mergefileInit(space, &files.f[k], bl_fileBinsOpen(space, &bsdomains->domain[k].bins.b[i], "r"));
+ }
+
+ /* perform merging */
+ for (j = 0; j < reads[i]->noofseqs; j++){
+ noofbest = 0;
+
+ /* get next key */
+ curkey = bl_fastaGetDescription(reads[i], j);
+ curlen = bl_fastaGetDescriptionLength(reads[i], j);
+ //DBG("queryfile:id=%d\tkey=%s\n", j, curkey);
+ /*
+ * find match(es) with current key,
+ * best pairing state, minimal edist (qry edist + mate edist)
+ */
+ for (k = 0; k < files.nooffiles; k++){
+ while (1){
+ /* read next entry */
+ if (!files.f[k].complete){
+ bl_mergeReadNext(space, &files.f[k]);
+ }
+ //DBG("files.f[%d]: curkey=%s\nmatch=%s\n", k, files.f[k].entry->key, files.f[k].entry->match);
+ /*
+ * end of file reached or next match with different key
+ * Note: allow for partial matches (match one to the end)
+ * due to clipping of /1 or /2 in paired-end data
+ */
+ if (files.f[k].eof || ! bl_mergefileFastaIDCompare(curkey, curlen, files.f[k].entry->key,files.f[k].entry->keylen)){
+ break;
+ }
+
+ /* compare current with previous best match (pairing state & edist) */
+ if (noofbest > 0){
+ cmp = bl_mergefilematchCompareFlag(best[0], files.f[k].entry);
+
+ /* compare edit distance only in case of best-only */
+ if (cmp == 0 && bestonly){
+ cmp = bl_mergefilematchCompareEdist(best[0], files.f[k].entry);
+ }
+ }
+ else {
+ cmp = 0;
+ }
+ //DBG("cmp=%d\n", cmp);
+
+ /* cmp > 0: current entry replaces all best ones, cmp == 0: it is appended */
+ if (cmp >= 0){
+ if (cmp > 0){
+ /* new best match found => destruct previous ones */
+ for (l = 0; l < noofbest; l++){
+ bl_mergefilematchDestruct(space, best[l]);
+ FREEMEMORY(space, best[l]);
+ }
+ noofbest = 0;
+ }
+ /* extend best buffer */
+ if (noofbest == allocbest - 1){
+ allocbest *= 2;
+ best = ALLOCMEMORY(space, best, bl_mergefilematch_t **, allocbest);
+ }
+ /* append current match to best */
+ best[noofbest++] = files.f[k].entry;
+
+ /* the entry now belongs to best; give the file a fresh one */
+ files.f[k].entry = ALLOCMEMORY(space, NULL, bl_mergefilematch_t, 1);
+ bl_mergefilematchInit(space, files.f[k].entry);
+ }
+ /* better match already found => clear data */
+ else {
+ bl_mergefilematchDestruct(space, files.f[k].entry);
+ }
+ /* force reading of the next entry in the next iteration */
+ files.f[k].complete = 0;
+ }
+ }
+
+ for (k = 0; k < noofbest; k++){
+ /* updating the 'number of matches' tag in SAM format (if necessary) */
+ if (best[k]->noofmatches != noofbest){
+ bl_mergeUpdateTag(space, &best[k]->match, &best[k]->matchlen, noofbest);
+ }
+ if (best[k]->matematch != NULL && best[k]->noofmatematches != noofbest){
+ bl_mergeUpdateTag(space, &best[k]->matematch, &best[k]->matematchlen, noofbest);
+ }
+
+ /* select output device */
+ if (chrdomains == NULL) {
+ fp = dev;
+ }
+ else {
+ fp = bl_fileBinsOpen(space, bl_fileBinsDomainGetBin(chrdomains, best[k]->rname,
+ best[k]->rstart), "w");
+ }
+
+ /* output found match */
+ fprintf(fp, "%s", best[k]->match);
+ //fprintf(stderr, "best: %s", best[k]->match);
+
+ if (best[k]->matematch != NULL){
+ /*
+ * NOTE(review): matername/materstart are not assigned in the
+ * visible mate branch of bl_mergeParseLine - verify they are
+ * set before being used for bin selection
+ */
+ if (chrdomains != NULL){
+ fp = bl_fileBinsOpen(space, bl_fileBinsDomainGetBin(chrdomains, best[k]->matername,
+ best[k]->materstart), "w");
+ }
+ fprintf(fp, "%s", best[k]->matematch);
+ //fprintf(stderr, "%s", best[k]->matematch);
+ }
+
+ /* clear data */
+ bl_mergefilematchDestruct(space, best[k]);
+ FREEMEMORY(space, best[k]);
+ }
+ }
+
+ for (k = 0; k < files.nooffiles; k++){
+ /* check whether match file is entirely processed */
+ bl_mergeReadNext(space, &files.f[k]);
+ if (!files.f[k].eof || files.f[k].complete){
+ DBG("Files not yet entirely processed. Exit forced.\n", NULL);
+ //DBG("files.f[%d]: key=%s\nmatch=%s\n", k, files.f[k].entry->match, files.f[k].entry->key);
+ exit(-1);
+ }
+ /* destruct */
+ bl_mergefileDestruct(space, &files.f[k]);
+ /* close file */
+ bl_fileBinsClose(&bsdomains->domain[k].bins.b[i]);
+ if (remove){
+ bl_rm(space, bsdomains->domain[k].bins.b[i].fname);
+ bsdomains->domain[k].bins.b[i].unlinked = 1;
+ }
+ }
+ bl_mergefilesDestruct(space, &files);
+ }
+ FREEMEMORY(space, best);
+}
diff --git a/segemehl/libs/merge.h b/segemehl/libs/merge.h
new file mode 100644
index 0000000..d4a078f
--- /dev/null
+++ b/segemehl/libs/merge.h
@@ -0,0 +1,97 @@
+#ifndef MERGE_H
+#define MERGE_H
+
+/*
+ * merge.h
+ * functions to merge matches
+ *
+ * SVN
+ * Revision of last commit: $Rev: 348 $
+ * Author: $Author: steve $
+ * Date: $Date: 2012-08-24 08:46:52 -0400 (Fri, 24 Aug 2012) $
+ *
+ * Id: $Id: merge.h 348 2012-08-24 12:46:52Z steve $
+ * Url: $URL: http://www2.bioinf.uni-leipzig.de/svn5/segemehl/libs/merge.h $
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "fileBins.h"
+#include "biofiles.h"
+
+/* format selector handed to the bl_matchfileGet* accessors when parsing a line */
+#define SAM 0
+
+/*
+ * one merge entry: the line of a match, optionally the line of its
+ * mate match, and the values extracted from these lines for merging
+ */
+typedef struct {
+ /* match (always first one read,
+ * not necessarily 1st in pair) */
+ char *match;
+ /* match length */
+ Uint matchlen;
+ /* mate match */
+ char *matematch;
+ /* mate match length */
+ Uint matematchlen;
+ /* key for merging */
+ char *key;
+ /* key length */
+ Uint keylen;
+ /* flag in SAM format */
+ Uint flag;
+ /* mate flag in SAM format */
+ Uint mateflag;
+ /* edit distance of qry alignment */
+ Uint edist;
+ /* edit distance of mate alignment */
+ Uint mateedist;
+ /* number of matches */
+ Uint noofmatches;
+ /* number of mate matches */
+ Uint noofmatematches;
+ /* reference name */
+ char *rname;
+ /* reference start */
+ Uint rstart;
+ /* strand */
+ char strand;
+ /* mate reference name */
+ char *matername;
+ /* mate reference start */
+ Uint materstart;
+ /* mate strand */
+ char matestrand;
+} bl_mergefilematch_t;
+
+/* reading state of a single input file during merging */
+typedef struct {
+ /* file pointer */
+ FILE *fp;
+ /* EOF read */
+ unsigned char eof;
+ /* current entry read */
+ bl_mergefilematch_t *entry;
+ /* current entry complete? */
+ unsigned char complete;
+} bl_mergefile_t;
+
+/* set of input files that are merged together */
+typedef struct {
+ /* number of files in f */
+ Uint nooffiles;
+ /* array of per-file reading states */
+ bl_mergefile_t *f;
+} bl_mergefiles_t;
+
+/* set up / tear down of file sets, single files and match entries */
+void bl_mergefilesInit(void *space, bl_mergefiles_t *files, Uint nooffiles);
+void bl_mergefilesDestruct(void *space, bl_mergefiles_t *files);
+void bl_mergefileInit(void *space, bl_mergefile_t *file, FILE *fp);
+void bl_mergefileDestruct(void *space, bl_mergefile_t *file);
+void bl_mergefilematchInit(void *space, bl_mergefilematch_t *match);
+/* comparisons used to rank entries and to match keys (non-zero = IDs agree) */
+int bl_mergefilematchCompareFlag(bl_mergefilematch_t *i, bl_mergefilematch_t *j);
+int bl_mergefilematchCompareEdist(bl_mergefilematch_t *i, bl_mergefilematch_t *j);
+unsigned char bl_mergefileFastaIDCompare(char *desc1, Uint desc1len, char *desc2, Uint desc2len);
+void bl_mergefilematchDestruct(void *space, bl_mergefilematch_t *match);
+/* parses one line into match; returns non-zero once the entry is complete */
+unsigned char bl_mergeParseLine(void *space, bl_mergefilematch_t *match, char *line, Uint len);
+/* reads lines from file until its entry is complete or EOF is reached */
+void bl_mergeReadNext(void *space, bl_mergefile_t *file);
+/* rewrites the NH:i: tag of *line to noofmatches (line is reallocated) */
+void bl_mergeUpdateTag(void *space, char **line, Uint *len, Uint noofmatches);
+/* merges the bins of several bisulfite matching runs in read order */
+void se_mergeBisulfiteBins (void *space, bl_fileBinDomains_t *bsdomains, fasta_t **reads,
+ FILE *dev, bl_fileBinDomains_t *chrdomains, unsigned char remove,
+ Uint bestonly);
+#endif /* MERGE_H */
diff --git a/segemehl/libs/ncursesext.c b/segemehl/libs/ncursesext.c
new file mode 100644
index 0000000..ecc74ab
--- /dev/null
+++ b/segemehl/libs/ncursesext.c
@@ -0,0 +1,78 @@
+
+/*
+ * ncursesext.c
+ * extensions for ncurses
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 10/16/2010 03:14:16 PM CEST
+ *
+ */
+#include <stdlib.h>
+
+#include <ncurses.h>
+#include "ncursesext.h"
+#include "basic-types.h"
+
+/*
+ * Creates a main window and an equally sized shadow window whose origin
+ * is shifted by +1 / +2 relative to the main window.
+ * xpos and ypos are forwarded as the third and fourth argument of
+ * newwin (begin_y and begin_x in the ncurses documentation).
+ * Returns NULL if the container cannot be allocated; release the result
+ * with delshadowedwin.
+ */
+SHADOWEDWINDOW*
+newshadowedwin(Uint nlines, Uint ncols, Uint xpos, Uint ypos) {
+  SHADOWEDWINDOW *w;
+
+  w = calloc(1, sizeof(SHADOWEDWINDOW));
+  /* all other shadowed-window functions accept NULL, so report failure that way */
+  if(!w) return NULL;
+
+  w->main = newwin(nlines, ncols, xpos, ypos);
+  w->shadow = newwin(nlines, ncols, xpos+1, ypos+2);
+  return w;
+}
+
+/* sets the background of the shadow, then of the main window; NULL is ignored */
+void
+shadowedwbkgd(SHADOWEDWINDOW *w, int maincol, int shadowcol) {
+  if (w != NULL) {
+    wbkgd(w->shadow, shadowcol);
+    wbkgd(w->main, maincol);
+  }
+}
+
+/* refreshes the shadow first so that the main window is drawn on top; NULL is ignored */
+void
+shadowedwrefresh(SHADOWEDWINDOW *w){
+  if (w != NULL) {
+    wrefresh(w->shadow);
+    wrefresh(w->main);
+  }
+}
+
+/* deletes both ncurses windows and frees the container; NULL is ignored */
+void
+delshadowedwin(SHADOWEDWINDOW *w) {
+  if (w != NULL) {
+    delwin(w->shadow);
+    delwin(w->main);
+    free(w);
+  }
+}
+
+
+/*
+ * creates a derived window inside the main window of w (see derwin);
+ * returns NULL if w is NULL
+ */
+WINDOW*
+dershadowedwin(SHADOWEDWINDOW* w, Uint nlines, Uint ncols, Uint xpos, Uint ypos) {
+  if (w == NULL) {
+    return NULL;
+  }
+  return derwin(w->main, nlines, ncols, xpos, ypos);
+}
+
+/* sets the attributes of win to attr, then puts ch at (y, x); returns the mvwaddch result */
+int
+mvwaddchattr(WINDOW *win, int y, int x, int attr, chtype ch) {
+  int status;
+
+  wattrset(win, attr);
+  status = mvwaddch(win, y, x, ch);
+  return status;
+}
+
+/*
+ * sets the attributes of win to attr, moves to (y, x) and prints the
+ * printf-style message there. Returns ERR if the cursor cannot be moved,
+ * otherwise the result of vw_printw.
+ */
+int
+mvwprintwattr(WINDOW *win, int y, int x, int attr, char *fmt, ...) {
+  va_list ap;
+  int ret;
+
+  wattrset(win, attr);
+  if(wmove(win, y, x) == ERR) return ERR;
+
+  /*
+   * mvwprintw is variadic itself and cannot consume a va_list;
+   * the va_list variant of wprintw has to be used instead
+   */
+  va_start(ap, fmt);
+  ret = vw_printw(win, fmt, ap);
+  va_end(ap);
+
+  return ret;
+}
+
diff --git a/segemehl/libs/ncursesext.h b/segemehl/libs/ncursesext.h
new file mode 100644
index 0000000..162505b
--- /dev/null
+++ b/segemehl/libs/ncursesext.h
@@ -0,0 +1,28 @@
+
+/*
+ *
+ * ncursesext.h
+ * extensions to ncurses
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 10/16/2010 03:13:29 PM CEST
+ *
+ */
+
+#ifndef NCURSESEXT_H
+#define NCURSESEXT_H
+
+#include <ncurses.h>
+#include "basic-types.h"
+
+/* a window together with a second window that is drawn as its shadow */
+typedef struct {
+  WINDOW *main;
+  WINDOW *shadow;
+} SHADOWEDWINDOW;
+
+
+/* background, refresh, creation, derivation and deletion of shadowed windows */
+void shadowedwbkgd(SHADOWEDWINDOW *W, int maincol, int shadowcol);
+void shadowedwrefresh(SHADOWEDWINDOW *w);
+SHADOWEDWINDOW* newshadowedwin(Uint nlines, Uint ncols, Uint xpos, Uint ypos);
+WINDOW* dershadowedwin(SHADOWEDWINDOW*, Uint nlines, Uint ncols, Uint xpos, Uint ypos);
+void delshadowedwin(SHADOWEDWINDOW *w);
+/* output at a position with the given attributes */
+int mvwaddchattr(WINDOW *win, int y, int x, int attr, chtype ch);
+int mvwprintwattr(WINDOW *win, int y, int x, int attr, char *fmt, ...);
+
+#endif /* NCURSESEXT_H */
diff --git a/segemehl/libs/newton.c b/segemehl/libs/newton.c
new file mode 100644
index 0000000..98f1a59
--- /dev/null
+++ b/segemehl/libs/newton.c
@@ -0,0 +1,88 @@
+
+/*
+ * newton.c
+ * newton method
+ *
+ * @author Steve Hoffmann
+ * @email shoffmann at zbh.uni-hamburg.de
+ * @date 03/24/07 03:48:19 CET
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: newton.c 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/newton.c $
+ */
+
+#include <stdio.h>
+#include <math.h>
+
+/* Newton iteration on f starting at x_0; reports steps and convergence via the pointers */
+double newton(double x_0, double tol, int max_iters,
+ int* iters_p, int* converged_p);
+/* function whose root is searched and its derivative */
+double f(double x);
+double f_prime(double x);
+
+/*
+ * reads x_0, tol and max_iters from stdin, runs the Newton iteration and
+ * prints the result. Returns 1 if the input cannot be parsed, 0 otherwise.
+ */
+int main() {
+  double x_0; /* Initial guess */
+  double x; /* Approximate solution */
+  double tol; /* Maximum error */
+  int max_iters; /* Maximum number of iterations */
+  int iters; /* Actual number of iterations */
+  int converged; /* Whether iteration converged */
+
+  printf("Enter x_0, tol, and max_iters\n");
+  /* without this check the iteration would run on uninitialised values */
+  if (scanf("%lf %lf %d", &x_0, &tol, &max_iters) != 3) {
+    fprintf(stderr, "Invalid input: expected x_0, tol, and max_iters\n");
+    return 1;
+  }
+
+  x = newton(x_0, tol, max_iters, &iters, &converged);
+
+  if (converged) {
+    printf("Newton algorithm converged after %d steps.\n",
+        iters);
+    printf("The approximate solution is %19.16e\n", x);
+    printf("f(%19.16e) = %19.16e\n", x, f(x));
+  } else {
+    printf("Newton algorithm didn't converge after %d steps.\n",
+        iters);
+    printf("The final estimate was %19.16e\n", x);
+    printf("f(%19.16e) = %19.16e\n", x, f(x));
+  }
+
+  return 0;
+} /* main */
+
+
+/*
+ * Newton iteration x <- x - f(x)/f_prime(x), started at x_0.
+ * At least one step is made; the iteration stops as soon as two
+ * successive estimates differ by at most tol or max_iters steps are done.
+ * *iters_p receives the number of steps, *converged_p is 1 if the last
+ * step was within tol and 0 otherwise. Returns the final estimate.
+ */
+double newton(double x_0, double tol, int max_iters,
+    int* iters_p, int* converged_p) {
+  double current = x_0;
+  double previous;
+  int steps = 0;
+
+  for (;;) {
+    steps++;
+    previous = current;
+    current = previous - f(previous)/f_prime(previous);
+
+    /* conditions are written negated so that NaN ends the loop as well */
+    if (!(fabs(current - previous) > tol)) break;
+    if (!(steps < max_iters)) break;
+  }
+
+  *converged_p = (fabs(current - previous) <= tol) ? 1 : 0;
+  *iters_p = steps;
+
+  return current;
+} /* newton algorithm */
+
+
+/* f(x) = x^2 - 2 */
+double f(double x) {
+  const double square = x*x;
+
+  return square-2;
+} /* f */
+
+/* derivative of f: f'(x) = 2x */
+double f_prime(double x) {
+  const double slope = 2*x;
+
+  return slope;
+} /* f_prime */
+
+
+
+
diff --git a/segemehl/libs/nw.c b/segemehl/libs/nw.c
new file mode 100644
index 0000000..a7ce9d3
--- /dev/null
+++ b/segemehl/libs/nw.c
@@ -0,0 +1,402 @@
+
+/*
+ * nw.c
+ * needleman wunsch
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 11/15/2010 12:12:27 AM CET
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "basic-types.h"
+#include "memory.h"
+#include "mathematics.h"
+#include "alignment.h"
+#include <limits.h>
+#include "sw.h"
+
+
+/*--------------------------------- nwmatrix ---------------------------------
+ *
+ * needleman-wunsch global similarity alignment
+ * returns a matrix of size (m+1)*(n+1) where m is length of given sequence a
+ * and n the length of sequence b, respectively. Function expects
+ * a function to calculate a substitution score
+ *
+ * @param space handle passed through to the allocation macro
+ * @param a, m  first sequence and its length
+ * @param b, n  second sequence and its length
+ * @param indel score added for every insertion/deletion
+ * @param sub   substitution score function, called as sub(a[i-1], b[j-1], nfo)
+ * @param nfo   opaque argument handed to sub
+ * @return newly allocated score matrix (row-major, n+1 columns)
+ *
+ */
+
+ int*
+nwmatrix (void *space, symtype *a, Uint m, symtype *b, Uint n, int indel,
+    Sint (*sub)(symtype, symtype, void *), void *nfo)
+{
+  int i, j, cols, rows, size;
+  int *L;
+
+  rows = m+1;
+  cols = n+1;
+
+  size = rows*cols;
+  L = ALLOCMEMORY(space, NULL, int, size);
+  L = memset(L, 0, sizeof(int)*size);
+
+  /*
+   * boundaries are initialised independently of each other: the first
+   * row used to be set inside the loop over i and was therefore left
+   * at zero for an empty sequence a (m == 0)
+   */
+  for(i=1; i < m+1; i++) {
+    MATRIX2D(L, cols, i, 0) = i*indel;
+  }
+  for(j=1; j < n+1; j++) {
+    MATRIX2D(L, cols, 0, j) = j*indel;
+  }
+
+  for(i=1; i < m+1; i++) {
+    for(j=1; j < n+1; j++) {
+      MATRIX2D(L, cols, i, j) =
+        MAX3(
+            MATRIX2D(L, cols, (i-1), j) + indel ,
+            MATRIX2D(L, cols, i, (j-1)) + indel ,
+            MATRIX2D(L, cols, (i-1), (j-1)) + sub(a[i-1], b[j-1], nfo)
+            );
+    }
+  }
+
+  return L;
+}
+
+
+/*------------------------------- nwtraceback --------------------------------
+ *
+ * traceback through a matrix M (n+1 columns) starting at cell (m, n).
+ * In each step the cell above (Insertion), the cell to the left (Deletion)
+ * and the diagonal cell (Replacement) are tested in this order; the first
+ * one that explains the current score is taken. The walk ends at the
+ * first row or column, or at a cell none of the three explain (asserted
+ * to hold score 0). The end cell is stored in al->uoff / al->voff and
+ * the collected edit operations are reversed.
+ *
+ */
+
+ void
+nwtraceback (void *space, int *M,
+    symtype *a, Uint m, symtype *b, Uint n, int indel,
+    Sint (*sub)(symtype, symtype, void *), void *nfo, Alignment *al)
+{
+  Uint row, col, ncol, cur;
+
+  ncol = n+1;
+  row = m;
+  col = n;
+
+  al->uoff = 0;
+  al->voff = 0;
+
+  while(row > 0 && col > 0) {
+    cur = MATRIX2D(M, ncol, row, col);
+
+    if (MATRIX2D(M, ncol, row-1, col) + indel == cur) {
+      insertEop(al, Insertion);
+      row--;
+    } else if (MATRIX2D(M, ncol, row, col-1) + indel == cur) {
+      insertEop(al, Deletion);
+      col--;
+    } else if (MATRIX2D(M, ncol, row-1, col-1)+sub(a[row-1], b[col-1], nfo)
+        == cur) {
+      insertEop(al, Replacement);
+      row--; col--;
+    } else {
+      /* no predecessor found: stop here, the common exit records the cell */
+      assert(cur == 0);
+      break;
+    }
+  }
+
+  al->uoff = row;
+  al->voff = col;
+  revMeops(al);
+
+  return;
+}
+
+/*--------------------------------- sgmatrix ---------------------------------
+ *
+ * semi-global similarity alignment
+ * returns a newly allocated matrix of size (m+1)*(n+1) for sequence a of
+ * length m and sequence b of length n. The first row is zero, the first
+ * column holds i*indel; all other cells are the maximum of the two indel
+ * moves and the substitution move scored by sub.
+ *
+ */
+
+ int*
+sgmatrix (void *space, symtype *a, Uint m, symtype *b, Uint n, int indel,
+    Sint (*sub)(symtype, symtype, void *), void *nfo)
+{
+  int row, col, cols, rows, size;
+  int *L;
+
+  rows = m+1;
+  cols = n+1;
+  size = rows*cols;
+
+  L = ALLOCMEMORY(space, NULL, int, size);
+  L = memset(L, 0, sizeof(int)*size);
+
+  /* first column: cost of leading indels; first row stays zero */
+  for(row=1; row < m+1; row++) {
+    MATRIX2D(L, cols, row, 0) = row*indel;
+  }
+
+  for(row=1; row < m+1; row++) {
+    for(col=1; col < n+1; col++) {
+      MATRIX2D(L, cols, 0, col) = 0;
+
+      MATRIX2D(L, cols, row, col) =
+        MAX3(
+            MATRIX2D(L, cols, (row-1), col) + indel ,
+            MATRIX2D(L, cols, row, (col-1)) + indel ,
+            MATRIX2D(L, cols, (row-1), (col-1)) + sub(a[row-1], b[col-1], nfo)
+            );
+    }
+  }
+
+  return L;
+}
+
+/*------------------------------- sgtraceback --------------------------------
+ *
+ * traceback through a semi-global matrix M (n+1 columns). The walk starts
+ * in the last row at the column with the maximum score (column n unless a
+ * column j < n is strictly better; the first such column wins) and tests
+ * the cell above (Insertion), to the left (Deletion) and the diagonal
+ * (Replacement) in this order. al->uoff and al->voff are always left at
+ * zero; the collected edit operations are reversed at the end.
+ *
+ */
+
+ void
+sgtraceback (void *space, int *M,
+    symtype *a, Uint m, symtype *b, Uint n, int indel,
+    Sint (*sub)(symtype, symtype, void *), void *nfo, Alignment *al)
+{
+  Uint row, col, ncol, cur;
+  Uint best;
+
+  ncol = n+1;
+  row = m;
+
+  /* column of the maximum in the last row */
+  best = n;
+  for(col=0; col < n; col++) {
+    if(MATRIX2D(M, ncol, row, best) < MATRIX2D(M, ncol, row, col))
+      best = col;
+  }
+  col = best;
+
+  al->uoff = 0;
+  al->voff = 0;
+
+  while(row > 0 && col > 0) {
+    cur = MATRIX2D(M, ncol, row, col);
+
+    if (MATRIX2D(M, ncol, row-1, col) + indel == cur) {
+      insertEop(al, Insertion);
+      row--;
+    } else if (MATRIX2D(M, ncol, row, col-1) + indel == cur) {
+      insertEop(al, Deletion);
+      col--;
+    } else if (MATRIX2D(M, ncol, row-1, col-1)+sub(a[row-1], b[col-1], nfo)
+        == cur) {
+      insertEop(al, Replacement);
+      row--; col--;
+    } else {
+      /* no predecessor found: leave through the common exit below */
+      assert(cur == 0);
+      break;
+    }
+  }
+
+  al->uoff = 0;
+  al->voff = 0;
+  revMeops(al);
+
+  return;
+}
+
+
+/*--------------------------------- sgaffine ---------------------------------
+ *
+ * @brief  affine gap cost semi global alignment: allocates and fills the
+ *         three (m+1)*(n+1) matrices *A, *B (gap states) and *S (best score)
+ * @param  space  handle passed through to the allocation macro
+ * @param  A,B,S  out: newly allocated matrices (row-major, n+1 columns)
+ * @param  a, m   first sequence and its length
+ * @param  b, n   second sequence and its length
+ * @param  open   score added when a gap is opened
+ * @param  ext    score added for every gap position
+ * @param  close  not used by this function
+ * @param  sub    substitution score function, called as sub(a[i-1], b[j-1], nfo)
+ * @param  nfo    opaque argument handed to sub
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+sgaffinematrix (void *space, int **A, int **B, int **S, symtype *a, Uint m, symtype *b,
+    Uint n, int open, int ext, int close, Sint (*sub)(symtype, symtype, void*),
+    void *nfo)
+{
+  /*
+   * "minus infinity" for the boundary of B: INT_MIN itself overflows as
+   * soon as a negative gap score is added to it, half of it leaves room
+   */
+  const int neginf = INT_MIN / 2;
+  int i, j, cols, rows, size;
+
+  rows = m+1;
+  cols = n+1;
+
+  size = rows*cols;
+  //A is gaps in a, B is gaps in b and S is substitutions
+  *A = ALLOCMEMORY(space, NULL, int, size);
+  *B = ALLOCMEMORY(space, NULL, int, size);
+  *S = ALLOCMEMORY(space, NULL, int, size);
+
+  *A = memset(*A, 0, sizeof(int)*size);
+  *B = memset(*B, 0, sizeof(int)*size);
+  *S = memset(*S, 0, sizeof(int)*size);
+
+  //init for semi global gap affine alignment
+  for(i=0; i < m+1; i++) {
+    MATRIX2D(*S, cols, i, 0) = 0;
+    MATRIX2D(*B, cols, i, 0) = neginf;
+  }
+
+  /* the loop variable is j: incrementing i here never terminated */
+  for(j=0; j < n+1; j++) {
+    MATRIX2D(*S, cols, 0, j) = 0;
+    MATRIX2D(*B, cols, 0, j) = neginf;
+  }
+
+  for(i=1; i < m+1; i++) {
+    for(j=1; j < n+1; j++) {
+      /* gap state reached from the row above: extend or open */
+      MATRIX2D(*A, cols, i, j) =
+        MAX(
+            MATRIX2D(*A, cols, (i-1), j) + ext,
+            MATRIX2D(*S, cols, (i-1), j) + open + ext
+           );
+
+      /* gap state reached from the column to the left: extend or open */
+      MATRIX2D(*B, cols, i, j) =
+        MAX(
+            MATRIX2D(*B, cols, i, (j-1)) + ext,
+            MATRIX2D(*S, cols, i, (j-1)) + open + ext
+           );
+
+      MATRIX2D(*S, cols, i, j) =
+        MAX3(
+            MATRIX2D(*S, cols, (i-1), (j-1)) + sub(a[i-1], b[j-1], nfo),
+            MATRIX2D(*A, cols, i, j),
+            MATRIX2D(*B, cols, i, j)
+            );
+    }
+  }
+
+
+  return ;
+}
+
+
+/*---------------------------- sgaffinetraceback -----------------------------
+ *
+ * @brief trace back the affine gap cost alignment.
+ * The walk starts in the last row of S at the column with the
+ * maximum score and switches between the states 'S', 'A' and 'B',
+ * one per matrix, until the first row or column is reached.
+ * al->uoff and al->voff are left at zero, the collected edit
+ * operations are reversed at the end.
+ * @author Steve Hoffmann
+ *
+ */
+
+ void
+sgaffinetraceback(void *space, int *A, int *B, int *S, symtype *a, Uint m, symtype *b,
+ Uint n, int open, int ext, int close, Sint (*sub)(symtype, symtype, void*),
+ void *nfo, Alignment *al)
+
+{
+
+ Uint i, j, ncol, cur;
+ Uint maxcol;
+ char state = 'S';
+
+ ncol = n+1;
+ i = m;
+ maxcol = n;
+
+ /* column n is kept unless a column j < n holds a strictly larger score */
+ for(j=0; j < n; j++) {
+ if(MATRIX2D(S, ncol, i, maxcol) < MATRIX2D(S, ncol, i, j))
+ maxcol = j;
+ }
+
+ j = maxcol;
+
+ al->uoff = 0;
+ al->voff = 0;
+
+
+ /*
+ * the three state blocks are tested one after another within one
+ * iteration, i.e. a switch from 'S' to 'A' or 'B' is acted upon
+ * immediately
+ * NOTE(review): if none of the conditions in state 'S' holds, neither
+ * i nor j changes and the loop does not make progress - confirm this
+ * cannot happen for matrices filled by sgaffinematrix
+ */
+ while(i > 0 && j > 0) {
+ if (state == 'S') {
+ cur = MATRIX2D(S, ncol, i, j);
+ if (cur == MATRIX2D(S, ncol, (i-1), (j-1)) + sub(a[i-1], b[j-1], nfo)) {
+ //go diagonally dont change the state
+ insertEop(al, Replacement);
+ i--; j--;
+ } else if(cur == MATRIX2D(A, ncol, i, j)) {
+ //move up
+ state = 'A';
+ } else if(cur == MATRIX2D(B, ncol, i, j)) {
+ //move left
+ state = 'B';
+ }
+ }
+
+ /*
+ * NOTE(review): sgaffinematrix opens a gap with S + open + ext, the
+ * tests below compare against A (resp. B) + open - looks like a
+ * different recurrence; verify
+ */
+ if(state == 'A') {
+ cur = MATRIX2D(A, ncol, i, j);
+ if(cur == MATRIX2D(A, ncol, (i-1), j) + open) {
+ //if this was an opening - change the state to S
+ state = 'S';
+ }
+ //but move upwards anyways
+ insertEop(al, Insertion);
+ i--;
+ }
+
+ if(state == 'B') {
+ cur = MATRIX2D(B, ncol, i, j);
+ if(cur == MATRIX2D(B, ncol, i, (j-1)) + open) {
+ //if this was an opening - change the state to S
+ state = 'S';
+ }
+ //but move leftwards anyways
+ insertEop(al, Deletion);
+ j--;
+ }
+ }
+
+ al->uoff = 0;
+ al->voff = 0;
+ revMeops(al);
+
+ return ;
+}
+
+
+/*----------------------------- sgaffinecircular -----------------------------
+ *
+ * @brief get circular matches with affine gap cost semi global alignment
+ * (not implemented: the function body is empty)
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+sgaffinecircular ( )
+{
+
+ //take reference and double
+
+
+ return ;
+}
diff --git a/segemehl/libs/nw.h b/segemehl/libs/nw.h
new file mode 100644
index 0000000..8a2d38e
--- /dev/null
+++ b/segemehl/libs/nw.h
@@ -0,0 +1,36 @@
+
+/*
+ *
+ * nw.h
+ *
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 11/15/2010 12:52:56 AM CET
+ *
+ */
+
+#ifndef NW_H
+#define NW_H
+
+#include "basic-types.h"
+#include "alignment.h"
+#include "sw.h"
+
+/* global alignment: score matrix of size (m+1)*(n+1) */
+ int*
+nwmatrix (void *space, symtype *a, Uint m, symtype *b, Uint n, int indel,
+    Sint (*sub)(symtype, symtype, void *), void *nfo);
+
+
+/* traceback through a matrix M starting at cell (m, n) */
+ void
+nwtraceback (void *space, int *M,
+    symtype *a, Uint m, symtype *b, Uint n, int indel,
+    Sint (*sub)(symtype, symtype, void *), void *nfo, Alignment *al);
+
+/* semi-global alignment: score matrix of size (m+1)*(n+1), first row zero */
+ int*
+sgmatrix (void *space, symtype *a, Uint m, symtype *b, Uint n, int indel,
+    Sint (*sub)(symtype, symtype, void *), void *nfo);
+
+
+/* traceback through a matrix M starting at the maximum of the last row */
+ void
+sgtraceback (void *space, int *M,
+    symtype *a, Uint m, symtype *b, Uint n, int indel,
+    Sint (*sub)(symtype, symtype, void *), void *nfo, Alignment *al);
+
+#endif /* NW_H */
+
diff --git a/segemehl/libs/physmem.c b/segemehl/libs/physmem.c
new file mode 100644
index 0000000..31f93bc
--- /dev/null
+++ b/segemehl/libs/physmem.c
@@ -0,0 +1,332 @@
+
+
+/* Calculate the size of physical memory.
+ * Copyright 2000, 2001, 2003 Free Software Foundation, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
+
+/* Written by Paul Eggert. */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#if HAVE_UNISTD_H
+# include <unistd.h>
+#endif
+
+#if HAVE_SYS_PSTAT_H
+# include <sys/pstat.h>
+#endif
+
+#if HAVE_SYS_SYSMP_H
+# include <sys/sysmp.h>
+#endif
+
+#if HAVE_SYS_SYSINFO_H && HAVE_MACHINE_HAL_SYSINFO_H
+# include <sys/sysinfo.h>
+# include <machine/hal_sysinfo.h>
+#endif
+
+#if HAVE_SYS_TABLE_H
+# include <sys/table.h>
+#endif
+
+#include <sys/types.h>
+
+#if HAVE_SYS_PARAM_H
+# include <sys/param.h>
+#endif
+
+
+
+
+#if HAVE_SYS_SYSCTL_H
+
+# include <sys/sysctl.h>
+#endif
+
+#if HAVE_SYS_SYSTEMCFG_H
+# include <sys/systemcfg.h>
+#endif
+
+#ifdef _WIN32
+# define WIN32_LEAN_AND_MEAN
+# include <windows.h>
+
+
+
+/*
+ * local declaration of the structure that is passed to
+ * GlobalMemoryStatusEx; that function is looked up at run time via
+ * GetProcAddress in physmem_total and physmem_available
+ */
+typedef struct
+
+{
+ DWORD dwLength;
+ DWORD dwMemoryLoad;
+ DWORDLONG ullTotalPhys;
+ DWORDLONG ullAvailPhys;
+ DWORDLONG ullTotalPageFile;
+ DWORDLONG ullAvailPageFile;
+ DWORDLONG ullTotalVirtual;
+ DWORDLONG ullAvailVirtual;
+ DWORDLONG ullAvailExtendedVirtual;
+}
+lMEMORYSTATUSEX;
+/* type of the function pointer obtained for "GlobalMemoryStatusEx" */
+typedef WINBOOL (WINAPI *PFN_MS_EX) (lMEMORYSTATUSEX*);
+#endif
+/* number of elements of an array (the argument must be an array, not a pointer) */
+#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
+//#include "libiberty.h"
+
+/* Return the total amount of physical memory.
+ * The platform specific methods that were compiled in are tried one
+ * after another; the first one that succeeds determines the result.
+ * Returns 0 if no method yields a value. Progress messages are written
+ * to stderr. */
+ double
+physmem_total ()
+
+{
+  fprintf(stderr,"enter total\n");
+#if defined _SC_PHYS_PAGES && defined _SC_PAGESIZE
+  { /* This works on linux-gnu, solaris2 and cygwin. */
+    double pages = sysconf (_SC_PHYS_PAGES);
+    double pagesize = sysconf (_SC_PAGESIZE);
+    if (0 <= pages && 0 <= pagesize)
+      return pages * pagesize;
+  }
+#endif
+
+#if HAVE_PSTAT_GETSTATIC
+  { /* This works on hpux11. */
+    struct pst_static pss;
+    if (0 <= pstat_getstatic (&pss, sizeof pss, 1, 0))
+    {
+      double pages = pss.physical_memory;
+      double pagesize = pss.page_size;
+      if (0 <= pages && 0 <= pagesize)
+        return pages * pagesize;
+    }
+  }
+#endif
+
+#if HAVE_SYSMP && defined MP_SAGET && defined MPSA_RMINFO && defined _SC_PAGESIZE
+  { /* This works on irix6. */
+    struct rminfo realmem;
+    if (sysmp (MP_SAGET, MPSA_RMINFO, &realmem, sizeof realmem) == 0)
+    {
+      double pagesize = sysconf (_SC_PAGESIZE);
+      double pages = realmem.physmem;
+      if (0 <= pages && 0 <= pagesize)
+        return pages * pagesize;
+    }
+  }
+#endif
+
+#if HAVE_GETSYSINFO && defined GSI_PHYSMEM
+  { /* This works on Tru64 UNIX V4/5. */
+    int physmem;
+
+    if (getsysinfo (GSI_PHYSMEM, (caddr_t) &physmem, sizeof (physmem),
+          NULL, NULL, NULL) == 1)
+    {
+      double kbytes = physmem;
+
+      if (0 <= kbytes)
+        return kbytes * 1024.0;
+    }
+  }
+#endif
+
+/* #ifdef evaluates a single macro name only and ignores the rest of the
+ * line, so the HW_PHYSMEM test was never made; #if evaluates both */
+#if HAVE_SYS_SYSCTL_H && defined HW_PHYSMEM
+  { /* This works on *bsd and darwin. */
+    unsigned long int physmem;
+    size_t len = sizeof physmem;
+    static int mib[2] = { CTL_HW, HW_PHYSMEM };
+    fprintf(stderr, "mac os x identified.\n");
+
+    if (sysctl (mib, ARRAY_SIZE(mib), &physmem, &len, NULL, 0) == 0
+        && len == sizeof (physmem))
+      return (double) physmem;
+  }
+#endif
+
+#if HAVE__SYSTEM_CONFIGURATION
+  /* This works on AIX 4.3.3+. */
+  return _system_configuration.physmem;
+#endif
+
+#if defined _WIN32
+  { /* this works on windows */
+    PFN_MS_EX pfnex;
+    HMODULE h = GetModuleHandle ("kernel32.dll");
+
+    if (!h)
+      return 0.0;
+
+    /* Use GlobalMemoryStatusEx if available. */
+    if ((pfnex = (PFN_MS_EX) GetProcAddress (h, "GlobalMemoryStatusEx")))
+    {
+      lMEMORYSTATUSEX lms_ex;
+      lms_ex.dwLength = sizeof lms_ex;
+      if (!pfnex (&lms_ex))
+        return 0.0;
+      return (double) lms_ex.ullTotalPhys;
+    }
+
+    /* Fall back to GlobalMemoryStatus which is always available.
+     * but returns wrong results for physical memory > 4GB. */
+    else
+    {
+      MEMORYSTATUS ms;
+      GlobalMemoryStatus (&ms);
+      return (double) ms.dwTotalPhys;
+    }
+  }
+#endif
+
+  fprintf(stderr, "havent found any method.\n");
+  /* Return 0 if we can't determine the value. */
+  return 0;
+}
+
+
+/* Return the amount of physical memory available.
+ * The platform specific methods that were compiled in are tried one
+ * after another; if none of them yields a value, a quarter of
+ * physmem_total () is returned as a guess. */
+ double
+physmem_available ()
+
+{
+#if defined _SC_AVPHYS_PAGES && defined _SC_PAGESIZE
+  { /* This works on linux-gnu, solaris2 and cygwin. */
+    double pages = sysconf (_SC_AVPHYS_PAGES);
+    double pagesize = sysconf (_SC_PAGESIZE);
+    if (0 <= pages && 0 <= pagesize)
+      return pages * pagesize;
+  }
+#endif
+
+#if HAVE_PSTAT_GETSTATIC && HAVE_PSTAT_GETDYNAMIC
+  { /* This works on hpux11. */
+    struct pst_static pss;
+    struct pst_dynamic psd;
+    if (0 <= pstat_getstatic (&pss, sizeof pss, 1, 0)
+        && 0 <= pstat_getdynamic (&psd, sizeof psd, 1, 0))
+    {
+      double pages = psd.psd_free;
+      double pagesize = pss.page_size;
+      if (0 <= pages && 0 <= pagesize)
+        return pages * pagesize;
+    }
+  }
+#endif
+
+#if HAVE_SYSMP && defined MP_SAGET && defined MPSA_RMINFO && defined _SC_PAGESIZE
+  { /* This works on irix6. */
+    struct rminfo realmem;
+    if (sysmp (MP_SAGET, MPSA_RMINFO, &realmem, sizeof realmem) == 0)
+    {
+      double pagesize = sysconf (_SC_PAGESIZE);
+      double pages = realmem.availrmem;
+      if (0 <= pages && 0 <= pagesize)
+        return pages * pagesize;
+    }
+  }
+#endif
+
+#if HAVE_TABLE && defined TBL_VMSTATS
+  { /* This works on Tru64 UNIX V4/5. */
+    struct tbl_vmstats vmstats;
+
+    if (table (TBL_VMSTATS, 0, &vmstats, 1, sizeof (vmstats)) == 1)
+    {
+      double pages = vmstats.free_count;
+      double pagesize = vmstats.pagesize;
+
+      if (0 <= pages && 0 <= pagesize)
+        return pages * pagesize;
+    }
+  }
+#endif
+
+#if HAVE_SYS_SYSCTL_H && defined HW_USERMEM
+  { /* This works on *bsd and darwin. */
+    unsigned int usermem;
+    size_t len = sizeof usermem;
+    /* len is a size_t: convert explicitly to the type %u expects */
+    printf("size of bytefield: %u\n", (unsigned int) len);
+    static int mib[2] = { CTL_HW, HW_USERMEM };
+
+    if (sysctl (mib, ARRAY_SIZE (mib), &usermem, &len, NULL, 0) == 0
+        && len == sizeof (usermem))
+      return (double) usermem;
+  }
+#endif
+
+#if defined _WIN32
+  { /* this works on windows */
+    PFN_MS_EX pfnex;
+    HMODULE h = GetModuleHandle ("kernel32.dll");
+
+    if (!h)
+      return 0.0;
+
+    /* Use GlobalMemoryStatusEx if available. */
+    if ((pfnex = (PFN_MS_EX) GetProcAddress (h, "GlobalMemoryStatusEx")))
+    {
+      lMEMORYSTATUSEX lms_ex;
+      lms_ex.dwLength = sizeof lms_ex;
+      if (!pfnex (&lms_ex))
+        return 0.0;
+      return (double) lms_ex.ullAvailPhys;
+    }
+
+    /* Fall back to GlobalMemoryStatus which is always available.
+     * but returns wrong results for physical memory > 4GB */
+    else
+    {
+      MEMORYSTATUS ms;
+      GlobalMemoryStatus (&ms);
+      return (double) ms.dwAvailPhys;
+    }
+  }
+#endif
+
+  fprintf(stderr, "guessing free memory!\n");
+  /* Guess 25% of physical memory. */
+  return physmem_total () / 4;
+}
+
+
+
+#if DEBUG
+
+# include <stdio.h>
+# include <stdlib.h>
+
+/* debug driver: prints total and available physical memory */
+int
+main ()
+
+{
+  /* both values are scaled by the same divisor (1024) so that the two
+   * columns are directly comparable */
+  printf ("%12.f %12.f\n", (physmem_total ()/1024), (physmem_available ()/1024));
+
+  exit (0);
+}
+
+
+#endif
+
+
+
+
+/*
+ * Local Variables:
+ * compile-command: "gcc -DDEBUG -DHAVE_CONFIG_H -I.. -g -O -Wall -W physmem.c"
+ * End:
+ * */
diff --git a/segemehl/libs/plotmatchfiles.c b/segemehl/libs/plotmatchfiles.c
new file mode 100644
index 0000000..1780415
--- /dev/null
+++ b/segemehl/libs/plotmatchfiles.c
@@ -0,0 +1,248 @@
+
+/*
+ * plotmatchfiles.c
+ *
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 10/22/2010 05:17:21 PM CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "alignment.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "matfile.h"
+#include "bitVector.h"
+#include "info.h"
+#include "fileio.h"
+#include "matchfiles.h"
+#include "evalmatchfiles.h"
+
+
+/*
+ * bl_matchfileCROSSERRGNUPLOT
+ *
+ * @brief plot the read position dependent error rate, i.e.
+ *        index->P_ERR[i]/index->noofreads for every i < index->maxreadlen.
+ *        The values are written to the file "tempname.txt" and plotted by
+ *        a gnuplot process that is fed through a pipe.
+ */
+void
+bl_matchfileCROSSERRGNUPLOT(void *space, matchfileindex_t *index) {
+  char *name = "tempname.txt";
+  double *y;
+  Uint i;
+  FILE *pipe;
+
+  y = ALLOCMEMORY(space, NULL, double, (Uint)index->maxreadlen);
+
+  for(i=0; i < index->maxreadlen; i++) {
+    /* same cast as in bl_matchfilePERRGNUPLOT: guards against an integer
+     * division in case P_ERR and noofreads are integral counters */
+    y[i] = (double)index->P_ERR[i]/index->noofreads;
+  }
+
+  pipe = popen("gnuplot -persist 2>/dev/null","w");
+  if(pipe == NULL) {
+    /* without a pipe there is nothing to plot to */
+    fprintf(stderr, "could not open pipe to gnuplot\n");
+    FREEMEMORY(space, y);
+    return;
+  }
+
+  writeY(name, y, index->maxreadlen, 0, 0);
+
+  fprintf(pipe, "set title 'read position dependent error rates'\n");
+  fprintf(pipe, "set xlabel 'read position'\n");
+  fprintf(pipe, "set ylabel 'error rate'\n");
+  fprintf(pipe, "plot '%s' using 1:2 notitle w lines\n", name);
+
+  pclose(pipe);
+  FREEMEMORY(space, y);
+}
+
+
+/*
+ * bl_matchfileQERRGNUPLOT
+ *
+ * @brief plot the quality dependent error rates. For every quality value
+ *        i < QRNGE the observed rate index->Q_ERR[i]/index->Q_N[i] is
+ *        written to "tempname.txt" and the rate 10^(-i/10), plotted as
+ *        'expected error', to "tempname2.txt"; both files are plotted by
+ *        a gnuplot process that is fed through a pipe.
+ */
+void
+bl_matchfileQERRGNUPLOT(void *space, matchfileindex_t *index) {
+  char *name = "tempname.txt";
+  char *name2 = "tempname2.txt";
+  Uint i;
+  double *arr, *arr2;
+  FILE *pipe;
+
+  arr = ALLOCMEMORY(space, NULL, double, QRNGE);
+  arr2 = ALLOCMEMORY(space, NULL, double, QRNGE);
+
+  for(i=0; i < QRNGE; i++) {
+    /* NOTE(review): only i == 0 is special-cased for an empty bin; for
+     * i > 0 an empty Q_N[i] still divides by zero - confirm intended */
+    arr[i] = (double)index->Q_ERR[i]/index->Q_N[i];
+    if(i==0 && !index->Q_N[i]) arr[i] = 1;
+    arr2[i] = pow(10,(((double)i)/-10.0));
+  }
+
+  pipe = popen("gnuplot -persist 2>/dev/null","w");
+  if(pipe == NULL) {
+    /* without a pipe there is nothing to plot to */
+    fprintf(stderr, "could not open pipe to gnuplot\n");
+    FREEMEMORY(space, arr);
+    FREEMEMORY(space, arr2);
+    return;
+  }
+
+  writeY(name, arr, QRNGE, 0, 0);
+  writeY(name2, arr2, QRNGE, 0, 0);
+
+  fprintf(pipe, "set title 'quality dependent error rates'\n");
+  fprintf(pipe, "set xlabel 'quality'\n");
+  fprintf(pipe, "set ylabel 'errorrate'\n");
+  fprintf(pipe, "plot '%s' using 1:2 w points notitle", name);
+  fprintf(pipe, " , '%s' using 1:2 smooth bezier title 'observed error' w lines", name);
+  fprintf(pipe, " , '%s' using 1:2 title 'expected error' w lines\n", name2);
+
+  pclose(pipe);
+  FREEMEMORY(space, arr);
+  FREEMEMORY(space, arr2);
+}
+
+/*
+ * bl_matchfilePERRGNUPLOT
+ *
+ * @brief plot the read position dependent error rate, i.e.
+ *        index->P_ERR[i]/index->noofreads for every i < index->maxreadlen.
+ *        The values are written to the file "tempname.txt" and plotted by
+ *        a gnuplot process that is fed through a pipe.
+ */
+void
+bl_matchfilePERRGNUPLOT(void *space, matchfileindex_t *index) {
+  char *name = "tempname.txt";
+  double *y;
+  Uint i;
+  FILE *pipe;
+
+  y = ALLOCMEMORY(space, NULL, double, (Uint)index->maxreadlen);
+
+  for(i=0; i < index->maxreadlen; i++) {
+    y[i] = (double)index->P_ERR[i]/index->noofreads;
+  }
+
+  pipe = popen("gnuplot -persist 2>/dev/null","w");
+  if(pipe == NULL) {
+    /* without a pipe there is nothing to plot to */
+    fprintf(stderr, "could not open pipe to gnuplot\n");
+    FREEMEMORY(space, y);
+    return;
+  }
+
+  writeY(name, y, index->maxreadlen, 0, 0);
+
+  fprintf(pipe, "set title 'read position dependent error rates'\n");
+  fprintf(pipe, "set xlabel 'read position'\n");
+  fprintf(pipe, "set ylabel 'error rate'\n");
+  fprintf(pipe, "plot '%s' using 1:2 notitle w lines\n", name);
+
+  pclose(pipe);
+  FREEMEMORY(space, y);
+}
+
+/*
+ * bl_matchfileSUBGNUPLOT
+ *
+ * @brief plot the substitution rates (log10) as a 5x5 image. For every
+ *        pair (i,j) with i,j < 5 the entries of index->submatrix are
+ *        summed over all qualities and read positions and divided by the
+ *        total of row i. The triples (i, j, log10 rate) are written to
+ *        "tempname.txt" and plotted by a gnuplot process fed via a pipe.
+ */
+void
+bl_matchfileSUBGNUPLOT(void *space, matchfileindex_t *index) {
+  char *name = "tempname.txt";
+  double *y,*x,*z, sum=0;
+  /* one total per first matrix index i = 0..4; the loops below run up to
+   * i < 5, so the array needs five slots */
+  double cnt[5] = {0,0,0,0,0};
+  FILE *pipe;
+
+  Uint i, j, k, l, u=0;
+
+  x = ALLOCMEMORY(space, NULL, double, 37);
+  y = ALLOCMEMORY(space, NULL, double, 37);
+  z = ALLOCMEMORY(space, NULL, double, 37);
+
+  /* row totals */
+  for(i=0; i < 5; i++) {
+    for(j=0; j < 5; j++) {
+      for(k=0; k < QRNGE; k++) {
+        for(l=0; l < index->maxreadlen; l++) {
+          cnt[i] += MATRIX4D(index->submatrix, 6,
+              QRNGE, MAXREADLENGTH, i, j, k, l);
+        }
+      }
+    }
+  }
+
+  /* per cell sums, normalized by the row total */
+  for(i=0; i < 5; i++) {
+    for(j=0; j < 5; j++) {
+      sum = 0;
+      for(k=0; k < QRNGE; k++) {
+        for(l=0; l < index->maxreadlen; l++) {
+          sum += MATRIX4D(index->submatrix, 6,
+              QRNGE, MAXREADLENGTH, i, j, k, l);
+        }
+      }
+      fprintf(stderr, "i:%u, j:%u sum: %f\n", i, j, sum);
+      x[u] = i;
+      y[u] = j;
+      z[u] = log10(sum/cnt[i]);
+      u++;
+    }
+  }
+
+  pipe = popen("gnuplot -persist 2>/dev/null","w");
+  if(pipe == NULL) {
+    /* without a pipe there is nothing to plot to */
+    fprintf(stderr, "could not open pipe to gnuplot\n");
+    FREEMEMORY(space, y);
+    FREEMEMORY(space, x);
+    FREEMEMORY(space, z);
+    return;
+  }
+
+  writeXYZ(name, x, y, z, 25);
+
+  fprintf(pipe, "set title 'substitution rates (log10)'\n");
+  fprintf(pipe, "set xlabel 'read'\n");
+  fprintf(pipe, "set ylabel 'reference'\n");
+  fprintf(pipe, "set xtics ('A' 0,'C' 1,'G' 2,'T' 3, '-' 4)\n");
+  fprintf(pipe, "set ytics ('A' 0,'C' 1,'G' 2,'T' 3, '-' 4)\n");
+  fprintf(pipe, "set label 1 'bla' at 1,1\n");
+  fprintf(pipe, "plot '%s' using 1:2:3 w image\n", name);
+
+  pclose(pipe);
+  FREEMEMORY(space, y);
+  FREEMEMORY(space, x);
+  FREEMEMORY(space, z);
+
+}
+
+
+/*
+ * bl_matchfileCOVGNUPLOT
+ *
+ * @brief plot the coverage of a frame: frame->cs[i].len for every
+ *        i < frame->width is written to "tempname.txt" (x values start
+ *        at frame->start) and plotted by a gnuplot process fed via a pipe.
+ */
+void
+bl_matchfileCOVGNUPLOT(void *space, matchfileFrame_t *frame) {
+  char *name = "tempname.txt";
+  Uint i, *arr;
+  FILE *pipe;
+
+  arr = ALLOCMEMORY(space, NULL, Uint, frame->width);
+
+  for(i=0; i < frame->width; i++) {
+    arr[i] = frame->cs[i].len;
+  }
+
+  pipe = popen("gnuplot -persist 2>/dev/null","w");
+  if(pipe == NULL) {
+    /* without a pipe there is nothing to plot to */
+    fprintf(stderr, "could not open pipe to gnuplot\n");
+    FREEMEMORY(space, arr);
+    return;
+  }
+
+  writeYUint(name, arr, frame->width, frame->start, 0);
+
+  fprintf(pipe, "set title 'frame coverage %s[%d,%d]'\n", frame->chrname,
+      frame->start, frame->start+frame->width);
+  fprintf(pipe, "set xlabel 'frame position'\n");
+  fprintf(pipe, "set ylabel 'coverage'\n");
+  fprintf(pipe, "plot '%s' using 1:2 notitle w lines\n", name);
+
+  pclose(pipe);
+  /* the coverage copy is only needed for the temp file */
+  FREEMEMORY(space, arr);
+}
+
+
+/*
+ * bl_matchfileRSSGNUPLOT
+ *
+ * @brief plot the read start site distribution stats->dist_rss of a frame.
+ *        The raw histogram is written to "tempname.txt", a normalized
+ *        version to "tempname2.txt". For histograms with xmax < 100 a
+ *        gaussian and a poisson curve are fitted to the normalized data
+ *        and drawn on top. All commands go to a gnuplot process via a pipe.
+ */
+void
+bl_matchfileRSSGNUPLOT(void *space, matchfileFrame_t *frame,
+    matchfileFrameStats_t *stats) {
+
+  char *name = "tempname.txt";
+  char *name2 = "tempname2.txt";
+  Uint *data, xmax, ymax, total, width;
+  double avg, maxval;
+  FILE *pipe = popen("gnuplot -persist 2>/dev/null","w");
+
+  if(pipe == NULL) {
+    /* without a pipe there is nothing to plot to */
+    fprintf(stderr, "could not open pipe to gnuplot\n");
+    return;
+  }
+
+  data = stats->dist_rss;
+  xmax = stats->dist_rss_xmax;
+  ymax = stats->dist_rss_ymax;
+  total = stats->rss;
+  width = frame->width;
+  /* average number of read start sites per position of the frame */
+  avg = (double)total/width;
+
+  /* upper bound for the y axis and the label positions */
+  maxval = MAX(poisson(avg, avg)*width, ymax);
+  writeYUint(name, data, xmax, 0, 0);
+  writeYUintNorm(name2, data, xmax, 0);
+
+  fprintf(pipe, "set title 'read start site distribution %s[%d,%d]'\n",
+      frame->chrname, frame->start, frame->start+width);
+  fprintf(pipe, "set xlabel 'number of read startsites'\n");
+  fprintf(pipe, "set ylabel 'number of genomic loci'\n");
+  fprintf(pipe, "poissondraw(x)= (x>=0) ? exp(-l) * l**(x) / gamma(x+1) : 1\n");
+  fprintf(pipe, "normal(x) = (1/(sd*sqrt(2*pi)))*exp(-(x-mu)**2/(2*sd**2))\n");
+  fprintf(pipe, "l=%f; mu=%f; sd=1\n", avg, avg);
+
+  if(xmax < 100) {
+    /* fit against the normalized histogram written above */
+    fprintf(pipe, "fit normal(x) '%s' via mu,sd\n", name2);
+    fprintf(pipe, "fit poissondraw(x) '%s' via l\n", name2);
+  }
+
+  fprintf(pipe, "set label 1 'lambda=%%g',l at %d,%d\n", (int)(xmax/2), (int)(maxval/2));
+  fprintf(pipe, "set label 2 'mu=%%g',mu at %d,%d\n", (int)(xmax/2), (int)(maxval/3));
+  fprintf(pipe, "set parametric\n");
+  fprintf(pipe, "set trange [0:%d]\n", xmax+1);
+  fprintf(pipe, "set xrange [0:%d]\n", xmax+1);
+  fprintf(pipe, "set yrange [0:%d]\n", (int)(maxval+(0.1*maxval)));
+
+  fprintf(pipe, "plot '%s' using 1:2 notitle w points,", name);
+  fprintf(pipe, " '%s' using 1:2 smooth csplines title 'read start sites spline' w lines, ", name);
+  fprintf(pipe, " '%s' using 1:2 smooth bezier title 'read start sites bezier' w lines", name);
+
+  if(xmax < 100) {
+    fprintf(pipe, " , t,poissondraw(t)*%d w lines title 'poisson'", width);
+    fprintf(pipe, " , t,normal(t)*%d w lines title 'gaussian'\n", width);
+  } else {
+    fprintf(pipe, "\n");
+  }
+
+  pclose(pipe);
+}
diff --git a/segemehl/libs/plotmatchfiles.h b/segemehl/libs/plotmatchfiles.h
new file mode 100644
index 0000000..15d91c8
--- /dev/null
+++ b/segemehl/libs/plotmatchfiles.h
@@ -0,0 +1,21 @@
+#ifndef _PLOTMATCHFILES_H
+#define _PLOTMATCHFILES_H
+
+/*
+ *
+ * plotmatchfiles.h
+ * gnuplot routines
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 10/22/2010 05:23:11 PM CEST
+ *
+ */
+
+/* every routine writes its data to temp files in the working directory
+ * and plots them with a gnuplot process that is fed through a pipe */
+void bl_matchfileCROSSERRGNUPLOT(void *space, matchfileindex_t *index);
+void bl_matchfileQERRGNUPLOT(void *space, matchfileindex_t *index);
+void bl_matchfilePERRGNUPLOT(void *space, matchfileindex_t *index);
+void bl_matchfileCOVGNUPLOT(void *space, matchfileFrame_t *frame);
+void bl_matchfileRSSGNUPLOT(void *space, matchfileFrame_t *frame, matchfileFrameStats_t *stats);
+void bl_matchfileSUBGNUPLOT(void *space, matchfileindex_t *index);
+
+#endif
diff --git a/segemehl/libs/queue.c b/segemehl/libs/queue.c
new file mode 100644
index 0000000..328d1f0
--- /dev/null
+++ b/segemehl/libs/queue.c
@@ -0,0 +1,211 @@
+/*
+ This file is part of gdub.
+ (C) 2006 Steve Hoffmann
+
+ gdub is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2, or (at your
+ option) any later version.
+
+ gdub is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with gdub; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+
+*/
+/**
+ * queue.c
+ * implementation of a simple queue for int
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Mon Oct 13 14:13:08 CEST 2008
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 72 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-10-28 18:14:42 +0100 (Tue, 28 Oct 2008) $
+ * Id: $Id: queue.c 72 2008-10-28 17:14:42Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/queue.c $
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "debug.h"
+#include "queue.h"
+
+/*----------------------------- bl_queueInit -----------------------------------
+ *
+ * @brief  set up an empty queue with room for allocelem elements; the
+ *         program is terminated if allocelem is not positive or if the
+ *         memory can not be allocated
+ * @author Steve Hoffmann
+ *
+ */
+void bl_queueInit(Queue *q, int allocelem) {
+  Queueelem *slots;
+
+  if (allocelem <= 0) {
+    DBG("queue.c: Attempt to initialize a queue of size %d. Exit forced.\n",
+        allocelem);
+    exit(-1);
+  }
+
+  slots = (Queueelem *) malloc(sizeof(Queueelem) * allocelem);
+  if (slots == NULL){
+    DBG("queue.c: Memory allocation failed. Exit forced.\n", NULL);
+    exit(-1);
+  }
+
+  /* empty queue: both cursors start at the first slot */
+  q->queuespace = slots;
+  q->dequeueindex = 0;
+  q->enqueueindex = 0;
+  q->numofelem = 0;
+  q->allocelem = allocelem;
+}
+
+/*--------------------------- bl_queueDestruct ---------------------------------
+ *
+ * @brief  release the element storage of the queue and reset all of its
+ *         fields; the Queue struct itself is not freed
+ * @author Steve Hoffmann
+ *
+ */
+void bl_queueDestruct(Queue *q) {
+
+  free(q->queuespace);
+  /* do not keep a dangling pointer: a repeated destruct becomes a
+   * harmless free(NULL) instead of a double free */
+  q->queuespace = NULL;
+
+  q->enqueueindex = 0;
+  q->dequeueindex = 0;
+  q->allocelem = 0;
+  q->numofelem = 0;
+}
+
+/*---------------------------- bl_queueIsEmpty ---------------------------------
+ *
+ * @brief  tell whether the queue currently holds no element
+ * @author Steve Hoffmann
+ *
+ */
+BOOL bl_queueIsEmpty(Queue *q) {
+  BOOL empty = (q->numofelem == 0);
+
+  return empty;
+}
+
+/*---------------------------- bl_queueEnqueue ---------------------------------
+ *
+ * @brief  append elem at the back of the queue; a full queue is resized
+ *         via bl_queueResize first
+ * @author Steve Hoffmann
+ *
+ */
+void bl_queueEnqueue(Queue *q, Queueelem elem) {
+  int slot;
+
+  if(q->numofelem == q->allocelem) {
+    bl_queueResize(q);
+  }
+
+  /* read the cursor only now, the resize may have changed the layout */
+  slot = q->enqueueindex;
+  q->queuespace[slot] = elem;
+  q->numofelem++;
+
+  /* circular buffer: wrap to the first slot after the last one */
+  q->enqueueindex = (slot == q->allocelem-1) ? 0 : slot + 1;
+}
+
+/*---------------------------- bl_queueDequeue ---------------------------------
+ *
+ * @brief  remove the element at the front of the queue and return it;
+ *         an empty queue yields 0
+ * @author Steve Hoffmann
+ *
+ */
+Queueelem bl_queueDequeue(Queue *q) {
+  int slot;
+  Queueelem front;
+
+  if(bl_queueIsEmpty(q)) {
+    return 0;
+  }
+
+  slot = q->dequeueindex;
+  front = q->queuespace[slot];
+  q->numofelem--;
+
+  /* circular buffer: wrap to the first slot after the last one */
+  q->dequeueindex = (slot == q->allocelem - 1) ? 0 : slot + 1;
+
+  return front;
+}
+
+/*---------------------------- bl_queueResize ----------------------------------
+ *
+ * @brief  double the capacity of the queue; if the dequeue cursor is not
+ *         in front of the enqueue cursor the elements from the dequeue
+ *         cursor to the old end are shifted by the old capacity
+ * @author Steve Hoffmann
+ *
+ */
+void bl_queueResize(Queue *q) {
+  Queueelem *grown;
+  int oldcap = q->allocelem;
+  int tail;
+
+  /* resize queue to double */
+  grown = (Queueelem *) realloc(q->queuespace,
+      sizeof(Queueelem) * (oldcap * 2));
+  if (grown == NULL){
+    DBG("queue.c: Memory reallocation failed. Exit forced.\n", NULL);
+    exit(-1);
+  }
+  q->queuespace = grown;
+
+  if (q->dequeueindex >= q->enqueueindex) {
+    /* number of elements between the dequeue cursor and the old end */
+    tail = oldcap - q->dequeueindex;
+
+    memmove(&grown[oldcap + q->dequeueindex], &grown[q->dequeueindex],
+        tail*sizeof(Queueelem));
+    q->dequeueindex += oldcap;
+  }
+
+  q->allocelem = oldcap * 2;
+}
+
+/*------------------------------ bl_queueShow ----------------------------------
+ *
+ * @brief  print all allocated slots of the queue to stdout; the slot at
+ *         the enqueue cursor is tagged with '*', the slot at the dequeue
+ *         cursor with '^'
+ * @author Steve Hoffmann
+ *
+ */
+void bl_queueShow(Queue *q) {
+  int slot;
+  int atback, atfront;
+  Queueelem val;
+
+  printf("[");
+  for(slot = 0; slot < q->allocelem; slot++) {
+    val = q->queuespace[slot];
+    atback = (slot == q->enqueueindex);
+    atfront = (slot == q->dequeueindex);
+
+    if (!atback && !atfront)
+      printf("%d", val);
+    if (atback)
+      printf("%d*", val);
+    if (atfront)
+      printf("%d^", val);
+    if (slot+1 != q->allocelem)
+      printf(",");
+  }
+  printf("]\n");
+
+}
+
+
+/*------------------------------ bl_queueSize ----------------------------------
+ *
+ * @brief  report how many elements are currently stored in the queue
+ * @author Steve Hoffmann
+ *
+ */
+Uint bl_queueSize(Queue *q){
+  Uint stored = q->numofelem;
+
+  return stored;
+}
diff --git a/segemehl/libs/queue.h b/segemehl/libs/queue.h
new file mode 100644
index 0000000..e221e24
--- /dev/null
+++ b/segemehl/libs/queue.h
@@ -0,0 +1,66 @@
+/*
+ This file is part of gdub.
+ (C) 2006 Steve Hoffmann
+
+ gdub is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2, or (at your
+ option) any later version.
+
+ gdub is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with gdub; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+
+ */
+
+/**
+ * queue.h
+ * interface of a simple queue for int
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Mon Oct 13 14:13:08 CEST 2008
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 72 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-10-28 18:14:42 +0100 (Tue, 28 Oct 2008) $
+ * Id: $Id: queue.h 72 2008-10-28 17:14:42Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/queue.h $
+ */
+
+#ifndef QUEUE_H
+#define QUEUE_H
+
+#include "basic-types.h"
+
+typedef int Queueelem;
+
+typedef struct
+{
+ Queueelem *queuespace;
+ int enqueueindex,
+ dequeueindex,
+ allocelem,
+ numofelem;
+} Queue;
+
+void bl_queueInit(Queue *q, int allocelem);
+void bl_queueDestruct(Queue *q);
+BOOL bl_queueIsEmpty(Queue *q);
+void bl_queueResize(Queue *q);
+void bl_queueEnqueue(Queue *q, Queueelem elem);
+Queueelem bl_queueDequeue(Queue *q);
+void bl_queueShow(Queue *q);
+Uint bl_queueSize(Queue *q);
+
+#endif
diff --git a/segemehl/libs/radixsort.c b/segemehl/libs/radixsort.c
new file mode 100644
index 0000000..ae43d4d
--- /dev/null
+++ b/segemehl/libs/radixsort.c
@@ -0,0 +1,168 @@
+
+/*
+ * radixsort.c
+ * a radix sort implementation
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 07.02.2010 23:26:52 CET
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <memory.h>
+#include <limits.h>
+#include <string.h>
+#include <basic-types.h>
+#include <radixsort.h>
+
+
+
+/*
+ * bl_radixSort
+ *
+ * @brief least significant digit radix sort of nelem records of size
+ *        bytes each; the sorted records end up in tosrt. keyaccess
+ *        returns the Uint key of a record, which is processed `bits`
+ *        bits per pass with a counting sort (2^bits bins) and a scratch
+ *        copy of the array.
+ */
+void
+bl_radixSort(void *space, void *tosrt,
+    size_t size, size_t nelem,
+    Uint (*keyaccess)(void *),
+    Uint bits) {
+
+  char *p, *b, *src, *toSort;
+
+  Uint mask, offset=0, i, key;
+  Uint cntsize;
+  Uint *cnt;
+  size_t n;
+
+  /* nothing to sort; a record size of 0 would never advance the scans */
+  if(nelem == 0 || size == 0) return;
+
+  toSort = (char*) tosrt;
+  cntsize = (Uint)1 << bits;
+  cnt = ALLOCMEMORY(space, NULL, Uint, cntsize);
+  memset(cnt, 0, sizeof(Uint)*cntsize);
+
+  b = src = malloc(size*nelem);
+  if(src == NULL) {
+    fprintf(stderr, "radixsort: could not allocate scratch buffer\n");
+    exit(EXIT_FAILURE);
+  }
+
+  mask =~ (UINT_MAX<<bits);
+
+  for(; mask; mask <<= bits, offset+=bits) {
+    /* histogram of the current digit */
+    for(p=toSort; p < toSort+(nelem*size); p+=size) {
+      key = (keyaccess(p) & mask) >> offset;
+      ++cnt[key];
+    }
+
+    /* prefix sums: cnt[k] is one past the last target slot of digit k */
+    for(i=1; i < cntsize; ++i) {
+      cnt[i]+=cnt[i-1];
+    }
+
+    /* scatter from the back so records with equal digits keep their
+     * order; index based so no pointer before the array is formed */
+    for(n=nelem; n > 0; --n) {
+      p = toSort+((n-1)*size);
+      key = (keyaccess(p) & mask) >> offset;
+      memmove(b+((cnt[key]-1)*size), p, size);
+      --cnt[key];
+    }
+
+    /* the buffer just written is the input of the next pass */
+    p=b; b=toSort; toSort=p;
+    memset(cnt, 0, sizeof(Uint)*cntsize);
+  }
+
+  /* after an odd number of passes the result sits in the scratch buffer */
+  if(toSort == src) memcpy(b, toSort, size*nelem);
+  FREEMEMORY(space, src);
+  FREEMEMORY(space, cnt);
+
+  return;
+}
+
+
+/*
+ * bl_radixSortKeyFirst
+ *
+ * @brief least significant digit radix sort of nelem records of size
+ *        bytes each; the sorted records end up in tosrt. The key of a
+ *        record is the Uint stored in its first bytes. It is processed
+ *        `bits` bits per pass with a counting sort (2^bits bins) and a
+ *        scratch copy of the array.
+ */
+void
+bl_radixSortKeyFirst(void *space, void *tosrt,
+    size_t size, size_t nelem,
+    Uint bits) {
+
+  char *p, *b, *src, *toSort;
+  Uint first;
+
+  Uint mask, offset=0, i, key;
+  Uint cntsize;
+  Uint *cnt;
+  size_t n;
+
+  /* nothing to sort; a record size of 0 would never advance the scans */
+  if(nelem == 0 || size == 0) return;
+
+  toSort = (char*) tosrt;
+  cntsize = (Uint)1 << bits;
+  cnt = ALLOCMEMORY(space, NULL, Uint, cntsize);
+  memset(cnt, 0, sizeof(Uint)*cntsize);
+
+  b = src = malloc(size*nelem);
+  if(src == NULL) {
+    fprintf(stderr, "radixsort: could not allocate scratch buffer\n");
+    exit(EXIT_FAILURE);
+  }
+
+  mask =~ (UINT_MAX<<bits);
+
+  for(; mask; mask <<= bits, offset+=bits) {
+    /* histogram of the current digit */
+    for(p=toSort; p < toSort+(nelem*size); p+=size) {
+      /* copy the key out instead of dereferencing a cast pointer, the
+       * record need not be aligned for a Uint access */
+      memcpy(&first, p, sizeof(Uint));
+      key = (first & mask) >> offset;
+      ++cnt[key];
+    }
+
+    /* prefix sums: cnt[k] is one past the last target slot of digit k */
+    for(i=1; i < cntsize; ++i) {
+      cnt[i]+=cnt[i-1];
+    }
+
+    /* scatter from the back so records with equal digits keep their
+     * order; index based so no pointer before the array is formed */
+    for(n=nelem; n > 0; --n) {
+      p = toSort+((n-1)*size);
+      memcpy(&first, p, sizeof(Uint));
+      key = (first & mask) >> offset;
+      memmove(b+((cnt[key]-1)*size), p, size);
+      --cnt[key];
+    }
+
+    /* the buffer just written is the input of the next pass */
+    p=b; b=toSort; toSort=p;
+    memset(cnt, 0, sizeof(Uint)*cntsize);
+  }
+
+  /* after an odd number of passes the result sits in the scratch buffer */
+  if(toSort == src) memcpy(b, toSort, size*nelem);
+  FREEMEMORY(space, src);
+  FREEMEMORY(space, cnt);
+
+  return;
+}
+
+
+/*
+ * bl_radixSortUint
+ *
+ * @brief least significant digit radix sort of an array of nelem Uint
+ *        values; the sorted values end up in toSort. The values are
+ *        processed `bits` bits per pass with a counting sort (2^bits
+ *        bins) and a scratch copy of the array.
+ */
+void
+bl_radixSortUint(void *space, Uint *toSort,
+    size_t nelem,
+    Uint bits) {
+
+  Uint *p, *b, *src;
+
+  Uint mask, offset=0, i, key;
+  Uint cntsize;
+  Uint *cnt;
+  size_t n;
+
+  /* nothing to sort */
+  if(nelem == 0) return;
+
+  cntsize = (Uint)1 << bits;
+  cnt = ALLOCMEMORY(space, NULL, Uint, cntsize);
+  memset(cnt, 0, sizeof(Uint)*cntsize);
+
+  b = src = malloc(sizeof(Uint)*nelem);
+  if(src == NULL) {
+    fprintf(stderr, "radixsort: could not allocate scratch buffer\n");
+    exit(EXIT_FAILURE);
+  }
+
+  mask =~ (UINT_MAX<<bits);
+
+  for(; mask; mask <<= bits, offset+=bits) {
+    /* histogram of the current digit */
+    for(p=toSort; p < toSort+nelem; ++p) {
+      key = (*p & mask) >> offset;
+      ++cnt[key];
+    }
+
+    /* prefix sums: cnt[k] is one past the last target slot of digit k */
+    for(i=1; i < cntsize; ++i) {
+      cnt[i]+=cnt[i-1];
+    }
+
+    /* scatter from the back so equal digits keep their order; index
+     * based so no pointer before the array is formed */
+    for(n=nelem; n > 0; --n) {
+      key = (toSort[n-1] & mask) >> offset;
+      b[cnt[key]-1] = toSort[n-1];
+      --cnt[key];
+    }
+
+    /* the buffer just written is the input of the next pass */
+    p=b; b=toSort; toSort=p;
+    memset(cnt, 0, sizeof(Uint)*cntsize);
+  }
+
+  /* after an odd number of passes the result sits in the scratch buffer */
+  if(toSort == src) memcpy(b, toSort, sizeof(Uint)*nelem);
+  FREEMEMORY(space, src);
+  FREEMEMORY(space, cnt);
+
+  return;
+}
diff --git a/segemehl/libs/radixsort.h b/segemehl/libs/radixsort.h
new file mode 100644
index 0000000..776ee39
--- /dev/null
+++ b/segemehl/libs/radixsort.h
@@ -0,0 +1,31 @@
+#ifndef RADIXSORT_H
+#define RADIXSORT_H
+
+/*
+ * radixsort.h
+ * segemehl
+ *
+ * Created by Steve Hoffmann on 08.02.10.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ *
+ */
+
+#include <basic-types.h>
+
+void
+bl_radixSort(void *space, void *toSort,
+ size_t size, size_t nelem,
+ Uint (*keyaccess)(void *),
+ Uint bits);
+
+void
+bl_radixSortKeyFirst(void *space, void *toSort,
+ size_t size, size_t nelem,
+ Uint bits);
+
+void
+bl_radixSortUint(void *space, Uint *toSort,
+ size_t nelem,
+ Uint bits);
+
+#endif
diff --git a/segemehl/libs/randseqs.c b/segemehl/libs/randseqs.c
new file mode 100644
index 0000000..a3a0570
--- /dev/null
+++ b/segemehl/libs/randseqs.c
@@ -0,0 +1,1522 @@
+
+/*
+ * randseqs.c
+ * randomize sequences
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 07/15/2010 10:33:13 AM CEST
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <time.h>
+#include "zlib.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "biofiles.h"
+#include "fileio.h"
+#include "randseqs.h"
+#include "charsequence.h"
+#include "assert.h"
+#include "zran.h"
+#include "info.h"
+#include "vtprogressbar.h"
+
+double fiveacc = .95;
+double threeacc = .9;
+
+/*----------------------------- bl_fastxScramble -----------------------------
+ *
+ * @brief  copy len characters of template (starting at voff) to buffer
+ *         (starting at uoff) while randomly introducing substitutions,
+ *         insertions and deletions; a random quality value is stored in
+ *         quality for every character written. Each error is counted in
+ *         *readerrcnt and, if editstring is given, appended to it as
+ *         "<pos>:S;", "<pos>:I;" or "<pos>:D;" at offset *editstringlen,
+ *         which is updated.
+ * @return the position in buffer after the last character written
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_fastxScramble (char *buffer, char *quality,
+    char *template, Uint len,
+    double acc, double Pmis, double Pins,
+    Uint uoff, Uint voff,
+    char *alphabet, Uint alphabetsize,
+    Uint minqual, Uint maxqual,
+    char *editstring, Uint *editstringlen, Uint *readerrcnt)
+{
+  Uint out = uoff, edpos = (*editstringlen);
+  int pos = 0;
+  char wrong, base;
+  double kind;
+
+  while (pos < len) {
+    base = template[voff+pos];
+
+    if (!(RANDUNIT > acc)) {
+      /* error free position: copy the template character */
+      quality[out] = minqual + RANDINT(maxqual - minqual);
+      buffer[out++] = base;
+      pos++;
+      continue;
+    }
+
+    /* an error occurs here; a second draw selects its kind */
+    kind = RANDUNIT;
+    (*readerrcnt) += 1;
+
+    if (kind <= Pmis) {
+      /* substitution: draw until the character differs from the template */
+      wrong = base;
+      if (editstring) edpos += sprintf(&editstring[edpos], "%d:S;", pos);
+      while(wrong == base)
+        wrong = alphabet[RANDINT(alphabetsize-1)];
+      quality[out] = minqual + RANDINT(maxqual - minqual);
+      buffer[out++] = wrong;
+      pos++;
+
+    } else if (kind <= Pmis + Pins) {
+      /* insertion: a random character followed by the template character,
+       * the latter only if this is not the last position */
+      wrong = alphabet[RANDINT(alphabetsize-1)];
+      if (editstring) edpos += sprintf(&editstring[edpos], "%d:I;", pos);
+      quality[out] = minqual + RANDINT(maxqual - minqual);
+      buffer[out++] = wrong;
+      if(pos < len-1) {
+        quality[out] = minqual + RANDINT(maxqual - minqual);
+        buffer[out++] = base;
+        pos++;
+      }
+      pos++;
+
+    } else {
+      /* deletion: only logged, nothing is written and the position is
+       * not advanced, i.e. the same template position is drawn again */
+      if (editstring) edpos += sprintf(&editstring[edpos], "%d:D;", pos);
+    }
+  }
+
+  (*editstringlen) = edpos;
+  return out;
+}
+
+/*----------------------- bl_fastxPrintRandomMatePairs -----------------------
+ *
+ * @brief  simulate n mate pairs from sequence and print the reads to dev
+ *         and their mates to matedev (fasta, or fastq if fastq is set).
+ *         Every read is composed of the 5' adapter five, a random piece
+ *         of the sequence (minlen..maxlen), a polyA stretch and the 3'
+ *         adapter three, each passed through bl_fastxScramble. The mate
+ *         is the reverse complement of a second piece located downstream
+ *         of the read. The header holds positions, lengths, the edit
+ *         string and the error rates.
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fastxPrintRandomMatePairs(FILE *dev, FILE *matedev,
+    char *sequence, Uint seqlen, Uint n,
+    Uint minlen, Uint maxlen, Uint mindist, Uint maxdist,
+    char *alphabet, Uint alphabetsize,
+    double acc,
+    double Pmis, double Pins, double Pdel,
+    unsigned char fastq,
+    Uint minqual, Uint maxqual,
+    char *five, Uint fivelen,
+    char *three, Uint threelen, Uint polyAlen)
+{
+  char *buffer, *quality, *matebuffer, *matequality, *polyAseq, *editstring, *rc,
+       startchr='>';
+  Uint i, u, start, matestart, len, matelen, readerrcnt, mateerrcnt,
+       fiveseqlen, threeseqlen, polyAseqlen, editstringlen;
+  /* sizes of the read/quality buffers and of the edit string buffer */
+  Uint bufsize = 2*(maxlen+polyAlen)+threelen+fivelen+1;
+  Uint editsize = 40*(maxlen+polyAlen+threelen+fivelen)+1;
+
+  assert(maxlen >= minlen);
+  assert(maxdist >= mindist);
+  assert(seqlen > 100 && seqlen > 5*(maxdist+2*maxlen+1));
+
+  srand((unsigned int)time(NULL));
+
+  if (fastq) {
+    startchr = '@';
+  }
+
+  /* NOTE(review): `space` is not a parameter of this function; the code
+   * presumably relies on ALLOCMEMORY/FREEMEMORY ignoring it - confirm */
+  buffer = ALLOCMEMORY(space, NULL, char, bufsize);
+  matebuffer = ALLOCMEMORY(space, NULL, char, bufsize);
+  quality = ALLOCMEMORY(space, NULL, char, bufsize);
+  matequality = ALLOCMEMORY(space, NULL, char, bufsize);
+  editstring = ALLOCMEMORY(space, NULL, char, editsize);
+  /* template for the polyA stretch; not terminated, used by length only */
+  polyAseq = ALLOCMEMORY(space, NULL, char, 2*polyAlen);
+  memset(polyAseq, 'A', 2*polyAlen);
+
+  for (i=0; i < n; i++) {
+    start=seqlen;
+
+    memset(buffer, 0, bufsize);
+    memset(matebuffer, 0, bufsize);
+    memset(quality, 0, bufsize);
+    memset(matequality, 0, bufsize);
+    memset(editstring, 0, editsize);
+    editstringlen = 0;
+    /* every read starts with full length parts; a shortened adapter must
+     * not carry over to the following reads */
+    fiveseqlen = fivelen;
+    threeseqlen = threelen;
+    polyAseqlen = polyAlen;
+
+    /* leave enough room behind start for read, distance and mate */
+    while (start > seqlen-(maxdist+2*maxlen+1)) start = RANDINT(seqlen);
+    len = minlen + RANDINT(maxlen - minlen);
+
+    if(RANDUNIT > fiveacc) {
+      if(RANDUNIT > .5) {
+        fiveseqlen = fivelen - RANDINT(fivelen);
+      }
+    }
+
+    if(RANDUNIT > threeacc) {
+      if(RANDUNIT > .5) {
+        threeseqlen = threelen - RANDINT(threelen);
+      }
+    }
+
+    if(RANDUNIT > acc) {
+      if(RANDUNIT > .5) {
+        polyAseqlen = polyAlen - RANDINT(polyAlen);
+      } else {
+        polyAseqlen = polyAlen + RANDINT(polyAlen);
+      }
+    }
+
+
+    matelen = minlen + RANDINT(maxlen - minlen);
+    /* last position of the mate: end of the read plus a distance drawn
+     * from mindist..maxdist plus the mate length */
+    matestart = start + len - 1 + mindist + RANDINT(maxdist - mindist)
+      + matelen - 1;
+
+    /* NOTE(review): the adapter `five` is printed under the label
+     * "3'prime" and `three` under "5'prime"; left as is, confirm */
+    fprintf(dev, "%c%d %d (len: %d) [", startchr, i, start+1, len);
+    fprintf(dev, "mate %d (len: %d) on rc] ", matestart+1, matelen);
+    fprintf(dev, "3'prime: %s (l:%d), 5'prime %s (l:%d), polyA: %d /",
+        five, fiveseqlen, three, threeseqlen, polyAseqlen);
+
+    fprintf(matedev, "%c%d %d (len: %d) [", startchr, i, start+1, len);
+    fprintf(matedev, "mate %d (len: %d) on rc] ", matestart+1, matelen);
+    fprintf(matedev, "3'prime: %s (l:%d), 5'prime %s (l:%d), polyA: %d /",
+        five, fiveseqlen, three, threeseqlen, polyAseqlen);
+
+    /* the read: adapter | sequence | polyA | adapter */
+    readerrcnt = 0;
+    u = 0;
+
+    u = bl_fastxScramble (buffer, quality, five, fiveseqlen, fiveacc, Pmis, Pins,
+        u, 0, alphabet, alphabetsize, minqual, maxqual,
+        editstring, &editstringlen, &readerrcnt);
+
+    editstring[editstringlen++] = '|';
+    u = bl_fastxScramble (buffer, quality, sequence, len, acc, Pmis, Pins,
+        u, start, alphabet, alphabetsize, minqual, maxqual,
+        editstring, &editstringlen, &readerrcnt);
+
+    editstring[editstringlen++] = '|';
+    u = bl_fastxScramble (buffer, quality, polyAseq, polyAseqlen, acc, Pmis, Pins,
+        u, 0, alphabet, alphabetsize, minqual, maxqual,
+        editstring, &editstringlen, &readerrcnt);
+
+    editstring[editstringlen++] = '|';
+    u = bl_fastxScramble (buffer, quality, three, threeseqlen, threeacc, Pmis, Pins,
+        u, 0, alphabet, alphabetsize, minqual, maxqual,
+        editstring, &editstringlen, &readerrcnt);
+
+    fprintf(dev, " %s", editstring);
+
+    /* the mate: same layout, built from the reverse complement */
+    memset(editstring, 0, editsize);
+    editstringlen = 0;
+
+    mateerrcnt = 0;
+    rc = charDNAcomplement(NULL, &sequence[matestart-matelen+1], matelen);
+    u = 0;
+
+    u = bl_fastxScramble (matebuffer, matequality, five, fiveseqlen, fiveacc, Pmis, Pins,
+        u, 0, alphabet, alphabetsize, minqual, maxqual,
+        editstring, &editstringlen, &mateerrcnt);
+
+    editstring[editstringlen++] = '|';
+    u = bl_fastxScramble (matebuffer, matequality, rc, matelen, acc, Pmis, Pins,
+        u, 0, alphabet, alphabetsize, minqual, maxqual,
+        editstring, &editstringlen, &mateerrcnt);
+
+    editstring[editstringlen++] = '|';
+    u = bl_fastxScramble (matebuffer, matequality, polyAseq, polyAseqlen, acc, Pmis, Pins,
+        u, 0, alphabet, alphabetsize, minqual, maxqual,
+        editstring, &editstringlen, &mateerrcnt);
+
+    editstring[editstringlen++] = '|';
+    u = bl_fastxScramble (matebuffer, matequality, three, threeseqlen, threeacc, Pmis, Pins,
+        u, 0, alphabet, alphabetsize, minqual, maxqual,
+        editstring, &editstringlen, &mateerrcnt);
+
+
+    FREEMEMORY(space, rc);
+
+
+    fprintf(dev, " (r: %f) [r: %f]\n", (double) readerrcnt/(len+1.0),
+        (double) mateerrcnt/(matelen+1.0));
+
+    fprintf(matedev, " %s", editstring);
+    fprintf(matedev, " (r: %f) [r: %f]\n", (double) readerrcnt/(len+1.0),
+        (double) mateerrcnt/(matelen+1.0));
+
+    fprintf(dev, "%s\n", buffer);
+    fprintf(matedev, "%s\n", matebuffer);
+
+    if(fastq) {
+      fprintf(dev, "+ \n");
+      fprintf(dev, "%s\n\n", quality);
+      fprintf(matedev, "+ \n");
+      fprintf(matedev, "%s\n\n", matequality);
+    }
+  }
+
+  FREEMEMORY(space, buffer);
+  FREEMEMORY(space, quality);
+  FREEMEMORY(space, matebuffer);
+  FREEMEMORY(space, matequality);
+  FREEMEMORY(space, polyAseq);
+  FREEMEMORY(space, editstring);
+  return ;
+}
+
+
+/*--------------------------- bl_generateGeneModel ---------------------------
+ *
+ * @brief  draw the random numbers of n gene models: an exon count per
+ *         model, a length per exon and a distance for every exon but the
+ *         first. The values are neither stored nor returned.
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_generateGeneModel (void *space, Uint n, Uint minexoncnt, Uint maxexoncnt,
+    Uint minexonlen, Uint maxexonlen, Uint minexondist, Uint maxexondist)
+{
+  Uint gene, exon, exoncnt, exonlen, exondist;
+
+  for(gene=0; gene < n; gene++) {
+    exoncnt = minexoncnt + RANDINT(maxexoncnt-minexoncnt);
+
+    for(exon=0; exon < exoncnt; exon++) {
+      exonlen = minexonlen + RANDINT(maxexonlen - minexonlen);
+
+      /* the first exon has no predecessor and hence no distance */
+      if(exon == 0) continue;
+
+      exondist = minexondist + RANDINT(maxexondist - minexondist);
+      /* never executed: only keeps the variables above "used" */
+      if (0){
+        fprintf(stderr, "exoncnt=%u\texonlen=%u\texondist%u\n",
+            exoncnt, exonlen, exondist);
+      }
+    }
+  }
+
+  return ;
+}
+
+
+
+/*----------------------- bl_fastxSimulateSpliceSites ------------------------
+ *
+ * @brief  simulate splice sites and split reads across them. For each of
+ *         the n random parent sites up to maxchildren child sites are
+ *         drawn. Every parent/child pair is reported on stdout and
+ *         covered by a random number of reads of readlen template
+ *         positions that are split between parent and child, scrambled
+ *         with substitutions, insertions and deletions, and printed in
+ *         fastq format to stderr with the edit string as read name.
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_fastxSimulateSpliceSites (void *space, char *sequence,
+    Uint seqlen, Uint n, Uint maxchildren,
+    char *alphabet, Uint alphabetsize, double acc,
+    double Pmis, double Pins, double Pdel,
+    Uint minqual, Uint maxqual,
+    double Pcis, Uint mincisdist, Uint maxcisdist,
+    double Pstrandswitch, Uint readlen)
+{
+
+  Uint parent;
+  Uint child=0;
+  Uint maxcoverage = 200;
+  Uint mincoverage = 1;
+  Uint c;
+  Uint randrange;
+  Uint i, j, k, l, e, m, effsplit;
+  int u, v;
+  Uint parentlen, lastparentlen = 0, childlen;
+  Uint readerrcnt;
+  Uint editstringlen;
+  double errtype;
+  char *buffer, *quality, *editstring, chr, errchr;
+  unsigned parentstrand, childrc=0, complement = 0;
+
+  assert(seqlen > 2*readlen);
+  randrange = seqlen - (2*readlen);
+
+  buffer = ALLOCMEMORY(space, NULL, char, 2*readlen+1);
+  quality = ALLOCMEMORY(space, NULL, char, 2*readlen+1);
+  editstring = ALLOCMEMORY(space, NULL, char, 40*(2*readlen)+1);
+
+  /*generate parent splice site*/
+  for(i=0; i < n; i++) {
+    parent = readlen + RANDINT(randrange);
+    child = 0;
+    parentstrand = (RANDUNIT > 0.5) ? 1 : 0;
+    m = RANDINT(maxchildren);
+    m = MAX(1, m);
+
+    /*generate the children*/
+    for(j=0; j < m; j++) {
+      childrc = 0;
+
+      if(RANDUNIT < Pcis) {
+        child = RANDINT(maxcisdist-mincisdist) + mincisdist;
+        if(RANDUNIT > 0.5 && parent + child + readlen < seqlen) {
+          child = parent + child;
+        } else if(parent > child + readlen) {
+          child = parent - child;
+        }
+      } else {
+        if (RANDUNIT > Pstrandswitch) {
+          child = RANDINT(maxcisdist-mincisdist) + mincisdist;
+          if(RANDUNIT > 0.5 && parent + child + readlen < seqlen) {
+            child = parent + child;
+          } else if(parent > child + readlen) {
+            child = parent - child;
+          }
+          childrc = 1;
+        } else {
+          child = readlen + RANDINT(randrange);
+        }
+      }
+
+      /* NOTE(review): both strand flags are forced to 0 here, so the
+       * reverse strand branches below are never taken - confirm */
+      parentstrand = 0;
+      childrc=0;
+      /*do the sequence work*/
+      if(parent && child) {
+        c = RANDINT(maxcoverage);
+        c = MAX(mincoverage, c);
+        fprintf(stdout, "subject\t%d\t%d\tsplicetype%d-%d\t%d\n", parent+1, child+1, parentstrand, childrc, c);
+        /*cover the splice sites*/
+        for(k=0; k < c; k++) {
+
+          memset(buffer, 0, 2*readlen+1);
+          memset(quality, 0, 2*readlen+1);
+          memset(editstring, 0, 40*(2*readlen)+1);
+
+          /* draw a split position that differs from the previous one;
+           * NOTE(review): presumably needs readlen well above 44 to
+           * terminate, otherwise every draw is clamped to 22 - confirm */
+          do {
+            parentlen = RANDINT(readlen-22);
+            parentlen = MAX(22, parentlen);
+          } while (parentlen == lastparentlen);
+
+          lastparentlen = parentlen;
+
+          childlen = readlen-parentlen;
+          readerrcnt = 0;
+          editstringlen = 0;
+          effsplit = 0;
+          e = 0;
+
+          if(!parentstrand)
+            editstringlen = sprintf(editstring, "splice:%d", parent-parentlen);
+          else
+            editstringlen = sprintf(editstring, "splice:%d", parent + 1);
+
+
+          if((parentstrand && childrc) || (!parentstrand && !childrc)) {
+            editstringlen += sprintf(&editstring[editstringlen], "-%d;%d;", child-childlen, k);
+          } else {
+            editstringlen += sprintf(&editstring[editstringlen], "-%d;%d;", child+1, k);
+          }
+
+
+          /*construct the sequence*/
+          for(l=0; l < parentlen+childlen; l++) {
+
+            /* position in the read at which the child part begins */
+            if (l == parentlen) effsplit = (e > 0) ? e-1: 0;
+
+            /* v: anchor in the sequence, u: signed offset from it */
+            if(l < parentlen) {
+              if(!parentstrand) {
+                v = parent - parentlen + 1;
+                u = l;
+                complement = 0;
+              } else {
+                v = parent + parentlen - 1;
+                u = -1*l;
+                complement = 1;
+              }
+            } else {
+              if((parentstrand && childrc) || (!parentstrand && !childrc)) {
+                v = child;
+                u = l-parentlen;
+                complement = 0;
+              } else {
+                v = child;
+                u = -1*(l-parentlen);
+                complement = 1;
+              }
+            }
+
+            chr=sequence[v+u];
+            if(complement) chr = charComplementChar(chr);
+
+            /* NOTE(review): bl_fastxScramble indexes the alphabet with
+             * RANDINT(alphabetsize-1), this code with RANDINT(alphabetsize);
+             * one of the two is presumably off by one - confirm */
+            if (RANDUNIT > acc) {
+              errtype = RANDUNIT;
+              readerrcnt++;
+              if (errtype <= Pmis) {
+                errchr = chr;
+                editstringlen += sprintf(&editstring[editstringlen], "%d:S;",u);
+                while(errchr == chr) errchr = alphabet[RANDINT(alphabetsize)];
+                quality[e] = minqual + RANDINT(maxqual - minqual);
+                buffer[e++] = errchr;
+              } else if (errtype <= Pmis + Pins) {
+                errchr = alphabet[RANDINT(alphabetsize)];
+                editstringlen += sprintf(&editstring[editstringlen], "%d:I;",u);
+                quality[e] = minqual + RANDINT(maxqual - minqual);
+                buffer[e++] = errchr;
+                quality[e] = minqual + RANDINT(maxqual - minqual);
+                buffer[e++] = chr;
+              } else {
+                editstringlen += sprintf(&editstring[editstringlen], "%d:D;",u);
+              }
+            } else {
+              quality[e] = minqual + RANDINT(maxqual - minqual);
+              buffer[e++] = chr;
+            }
+
+          } /*construct the sequence*/
+
+
+          if(!parentstrand) {
+            editstringlen += sprintf(&editstring[editstringlen], "(+:");
+          } else {
+            editstringlen += sprintf(&editstring[editstringlen], "(-:");
+          }
+
+          if((parentstrand && childrc) || (!parentstrand && !childrc)) {
+            editstringlen += sprintf(&editstring[editstringlen], "+)");
+          } else {
+            editstringlen += sprintf(&editstring[editstringlen], "-)");
+          }
+
+
+          assert(effsplit <= strlen(buffer));
+          fprintf(stderr, "@%s\n%s\n+%s\n%s\n", editstring, buffer, editstring, quality);
+
+        } /*end of coverage loop*/
+      } /*end of sequence work*/
+
+    }
+    fprintf(stderr, "\n");
+  }
+
+  /* the work buffers are private to this function */
+  FREEMEMORY(space, buffer);
+  FREEMEMORY(space, quality);
+  FREEMEMORY(space, editstring);
+
+  return ;
+}
+
+/*----------------------- bl_fastxPrintRandomSplitReads ----------------------
+ *
+ * @brief  print n simulated split reads drawn from sequence to device dev
+ * @author Steve Hoffmann
+ *
+ * Each read is the concatenation of two segments of `sequence`. The first
+ * segment starts at a random position `start`, the second at a position
+ * `start2 >= start + spltlen`. Both segment lengths are drawn as
+ * minspltlen + RANDINT(maxspltlen - minspltlen). While the segments are
+ * copied, an error is introduced at a position whenever RANDUNIT > acc:
+ * a substitution if the second draw is <= Pmis, an insertion if it is
+ * <= Pmis + Pins, a deletion otherwise. Depending on three random flags
+ * the whole read or one side of the split is replaced by the output of
+ * charDNAcomplement.
+ *
+ * @param dev           output stream
+ * @param sequence      reference sequence the segments are copied from
+ * @param seqlen        length of sequence; must exceed 100 and 5*maxspltlen
+ * @param n             number of reads to print
+ * @param minspltlen    minimum segment length
+ * @param maxspltlen    maximum segment length
+ * @param alphabet      characters used for substitutions and insertions
+ * @param alphabetsize  number of characters in alphabet; must exceed 1
+ * @param acc           an error is introduced whenever RANDUNIT > acc
+ * @param Pmis          upper bound of the error draw for a substitution
+ * @param Pins          Pmis + Pins is the upper bound for an insertion
+ * @param Pdel          not read by this function
+ * @param fastq         if set, '@' headers and quality lines are printed
+ * @param minqual       quality characters are minqual + RANDINT(maxqual-minqual)
+ * @param maxqual       see minqual
+ * @param five          optional 5' sequence printed before the read (or NULL)
+ * @param fivelen       length of five
+ * @param three         optional 3' sequence printed after the read (or NULL)
+ * @param threelen      length of three
+ * @param polyAlen      nominal length of the polyA stretch (0 disables it)
+ *
+ * fiveacc and threeacc are not declared in this function; they are taken
+ * from file scope.
+ *
+ * NOTE(review): editstring is assembled but never printed here, and
+ * fiveseqlen/threeseqlen are not reset per read (polyAseqlen is) -- confirm
+ * whether both are intended.
+ */
+
+void
+bl_fastxPrintRandomSplitReads (FILE *dev, char *sequence,
+    Uint seqlen, Uint n,
+    Uint minspltlen, Uint maxspltlen,
+    char *alphabet, Uint alphabetsize,
+    double acc,
+    double Pmis, double Pins, double Pdel,
+    unsigned char fastq,
+    Uint minqual, Uint maxqual,
+    char *five, Uint fivelen,
+    char *three, Uint threelen, Uint polyAlen)
+{
+  char *buffer, *rc, *quality, *fivebuffer=NULL, *fivequality=NULL,
+       *threebuffer=NULL, *threequality=NULL,
+       *polyAbuffer=NULL, *polyAquality=NULL, *polyAseq=NULL,
+       *editstring, startchr='>', chr, errchr=0;
+  unsigned char hasReverse, isPartial, inDownstream;
+  double errtype;
+  int i, j, u, q, k, start, start2, spltlen, spltlen2,effsplit=0;
+  Uint threeseqlen=threelen, fiveseqlen=fivelen, polyAseqlen=polyAlen,
+       readerrcnt=0, editstringlen=0;
+
+  assert(maxspltlen >= minspltlen);
+  assert(seqlen > 100 && seqlen > 5*maxspltlen);
+  /* a substitution must be able to find a character different from chr */
+  assert(alphabetsize > 1);
+
+  /* FASTQ records start with '@' (same convention as bl_fastxPrintRandomReads) */
+  if (fastq) startchr = '@';
+
+  srand((unsigned int)time(NULL));
+  if (five) {
+    fivebuffer = ALLOCMEMORY(space, NULL, char, 2*fivelen+3);
+    fivequality = ALLOCMEMORY(space, NULL, char, 2*fivelen+3);
+  }
+
+  buffer = ALLOCMEMORY(space, NULL, char, 2*maxspltlen+3);
+  quality = ALLOCMEMORY(space, NULL, char, 2*maxspltlen+3);
+  editstring = ALLOCMEMORY(space, NULL, char, 40*(2*maxspltlen+polyAlen+threelen+fivelen));
+
+  if(polyAlen) {
+    polyAbuffer = ALLOCMEMORY(space, NULL, char, 2*polyAlen+3);
+    polyAquality = ALLOCMEMORY(space, NULL, char, 2*polyAlen+3);
+  }
+
+  if (three) {
+    threebuffer = ALLOCMEMORY(space, NULL, char, 2*threelen+3);
+    threequality = ALLOCMEMORY(space, NULL, char, 2*threelen+3);
+  }
+
+  /* template for the polyA stretch; up to 2*polyAlen characters are used */
+  polyAseq = ALLOCMEMORY(space, NULL, char, 2*polyAlen);
+  memset(polyAseq, 'A', 2*polyAlen);
+
+  for (i=0; i < n; i++) {
+    start=seqlen;
+    memset(buffer, 0, 2*maxspltlen+3);
+    memset(quality, 0, 2*maxspltlen+3);
+    memset(editstring, 0, 40*(2*maxspltlen+polyAlen+threelen+fivelen));
+    editstringlen = 0;
+    readerrcnt = 0;
+    polyAseqlen = polyAlen;
+
+    hasReverse = isPartial = inDownstream = 0;
+    if(RANDINT(10) > 5) hasReverse = 1;
+    if(RANDINT(10) > 5) isPartial = 1;
+    if(RANDINT(10) > 5) inDownstream = 1;
+
+    /* redraw until both segments fit in front of the end of the sequence */
+    while (start > seqlen-(2*maxspltlen)) start = RANDINT(seqlen);
+    spltlen = minspltlen + RANDINT(maxspltlen - minspltlen);
+    start2 = start + spltlen + RANDINT(seqlen - start - spltlen - maxspltlen);
+    spltlen2 = minspltlen + RANDINT(maxspltlen - minspltlen);
+
+    if(RANDUNIT > fiveacc) {
+      if(RANDUNIT > .5) {
+        fiveseqlen = fivelen - RANDINT(fivelen);
+      }
+    }
+
+    if(RANDUNIT > threeacc) {
+      if(RANDUNIT > .5) {
+        threeseqlen = threelen - RANDINT(threelen);
+      }
+    }
+
+    if(RANDUNIT > acc) {
+      if(RANDUNIT > .5) {
+        polyAseqlen = polyAlen - RANDINT(polyAlen);
+      } else {
+        polyAseqlen = polyAlen + RANDINT(polyAlen);
+      }
+    }
+
+    fprintf(dev, "%c%d %d(len: %d)-", startchr, i, start+1, spltlen);
+    fprintf(dev, "%d(len: %d) ", start2+1, spltlen2);
+    u = 0;
+
+    /* copy both segments; k runs over the read, j within the current segment */
+    for (k=0; k < spltlen+spltlen2; k++) {
+      j = (k < spltlen) ? k : k-spltlen;
+      q = (k < spltlen) ? start : start2;
+      /* remember where the first segment ends inside the output buffer */
+      if (k == spltlen) effsplit = (u > 0) ? u-1: 0;
+
+      chr=sequence[q+j];
+
+      if (RANDUNIT > acc) {
+        errtype = RANDUNIT;
+        readerrcnt++;
+        if (errtype <= Pmis) {
+          errchr = chr;
+          /* append to the edit string instead of overwriting its start */
+          editstringlen += sprintf(&editstring[editstringlen], "%d:S;",j);
+          /* RANDINT is used with an inclusive upper bound elsewhere in this
+           * file (e.g. RANDINT(alphabetsize-1)), hence the -1 */
+          while(errchr == chr)
+            errchr = alphabet[RANDINT(alphabetsize-1)];
+          quality[u] = minqual + RANDINT(maxqual - minqual);
+          buffer[u++] = errchr;
+        } else if (errtype <= Pmis + Pins) {
+          errchr = alphabet[RANDINT(alphabetsize-1)];
+          editstringlen += sprintf(&editstring[editstringlen], "%d:I;",j);
+          quality[u] = minqual + RANDINT(maxqual - minqual);
+          buffer[u++] = errchr;
+          quality[u] = minqual + RANDINT(maxqual - minqual);
+          buffer[u++] = chr;
+        } else {
+          editstringlen += sprintf(&editstring[editstringlen], "%d:D;",j);
+        }
+      } else {
+        quality[u] = minqual + RANDINT(maxqual - minqual);
+        buffer[u++] = chr;
+      }
+    }
+
+    assert(u-1-effsplit >= 0);
+    assert(effsplit <= strlen(buffer));
+
+    if (hasReverse) {
+      if(isPartial) {
+        if(inDownstream) {
+          /* only the part behind the split is replaced */
+          rc = charDNAcomplement(NULL, &buffer[effsplit], strlen(buffer)-effsplit);
+          memmove(&buffer[effsplit], rc, strlen(buffer)-effsplit);
+          FREEMEMORY(space, rc);
+          editstringlen += sprintf(&editstring[editstringlen], " (+/-)");
+        } else {
+          /* only the part in front of the split is replaced */
+          rc = charDNAcomplement(NULL, buffer, effsplit);
+          memmove(buffer, rc, effsplit);
+          FREEMEMORY(space, rc);
+          editstringlen += sprintf(&editstring[editstringlen], " (-/+)");
+        }
+      } else {
+        /* the whole read is replaced */
+        rc = charDNAcomplement(NULL, buffer, strlen(buffer));
+        memmove(buffer, rc, strlen(buffer));
+        editstringlen += sprintf(&editstring[editstringlen], " (-/-)");
+        FREEMEMORY(space, rc);
+      }
+    } else {
+      editstringlen += sprintf(&editstring[editstringlen], " (+/+)");
+    }
+
+    editstring[editstringlen++] = '|';
+    if (five) {
+      memset(fivebuffer, 0, 2*fivelen+3);
+      memset(fivequality, 0, 2*fivelen+3);
+      u = bl_fastxScramble (fivebuffer, fivequality, five, fiveseqlen,
+          fiveacc, Pmis, Pins, 0, 0, alphabet, alphabetsize, minqual, maxqual,
+          editstring, &editstringlen, &readerrcnt);
+    }
+
+    editstring[editstringlen++] = '|';
+    if(polyAlen) {
+      memset(polyAbuffer, 0, 2*polyAlen+3);
+      memset(polyAquality, 0, 2*polyAlen+3);
+      u = bl_fastxScramble (polyAbuffer, polyAquality, polyAseq, polyAseqlen,
+          acc, Pmis, Pins, 0, 0, alphabet, alphabetsize, minqual, maxqual,
+          editstring, &editstringlen, &readerrcnt);
+    }
+
+    editstring[editstringlen++] = '|';
+    if(three) {
+      memset(threebuffer, 0, 2*threelen+3);
+      memset(threequality, 0, 2*threelen+3);
+      u = bl_fastxScramble (threebuffer, threequality, three, threeseqlen,
+          threeacc, Pmis, Pins, 0, 0, alphabet, alphabetsize, minqual, maxqual,
+          editstring, &editstringlen, &readerrcnt);
+    }
+
+    fprintf(dev, " (r: %f)\n", (double) readerrcnt/(spltlen+spltlen2+1.0));
+    if (five) fprintf(dev, "%s", fivebuffer);
+    fprintf(dev, "%s", buffer);
+    if (polyAlen) fprintf(dev, "%s", polyAbuffer);
+    if (three) fprintf(dev, "%s", threebuffer);
+    fprintf(dev, "\n");
+
+    if(fastq) {
+      fprintf(dev, "+ \n");
+      if(five) fprintf(dev,"%s", fivequality);
+      fprintf(dev, "%s", quality);
+      /* the polyA stretch is part of the sequence line, so its qualities
+       * have to be part of the quality line as well */
+      if(polyAlen) fprintf(dev, "%s", polyAquality);
+      if(three) fprintf(dev, "%s", threequality);
+      fprintf(dev, "\n\n");
+    }
+  }
+
+  if(five) {
+    FREEMEMORY(space, fivebuffer);
+    FREEMEMORY(space, fivequality);
+  }
+
+  FREEMEMORY(space, buffer);
+  FREEMEMORY(space, quality);
+  FREEMEMORY(space, editstring);
+
+  if(three) {
+    FREEMEMORY(space, threebuffer);
+    FREEMEMORY(space, threequality);
+  }
+  if (polyAlen) {
+    FREEMEMORY(space, polyAbuffer);
+    FREEMEMORY(space, polyAquality);
+  }
+  FREEMEMORY(space, polyAseq);
+  return ;
+}
+
+
+/*------------------------- bl_fastxPrintRandomReads -------------------------
+ *
+ * @brief  print n simulated reads drawn from sequence to device reads
+ * @author Steve Hoffmann
+ *
+ * For every read a length len = minlen + RANDINT(maxlen - minlen) and a
+ * start = RANDINT(seqlen - len) are drawn. The read is assembled in one
+ * buffer by four consecutive bl_fastxScramble calls: 5' sequence, the
+ * reference segment, a polyA stretch and the 3' sequence. The edit strings
+ * of the four parts are separated by '|' and printed in the header line.
+ *
+ * @param reads         output stream
+ * @param sequence      reference sequence
+ * @param seqlen        length of sequence; must exceed maxlen
+ * @param n             number of reads to print
+ * @param minlen        minimum read length
+ * @param maxlen        maximum read length
+ * @param alphabet      alphabet handed to bl_fastxScramble
+ * @param alphabetsize  size of alphabet; must exceed 1
+ * @param acc           accuracy handed to bl_fastxScramble for the read and
+ *                      the polyA stretch
+ * @param Pmis          handed to bl_fastxScramble
+ * @param Pins          handed to bl_fastxScramble
+ * @param Pdel          only checked: Pmis + Pins + Pdel has to be 1
+ * @param fastq         if set, '@' headers and quality lines are printed;
+ *                      requires 33 <= minqual and maxqual <= 126
+ * @param minqual       handed to bl_fastxScramble
+ * @param maxqual       handed to bl_fastxScramble
+ * @param five          5' sequence handed to bl_fastxScramble
+ * @param fivelen       length of five
+ * @param three         3' sequence handed to bl_fastxScramble
+ * @param threelen      length of three
+ * @param polyAlen      nominal length of the polyA stretch
+ *
+ * fiveacc and threeacc are not declared in this function; they are taken
+ * from file scope.
+ *
+ * NOTE(review): fiveseqlen/threeseqlen are not reset per read (polyAseqlen
+ * is), so a shortened adapter length carries over to later reads -- confirm
+ * whether this is intended.
+ */
+
+void
+bl_fastxPrintRandomReads(
+    FILE *reads,
+    char *sequence,
+    Uint seqlen,
+    Uint n,
+    Uint minlen,
+    Uint maxlen,
+    char *alphabet, Uint alphabetsize,
+    double acc,
+    double Pmis, double Pins, double Pdel,
+    unsigned char fastq,
+    Uint minqual, Uint maxqual,
+    char *five, Uint fivelen,
+    char *three, Uint threelen, Uint polyAlen)
+{
+
+  Uint i, k, start, len, readerrcnt, fiveseqlen=fivelen, threeseqlen=threelen,
+       polyAseqlen=0, editstringlen=0, bufsize, editsize;
+  char *quality, *buffer, *editstring, *polyAseq, startchr = '>';
+
+  if (fastq) {
+    startchr = '@';
+    assert(maxqual <= 126 && minqual >=33);
+  }
+
+  assert(seqlen > maxlen);
+  assert(minlen <= maxlen);
+  assert(alphabetsize > 1);
+  /* tolerate rounding: decimal fractions such as 0.6, 0.3 and 0.1 need not
+   * add up to exactly 1.0 in binary floating point */
+  assert(Pmis + Pins + Pdel > 1.0 - 1e-9 && Pmis + Pins + Pdel < 1.0 + 1e-9);
+
+  srand((unsigned int)time(NULL));
+
+  /* one extra byte so that the buffers printed with %s always keep a
+   * terminating 0 */
+  bufsize = 2*(maxlen+fivelen+threelen+polyAlen)+1;
+  editsize = 40*(maxlen+fivelen+threelen+polyAlen)+1;
+
+  quality = ALLOCMEMORY(space, NULL, char, bufsize);
+  buffer = ALLOCMEMORY(space, NULL, char, bufsize);
+  editstring = ALLOCMEMORY(space, NULL, char, editsize);
+  /* template for the polyA stretch; up to 2*polyAlen characters are used */
+  polyAseq = ALLOCMEMORY(space, NULL, char, 2*polyAlen);
+  memset(polyAseq, 'A', 2*polyAlen);
+
+  for(i=0; i < n; i++){
+    memset(buffer, 0, sizeof(char)*bufsize);
+    memset(quality, 0, sizeof(char)*bufsize);
+    memset(editstring, 0, sizeof(char)*editsize);
+    editstringlen = 0;
+    polyAseqlen = polyAlen;
+
+    len = minlen + RANDINT(maxlen - minlen);
+    start = RANDINT(seqlen-len);
+
+    if(RANDUNIT > fiveacc) {
+      if(RANDUNIT > .5) {
+        fiveseqlen = fivelen - RANDINT(fivelen);
+      }
+    }
+
+    if(RANDUNIT > threeacc) {
+      if(RANDUNIT > .5) {
+        threeseqlen = threelen - RANDINT(threelen);
+      }
+    }
+
+    if(RANDUNIT > acc) {
+      if(RANDUNIT > .5) {
+        polyAseqlen = polyAlen - RANDINT(polyAlen);
+      } else {
+        polyAseqlen = polyAlen + RANDINT(polyAlen);
+      }
+    }
+    /* i, start and len are unsigned */
+    fprintf(reads, "%c%u %u (len: %u) ", startchr, i, start+1, len);
+
+    readerrcnt=0;
+    k=0;
+
+    /* k is the write offset into buffer/quality after each part */
+    k = bl_fastxScramble (buffer, quality, five, fiveseqlen, fiveacc, Pmis, Pins,
+        k, 0, alphabet, alphabetsize, minqual, maxqual,
+        editstring, &editstringlen, &readerrcnt);
+
+    editstring[editstringlen++] = '|';
+
+    k = bl_fastxScramble (buffer, quality, sequence, len, acc, Pmis, Pins,
+        k, start, alphabet, alphabetsize, minqual, maxqual,
+        editstring, &editstringlen, &readerrcnt);
+
+    editstring[editstringlen++] = '|';
+
+    k = bl_fastxScramble (buffer, quality, polyAseq, polyAseqlen, acc, Pmis, Pins,
+        k, 0, alphabet, alphabetsize, minqual, maxqual,
+        editstring, &editstringlen, &readerrcnt);
+
+    editstring[editstringlen++] = '|';
+
+    k = bl_fastxScramble (buffer, quality, three, threeseqlen, threeacc, Pmis, Pins,
+        k, 0, alphabet, alphabetsize, minqual, maxqual,
+        editstring, &editstringlen, &readerrcnt);
+
+    fprintf(reads, "%s", editstring);
+    fprintf(reads, " (r: %f)\n", (double) readerrcnt/(len+1.0));
+    fprintf(reads, "%s\n", buffer);
+    if (fastq) {
+      fprintf(reads, "+\n");
+      fprintf(reads, "%s\n", quality);
+    }
+  }
+
+  FREEMEMORY(space, buffer);
+  FREEMEMORY(space, quality);
+  FREEMEMORY(space, polyAseq);
+  FREEMEMORY(space, editstring);
+
+  return ;
+}
+
+
+/*---------------------------- bl_fastaPrintRandom ---------------------------
+ *
+ * @brief  write noofseqs random fasta records to device dev
+ * @author Steve Hoffmann
+ *
+ * The random generator is seeded with the current time. Every record gets
+ * a header ">random sequence <index> (len: <length>)" and a single sequence
+ * line. The length is minlen plus a random fraction of (maxlen - minlen);
+ * every symbol has the character code 65 plus a random value below 62.
+ *
+ * @param dev       output stream
+ * @param noofseqs  number of records to write
+ * @param minlen    minimum sequence length; must not exceed maxlen
+ * @param maxlen    upper bound of the sequence length
+ */
+
+void
+bl_fastaPrintRandom(FILE *dev, Uint noofseqs, Uint minlen, Uint maxlen)
+{
+  Uint seqno, left, seqlength, symbol;
+
+  assert(minlen <= maxlen);
+  srand((unsigned int)time(NULL));
+
+  seqno = 0;
+  while(seqno < noofseqs) {
+    seqlength = minlen + ((maxlen - minlen) * (rand() / (RAND_MAX + 1.0)));
+    fprintf(dev, ">random sequence %d (len: %d)\n", seqno, seqlength);
+
+    /* one random draw per emitted symbol */
+    for(left = seqlength; left > 0; left--) {
+      symbol = 65 + (62 * (rand() / (RAND_MAX + 1.0)));
+      fprintf(dev, "%c", symbol);
+    }
+
+    fprintf(dev,"\n");
+    seqno++;
+  }
+}
+
+
+
+
+/*------------------------- bl_getTrackFromGeneModel -------------------------
+ *
+ * @brief  build an annotation track from a gene model
+ * @author Steve Hoffmann
+ *
+ * One track item is created per gene. Item start/end are taken from the
+ * first exon's start and the last exon's end, thickStart/thickEnd from the
+ * start and stop codon. chromname and name are private copies of the first
+ * exon's refchr and of the gene id.
+ *
+ * For every exon a block is stored. If the exon lies on the item's
+ * chromosome and strand, blockRefseqs[j] is NULL and blockStarts[j] is
+ * relative to the item start. Otherwise blockRefseqs[j] holds a copy of the
+ * exon's refchr and blockStarts[j] is the exon's own start.
+ *
+ * @param space  handed through to ALLOCMEMORY
+ * @param set    gene model to convert; every gene is expected to have at
+ *               least one exon (exons[0] is read unconditionally)
+ * @return newly allocated track with set->noofgenes items
+ */
+
+annotationtrack_t *
+bl_getTrackFromGeneModel (void *space, geneset_t *set)
+{
+  Uint i, j;
+  size_t len;
+  gene_t *gene;
+  annotationitem_t *item;
+  annotationtrack_t *track;
+
+  track = ALLOCMEMORY(space, NULL, annotationtrack_t, 1);
+  track->items = ALLOCMEMORY(space, NULL, annotationitem_t, set->noofgenes);
+
+  for(i=0; i < set->noofgenes; i++) {
+    gene = &set->genes[i];
+    item = &track->items[i];
+
+    item->start = gene->exons[0].start;
+    item->end = gene->exons[gene->noofexons-1].end;
+    item->strand = gene->direction;
+    item->thickStart = gene->startcodon;
+    item->thickEnd = gene->stopcodon;
+    item->blockCount = gene->noofexons;
+    item->score = 0;
+
+    item->blockSizes = ALLOCMEMORY(space, NULL, Uint, gene->noofexons);
+    item->blockStarts = ALLOCMEMORY(space, NULL, Uint, gene->noofexons);
+    item->blockRefseqs = ALLOCMEMORY(space, NULL, char*, gene->noofexons);
+    item->blockStrands = ALLOCMEMORY(space, NULL, char, gene->noofexons);
+
+    /* itemRgb has exactly three entries, all cleared here; it must not be
+     * indexed by the exon number */
+    item->itemRgb = ALLOCMEMORY(space, NULL, Uint, 3);
+    memset(item->itemRgb, 0, 3*sizeof(Uint));
+
+    len = strlen(gene->exons[0].refchr);
+    item->chromname = ALLOCMEMORY(space, NULL, char, len+1);
+    memmove(item->chromname, gene->exons[0].refchr, len);
+    item->chromname[len] = 0;
+
+    /* +1: room for the terminating 0 written below */
+    len = strlen(gene->id);
+    item->name = ALLOCMEMORY(space, NULL, char, len+1);
+    memmove(item->name, gene->id, len);
+    item->name[len] = 0;
+
+    for(j=0; j < gene->noofexons; j++) {
+      item->blockSizes[j] = gene->exons[j].end - gene->exons[j].start + 1;
+      item->blockStrands[j] = gene->exons[j].strand;
+
+      if(strcmp(gene->exons[j].refchr, item->chromname) ||
+          item->blockStrands[j] != item->strand) {
+        /* exon on another chromosome or strand: keep its own reference
+         * name and its absolute start */
+        len = strlen(gene->exons[j].refchr);
+        item->blockRefseqs[j] = ALLOCMEMORY(space, NULL, char, len+1);
+        memmove(item->blockRefseqs[j], gene->exons[j].refchr, len);
+        item->blockRefseqs[j][len] = 0;
+
+        item->blockStarts[j] = gene->exons[j].start;
+      } else {
+        item->blockRefseqs[j] = NULL;
+
+        item->blockStarts[j] = gene->exons[j].start - item->start;
+      }
+    }
+  }
+
+  track->noofitems = set->noofgenes;
+  return track;
+}
+
+
+/*------------------------ bl_getGeneModelFromBEDtrack -----------------------
+ *
+ * @brief  build a gene model from an annotation track
+ * @author Steve Hoffmann
+ *
+ * One gene is created per track item and one exon per block. Strings are
+ * not copied: gene ids and exon refchr pointers refer to the track's name,
+ * chromname and blockRefseqs entries. A block with a blockRefseqs entry
+ * keeps its blockStarts value as exon start; for all other blocks the item
+ * start is added. No cds information is attached to the exons.
+ *
+ * @param space  handed through to ALLOCMEMORY
+ * @param track  annotation track to convert
+ * @return newly allocated gene set with track->noofitems genes
+ */
+
+geneset_t *
+bl_getGeneModelFromBEDtrack (void *space, annotationtrack_t *track)
+{
+  Uint g, e, itemstart, nblocks;
+  Uint *sizes;
+  Uint *offsets;
+  gene_t *genearr;
+  geneset_t *result;
+
+  result = ALLOCMEMORY(space, NULL, geneset_t, 1);
+  genearr = ALLOCMEMORY(space, NULL, gene_t, track->noofitems);
+
+  for(g=0; g < track->noofitems; g++) {
+
+    itemstart = track->items[g].start;
+    nblocks = track->items[g].blockCount;
+    sizes = track->items[g].blockSizes;
+    offsets = track->items[g].blockStarts;
+
+    genearr[g].id = track->items[g].name;
+    genearr[g].startcodon = track->items[g].thickStart;
+    genearr[g].stopcodon = track->items[g].thickEnd;
+    genearr[g].noofexons = nblocks;
+    genearr[g].direction = track->items[g].strand;
+    genearr[g].exons = ALLOCMEMORY(space, NULL, exon_t, nblocks);
+
+    e = 0;
+    while(e < nblocks) {
+
+      if (!track->items[g].blockRefseqs[e]) {
+        /* block on the item's own chromosome: start is relative */
+        genearr[g].exons[e].refchr = track->items[g].chromname;
+        genearr[g].exons[e].start = itemstart + offsets[e];
+      } else {
+        /* block with its own reference: start is absolute */
+        genearr[g].exons[e].refchr = track->items[g].blockRefseqs[e];
+        genearr[g].exons[e].start = offsets[e];
+      }
+
+      genearr[g].exons[e].strand = track->items[g].blockStrands[e];
+      genearr[g].exons[e].end = genearr[g].exons[e].start + sizes[e] -1;
+      genearr[g].exons[e].noofcds = 0;
+      genearr[g].exons[e].cds = NULL;
+
+      e++;
+    }
+  }
+
+  result->noofgenes = track->noofitems;
+  result->genes = genearr;
+
+  return result;
+}
+
+
+/*------------------------------- bl_copyGene --------------------------------
+ *
+ * @brief  copy gene from into gene to
+ * @author Steve Hoffmann
+ *
+ * The scalar fields are copied and a new exon array is allocated for the
+ * target. The id and the exons' refchr pointers are shared with the source,
+ * not duplicated. The cds information of the exons is not copied: noofcds
+ * is set to 0 and cds to NULL.
+ *
+ * @param space  handed through to ALLOCMEMORY
+ * @param to     target gene; its fields are overwritten
+ * @param from   source gene
+ */
+
+void
+bl_copyGene (void *space, gene_t *to, gene_t *from)
+{
+  Uint idx;
+  exon_t *dst;
+
+  /* gene level fields */
+  to->id = from->id;
+  to->startcodon = from->startcodon;
+  to->stopcodon = from->stopcodon;
+  to->noofexons = from->noofexons;
+  to->direction = from->direction;
+
+  /* exon level fields */
+  to->exons = ALLOCMEMORY(space, NULL, exon_t, from->noofexons);
+
+  idx = 0;
+  while(idx < from->noofexons) {
+    dst = &to->exons[idx];
+    dst->refchr = from->exons[idx].refchr;
+    dst->strand = from->exons[idx].strand;
+    dst->start = from->exons[idx].start;
+    dst->end = from->exons[idx].end;
+    dst->noofcds = 0;
+    dst->cds = NULL;
+    idx++;
+  }
+}
+
+/*------------------------- bl_simulateTransSplicing -------------------------
+ *
+ * @brief  simulate n random trans-splicing events from a given gene model
+ * @author Steve Hoffmann
+ *
+ * For every event a random gene of set is copied with bl_copyGene and then
+ * modified according to type:
+ *
+ *   'S'  the strand of one random exon is flipped; for single-exon genes
+ *        the gene direction is flipped as well
+ *   'D'  one random exon of a second random gene is appended as an
+ *        additional last exon
+ *
+ * For any other type the gene is zero-initialised (no exons), so that the
+ * returned set never contains uninitialised entries.
+ *
+ * @param space  handed through to ALLOCMEMORY and bl_copyGene
+ * @param set    gene model to draw from; expected to hold at least one gene
+ * @param type   event type, 'S' or 'D'
+ * @param n      number of events to simulate
+ * @return newly allocated gene set with n genes
+ */
+
+geneset_t*
+bl_simulateTransSplicing (void *space, geneset_t *set, char type, Uint n)
+{
+  geneset_t *transset;
+  gene_t *genes, *gene;
+  Uint i, u, v, w;
+
+  transset = ALLOCMEMORY(space, NULL, geneset_t, 1);
+  genes = ALLOCMEMORY(space, NULL, gene_t, n);
+
+  for(i=0; i < n; i++) {
+    gene = &genes[i];
+    u = RANDINT(set->noofgenes-1);
+
+    switch(type) {
+
+      case 'S':
+        bl_copyGene(space, gene, &set->genes[u]);
+        w = RANDINT(gene->noofexons-1);
+
+        if(gene->exons[w].strand == '+') {
+          gene->exons[w].strand = '-';
+          if(gene->noofexons == 1) {
+            gene->direction = '-';
+          }
+        } else {
+          gene->exons[w].strand = '+';
+          if(gene->noofexons == 1) {
+            gene->direction = '+';
+          }
+        }
+
+        break;
+
+      case 'D':
+        bl_copyGene(space, gene, &set->genes[u]);
+        v = RANDINT(set->noofgenes-1);
+        w = RANDINT(set->genes[v].noofexons-1);
+
+        /* grow the copied exon array by one and append the foreign exon */
+        gene->exons =
+          ALLOCMEMORY(space, gene->exons, exon_t, gene->noofexons+1);
+        gene->exons[gene->noofexons].refchr = set->genes[v].exons[w].refchr;
+        gene->exons[gene->noofexons].strand = set->genes[v].exons[w].strand;
+        gene->exons[gene->noofexons].start = set->genes[v].exons[w].start;
+        gene->exons[gene->noofexons].end = set->genes[v].exons[w].end;
+        gene->exons[gene->noofexons].noofcds = 0;
+        gene->exons[gene->noofexons].cds = NULL;
+
+        gene->noofexons++;
+        break;
+
+      default:
+        /* unknown event type: hand back a defined, empty gene instead of
+         * uninitialised memory */
+        memset(gene, 0, sizeof(gene_t));
+        break;
+    }
+  }
+
+  transset->noofgenes = n;
+  transset->genes = genes;
+  return transset;
+}
+
+
+/*-------------------------- bl_printSplicingEdges ---------------------------
+ *
+ * @brief  print the splice junctions between consecutive exons of all genes
+ * @author Steve Hoffmann
+ *
+ * The exons of a gene are visited in array order if its direction is '+'
+ * and in reverse array order otherwise. For every pair of consecutive
+ * exons one tab separated line is written:
+ *
+ *   chrA  posA  strandA  chrB  posB  strandB
+ *
+ * posA is the end of the first exon (its start if the exon strand is '-'),
+ * posB the start of the following exon (its end if the exon strand is '-').
+ *
+ * @param space  not used
+ * @param dev    output stream
+ * @param set    gene model
+ */
+
+void
+bl_printSplicingEdges (void *space, FILE *dev, geneset_t *set)
+{
+  Uint g, step, cur, nxt;
+  Uint curpos, nxtpos;
+  char *curchr, *nxtchr;
+
+  for(g=0; g < set->noofgenes; g++) {
+    for(step=0; step < set->genes[g].noofexons; step++) {
+
+      /* the last exon in transcription order has no successor */
+      if(!(step < set->genes[g].noofexons-1)) continue;
+
+      if(set->genes[g].direction == '+') {
+        cur = step;
+        nxt = step+1;
+      } else {
+        cur = set->genes[g].noofexons - step - 1;
+        nxt = set->genes[g].noofexons - step - 2;
+      }
+
+      curchr = set->genes[g].exons[cur].refchr;
+      curpos = (set->genes[g].exons[cur].strand == '-') ?
+        set->genes[g].exons[cur].start : set->genes[g].exons[cur].end;
+
+      nxtchr = set->genes[g].exons[nxt].refchr;
+      nxtpos = (set->genes[g].exons[nxt].strand == '-') ?
+        set->genes[g].exons[nxt].end : set->genes[g].exons[nxt].start;
+
+      fprintf(dev, "%s\t%d\t%c\t%s\t%d\t%c\n", curchr, curpos,
+          set->genes[g].exons[cur].strand, nxtchr, nxtpos,
+          set->genes[g].exons[nxt].strand);
+    }
+  }
+}
+
+/*---------------------------- bl_getGeneSequence ----------------------------
+ *
+ * @brief  concatenate the exon sequences of a gene
+ * @author Steve Hoffmann
+ *
+ * The exons are visited in array order, or in reverse array order if the
+ * gene direction is '-'. For each exon the interval [start, end] is taken
+ * from the reference sequence named by the exon's refchr. Exons on the '+'
+ * strand are copied as they are; for all other exons the output of
+ * charDNAcomplement is appended instead.
+ *
+ * @param space      handed through to ALLOCMEMORY and charDNAcomplement
+ * @param reference  fasta container the exon sequences are read from
+ * @param gene       gene whose exons are concatenated
+ * @return newly allocated, 0-terminated sequence; NULL if the gene has no
+ *         exons. The caller owns the returned buffer.
+ */
+
+char*
+bl_getGeneSequence(void *space, fasta_t *reference, gene_t *gene)
+{
+  Uint i, start, end, seqlen = 0, id=0, k = 0;
+  char *chr;
+  char *sequence = NULL, *buffer;
+
+  for(i=0; i < gene->noofexons; i++) {
+
+    if(gene->direction == '-') {
+      k = gene->noofexons - 1 - i;
+    } else {
+      k = i;
+    }
+
+    start = gene->exons[k].start;
+    end = gene->exons[k].end;
+
+    /* room for the exon plus the terminating 0 */
+    sequence = ALLOCMEMORY(space, sequence, char, seqlen+(end-start)+2);
+    id = bl_fastxFindIDIdx (gene->exons[k].refchr, reference);
+    chr = bl_fastaGetSequence(reference, id);
+
+    if(gene->exons[k].strand == '+') {
+      memmove(&sequence[seqlen], &chr[start], (end-start)+1);
+    } else {
+      buffer = charDNAcomplement(space, &chr[start], (end-start)+1);
+      memmove(&sequence[seqlen], buffer, (end-start)+1);
+      FREEMEMORY(space, buffer);
+    }
+    seqlen += (end-start)+1;
+    sequence[seqlen] = 0;
+  }
+
+  return sequence;
+}
+
+
+/*------------------------ bl_simulateGeneSequencing -------------------------
+ *
+ * @brief  generate a set of simulated reads for a given gene
+ * @author Steve Hoffmann
+ *
+ * The gene sequence is obtained with bl_getGeneSequence. The number of
+ * reads is ((seqlen + readlen) / readlen) * cov. Read starts walk from the
+ * end of the sequence towards its beginning in steps of readlen/cov and
+ * are clamped at 0. Every read template has exactly readlen characters;
+ * positions behind the end of the sequence are filled with 'A'. The
+ * template is passed through bl_fastxScramble and printed as a four line
+ * fastq record with the header "@<id>:<start>:<end>;<editstring>".
+ *
+ * @param space         handed through to the allocation macros
+ * @param dev           output stream
+ * @param reference     fasta container handed to bl_getGeneSequence
+ * @param gene          gene to simulate reads for
+ * @param readlen       length of every read
+ * @param cov           coverage factor
+ * @param alphabet      handed to bl_fastxScramble
+ * @param alphabetsize  handed to bl_fastxScramble
+ * @param minqual       handed to bl_fastxScramble
+ * @param maxqual       handed to bl_fastxScramble
+ * @param acc           handed to bl_fastxScramble
+ * @param Pmis          handed to bl_fastxScramble
+ * @param Pins          handed to bl_fastxScramble
+ *
+ * NOTE(review): the generator is re-seeded with time(NULL) on every call;
+ * calls within the same second therefore start from the same seed --
+ * confirm whether this is intended.
+ */
+
+void
+bl_simulateGeneSequencing (void *space, FILE *dev, fasta_t *reference, gene_t *gene, Uint readlen,
+    Uint cov, char *alphabet, Uint alphabetsize, Uint minqual, Uint maxqual,
+    double acc, double Pmis, double Pins)
+{
+
+  char *quality, *buffer, *template, *editstring;
+  char *sequence;
+  Uint i, j, seqlen, noofreads, editstringlen, readerrcnt=0;
+  int start=0, off=0;
+  double lambda;
+  double x=0;
+
+  srand((unsigned int)time(NULL));
+  sequence = bl_getGeneSequence(space, reference, gene);
+  /* a gene without exons has no sequence: nothing to simulate */
+  if(sequence == NULL) return;
+  seqlen = strlen(sequence);
+
+  noofreads = ((Uint)(((double)seqlen+readlen)/(double)readlen))*cov;
+  lambda = ((double)1.0/((double)readlen))*((double)cov);
+
+  template = ALLOCMEMORY(space, NULL, char, readlen+1);
+
+  buffer = ALLOCMEMORY(space, NULL, char, 2*readlen+1);
+  quality = ALLOCMEMORY(space, NULL, char, 2*readlen+1);
+  editstring = ALLOCMEMORY(space, NULL, char, 40*readlen+1);
+
+
+  for(i=0; i < noofreads; i++) {
+
+    /* step towards the beginning of the sequence, clamped at 0 */
+    x += 1.0/lambda;
+    off = trunc(x);
+    start = MAX(0, ((int)seqlen-1)-off);
+
+    /* pad with 'A' wherever the read runs past the end of the sequence */
+    memset(template, 'A', sizeof(char)*readlen);
+    template[readlen] = 0;
+
+    for(j=0; j < readlen && start+j < seqlen; j++) {
+      template[j] = sequence[start+j];
+    }
+
+    memset(buffer, 0, sizeof(char)*(2*readlen)+1);
+    memset(quality, 0, sizeof(char)*(2*readlen)+1);
+    memset(editstring, 0, sizeof(char)*(readlen)*40+1);
+    editstringlen =0;
+
+    bl_fastxScramble(buffer, quality, template, readlen, acc, Pmis, Pins,
+        0, 0,
+        alphabet, alphabetsize,
+        minqual, maxqual,
+        editstring, &editstringlen, &readerrcnt);
+
+    assert(strlen(buffer) == readlen);
+    assert(strlen(quality) == readlen);
+
+    fprintf(dev, "@%s:%d:%d;%s\n", gene->id, start, start+j, editstring);
+    fprintf(dev, "%s\n+\n", buffer);
+    fprintf(dev, "%s\n", quality);
+
+  }
+
+  /* all working buffers are owned by this function */
+  FREEMEMORY(space, editstring);
+  FREEMEMORY(space, quality);
+  FREEMEMORY(space, buffer);
+  FREEMEMORY(space, template);
+  FREEMEMORY(space, sequence);
+
+  return ;
+}
+
+
+/*----------------------- bl_simulateGeneSetSequencing -----------------------
+ *
+ * @brief  generate simulated reads for every gene of a gene set
+ * @author Steve Hoffmann
+ *
+ * Calls bl_simulateGeneSequencing once per gene, in array order, with the
+ * parameters given here, and updates a progress bar before each gene.
+ *
+ * @param space      handed through to bl_simulateGeneSequencing
+ * @param dev        output stream
+ * @param reference  fasta container
+ * @param genes      gene set to simulate reads for
+ *
+ * All remaining parameters are handed through unchanged.
+ */
+
+void
+bl_simulateGeneSetSequencing (void *space, FILE *dev, fasta_t *reference, geneset_t *genes,
+    Uint readlen, Uint cov, char *alphabet, Uint alphabetsize, Uint minqual, Uint maxqual,
+    double acc, double Pmis, double Pins)
+{
+  Uint done = 0;
+
+  initProgressBarVT();
+
+  while(done < genes->noofgenes) {
+    progressBarVT("genes simulated.", genes->noofgenes, done, 25);
+
+    bl_simulateGeneSequencing(space, dev, reference, genes->genes + done,
+        readlen, cov, alphabet, alphabetsize, minqual, maxqual,
+        acc, Pmis, Pins);
+
+    done++;
+  }
+}
+
+
+
+/*---------------------- bl_fastxPrintRandomBisulfiteReads ---------------------
+ *
+ * @brief  print n simulated bisulfite reads drawn from sequence to device
+ *         reads, with conversion rates given per position in sequencerates
+ * @author Christian Otto
+ *
+ * Works like bl_fastxPrintRandomReads, except that the reference segment
+ * is passed through bl_fastxBisulfiteScramble together with sequencerates.
+ * If prefix is given, it is appended to the read index in the header line.
+ *
+ * @param reads          output stream
+ * @param sequence       reference sequence
+ * @param sequencerates  per position rates handed to
+ *                       bl_fastxBisulfiteScramble
+ * @param seqlen         length of sequence; must exceed maxlen
+ * @param n              number of reads to print
+ * @param minlen         minimum read length
+ * @param maxlen         maximum read length
+ * @param alphabet       alphabet handed to the scramble functions
+ * @param alphabetsize   size of alphabet; must exceed 1
+ * @param acc            accuracy handed to the scramble functions for the
+ *                       read and the polyA stretch
+ * @param Pmis           handed to the scramble functions
+ * @param Pins           handed to the scramble functions
+ * @param Pdel           only checked: Pmis + Pins + Pdel has to be 1
+ * @param fastq          if set, '@' headers and quality lines are printed;
+ *                       requires 33 <= minqual and maxqual <= 126
+ * @param minqual        handed to the scramble functions
+ * @param maxqual        handed to the scramble functions
+ * @param five           5' sequence handed to bl_fastxScramble
+ * @param fivelen        length of five
+ * @param three          3' sequence handed to bl_fastxScramble
+ * @param threelen       length of three
+ * @param polyAlen       nominal length of the polyA stretch
+ * @param prefix         optional tag for the read names (or NULL)
+ *
+ * fiveacc and threeacc are not declared in this function; they are taken
+ * from file scope.
+ *
+ * NOTE(review): fiveseqlen/threeseqlen are not reset per read (polyAseqlen
+ * is) -- confirm whether this is intended.
+ */
+
+void
+bl_fastxPrintRandomBisulfiteReads(
+    FILE *reads,
+    char *sequence,
+    char *sequencerates,
+    Uint seqlen,
+    Uint n,
+    Uint minlen,
+    Uint maxlen,
+    char *alphabet, Uint alphabetsize,
+    double acc,
+    double Pmis, double Pins, double Pdel,
+    unsigned char fastq,
+    Uint minqual, Uint maxqual,
+    char *five, Uint fivelen,
+    char *three, Uint threelen, Uint polyAlen, char *prefix)
+{
+
+  Uint i, k, start, len, readerrcnt, fiveseqlen=fivelen, threeseqlen=threelen,
+       polyAseqlen=0, editstringlen=0, bufsize, editsize;
+  char *quality, *buffer, *editstring, *polyAseq, startchr = '>';
+
+  if (fastq) {
+    startchr = '@';
+    assert(maxqual <= 126 && minqual >=33);
+  }
+
+  assert(seqlen > maxlen);
+  assert(minlen <= maxlen);
+  assert(alphabetsize > 1);
+  /* tolerate rounding: decimal fractions such as 0.6, 0.3 and 0.1 need not
+   * add up to exactly 1.0 in binary floating point */
+  assert(Pmis + Pins + Pdel > 1.0 - 1e-9 && Pmis + Pins + Pdel < 1.0 + 1e-9);
+
+  srand((unsigned int)time(NULL));
+
+  /* one extra byte so that the buffers printed with %s always keep a
+   * terminating 0 */
+  bufsize = 2*(maxlen+fivelen+threelen+polyAlen)+1;
+  editsize = 40*(maxlen+fivelen+threelen+polyAlen)+1;
+
+  quality = ALLOCMEMORY(space, NULL, char, bufsize);
+  buffer = ALLOCMEMORY(space, NULL, char, bufsize);
+  editstring = ALLOCMEMORY(space, NULL, char, editsize);
+  /* template for the polyA stretch; up to 2*polyAlen characters are used */
+  polyAseq = ALLOCMEMORY(space, NULL, char, 2*polyAlen);
+  memset(polyAseq, 'A', 2*polyAlen);
+
+  for(i=0; i < n; i++){
+    memset(buffer, 0, sizeof(char)*bufsize);
+    memset(quality, 0, sizeof(char)*bufsize);
+    memset(editstring, 0, sizeof(char)*editsize);
+    editstringlen = 0;
+    polyAseqlen = polyAlen;
+
+    len = minlen + RANDINT(maxlen - minlen);
+    start = RANDINT(seqlen-len);
+
+    if(RANDUNIT > fiveacc) {
+      if(RANDUNIT > .5) {
+        fiveseqlen = fivelen - RANDINT(fivelen);
+      }
+    }
+
+    if(RANDUNIT > threeacc) {
+      if(RANDUNIT > .5) {
+        threeseqlen = threelen - RANDINT(threelen);
+      }
+    }
+
+    if(RANDUNIT > acc) {
+      if(RANDUNIT > .5) {
+        polyAseqlen = polyAlen - RANDINT(polyAlen);
+      } else {
+        polyAseqlen = polyAlen + RANDINT(polyAlen);
+      }
+    }
+    /* i, start and len are unsigned */
+    if (prefix == NULL){
+      fprintf(reads, "%c%u %u (len: %u) ", startchr, i, start+1, len);
+    }
+    else {
+      fprintf(reads, "%c%u_%s %u_%s %u (len: %u) ", startchr, i, prefix, i, prefix, start+1, len);
+    }
+    readerrcnt=0;
+    k=0;
+
+    /* k is the write offset into buffer/quality after each part */
+    k = bl_fastxScramble (buffer, quality, five, fiveseqlen, fiveacc, Pmis, Pins,
+        k, 0, alphabet, alphabetsize, minqual, maxqual,
+        editstring, &editstringlen, &readerrcnt);
+
+    editstring[editstringlen++] = '|';
+
+    k = bl_fastxBisulfiteScramble (buffer, quality, sequence, sequencerates, len, acc, Pmis, Pins,
+        k, start, alphabet, alphabetsize, minqual, maxqual,
+        editstring, &editstringlen, &readerrcnt);
+
+    editstring[editstringlen++] = '|';
+
+    k = bl_fastxScramble (buffer, quality, polyAseq, polyAseqlen, acc, Pmis, Pins,
+        k, 0, alphabet, alphabetsize, minqual, maxqual,
+        editstring, &editstringlen, &readerrcnt);
+
+    editstring[editstringlen++] = '|';
+
+    k = bl_fastxScramble (buffer, quality, three, threeseqlen, threeacc, Pmis, Pins,
+        k, 0, alphabet, alphabetsize, minqual, maxqual,
+        editstring, &editstringlen, &readerrcnt);
+
+    fprintf(reads, "%s", editstring);
+    fprintf(reads, " (r: %f)\n", (double) readerrcnt/(len+1.0));
+    fprintf(reads, "%s\n", buffer);
+    if (fastq) {
+      fprintf(reads, "+\n");
+      fprintf(reads, "%s\n", quality);
+    }
+  }
+
+  FREEMEMORY(space, buffer);
+  FREEMEMORY(space, quality);
+  FREEMEMORY(space, polyAseq);
+  FREEMEMORY(space, editstring);
+
+  return ;
+}
+
+/*--------------------------- bl_fastxBisulfiteScramble ------------------------
+ *
+ * @brief scramble a bisulfite-treated fasta sequence
+ * @author Christian Otto
+ *
+ * Copies len positions of template (starting at offset voff) into buffer
+ * (starting at offset uoff) and writes one quality character per copied
+ * character into quality. For every position a rate is derived from
+ * rates[voff+j] as (value-1)/100; if RANDUNIT >= rate, a 'C' is replaced
+ * by 'T' and a 'G' by 'A' (any other character fails the assertion).
+ * Afterwards, if RANDUNIT > acc, a sequencing error is introduced:
+ * a substitution if the error draw is <= Pmis, an insertion if it is
+ * <= Pmis + Pins, and the third case ("D") otherwise. Each error is
+ * counted in *readerrcnt and, if editstring is not NULL, appended to
+ * editstring at offset *editstringlen.
+ *
+ * @return the write offset into buffer after the last written character;
+ * *editstringlen is updated to the new length of the edit string.
+ *
+ * NOTE(review): rates appears to encode percentages shifted by one so that
+ * the value 0 is never stored -- confirm against the code producing it.
+ */
+
+Uint
+bl_fastxBisulfiteScramble (char *buffer, char *quality,
+ char *template, char *rates, Uint len,
+ double acc, double Pmis, double Pins,
+ Uint uoff, Uint voff,
+ char *alphabet, Uint alphabetsize,
+ Uint minqual, Uint maxqual,
+ char *editstring, Uint *editstringlen, Uint *readerrcnt)
+{
+ /* u: write position in buffer/quality, k: write position in editstring */
+ Uint u = uoff, k = (*editstringlen);
+ int j;
+ char errchr, chr;
+ double errtype, rate;
+
+ for (j=0; j < len; j++) {
+ chr=template[voff+j];
+ rate=(double)((int)rates[voff+j]-1)/100;
+ /*
+ * RANDUNIT ==> [0,1)
+ * rate == 0 ==> RANDUNIT always >= rate
+ * rate == 1 ==> RANDUNIT always < rate
+ */
+ if (RANDUNIT >= rate){
+ assert(chr == 'C' || chr == 'G');
+ if (chr == 'C'){
+ chr = 'T';
+ }
+ /* only +RC/-RC reads */
+ if (chr == 'G'){
+ chr = 'A';
+ }
+ }
+
+ if (RANDUNIT > acc) {
+ errtype = RANDUNIT;
+ (*readerrcnt) += 1;
+
+ if (errtype <= Pmis) {
+ /* substitution: redraw until the character differs from chr */
+ errchr = chr;
+ if (editstring) k += sprintf(&editstring[k], "%d:S;", j);
+ while(errchr == chr)
+ errchr = alphabet[RANDINT(alphabetsize-1)];
+ quality[u] = minqual + RANDINT(maxqual - minqual);
+ buffer[u++] = errchr;
+
+ } else if (errtype <= Pmis + Pins) {
+ /* insertion: a random character is written first */
+ errchr = alphabet[RANDINT(alphabetsize-1)];
+ if (editstring) k += sprintf(&editstring[k], "%d:I;", j);
+ quality[u] = minqual + RANDINT(maxqual - minqual);
+ buffer[u++] = errchr;
+ /* unless this is the last position, chr follows and j is advanced
+ * once more, so two characters are written for two loop steps */
+ if(j < len-1) {
+ quality[u] = minqual + RANDINT(maxqual - minqual);
+ buffer[u++] = chr;
+ j++;
+ }
+
+ } else {
+ /* nothing is written; j is decremented so that the loop increment
+ * visits the same j again */
+ if (editstring) k += sprintf(&editstring[k], "%d:D;", j);
+ j--;
+ }
+
+ } else {
+ quality[u] = minqual + RANDINT(maxqual - minqual);
+ buffer[u++] = chr;
+ }
+
+ }
+
+ (*editstringlen) = k;
+ return u;
+}
diff --git a/segemehl/libs/randseqs.h b/segemehl/libs/randseqs.h
new file mode 100644
index 0000000..bb79f75
--- /dev/null
+++ b/segemehl/libs/randseqs.h
@@ -0,0 +1,117 @@
+#ifndef _RANDSEQS_H_
+#define _RANDSEQS_H_
+
+/*
+ *
+ *  randseqs.h
+ *  simulation of reads, split reads and gene models
+ *
+ *  @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ *  @company Bioinformatics, University of Leipzig
+ *  @date 07/15/2010 10:42:14 AM CEST
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "stringutils.h"
+#include "basic-types.h"
+#include "charsequence.h"
+
+/* build a gene model from an annotation track; strings are shared with
+ * the track, not copied */
+geneset_t *
+bl_getGeneModelFromBEDtrack (void *space, annotationtrack_t *track);
+
+/* copy gene from into gene to; a new exon array is allocated, id and
+ * refchr pointers are shared, cds information is not copied */
+void
+bl_copyGene (void *space, gene_t *to, gene_t *from);
+
+/* build an annotation track with one item per gene of set */
+annotationtrack_t *
+bl_getTrackFromGeneModel (void *space, geneset_t *set);
+
+/* simulate n random trans-splicing events of type 'S' (strand switch of
+ * one exon) or 'D' (exon of another gene appended) */
+geneset_t*
+bl_simulateTransSplicing (void *space, geneset_t *set, char type, Uint n);
+
+/* print one tab separated line per junction between consecutive exons */
+void
+bl_printSplicingEdges (void *space, FILE *dev, geneset_t *set);
+
+/* concatenate the exon sequences of gene; the caller owns the result,
+ * which is NULL for a gene without exons */
+char*
+bl_getGeneSequence(void *space, fasta_t *reference, gene_t *gene);
+
+/* print simulated fastq reads of length readlen for one gene to dev */
+void
+bl_simulateGeneSequencing (void *space, FILE *dev, fasta_t *reference, gene_t *gene, Uint readlen,
+    Uint cov, char *alphabet, Uint alphabetsize, Uint minqual, Uint maxqual,
+    double acc, double Pmis, double Pins);
+
+/* call bl_simulateGeneSequencing for every gene of genes */
+void
+bl_simulateGeneSetSequencing (void *space, FILE *dev, fasta_t *reference, geneset_t *genes,
+    Uint readlen, Uint cov, char *alphabet, Uint alphabetsize, Uint minqual, Uint maxqual,
+    double acc, double Pmis, double Pins);
+
+Uint
+bl_fastxScramble (char *buffer, char *quality,
+    char *template, Uint len,
+    double acc, double Pmis, double Pins,
+    Uint uoff, Uint voff,
+    char *alphabet, Uint alphabetsize,
+    Uint minqual, Uint maxqual,
+    char *editstring, Uint *editstringlen, Uint *readerrcnt);
+
+void
+bl_fastxSimulateSpliceSites (void *space, char *sequence,
+    Uint seqlen, Uint n, Uint maxchildren,
+    char *alphabet, Uint alphabetsize, double acc,
+    double Pmis, double Pins, double Pdel,
+    Uint minqual, Uint maxqual,
+    double Pcis, Uint mincisdist, Uint maxcisdist,
+    double Pstrandswitch, Uint readlen);
+
+/* print n simulated reads (fasta, or fastq if fastq is set) with optional
+ * 5' sequence, polyA stretch and 3' sequence */
+void
+bl_fastxPrintRandomReads(FILE *ref, char *sequence, Uint reflen, Uint n,
+    Uint minlen, Uint maxlen, char *alphabet, Uint alphabetsize,
+    double acc, double Pmis, double Pins, double Pdel, unsigned char fastq,
+    Uint minqual, Uint maxqual,
+    char *five, Uint fivelen, char *three, Uint threelen, Uint polyAlen);
+
+
+/* print n simulated split reads, each built from two segments of sequence */
+void
+bl_fastxPrintRandomSplitReads (FILE *dev, char *sequence, Uint seqlen, Uint n,
+    Uint minspltlen, Uint maxspltlen,
+    char *alphabet, Uint alphabetsize,
+    double acc,
+    double Pmis, double Pins, double Pdel,
+    unsigned char fastq,
+    Uint minqual, Uint maxqual,
+    char *five, Uint fivelen,
+    char *three, Uint threelen, Uint polyAlen);
+
+
+void
+bl_fastxPrintRandomMatePairs(FILE *dev, FILE *matedev,
+    char *sequence, Uint seqlen, Uint n,
+    Uint minlen, Uint maxlen, Uint mindist, Uint maxdist,
+    char *alphabet, Uint alphabetsize,
+    double acc,
+    double Pmis, double Pins, double Pdel,
+    unsigned char fastq,
+    Uint minqual, Uint maxqual,
+    char *five, Uint fivelen,
+    char *three, Uint threelen, Uint polyAlen);
+
+/* like bl_fastxPrintRandomReads, with per position conversion rates in
+ * sequencerates and an optional read name prefix */
+void
+bl_fastxPrintRandomBisulfiteReads(FILE *ref, char *sequence, char *sequencerates, Uint reflen,
+    Uint n, Uint minlen, Uint maxlen, char *alphabet, Uint alphabetsize,
+    double acc, double Pmis, double Pins, double Pdel, unsigned char fastq,
+    Uint minqual, Uint maxqual,
+    char *five, Uint fivelen, char *three, Uint threelen, Uint polyAlen, char *prefix);
+
+/* copy len positions of template into buffer, applying C->T / G->A
+ * conversions according to rates and sequencing errors according to acc;
+ * returns the write offset into buffer after the last written character */
+Uint
+bl_fastxBisulfiteScramble (char *buffer, char *quality,
+    char *template, char *rates, Uint len,
+    double acc, double Pmis, double Pins,
+    Uint uoff, Uint voff,
+    char *alphabet, Uint alphabetsize,
+    Uint minqual, Uint maxqual,
+    char *editstring, Uint *editstringlen, Uint *readerrcnt);
+
+#endif
diff --git a/segemehl/libs/realign.c b/segemehl/libs/realign.c
new file mode 100644
index 0000000..2310b14
--- /dev/null
+++ b/segemehl/libs/realign.c
@@ -0,0 +1,3448 @@
+/*
+ * realign.c
+ * realigning
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 26.06.2012 12:55:18 CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <ctype.h>
+#include <pthread.h>
+#include "alignment.h"
+#include "debug.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "manout.h"
+#include "sort.h"
+#include "matfile.h"
+#include "bitVector.h"
+#include "info.h"
+#include "zran.h"
+#include "sw.h"
+#include "matchfiles.h"
+#include "evalmatchfiles.h"
+#include "manout.h"
+#include "matchfilesfields.h"
+#include "matepairs.h"
+#include "matchfiles.h"
+#include "manopt.h"
+#include "iupac.h"
+#include "info.h"
+#include "fqueue.h"
+#include "list.h"
+#include "realign.h"
+#include "multicharseq.h"
+#include "charsequence.h"
+#include "container.h"
+#include "sw.h"
+#include "segemehl.h"
+
+pthread_mutex_t inout; /* file-scope mutex with external linkage; not referenced by the functions visible in this part of the file -- presumably serialises shared input/output, TODO confirm */
+pthread_mutex_t mutkuh; /* second file-scope mutex; its purpose is not evident from the visible code (the original author marked it with "?") */
+
+/*------------------------------ destructBuffer ------------------------------
+ *
+ * @brief  release the string buffer (str) of a realignment list element;
+ *         the element itself and its other members are left untouched
+ * @param  elem pointer to a matchlistelem_t
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+destructBuffer (void *elem)
+{
+  matchlistelem_t *entry = (matchlistelem_t*) elem;
+
+  FREEMEMORY(NULL, entry->str);
+}
+
+/*
+ * Grow an array of nmemb elements (size bytes each) by one slot and copy
+ * *elem into slot pos. A pos >= nmemb appends the element; otherwise the
+ * elements pos..nmemb-1 are shifted up by one slot first.
+ * Returns the array as handed back by realloc (the result of realloc is
+ * not checked).
+ */
+void*
+bl_insertArray(void *arr, size_t nmemb, size_t size, void *elem, Uint pos) {
+  char *base;
+  size_t slot = (pos >= nmemb) ? nmemb : pos;
+
+  arr = realloc(arr, size*(nmemb+1));
+  base = (char*) arr;
+
+  if(slot < nmemb) {
+    /* open a gap at slot: move the tail one element up */
+    memmove(base + size*(slot+1), base + size*slot, size*(nmemb-slot));
+  }
+  memmove(base + size*slot, elem, size);
+
+  return arr;
+}
+
+/*
+ * Remove the element in slot pos from an array of nmemb elements (size
+ * bytes each) and shrink the allocation by one slot.
+ *
+ * A pos >= nmemb-1 drops the last element. For any other pos the tail
+ * pos+1..nmemb-1 (nmemb-pos-1 elements) is moved down by one slot.
+ *
+ * arr    heap array, may be NULL
+ * nmemb  number of elements currently stored in arr
+ * size   size of one element in bytes
+ * elem   unused, kept for interface compatibility
+ * pos    index of the element to remove
+ *
+ * Returns the shrunk array, or NULL once the array has become empty (its
+ * memory is released in that case). An empty or NULL input is returned
+ * unchanged. If shrinking fails, the original block, which is still valid
+ * and already compacted, is returned.
+ */
+void*
+bl_removeArray(void *arr, size_t nmemb, size_t size, void *elem, Uint pos) {
+  char *base = (char*) arr;
+  void *shrunk;
+  size_t remaining;
+
+  (void) elem;
+
+  /* nothing to remove; also keeps nmemb-1 below from wrapping around */
+  if(arr == NULL || nmemb == 0) {
+    return arr;
+  }
+
+  if(pos < nmemb-1) {
+    /* only nmemb-pos-1 elements live behind pos; moving more would read
+     * beyond the end of the allocation */
+    memmove(base + size*pos, base + size*(pos+1), size*(nmemb-pos-1));
+  }
+
+  remaining = size*(nmemb-1);
+  if(remaining == 0) {
+    /* avoid realloc(ptr, 0), whose behaviour is not portable */
+    free(arr);
+    return NULL;
+  }
+
+  shrunk = realloc(arr, remaining);
+  return (shrunk != NULL) ? shrunk : arr;
+}
+/*------------------------------ cmp functions -------------------------------
+ *
+ * @brief compare functions
+ * @author Steve Hoffmann
+ *
+ */
+
+/*
+ * qsort comparator for matchsplitsitecluster_t: orders by the left border a
+ * and, for equal a, by the right border b (both ascending).
+ */
+int cmp_matchsplitsitecluster_qsort(const void *a, const void *b) {
+  const matchsplitsitecluster_t *lhs = (const matchsplitsitecluster_t*) a;
+  const matchsplitsitecluster_t *rhs = (const matchsplitsitecluster_t*) b;
+
+  if(lhs->a != rhs->a) {
+    return (lhs->a < rhs->a) ? -1 : 1;
+  }
+
+  if(lhs->b != rhs->b) {
+    return (lhs->b < rhs->b) ? -1 : 1;
+  }
+
+  return 0;
+}
+
+
+/*
+ * Index comparator for quickSort on an array of matchsplitsite_t.
+ * Returns 1 if element i has the smaller cnt, 2 if it has the larger cnt
+ * and 0 if both counts are equal. nfo is not used.
+ */
+Uint cmp_matchsplitsites_quickSort(Uint i, Uint j, void *arr, void* nfo) {
+  matchsplitsite_t *site = (matchsplitsite_t*) arr;
+
+  if(site[i].cnt == site[j].cnt) {
+    return 0;
+  }
+
+  return (site[i].cnt < site[j].cnt) ? 1 : 2;
+}
+
+ /*
+  * Index comparator for quickSort on an array of matchlistelem_t.
+  * Elements are compared by distchr, then distpos, then by their position
+  * relative to key->pos (start if key->trans is set, end otherwise) and
+  * finally by trans. Returns 1 if element i is the larger one in the first
+  * three criteria, 2 if it is the smaller one; for trans the codes are
+  * swapped (2 for larger, 1 for smaller). Returns 0 if all criteria tie.
+  * nfo must point to a matchsplitsearchkey_t.
+  */
+ Uint cmp_distsites_quickSort(Uint i, Uint j, void *arr, void* nfo) {
+  matchsplitsearchkey_t *key = (matchsplitsearchkey_t*) nfo;
+  matchlistelem_t *x = &((matchlistelem_t*) arr)[i];
+  matchlistelem_t *y = &((matchlistelem_t*) arr)[j];
+  Uint relx, rely;
+
+  if(x->distchr != y->distchr) {
+    return (x->distchr > y->distchr) ? 1 : 2;
+  }
+
+  if(x->distpos != y->distpos) {
+    return (x->distpos > y->distpos) ? 1 : 2;
+  }
+
+  /*key->trans = left*/
+  if(key->trans) {
+    relx = x->start - key->pos;
+    rely = y->start - key->pos;
+  } else {
+    relx = x->end - key->pos;
+    rely = y->end - key->pos;
+  }
+
+  if(relx != rely) {
+    return (relx > rely) ? 1 : 2;
+  }
+
+  if(x->trans != y->trans) {
+    return (x->trans > y->trans) ? 2 : 1;
+  }
+
+  return 0;
+}
+
+
+
+/*
+ * Search comparator: tests the position *key against the interval [a,b] of
+ * cluster number a in the array data. Returns 1 if the interval starts
+ * right of the key, 2 if it ends left of the key and 0 if the key lies
+ * inside the interval. nfo is not used.
+ */
+Uint cmp_matchsplitsitecluster_bin(Uint a, void *data, void *key, void *nfo) {
+  matchsplitsitecluster_t *cluster = &((matchsplitsitecluster_t*) data)[a];
+  Uint wanted = *((Uint*) key);
+
+  if(cluster->a > wanted) {
+    return 1;
+  }
+
+  if(cluster->b < wanted) {
+    return 2;
+  }
+
+  return 0;
+}
+
+
+/*---------------------- bl_matchJoinSplitSiteClusters -----------------------
+ *
+ * @brief  append the srcelems clusters of src to the dstelems clusters of
+ *         dst; dst is resized with ALLOCMEMORY and the clusters are copied
+ *         by value, src itself is not released
+ * @return the resized dst array holding dstelems+srcelems clusters
+ * @author Steve Hoffmann
+ *
+ */
+
+matchsplitsitecluster_t *
+bl_matchJoinSplitSiteClusters (void *space, matchsplitsitecluster_t *dst,
+    matchsplitsitecluster_t *src, Uint dstelems, Uint srcelems)
+{
+  matchsplitsitecluster_t *joined;
+
+  joined = ALLOCMEMORY(space, dst, matchsplitsitecluster_t, dstelems+srcelems);
+  memmove(joined + dstelems, src, srcelems * sizeof(matchsplitsitecluster_t));
+
+  return joined;
+}
+
+/*-------------------- bl_matchfileDestructMatchListElem ---------------------
+ *
+ * @brief  release the bookkeeper and the string buffer of a match list
+ *         element; the element itself is not freed
+ * @param  elem pointer to a matchlistelem_t
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileDestructMatchListElem (void *elem)
+{
+  matchlistelem_t *entry = (matchlistelem_t*) elem;
+
+  FREEMEMORY(NULL, entry->bookkeeper);
+  FREEMEMORY(NULL, entry->str);
+}
+/*------------------------- bl_matchfileEnlistMatch --------------------------
+ *
+ * @brief  enlist match alignment to unsorted list: the arguments are packed
+ *         into a matchlistelem_t that is inserted after the current last
+ *         node of l. str and bookkeeper are stored as pointers, not copied.
+ * @return the list l
+ * @author Steve Hoffmann
+ *
+ */
+
+List *
+bl_matchfileEnlistMatch (void *space, List *l, Uint start, Uint end,
+    Uint distchr, Uint distpos, Uint adjointpos, char trans, char *str,
+    unsigned char* bookkeeper)
+{
+  matchlistelem_t entry;
+
+  /* interval covered by the alignment */
+  entry.start = start;
+  entry.end = end;
+
+  /* distant split partner */
+  entry.distchr = distchr;
+  entry.distpos = distpos;
+  entry.adjoint = adjointpos;
+  entry.trans = trans;
+
+  /* buffers handed over by the caller */
+  entry.str = str;
+  entry.bookkeeper = bookkeeper;
+
+  bl_listInsert(l, l->last, &entry);
+
+  return l;
+}
+
+/*------------------------ bl_matchfileUnlistMatches -------------------------
+ *
+ * @brief  unlist all matches from the unsorted list l that are mapped to
+ *         [a,b] and return them as an array
+ *
+ * The position tested against [a,b] is the start of a match if left is
+ * set and its end otherwise. Every hit is unlinked from l and copied by
+ * value into the result array, so the str and bookkeeper pointers of the
+ * element travel with the copy. The node returned by bl_listUnlink is
+ * released here.
+ *
+ * @param  space handed through to ALLOCMEMORY/FREEMEMORY
+ * @param  l     list of matchlistelem_t
+ * @param  a     left border of the interval (inclusive)
+ * @param  b     right border of the interval (inclusive)
+ * @param  k     out: number of elements in the returned array
+ * @param  left  non-zero: compare start positions, zero: end positions
+ * @return array of *k elements (NULL if nothing matched)
+ * @author Steve Hoffmann
+ *
+ */
+
+ matchlistelem_t*
+bl_matchfileUnlistMatches (void *space, List *l, Uint a, Uint b, Uint *k,
+    char left)
+{
+  matchlistelem_t *elem, *arr=NULL;
+  Uint n=0;
+  Uint pos=0;
+  int cur, next;
+
+  for(cur = l->first; cur != -1; cur = next) {
+    /* remember the successor before the node is possibly unlinked */
+    next = l->nodes[cur].next;
+
+    elem = (matchlistelem_t*) bl_listGetElem (l, cur);
+    if(elem == NULL) {
+      /* test for NULL before the element is read, not afterwards */
+      continue;
+    }
+
+    pos = (left) ? elem->start : elem->end;
+    if(pos < a || pos > b) {
+      continue;
+    }
+
+    elem = (matchlistelem_t*) bl_listUnlink(l, cur, NULL);
+    arr = ALLOCMEMORY(space, arr, matchlistelem_t, n+1);
+    memmove(&arr[n], elem, sizeof(matchlistelem_t));
+    FREEMEMORY(space, elem);
+    n++;
+  }
+
+  *k = n;
+  return arr;
+}
+
+
+/*------------------------ bl_matchfileClusterSplits -------------------------
+ *
+ * @brief cluster the split sites w/ greedy lr extension within r-l+1=interval
+ *
+ * The n sites are visited in the order of the index returned by quickSort
+ * with cmp_matchsplitsites_quickSort (a comparison on the cnt field). Each
+ * pass of the outer loop opens one new cluster [a,b] and offers it every
+ * site that is not marked yet:
+ *   - the site becomes the left border a if a is still 0, or if it lies
+ *     left of a and pos+interval >= b;
+ *   - the site becomes the right border b if b is still 0, or if it lies
+ *     right of b and pos <= a+interval;
+ *   - a site with a <= pos <= b is absorbed: its cnt is added to the cluster
+ *     cnt and its pos is appended cnt times to a scratch array.
+ * After the pass the scratch array is sorted and the cluster median is set
+ * to the element with 1-based rank sum/2 + sum%2, i.e. the lower median of
+ * the absorbed positions weighted by cnt.
+ *
+ * NOTE(review): a == 0 and b == 0 double as "border not set yet", so a site
+ * at position 0 cannot be told apart from an unset border -- confirm that
+ * positions are never 0 here.
+ *
+ * @param space handed through to ALLOCMEMORY/FREEMEMORY and quickSort
+ * @param sites split sites (pos, cnt, and trans if CHECKLINKS is defined)
+ * @param n number of elements in sites
+ * @param interval largest allowed distance between the borders of a cluster
+ * @param noofclusters out: number of clusters in the returned array
+ * @return array of *noofclusters clusters (NULL if n is 0); only a, b, cnt,
+ *         median (and trans, lnkcnt with CHECKLINKS) are set here
+ * @author Steve Hoffmann
+ *
+ */
+
+ matchsplitsitecluster_t*
+bl_matchfileClusterSplits (void *space, matchsplitsite_t *sites, Uint n,
+ Uint interval, Uint *noofclusters)
+{
+ Uint *idx;
+ char *mrk;
+ matchsplitsitecluster_t *C=NULL;
+ Uint k=0, i, j, unmarked = n;
+ Uint med, mid, sum=0, *cum=NULL;
+
+ mrk = ALLOCMEMORY(space, NULL, char, n);
+ memset(mrk, 0, n); /* mrk[x] != 0: site x already belongs to a cluster */
+ idx = quickSort(space, sites, n, cmp_matchsplitsites_quickSort, NULL);
+
+ while(unmarked > 0) { /* one new cluster per pass */
+ C = ALLOCMEMORY(space, C, matchsplitsitecluster_t, k+1);
+ C[k].a =0;
+ C[k].b =0;
+ C[k].cnt = 0;
+
+#ifdef CHECKLINKS
+ C[k].trans = 0;
+ C[k].lnkcnt = 0;
+#endif
+
+ sum =0;
+ for(i=0; i < n; i++) {
+ if(!mrk[idx[i]]) {
+ //left expand: border unset, or site left of a and still within interval of b
+ if(C[k].a == 0 ||
+ (C[k].a > sites[idx[i]].pos &&
+ sites[idx[i]].pos+interval >= C[k].b)) {
+ C[k].a = sites[idx[i]].pos;
+ mrk[idx[i]]=1;
+ }
+ //right expand: border unset, or site right of b and still within interval of a
+ if(C[k].b == 0 ||
+ (C[k].b < sites[idx[i]].pos &&
+ sites[idx[i]].pos <= C[k].a+interval)) {
+ C[k].b = sites[idx[i]].pos;
+ mrk[idx[i]]=1;
+ }
+ //inclusion: site lies inside [a,b] (always true right after an expansion)
+ if(C[k].a <= sites[idx[i]].pos && C[k].b >= sites[idx[i]].pos) {
+
+ for(j=0; j < sites[idx[i]].cnt; j++) { /* one copy of pos per supporting split */
+ cum = bl_insertArray(cum, sum, sizeof(Uint),
+ &sites[idx[i]].pos, sum);
+ sum++;
+ }
+
+ C[k].cnt += sites[idx[i]].cnt;
+#ifdef CHECKLINKS
+ C[k].trans += sites[idx[i]].trans;
+#endif
+ mrk[idx[i]]=1;
+ }
+
+ if(mrk[idx[i]]) { /* the site was unmarked on entry, so this counts it exactly once */
+ unmarked--;
+ }
+ }
+ }
+
+ med = C[k].a; /* fallback if no position was collected */
+ if(sum) {
+ qsort(cum, sum, sizeof(Uint), cmp_Uint_qsort);
+ mid = sum/2 + sum%2; /* 1-based rank of the lower median */
+ med = cum[mid-1];
+ FREEMEMORY(space, cum);
+ }
+
+ assert(med >= C[k].a && C[k].b >= med);
+ C[k].median = med;
+ /* median of the absorbed site positions, weighted by cnt */
+ k++;
+ }
+
+ FREEMEMORY(space, idx);
+ FREEMEMORY(space, mrk);
+ *noofclusters = k;
+
+ return C;
+}
+
+
+/*--------------------------- bl_matchsplitdistpos ----------------------------
+ *
+ * @brief get the condensed dist pos list
+ *
+ * The n elements of arr are visited in the order of the index returned by
+ * quickSort with cmp_distsites_quickSort and the key {trans=left, pos=C->a}.
+ * Runs of elements that agree in distchr, distpos and trans (and, with
+ * PARANOID and CHECKLINKS, in their position relative to C->a) are condensed
+ * into one entry of C->distchr / C->distpos / C->disttrans with the run
+ * length in C->distcnt; C->distsites receives the number of entries.
+ * For elements with adjoint != -1 the offset adjoint - C->median is kept in
+ * the ascending array C->adjoint together with its multiplicity in
+ * C->adjointcnt. All remaining cluster members touched at the end are reset
+ * to NULL or 0.
+ *
+ * NOTE(review): arr[idx[0]] is read unconditionally, so n >= 1 is required.
+ * NOTE(review): the adjoint bookkeeping sits inside the loop that starts at
+ * i=1, so the adjoint of the first element in sort order is never recorded
+ * -- confirm whether this is intended.
+ * NOTE(review): the offsets are stored as int16_t; larger distances between
+ * adjoint and median do not fit -- confirm the expected range.
+ *
+ * @param space handed through to ALLOCMEMORY/FREEMEMORY and quickSort
+ * @param C the cluster to fill (a and median must be set already)
+ * @param arr matches that belong to the cluster
+ * @param n number of elements in arr
+ * @param left non-zero: use start positions, zero: use end positions
+ * @author Steve Hoffmann
+ *
+ */
+
+ void
+bl_matchsplitdistpos (void *space, matchsplitsitecluster_t *C,
+ matchlistelem_t *arr, Uint n, char left)
+{
+
+ Uint i, j;
+
+#if defined PARANOID && defined CHECKLINKS
+ Uint pos;
+#endif
+
+ matchlistelem_t *elem;
+ Uint *idx;
+ Uint noofdists=0;
+ int16_t *adjoint = NULL;
+ Uint inelem =0;
+ int16_t inelem1 =0;
+ Uint *adjointcnt = NULL;
+ int16_t relpos =0;
+ Uint noofadjoints = 0;
+
+ matchsplitsearchkey_t key;
+
+ key.trans = left;
+ key.pos = C->a;
+
+ /*generate sort idx for the unsorted list arr*/
+ idx = quickSort(space, arr, n, cmp_distsites_quickSort, &key);
+
+ C->distchr = ALLOCMEMORY(space, NULL, Uint, noofdists+1);
+ C->distpos = ALLOCMEMORY(space, NULL, Uint, noofdists+1);
+ C->disttrans = ALLOCMEMORY(space, NULL, char, noofdists+1);
+ C->distcnt = ALLOCMEMORY(space, NULL, Uint, noofdists+1);
+
+#if defined PARANOID && defined CHECKLINKS
+ C->rpos = ALLOCMEMORY(space, NULL, char, noofdists+1);
+ pos = (left) ? arr[idx[0]].start - C->a : arr[idx[0]].end - C->a;
+ C->rpos[0] = pos;
+#endif
+
+ C->distchr[0] = arr[idx[0]].distchr; /* the first element in sort order seeds entry 0 */
+ C->distpos[0] = arr[idx[0]].distpos;
+ C->disttrans[0] = arr[idx[0]].trans;
+ C->distcnt[0] = 1;
+ noofdists++;
+
+ /*iter arr using the sort index idx*/
+ for(i=1; i < n; i++) {
+ elem = &arr[idx[i]];
+
+#if defined PARANOID && defined CHECKLINKS
+ pos = (left) ? arr[idx[i]].start - C->a : arr[idx[i]].end - C->a;
+#endif
+ if( elem->distchr != C->distchr[noofdists-1] ||
+ elem->distpos != C->distpos[noofdists-1] ||
+#if defined PARANOID && defined CHECKLINKS
+ pos != C->rpos[noofdists-1] ||
+#endif
+ elem->trans != C->disttrans[noofdists-1]
+ ) { /* differs from the previous entry: open a new one */
+
+ C->distchr = ALLOCMEMORY(space, C->distchr, Uint, noofdists+1);
+ C->distpos = ALLOCMEMORY(space, C->distpos, Uint, noofdists+1);
+ C->disttrans = ALLOCMEMORY(space, C->disttrans, char, noofdists+1);
+ C->distcnt = ALLOCMEMORY(space, C->distcnt, Uint, noofdists+1);
+#if defined PARANOID && defined CHECKLINKS
+ C->rpos = ALLOCMEMORY(space, C->rpos, char, noofdists+1);
+ C->rpos[noofdists] = pos;
+#endif
+ C->distchr[noofdists] = elem->distchr;
+ C->distpos[noofdists] = elem->distpos;
+ C->disttrans[noofdists] = elem->trans;
+ C->distcnt[noofdists] = 1;
+ noofdists++;
+ } else { /* same distant site as the previous entry: just count it */
+ C->distcnt[noofdists-1] += 1;
+ }
+
+ /*keep a sorted list for adjoints and their counts*/
+ if(elem->adjoint != -1) {
+ relpos = elem->adjoint - C->median;
+ for(j=0; j < noofadjoints; j++) { /* linear search for relpos or its insert position */
+ if(adjoint[j] == relpos) {
+ adjointcnt[j]++;
+ break;
+ }
+ if(adjoint[j] > relpos) {
+ break;
+ }
+ }
+
+ if(noofadjoints==j || adjoint[j] > relpos) { /* relpos not stored yet: insert at j, both arrays in step */
+ inelem1 = relpos;
+ adjoint = bl_insertArray(adjoint, noofadjoints,
+ sizeof(int16_t), &inelem1, j);
+ inelem = 1;
+ adjointcnt = bl_insertArray(adjointcnt, noofadjoints, sizeof(Uint),
+ &inelem, j);
+ noofadjoints++;
+ }
+ }
+ }
+
+
+ C->distsites = noofdists;
+ C->adjoint = adjoint;
+ C->adjointcnt = adjointcnt;
+ C->noofadjoints = noofadjoints;
+ C->noofadjclust = 0;
+ C->adjclust = NULL;
+ C->adjclustweights = NULL;
+ C->distclust = NULL;
+ C->distclustchr = NULL;
+ C->distclustcnt = NULL;
+ C->distclusttype = NULL;
+ C->realigned = NULL;
+ C->emat = NULL;
+ C->distclustrealigned = NULL;
+ C->noofdistclust = 0;
+
+#if defined PARANOID && defined CHECKLINKS
+ C->linked = calloc(noofdists, sizeof(Uint));
+#endif
+
+ FREEMEMORY(space, idx);
+
+ return ;
+}
+
+/*--------------------------- bl_matchfileScanList ---------------------------
+ *
+ * @brief  scan the list to accumulate splits: cluster the n split sites,
+ *         pull the matches of every cluster interval [a,b] out of the
+ *         unsorted list l, condense them into the cluster and return the
+ *         clusters sorted with cmp_matchsplitsitecluster_qsort
+ * @param  left         non-zero: matches are selected by start, else by end
+ * @param  curstart     not used
+ * @param  noofclusters out: number of clusters returned
+ * @return array of *noofclusters clusters
+ * @author Steve Hoffmann
+ *
+ */
+
+ matchsplitsitecluster_t *
+bl_matchfileScanList (void *space, List *l, matchsplitsite_t *sites,
+    Uint n, Uint interval, Uint curstart, char left, Uint *noofclusters)
+{
+  matchsplitsitecluster_t *clusters;
+  matchlistelem_t *members = NULL;
+  Uint nclusters, nmembers, c;
+#ifdef CHECKLINKS
+  Uint d, translinks;
+#endif
+
+  /*borders and medians of the clusters come from the sites array*/
+  clusters = bl_matchfileClusterSplits (space, sites, n, interval, &nclusters);
+
+  c = 0;
+  while(c < nclusters) {
+    /*fetch the matches of this cluster from the unsorted list ...*/
+    members = bl_matchfileUnlistMatches (space, l, clusters[c].a,
+        clusters[c].b, &nmembers, left);
+    /*... and condense their distant sites into the cluster*/
+    bl_matchsplitdistpos (space, &clusters[c], members, nmembers, left) ;
+
+    if(nmembers >0) FREEMEMORY(space, members);
+    c++;
+  }
+
+#ifdef CHECKLINKS
+  for(c=0; c < nclusters; c++) {
+    translinks = 0;
+    for(d=0; d < clusters[c].distsites; d++) {
+      if(clusters[c].disttrans[d])
+        translinks += clusters[c].distcnt[d];
+    }
+    assert(translinks == clusters[c].trans);
+  }
+#endif
+
+  qsort(clusters, nclusters, sizeof(matchsplitsitecluster_t),
+      cmp_matchsplitsitecluster_qsort);
+
+  *noofclusters = nclusters;
+  return clusters;
+}
+
+/*------------------------ bl_matchfileUpdateSplitSites------------------------
+ *
+ * @brief  register the split position e in the array sites, which is kept
+ *         in ascending order of pos: an existing entry gets its cnt
+ *         incremented and trans added, otherwise a new entry (cnt 1) is
+ *         inserted at its sorted position
+ * @param  noofsites in/out: number of entries in sites
+ * @param  interval  not used
+ * @return the sites array (possibly reallocated)
+ * @author Steve Hoffmann
+ *
+ */
+
+ matchsplitsite_t*
+bl_matchfileUpdateSplitSites (void *space, Uint e,
+    matchsplitsite_t *sites, Uint *noofsites, Uint interval, char trans)
+{
+  Uint total = *noofsites;
+  Uint slot = 0;
+  matchsplitsite_t fresh;
+
+  /* find the first entry that is not left of e */
+  while(slot < total && sites[slot].pos < e) {
+    slot++;
+  }
+
+  if(slot < total && sites[slot].pos == e) {
+    sites[slot].cnt++;
+    sites[slot].trans+=trans;
+  } else {
+    fresh.pos = e;
+    fresh.cnt = 1;
+    fresh.trans = trans;
+    sites = bl_insertArray(sites, total, sizeof(matchsplitsite_t), &fresh, slot);
+    total++;
+  }
+
+  *noofsites = total;
+  return sites;
+}
+
+
+/*------------------------ bl_matchfileSplitAlignment ------------------------
+ *
+ * @brief  split an alignment string into the part up to and including the
+ *         reference offset splitpos and the part behind it
+ *
+ * aln holds one edit operation per character. Leading 'S'/'H' are skipped
+ * first ('S' consumes a read character, 'H' does not). 'M', 'X', '=' and
+ * 'D' go to the left alignment while the reference offset p is <= splitpos,
+ * otherwise to the right one; for 'I' the limit is splitpos+left. 'N' and
+ * 'P' only advance the reference offset, and an 'S' behind the split is
+ * counted as trailing soft clip of the right part. Replacements are scored
+ * with scores[0] (IUPAC match) or scores[1] (mismatch), indels with
+ * scores[1].
+ *
+ * @param  read     read sequence (NUL terminated)
+ * @param  ref      reference sequence the alignment starts at
+ * @param  aln      alignment string
+ * @param  allen    length of aln
+ * @param  splitpos last reference offset of the left part
+ * @param  scores   scores[0]: match, scores[1]: mismatch/indel
+ * @param  leftaln  out: alignment of the left part, allocated here
+ * @param  rightaln out: alignment of the right part, allocated here
+ * @param  left     added to splitpos when insertions are assigned
+ * @return number of read characters consumed, leading soft clips included
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+bl_matchfileSplitAlignment (char *read, char *ref, char *aln,
+    Uint allen, Uint splitpos, int *scores,
+    Alignment **leftaln, Alignment **rightaln, char left)
+{
+  int xscr =0, yscr=0;
+  Uint p=0, q=0, k=0, xqlen=0, xrlen=0, yqlen=0, yrlen =0, yendS=0;
+  Alignment *xal, *yal;
+  Uint x0;
+
+  xal = ALLOCMEMORY(space, NULL, Alignment, 1);
+  initAlignment(xal, read, strlen(read), 0, ref, strlen(read), 0);
+  yal = ALLOCMEMORY(space, NULL, Alignment, 1);
+  initAlignment(yal, read, strlen(read), 0, ref, strlen(read), 0);
+
+  /* lookahead over the leading clips; never run beyond the alignment */
+  while(k < allen && (aln[k] == 'S' || aln[k] =='H')) {
+    if(aln[k] != 'H') q++;
+    k++;
+  }
+
+  x0 = q;
+
+  for(; k < allen; k++) {
+    switch(aln[k]) {
+      case 'M':
+      case 'X':
+      case '=':
+        if(p <= splitpos) {
+          xscr += scores[!matchIUPAC(ref[p],read[q])];
+          insertEop(xal, Replacement);
+          xrlen++; xqlen++;
+        } else {
+          yscr += scores[!matchIUPAC(ref[p],read[q])];
+          insertEop(yal, Replacement);
+          yrlen++; yqlen++;
+        }
+        p++; q++;
+        break;
+      case 'D':
+        if(p <= splitpos) {
+          xscr += scores[1];
+          insertEop(xal, Deletion);
+          xrlen++;
+        } else {
+          yscr += scores[1];
+          insertEop(yal, Deletion);
+          yrlen++;
+        }
+        p++;
+        break;
+      case 'I':
+        /* an insertion right at the split belongs to the left part if
+         * left is set */
+        if(p <= splitpos+left) {
+          xscr += scores[1];
+          insertEop(xal, Insertion);
+          xqlen++;
+        } else {
+          yscr += scores[1];
+          insertEop(yal, Insertion);
+          yqlen++;
+        }
+        q++;
+        break;
+      case 'S':
+        /* only soft clips behind the split shorten the right part */
+        if(p > splitpos) {
+          yendS++;
+        }
+        break;
+      case 'N':
+      case 'P':
+        p++;
+        break;
+      default:
+        break;
+    }
+  }
+
+  xal->u = read;
+  xal->uoff = x0;
+  xal->ulen = xqlen;
+
+  xal->v = ref;
+  xal->voff= 0;
+  xal->vlen= xrlen;
+
+  /* the read is the u sequence of the right part as well */
+  yal->u = read;
+  yal->uoff = strlen(read)-(yqlen+yendS);
+  yal->ulen = yqlen;
+
+  yal->v = ref;
+  yal->voff = splitpos+1;
+  yal->vlen= yrlen;
+
+  *leftaln = xal;
+  *rightaln = yal;
+
+  assert(getAlignScore(xal, scores, -1) == xscr);
+  assert(getAlignScore(yal, scores, -1) == yscr);
+
+  return q;
+}
+
+
+/*------------------------ bl_matchfileRealignWriteSAM -----------------------
+ *
+ * @brief write the realignment in SAM
+ *
+ * Builds one tab separated record for the alignment al of a read part and
+ * returns it as a freshly allocated string without trailing newline.
+ * The flag is orig->flag with bit 8 (0x100) set and bit 4 (0x10) set iff
+ * strand is '-'. MAPQ is written as 255, RNEXT as "*", PNEXT and TLEN as 0.
+ * Sequence and quality are the allen characters starting at al->uoff; the
+ * quality string is reversed first if strand differs from orig->strand.
+ * The optional fields are appended in this order:
+ *   NM, MD, NH, XI, (XL only if orig->noofsplits), XR (always 2),
+ *   (XO only if orig->noofsplits), XX, XY, XQ,
+ *   uno == 0: XP/XU/XS from orig->donor* if orig->donorchr is set, then
+ *             XC/XV/XT from rnext, pnext and fnext;
+ *   uno != 0: XP/XU/XS from rnext, pnext and fnext, then XC/XV/XT from
+ *             orig->acceptor* if orig->acceptorchr is set.
+ * XX and XY are ustart and uend shifted by orig->xstart-1 if xstart > 0.
+ *
+ * NOTE(review): Uint arguments are printed with %d throughout.
+ *
+ * @param orig record of the original alignment
+ * @param qual quality string of the read
+ * @param rname reference name written to the RNAME column
+ * @param pos position written to the POS column
+ * @param strand strand of the realigned part
+ * @param al the realignment
+ * @param ustart start of the part in the read (XX)
+ * @param uend end of the part in the read (XY)
+ * @param uno number of the part (XQ)
+ * @param off not used
+ * @param rnext reference name of the partner part
+ * @param pnext position of the partner part
+ * @param fnext strand of the partner part
+ * @return the SAM record, allocated with ALLOCMEMORY
+ * @author Steve Hoffmann
+ *
+ */
+
+ char*
+bl_matchfileRealignWriteSAM (matchfileRec_t *orig, char *qual,
+ char *rname, Uint pos, char strand, Alignment* al, Uint ustart, Uint uend,
+ Uint uno, Uint off, char *rnext, Uint pnext, char fnext)
+{
+
+
+ char *tag = NULL, *md, *cigar, flg=0, *newqual, *newseq;
+ Uint len=0, ptr=0, flag, allen =0, edist=0,
+ noofsplits=0, resplits=0;
+ int addx=0;
+
+
+ flag = orig->flag;
+ flag |= (1 << 8);
+ edist = getEdist(al);
+ allen = getUalignlen(al);
+
+ if(strand == '-') {
+ flag |= (1 << 4);
+ } else {
+ flag &= ~(1 << 4);
+ }
+
+ newqual = ALLOCMEMORY(space, NULL, char, strlen(qual)+1);
+
+ memmove(newqual, qual, strlen(qual)); /* no terminator copied; one is written after trimming below */
+ if(orig->strand != strand) {
+ newqual = strrev(newqual, strlen(qual));
+ }
+
+ newseq = ALLOCMEMORY(space, NULL, char, allen+1);
+ memmove(newseq, &al->u[al->uoff], allen);
+ memmove(newqual, &newqual[al->uoff], allen); /* shift the aligned window to the front of newqual */
+ newseq[allen] = 0;
+ newqual[allen] = 0;
+
+
+ noofsplits = orig->noofsplits;
+ resplits = 2;
+
+ cigar = cigarstring(al, 0, 0, 'S', 0);
+
+ md = mdstring(al,0);
+
+ len = snprintf(NULL, 0, "%s\t%d\t%s\t%d\t255\t%s\t*\t0\t0\t%s\t%s\t",
+ orig->curname, flag, rname, pos, cigar, newseq, newqual); /* first call only measures the length */
+ tag = ALLOCMEMORY(space, tag, char, ptr+len+1);
+ snprintf(&tag[ptr], len+1, "%s\t%d\t%s\t%d\t255\t%s\t*\t0\t0\t%s\t%s\t",
+ orig->curname, flag, rname, pos, cigar, newseq, newqual);
+ ptr += len; /* ptr always indexes the terminator of tag */
+
+ if(orig->noofsplits) {
+ len = snprintf(NULL, 0,
+ "NM:i:%d\tMD:Z:%s\tNH:i:%d\tXI:i:%d\tXL:i:%d\tXR:i:%d\tXO:i:%d\t",
+ edist, md, orig->curcnt, orig->identity,noofsplits, resplits, orig->xno);
+
+ tag = ALLOCMEMORY(space, tag, char, ptr+len+1);
+ snprintf(&tag[ptr], len+1,
+ "NM:i:%d\tMD:Z:%s\tNH:i:%d\tXI:i:%d\tXL:i:%d\tXR:i:%d\tXO:i:%d\t",
+ edist, md, orig->curcnt, orig->identity, noofsplits, resplits, orig->xno);
+ ptr += len;
+
+ } else {
+ len = snprintf(NULL, 0,
+ "NM:i:%d\tMD:Z:%s\tNH:i:%d\tXI:i:%d\tXR:i:%d\t",
+ edist, md, orig->curcnt, orig->identity, resplits);
+
+ tag = ALLOCMEMORY(space, tag, char, ptr+len+1);
+ snprintf(&tag[ptr], len+1,
+ "NM:i:%d\tMD:Z:%s\tNH:i:%d\tXI:i:%d\tXR:i:%d\t",
+ edist, md, orig->curcnt, orig->identity, resplits);
+ ptr += len;
+
+ }
+ if(orig->xstart>0){
+ addx=orig->xstart-1; /* shift that is added to ustart and uend below */
+ }
+ len = snprintf(NULL, 0, "XX:i:%d\tXY:i:%d\tXQ:i:%d\t", ustart+addx, uend+addx, uno);
+ tag = ALLOCMEMORY(space, tag, char, ptr+len+1);
+ snprintf(&tag[ptr], len+1, "XX:i:%d\tXY:i:%d\tXQ:i:%d\t", ustart+addx, uend+addx, uno);
+ ptr += len;
+
+ if(uno == 0) { /* part number 0: predecessor taken from orig, successor from rnext/pnext/fnext */
+ if(orig->donorchr) {
+ len = snprintf(NULL, 0, "XP:Z:%s\tXU:i:%d\tXS:i:%d\t",orig->donorchr,
+ orig->donorpos, orig->donorflg); /**/
+ tag = ALLOCMEMORY(space, tag, char, ptr+len+1);
+ snprintf(&tag[ptr], len+1,"XP:Z:%s\tXU:i:%d\tXS:i:%d\t",orig->donorchr,
+ orig->donorpos, orig->donorflg); /**/
+ ptr += len;
+ }
+
+ flg = (fnext == '-') ? 0 : SPLIT_NEXT_PLUS;
+
+ len = snprintf(NULL, 0, "XC:Z:%s\tXV:i:%d\tXT:i:%d", rnext, pnext, flg); /**/
+ tag = ALLOCMEMORY(space, tag, char, ptr+len+1);
+ snprintf(&tag[ptr], len+1,"XC:Z:%s\tXV:i:%d\tXT:i:%d",rnext, pnext, flg); /**/
+ ptr += len;
+
+ } else { /* any other part: predecessor from rnext/pnext/fnext, successor taken from orig */
+ flg = (fnext == '-') ? 0 : SPLIT_PREV_PLUS;
+ len = snprintf(NULL, 0, "XP:Z:%s\tXU:i:%d\tXS:i:%d",rnext, pnext, flg); /**/
+ tag = ALLOCMEMORY(space, tag, char, ptr+len+1);
+ snprintf(&tag[ptr], len+1,"XP:Z:%s\tXU:i:%d\tXS:i:%d",rnext, pnext, flg); /**/
+ ptr += len;
+
+ if(orig->acceptorchr) {
+ len = snprintf(NULL, 0, "\tXC:Z:%s\tXV:i:%d\tXT:i:%d",orig->acceptorchr,
+ orig->acceptorpos, orig->acceptorflg);
+ tag = ALLOCMEMORY(space, tag, char, ptr+len+1);
+ snprintf(&tag[ptr], len+1,"\tXC:Z:%s\tXV:i:%d\tXT:i:%d",orig->acceptorchr,
+ orig->acceptorpos, orig->acceptorflg);
+ ptr += len;
+ }
+ }
+
+
+ FREEMEMORY(space, newseq);
+ FREEMEMORY(space, newqual);
+ FREEMEMORY(space, cigar);
+ FREEMEMORY(space, md);
+
+
+ return tag;
+}
+
+/*------------------------- bl_matchfileRealign -------------------------
+ *
+ * @brief realign right and left sides of alignments
+ *
+ *                   median
+ *       -------------|----->      |
+ *       <------------|------      |
+ *                          median + interval
+ *
+ *
+ *        realign left sides of alignments
+ *                         median
+ *         |         <-----|------------
+ *   median-interval
+ *
+ * For every cluster T[i] with noofdistclust < MAXSPLICE the reads stored in
+ * rlist are unlisted whose start lies in [median-interval, median] (left
+ * set) or whose end lies in [median, median+interval] (left not set). The
+ * reads are handed to Readrealign, either in one call (threadno < 2 or
+ * fewer than two reads) or split into contiguous, nearly equal index ranges
+ * that are processed by up to threadno threads running Readthreadstarter.
+ *
+ * All thread arguments are filled in before the first thread is started:
+ * the arguments are cloned from slot 0, and cloning while the thread of
+ * slot 0 is already running could pick up a partial realign count.
+ *
+ * NOTE(review): T[i].median-interval is computed in Uint and wraps if the
+ * median is smaller than interval; no read is unlisted in that case.
+ *
+ * @param space      handed through to the allocation macros and the workers
+ * @param realigndev device the realignments are written to by the workers
+ * @param rlist      list of reads kept for realignment
+ * @param T          split site clusters
+ * @param n          number of clusters in T
+ * @param left       non-zero: select reads by start, zero: by end
+ * @param set        reference sequences, passed on to the workers
+ * @param chromidx   index of the current chromosome, passed on
+ * @param fmt        match file format, passed on
+ * @param interval   width of the window next to the cluster median
+ * @param threadno   maximum number of threads per cluster
+ * @param sinterval  passed on to the workers
+ * @param MAXSPLICE  clusters with at least this many distant clusters are
+ *                   skipped
+ * @return number of realignments reported by the workers
+ * @author Steve Hoffmann
+ *
+ */
+
+ Uint
+bl_matchfileRealign (void *space, FILE *realigndev, List *rlist, matchsplitsitecluster_t *T,
+    Uint n, char left, fasta_t *set, Uint chromidx, unsigned char fmt, Uint interval, int threadno, Uint sinterval,int MAXSPLICE)
+{
+
+  Uint i, q, nstr;
+  matchlistelem_t* arr=NULL;
+  Uint noofrealigns=0;
+  Uint workers, share, extra, first, last;
+  pthread_t *threads;
+  readthread *ReadT;
+
+  for(i=0; i < n; i++) {
+    if(T[i].noofdistclust >= MAXSPLICE) {
+      continue;
+    }
+
+    if(left) {
+      arr = bl_matchfileUnlistMatches(space, rlist, T[i].median-interval, T[i].median,
+          &nstr, 1);
+    } else {
+      arr = bl_matchfileUnlistMatches(space, rlist, T[i].median, T[i].median+interval,
+          &nstr, 0);
+    }
+
+    if(threadno<2||nstr<2) {
+      Readrealign(0, nstr-1, arr, fmt, &(T[i]),left,set,chromidx, &noofrealigns,realigndev, rlist,interval,space,sinterval);
+    } else {
+      /* the array is indexed and copied as readthread, so it has to be
+       * sized with that type */
+      ReadT = ALLOCMEMORY(space, NULL, readthread, threadno);
+      threads = ALLOCMEMORY(space, NULL, pthread_t, threadno);
+
+      /* never start more threads than there are reads */
+      workers = (nstr < (Uint) threadno) ? nstr : (Uint) threadno;
+      share = nstr / (Uint) threadno;
+      extra = nstr % (Uint) threadno;
+
+      /* slot 0 is the template for all workers */
+      ReadT[0].T=&(T[i]);
+      ReadT[0].arr=arr;
+      ReadT[0].fmt=fmt;
+      ReadT[0].left=left;
+      ReadT[0].set=set;
+      ReadT[0].chromidx=chromidx;
+      ReadT[0].noofrealigns=0;
+      ReadT[0].realigndev=realigndev;
+      ReadT[0].rlist=rlist;
+      ReadT[0].interval=interval;
+      ReadT[0].space=space;
+      ReadT[0].sinterval=sinterval;
+
+      /* worker q gets share reads, the first extra workers one more */
+      first = 0;
+      for(q=0; q < workers; q++) {
+        if(q > 0) {
+          memmove(&ReadT[q], &ReadT[0], sizeof(readthread));
+        }
+        last = first + share + ((q < extra) ? 1 : 0) - 1;
+        assert(last < nstr);
+        ReadT[q].begin=first;
+        ReadT[q].stop=last;
+        first = last + 1;
+      }
+
+      for(q=0; q < workers; q++) {
+        pthread_create(&threads[q], NULL, Readthreadstarter, &ReadT[q]);
+      }
+      for(q=0; q < workers; q++) {
+        pthread_join(threads[q], NULL);
+      }
+      for(q=0; q < workers; q++) {
+        noofrealigns+=ReadT[q].noofrealigns;
+      }
+
+      FREEMEMORY(space, threads);
+      FREEMEMORY(space, ReadT);
+    }
+
+    FREEMEMORY(space, arr);
+  }
+
+  return noofrealigns;
+}
+
+
+
+/*----------------------- bl_matchfileDumpUnRealigned ------------------------
+ *
+ * @brief  dump the un-realigned reads to device and clean list: every
+ *         element of llist and rlist that ends left of curstart is
+ *         unlinked. Its string is printed to realigndev unless the
+ *         bookkeeper already carries LEFTSPLIT or RIGHTSPLIT; printing
+ *         from llist sets LEFTSPLIT, printing from rlist sets RIGHTSPLIT.
+ *         Elements from llist keep their bookkeeper if RIGHTLIST is set,
+ *         elements from rlist always release it.
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileDumpUnRealigned (void *space, FILE* realigndev, List *llist, List *rlist,
+    Uint curstart)
+{
+  Uint node, succ;
+  matchlistelem_t *rec = NULL;
+
+  /* left list */
+  for(node = llist->first; node != -1; node = succ) {
+    succ = llist->nodes[node].next;
+    rec = (matchlistelem_t*) bl_listGetElem (llist, node);
+
+    if(rec->end >= curstart) {
+      continue;
+    }
+
+    rec = (matchlistelem_t*) bl_listUnlink(llist, node, NULL);
+
+    if(!((rec->bookkeeper[0] & LEFTSPLIT) ||
+         (rec->bookkeeper[0] & RIGHTSPLIT))) {
+      rec->bookkeeper[0] |= LEFTSPLIT;
+      fprintf(realigndev, "%s\n", rec->str);
+    }
+
+    if(!(rec->bookkeeper[0] & RIGHTLIST))
+      FREEMEMORY(space, rec->bookkeeper);
+
+    FREEMEMORY(space, rec->str);
+    FREEMEMORY(space, rec);
+  }
+
+  /* right list */
+  for(node = rlist->first; node != -1; node = succ) {
+    succ = rlist->nodes[node].next;
+    rec = (matchlistelem_t*) bl_listGetElem (rlist, node);
+
+    if(rec->end >= curstart) {
+      continue;
+    }
+
+    rec = (matchlistelem_t*) bl_listUnlink(rlist, node, NULL);
+
+    if(!((rec->bookkeeper[0] & LEFTSPLIT) ||
+         (rec->bookkeeper[0] & RIGHTSPLIT))) {
+      rec->bookkeeper[0] |= RIGHTSPLIT;
+      fprintf(realigndev, "%s\n", rec->str);
+    }
+
+    FREEMEMORY(space, rec->bookkeeper);
+    FREEMEMORY(space, rec->str);
+    FREEMEMORY(space, rec);
+  }
+
+  return ;
+}
+
+/*----------------------- bl_matchfileRealignScanFile ------------------------
+ *
+ * @brief read all matches from start to end on chromname
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileRealignScanFileNew(void *space, matchfile_t *file, FILE *realigndev,
+ fasta_t *set, unsigned char fields, matchsplitsiteclusterlist_t **Lclust,
+ matchsplitsiteclusterlist_t **Rclust, Uint *nchr, int threadno, int maxdist) {
+
+ stringset_t *token;
+
+ Uint buffersize=1024, //startbin, //endbin,
+ len=0, k=-1,
+ curstart=0, curend=0,
+ acceptorpos=0, donorpos=0, /*xstart, xend, xno,*/
+ acceptorchridx, donorchridx,
+ acceptorflg = 0, donorflg = 0, curchromidx=0, adjoint=0
+ //,counter=0, pnext = 0, curcnt = 0
+ ;
+ char *buffer = NULL, *buffer2 = NULL, *buffer3 = NULL,
+ ch,
+ //*curseq=NULL, *curqual=NULL,
+ //*curaln,
+ *filename, strand,
+ *acceptorchr = NULL, *donorchr = NULL,
+ //*rnext,
+ *curchrom;
+ unsigned char header = 1;
+ //readlen=0, u,
+ /*int edist=0;*/
+
+ matchfileindex_t *index;
+ unsigned char gzip, fmt;/*, curedist=0;*/
+ FILE *fp = NULL;
+ struct gzidxfile *gzf = NULL;
+ matchsplitsitecluster_t *T=NULL;
+ matchsplitsiteclusterlist_t *L=NULL, *R=NULL;
+ int gzlen;
+ Uint temp = 0;
+ List rightl, leftl, rightrealign, leftrealign;
+ Uint interval=10;
+ matchsplitsite_t *rsites=NULL;
+ matchsplitsite_t *lsites=NULL;
+ Uint nooflsites=0;
+ Uint noofrsites=0;
+ Uint trans = 0;
+ Uint noofleftrealigns = 0, noofrightrealigns = 0;
+ unsigned char *bookkeeper=NULL;
+
+ gzip = file->gzip;
+ fmt = file->fmt;
+ filename = file->filename;
+ index = file->index;
+ buffer = ALLOCMEMORY(space, NULL, char, buffersize);
+
+ if (gzip) {
+ index->gzindex = bl_zranGetIndex(filename, &gzlen);
+ fp = fopen(filename, "rb");
+ gzf = bl_initgzidxfile(fp, index->gzindex, 0, CHUNK);
+ } else {
+ fp = fopen(filename, "r");
+ }
+
+ if(fp == NULL || fseek(fp, 0, SEEK_END) < 0)
+ {
+ DBGEXIT("Couldn't open file %s. Exit forced!\n", filename);
+ }
+
+ fseek(fp, 0, SEEK_SET);
+
+ //initialize buffer list
+ bl_listInit(&rightl, 100, sizeof(matchlistelem_t));
+ bl_listInit(&leftl, 100, sizeof(matchlistelem_t));
+ bl_listInit(&rightrealign, 100, sizeof(matchlistelem_t));
+ bl_listInit(&leftrealign, 100, sizeof(matchlistelem_t));
+
+ L = *Lclust;
+ R = *Rclust;
+
+ while((ch = (gzip) ? bl_getgzidxc(gzf) : getc(fp)) != EOF) {
+
+ if(len == buffersize-1) {
+ buffersize = 2*buffersize+1;
+ buffer = ALLOCMEMORY(space, buffer, char, buffersize);
+ }
+
+ if(ch == '\n' && len > 0) {
+
+ buffer = ALLOCMEMORY(space, buffer, char, len+1);
+ buffer[len] = '\0';
+
+#ifdef DBGIDX
+ DBG("buffer: %s\n", buffer);
+#endif
+
+ if(header) header = bl_matchfileIsHeader(buffer, len, fmt);
+
+ if(!header) {
+ token = tokensToStringset(space, "\t", buffer, len);
+
+ curchrom = bl_matchfileGetChrom(token, fmt);
+ curstart = bl_matchfileGetStartPos(token, fmt);
+ curend = bl_matchfileGetEndPos(token, fmt);
+ curchromidx = bl_matchfileGetChromIndexNumber(index, curchrom);
+ if(k == -1) {
+ k = curchromidx;
+ // fprintf(stderr, "chrom:%s %d\n", curchrom, curchromidx);
+ }
+
+ if(k != curchromidx) {
+ //finalize stuff
+
+ T = bl_matchfileScanList (space, &leftl,
+ lsites, nooflsites, interval, -1, 1, &temp);
+
+ if(realigndev) {
+ noofleftrealigns += bl_matchfileRealign(space, realigndev, &leftrealign, T, temp, 1, set,
+ k, fmt, 25,threadno, interval, maxdist);
+ bl_listSweep(&leftrealign);
+ }
+
+ L[k].cluster =
+ bl_matchJoinSplitSiteClusters(space, L[k].cluster, T, L[k].noofclusters, temp);
+ L[k].noofclusters += temp;
+
+ FREEMEMORY(space, T);
+ FREEMEMORY(space, lsites);
+ nooflsites = 0;
+
+ T = bl_matchfileScanList (space, &rightl,
+ rsites, noofrsites, interval, -1, 0, &temp);
+
+ if(realigndev) {
+ noofrightrealigns += bl_matchfileRealign(space, realigndev, &rightrealign, T, temp, 0, set, k, fmt, 25,threadno,interval,maxdist);
+ bl_listSweep(&rightrealign);
+ bl_matchfileDumpUnRealigned(space, realigndev, &leftrealign, &rightrealign, -1);
+ }
+
+ R[k].cluster =
+ bl_matchJoinSplitSiteClusters(space, R[k].cluster, T, R[k].noofclusters, temp);
+ R[k].noofclusters += temp;
+
+ FREEMEMORY(space, T);
+ FREEMEMORY(space, rsites);
+ noofrsites = 0;
+
+ k = curchromidx;
+
+ bl_listDestruct(&rightrealign, NULL);
+ bl_listDestruct(&leftrealign, NULL);
+ bl_listInit(&rightrealign, 100, sizeof(matchlistelem_t));
+ bl_listInit(&leftrealign, 100, sizeof(matchlistelem_t));
+
+ // fprintf(stderr, "chrom:%s %d\n", curchrom, curchromidx);
+ }
+
+
+ //fprintf(stderr, "reading %s: %d - %d\n", curchrom, curstart, curend);
+ /*last condition to avoid inclusion of 0-alignments in BAM files*/
+ if (curstart != curend && curend+1 > 0) {
+
+ /* edist = bl_matchfileGetEdist(token, fmt);*/
+ strand = bl_matchfileGetStrand(token, fmt);
+
+ if(fields & MFREAD_SPLITS) {
+ acceptorpos = bl_matchfileGetNextPos(token, fmt);
+ acceptorchr = bl_matchfileGetNextChr(token, fmt);
+ acceptorflg = bl_matchfileGetNextFlag(token, fmt);
+ donorpos = bl_matchfileGetPrevPos(token, fmt);
+ donorchr = bl_matchfileGetPrevChr(token, fmt);
+ donorflg = bl_matchfileGetPrevFlag(token, fmt);
+ /*xstart = bl_matchfileGetSplitStart(token, fmt);
+ xend = bl_matchfileGetSplitEnd(token, fmt);
+ xno = bl_matchfileGetSplitNumber(token, fmt);*/
+ }
+
+ /* if(edist > 255) {
+ curedist = 255;
+ } else curedist = edist;*/
+
+ donorchridx = -1;
+ acceptorchridx = -1;
+
+
+ /*
+ * rsites (lsites) holds split positions that occur at
+ * rightmost (leftmost)
+ * sites of split alignments. Both arrays are sorted and the
+ * largest position is last.
+ * The actual alignment information is stored in two unsorted lists.
+ * They are scanned if the current alignment
+ * start is at least interval nt larger then the largest element
+ * in lsites or rsites.
+ * In this case there is no further split alignment (sorted file)
+ * that belongs to a cluster of splits in the list. The scan
+ * returns an array of split site clusters (interval, median ...)
+ * which is appended to the L or R split site cluster array
+ *
+ */
+
+ /*
+ * cleaning up
+ *
+ */
+
+ if(lsites && lsites[nooflsites-1].pos+interval < curstart) {
+
+ T = bl_matchfileScanList (space, &leftl,
+ lsites, nooflsites, interval, curstart, 1, &temp);
+
+ if(realigndev) {
+ noofleftrealigns += bl_matchfileRealign(space, realigndev, &leftrealign, T, temp, 1, set,
+ curchromidx, fmt, 25,threadno, interval, maxdist);
+ bl_listSweep(&leftrealign);
+ }
+
+ L[k].cluster =
+ bl_matchJoinSplitSiteClusters(space, L[k].cluster, T, L[k].noofclusters, temp);
+ L[k].noofclusters += temp;
+
+ bl_listSweep(&leftl);
+ FREEMEMORY(space, T);
+ FREEMEMORY(space, lsites);
+ nooflsites = 0;
+ }
+
+ if(rsites && rsites[noofrsites-1].pos+interval < curstart) {
+
+ T = bl_matchfileScanList (space, &rightl,
+ rsites, noofrsites, interval, curstart, 0, &temp);
+
+ if(realigndev) {
+ noofrightrealigns += bl_matchfileRealign(space, realigndev, &rightrealign, T, temp, 0, set,
+ curchromidx, fmt, 25,threadno, interval, maxdist);
+ bl_listSweep(&rightrealign);
+ bl_matchfileDumpUnRealigned(space, realigndev, &leftrealign, &rightrealign, curstart);
+ }
+
+ R[k].cluster =
+ bl_matchJoinSplitSiteClusters(space, R[k].cluster, T, R[k].noofclusters, temp);
+ R[k].noofclusters += temp;
+
+ bl_listSweep(&rightl);
+ FREEMEMORY(space, T);
+ FREEMEMORY(space, rsites);
+ noofrsites = 0;
+ }
+
+ /*
+ * building up lists
+ *
+ */
+
+ if(acceptorchr) {
+ acceptorchridx = bl_matchfileGetChromIndexNumber(index, acceptorchr);
+ }
+
+ if(realigndev) {
+ if(!acceptorchr || !donorchr) {
+ bookkeeper = ALLOCMEMORY(space, NULL, unsigned char, 1);
+ bookkeeper[0] = 0;
+
+ buffer2 = ALLOCMEMORY(space, NULL, char, len+1);
+ memmove(buffer2, buffer, len);
+ buffer2[len] = 0;
+ }
+
+ if(!acceptorchr && !donorchr) {
+ buffer3 = ALLOCMEMORY(space, NULL, char, len+1);
+ memmove(buffer3, buffer, len);
+ buffer3[len] = 0;
+ }
+ }
+
+ if(donorchr) {
+
+ trans = 0;
+ donorchridx = bl_matchfileGetChromIndexNumber(index, donorchr);
+
+ if(strand == '-') {
+ trans = (donorflg & SPLIT_PREV_PLUS) ? 1 : 0;
+ adjoint = (acceptorchridx != -1) ? curstart : -1;
+ /*Do not take into account very short introns (size<10)*/
+ if(trans || donorchridx != curchromidx || (donorpos-curend)*(donorpos-curend)>100) {
+ bl_matchfileEnlistMatch (space, &rightl, curstart, curend,
+ donorchridx, donorpos, adjoint, trans, NULL, NULL);
+
+ rsites = bl_matchfileUpdateSplitSites (space, curend,
+ rsites, &noofrsites, interval, trans);
+ }
+ /* else {
+ fprintf(stderr,"NOT included in splicelist: %s\n",buffer);
+ }*/
+ //store read for realignment
+ if(!acceptorchr && realigndev) {
+ bookkeeper[0] |= LEFTLIST;
+ bl_matchfileEnlistMatch (space, &leftrealign, curstart, curend,
+ donorchridx, donorpos, adjoint, trans, buffer2, bookkeeper);
+ }
+
+ } else {
+ trans = (!(donorflg & SPLIT_PREV_PLUS)) ? 1 : 0;
+
+ adjoint = (acceptorchridx != -1) ? curend : -1;
+ if(trans || donorchridx != curchromidx || (donorpos-curstart)*(donorpos-curstart)>100) {
+ bl_matchfileEnlistMatch (space, &leftl, curstart, curend,
+ donorchridx, donorpos, adjoint, trans, NULL, NULL);
+
+ lsites = bl_matchfileUpdateSplitSites(space, curstart,
+ lsites, &nooflsites, interval, trans);
+ }
+ /* else {
+ fprintf(stderr,"NOT included in splicelist: %s\n",buffer);
+ }*/
+ //store read for realignment
+ if(!acceptorchr && realigndev) {
+ bookkeeper[0] |= RIGHTLIST;
+ bl_matchfileEnlistMatch (space, &rightrealign, curstart, curend,
+ donorchridx, donorpos, adjoint, trans, buffer2, bookkeeper);
+ }
+ }
+ }
+
+ if(acceptorchr) {
+
+ if(strand == '-') {
+ trans = (acceptorflg & SPLIT_NEXT_PLUS) ? 1 : 0;
+ adjoint = (donorchridx != -1) ? curend : -1;
+ if(trans || acceptorchridx != curchromidx || (acceptorpos-curstart)*(acceptorpos-curstart)>100) {
+ bl_matchfileEnlistMatch (space, &leftl, curstart, curend,
+ acceptorchridx, acceptorpos, adjoint, trans, NULL, NULL);
+
+ lsites = bl_matchfileUpdateSplitSites(space, curstart,
+ lsites, &nooflsites, interval, trans);
+ }
+ /* else {
+ fprintf(stderr,"NOT included in splicelist: %s\n",buffer);
+ }*/
+ //store read for realignment
+ if(!donorchr && realigndev) {
+ bookkeeper[0] |= RIGHTLIST;
+ bl_matchfileEnlistMatch (space, &rightrealign, curstart, curend,
+ acceptorchridx, acceptorpos, adjoint, trans, buffer2, bookkeeper);
+ }
+
+ } else {
+ trans = (!(acceptorflg & SPLIT_NEXT_PLUS)) ? 1 : 0;
+ adjoint = (donorchridx != -1) ? curstart : -1;
+ if(trans || acceptorchridx != curchromidx || (acceptorpos-curend)*(acceptorpos-curend)>100) {
+ bl_matchfileEnlistMatch (space, &rightl, curstart, curend,
+ acceptorchridx, acceptorpos, adjoint, trans, NULL, NULL);
+
+ rsites = bl_matchfileUpdateSplitSites(space, curend,
+ rsites, &noofrsites, interval, trans);
+ }
+ /* else {
+ fprintf(stderr,"NOT included in splicelist: %s\n",buffer);
+ }*/
+ //store read for realignment
+ if(!donorchr && realigndev) {
+ bookkeeper[0] |= LEFTLIST;
+ bl_matchfileEnlistMatch (space, &leftrealign, curstart, curend,
+ acceptorchridx, acceptorpos, adjoint, trans, buffer2, bookkeeper);
+ }
+ }
+
+ }
+
+ //store read for realignment
+ if(!acceptorchr && !donorchr && realigndev) {
+ bookkeeper[0] |= LEFTLIST;
+ bookkeeper[0] |= RIGHTLIST;
+
+ bl_matchfileEnlistMatch (space, &leftrealign, curstart,
+ curend, acceptorchridx, acceptorpos, adjoint, trans, buffer2, bookkeeper);
+
+ bl_matchfileEnlistMatch (space, &rightrealign, curstart,
+ curend, acceptorchridx, acceptorpos, adjoint, trans, buffer3, bookkeeper);
+ }
+ if(acceptorchr && donorchr && realigndev) {
+ // printf( "Was here!\n");
+ if (realigndev != NULL) {fprintf(realigndev,"%s\n",buffer);} /**/
+ }
+ }
+
+ destructStringset(space, token);
+ }
+ else {
+ if (realigndev != NULL) {fprintf(realigndev,"%s\n",buffer);}
+ }
+
+ buffer = ALLOCMEMORY(space, buffer, char, buffersize);
+ len = 0;
+
+ } else {
+ if(ch != '\n') buffer[len++] = ch;
+ }
+ }
+
+ T = bl_matchfileScanList (space, &leftl, lsites, nooflsites, interval, -1, 1, &temp);
+
+ if(realigndev) {
+ noofleftrealigns += bl_matchfileRealign(space, realigndev, &leftrealign, T, temp, 1, set,
+ curchromidx, fmt, 25,threadno, interval, maxdist);
+ bl_listSweep(&leftrealign);
+ }
+
+ if(k != -1) {
+ L[k].cluster = bl_matchJoinSplitSiteClusters(space, L[k].cluster, T, L[k].noofclusters, temp);
+ L[k].noofclusters += temp;
+ }
+
+ FREEMEMORY(space, T);
+ FREEMEMORY(space, lsites);
+
+ T = bl_matchfileScanList (space, &rightl, rsites, noofrsites, interval, -1, 0, &temp);
+
+ if(realigndev) {
+ noofrightrealigns += bl_matchfileRealign(space, realigndev, &rightrealign, T, temp, 0, set,
+ curchromidx, fmt, 25,threadno, interval, maxdist);
+
+ bl_listSweep(&rightrealign);
+ bl_matchfileDumpUnRealigned(space, realigndev, &leftrealign, &rightrealign, -1);
+ }
+
+ if(k != -1) {
+ R[k].cluster =
+ bl_matchJoinSplitSiteClusters(space, R[k].cluster, T, R[k].noofclusters, temp);
+ R[k].noofclusters += temp;
+ }
+
+ FREEMEMORY(space, T);
+ FREEMEMORY(space, rsites);
+
+
+ bl_listDestruct(&rightl, NULL);
+ bl_listDestruct(&leftl, NULL);
+ bl_listDestruct(&rightrealign, NULL);
+ bl_listDestruct(&leftrealign, NULL);
+
+ *Lclust = L;
+ *Rclust = R;
+
+ FREEMEMORY(space, buffer);
+
+ fclose(fp);
+ if(gzip) bl_destructgzidxfile(gzf);
+ FREEMEMORY(space, gzf);
+
+ return noofrightrealigns+noofleftrealigns;
+}
+
+/*-------------------------- bl_matchAddDistClusterLink --------------------------
+ *
+ * @brief record a link from cluster e1 to a distant cluster. If e1 already
+ *        holds a link with the same chromosome, cluster index and type, the
+ *        counters of that link are increased; otherwise one new entry is
+ *        appended to each of the parallel distclust* arrays. In both cases
+ *        the realigned count is added to e1->cnt and to the link's
+ *        distclustcnt in addition to cnt.
+ * @param e1        cluster that receives the link
+ * @param chr       value stored in / compared with distclustchr
+ * @param link      value stored in / compared with distclust
+ * @param cnt       amount added to the link's distclustcnt
+ * @param realigned amount added to distclustrealigned, distclustcnt and cnt
+ * @param type      value stored in / compared with distclusttype
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchAddDistClusterLink(matchsplitsitecluster_t *e1, Uint chr, Uint link,
+    Uint cnt, Uint realigned, char type)
+{
+  Uint slot;
+  Uint known = e1->noofdistclust;
+
+  e1->cnt += realigned;
+
+  /* an existing link with identical chromosome, cluster and type is updated */
+  slot = 0;
+  while(slot < known) {
+    if(e1->distclustchr[slot] == chr &&
+        e1->distclust[slot] == link && e1->distclusttype[slot] == type) {
+      e1->distclustcnt[slot] += cnt;
+      e1->distclustrealigned[slot] += realigned;
+      e1->distclustcnt[slot] += realigned;
+      return ;
+    }
+    slot++;
+  }
+
+  /* no such link yet: append one element to every parallel array */
+  e1->distclust = bl_insertArray(e1->distclust, known,
+      sizeof(Uint), &link, known);
+
+  e1->distclustchr = bl_insertArray(e1->distclustchr, known,
+      sizeof(Uint), &chr, known);
+
+  e1->distclustcnt = bl_insertArray(e1->distclustcnt, known,
+      sizeof(Uint), &cnt, known);
+
+  e1->distclustrealigned = bl_insertArray(e1->distclustrealigned, known,
+      sizeof(Uint), &realigned, known);
+
+  e1->distclusttype = bl_insertArray(e1->distclusttype, known,
+      sizeof(char), &type, known);
+
+  e1->noofdistclust++;
+
+  /* the fresh entry was initialised with cnt; realigned is counted on top */
+  e1->distclustcnt[known] += realigned;
+
+  return ;
+}
+
+
+/*---------------------- bl_matchAddAdjoinedClusterLink ----------------------
+ *
+ * @brief add a link to an adjoined cluster. If e1 already lists the cluster
+ *        index link in adjclust, cnt is added to the matching entry of
+ *        adjclustweights; otherwise link and cnt are appended to the two
+ *        parallel arrays.
+ * @param e1   cluster that receives the link
+ * @param link index of the adjoined cluster
+ * @param cnt  weight added to (or stored for) the link
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchAddAdjoinedClusterLink (matchsplitsitecluster_t *e1, Uint link, Uint cnt)
+{
+  Uint slot;
+  Uint known = e1->noofadjclust;
+
+  /* known link: only its weight grows */
+  slot = 0;
+  while(slot < known) {
+    if(e1->adjclust[slot] == link) {
+      e1->adjclustweights[slot] += cnt;
+      return ;
+    }
+    slot++;
+  }
+
+  /* new link: append index and weight */
+  e1->adjclust = bl_insertArray(e1->adjclust, known,
+      sizeof(Uint), &link, known);
+
+  e1->adjclustweights = bl_insertArray(e1->adjclustweights, known,
+      sizeof(Uint), &cnt, known);
+
+  e1->noofadjclust++;
+
+  return ;
+}
+
+/*----------------------- bl_matchLinkAdjoinedCluster -----------------------
+ *
+ * @brief link all clusters that are adjoined by two splice sites. For every
+ *        cluster of R and each of its adjoint offsets that is negative, the
+ *        position median+offset is looked up in the L clusters of the same
+ *        chromosome. If the cluster returned by the binary search contains
+ *        that position, both clusters receive an adjoined cluster link
+ *        weighted with the adjoint count.
+ * @param space unused in this function
+ * @param L     per-chromosome cluster lists that are searched
+ * @param R     per-chromosome cluster lists whose adjoints are followed
+ * @param nchr  number of chromosomes, i.e. entries in L and R
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchLinkAdjoinedCluster(void *space, matchsplitsiteclusterlist_t *L,
+    matchsplitsiteclusterlist_t *R, Uint nchr)
+{
+  Uint chrom, ridx, site, target, hit;
+  matchsplitsitecluster_t *right, *left;
+
+  for(chrom = 0; chrom < nchr; chrom++) {
+    for(ridx = 0; ridx < R[chrom].noofclusters; ridx++) {
+      right = &R[chrom].cluster[ridx];
+
+      for(site = 0; site < right->noofadjoints; site++) {
+
+        /* only negative adjoint offsets are followed */
+        if(!(right->adjoint[site] < 0)) {
+          continue;
+        }
+
+        target = right->median + right->adjoint[site];
+        hit = binarySearch_left(L[chrom].cluster, L[chrom].noofclusters,
+            &target, cmp_matchsplitsitecluster_bin, NULL);
+
+        /* search ran off the end of the list */
+        if(!(hit < L[chrom].noofclusters)) {
+          continue;
+        }
+
+        left = &L[chrom].cluster[hit];
+
+        /* candidate interval does not contain the target position */
+        if(!(left->a <= target && target <= left->b)) {
+          continue;
+        }
+
+        bl_matchAddAdjoinedClusterLink (left, ridx, right->adjointcnt[site]);
+        bl_matchAddAdjoinedClusterLink (right, hit, right->adjointcnt[site]);
+      }
+    }
+  }
+
+  return ;
+}
+
+
+/*--------------------------- bl_matchLinkDistCluster ----------------------------
+ *
+ * @brief get loc links from cluster lists
+ *
+ * For every chromosome i < n the distant split sites (distpos/distchr/
+ * distcnt/disttrans) of all clusters are resolved to cluster indices and
+ * stored as dist cluster links via bl_matchAddDistClusterLink:
+ *
+ * - R clusters, site not trans: the position is searched in L[chr]; on a hit
+ *   both clusters are linked with type 1.
+ * - R clusters, site trans: the position is searched in R[chr]; the link
+ *   (type 2) is only made if chr > i, or chr == i and the target cluster is
+ *   not located before the current one in the array, so that a pair is
+ *   linked once. A cluster pointing to itself is linked a single time.
+ * - L clusters, site trans: the position is searched in L[chr]; the link
+ *   (type 3) is made under the same ordering condition.
+ *
+ * The realigned value passed along is realigned[k] of the current cluster
+ * (0 if it has no realigned array) plus realigned[q] of the first matching
+ * dist site q of the target cluster, if one is found.
+ *
+ * Misses in the two R passes are reported on stderr; misses in the L pass
+ * are ignored silently. With CHECKLINKS defined the link counts (lnkcnt) are
+ * maintained and compared with cnt afterwards; with PARANOID in addition the
+ * per-site linked[] counters are maintained and checked.
+ *
+ * @param space not used in this function
+ * @param R per-chromosome cluster lists (n entries)
+ * @param L per-chromosome cluster lists (n entries)
+ * @param n number of chromosomes
+ * @author Steve Hoffmann
+ *
+ */
+
+ void
+bl_matchLinkDistCluster (void *space, matchsplitsiteclusterlist_t *R,
+ matchsplitsiteclusterlist_t *L, Uint n)
+{
+ Uint i, j, k, u, q, trans, cnt, pos, chr;
+ matchsplitsitecluster_t *e1, *e2; /* e1: current cluster, e2: link target */
+ Uint realigned;
+
+
+#if defined CHECKLINKS
+ Uint temp;
+#endif
+#if defined PARANOID && defined CHECKLINKS
+ Uint dpos, v;
+#endif
+
+ //all chroms
+ for(i=0; i < n; i++) {
+ //all clusters
+ for(j=0; j < R[i].noofclusters; j++) {
+ //all dist sites R2L and R2R
+ for(k=0; k < R[i].cluster[j].distsites; k++){
+ e1 = &R[i].cluster[j];
+ pos = R[i].cluster[j].distpos[k];
+ chr = R[i].cluster[j].distchr[k];
+ cnt = R[i].cluster[j].distcnt[k];
+ trans = R[i].cluster[j].disttrans[k];
+
+ if(R[i].cluster[j].realigned) /* the realigned array is optional */
+ realigned = R[i].cluster[j].realigned[k];
+ else
+ realigned = 0;
+
+ if(!trans) { /* R2L: the target is searched in the L list of chr */
+
+ u = binarySearch_left(L[chr].cluster, L[chr].noofclusters, &pos,
+ cmp_matchsplitsitecluster_bin, NULL);
+
+ if(u < L[chr].noofclusters) {
+ e2 = &L[chr].cluster[u];
+ if(pos >= e2->a && e2->b >= pos) {
+
+ if(e2->realigned) {
+ for(q=0; q < e2->distsites; q++) {
+ if(e2->distchr[q] == chr && e2->distpos[q]>=e1->a && e1->b>=e2->distpos[q]) /* NOTE(review):
+ * e1 lives on chromosome i, and the PARANOID block below compares
+ * e2->distchr[v] with i. Comparing with chr here only agrees with
+ * that when chr == i - confirm whether i was intended. The same
+ * comparison is used in the R2R and L2L passes. */
+ break;
+ }
+ if(q< e2->distsites) {
+ realigned += e2->realigned[q];
+ }
+ }
+
+ bl_matchAddDistClusterLink(e1, chr, u, cnt, realigned, 1);
+ bl_matchAddDistClusterLink(e2, i, j, cnt, realigned, 1);
+
+
+#ifdef CHECKLINKS
+ R[i].cluster[j].lnkcnt += cnt;
+ e2->lnkcnt += cnt;
+#endif
+#if defined PARANOID && defined CHECKLINKS
+ if(R[i].cluster[j].linked[k] < cnt) {
+ dpos = R[i].cluster[j].a + R[i].cluster[j].rpos[k];
+ for(v=0; v < e2->distsites; v++) {
+
+ if(e2->distchr[v] == i && dpos == e2->distpos[v] &&
+ e2->rpos[v] +e2->a == pos &&
+ e2->disttrans[v] == trans) {
+ assert(cnt == e2->distcnt[v]);
+ e2->linked[v] += cnt;
+ R[i].cluster[j].linked[k] += e2->distcnt[v];
+
+ }
+ }
+ }
+#endif
+ } else {
+ fprintf(stderr, "cluster not found (range check) [%d,%d] looking for %d chr %d\n",e1->a, e1->b, pos,chr);
+ }
+ } else {
+ fprintf(stderr,"cluster not found (search): [%d,%d] looking for %d, chr %d\n",
+ e1->a, e1->b, pos,chr);
+ for(q=0; q < L[chr].noofclusters; q++) { /* diagnostic linear scan only; nothing is linked here */
+ if(pos >= L[chr].cluster[q].a && L[chr].cluster[q].b >= pos) {
+ fprintf(stderr, "found in linear scan\n");
+ break;
+ }
+ }
+ if(q == L[chr].noofclusters) {
+ fprintf(stderr, "not found in linear scan: pos:%d:%d\n", chr, pos);
+ }
+ }
+
+ } else { /* R2R: trans site, the target is searched in the R list of chr */
+
+ u = binarySearch_left(R[chr].cluster, R[chr].noofclusters, &pos,
+ cmp_matchsplitsitecluster_bin, NULL);
+
+ if(u < R[chr].noofclusters) {
+ e2 = &R[chr].cluster[u];
+ if(pos >= e2->a && e2->b >= pos) {
+
+ //TOO MUCH IS LINKED HERE! TRANS1 -> TRANS2 AND TRANS2->TRANS1
+ //- BELOW (PARANOID) THIS IS AVOIDED BY RIGOROUS POS CHECKING
+ //HENCE LINK ONLY IF ELEM >
+
+ if(chr > i || (chr == i && e2 >= &R[i].cluster[j])) {
+ if(e2->realigned) {
+ for(q=0; q < e2->distsites; q++) {
+ if(e2->distchr[q] == chr && e2->distpos[q]>=e1->a && e1->b>=e2->distpos[q])
+ break;
+ }
+ if(q< e2->distsites) {
+ if(e1 != e2) { /*do not increase twice if splicing back onto itself*/
+ realigned += e2->realigned[q];
+ }
+ }
+ }
+
+ bl_matchAddDistClusterLink(e1, chr, u, cnt, realigned, 2);
+ if(e1 != e2) { /*do not increase twice if splicing back onto itself*/
+ bl_matchAddDistClusterLink(e2, i, j, cnt, realigned, 2);
+ }
+ }
+#ifdef CHECKLINKS
+ if(chr > i || (chr == i && e2 >= &R[i].cluster[j])) {
+ R[i].cluster[j].lnkcnt += cnt;
+
+ if(chr > i || j != u) {
+ e2->lnkcnt += cnt;
+ }
+ }
+#endif
+#if defined PARANOID && defined CHECKLINKS
+ if(R[i].cluster[j].linked[k] < cnt) {
+
+ dpos = R[i].cluster[j].a + R[i].cluster[j].rpos[k];
+
+ for(v=0; v < e2->distsites; v++) {
+ if(e2->distchr[v] == i && dpos == e2->distpos[v] &&
+ e2->rpos[v]+e2->a == pos &&
+ e2->disttrans[v] == trans &&
+ e2->linked[v] < e2->distcnt[v]) {
+
+ assert(cnt == e2->distcnt[v]);
+ if(R[i].cluster[j].linked != e2->linked || k != v)
+ e2->linked[v] += cnt;
+ R[i].cluster[j].linked[k] += e2->distcnt[v];
+ }
+ }
+ }
+#endif
+ } else {
+ fprintf(stderr, "trans R2R cluster not found (range check)[%d,%d] looking for %d chr %d\n",e1->a, e1->b, pos,chr);
+ }
+ } else {
+ fprintf(stderr, "trans R2R cluster not found (search)[%d,%d] looking for %d chr %d\n",e1->a, e1->b, pos,chr);
+ }
+ }
+ }
+ }
+
+ //L2L (only trans sites of the L clusters are handled in this pass)
+ for(j=0; j < L[i].noofclusters; j++) {
+
+ for(k=0; k < L[i].cluster[j].distsites; k++){
+
+ e1 = &L[i].cluster[j];
+
+ pos = L[i].cluster[j].distpos[k];
+ chr = L[i].cluster[j].distchr[k];
+ cnt = L[i].cluster[j].distcnt[k];
+ trans = L[i].cluster[j].disttrans[k];
+ if(L[i].cluster[j].realigned)
+ realigned = L[i].cluster[j].realigned[k];
+ else
+ realigned = 0;
+
+ if(trans) {
+
+ u = binarySearch_left(L[chr].cluster, L[chr].noofclusters, &pos,
+ cmp_matchsplitsitecluster_bin, NULL);
+
+ if(u < L[chr].noofclusters) {
+ e2 = &L[chr].cluster[u];
+ if(pos >= e2->a && e2->b >= pos) {
+
+ if(chr > i || (chr == i && e2 >= &L[i].cluster[j])) {
+
+ if(e2->realigned) {
+ for(q=0; q < e2->distsites; q++) {
+ if(e2->distchr[q] == chr && e2->distpos[q]>=e1->a && e1->b>=e2->distpos[q])
+ break;
+ }
+ if(q< e2->distsites) {
+ realigned += e2->realigned[q];
+ }
+ }
+
+ bl_matchAddDistClusterLink(e1, chr, u, cnt, realigned, 3);
+ bl_matchAddDistClusterLink(e2, i, j, cnt, realigned, 3); /* NOTE(review): unlike the
+ * R2R pass there is no e1 != e2 guard here, so a cluster linking
+ * to itself is updated twice - confirm this is intended. */
+ }
+#ifdef CHECKLINKS
+ if(chr > i || (chr == i && e2 >= &L[i].cluster[j])) {
+ L[i].cluster[j].lnkcnt += cnt;
+ if(chr > i || j != u)
+ e2->lnkcnt += cnt;
+ }
+#endif
+#if defined PARANOID && defined CHECKLINKS
+ if(trans && L[i].cluster[j].linked[k] < cnt) {
+ dpos = L[i].cluster[j].a + L[i].cluster[j].rpos[k];
+ for(v=0; v < e2->distsites; v++) {
+ if(e2->distchr[v] == i && dpos == e2->distpos[v]
+ && pos == e2->rpos[v]+e2->a
+ && e2->disttrans[v] == trans) {
+ assert(cnt == e2->distcnt[v]);
+ if(L[i].cluster[j].linked != e2->linked || k != v)
+ e2->linked[v] += cnt;
+ L[i].cluster[j].linked[k] += e2->distcnt[v];
+
+ }
+ }
+ }
+#endif
+ }
+ }
+ }
+ }
+ }
+ }
+
+#ifdef CHECKLINKS
+ for(i=0; i < n; i++) {
+ //all clusters
+ for(j=0; j < R[i].noofclusters; j++) {
+ //all dist sites R2L and R2R: report clusters whose link count differs from cnt
+ temp = 0;
+
+ if(R[i].cluster[j].cnt != R[i].cluster[j].lnkcnt) {
+ fprintf(stderr, "R linkcount wrong: %d <> %d (trans:%d)\n",
+ R[i].cluster[j].cnt, R[i].cluster[j].lnkcnt, R[i].cluster[j].trans);
+ }
+
+ for(k=0; k < R[i].cluster[j].distsites; k++){
+ temp += R[i].cluster[j].distcnt[k];
+
+#if defined PARANOID && defined CHECKLINKS
+ if(R[i].cluster[j].distcnt[k] != R[i].cluster[j].linked[k]) {
+ fprintf(stderr,
+ "R wrong trans: (%d, %d, %d): %d:[%d]-%d <> %d (trans %d)\n",
+ i, j, k,R[i].cluster[j].distchr[k], R[i].cluster[j].distpos[k],
+ R[i].cluster[j].distcnt[k], R[i].cluster[j].linked[k],
+ R[i].cluster[j].disttrans[k]);
+ }
+#endif
+ }
+ assert(temp == R[i].cluster[j].cnt); /* the dist site counts must sum up to the cluster count */
+ }
+ }
+
+ for(i=0; i < n; i++) {
+ //all clusters
+ for(j=0; j < L[i].noofclusters; j++) {
+ //all dist sites L2L
+
+ if(L[i].cluster[j].cnt != L[i].cluster[j].lnkcnt) {
+ fprintf(stderr, "L linkcount wrong: %d <> %d (trans:%d)\n",
+ L[i].cluster[j].cnt, L[i].cluster[j].lnkcnt, L[i].cluster[j].trans);
+ }
+
+ temp = 0;
+
+ for(k=0; k < L[i].cluster[j].distsites; k++){
+ temp += L[i].cluster[j].distcnt[k];
+#if defined PARANOID && defined CHECKLINKS
+ if(L[i].cluster[j].distcnt[k] != L[i].cluster[j].linked[k]) {
+ fprintf(stderr,
+ "L wrong trans: (%d, %d, %d): %d:[%d]-%d <> %d (trans %d)\n",
+ i, j, k, L[i].cluster[j].distchr[k], L[i].cluster[j].distpos[k],
+ L[i].cluster[j].distcnt[k], L[i].cluster[j].linked[k],
+ L[i].cluster[j].disttrans[k]);
+ }
+#endif
+ }
+ assert(temp == L[i].cluster[j].cnt);
+ }
+ }
+
+#endif
+
+ return ;
+}
+
+
+/*------------------ bl_matchCompareLinkedClusterSequences -------------------
+ *
+ * @brief this function compares the sequences of linked clusters.
+ * The more different the sequences are, the higher the credibility of
+ * the links
+ *
+ * For every cluster of L that passes the maxdist filter a matrix emat of
+ * (noofdistclust+1)*noofdistclust doubles is allocated, zeroed and stored in
+ * the cluster. Using the MATRIX2D index arguments as written below:
+ * - (0, k) receives the accuracy of the alignment between the reference
+ * window around the cluster median and the window around the median of
+ * the k-th linked cluster,
+ * - (k+1, q) and (q+1, k) receive the accuracy of the alignment between the
+ * windows of the k-th and q-th linked cluster.
+ * accuracy is 1 - edist/minlen and is only stored if the shorter aligned
+ * length minlen is at least 20; otherwise the entry stays 0.
+ * Links of type 1 take the median from R, all other types from L; for the
+ * latter the window is passed through charIUPACcomplement before aligning.
+ * Clusters failing the filter get emat = NULL.
+ *
+ * @param space passed to the allocation and alignment helpers
+ * @param set reference sequences
+ * @param R per-chromosome cluster lists, only read for type 1 links
+ * @param L per-chromosome cluster lists whose clusters receive emat
+ * @param n number of chromosomes
+ * @param maxdist 0 disables the filter; otherwise only clusters with fewer
+ * than maxdist linked clusters are compared
+ *
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchCompareLinkedClusterSequences (void *space, fasta_t *set,
+ matchsplitsiteclusterlist_t *R,
+ matchsplitsiteclusterlist_t *L, Uint n, int maxdist)
+{
+ Uint i,j,k,q;
+ Uint lmrgn, rmrgn, lmrgn2, rmrgn2, chr, pos, distclust, distclust2,
+ distchr, distpos, distchr2,
+ distpos2, reflen, reflen2, reflen3, len, len2, noofdistclust;
+ Uint edist, minlen;
+ char *seq, *seq2, *seq3, disttrans, disttrans2, *rm=NULL, *rm2=NULL;
+ int *M, scores[] = {1,-1};
+ double accuracy, *emat;
+ Uint range=25; /* maximum number of bases taken on each side of a median */
+ Alignment al;
+
+
+ for(i=0; i < n; i++) {
+ for(j=0; j < L[i].noofclusters; j++) {
+
+ //fprintf(stderr, "chrom:%d cluster:%d\n", i, j);
+ pos = L[i].cluster[j].median;
+ chr = i;
+ noofdistclust = L[i].cluster[j].noofdistclust;
+ if (!maxdist || noofdistclust<maxdist) { /* clusters with maxdist or more links are skipped */
+ emat =
+ ALLOCMEMORY(space, NULL, double, ((noofdistclust+1)*(noofdistclust)));
+ memset(emat, 0, sizeof(double)*((noofdistclust+1)*(noofdistclust)));
+ //compare splits with left site
+ reflen = bl_fastaGetSequenceLength(set, chr);
+ lmrgn = (pos > range) ? range : pos; /* margins are clipped at the sequence boundaries */
+ rmrgn = (pos+range < reflen) ? range : reflen - pos;
+ seq = &bl_fastaGetSequence(set, chr)[pos-lmrgn];
+
+ for(k=0; k < L[i].cluster[j].noofdistclust; k++) {
+ distchr = L[i].cluster[j].distclustchr[k];
+ distclust = L[i].cluster[j].distclust[k];
+
+ if(L[i].cluster[j].distclusttype[k] == 1) {
+ distpos = R[distchr].cluster[distclust].median;
+ disttrans = 0;
+ } else {
+ distpos = L[distchr].cluster[distclust].median;
+ disttrans = 1;
+ }
+
+ reflen2 = bl_fastaGetSequenceLength(set, distchr);
+
+ lmrgn = (distpos > lmrgn) ? lmrgn : distpos; /* NOTE(review): lmrgn and rmrgn are
+ * overwritten here and never reset, so a margin clipped for one
+ * linked cluster stays clipped for all following k, while seq keeps
+ * the start computed from the initial lmrgn - confirm whether
+ * per-iteration margins were intended. */
+ rmrgn = (distpos+rmrgn < reflen2) ? rmrgn : reflen2 - distpos;
+ len = rmrgn + lmrgn;
+
+ seq2 =&bl_fastaGetSequence(set, distchr)[distpos-lmrgn];
+ if(disttrans) {
+ rm = charIUPACcomplement(space, seq2, len); /* result is released via rm at the end of this iteration */
+ seq2 = rm;
+ }
+
+ M = swmatrix(space, seq, len, seq2, len, -1, constscr, scores);
+ initAlignment(&al, seq, len, 0, seq2, len, 0);
+ swtraceback(space, M, seq, len, seq2, len, -1, constscr, scores, &al);
+ MATRIX2D(emat, noofdistclust, 0, k) = .0;
+
+ minlen = MIN(getUalignlen(&al),getValignlen(&al));
+ edist = getEdist(&al);
+ accuracy = 1.0 - (double)edist/(double)minlen; /* for minlen == 0 the quotient is not finite; it is only stored for minlen >= 20 */
+
+ if(minlen >= 20) {
+ MATRIX2D(emat, noofdistclust, 0, k) = accuracy;
+ }
+ /*
+ if(minlen >= 20 && accuracy >= 0.75) {
+ fprintf(stderr, "comparing %d w/ %d (%d)\n", pos, distpos, disttrans);
+ fprintf(stderr, "accuracy: %f, minlen:%d\n", accuracy, minlen);
+ showAlign(&al, stderr);
+ }
+ */
+ wrapAlignment(&al);
+ FREEMEMORY(space, M);
+
+ //compare the splits with each other
+ for(q=k+1; q < L[i].cluster[j].noofdistclust; q++) {
+
+ distchr2 = L[i].cluster[j].distclustchr[q];
+ distclust2 = L[i].cluster[j].distclust[q];
+
+ if(L[i].cluster[j].distclusttype[q] == 1) {
+ distpos2 = R[distchr2].cluster[distclust2].median;
+ disttrans2 = 0;
+ } else {
+ distpos2 = L[distchr2].cluster[distclust2].median;
+ disttrans2 = 1;
+ }
+
+ reflen3 = bl_fastaGetSequenceLength(set, distchr2);
+
+ lmrgn2 = (distpos > range) ? range : distpos; /* margins for q start from range again and are clipped for both sites */
+ lmrgn2 = (distpos2 > lmrgn2) ? lmrgn2 : distpos2;
+ rmrgn2 = (distpos+range < reflen2) ? range : reflen2 - distpos;
+ rmrgn2 = (distpos2+rmrgn2 < reflen3) ? rmrgn2 : reflen3 - distpos2;
+
+ seq3 = &bl_fastaGetSequence(set, distchr2)[distpos2-lmrgn2];
+ len2 = rmrgn2 + lmrgn2;
+
+ if(disttrans2) {
+ rm2 = charIUPACcomplement(space, seq3, len2);
+ seq3 = rm2;
+ }
+
+ M = swmatrix(space, seq2, len, seq3, len2, -1, constscr, scores);
+ initAlignment(&al, seq2, len, 0, seq3, len2, 0);
+ swtraceback(space, M, seq2, len, seq3, len2, -1, constscr, scores, &al);
+
+ MATRIX2D(emat, noofdistclust, k+1, q) = .0;
+ MATRIX2D(emat, noofdistclust, q+1, k) = .0;
+
+ minlen = MIN(getUalignlen(&al),getValignlen(&al));
+ edist = getEdist(&al);
+ accuracy = 1.0 - (double)edist/(double)minlen;
+
+ if(minlen >= 20) { /* the pairwise accuracy is stored at both (k+1, q) and (q+1, k) */
+ MATRIX2D(emat, noofdistclust, k+1, q) = accuracy;
+ MATRIX2D(emat, noofdistclust, q+1, k) = accuracy;
+ }
+ /*
+ if(minlen >= 20 && accuracy >= 0.75) {
+ fprintf(stderr, "comparing %d (%d) w/ %d (%d)\n", distpos, disttrans, distpos2, disttrans2);
+ fprintf(stderr, "accuracy: %f, minlen:%d\n", accuracy, minlen);
+ showAlign(&al, stderr);
+ }
+ */
+ wrapAlignment(&al);
+ FREEMEMORY(space, M);
+ if(disttrans2) {
+ FREEMEMORY(space, rm2);
+ }
+ }
+
+ if(disttrans) {
+ FREEMEMORY(space, rm);
+ }
+ }
+
+ L[i].cluster[j].emat = emat; /* the matrix is handed over to the cluster and not freed here */
+ }
+ else {
+ L[i].cluster[j].emat = NULL /* does this work? will it be freed? */;
+ }
+ }
+ }
+
+ return ;
+}
+
+/*-------------------- bl_matchShowMatchsplitsiteclusterlist -----------------
+ *
+ * @brief shows the lists: dumps every cluster of l to stderr, chromosome by
+ *        chromosome. For each cluster the interval, the distant loci, the
+ *        distant clusters, the adjoint loci and the adjoint clusters are
+ *        printed. The indices of adjoint clusters are resolved in r to
+ *        print the interval of the linked cluster.
+ * @param space not used in this function
+ * @param l     per-chromosome cluster lists to print
+ * @param r     per-chromosome cluster lists the adjoint links point into
+ * @param n     number of chromosomes
+ * @param type  not used in this function
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchShowMatchsplitsiteclusterlist (void *space,
+    matchsplitsiteclusterlist_t *l, matchsplitsiteclusterlist_t *r,
+    Uint n, char type)
+{
+  Uint chrom, idx, item, nclust;
+  matchsplitsitecluster_t *cur, *adj;
+
+  for(chrom = 0; chrom < n; chrom++) {
+    nclust = l[chrom].noofclusters;
+    fprintf(stderr, "chromosome %d with %d clusters\n",chrom, nclust);
+
+    for(idx = 0; idx < nclust; idx++) {
+      cur = &l[chrom].cluster[idx];
+
+      /* cluster header: index and interval (plus count with CHECKLINKS) */
+      fprintf(stderr,"cluster %d\t[%d,%d]", idx, cur->a, cur->b);
+#ifdef CHECKLINKS
+      fprintf(stderr,"-%d",cur->cnt);
+#endif
+      fprintf(stderr, "\n");
+
+      fprintf(stderr, "\tdistant loci:\n");
+      for(item = 0; item < cur->distsites; item++) {
+        fprintf(stderr, "\t\tto: %d:%d cnt:%d (trans:%d)",
+            cur->distchr[item], cur->distpos[item],
+            cur->distcnt[item], cur->disttrans[item]);
+        /* the realigned array is optional; the line is terminated either way */
+        if(!cur->realigned) {
+          fprintf(stderr, "\n");
+        } else {
+          fprintf(stderr, "realigned: %d\n",cur->realigned[item]);
+        }
+      }
+
+      fprintf(stderr, "\tdistant cluster:\n");
+      for(item = 0; item < cur->noofdistclust; item++) {
+        fprintf(stderr, "\t\tto: %d:%d cnt:%d (type:%d) realigned:%d\n",
+            cur->distclustchr[item], cur->distclust[item],
+            cur->distclustcnt[item], cur->distclusttype[item],
+            cur->distclustrealigned[item]);
+      }
+
+      fprintf(stderr, "\tadjoint loci:\n");
+      for(item = 0; item < cur->noofadjoints; item++) {
+        fprintf(stderr, "\t\tto: %d cnt:%d\n",
+            cur->adjoint[item], cur->adjointcnt[item]);
+      }
+
+      fprintf(stderr, "\tadjoint cluster:\n");
+      for(item = 0; item < cur->noofadjclust; item++) {
+        /* adjoint links are indices into the r list of the same chromosome */
+        adj = &r[chrom].cluster[cur->adjclust[item]];
+        fprintf(stderr, "\t\tto: %d cnt:%d [%d,%d]\n",
+            cur->adjclust[item], cur->adjclustweights[item],
+            adj->a, adj->b);
+      }
+      fprintf(stderr, "\n");
+    }
+  }
+  return ;
+}
+
+
+
+/*----------------- bl_matchGetMatchsplitsiteclusterlistBED ------------------
+ *
+ * @brief get a bed from matchsplitsitecluster
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchGetMatchsplitsiteclusterlistBED (void *space, fasta_t *set, FILE *normdev, FILE *transdev,
+ matchsplitsiteclusterlist_t *l, matchsplitsiteclusterlist_t *r,
+ Uint n, int maxdist)
+{
+
+ Uint i, k, j, u, q, v, distclust, distchr, noofdistclust, noofdistclust2;
+ char flag;
+ double *emat, max, max2;
+ Lint distance;
+
+ //iter chromosomes
+ for(i=0; i < n; i++) {
+ k = l[i].noofclusters;
+ //iter clusters
+ for(j=0; j < k; j++) {
+ //iter dist clusters
+
+ noofdistclust = l[i].cluster[j].noofdistclust;
+ // no fancy stuff if cluster is promiscuitive
+ if (!maxdist || noofdistclust<maxdist) {
+ for(u=0; u < noofdistclust; u++) {
+
+ distchr = l[i].cluster[j].distclustchr[u];
+ distclust = l[i].cluster[j].distclust[u];
+ emat = l[i].cluster[j].emat;
+
+ max = MATRIX2D(emat, noofdistclust, 0, u);
+
+ for(q=0; q < noofdistclust; q++) {
+ max = MAX(max, MATRIX2D(emat, noofdistclust, q+1, u));
+ }
+
+ flag = 'P';
+ if(max >= 0.75) flag = 'F';
+
+ distance = (l[i].cluster[j].distclusttype[u] == 1) ?
+ labs((Lint)l[i].cluster[j].median - r[distchr].cluster[distclust].median) :
+ labs((Lint)l[i].cluster[j].median - l[distchr].cluster[distclust].median) ;
+
+
+ if( l[i].cluster[j].distclusttype[u] == 1 &&
+ distchr == i &&
+ distance <= 200000) {
+
+ emat = r[distchr].cluster[distclust].emat;
+
+ noofdistclust2 = r[distchr].cluster[distclust].noofdistclust;
+ if (!maxdist || noofdistclust2<maxdist) { //second cluster is promiscuitive
+ for(v=0; v < noofdistclust2; v++) {
+ if(r[distchr].cluster[distclust].distclustchr[v] == i &&
+ r[distchr].cluster[distclust].distclust[v] == j)
+ break;
+ }
+ assert(v < noofdistclust2);
+ max2 = MATRIX2D(emat, noofdistclust2, 0, v);
+ for(q=0; q < noofdistclust2; q++) {
+ max2 = MAX(max2, MATRIX2D(emat, noofdistclust2, q+1, v));
+ }
+ flag ='P';
+ if(max >= 0.75 || max2 >= 0.75) flag = 'F';
+ }
+ else {
+ flag = 'M';//second cluster is promiscuitive
+ }
+
+ if(l[i].cluster[j].median<r[distchr].cluster[distclust].median) {
+ /*here: include information about circulars*/
+ fprintf(normdev, "%s\t%d\t%d\tsplits:%d:%d:%d:C:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, i), l[i].cluster[j].median,
+ r[distchr].cluster[distclust].median,
+ l[i].cluster[j].distclustcnt[u], l[i].cluster[j].cnt,
+ r[distchr].cluster[distclust].cnt, flag, l[i].cluster[j].distclustrealigned[u]);
+ }
+ else {
+ fprintf(normdev, "%s\t%d\t%d\tsplits:%d:%d:%d:N:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, i), r[distchr].cluster[distclust].median,
+ l[i].cluster[j].median,
+ l[i].cluster[j].distclustcnt[u], r[distchr].cluster[distclust].cnt,
+ l[i].cluster[j].cnt, flag, l[i].cluster[j].distclustrealigned[u]);
+ }
+
+ } else {
+
+ if(distance > 200000 || i != l[i].cluster[j].distclustchr[u]) {
+ if(l[i].cluster[j].distclusttype[u] > 1){
+
+ if(j <= distclust) {
+ emat = l[distchr].cluster[distclust].emat;
+ noofdistclust2 = l[distchr].cluster[distclust].noofdistclust;
+ if (!maxdist || noofdistclust2<maxdist) { //second cluster is promiscuitive
+ for(v=0; v < noofdistclust2; v++) {
+ if(l[distchr].cluster[distclust].distclustchr[v] == i &&
+ l[distchr].cluster[distclust].distclust[v] == j)
+ break;
+ }
+ assert(v < noofdistclust2);
+ max2 = MATRIX2D(emat, noofdistclust2, 0, v);
+ for(q=0; q < noofdistclust2; q++) {
+ max2 = MAX(max2, MATRIX2D(emat, noofdistclust2, q+1, v));
+ }
+
+ flag ='P';
+ if(max >= 0.75 || max2 >= 0.75) flag = 'F';
+ }
+ else {
+ flag = 'M'; //second cluster is promiscuitive
+ }
+
+ //L2L
+ fprintf(transdev, "%s\t%d\t%d\tdiststrandsplice:%s:%d:%d:%d:%d:L:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, i), l[i].cluster[j].median,
+ l[i].cluster[j].median,
+ bl_fastaGetDescription(set, distchr),
+ l[distchr].cluster[distclust].median,
+ l[i].cluster[j].distclustcnt[u],
+ l[i].cluster[j].cnt,
+ l[distchr].cluster[distclust].cnt, flag,
+ l[i].cluster[j].distclustrealigned[u]);
+
+ fprintf(transdev, "%s\t%d\t%d\tdiststrandsplice:%s:%d:%d:%d:%d:L:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, l[i].cluster[j].distclustchr[u]),
+ l[distchr].cluster[distclust].median,
+ l[distchr].cluster[distclust].median,
+
+ bl_fastaGetDescription(set, i), l[i].cluster[j].median,
+ l[i].cluster[j].distclustcnt[u],
+ l[distchr].cluster[distclust].cnt,
+ l[i].cluster[j].cnt, flag,
+ l[i].cluster[j].distclustrealigned[u]);
+
+ }
+ } else {
+
+
+ emat = r[distchr].cluster[distclust].emat;
+
+ noofdistclust2 = r[distchr].cluster[distclust].noofdistclust;
+ if (!maxdist || noofdistclust2<maxdist) {//second cluster is promiscuitive
+ for(v=0; v < noofdistclust2; v++) {
+ if(r[distchr].cluster[distclust].distclustchr[v] == i &&
+ r[distchr].cluster[distclust].distclust[v] == j)
+ break;
+ }
+ assert(v < noofdistclust2);
+ max2 = MATRIX2D(emat, noofdistclust2, 0, v);
+ for(q=0; q < noofdistclust2; q++) {
+ max2 = MAX(max2, MATRIX2D(emat, noofdistclust2, q+1, v));
+ }
+
+ flag ='P';
+ if(max >= 0.75 || max2 >= 0.75) flag = 'F';
+ }
+ else {
+ flag = 'M'; //second cluster is promiscuitive
+ }
+ /* here if i want to put out promiscuitives*/
+ //L2R
+ fprintf(transdev, "%s\t%d\t%d\tdistsplice:%s:%d:%d:%d:%d:L:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, i), l[i].cluster[j].median,
+ l[i].cluster[j].median,
+ bl_fastaGetDescription(set, l[i].cluster[j].distclustchr[u]),
+ r[distchr].cluster[distclust].median,
+
+ l[i].cluster[j].distclustcnt[u],
+ l[i].cluster[j].cnt,
+ r[distchr].cluster[distclust].cnt,
+ flag,
+ l[i].cluster[j].distclustrealigned[u]);
+
+ fprintf(transdev, "%s\t%d\t%d\tdistsplice:%s:%d:%d:%d:%d:R:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, l[i].cluster[j].distclustchr[u]),
+ r[distchr].cluster[distclust].median,
+ r[distchr].cluster[distclust].median,
+ bl_fastaGetDescription(set, i), l[i].cluster[j].median,
+
+ l[i].cluster[j].distclustcnt[u],
+ r[distchr].cluster[distclust].cnt,
+ l[i].cluster[j].cnt,
+ flag,
+ l[i].cluster[j].distclustrealigned[u]);
+
+ }
+ } else {
+ if(j <= distclust) {
+
+ emat = l[distchr].cluster[distclust].emat;
+
+ noofdistclust2 = l[distchr].cluster[distclust].noofdistclust;
+ if (!maxdist || noofdistclust2<maxdist) {//second cluster is promiscuitive
+ for(v=0; v < noofdistclust2; v++) {
+ if(l[distchr].cluster[distclust].distclustchr[v] == i &&
+ l[distchr].cluster[distclust].distclust[v] == j)
+ break;
+ }
+ assert(v < noofdistclust2);
+ max2 = MATRIX2D(emat, noofdistclust2, 0, v);
+ for(q=0; q < noofdistclust2; q++) {
+ max2 = MAX(max2, MATRIX2D(emat, noofdistclust2, q+1, v));
+ }
+
+ flag ='P';
+ if(max >= 0.75 || max2 >= 0.75) flag = 'F';
+ }
+ else {
+ flag = 'M'; //second cluster is promiscuitive
+ }
+ /* here if i want to put out promiscuitives*/
+
+ //L2L
+ fprintf(transdev, "%s\t%d\t%d\tstrandsplice:%d:%d:%d:L:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, i), l[i].cluster[j].median,
+ l[distchr].cluster[distclust].median,
+
+ l[i].cluster[j].distclustcnt[u],
+ l[i].cluster[j].cnt,
+ l[distchr].cluster[distclust].cnt,
+ flag,
+ l[i].cluster[j].distclustrealigned[u]);
+ }
+ }
+ }
+ }
+ }
+ else {/*put out all multiple splice sites also*/
+ for(u=0; u < noofdistclust; u++) {
+
+ distchr = l[i].cluster[j].distclustchr[u];
+ distclust = l[i].cluster[j].distclust[u];
+ // emat = l[i].cluster[j].emat;
+
+ //max = MATRIX2D(emat, noofdistclust, 0, u);
+
+ //for(q=0; q < noofdistclust; q++) {
+ //max = MAX(max, MATRIX2D(emat, noofdistclust, q+1, u));
+ //}
+
+ flag = 'M';
+ //if(max >= 0.75) flag = 'F';
+
+ distance = (l[i].cluster[j].distclusttype[u] == 1) ?
+ labs((Lint)l[i].cluster[j].median - r[distchr].cluster[distclust].median) :
+ labs((Lint)l[i].cluster[j].median - l[distchr].cluster[distclust].median) ;
+
+
+ if( l[i].cluster[j].distclusttype[u] == 1 &&
+ distchr == i &&
+ distance <= 200000) {
+
+ // emat = r[distchr].cluster[distclust].emat;
+
+ /* noofdistclust2 = r[distchr].cluster[distclust].noofdistclust;
+ if (noofdistclust2<maxdist) {
+ for(v=0; v < noofdistclust2; v++) {
+ if(r[distchr].cluster[distclust].distclustchr[v] == i &&
+ r[distchr].cluster[distclust].distclust[v] == j)
+ break;
+ }
+ assert(v < noofdistclust2);
+ max2 = MATRIX2D(emat, noofdistclust2, 0, v);
+ for(q=0; q < noofdistclust2; q++) {
+ max2 = MAX(max2, MATRIX2D(emat, noofdistclust2, q+1, v));
+ }
+
+ flag ='P';
+ if(max >= 0.75 || max2 >= 0.75) flag = 'F';
+ }
+ else {
+ flag = 'M';
+ }*/
+
+ if(l[i].cluster[j].median<r[distchr].cluster[distclust].median) {
+ fprintf(normdev, "%s\t%d\t%d\tsplits:%d:%d:%d:L:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, i), l[i].cluster[j].median,
+ r[distchr].cluster[distclust].median,
+ l[i].cluster[j].distclustcnt[u], l[i].cluster[j].cnt,
+ r[distchr].cluster[distclust].cnt, flag, l[i].cluster[j].distclustrealigned[u]);
+ }
+ else {
+ fprintf(normdev, "%s\t%d\t%d\tsplits:%d:%d:%d:R:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, i), r[distchr].cluster[distclust].median,
+ l[i].cluster[j].median,
+ l[i].cluster[j].distclustcnt[u], r[distchr].cluster[distclust].cnt,
+ l[i].cluster[j].cnt, flag, l[i].cluster[j].distclustrealigned[u]);
+ }
+
+ } else {
+
+ if(distance > 200000 || i != l[i].cluster[j].distclustchr[u]) {
+ if(l[i].cluster[j].distclusttype[u] > 1){
+
+ if(j <= distclust) {
+ /*
+ emat = l[distchr].cluster[distclust].emat;
+ noofdistclust2 = l[distchr].cluster[distclust].noofdistclust;
+ if (noofdistclust2<maxdist) {
+ for(v=0; v < noofdistclust2; v++) {
+ if(l[distchr].cluster[distclust].distclustchr[v] == i &&
+ l[distchr].cluster[distclust].distclust[v] == j)
+ break;
+ }
+ assert(v < noofdistclust2);
+ max2 = MATRIX2D(emat, noofdistclust2, 0, v);
+ for(q=0; q < noofdistclust2; q++) {
+ max2 = MAX(max2, MATRIX2D(emat, noofdistclust2, q+1, v));
+ }
+
+ flag ='P';
+ if(max >= 0.75 || max2 >= 0.75) flag = 'F';
+ }
+ else {
+ flag = 'M';
+ }*/
+ /* here if i want to put out promiscuitives*/
+ //L2L
+ fprintf(transdev, "%s\t%d\t%d\tdiststrandsplice:%s:%d:%d:%d:%d:L:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, i), l[i].cluster[j].median,
+ l[i].cluster[j].median,
+ bl_fastaGetDescription(set, distchr),
+ l[distchr].cluster[distclust].median,
+ l[i].cluster[j].distclustcnt[u],
+ l[i].cluster[j].cnt,
+ l[distchr].cluster[distclust].cnt, flag,
+ l[i].cluster[j].distclustrealigned[u]);
+
+ fprintf(transdev, "%s\t%d\t%d\tdiststrandsplice:%s:%d:%d:%d:%d:L:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, l[i].cluster[j].distclustchr[u]),
+ l[distchr].cluster[distclust].median,
+ l[distchr].cluster[distclust].median,
+
+ bl_fastaGetDescription(set, i), l[i].cluster[j].median,
+ l[i].cluster[j].distclustcnt[u],
+ l[distchr].cluster[distclust].cnt,
+ l[i].cluster[j].cnt, flag,
+ l[i].cluster[j].distclustrealigned[u]);
+ }
+ } else {
+
+
+ /* emat = r[distchr].cluster[distclust].emat;
+
+ noofdistclust2 = r[distchr].cluster[distclust].noofdistclust;
+ if (noofdistclust2<maxdist) {
+ for(v=0; v < noofdistclust2; v++) {
+ if(r[distchr].cluster[distclust].distclustchr[v] == i &&
+ r[distchr].cluster[distclust].distclust[v] == j)
+ break;
+ }
+ assert(v < noofdistclust2);
+ max2 = MATRIX2D(emat, noofdistclust2, 0, v);
+ for(q=0; q < noofdistclust2; q++) {
+ max2 = MAX(max2, MATRIX2D(emat, noofdistclust2, q+1, v));
+ }
+
+ flag ='P';
+ if(max >= 0.75 || max2 >= 0.75) flag = 'F';
+ }
+ else {
+ flag = 'M';
+ }*/
+ /* here if i want to put out promiscuitives*/
+ //L2R
+ fprintf(transdev, "%s\t%d\t%d\tdistsplice:%s:%d:%d:%d:%d:L:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, i), l[i].cluster[j].median,
+ l[i].cluster[j].median,
+ bl_fastaGetDescription(set, l[i].cluster[j].distclustchr[u]),
+ r[distchr].cluster[distclust].median,
+
+ l[i].cluster[j].distclustcnt[u],
+ l[i].cluster[j].cnt,
+ r[distchr].cluster[distclust].cnt,
+ flag,
+ l[i].cluster[j].distclustrealigned[u]);
+
+ fprintf(transdev, "%s\t%d\t%d\tdistsplice:%s:%d:%d:%d:%d:R:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, l[i].cluster[j].distclustchr[u]),
+ r[distchr].cluster[distclust].median,
+ r[distchr].cluster[distclust].median,
+ bl_fastaGetDescription(set, i), l[i].cluster[j].median,
+
+ l[i].cluster[j].distclustcnt[u],
+ r[distchr].cluster[distclust].cnt,
+ l[i].cluster[j].cnt,
+ flag,
+ l[i].cluster[j].distclustrealigned[u]);
+
+ }
+ } else {
+ if(j <= distclust) {
+ /*
+ emat = l[distchr].cluster[distclust].emat;
+
+ noofdistclust2 = l[distchr].cluster[distclust].noofdistclust;
+ if (noofdistclust2<maxdist) {
+ for(v=0; v < noofdistclust2; v++) {
+ if(l[distchr].cluster[distclust].distclustchr[v] == i &&
+ l[distchr].cluster[distclust].distclust[v] == j)
+ break;
+ }
+ assert(v < noofdistclust2);
+ max2 = MATRIX2D(emat, noofdistclust2, 0, v);
+ for(q=0; q < noofdistclust2; q++) {
+ max2 = MAX(max2, MATRIX2D(emat, noofdistclust2, q+1, v));
+ }
+
+ flag ='P';
+ if(max >= 0.75 || max2 >= 0.75) flag = 'F';
+ }
+ else {
+ flag = 'M';
+ }*/
+ /* here if i want to put out promiscuitives*/
+
+ //L2L
+ fprintf(transdev, "%s\t%d\t%d\tstrandsplice:%d:%d:%d:L:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, i), l[i].cluster[j].median,
+ l[distchr].cluster[distclust].median,
+
+ l[i].cluster[j].distclustcnt[u],
+ l[i].cluster[j].cnt,
+ l[distchr].cluster[distclust].cnt,
+ flag,
+ l[i].cluster[j].distclustrealigned[u]);
+
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+
+
+ for(i=0; i < n; i++) {
+ k = r[i].noofclusters;
+ //iter clusters
+ for(j=0; j < k; j++) {
+ //iter dist clusters
+ noofdistclust = r[i].cluster[j].noofdistclust;
+ if (!maxdist || noofdistclust<maxdist) {
+ for(u=0; u < noofdistclust; u++) {
+ if (r[i].cluster[j].distclusttype[u] == 2) {
+
+ distchr = r[i].cluster[j].distclustchr[u];
+ distclust = r[i].cluster[j].distclust[u];
+ emat = r[i].cluster[j].emat;
+
+ max = MATRIX2D(emat, noofdistclust, 0, u);
+ for(q=0; q < noofdistclust; q++) {
+ max = MAX(max, MATRIX2D(emat, noofdistclust, q+1, u));
+ }
+
+ emat = r[distchr].cluster[distclust].emat;
+
+ noofdistclust2 = r[distchr].cluster[distclust].noofdistclust;
+ if (!maxdist || noofdistclust2<maxdist) {
+ for(v=0; v < noofdistclust2; v++) {
+ if(r[distchr].cluster[distclust].distclustchr[v] == i &&
+ r[distchr].cluster[distclust].distclust[v] == j)
+ break;
+ }
+ assert(v < noofdistclust2);
+
+ max2 = MATRIX2D(emat, noofdistclust2, 0, v);
+ for(q=0; q < noofdistclust2; q++) {
+ max2 = MAX(max2, MATRIX2D(emat, noofdistclust2, q+1, v));
+ }
+
+ flag ='P';
+ if(max >= 0.75 || max2 >= 0.75) flag = 'F';
+ }
+ else {
+ flag = 'M';
+ }
+ /*here if i want to put out promiscuitives*/
+
+ distance =
+ labs((Lint)r[i].cluster[j].median - r[distchr].cluster[distclust].median);
+
+
+ if(distance <= 200000 && i == distchr) {
+
+ if(j <= distclust) {
+ //R2R
+ fprintf(transdev, "%s\t%d\t%d\tstrandsplice:%d:%d:%d:R:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, i), r[i].cluster[j].median,
+ r[i].cluster[distclust].median,
+ r[i].cluster[j].distclustcnt[u], r[i].cluster[j].cnt,
+ r[i].cluster[distclust].cnt,
+ flag,
+ r[i].cluster[j].distclustrealigned[u]);
+ }
+ } else {
+
+ if(j <= distclust) {
+ fprintf(transdev, "%s\t%d\t%d\tdiststrandsplice:%s:%d:%d:%d:%d:R:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, i), r[i].cluster[j].median,
+ r[i].cluster[j].median,
+
+ bl_fastaGetDescription(set, distchr),
+ r[distchr].cluster[distclust].median,
+ r[i].cluster[j].distclustcnt[u],
+ r[i].cluster[j].cnt,
+ r[distchr].cluster[distclust].cnt,
+ flag,
+ r[i].cluster[j].distclustrealigned[u]);
+
+ fprintf(transdev, "%s\t%d\t%d\tdiststrandsplice:%s:%d:%d:%d:%d:R:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, r[i].cluster[j].distclustchr[u]),
+ r[distchr].cluster[distclust].median,
+ r[distchr].cluster[distclust].median,
+
+ bl_fastaGetDescription(set, i),
+ r[i].cluster[j].median,
+ r[i].cluster[j].distclustcnt[u],
+ r[distchr].cluster[distclust].cnt,
+ r[i].cluster[j].cnt,
+ flag,
+ r[i].cluster[j].distclustrealigned[u]);
+ }
+ }
+ }
+
+ }
+ }
+
+ else {
+ flag = 'M';
+ for(u=0; u < noofdistclust; u++) {
+ if (r[i].cluster[j].distclusttype[u] == 2) {
+
+ distchr = r[i].cluster[j].distclustchr[u];
+ distclust = r[i].cluster[j].distclust[u];
+ distance =
+ labs((Lint)r[i].cluster[j].median - r[distchr].cluster[distclust].median);
+
+
+ if(distance <= 200000 && i == distchr) {
+
+ if(j <= distclust) {
+ //R2R
+ fprintf(transdev, "%s\t%d\t%d\tstrandsplice:%d:%d:%d:R:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, i), r[i].cluster[j].median,
+ r[i].cluster[distclust].median,
+ r[i].cluster[j].distclustcnt[u], r[i].cluster[j].cnt,
+ r[i].cluster[distclust].cnt,
+ flag,
+ r[i].cluster[j].distclustrealigned[u]);
+ }
+ } else {
+
+ if(j <= distclust) {
+ fprintf(transdev, "%s\t%d\t%d\tdiststrandsplice:%s:%d:%d:%d:%d:R:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, i), r[i].cluster[j].median,
+ r[i].cluster[j].median,
+
+ bl_fastaGetDescription(set, distchr),
+ r[distchr].cluster[distclust].median,
+ r[i].cluster[j].distclustcnt[u],
+ r[i].cluster[j].cnt,
+ r[distchr].cluster[distclust].cnt,
+ flag,
+ r[i].cluster[j].distclustrealigned[u]);
+
+ fprintf(transdev, "%s\t%d\t%d\tdiststrandsplice:%s:%d:%d:%d:%d:R:%c\t%d\t+\n",
+ bl_fastaGetDescription(set, r[i].cluster[j].distclustchr[u]),
+ r[distchr].cluster[distclust].median,
+ r[distchr].cluster[distclust].median,
+
+ bl_fastaGetDescription(set, i),
+ r[i].cluster[j].median,
+ r[i].cluster[j].distclustcnt[u],
+ r[distchr].cluster[distclust].cnt,
+ r[i].cluster[j].cnt,
+ flag,
+ r[i].cluster[j].distclustrealigned[u]);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return ;
+}
+
+/*---------------- bl_matchDestructMatchsplitsiteclusterlist -----------------
+ *
+ * @brief release all per-cluster arrays of the n list entries and then each
+ *        cluster array itself; the array l is not freed here
+ * @author Steve Hoffmann
+ */
+
+ void
+bl_matchDestructMatchsplitsiteclusterlist (void *space,
+ matchsplitsiteclusterlist_t *l, Uint n)
+{
+ Uint chr, idx;
+ matchsplitsitecluster_t *c;
+ for(chr=0; chr < n; chr++) {
+ for(idx=0; idx < l[chr].noofclusters; idx++) {
+ c = &l[chr].cluster[idx];
+ FREEMEMORY(space, c->distpos);
+ FREEMEMORY(space, c->distchr);
+ FREEMEMORY(space, c->disttrans);
+#if defined PARANOID && defined CHECKLINKS
+ FREEMEMORY(space, c->rpos);
+ FREEMEMORY(space, c->linked);
+#endif
+ FREEMEMORY(space, c->distcnt);
+ if(c->adjoint != NULL) {
+ FREEMEMORY(space, c->adjoint);
+ FREEMEMORY(space, c->adjointcnt);
+ }
+ if(c->realigned != NULL) {
+ FREEMEMORY(space, c->realigned);
+ }
+ if(c->emat != NULL) {
+ FREEMEMORY(space, c->emat);
+ }
+ if(c->adjclust != NULL) {
+ FREEMEMORY(space, c->adjclust);
+ FREEMEMORY(space, c->adjclustweights);
+ }
+ //the distclust* arrays are released together, keyed on distclust
+ if(c->distclust != NULL) {
+ FREEMEMORY(space, c->distclust);
+ FREEMEMORY(space, c->distclustchr);
+ FREEMEMORY(space, c->distclustcnt);
+ FREEMEMORY(space, c->distclusttype);
+ FREEMEMORY(space, c->distclustrealigned);
+ }
+ }
+
+ l[chr].noofclusters = 0;
+ FREEMEMORY(space, l[chr].cluster);
+ l[chr].cluster = NULL;
+ }
+ return ;
+}
+
+#ifdef REALIGNTEST
+
+unsigned char mute = 0;
+char *ntcode;
+/* test driver: parse options, read the -d sequences, scan and realign the -q match files, write the BED files; returns 1 if an output file cannot be opened */
+int main(int argc, char **argv) {
+ realign_t nfo;
+
+ manopt_optionset optset;
+ manopt_arg *unflagged;
+ manopt_arg *queries;
+ manopt_arg *dbfilenames;
+ manopt_intconstraint threadconstraint;
+
+ Uint realigned=0, desclen;
+ char norealign=0;
+ matchsplitsiteclusterlist_t *L = NULL, *R = NULL;
+ matchfileindex_t *index;
+ matchfile_t **files = NULL;
+ unsigned char gzip = 0;
+ char version[]="0.1", expand=0, *desc;
+ int i;
+ char verbose = 0;
+ Uint nchr;
+ Uint prefixlen=0;
+
+ threadconstraint.max = 3000;
+ threadconstraint.min = 1;
+ ra_setdefault(&nfo);
+ nfo.splitfile="splicesites.bed";
+ nfo.transfile="transrealigned.bed";
+ nfo.maxdist=100;
+ initIUPAC(1,1);
+ manopt_initoptionset(&optset, argv[0], NULL,
+ "Heuristic mapping of short sequences\n",
+ "SEGEMEHL is free software for non-commercial use \n (C) 2008 Bioinformatik Leipzig\n",
+ version,
+ "Please report bugs to steve at bioinf.uni-leipzig.de");
+ manopt(&optset, LISTOPT, 1, 'd', "database",
+ "list of path/filename(s) of database sequence(s)", "<file> [<file> ...]",
+ NULL, NULL);
+ manopt(&optset, LISTOPT, 1, 'q', "query",
+ "path/filename of alignment file", "<file> [<file> ...]", NULL, NULL);
+ manopt(&optset, FLAG, 0, 'E', "expand",
+ "expand", NULL, NULL, &expand);
+ manopt(&optset, FLAG, 0, 'v', "verbose",
+ "verbose", NULL, NULL, &verbose);
+ manopt(&optset, FLAG, 0, 'n', "norealign",
+ "do not realign", NULL, NULL, &norealign);
+ manopt(&optset, REQUINTOPT, 0, 't', "threads",
+ "start <n> threads for realigning", "<n>",
+ &threadconstraint, &nfo.threadno);
+ manopt(&optset, REQSTRINGOPT, 0, 'U', "splitfile",
+ "path/filename of the split bedfile", "<file>", &nfo.splitfile, &nfo.splitfile);
+ manopt(&optset, REQSTRINGOPT, 0, 'T', "transfile", "path/filename of bed files containing trans-split", "<file>",&nfo.transfile , &nfo.transfile);
+ manopt(&optset, REQSTRINGOPT, 0, 'o', "outfile", "path/filename of output sam file", "<file>",NULL , &nfo.outfile);
+ manopt(&optset, REQUINTOPT, 0, 'M', "maxdist",
+ "max number of distant sites to consider, 0 to disable", "<n>", NULL, &nfo.maxdist);
+
+ unflagged = manopt_getopts(&optset, argc, argv);
+ if(unflagged->noofvalues > 1) {
+ manopt_help(&optset, "unknown argument(s)\n");
+ }
+ //open the output files; a failed fopen must not reach the fprintf calls later on
+ nfo.transdev=fopen(nfo.transfile,"w");
+ nfo.normdev=fopen(nfo.splitfile,"w");
+ if(nfo.transdev == NULL || nfo.normdev == NULL) {
+ fprintf(stderr, "cannot open '%s' or '%s' for writing.\n", nfo.transfile, nfo.splitfile);
+ return 1;
+ }
+ if(nfo.outfile != NULL && (nfo.realigndev = fopen(nfo.outfile,"w")) == NULL) {
+ fprintf(stderr, "cannot open '%s' for writing.\n", nfo.outfile);
+ return 1;
+ }
+ if(norealign) {
+ if(nfo.outfile != NULL) { fclose(nfo.realigndev); } //opened just above and not needed
+ nfo.realigndev = NULL;
+ }
+
+ pthread_mutex_init(&inout, NULL);
+ pthread_mutex_init(&mutkuh, NULL);
+ MSG("reading database sequences.\n");
+
+ dbfilenames = manopt_getarg(&optset, 'd', "database");
+ nfo.fasta = bl_fastxGetSet(nfo.space, dbfilenames->values,
+ dbfilenames->noofvalues, 1, 0, 0, 1);
+ for(i=0; i < nfo.fasta->noofseqs; i++) {
+ desclen = bl_fastaGetDescriptionLength(nfo.fasta, i);
+ desc = strclip(nfo.space, bl_fastaGetDescription(nfo.fasta, i), &desclen);
+ FREEMEMORY(nfo.space, nfo.fasta->seqs[i]->description);
+ nfo.fasta->seqs[i]->description = desc;
+ nfo.fasta->seqs[i]->descrlen = desclen;
+ }
+
+ NFO("%d database sequences found.\n", nfo.fasta->noofseqs);
+ MSG("reading query files.\n");
+
+ queries = manopt_getarg(&optset, 'q', "query");
+ if(queries->noofvalues > 30) {
+ manopt_help(&optset, "currently no more than 30 query files allowed\n");
+ }
+
+ ntcode = getNTcodekey(nfo.space);
+ files = ALLOCMEMORY(nfo.space, NULL, matchfile_t*, queries->noofvalues);
+
+ //using index structure only to carry the chr idx
+ index = bl_matchfileInitIndex(nfo.space);
+
+ nchr = nfo.fasta->noofseqs;
+ for(i=0; i < nchr; i++) {
+ bl_matchfileIndexAddChrom(index, bl_fastaGetDescription(nfo.fasta, i));
+ }
+
+ for(i=0; i < queries->noofvalues; i++) {
+
+ files[i] = ALLOCMEMORY(nfo.space, NULL, matchfile_t, 1);
+ files[i]->fmt = 0;
+ files[i]->index = index;
+ files[i]->filename = queries->values[i];
+
+ prefixlen = bl_fileprefixlen(files[i]->filename);
+
+ gzip = 0; //both tests compare three characters only, i.e. a suffix starting with ".gz"
+ if(strncmp(&files[i]->filename[prefixlen], ".gz", 3) == 0 ||
+ strncmp(&files[i]->filename[prefixlen], ".gzip", 3) == 0) {
+ gzip = 1;
+ }
+
+ files[i]->gzip = gzip;
+ }
+
+ L = ALLOCMEMORY(nfo.space, NULL, matchsplitsiteclusterlist_t,nchr);
+ memset(L, 0, sizeof(matchsplitsiteclusterlist_t)*nchr);
+ R = ALLOCMEMORY(nfo.space, NULL, matchsplitsiteclusterlist_t, nchr);
+ memset(R, 0, sizeof(matchsplitsiteclusterlist_t)*nchr);
+
+ for(i=0; i < queries->noofvalues; i++) {
+ realigned +=
+ bl_matchfileRealignScanFileNew(nfo.space, files[i], nfo.realigndev, nfo.fasta, 255, &L, &R, &nchr,nfo.threadno, nfo.maxdist);
+ }
+ /* printf("realigned:%d\n",realigned);*/
+ bl_matchLinkAdjoinedCluster(nfo.space, L, R, nchr);
+ bl_matchLinkDistCluster (nfo.space, R, L, nchr);
+ bl_matchCompareLinkedClusterSequences (nfo.space, nfo.fasta, R, L, nchr, nfo.maxdist);
+ bl_matchCompareLinkedClusterSequences (nfo.space, nfo.fasta, L, R, nchr, nfo.maxdist);
+ bl_matchGetMatchsplitsiteclusterlistBED (nfo.space, nfo.fasta, nfo.normdev, nfo.transdev, L, R, nchr, nfo.maxdist);
+
+ if(verbose) {
+ fprintf(stderr, "LEFT -----------------\n");
+ bl_matchShowMatchsplitsiteclusterlist(nfo.space, L, R, nchr, 0);
+
+ fprintf(stderr, "RIGHT -----------------\n");
+ bl_matchShowMatchsplitsiteclusterlist(nfo.space, R, L, nchr, 1);
+ fprintf(stderr, "realigned:%d\n",realigned);
+ }
+
+ bl_matchDestructMatchsplitsiteclusterlist(nfo.space, L, nchr);
+ bl_matchDestructMatchsplitsiteclusterlist(nfo.space, R, nchr);
+ FREEMEMORY(nfo.space, L);
+ FREEMEMORY(nfo.space, R);
+
+ bl_fastaDestruct(nfo.space, nfo.fasta);
+ FREEMEMORY(nfo.space, nfo.fasta);
+ bl_matchfileDestructIndex(nfo.space, index);
+ FREEMEMORY(nfo.space, index);
+
+ if(files) {
+ for(i=0; i < queries->noofvalues; i++) {
+ FREEMEMORY(nfo.space, files[i]);
+ }
+ FREEMEMORY(nfo.space, files);
+ }
+
+ manopt_destructoptionset(&optset);
+ manopt_destructarg(unflagged);
+ FREEMEMORY(nfo.space, unflagged);
+
+ FREEMEMORY(nfo.space, ntcode);
+}
+
+#endif
+/* threadrealign: score the split alignment of one read against the distant sites begin..stop of cluster T.
+ * Per site k: aligns[k] stays NULL and e2[k] stays 0 when the site is skipped; otherwise e2[k] is the accepted score or 0. Returns NULL. */
+void *threadrealign(int begin, int stop, matchsplitsitecluster_t *T, Uint ovhglen, fasta_t *set, Alignment*** aligns,Uint chromidx,Uint seqlen, char left, matchfileRec_t r,void *space, int *e2, char ***refseqs, Uint **reflens, Uint **refstrand, Uint locreflen, Uint median, Uint interval, Uint start, Uint end, char *fwd, char *rev, Uint sinterval, int oven) {
+ int k, p, q, scores[] = {1,-1};
+ Uint distpos, distchr, distreflen, l; /* **reflens, **refstrand,*/
+ int *M, **lmr, **lmv, **lmc, tempe, check;
+ char disttrans;/* ***refseqs,*/
+ int overhang, siteovh;
+ int startleft, sitestart;
+ int check2; /*reinsert deletion penalties?*/
+ overhang=2+ovhglen+ovhglen*.4;
+ /* overhang and startleft are the unclamped window margins; every site works on
+ its own copies (siteovh, sitestart) so that a clamp against a small distpos
+ does not shrink the windows of the sites that follow */
+ startleft=(sinterval<overhang)? sinterval:overhang;
+ /* startleft=overhang;*/
+ for(k=begin; k <= stop; k++) {
+ e2[k]=0;
+ distpos = T->distpos[k];/*->?*/
+ distchr = T->distchr[k];
+ disttrans = T->disttrans[k];
+ distreflen = bl_fastaGetSequenceLength(set, distchr);
+ siteovh = overhang;
+ refseqs[k] = ALLOCMEMORY(NULL, NULL, char*, 2);
+ reflens[k] = ALLOCMEMORY(NULL, NULL, Uint, 2);
+ refstrand[k] = ALLOCMEMORY(NULL, NULL, Uint, 2);
+ sitestart = startleft;
+ aligns[k] = NULL;
+
+ if(distchr != chromidx && (ovhglen < 10 || oven < ovhglen-(0.35*ovhglen))) continue; /*??always true?*/
+ if(T->median+seqlen+1 >= locreflen) continue;
+ if(distpos+overhang/*mrgn*/+1 >= distreflen) continue;
+
+ if(left) {
+ if ((distchr == chromidx)&&(distpos>T->median) &&(T->median+seqlen/*??*/+1>distpos+ovhglen)) continue;
+ if(r.strand=='+') {
+ p =0; q =1;
+ } else {
+ p =1; q =0;
+ }
+ if ((distpos<sitestart)) {
+ sitestart=distpos;
+ }
+ if (siteovh>distpos){
+ siteovh=distpos;
+ }
+ refseqs[k][p] = &bl_fastaGetSequence(set, distchr)[distpos-siteovh];
+ reflens[k][p] = sitestart+siteovh;
+ refseqs[k][q] = &bl_fastaGetSequence(set, chromidx)[T->median-1];
+ reflens[k][q] =end-T->median+1 ;/*seqlen; */
+ refstrand[k][q] = (r.strand == '+') ? 0 : 1;
+ if(disttrans)
+ refstrand[k][p] = (r.strand == '+') ? 1 : 0;
+ else
+ refstrand[k][p] = (r.strand == '+') ? 0 : 1;
+ } else {
+ if ((distchr == chromidx)&&(distpos<T->median)&&(T->median-seqlen/*??*/-1<distpos-ovhglen)) continue; /*strand??*/
+ if(r.strand=='+') {
+ p =0; q =1;
+ } else {
+ p =1; q =0;
+ }
+ if (sitestart>distpos) {
+ sitestart=distpos;
+ }
+ refseqs[k][p] = &bl_fastaGetSequence(set, chromidx)[start-1];
+ reflens[k][p] = median+5;
+ refseqs[k][q] = &bl_fastaGetSequence(set, distchr)[distpos-/*overhang*/sitestart];
+ reflens[k][q] = sitestart+siteovh;
+ refstrand[k][p] = (r.strand == '+') ? 0 : 1;
+ if(disttrans)
+ refstrand[k][q] = (r.strand == '+') ? 1 : 0;
+ else
+ refstrand[k][q] = (r.strand == '+') ? 0 : 1;
+ }
+
+ aligns[k] = ALLOCMEMORY(space, NULL, Alignment*, 2);
+ aligns[k][0] = ALLOCMEMORY(space, NULL, Alignment, 1);
+ aligns[k][1] = ALLOCMEMORY(space, NULL, Alignment, 1);
+
+ initAlignment(aligns[k][p], (refstrand[k ][p]==0)? fwd : rev, seqlen,
+ 0, refseqs[k ][p], reflens[k ][p], 0);
+ initAlignment(aligns[k][q], (refstrand[k ][q]==0)? fwd : rev, seqlen,
+ 0, refseqs[k ][q], reflens[k ][q], 0);
+
+ M = localmultisplicedmatrix(space, fwd, rev, seqlen,
+ refseqs[k ], reflens[k ], refstrand[k ], 2, -1, 0,
+ constscr, scores, &lmv, &lmr, &lmc);
+
+ localmultisplicedtraceback(space, M, fwd, rev, seqlen,
+ refseqs[k ], reflens[k ], refstrand[k ], 2, -1, 0,
+ constscr, scores, aligns[k], lmv, lmr, lmc);
+ e2[k]=getAlignScore(aligns[k][0], scores, -1);
+ tempe= getAlignScore(aligns[k][1], scores, -1);
+ /*how many bases are not aligned, add deletion scores*/
+ check2=getUalignlen(aligns[k][q])+getUalignlen(aligns[k][p])-strlen(fwd);
+ /*check whether at least 50% of the overhang are aligned*/
+ if (left){
+ check=getAlignScore(aligns[k][p], scores, -1);
+ check*=2;
+ check-=(seqlen-getUalignlen(aligns[k][q]));
+ if(getUalignlen(aligns[k][p])<4) {
+ check=-1;
+ }
+ // if((getValignlen(aligns[k][q])+aligns[k][q]->voff)!=end) { /*v or u??*/
+ // check=-1;
+ //}
+ }
+ else {
+ check=getAlignScore(aligns[k][q], scores, -1)*2;
+ check-=(seqlen-getUalignlen(aligns[k][p]));
+ if(getUalignlen(aligns[k][q])<4) {
+ check=-1;
+ }
+ // if (aligns[k][p]->voff != start) { /*v or u??*/
+ // check=-1;
+ // }
+ }
+ /*if one of the parts is below 4??, set both zero*/
+ if((check<0)||(e2[k] < 4) || (tempe < 4)) {
+ e2[k]=0;/*in threading, e2=0?:*/
+ }
+ else {
+ e2[k] += tempe;
+ e2[k] += check2;
+ }
+
+ for(l=0; l < 2; l++) {
+ FREEMEMORY(space, lmv[l]);
+ FREEMEMORY(space, lmr[l]);
+ FREEMEMORY(space, lmc[l]);
+ }
+ FREEMEMORY(space, lmv);
+ FREEMEMORY(space, lmr);
+ FREEMEMORY(space, lmc);
+ FREEMEMORY(space, M);
+ }
+ return NULL;
+}
+/* Readrealign: for the reads arr[begin..stop] that carry no split flag yet, try to re-split them at cluster T against its distant sites.
+ * An accepted realignment is written as two SAM records to realigndev (if not NULL); every read is enlisted in rlist; *noofrealigns receives the number of realigned reads. */
+void Readrealign(int begin, int stop, matchlistelem_t* arr, unsigned char fmt, matchsplitsitecluster_t *T, char left, fasta_t *set, Uint chromidx,Uint *noofrealigns/*??*/,FILE *realigndev, List *rlist, Uint interval,void *space, Uint sinterval) {
+ int j,p,k,l, rest, overhang;
+ int scores[] = {1,-1};
+ unsigned char *bookkeeper;
+ Uint start;
+ stringset_t *token;
+ matchfileRec_t r;
+ Alignment *laln, *raln, ***aligns;
+ Uint bestdistalign=0, bestdistpos=0, bestlocalign =0, bestlocpos=0, bestlocstrand=0,
+ bestaligns=0, bestdistspliceoff=0, bestlocspliceoff=0, bestlocustart=0, bestlocuend=0,
+ bestdistustart=0, bestdistuend=0, bestdistchr=0, seqlen,locreflen=0; /*, distreflen=0, q, v,*/
+ Uint distpos, distchr, ovhglen, median, end;
+ int bestscore, origscore, best;
+ char *seq, *qual, *out, *rnext, *rm, *fwd, *rev, *str, *ref, ***refseqs,
+ disttrans, bestdiststrand = '+', realigned = 0;
+ int noofrealignsT=0, oven;
+ Uint **reflens, **refstrand, mrgn;
+ for(j=begin; j <= stop; j++) {
+ str = arr[j].str;
+ start = arr[j].start;
+ end = arr[j].end;
+ bookkeeper = arr[j].bookkeeper;
+
+ if(!(bookkeeper[0] & LEFTSPLIT) && !(bookkeeper[0] & RIGHTSPLIT)) {
+ realigned = 0;
+ token = tokensToStringset(space, "\t", str, strlen(str));
+ bl_matchfileGetMatchFileRec(&r, 255, token, fmt);
+ //get overhanging sequence
+ ref = &bl_fastaGetSequence(set, chromidx)[start-1];
+ //both with 1-offset median points to position with 0-offset
+ median = T->median - start;
+ //get boundaries and score of overhanging sequence
+ rest = bl_matchfileSplitAlignment (r.curseq, ref, r.curaln, strlen(r.curaln),
+ median, scores, &laln, &raln, left);
+
+ bestscore = getAlignScore(laln, scores, -1) + getAlignScore(raln, scores, -1);
+ origscore = bestscore;
+
+ if(left) {
+ ovhglen = getUalignlen(laln) ;
+ oven = getAlignScore(laln, scores, -1);
+ if(arr[j].end<T->median) {
+ ovhglen=0;
+ }
+ } else {
+ ovhglen = getUalignlen(raln);
+ oven = getAlignScore(raln, scores, -1);
+ if(arr[j].start>T->median) {
+ ovhglen=0;
+ }
+ }
+
+ seqlen = rest - laln->uoff;
+ qual = ALLOCMEMORY(space, NULL, char, seqlen+1);
+ seq = ALLOCMEMORY(space, NULL, char, seqlen+1);
+ memmove(seq, &r.curseq[laln->uoff], seqlen);
+ memmove(qual, &r.curqual[laln->uoff], seqlen);
+ seq[seqlen] = 0;
+ qual[seqlen] = 0;
+
+ if(oven < ovhglen-(0.25*ovhglen) && ovhglen >= 4) {
+ int *e2/*??*/;
+ int q;
+ rm = charIUPACcomplement(space, seq, seqlen);
+
+/*
+ fprintf(stderr, "splitting: %s\n", str);
+ fprintf(stderr, "ovhglen:%d, e:%d\n", ovhglen, e);
+ fprintf(stderr, "distsites:%d\n", T[i].distsites);
+ fprintf(stderr, "left:\n");
+ showAlign(laln,stderr);
+ fprintf(stderr, "right:\n");
+ showAlign(raln,stderr);
+ fprintf(stderr, "rco: %s\n", rm);
+*/
+ //align overhanging sequence to distant splits
+ aligns = ALLOCMEMORY(space, NULL, Alignment**, T->distsites);
+ refseqs = ALLOCMEMORY(space, NULL, char*, T->distsites);
+ reflens = ALLOCMEMORY(space, NULL, Uint*, T->distsites);
+ refstrand = ALLOCMEMORY(space, NULL, Uint*, T->distsites);
+ locreflen = bl_fastaGetSequenceLength(set, chromidx);
+ e2= ALLOCMEMORY(space, NULL, int, T->distsites+1);
+ //ensure original read direction
+ if(r.strand == '-') {
+ fwd = rm;
+ rev = seq;
+ } else {
+ fwd = seq;
+ rev = rm;
+ }
+ assert(strlen(fwd) == seqlen);
+
+ best=0; /*index of the best distant site found so far*/
+ /*e2[k]: realignment score of distant site k, filled by threadrealign*/
+
+ threadrealign(0,T->distsites-1 , T, ovhglen, set, aligns,chromidx,seqlen, left,r,space, e2, refseqs, reflens, refstrand, locreflen, median, interval, start, end, fwd, rev, sinterval,oven);
+ if(left) {
+ if(r.strand=='+') {
+ p =0; q =1;
+ } else {
+ p =1; q =0;
+ }
+ } else {
+ if(r.strand=='+') {
+ p =0; q =1;
+ } else {
+ p =1; q =0;
+ }
+ }
+
+ for (k=0; k< T->distsites; k++) {
+ if (aligns[k] == NULL || e2[k]< bestscore) { /*sites skipped by threadrealign have no alignments to inspect*/
+ continue;
+ }
+ if (e2[k]> bestscore) {
+ if((left && aligns[k][q]->voff != 0) ||
+ (!left && aligns[k][p]->voff+getValignlen(aligns[k][p]) != median+1) ||
+ (left && ((T->median+aligns[k][q]->voff+getValignlen(aligns[k][q])) != end+1)) || /*?*/
+ (!left && aligns[k][p]->voff != 0) /*+1?*/ ) {
+ // fprintf(stderr, "not at median\n");
+ }
+ else {
+ bestscore=e2[k];
+ best=k;
+ }
+ continue;
+ }
+ if (T->distcnt[best]<T->distcnt[k]/*equal score: prefer the site with the larger read count*/) {
+ if((left && aligns[k][q]->voff != 0) ||
+ (!left && aligns[k][p]->voff+getValignlen(aligns[k][p]) != median+1) ||
+ (left && T->median+aligns[k][q]->voff+getValignlen(aligns[k][q]) != end+1) || /*???*/
+ (!left && aligns[k][p]->voff != 0)) {
+ // fprintf(stderr, "not at median\n");
+ }
+ else {
+ bestscore=e2[k];
+ best=k;
+ }
+ }
+ }
+
+
+ if(bestscore > origscore) {
+ /* if(e2[best] >= bestscore) { */
+
+ if((left && aligns[best][q]->voff != 0) ||
+ (!left && aligns[best][p]->voff+getValignlen(aligns[best][p]) != median+1)||
+ (left && T->median+aligns[best][q]->voff+getValignlen(aligns[best][q]) != end+1) || /*???*/
+ (!left && aligns[best][p]->voff != 0)) {
+
+ // fprintf(stderr, "not at median\n");
+ }
+ else{
+ // if(disttrans) fprintf(stderr, "distant split %d -> %d\n",refstrand[k][0], refstrand[k][1]);
+ // else fprintf(stderr, "regular split %d -> %d\n", refstrand[k][0], refstrand[k][1]);
+ // fprintf(stderr, "at median with score %d\n", e2);
+ bestaligns = best;
+ bestscore = e2[best];
+ distpos=T->distpos[best];
+ distchr=T->distchr[best];
+ disttrans=T->disttrans[best];
+ overhang=2+ovhglen+ovhglen*.4;
+ mrgn=(distpos > overhang) ? overhang : distpos;
+ if(left) {
+ bestdistalign = p;
+ bestlocspliceoff = 0;
+ /*if(sinterval<mrgn){
+ mrgn=sinterval;
+ } this seems to be wrongly applied..*/
+ } else {
+ bestdistalign = q;
+ if(sinterval<mrgn){
+ mrgn=sinterval;
+ }
+ }
+
+ bestdistpos = distpos-mrgn + aligns[bestaligns][bestdistalign]->voff + 1;
+ bestdistchr = distchr ;
+
+ if (!disttrans) {
+ bestdiststrand = r.strand;
+ } else {
+ bestdiststrand = (r.strand == '+') ? '-' : '+';
+ }
+
+ bestdistustart = (refstrand[bestaligns][bestdistalign] == 0) ?
+ aligns[bestaligns][bestdistalign]->uoff+1 :
+ seqlen-aligns[best][bestdistalign]->uoff -
+ getUalignlen(aligns[bestaligns][bestdistalign])+1;
+
+ bestdistuend = bestdistustart+getUalignlen(aligns[bestaligns][bestdistalign])-1;
+
+ if (left) {
+ bestlocalign = q;
+ bestlocpos = T->median;
+
+ bestdistspliceoff = (disttrans) ? 0 :
+ getValignlen(aligns[bestaligns][bestdistalign])-1;
+
+ } else {
+ bestlocalign = p;
+ bestlocpos = start;
+
+ bestdistspliceoff = (!disttrans) ? 0 :
+ getValignlen(aligns[bestaligns][bestdistalign])-1;
+
+ bestlocspliceoff = getValignlen(aligns[bestaligns][bestlocalign]) - 1;
+ }
+
+ bestlocstrand = r.strand;
+
+ bestlocustart = (refstrand[bestaligns][bestlocalign] == 0) ?
+ aligns[bestaligns][bestlocalign]->uoff+1 :
+ seqlen-aligns[bestaligns][bestlocalign]->uoff -
+ getUalignlen(aligns[bestaligns][bestlocalign])+1;
+
+ bestlocuend = bestlocustart+getUalignlen(aligns[bestaligns][bestlocalign])-1;
+ /* }*/
+ /* fprintf(stderr, "aligns[0]:\n");
+ showAlign(aligns[k][0],stderr);
+ fprintf(stderr, "aligns[1]:\n");
+ showAlign(aligns[k][1],stderr);
+ */
+ /* }*/
+ /*
+ FREEMEMORY(space, siteidx);
+ */
+
+ /* }*/
+ /* if(bestscore > origscore) { */
+ bookkeeper[0] |= (left) ? LEFTSPLIT : RIGHTSPLIT;
+ realigned = 1;
+ pthread_mutex_lock(&mutkuh);
+
+ if(!T->realigned) {
+ T->realigned = ALLOCMEMORY(space, NULL, Uint, T->distsites);
+ memset(T->realigned, 0, sizeof(Uint)*T->distsites);
+ }
+ T->realigned[bestaligns]++;
+ /* T->real++;*/
+ /*New for counting once only*/
+ /* T->cnt++;
+ T->distcnt[bestaligns]++;*/
+
+ /*end new for counting once only*/
+
+ pthread_mutex_unlock(&mutkuh);
+
+ rnext = bl_fastaGetDescription(set, bestdistchr);
+ pthread_mutex_lock(&inout);
+
+ out = bl_matchfileRealignWriteSAM (&r, qual, r.curchrom, bestlocpos, bestlocstrand,
+ aligns[bestaligns][bestlocalign], bestlocustart, bestlocuend,
+ (bestlocustart<bestdistustart?/*new*/0:1), 0, rnext, bestdistpos+bestdistspliceoff, bestdiststrand);
+ // pthread_mutex_lock(&inout);
+ if(realigndev != NULL) fprintf(realigndev, "%s\n", out);
+ // pthread_mutex_unlock(&inout);
+
+ // for(l=0; l < getValignlen(aligns[bestaligns][bestlocalign]); l++) {
+ // fprintf(stderr, "%c",((char*)&bl_fastaGetSequence(set, chromidx)[bestlocpos-1])[l]);
+ // }
+ // fprintf(stderr, "\n");
+
+ FREEMEMORY(space, out);
+
+ out = bl_matchfileRealignWriteSAM (&r, qual, rnext, bestdistpos, bestdiststrand,
+ aligns[bestaligns][bestdistalign], bestdistustart, bestdistuend,
+ (bestlocustart<bestdistustart?/*new*/1:0)/*new*/, 0, r.curchrom, bestlocpos+bestlocspliceoff, bestlocstrand);
+
+ if(realigndev != NULL) fprintf(realigndev, "%s\n", out);
+ pthread_mutex_unlock(&inout);
+
+ // for(l=0; l < getValignlen(aligns[bestaligns][bestdistalign]); l++) {
+ // fprintf(stderr, "%c", ((char*)&bl_fastaGetSequence(set, bestdistchr)[bestdistpos-1])[l]);
+ // }
+ // fprintf(stderr, "\n");
+
+ FREEMEMORY(space, out);
+ }
+ }
+ FREEMEMORY(space, e2);
+ for(k=0; k < T->distsites; k++) {
+ if(aligns[k]) {
+ for(l=0; l < 2; l++) {
+ wrapAlignment(aligns[k][l]);
+ FREEMEMORY(space, aligns[k][l]);
+ }
+ FREEMEMORY(space, aligns[k]);
+ }
+ FREEMEMORY(space, refseqs[k]);
+ FREEMEMORY(space, reflens[k]);
+ FREEMEMORY(space, refstrand[k]);
+ }
+
+ //enlist all unmarked sequences to be used in next iter
+ FREEMEMORY(space, aligns);
+ FREEMEMORY(space, refseqs);
+ FREEMEMORY(space, reflens);
+ FREEMEMORY(space, refstrand);
+ FREEMEMORY(space, rm);
+
+ }
+
+ wrapAlignment(laln);
+ wrapAlignment(raln);
+
+ FREEMEMORY(space, laln);
+ FREEMEMORY(space, raln);
+ FREEMEMORY(space, r.curaln);
+ FREEMEMORY(space, r.diff);
+ FREEMEMORY(space, seq);
+ FREEMEMORY(space, qual);
+ destructStringset(space, token);
+
+ if(realigned) {
+ noofrealignsT++;
+ }
+ }
+
+ pthread_mutex_lock(&mutkuh);
+
+ bl_matchfileEnlistMatch (space, rlist, arr[j].start, arr[j].end,
+ arr[j].distchr, arr[j].distpos, arr[j].adjoint,
+ arr[j].trans, arr[j].str, arr[j].bookkeeper);
+ pthread_mutex_unlock(&mutkuh);
+
+
+ }
+ *noofrealigns=noofrealignsT;
+ return;
+}
+/* Readthreadstarter: unpack a readthread argument block and run Readrealign on it; always returns NULL */
+void *Readthreadstarter(void *args) {
+ readthread *job = (readthread*) args;
+
+ Readrealign(job->begin, job->stop, job->arr, job->fmt, job->T, job->left, job->set, job->chromidx, &job->noofrealigns, job->realigndev, job->rlist, job->interval, job->space, job->sinterval);
+ return NULL;
+}
+
+
diff --git a/segemehl/libs/realign.h b/segemehl/libs/realign.h
new file mode 100644
index 0000000..355a65d
--- /dev/null
+++ b/segemehl/libs/realign.h
@@ -0,0 +1,220 @@
+#ifndef REALIGN_H
+#define REALIGN_H
+
+/*
+ *
+ * realign.h
+ * realignment
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 07/10/2012 10:01:20 PM CEST
+ *
+ */
+
+#include "basic-types.h"
+#include "matchfiles.h"
+
+/*
+ * search key made of a position and a trans flag.
+ * NOTE(review): presumably the key type expected by
+ * cmp_matchsplitsitecluster_bin -- confirm against realign.c
+ */
+typedef struct{
+ Uint pos;
+ char trans;
+} matchsplitsearchkey_t;
+
+/*
+ * one match record as processed by Readrealign; at the end of each
+ * iteration all eight fields are handed to bl_matchfileEnlistMatch
+ * in exactly this order.
+ */
+typedef struct {
+ Uint start;
+ Uint end;
+ Uint distchr;
+ Uint distpos;
+ Uint adjoint;
+ char trans;
+ char *str;
+ unsigned char *bookkeeper;
+} matchlistelem_t;
+
+/*
+ * a single split site.
+ * NOTE(review): presumably chromosome index, position, number of
+ * supporting reads and a trans flag -- confirm against realign.c
+ */
+typedef struct{
+ Uint chridx;
+ Uint pos;
+ Uint cnt;
+ Uint trans;
+} matchsplitsite_t;
+
+/*
+ * cluster of split sites together with its links to other clusters.
+ * remapping.c consumes a, b and median with a -1 offset
+ * (e.g. median - 1), i.e. they appear to be stored one-based.
+ */
+typedef struct{
+ Uint a;
+ Uint b;
+ Uint median;
+ Uint cnt;
+ Uint noofrealigns;
+ char *str;
+#ifdef CHECKLINKS
+ Uint trans;
+ Uint lnkcnt;
+#endif
+
+ /* per distant split site data; presumably distsites entries each -- TODO confirm */
+ Uint *distchr;
+ Uint *distpos;
+ char *disttrans;
+ Uint *distcnt;
+ Uint *realigned;
+ Uint distsites;
+
+
+ /*
+  * links to distant clusters, noofdistclust entries each:
+  * distclust[i] is a cluster index on chromosome distclustchr[i],
+  * distclustcnt[i] the count used for ranking candidates and
+  * distclusttype[i] == 1 means the target lives in the opposite
+  * (L vs. R) cluster list (see bl_remappingGetDist)
+  */
+ Uint *distclust;
+ Uint *distclustchr;
+ Uint *distclustcnt;
+ Uint *distclustrealigned;
+ char *distclusttype;
+ Uint noofdistclust;
+ double *emat;
+
+#if defined PARANOID && defined CHECKLINKS
+ char *rpos;
+ Uint *linked;
+#endif
+
+ int16_t* adjoint;
+ Uint* adjointcnt;
+ Uint noofadjoints;
+
+ /*
+  * adjclust holds noofadjclust cluster indices into the opposite
+  * list on the same chromosome (see bl_remappingGetAdjoint)
+  */
+ Uint *adjclust;
+ Uint *adjclustweights;
+ Uint noofadjclust;
+
+} matchsplitsitecluster_t;
+
+/*
+ * array of noofclusters split site clusters. remapping.c indexes an
+ * array of these lists by chromosome (list[chr].cluster[i]).
+ */
+typedef struct{
+
+ matchsplitsitecluster_t* cluster;
+ Uint noofclusters;
+
+} matchsplitsiteclusterlist_t;
+
+
+/*
+ * settings and shared state of a realignment run. ra_setdefault
+ * assigns a default to every member except maxdist.
+ */
+typedef struct {
+ matchsplitsiteclusterlist_t *L;
+ matchsplitsiteclusterlist_t *R;
+ void *space;
+ FILE *normdev;
+ FILE *transdev;
+ FILE *realigndev;
+ MultiCharSeq *seq;
+ fasta_t *fasta;
+ unsigned char mute;
+ Uint threadno;
+ int scores[2];
+ int indel;
+ int transition;
+ Uint minfragmentalignlen;
+ int minfragmentalignscore;
+ Uint minsplicedaligncover;
+ Uint minsplicedalignscore;
+ int accuracy;
+ char *splitfile;
+ char *transfile;
+ char *outfile;
+ int maxdist; /* NOTE(review): not initialized by ra_setdefault */
+} realign_t;
+
+
+/*
+ * fill a realign_t with the built-in defaults.
+ *
+ * NOTE(review): info->maxdist is the only member that receives no
+ * default here; callers have to assign it before it is read.
+ */
+inline static void
+ra_setdefault(realign_t *info) {
+  /* cluster lists, memory space and reference data */
+  info->L = NULL;
+  info->R = NULL;
+  info->space = NULL;
+  info->seq = NULL;
+  info->fasta = NULL;
+
+  /* file names and output devices */
+  info->splitfile = NULL;
+  info->transfile = NULL;
+  info->outfile = NULL;
+  info->normdev = stderr;
+  info->transdev = stderr;
+  info->realigndev = stdout;
+
+  /* runtime behaviour */
+  info->mute = 0;
+  info->threadno = 1;
+
+  /* scoring scheme */
+  info->scores[0] = 1;
+  info->scores[1] = -2;
+  info->indel = -2;
+  info->transition = -10;
+
+  /* acceptance thresholds */
+  info->accuracy = 80;
+  info->minfragmentalignlen = 20;
+  info->minfragmentalignscore = 18;
+  info->minsplicedaligncover = 60;
+  info->minsplicedalignscore = 2*18;
+}
+
+/*
+ * argument record for a realignment thread. Most members have a
+ * parameter of the same name in threadrealign (begin, stop, T, ovhglen,
+ * set, aligns, chromidx, seqlen, left, r, space, e2, reflens, refstrand,
+ * locreflen, median, interval, start, fwd, rev, sinterval, oven).
+ * NOTE(review): presumably unpacked by realignthreadstarter -- confirm
+ */
+typedef struct {
+ int begin;
+ int stop;
+ int *e2;
+ char *seq;
+ matchsplitsitecluster_t *T;
+ Alignment ***aligns;
+ char ***refseq;
+ Uint **reflens;
+ Uint **refstrand;
+ fasta_t *set;
+ char *fwd;
+ char *rev;
+ Uint ovhglen;
+ matchfileRec_t r;
+ Uint chromidx;
+ Uint seqlen;
+ Uint start;
+ void *space;/*??*/
+ int **lmr;
+ int **lmv;
+ int **lmc;
+ char left;
+ Uint locreflen;
+ Uint median;
+ Uint interval;
+ Uint sinterval;
+ int oven;
+ } realignthread;
+
+/*
+ * NOTE(review): presumably the argument record of a thread working on
+ * one split site cluster (T) -- its consumer is not visible in this
+ * header, confirm against realign.c
+ */
+typedef struct {
+ void *space;
+ FILE *realigndev;
+ List *rlist;
+ matchsplitsitecluster_t *T;
+ char left;
+ fasta_t *set;
+ Uint chromidx;
+ unsigned char fmt;
+ Uint interval;
+} splicesitethread;
+
+/*
+ * argument record consumed by Readthreadstarter: every member is
+ * passed on to Readrealign; noofrealigns is passed by address and
+ * receives the number of realignments.
+ */
+typedef struct {
+ int begin;
+ int stop;
+ matchlistelem_t* arr;
+ unsigned char fmt;
+ matchsplitsitecluster_t *T;
+ char left;
+ fasta_t *set;
+ Uint chromidx;
+ Uint noofrealigns/*??*/;
+ FILE *realigndev;
+ List *rlist;
+ Uint interval;
+ void *space;
+ Uint sinterval;
+} readthread;
+
+
+/*
+ * single-bit flags with the values 1, 2, 4 and 8.
+ * NOTE(review): the cast binds tighter than the shift, so the shifted
+ * macros have type int after promotion, not unsigned char.
+ */
+#define LEFTLIST ((unsigned char) 1)
+#define RIGHTLIST ((unsigned char) 1 << 1)
+#define LEFTSPLIT ((unsigned char) 1 << 2)
+#define RIGHTSPLIT ((unsigned char) 1 << 3)
+
+/* comparison callback; remapping.c passes it to binarySearch_left */
+Uint cmp_matchsplitsitecluster_bin(Uint a, void *data, void *key, void *nfo);
+void bl_matchLinkAdjoinedCluster(void *space, matchsplitsiteclusterlist_t *L, matchsplitsiteclusterlist_t *R, Uint nchr);
+void bl_matchLinkDistCluster (void *space, matchsplitsiteclusterlist_t *R, matchsplitsiteclusterlist_t *L, Uint n);
+Uint bl_matchfileRealignScanFileNew(void *space, matchfile_t *file, FILE *realigndev, fasta_t *set, unsigned char fields, matchsplitsiteclusterlist_t **Lclust, matchsplitsiteclusterlist_t **Rclust, Uint *nchr, int threadno, int maxdist);
+void bl_matchDestructMatchsplitsiteclusterlist (void *space, matchsplitsiteclusterlist_t *l, Uint n);
+void *threadrealign(int begin, int stop, matchsplitsitecluster_t *T, Uint ovhglen, fasta_t *set, Alignment*** aligns,Uint chromidx,Uint seqlen, char left, matchfileRec_t r,void *space, int *e2, char ***refseqs, Uint **reflens, Uint **refstrand, Uint locreflen, Uint median, Uint interval, Uint start, Uint end, char *fwd, char *rev, Uint sinterval, int oven);
+void *realignthreadstarter(void *args);
+/* realigns the matches arr[begin..]; the count is returned via noofrealigns */
+void Readrealign(int begin, int stop, matchlistelem_t* arr, unsigned char fmt, matchsplitsitecluster_t *T, char left, fasta_t *set, Uint chromidx,Uint *noofrealigns/*??*/,FILE *realigndev, List *rlist, Uint interval,void *space, Uint sinterval);
+/* thread wrapper around Readrealign; args must point to a readthread */
+void *Readthreadstarter(void *args);
+
+#endif
+
diff --git a/segemehl/libs/remapping.c b/segemehl/libs/remapping.c
new file mode 100644
index 0000000..b5a724b
--- /dev/null
+++ b/segemehl/libs/remapping.c
@@ -0,0 +1,1603 @@
+/**
+ * remapping.c
+ * remapping of unmapped reads
+ *
+ * @author Christian Otto
+ * @email christian at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Fri Oct 12 09:55:49 CEST 2012
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 407 $
+ * Author: $Author: steve $
+ * Date: $Date: 2014-02-06 04:55:25 -0500 (Thu, 06 Feb 2014) $
+ * Id: $Id: remapping.c 407 2014-02-06 09:55:25Z steve $
+ * Url: $URL: http://www2.bioinf.uni-leipzig.de/svn5/segemehl/libs/remapping.c $
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <ctype.h>
+#include "debug.h"
+#include "info.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "container.h"
+#include "list.h"
+#include "sort.h"
+#include "manout.h"
+#include "manopt.h"
+#include "multicharseq.h"
+#include "charsequence.h"
+#include "iupac.h"
+#include "alignment.h"
+#include "sw.h"
+#include "vtprogressbar.h"
+#include "segemehl.h"
+#include "realign.h"
+#include "remapping.h"
+
+/* put a remappingstat_t into the empty state: no arrays, zero entries */
+void
+rm_initStat (remappingstat_t *stat){
+  stat->n = 0;
+  stat->extensions = NULL;
+  stat->maxdist = NULL;
+  stat->aligns = NULL;
+  stat->status = NULL;
+}
+
+/*
+ * append one record (status, aligns, extensions, maxdist) to the
+ * statistics: each of the four parallel arrays is grown by one slot
+ * via ALLOCMEMORY and the new slot is filled, then n is incremented.
+ */
+void
+rm_addStat (void *space, remappingstat_t *stat,
+    remappingstatus_t status, Uint aligns,
+    Uint extensions, Uint maxdist){
+
+  stat->status = ALLOCMEMORY(space, stat->status, remappingstatus_t, stat->n+1);
+  stat->status[stat->n] = status;
+
+  stat->aligns = ALLOCMEMORY(space, stat->aligns, Uint, stat->n+1);
+  stat->aligns[stat->n] = aligns;
+
+  stat->extensions = ALLOCMEMORY(space, stat->extensions, Uint, stat->n+1);
+  stat->extensions[stat->n] = extensions;
+
+  stat->maxdist = ALLOCMEMORY(space, stat->maxdist, Uint, stat->n+1);
+  stat->maxdist[stat->n] = maxdist;
+
+  stat->n++;
+}
+
+/*
+ * release the four statistic arrays (where present) and reset the
+ * entry count; the remappingstat_t itself is not freed.
+ */
+void
+rm_destructStat (void *space, remappingstat_t *stat){
+  if (stat->extensions != NULL)
+    FREEMEMORY(space, stat->extensions);
+  if (stat->maxdist != NULL)
+    FREEMEMORY(space, stat->maxdist);
+  if (stat->aligns != NULL)
+    FREEMEMORY(space, stat->aligns);
+  if (stat->status != NULL)
+    FREEMEMORY(space, stat->status);
+  stat->n = 0;
+}
+
+/*
+ * progress reporting for read k: nothing happens when muted; with a
+ * counter attached the counter is incremented, otherwise the progress
+ * bar is redrawn against the total number of reads.
+ */
+void
+rm_updateProgressBar(Uint k, remapping_t *nfo) {
+
+  if (nfo->mute) {
+    return;
+  }
+
+  if (nfo->counter != NULL) {
+    (*nfo->counter)++;
+    return;
+  }
+
+  progressBarVT("reads remapped.", nfo->reads->noofseqs, k, 25);
+}
+
+/*
+ * start routine with the void* (*)(void*) shape: args is a remapping_t
+ * whose space, seq, reads, L and R members (and the record itself) are
+ * passed to bl_remapping. Always returns NULL.
+ */
+void*
+remappingworker(void *args) {
+  remapping_t *job = (remapping_t*) args;
+
+  bl_remapping(job->space, job->seq, job->reads, job->L, job->R, job);
+
+  return NULL;
+}
+
+/*
+ * qsort comparator for remappingclust_t elements: orders by cnt in
+ * descending order (larger cnt sorts first), equal cnt compares equal.
+ */
+int cmp_remappingclust_qsort(const void *a, const void* b) {
+  remappingclust_t *lhs = (remappingclust_t*) a;
+  remappingclust_t *rhs = (remappingclust_t*) b;
+
+  if (lhs->cnt == rhs->cnt) {
+    return 0;
+  }
+
+  return (lhs->cnt > rhs->cnt) ? -1 : 1;
+}
+
+/*
+ * print a one-line summary of an alignment fragment followed by the
+ * alignment itself (showAlign) to dev.
+ * The query interval [ustart, uend] is derived from the alignment
+ * offset; for strand == 1 it is mirrored on the query length (ulen).
+ * NOTE(review): the fprintf line below is cut off ("[...]") in this
+ * archived copy of the patch; the original source line is longer.
+ */
+void bl_remappingPrintAlign(FILE *dev, MultiCharSeqAlignment *mcsa, remapping_t *nfo){
+ Uint ustart, uend;
+
+ ustart = mcsa->al->uoff;
+ if (mcsa->strand == 1) {
+ uend = mcsa->al->ulen - ustart - 1;
+ ustart = uend - getUalignlen(mcsa->al) + 1;
+ } else {
+ uend = ustart + getUalignlen(mcsa->al) - 1;
+ }
+
+ fprintf(dev, "chr=%u, strand=%u, refstart=%u, refend=%u, reflen=%u, vstart=%u, vend=%u, vlen=%u, qrylen=%u, ustart=%u, uend=%u, ulen=%u, score=%d, edist=%d\n", mcsa->subidx, mcsa->strand, mcsa->refstart - mcsa->substart, mcsa->refstart - mcsa->substart + mcsa->reflen - 1, mcsa->reflen, mcsa->refstart - mcsa->substart + mcsa->al->voff, mcsa->refstart - mcsa->substart + mcsa->al->voff + getValignlen(mcsa->al) - 1, getValignlen(mcsa->al), mcsa->qrylen, ustart, uend, getUalignlen(mcsa->al) [...]
+
+ showAlign(mcsa->al, dev);
+}
+
+/*
+ * dump a candidate cluster to dev: type, chromosome and cluster index,
+ * the cluster bounds and median (shifted by -1) unless the cluster
+ * index is -1, the fragment count and score, and finally every
+ * fragment via bl_remappingPrintAlign.
+ */
+void bl_remappingPrintClust(FILE *dev, remappingclust_t *clust, remapping_t *nfo){
+  Uint frag;
+  matchsplitsitecluster_t *site;
+
+  fprintf(dev, "dist: type=%u, chr=%u, clust=%d", clust->type, clust->chr, clust->clust);
+
+  if (clust->clust != -1){
+    site = clust->cluster;
+    fprintf(dev, ", a=%u, b=%u, median=%u", site->a - 1, site->b - 1, site->median - 1);
+  }
+
+  fprintf(dev, ", noofmcsa=%u, score=%d\n", clust->noofmcsa, clust->score);
+
+  /* nothing more to print without fragments */
+  if (!(clust->noofmcsa > 0) || clust->mcsa == NULL){
+    return;
+  }
+
+  for (frag = 0; frag < clust->noofmcsa; frag++){
+    fprintf(dev, "frag[%u]: ", frag);
+    bl_remappingPrintAlign(dev, clust->mcsa[frag], nfo);
+  }
+}
+
+/*
+ * align the query against every candidate in dist and return the best.
+ *
+ * dist must be non-empty and all candidates must carry the same number
+ * of fragments (at most two); both is asserted. For each candidate the
+ * fragments are aligned jointly with localmultisplicedmatrix /
+ * localmultisplicedtraceback (transition penalty 0); the tracebacks are
+ * written into the fragments' own alignments (mcsa[j]->al) and the sum
+ * of the fragment scores is stored in the candidate's score.
+ * A candidate is discarded if any fragment violates the minimum
+ * length/score or the accuracy bound.
+ *
+ * returns a pointer to the surviving candidate with the highest score
+ * (an element of dist, not a copy), or NULL if none survived.
+ */
+remappingclust_t *bl_remappingAlign(void *space, char **seqs, Container *dist, remapping_t *nfo){
+ Uint i, j, noofmcsa, ulen, vlen, qrylen = 0, *reflens, *strands;
+ int *M, **lmr, **lmv, **lmc, edist, score, purge;
+ int bestscore;
+ char **refseqs;
+ Alignment **aligns;
+ remappingclust_t *cur, *best;
+
+ best = NULL;
+ bestscore = -1;
+
+ assert(bl_containerSize(dist) > 0);
+ noofmcsa = ((remappingclust_t *) bl_containerGet(dist, 0))->noofmcsa;
+
+ reflens = ALLOCMEMORY(space, NULL, Uint, noofmcsa);
+ strands = ALLOCMEMORY(space, NULL, Uint, noofmcsa);
+ refseqs = ALLOCMEMORY(space, NULL, char*, noofmcsa);
+ aligns = ALLOCMEMORY(space, NULL, Alignment*, noofmcsa);
+
+ for (i = 0; i < bl_containerSize(dist); i++){
+ cur = bl_containerGet(dist, i);
+ assert(cur->noofmcsa == noofmcsa);
+ assert(noofmcsa < 3);
+
+ /*
+  * the query length is taken once from the first fragment that is
+  * long enough; it stays 0 if no such fragment has been seen yet
+  */
+ for (j = 0; j < cur->noofmcsa; j++){
+ if (cur->mcsa[j]->reflen < nfo->minfragmentalignlen)
+ continue;
+ if (qrylen == 0)
+ qrylen = cur->mcsa[j]->qrylen;
+ }
+
+
+ /* gather the per fragment inputs; aligns[j] aliases mcsa[j]->al */
+ for (j = 0; j < noofmcsa; j++){
+ aligns[j] = cur->mcsa[j]->al;
+ refseqs[j] = cur->mcsa[j]->refseq;
+ reflens[j] = cur->mcsa[j]->reflen;
+ strands[j] = cur->mcsa[j]->strand;
+ }
+
+ // no transition penalty
+ M = localmultisplicedmatrix(space, seqs[0], seqs[1], qrylen,
+ refseqs, reflens, strands, noofmcsa,
+ nfo->indel, 0, constscr, nfo->scores,
+ &lmv, &lmr, &lmc);
+ assert(M != NULL);
+
+ localmultisplicedtraceback(space, M, seqs[0], seqs[1], qrylen,
+ refseqs, reflens, strands, noofmcsa, nfo->indel, 0,
+ constscr, nfo->scores, aligns, lmv, lmr, lmc);
+
+ purge = 0;
+ cur->score = 0;
+ for (j = 0; j < noofmcsa; j++){
+ ulen = getUalignlen(aligns[j]);
+ vlen = getValignlen(aligns[j]);
+ score = getAlignScore(aligns[j], nfo->scores, nfo->indel);
+ edist = getEdist(aligns[j]);
+ cur->score += score;
+
+ // check if both alignment fragments
+ // meet the given requirements
+ /*
+  * NOTE(review): (nfo->accuracy * ulen)/100 looks like an integer
+  * division (ulen is Uint), which would make ceil() a no-op;
+  * bl_remappingReport casts to double for the analogous bound.
+  * Confirm the type of nfo->accuracy and the intended rounding.
+  */
+ if (ulen < nfo->minfragmentalignlen ||
+ vlen < nfo->minfragmentalignlen ||
+ score < nfo->minfragmentalignscore ||
+ edist > (ulen - ceil((nfo->accuracy * ulen)/100))) {
+ purge = 1;
+ //fprintf(stderr, "purge: ulen=%u, vlen=%u, score=%d, edist=%u\n", ulen,
+ // vlen, score, edist);
+ }
+ }
+
+ if (!purge){
+ /*
+ for (j = 0; j < noofmcsa; j++){
+ ustart = aligns[j]->uoff;
+ vstart = aligns[j]->voff;
+ ulen = getUalignlen(aligns[j]);
+ vlen = getValignlen(aligns[j]);
+ score = getAlignScore(aligns[j], nfo->scores, nfo->indel);
+ totalscore += score;
+
+ if (strands[j] == 1){
+ uend = qrylen - ustart - 1;
+ ustart = uend - ulen + 1;
+ }
+ else {
+ uend = ustart + ulen - 1;
+ }
+
+ }*/
+
+ /* strictly greater: on ties the earlier candidate is kept */
+ if (cur->score > bestscore){
+ bestscore = cur->score;
+ best = cur;
+ }
+ }
+
+ /* the alignments stay with the candidate; only the matrices go */
+ for (j = 0; j < noofmcsa; j++){
+ //wrapAlignment(aligns[j]);
+ //FREEMEMORY(space, aligns[j]);
+ FREEMEMORY(space, lmv[j]);
+ FREEMEMORY(space, lmr[j]);
+ FREEMEMORY(space, lmc[j]);
+ }
+ FREEMEMORY(space, lmv);
+ FREEMEMORY(space, lmr);
+ FREEMEMORY(space, lmc);
+ FREEMEMORY(space, M);
+ }
+
+ FREEMEMORY(space, reflens);
+ FREEMEMORY(space, refseqs);
+ FREEMEMORY(space, strands);
+ FREEMEMORY(space, aligns);
+
+ return best;
+}
+
+/*
+ * complete the fragments of cur so that they can be aligned: look up
+ * the bounds of each fragment's sequence (subidx) in seq, turn the
+ * relative refstart into an absolute one, shorten reflen where the
+ * fragment would run past subend, select the query by strand
+ * (seqs[strand]) and allocate plus initialise a fresh Alignment.
+ * Any alignment previously stored in a->al is overwritten, not freed.
+ */
+void bl_remappingUpdateAlignSeqs(void *space, MultiCharSeq *seq, char **seqs, Uint qrylen, remappingclust_t *cur){
+ Uint i, substart, subend;
+ MultiCharSeqAlignment *a;
+
+ for (i = 0; i < cur->noofmcsa; i++){
+ a = cur->mcsa[i];
+ getMultiCharSeqIdxBounds(seq, a->subidx, &substart, &subend);
+ a->substart = substart;
+ a->subend = subend;
+
+ /* refstart was relative to the sequence start up to here */
+ a->refstart += a->substart;
+ a->refseq = &seq->sequences[a->refstart];
+ /* clip the reference window at the end of the sequence */
+ a->reflen = (a->subend > (Lint) a->refstart + a->reflen) ?
+ a->reflen : a->subend - a->refstart + 1;
+
+ a->query = seqs[a->strand];
+ a->qrylen = qrylen;
+ a->al = ALLOCMEMORY(space, NULL, Alignment, 1);
+ initAlignment(a->al, a->query, a->qrylen, 0, a->refseq, a->reflen, 0);
+ }
+}
+
+/*
+ * re-anchor the alignments of cur on the entire query: the query
+ * pointer and length of every fragment are replaced by seqs[strand]
+ * and qrylen, the edit operation buffer is resized if the query length
+ * changed, and the query offset is shifted by offset for plus strand
+ * fragments of a right extension and minus strand fragments of a left
+ * extension. Aligned length and edit distance must stay the same;
+ * this is asserted.
+ */
+void bl_remappingUpdateAlignOff(void *space, char **seqs, Uint qrylen, remappingclust_t *cur, Uint offset, Uint right){
+ Uint i, ulen;
+ int edist;
+ MultiCharSeqAlignment *mcsa;
+
+ for (i = 0; i < cur->noofmcsa; i++){
+ mcsa = cur->mcsa[i];
+
+ /* for safety: ulen & edist should not change */
+ ulen = getUalignlen(mcsa->al);
+ edist = getEdist(mcsa->al);
+
+ //showAlign(mcsa->al, stderr);
+ //fprintf(stderr, "%s\n%s\n", seqs[mcsa->strand], mcsa->al->u);
+
+ /*
+ * replace query substring by entire one,
+ * adjust offsets and lengths
+ */
+ mcsa->query = seqs[mcsa->strand];
+ mcsa->qrylen = qrylen;
+ if (mcsa->al->ulen != qrylen){
+ mcsa->al->meops = ALLOCMEMORY(space, mcsa->al->meops, Multieop, mcsa->al->vlen + qrylen);
+ }
+ mcsa->al->ulen = qrylen;
+ mcsa->al->u = seqs[mcsa->strand];
+ if (right && mcsa->strand == 0) mcsa->al->uoff += offset;
+ if (!right && mcsa->strand == 1) mcsa->al->uoff += offset;
+
+ //fprintf(stderr, "offset=%u, uoff=%u, ulen=%u - %u, edist=%d - %d\n",
+ // offset, mcsa->al->uoff, ulen, getUalignlen(mcsa->al), edist,
+ // getEdist(mcsa->al));
+ //showAlign(mcsa->al, stderr);
+
+ assert(ulen == getUalignlen(mcsa->al) &&
+ edist == getEdist(mcsa->al));
+ }
+}
+
+/*
+ * element destructor for remappingclust_t: releases every fragment
+ * (including its alignment, if any) and the fragment array itself.
+ * The remappingclust_t behind elem is not freed.
+ */
+void bl_remappingDestructClust(void *elem){
+  remappingclust_t *clust = (remappingclust_t *) elem;
+  MultiCharSeqAlignment *frag;
+  Uint k;
+
+  /* nothing is owned without a fragment array or with zero fragments */
+  if (clust->mcsa == NULL || !(clust->noofmcsa > 0)){
+    return;
+  }
+
+  for (k = 0; k < clust->noofmcsa; k++){
+    frag = clust->mcsa[k];
+    if (frag == NULL){
+      continue;
+    }
+    if (frag->al != NULL){
+      wrapAlignment(frag->al);
+      free(frag->al);
+    }
+    free(frag);
+  }
+
+  free(clust->mcsa);
+}
+
+/*
+ * set up the reference window of a fragment: sequence index, start,
+ * length and strand. No alignment is attached yet (al = NULL); the
+ * remaining members are left untouched.
+ */
+void bl_remappingInitAlign(void *space, MultiCharSeqAlignment *mcsa, Uint subidx,
+    Uint start, Uint len, unsigned char strand){
+  mcsa->al = NULL;
+  mcsa->strand = strand;
+  mcsa->subidx = subidx;
+  mcsa->refstart = start;
+  mcsa->reflen = len;
+}
+
+/*
+ * align the query unspliced against the window [start, end] of
+ * sequence chr on the given strand.
+ *
+ * A candidate with a single fragment and no cluster (clust = -1) is
+ * built, put into a temporary one-element container and passed to
+ * bl_remappingAlign. On success the container's element is copied back
+ * into the heap candidate, which is returned; the container is torn
+ * down without an element destructor so the fragment survives.
+ * On failure the candidate is destructed and freed.
+ * NOTE(review): the failure path returns clust after FREEMEMORY, i.e.
+ * NULL only if FREEMEMORY resets its pointer argument -- confirm
+ * against the macro definition.
+ */
+remappingclust_t *bl_remappingUnsplicedAlign(void *space, MultiCharSeq *seq, char **seqs, Uint len,
+ Uint chr, Uint start, Uint end, unsigned char strand, remapping_t *nfo){
+ Container *a;
+ remappingclust_t *clust, *ret;
+
+ a = ALLOCMEMORY(space, NULL, Container, 1);
+ bl_containerInit(a, 1, sizeof(remappingclust_t));
+
+ clust = ALLOCMEMORY(space, NULL, remappingclust_t, 1);
+ clust->chr = 0;
+ clust->type = 0;
+ clust->clust = -1;
+ clust->cluster = NULL;
+ clust->mcsa = ALLOCMEMORY(space, NULL, MultiCharSeqAlignment*, 1);
+ clust->mcsa[0] = ALLOCMEMORY(space, NULL, MultiCharSeqAlignment, 1);
+ clust->noofmcsa = 1;
+ bl_remappingInitAlign(space, clust->mcsa[0], chr, start, end-start+1, strand);
+ bl_remappingUpdateAlignSeqs(space, seq, seqs, len, clust);
+ bl_containerAdd(a, clust);
+
+ ret = bl_remappingAlign(space, seqs, a, nfo);
+ if (ret == NULL){
+ bl_remappingDestructClust(clust);
+ FREEMEMORY(space, clust);
+ }
+ else {
+ /* ret is the container's element; it carries the computed score */
+ memmove(clust, ret, sizeof(remappingclust_t));
+ }
+
+ bl_containerDestruct(a, NULL);
+ FREEMEMORY(space, a);
+
+ return(clust);
+}
+
+/*
+ * append one two-fragment candidate to a for every distant cluster
+ * linked to cluster clust of chromosome chr in list L (type 0) or
+ * R (otherwise).
+ *
+ * The first window lies on the current cluster's side and spans from
+ * its (zero-based) median to last (type 0) or from last to the median
+ * (otherwise); links for which this window is empty or not shorter
+ * than margin are skipped. The second window lies at the distant
+ * cluster and receives the remaining margin - curlen positions.
+ * The fragments are only initialised (bl_remappingInitAlign); the
+ * candidate's score is not set here.
+ */
+void bl_remappingGetDist(void *space, Container *a, matchsplitsiteclusterlist_t *L, matchsplitsiteclusterlist_t *R,
+ Uint type, Uint chr, Uint clust, unsigned char strand, Uint last, Uint margin, unsigned char right){
+ Uint i, j, curstart, curlen, diststart, distlen;
+ matchsplitsiteclusterlist_t *list, *other;
+ remappingclust_t dist;
+
+ if (type == 0){
+ list = L;
+ other = R;
+ }
+ else {
+ list = R;
+ other = L;
+ }
+
+ for (i = 0; i < list[chr].cluster[clust].noofdistclust; i++){
+ dist.chr = list[chr].cluster[clust].distclustchr[i];
+ dist.clust = list[chr].cluster[clust].distclust[i];
+ /* distclusttype 1: the target is in the opposite list */
+ if (list[chr].cluster[clust].distclusttype[i] == 1){
+ dist.type = 1-type;
+ dist.cluster = &other[dist.chr].cluster[dist.clust];
+ }
+ else {
+ dist.type = type;
+ dist.cluster = &list[dist.chr].cluster[dist.clust];
+ }
+ dist.cnt = list[chr].cluster[clust].distclustcnt[i];
+
+ /* - 1 offset on median */
+ if (type == 0){
+ curlen = (last + 1 + 1 >= list[chr].cluster[clust].median) ?
+ last - list[chr].cluster[clust].median + 1 + 1 : 0;
+ curstart = list[chr].cluster[clust].median - 1;
+ }
+ else {
+ curlen = (list[chr].cluster[clust].median - 1 + 1 >= last)?
+ list[chr].cluster[clust].median - 1 - last + 1 : 0;
+ curstart = last;
+ }
+
+ /* skip before anything is allocated for this link */
+ if (margin <= curlen || curlen == 0){
+ continue;
+ }
+
+ /* - 1 offset on median */
+ if (dist.type == 0){
+ diststart = dist.cluster->median - 1;
+ }
+ else {
+ /* the comparison guards the unsigned subtraction below */
+ diststart = (dist.cluster->median - 1 + curlen + 1 > margin) ?
+ dist.cluster->median - 1 - margin + curlen + 1 : 0;
+ }
+ distlen = margin - curlen;
+
+ /*
+ * init spliced alignment slides with subidx,
+ * start (relative to subidx), length, and
+ * strand where the strand of the second slide
+ * is determined by the previous strand and the
+ * distclusttype: strand stays the same on R2L
+ * connections (ie. distclusttype==1) and changes
+ * otherwise
+ */
+ j = (list[chr].cluster[clust].distclusttype[i] == 1) ?
+ strand : 1-strand;
+
+ dist.mcsa = ALLOCMEMORY(space, NULL, MultiCharSeqAlignment*, 2);
+ dist.mcsa[0] = ALLOCMEMORY(space, NULL, MultiCharSeqAlignment, 1);
+ dist.mcsa[1] = ALLOCMEMORY(space, NULL, MultiCharSeqAlignment, 1);
+ dist.noofmcsa = 2;
+
+ /* order of alignment is dependent on current direction of extension */
+ bl_remappingInitAlign(space, dist.mcsa[(right) ? 0 : 1], chr, curstart, curlen, strand);
+ bl_remappingInitAlign(space, dist.mcsa[(right) ? 1 : 0], dist.chr, diststart, distlen, j);
+ /* the container receives the local struct by address */
+ bl_containerAdd(a, &dist);
+ }
+}
+
+/*
+ * collect candidates for all clusters of one list (L for type 0, R
+ * otherwise) on chromosome chr whose zero-based median lies within
+ * [start, end]. The candidates are appended to a by
+ * bl_remappingGetDist; afterwards the container is sorted with
+ * cmp_remappingclust_qsort and, if maxdist > 0, cut back to at most
+ * maxdist entries, the surplus being released with
+ * bl_remappingDestructClust.
+ */
+void bl_remappingGetRange(void *space, Container *a, matchsplitsiteclusterlist_t *L, matchsplitsiteclusterlist_t *R,
+    Uint type, Uint chr, Uint start, Uint end, unsigned char strand, Uint maxdist, unsigned char right)
+{
+  Uint idx, first, pos, anchor;
+  matchsplitsiteclusterlist_t *sites = (type == 0) ? L : R;
+
+  first = binarySearch_left(sites[chr].cluster, sites[chr].noofclusters, &start,
+      cmp_matchsplitsitecluster_bin, NULL);
+
+  /* the position handed on as 'last' depends on the list type only */
+  anchor = (type == 0) ? end : start;
+
+  for (idx = first; idx < sites[chr].noofclusters; idx++){
+    pos = sites[chr].cluster[idx].median - 1; /* - 1 offset on median */
+
+    if (pos < start){
+      continue;
+    }
+    if (pos > end){
+      break;
+    }
+
+    bl_remappingGetDist(space, a, L, R, type, chr, idx, strand, anchor, end-start+1, right);
+  }
+
+  qsort(a->contspace, bl_containerSize(a), sizeof(remappingclust_t), cmp_remappingclust_qsort);
+
+  /* keep the first maxdist candidates only */
+  if (maxdist > 0 && bl_containerSize(a) > maxdist){
+    idx = maxdist;
+    while (idx < bl_containerSize(a)){
+      bl_remappingDestructClust(bl_containerGet(a, idx));
+      idx++;
+    }
+    a->nextfree = maxdist;
+  }
+}
+
+/*
+ * collect candidates via the adjoint clusters of cluster clust: every
+ * adjoint cluster (taken from the opposite list on the same
+ * chromosome) whose median is at most margin away from the current
+ * median is expanded with bl_remappingGetDist. Afterwards a is sorted
+ * with cmp_remappingclust_qsort and, if maxdist > 0, cut back to at
+ * most maxdist entries, the surplus being released with
+ * bl_remappingDestructClust.
+ */
+void bl_remappingGetAdjoint(void *space, Container *a, matchsplitsiteclusterlist_t *L, matchsplitsiteclusterlist_t *R,
+    Uint type, Uint chr, Uint clust, Uint margin, unsigned char strand, Uint maxdist, unsigned char right){
+  Uint k, adj, anchor, pos, gap;
+  matchsplitsiteclusterlist_t *own, *opposite;
+
+  own = (type == 0) ? L : R;
+  opposite = (type == 0) ? R : L;
+
+  /* - 1 offset on median */
+  anchor = own[chr].cluster[clust].median - 1;
+
+  for (k = 0; k < own[chr].cluster[clust].noofadjclust; k++){
+    adj = own[chr].cluster[clust].adjclust[k];
+    pos = opposite[chr].cluster[adj].median - 1;
+
+    /* absolute distance between the two medians */
+    if (pos >= anchor){
+      gap = pos - anchor;
+    }
+    else {
+      gap = anchor - pos;
+    }
+
+    if (gap > margin){
+      continue;
+    }
+
+    bl_remappingGetDist(space, a, L, R, 1-type, chr, adj, strand,
+        own[chr].cluster[clust].median - 1, margin, right);
+  }
+
+  qsort(a->contspace, bl_containerSize(a), sizeof(remappingclust_t), cmp_remappingclust_qsort);
+
+  /* keep the first maxdist candidates only */
+  if (maxdist > 0 && bl_containerSize(a) > maxdist){
+    k = maxdist;
+    while (k < bl_containerSize(a)){
+      bl_remappingDestructClust(bl_containerGet(a, k));
+      k++;
+    }
+    a->nextfree = maxdist;
+  }
+}
+
+
+/*
+ * parse the seed information from a read description.
+ *
+ * The description is split at spaces; its last two tokens must read
+ *   <ustart>:<ulen>   and   <chromidx>:<vstart>:<strand>
+ * where all numbers are non-negative base-10 integers and strand is
+ * 0 or 1. uend is derived as ustart + ulen - 1.
+ *
+ * returns 1 on success and 0 on malformed input; on failure seed may
+ * have been filled partially.
+ *
+ * errno is cleared before every strtol call, because strtol only sets
+ * it on failure and a stale ERANGE would otherwise reject valid input.
+ * All string sets are released on every path.
+ */
+unsigned char bl_remappingExtractSeed(void *space, char *desc, Uint desclen, remappingseed_t *seed){
+  Uint i;
+  int val;
+  char *err;
+  unsigned char ok = 0;
+  stringset_t *token, *fields = NULL;
+
+  // split description by spaces,
+  // the last two entries should contain the
+  // seed information
+  token = tokensToStringset(space, " ", desc, desclen);
+  if (token->noofstrings < 3) {
+    DBG("less than 3 tokens separated by space found (%d)\n", token->noofstrings);
+    goto cleanup;
+  }
+
+  // split second last entry (query information of seed) by colon
+  i = token->noofstrings-2;
+  fields = tokensToStringset(space, ":", token->strings[i].str,
+      token->strings[i].len);
+  if (fields->noofstrings != 2) {
+    DBG("number of fields separated by ':' is wrong (found %d != 2)\n", fields->noofstrings);
+    goto cleanup;
+  }
+
+  errno = 0;
+  val = strtol(fields->strings[0].str, &err, 10);
+  if (val < 0 || errno == ERANGE || *err != 0) {
+    DBG("ustart out of range: %d\n", val);
+    goto cleanup;
+  }
+  seed->ustart = val;
+
+  errno = 0;
+  val = strtol(fields->strings[1].str, &err, 10);
+  if (val < 0 || errno == ERANGE || *err != 0) {
+    DBG("ulen out of range: %d\n", val);
+    goto cleanup;
+  }
+  seed->ulen = val;
+  seed->uend = seed->ustart + seed->ulen - 1;
+  destructStringset(space, fields);
+
+  // split last entry (reference information of seed) by colon
+  i++;
+  fields = tokensToStringset(space, ":", token->strings[i].str, token->strings[i].len);
+  if (fields->noofstrings != 3) {
+    DBG("number of fields separated by ':' is wrong (found %d != 3)\n", fields->noofstrings);
+    goto cleanup;
+  }
+
+  errno = 0;
+  val = strtol(fields->strings[0].str, &err, 10);
+  if (val < 0 || errno == ERANGE || *err != 0) {
+    DBG("chromidx out of range: %d\n", val);
+    goto cleanup;
+  }
+  seed->chromidx = val;
+
+  errno = 0;
+  val = strtol(fields->strings[1].str, &err, 10);
+  if (val < 0 || errno == ERANGE || *err != 0){
+    DBG("vstart out of range: %d\n", val);
+    goto cleanup;
+  }
+  seed->vstart = val;
+
+  errno = 0;
+  val = strtol(fields->strings[2].str, &err, 10);
+  if (val < 0 || val > 1 || errno == ERANGE || *err != 0) {
+    DBG("strand out of range: %d\n", val);
+    goto cleanup;
+  }
+  seed->strand = (unsigned char) (val & 1);
+  ok = 1;
+
+cleanup:
+  /* single exit: release whatever has been tokenized so far */
+  if (fields != NULL) {
+    destructStringset(space, fields);
+  }
+  destructStringset(space, token);
+  return ok;
+}
+
+
+/*
+ * report the fragments collected in res as one (split) match of
+ * read k.
+ *
+ * Every fragment is converted into a match list entry: reference and
+ * query intervals (the query interval is mirrored on the query length
+ * for strand == 1), score, edit distance, a copy of the alignment and
+ * the index/position/strand of the neighbouring fragments (-1 where
+ * there is no neighbour). Each fragment must meet the minimum fragment
+ * length and score; this is asserted.
+ *
+ * The match is only reported if the summed score reaches
+ * minsplicedalignscore and the summed aligned query length, as a
+ * percentage of the query length, reaches minsplicedaligncover.
+ *
+ * returns 1 if the match was reported, 0 otherwise; the match list is
+ * destructed in both cases.
+ */
+unsigned char bl_remappingReport(void *space, List *res, fasta_t *reads, Uint k, remapping_t *nfo){
+ Uint i, qrylen, ustart, uend, ulen, vstart, vend, vlen, edist, totalcover, totaledist, maxedist;
+ int previdx, prevpos, nextidx, nextpos, prevstrand, nextstrand, score, totalscore;
+ matchstatus_t pairStatus = QUERY;
+ gread_t read;
+ Gmap map;
+ gmatchlist_t *list=NULL;
+ Alignment *alcopy;
+ MultiCharSeqAlignment *mcsa, *prev, *next;
+
+ qrylen = bl_fastaGetSequenceLength(reads, k);
+ /* largest edit distance compatible with the required accuracy (percent) */
+ maxedist = qrylen - floor(((double)nfo->accuracy*qrylen)/100.);
+ list = bl_gmatchlistInit(space, maxedist, 0);
+ totalcover = 0; totalscore = 0; totaledist = 0;
+
+ for (i = 0; i < bl_listSize(res); i++){
+ mcsa = bl_listGetElem(res, i);
+
+ vstart = mcsa->refstart + mcsa->al->voff;
+ vlen = getValignlen(mcsa->al);
+ vend = vstart + vlen - 1;
+
+ ustart = mcsa->al->uoff;
+ ulen = getUalignlen(mcsa->al);
+ /* strand 1: mirror the query interval on the query length */
+ if (mcsa->strand == 1){
+ uend = qrylen - ustart - 1;
+ ustart = uend - ulen + 1;
+ }
+ else {
+ uend = ustart + ulen - 1;
+ }
+
+ score = getAlignScore(mcsa->al, nfo->scores, nfo->indel);
+ edist = getEdist(mcsa->al);
+
+ assert(ulen >= nfo->minfragmentalignlen &&
+ vlen >= nfo->minfragmentalignlen &&
+ score >= nfo->minfragmentalignscore);
+
+ totalcover += ulen;
+ totalscore += score;
+ totaledist += edist;
+
+ /* the match list receives its own copy of the alignment */
+ alcopy = ALLOCMEMORY(space, NULL, Alignment, 1);
+ copyAlignment(alcopy, mcsa->al);
+
+ previdx = -1;
+ prevpos = -1;
+ nextidx = -1;
+ nextpos = -1;
+ prevstrand = -1;
+ nextstrand = -1;
+
+ /* previous fragment: its last reference position on '+', first on '-' */
+ if (i > 0){
+ prev = bl_listGetElem(res, i - 1);
+ previdx = prev->subidx;
+ if (prev->strand == 0){
+ prevpos = prev->refstart + prev->al->voff + getValignlen(prev->al) - 1;
+ prevstrand = '+';
+ }
+ else {
+ prevpos = prev->refstart + prev->al->voff;
+ prevstrand = '-';
+ }
+ }
+
+ /* next fragment: its first reference position on '+', last on '-' */
+ if (i < bl_listSize(res) - 1){
+ next = bl_listGetElem(res, i + 1);
+ nextidx = next->subidx;
+ if (next->strand == 0){
+ nextpos = next->refstart + next->al->voff;
+ nextstrand = '+';
+ }
+ else {
+ nextpos = next->refstart + next->al->voff + getValignlen(next->al) - 1;
+ nextstrand = '-';
+ }
+ }
+
+ list = se_kdMatchListAdd(list, mcsa->subidx, vstart, vend, edist, score,
+ ustart, uend, .0, alcopy, mcsa->strand,
+ previdx, prevpos, prevstrand, nextidx, nextpos, nextstrand, i);
+
+ }
+
+ /* aligned query length as a percentage; qrylen must not be 0 here */
+ totalcover *= 100;
+ totalcover /= qrylen;
+
+ if (totalscore >= nfo->minsplicedalignscore &&
+ totalcover >= nfo->minsplicedaligncover){
+
+ initRead(&read, k);
+ initGmap(&map, nfo->seq, 1);
+ setReads(&map, &read, 1);
+
+ se_setMatches(space, &read, list, maxedist, &nfo->seinfo, 0);
+ reportMatch(space, &map, reads, &nfo->seinfo, pairStatus, 0);
+ se_destructMatches(space, &read);
+ bl_gmatchlistDestruct(space, list);
+ return 1;
+ }
+ else {
+ bl_gmatchlistDestruct(space, list);
+ return 0;
+ }
+}
+
+/*
+ * write read k to the nomatch device: as a FASTQ record if the read
+ * set carries qualities, as a FASTA record otherwise. Nothing is
+ * written for read sets with mates. With more than one thread the
+ * write and the flush are enclosed by locking and unlocking mtx2.
+ */
+void bl_remappingReportUnmapped (void *space, fasta_t *reads, Uint k, remapping_t *nfo){
+
+  if (bl_fastaHasMate(reads)) {
+    return;
+  }
+
+  if (nfo->threadno > 1) {
+    pthread_mutex_lock(nfo->seinfo.mtx2);
+  }
+
+  if (bl_fastaHasQuality(reads)){
+    fprintf(nfo->seinfo.nomatchdev, "@%s\n%s\n+%s\n%s\n",
+        bl_fastaGetDescription(reads, k), bl_fastaGetSequence(reads, k),
+        bl_fastaGetDescription(reads, k), bl_fastaGetQuality(reads, k));
+  } else {
+    fprintf(nfo->seinfo.nomatchdev, ">%s\n%s\n",
+        bl_fastaGetDescription(reads, k),
+        bl_fastaGetSequence(reads, k));
+  }
+  fflush(nfo->seinfo.nomatchdev);
+
+  if (nfo->threadno > 1) {
+    pthread_mutex_unlock(nfo->seinfo.mtx2);
+  }
+}
+
+void bl_remapping (void *space, MultiCharSeq *seq, fasta_t *reads,
+ matchsplitsiteclusterlist_t *L, matchsplitsiteclusterlist_t *R,
+ remapping_t *nfo){
+
+ Uint i, j, k, chr, type, clust, remainder, prevuoff, uoff, ulen,
+ vstart, margin, first, right;
+ Uint aligns, maxdist, extensions;
+ Container *dist;
+ List *res;
+ remappingclust_t *spliced, *unspliced, *distclust;
+ MultiCharSeqAlignment *mcsa;
+ unsigned char ret, strand, debug;
+ Uint len, desclen;
+ Uint maxedist;
+ char *seqs[2], *curseqs[2], *desc;
+ remappingseed_t seed;
+ remappingstatus_t status;
+
+ debug = 0;
+
+ for (k = 0; k < reads->noofseqs; k++){
+ aligns = 0;
+ extensions = 0;
+ maxdist = 0;
+
+ /* progressbar */
+ if(!nfo->mute) rm_updateProgressBar(k, nfo);
+
+ /* clip */
+
+ /* get seed information from descriptions */
+ desc = bl_fastaGetDescription(reads, k);
+ desclen = bl_fastaGetDescriptionLength(reads, k);
+
+ ret = bl_remappingExtractSeed(space, desc, desclen, &seed);
+ if (!ret || seed.chromidx >= seq->numofsequences){
+ DBG("Error in parsing seed information: '%s' (ret:%d, seed.chridx:%d[%d,%d] ? %d:numofseqs)\n", desc, ret, seed.chromidx, seed.chromstart, seed.chromend, seq->numofsequences);
+ exit(-1);
+ }
+ getMultiCharSeqIdxBounds(seq, seed.chromidx, &seed.chromstart, &seed.chromend);
+ //if (seed.vstart > seed.chromend){
+ // DBG("Error in parsing seed information: position (%u) of chromosome\n", seed.vstart);
+ // exit(-1);
+ //}
+
+ if (seed.ulen == 0){
+ /* update stats */
+ status = NO_SEED;
+ if (nfo->stat){
+ if (nfo->threadno > 1) pthread_mutex_lock(nfo->seinfo.mtx3);
+ rm_addStat(space, nfo->stat, status, aligns, extensions, maxdist);
+ if (nfo->threadno > 1) pthread_mutex_unlock(nfo->seinfo.mtx3);
+ }
+ /* output unmatched */
+ if (nfo->seinfo.nomatchdev){
+ bl_remappingReportUnmapped(space, reads, k, nfo);
+ }
+ continue;
+ }
+
+ seqs[0] = bl_fastaGetSequence(reads, k);
+ len = bl_fastaGetSequenceLength(reads, k);
+ seqs[1] = charDNAcomplement(space, seqs[0], len);
+
+ /* prefilters (eg entropy, #Ns, coverage of query by seed) */
+
+ maxedist = len-floor(((double)nfo->accuracy*len)/100.);
+
+ first = 1;
+ right = 1;
+
+ res = ALLOCMEMORY(space, NULL, List, 1);
+ bl_listInit(res, 10, sizeof(MultiCharSeqAlignment));
+
+ /* ============== EXTENSION ============== */
+ while (1){
+ dist = ALLOCMEMORY(space, NULL, Container, 1);
+ bl_containerInit(dist, 100, sizeof(remappingclust_t));
+
+ /* initializations */
+ if (first){
+ if (debug){
+ fprintf(stderr, "seed: seed.ustart=%u, seed.uend=%u, seed.ulen=%u, qrylen=%u, seed.chromidx=%u, seed.vstart=%u, seed.strand=%u\n", seed.ustart, seed.ustart+seed.ulen-1, seed.ulen, len, seed.chromidx, seed.vstart, seed.strand);
+ }
+
+ chr = seed.chromidx;
+ clust = -1;
+ prevuoff = 0;
+ uoff = 0;
+ ulen = len;
+ curseqs[0] = seqs[0];
+ curseqs[1] = seqs[1];
+ unspliced = NULL;
+
+ strand = seed.strand;
+
+ if (right){
+ type = (strand == 0) ? 1 : 0;
+
+ margin = len + maxedist;
+ if (strand != 0){
+ vstart = (seed.vstart + seed.ulen > margin) ?
+ seed.vstart + seed.ulen - margin : 0;
+ }
+ else {
+ vstart = seed.vstart;
+ }
+ }
+ else {
+ type = (strand == 0) ? 0 : 1;
+
+ /* update qry & ref info with first elem in res (= seed region) */
+ if (bl_listSize(res) > 0){
+ mcsa = bl_listGetElem(res, res->first);
+ assert(mcsa->strand == strand);
+
+ /*
+ * __________
+ * qry: ------|__________|-----
+ * mcsa
+ *
+ * \----- ulen -----/
+ */
+
+ if (strand == 0){
+ ulen = mcsa->al->uoff + getUalignlen(mcsa->al);
+ }
+ else {
+ ulen = mcsa->al->ulen - mcsa->al->uoff;
+ }
+ prevuoff = len - ulen;
+ curseqs[1] = &seqs[1][prevuoff];
+ margin = ulen + maxedist;
+
+ vstart = mcsa->refstart - mcsa->substart + mcsa->al->voff;
+ if (strand == 0){
+ vstart = (vstart + getValignlen(mcsa->al) > margin) ?
+ vstart + getValignlen(mcsa->al) - margin : 0;
+ }
+
+ /* if entire query is mapped --> no left extension necessary */
+ if (ulen - getUalignlen(mcsa->al) == 0){
+ break;
+ }
+
+ /* delete first elem in res (= seed region) */
+ mcsa = bl_listUnlink(res, res->first, NULL);
+ wrapMultiCharSeqAlignment(space, mcsa);
+ FREEMEMORY(space, mcsa);
+ }
+ else {
+ margin = 2 * (len + maxedist);
+ vstart = (seed.vstart > len + maxedist) ? seed.vstart - len - maxedist : 0;
+ }
+ }
+
+ // possible optimizations:
+ // - during right extension: switch to left extension if
+ // remainder of query right of seed is < minfragmentlen
+ // (assumes that there is no split within the seed region)
+
+
+ /*
+ * find R or L clusters with median in interval:
+ * [seed.vstart, seed.vstart+margin) if seed on plus
+ * (seed.vstart-margin, seed.vstart] otherwise
+ */
+ bl_remappingGetRange(space, dist, L, R, type, chr, vstart, vstart + margin - 1, strand, nfo->maxdist, right);
+
+ first = 0;
+ }
+ else {
+ bl_remappingGetAdjoint(space, dist, L, R, type, chr, clust, margin, strand, nfo->maxdist, right);
+ }
+
+ if (debug){
+ fprintf(stderr, "%s: type=%u, chr=%u, clust=%d, strand=%u, uoff=%u, ulen=%u, vstart=%u, margin=%u, noofdist=%u\n",
+ (right) ? "right" : "left", type, chr, clust, strand, uoff, ulen, vstart, margin, bl_containerSize(dist));
+ }
+
+ /* reporting stuff */
+ if (0){
+ for (i = 0; i < bl_containerSize(dist); i++){
+ distclust = bl_containerGet(dist, i);
+ fprintf(stderr, "dist: %u, type=%u, chr=%u, clust=%d, a=%u, b=%u, median=%u\n", i,
+ distclust->type, distclust->chr, distclust->clust, distclust->cluster->a - 1,
+ distclust->cluster->b - 1, distclust->cluster->median - 1);// - 1 offset due to clusterlist
+ for (j = 0; j < distclust->noofmcsa; j++){
+ fprintf(stderr, " mcsa[%u]: chr=%u, start=%u, end=%u, len=%u, strand=%u\n",
+ j, distclust->mcsa[j]->subidx, distclust->mcsa[j]->refstart,
+ distclust->mcsa[j]->refstart + distclust->mcsa[j]->reflen - 1,
+ distclust->mcsa[j]->reflen, distclust->mcsa[j]->strand);
+ }
+ }
+ }
+
+ /* extend by spliced alignment */
+ spliced = NULL;
+ if (bl_containerSize(dist) > 0){
+
+ /* update ref & query info in alignments */
+ for (i = 0; i < bl_containerSize(dist); i++){
+ distclust = bl_containerGet(dist, i);
+ bl_remappingUpdateAlignSeqs(space, seq, curseqs, ulen, distclust);
+ }
+
+ /* calculate spliced alignments => get best one */
+ spliced = bl_remappingAlign(space, curseqs, dist, nfo);
+ aligns += bl_containerSize(dist);
+ if (maxdist < bl_containerSize(dist))
+ maxdist = bl_containerSize(dist);
+
+ /* convert from relative query positions to absolute ones */
+ if (spliced != NULL){
+ bl_remappingUpdateAlignOff(space, seqs, len, spliced, prevuoff, right);
+ }
+ }
+
+ /* extend by unspliced alignment ==> only necessary once (at the beginning) */
+ if (unspliced == NULL){
+
+ /* no valid spliced alignment during initial extension */
+ if (spliced == NULL && bl_listSize(res) == 0){
+ /*
+ * continue with left extension if only unspliced
+ * extension is possible since it will be recalculated
+ * during left extension anyway
+ */
+ if (right){
+ bl_containerDestruct(dist, bl_remappingDestructClust);
+ FREEMEMORY(space, dist);
+
+ right = 0;
+ first = 1;
+ continue;
+ }
+ /*
+ * break if no spliced left extension is possible and no
+ * right extension was successful and hence the entire query
+ * would only be mapped unspliced which was already done in
+ * the initial mapping => do not waste time on this!
+ */
+ else {
+ break;
+ }
+ }
+
+ unspliced = bl_remappingUnsplicedAlign(space, seq, curseqs, ulen, chr, vstart,
+ vstart + margin - 1, strand, nfo);
+ aligns++;
+
+ if (unspliced != NULL){
+ bl_remappingUpdateAlignOff(space, seqs, len, unspliced, prevuoff, right);
+ }
+ }
+
+ /*
+ * cases:
+ * - spliced set and unspliced not set or is worse than spliced => extend further
+ * - unspliced set and spliced not or is worse than unspliced => last extension
+ * - unspliced and spliced not set => no extension
+ */
+
+ /* if best is set and spliced is better than unspliced */
+ if (spliced != NULL && (unspliced == NULL || spliced->score > unspliced->score)){
+
+ if (debug){
+ fprintf(stderr, "%s: spliced extension: qryoff=%u, qrylen=%u, vstart=%u, margin=%u, strand=%u\n",
+ (right)? "right" : "left", uoff, ulen, vstart, margin, strand);
+ bl_remappingPrintClust(stderr, spliced, nfo);
+ }
+ extensions++;
+
+ /*
+ * keep information of best split and
+ * update variables
+ */
+ type = spliced->type;
+ chr = spliced->chr;
+ clust = spliced->clust;
+ if (right) {
+ /*
+ * in right extension:
+ * /-rem-\
+ * __________ ________
+ * qry: ----|__________|-----|________|-------
+ * mcsa[0] mcsa[1]
+ *
+ * \----- off ----/\------- ulen -------/
+ */
+
+ strand = spliced->mcsa[1]->strand;
+
+ /* update qry info (with spliced->mcsa[0]) */
+ if (spliced->mcsa[0]->strand == 0){
+ ulen = spliced->mcsa[0]->al->ulen - spliced->mcsa[0]->al->uoff - getUalignlen(spliced->mcsa[0]->al);
+ }
+ else {
+ ulen = spliced->mcsa[0]->al->uoff;
+ }
+ uoff = len - ulen;
+ curseqs[0] = &seqs[0][uoff];
+
+ margin = ulen + maxedist;
+
+ /* calculate remainder of query (with spliced->mcsa[1]) */
+ if (spliced->mcsa[1]->strand == 0){
+ remainder = spliced->mcsa[1]->al->ulen - spliced->mcsa[1]->al->uoff - getUalignlen(spliced->mcsa[1]->al);
+ }
+ else {
+ remainder = spliced->mcsa[1]->al->uoff;
+ }
+
+ /* update ref info (with spliced->mcsa[1]) */
+ vstart = spliced->mcsa[1]->refstart - spliced->mcsa[1]->substart + spliced->mcsa[1]->al->voff;
+ if (strand != 0){
+ vstart = (vstart + getValignlen(spliced->mcsa[1]->al) > margin) ?
+ vstart + getValignlen(spliced->mcsa[1]->al) - margin : 0;
+ }
+ }
+ else {
+
+ /*
+ * in left extension:
+ * /-rem-\
+ * __________ ________
+ * qry: ------|__________|-----|________|------
+ * mcsa[0] mcsa[1]
+ *
+ * \--------- ulen ------/\---- uoff ----/
+ */
+ strand = spliced->mcsa[0]->strand;
+
+ /* update qry info (with spliced->mcsa[1]) */
+ if (spliced->mcsa[1]->strand == 0){
+ ulen = spliced->mcsa[1]->al->uoff;
+ }
+ else {
+ ulen = spliced->mcsa[1]->al->ulen - spliced->mcsa[1]->al->uoff - getUalignlen(spliced->mcsa[1]->al);
+ }
+ uoff = len - ulen;
+ curseqs[1] = &seqs[1][uoff];
+ margin = ulen + maxedist;
+
+ /* calculate remainder of query (with spliced->mcsa[0]) */
+ if (spliced->mcsa[0]->strand == 0){
+ remainder = spliced->mcsa[0]->al->uoff;
+ }
+ else {
+ remainder = spliced->mcsa[0]->al->ulen - spliced->mcsa[0]->al->uoff - getUalignlen(spliced->mcsa[0]->al);
+ }
+
+ /* update ref info (with spliced->mcsa[0]) */
+ vstart = spliced->mcsa[0]->refstart - spliced->mcsa[0]->substart + spliced->mcsa[0]->al->voff;
+ if (strand == 0){
+ vstart = (vstart + getValignlen(spliced->mcsa[0]->al) > margin) ?
+ vstart + getValignlen(spliced->mcsa[0]->al) - margin : 0;
+ }
+ }
+
+
+ /*
+ * store first alignment at end in case of right
+ * and second alignment at begin in case of left extension
+ */
+ if (right){
+ bl_listInsert(res, res->last, spliced->mcsa[0]);
+ FREEMEMORY(space, spliced->mcsa[0]);
+ spliced->mcsa[0] = NULL;
+ }
+ else {
+ bl_listInsert(res, -1, spliced->mcsa[1]);
+ FREEMEMORY(space, spliced->mcsa[1]);
+ spliced->mcsa[1] = NULL;
+ }
+
+ if (debug){
+ fprintf(stderr, "%s: ulen=%u, uoff=%u, remainder=%u\n", (right)? "right" : "left",
+ ulen, uoff, remainder);
+ }
+
+ /* store other alignment if remainder is too short for another split */
+ if (remainder < nfo->minfragmentalignlen){
+ if (right){
+ bl_listInsert(res, res->last, spliced->mcsa[1]);
+ FREEMEMORY(space, spliced->mcsa[1]);
+ spliced->mcsa[1] = NULL;
+ }
+ else {
+ bl_listInsert(res, -1, spliced->mcsa[0]);
+ FREEMEMORY(space, spliced->mcsa[0]);
+ spliced->mcsa[0] = NULL;
+ }
+
+ if (unspliced != NULL){
+ bl_remappingDestructClust(unspliced);
+ FREEMEMORY(space, unspliced);
+ unspliced = NULL;
+ }
+
+ /* change from right extension to left one */
+ if (right){
+ bl_containerDestruct(dist, bl_remappingDestructClust);
+ FREEMEMORY(space, dist);
+
+ right = 0;
+ first = 1;
+ continue;
+ }
+
+ break;
+ }
+
+ /*
+ * store second alignment as unspliced for next extension step in case of right
+ * and first alignment in case of left extension
+ */
+ if (unspliced != NULL){
+ bl_remappingDestructClust(unspliced);
+ }
+ else {
+ unspliced = ALLOCMEMORY(space, NULL, remappingclust_t, 1);
+ unspliced->chr = 0;
+ unspliced->type = 0;
+ unspliced->clust = -1;
+ unspliced->cluster = NULL;
+ }
+ unspliced->mcsa = ALLOCMEMORY(space, NULL, MultiCharSeqAlignment*, 1);
+ unspliced->noofmcsa = 1;
+ if (right){
+ unspliced->mcsa[0] = spliced->mcsa[1];
+ unspliced->score = getAlignScore(spliced->mcsa[1]->al, nfo->scores, nfo->indel);
+ spliced->mcsa[1] = NULL;
+ }
+ else {
+ unspliced->mcsa[0] = spliced->mcsa[0];
+ unspliced->score = getAlignScore(spliced->mcsa[0]->al, nfo->scores, nfo->indel);
+ spliced->mcsa[0] = NULL;
+ }
+
+ /* prepare next extension */
+ prevuoff = uoff;
+ bl_containerDestruct(dist, bl_remappingDestructClust);
+ FREEMEMORY(space, dist);
+
+ continue;
+ }
+ else if (unspliced != NULL){
+
+ if (debug){
+ fprintf(stderr, "%s: unspliced extension: qryoff=%u, qrylen=%u, vstart=%u, margin=%u, strand=%u\n",
+ (right)? "right" : "left", uoff, ulen, vstart, margin, strand);
+ bl_remappingPrintClust(stderr, unspliced, nfo);
+ }
+ extensions++;
+
+ /* store unspliced alignment */
+ bl_listInsert(res, (right) ? res->last : -1, unspliced->mcsa[0]);
+ FREEMEMORY(space, unspliced->mcsa[0]);
+ unspliced->mcsa[0] = NULL;
+ }
+ else {
+ if (debug){
+ fprintf(stderr, "%s: no extension: qryoff=%u, qrylen=%u, vstart=%u, margin=%u, strand=%u\n",
+ (right)? "right" : "left", uoff, ulen, vstart, margin, strand);
+ }
+ }
+
+ if (unspliced != NULL) {
+ bl_remappingDestructClust(unspliced);
+ FREEMEMORY(space, unspliced);
+ unspliced = NULL;
+ }
+
+ /* change from right extension to left one */
+ if (right){
+ bl_containerDestruct(dist, bl_remappingDestructClust);
+ FREEMEMORY(space, dist);
+
+ right = 0;
+ first = 1;
+ continue;
+ }
+
+ break;
+ }
+
+ /* report stuff */
+ bl_listSweep(res);
+
+ if (bl_listSize(res) > 1){
+ ret = bl_remappingReport(space, res, reads, k, nfo);
+ /* update stats */
+ status = ret ? REMAPPED : PURGED;
+ if (nfo->stat){
+ if (nfo->threadno > 1) pthread_mutex_lock(nfo->seinfo.mtx3);
+ rm_addStat(space, nfo->stat, status, aligns, extensions, maxdist);
+ if (nfo->threadno > 1) pthread_mutex_unlock(nfo->seinfo.mtx3);
+ }
+ if (!ret){
+ /* output unmatched */
+ if (nfo->seinfo.nomatchdev){
+ bl_remappingReportUnmapped(space, reads, k, nfo);
+ }
+ }
+ }
+ else {
+ /* update stats */
+ status = (aligns == 0) ? NO_DIST : UNSPLICED;
+ if (nfo->stat){
+ if (nfo->threadno > 1) pthread_mutex_lock(nfo->seinfo.mtx3);
+ rm_addStat(space, nfo->stat, status, aligns, extensions, maxdist);
+ if (nfo->threadno > 1) pthread_mutex_unlock(nfo->seinfo.mtx3);
+ }
+ /* output unmatched */
+ if (nfo->seinfo.nomatchdev){
+ bl_remappingReportUnmapped(space, reads, k, nfo);
+ }
+ }
+
+ /* clear stuff */
+ bl_containerDestruct(dist, bl_remappingDestructClust);
+ FREEMEMORY(space, dist);
+
+ for (i = 0; i < bl_listSize(res); i++){
+ mcsa = bl_listGetElem(res, i);
+ wrapMultiCharSeqAlignment(space, mcsa);
+ }
+ bl_listDestruct(res, NULL);
+ FREEMEMORY(space, res);
+
+ FREEMEMORY(space, seqs[1]);
+ }
+}
+
+#ifdef REMAPPINGTEST
+
+unsigned char mute = 0; /* NOTE(review): presumably read by the MSG/NFO logging macros; main itself tests nfo.mute — confirm */
+char *ntcode; /* assigned from getNTcodekey() in main and released at the end of main */
+pthread_mutex_t updatemtx; /* locked by main before the workers start; checkclock polls it with trylock and stops once main unlocks it */
+
+/*-------------------------------- checkclock --------------------------------
+ *
+ * @brief thread routine that redraws the progress bar until updatemtx can
+ *        be acquired
+ * @param args pointer to a checkthreadinfo_t (noofseqs and counter are read)
+ * @return always NULL
+ *
+ */
+
+void*
+checkclock(void *args) {
+  checkthreadinfo_t *info;
+
+  sleep(2);
+  cursorVisible();
+  info = (checkthreadinfo_t*) args;
+  initProgressBarVT();
+
+  /* every failed trylock triggers one redraw; the first success ends the loop */
+  for (;;) {
+    if (pthread_mutex_trylock(&updatemtx) == 0) {
+      break;
+    }
+    progressBarVT("reads remapped.", info->noofseqs, (*info->counter), 25);
+  }
+
+  cursorVisible();
+  fprintf(stderr, "\n");
+
+  return NULL;
+}
+
+/*----------------------------------- main -----------------------------------
+ *
+ * @brief test driver (compiled with REMAPPINGTEST): parses the command line,
+ *        reads the database sequences and the split site clusters of the
+ *        query files, remaps the reads of the -r file (threaded if -t > 1)
+ *        and releases all resources
+ * @return EXIT_SUCCESS; the process exits with EXIT_FAILURE if the command
+ *         line cannot be captured, the no-match file cannot be opened or
+ *         more threads than reads are requested
+ *
+ */
+
+int main(int argc, char **argv) {
+
+  remapping_t nfo, *thnfo;
+
+  manopt_optionset optset;
+  manopt_arg *unflagged;
+  manopt_arg *queries;
+  manopt_arg *dbfilenames;
+  manopt_intconstraint threadconstraint;
+  manopt_intconstraint accuracyconstraint;
+
+  Uint counter=0, desclen;
+  matchsplitsiteclusterlist_t *L = NULL, *R = NULL;
+  matchfileindex_t *index;
+  matchfile_t **files = NULL;
+  unsigned char gzip = 0, indexreads=1;
+  char *version, *desc, *cmdline;
+  int i;
+  Uint nchr;
+  Uint prefixlen=0;
+  double difmatch;
+  time_t startmatch, endmatch;
+  fasta_t **chopsuey;
+  pthread_t *threads;
+  pthread_t clockthread;
+  checkthreadinfo_t ch_info;
+
+  threadconstraint.max = 3000;
+  threadconstraint.min = 1;
+  accuracyconstraint.max = 100;
+  accuracyconstraint.min = 0;
+
+  /* set default settings */
+  rm_setdefault(&nfo);
+
+  /* init stat struct => comment out to disable */
+  //nfo.stat = ALLOCMEMORY(nfo.space, NULL, remappingstat_t, 1);
+  //rm_initStat(nfo.stat);
+
+  /* capture command: argv joined by single blanks */
+  nfo.seinfo.cmdline = malloc(strlen(argv[0])+1);
+  if (nfo.seinfo.cmdline == NULL) {
+    fprintf(stderr, "out of memory while capturing the command line. Exit forced\n");
+    exit(EXIT_FAILURE);
+  }
+  strcpy(nfo.seinfo.cmdline, argv[0]);
+
+  for(i = 1; i < argc; i++) {
+    /* +2: separating blank and terminating 0; keep the old pointer on failure */
+    cmdline = realloc(nfo.seinfo.cmdline, strlen(nfo.seinfo.cmdline) +
+        strlen(argv[i]) + 2);
+    if (cmdline == NULL) {
+      fprintf(stderr, "out of memory while capturing the command line. Exit forced\n");
+      free(nfo.seinfo.cmdline);
+      exit(EXIT_FAILURE);
+    }
+    nfo.seinfo.cmdline = cmdline;
+    strcat(nfo.seinfo.cmdline," ");
+    strcat(nfo.seinfo.cmdline,argv[i]);
+  }
+
+  initIUPAC(1,1);
+  version = getNiceSVNVersion(VERSION);
+  manopt_initoptionset(&optset, argv[0], NULL,
+      "Remapping of unmapped reads\n",
+      "SEGEMEHL is free software for non-commercial use \n (C) 2012 Bioinformatik Leipzig\n",
+      version,
+      "Please report bugs to christian at bioinf.uni-leipzig.de");
+  manopt(&optset, LISTOPT, 1, 'd', "database",
+      "list of path/filename(s) of database sequence(s)", "<file> [<file> ...]",
+      NULL, NULL);
+  manopt(&optset, LISTOPT, 1, 'q', "query",
+      "path/filename of alignment file", "<file> [<file> ...]", NULL, NULL);
+  manopt(&optset, REQSTRINGOPT, 0, 'o', "outfile",
+      "outputfile", "<string>", NULL, &nfo.outfile);
+  manopt(&optset, STRINGOPT, 1, 'r', "remapfilename",
+      "filename for reads to be remapped", "<file>", NULL, &nfo.remapfile);
+  manopt(&optset, REQSTRINGOPT, 0, 'u', "nomatchfilename",
+      "filename for unmatched reads", "<file>", NULL, &nfo.nomatchfile);
+  manopt(&optset, REQUINTOPT, 0, 't', "threads",
+      "start <n> threads for remapping", "<n>",
+      &threadconstraint, &nfo.threadno);
+  manopt(&optset, FLAG, 0, 's', "silent",
+      "shut up!", NULL, NULL, &nfo.mute);
+  manopt(&optset, REQUINTOPT, 0, 'A', "accuracy",
+      "min percentage of matches per read in semi-global alignment", "<n>", &accuracyconstraint, &nfo.accuracy);
+  manopt(&optset, REQUINTOPT, 0, 'W', "minsplicecover",
+      "min coverage for spliced transcripts", "<n>", &accuracyconstraint, &nfo.minsplicedaligncover);
+  manopt(&optset, REQUINTOPT, 0, 'U', "minfragscore",
+      "min score of a spliced fragment", "<n>", NULL, &nfo.minfragmentalignscore);
+  manopt(&optset, REQUINTOPT, 0, 'Z', "minfraglen",
+      "min length of a spliced fragment", "<n>", NULL, &nfo.minfragmentalignlen);
+  manopt(&optset, REQUINTOPT, 0, 'M', "maxdist",
+      "max number of distant sites to consider, 0 to disable", "<n>", NULL, &nfo.maxdist);
+
+  unflagged = manopt_getopts(&optset, argc, argv);
+
+  if(unflagged->noofvalues > 1) {
+    manopt_help(&optset, "unknown argument(s)\n");
+  }
+
+  MSG("reading database sequences.\n");
+
+  dbfilenames = manopt_getarg(&optset, 'd', "database");
+  nfo.fasta = bl_fastxGetSet(nfo.space, dbfilenames->values,
+      dbfilenames->noofvalues, 1, 0, 0, 1);
+
+  /* replace every description by its strclip()ed version */
+  for(i=0; i < nfo.fasta->noofseqs; i++) {
+    desclen = bl_fastaGetDescriptionLength(nfo.fasta, i);
+    desc = strclip(nfo.space, bl_fastaGetDescription(nfo.fasta, i), &desclen);
+    FREEMEMORY(nfo.space, nfo.fasta->seqs[i]->description);
+    nfo.fasta->seqs[i]->description = desc;
+    nfo.fasta->seqs[i]->descrlen = desclen;
+  }
+
+  NFO("%d database sequences found.\n", nfo.fasta->noofseqs);
+  MSG("reading query files.\n");
+
+  queries = manopt_getarg(&optset, 'q', "query");
+  if(queries->noofvalues > 30) {
+    manopt_help(&optset, "currently no more than 30 query files allowed\n");
+  }
+
+  ntcode = getNTcodekey(nfo.space);
+  files = ALLOCMEMORY(nfo.space, NULL, matchfile_t*, queries->noofvalues);
+
+  //using index structure only to carry the chr idx
+  index = bl_matchfileInitIndex(nfo.space);
+
+  nchr = nfo.fasta->noofseqs;
+  for(i=0; i < nchr; i++) {
+    bl_matchfileIndexAddChrom(index, bl_fastaGetDescription(nfo.fasta, i));
+  }
+
+  for(i=0; i < queries->noofvalues; i++) {
+
+    files[i] = ALLOCMEMORY(nfo.space, NULL, matchfile_t, 1);
+    files[i]->fmt = 0;
+    files[i]->index = index;
+    files[i]->filename = queries->values[i];
+
+    prefixlen = bl_fileprefixlen(files[i]->filename);
+
+    /*
+     * both comparisons look at 3 characters only, i.e. any suffix
+     * starting with ".gz" marks the file as gzipped
+     */
+    gzip = 0;
+    if(strncmp(&files[i]->filename[prefixlen], ".gz", 3) == 0 ||
+        strncmp(&files[i]->filename[prefixlen], ".gzip", 3) == 0) {
+      gzip = 1;
+    }
+
+    files[i]->gzip = gzip;
+  }
+
+  L = ALLOCMEMORY(nfo.space, NULL, matchsplitsiteclusterlist_t,nchr);
+  memset(L, 0, sizeof(matchsplitsiteclusterlist_t)*nchr);
+  R = ALLOCMEMORY(nfo.space, NULL, matchsplitsiteclusterlist_t, nchr);
+  memset(R, 0, sizeof(matchsplitsiteclusterlist_t)*nchr);
+
+  for(i=0; i < queries->noofvalues; i++) {
+    bl_matchfileRealignScanFileNew(nfo.space, files[i], NULL, nfo.fasta, 255, &L, &R, &nchr, 1,0);
+  }
+
+  bl_matchLinkAdjoinedCluster(nfo.space, L, R, nchr);
+  bl_matchLinkDistCluster (nfo.space, R, L, nchr);
+
+  nfo.seq = concatCharSequences(nfo.space, nfo.fasta->seqs, nfo.fasta->noofseqs, (char)126, (char)127);
+  if(indexreads) {
+    nfo.reads = bl_fastxGetSet(nfo.space, &nfo.remapfile, 1, 1, 0, 1, nfo.threadno);
+  } else {
+    nfo.reads = bl_fastxRead(nfo.space, nfo.reads, nfo.remapfile, 1, 0, 0, 0, 0, bl_fastxAdd);
+  }
+  NFO("%d unmatched read sequences found.\n", nfo.reads->noofseqs);
+
+  if (nfo.threadno > nfo.reads->noofseqs) {
+    NFO("more threads than unmapped reads. Exit forced\n", NULL);
+    exit(EXIT_FAILURE);
+  }
+
+  nfo.L = L;
+  nfo.R = R;
+  nfo.seinfo.outfile = nfo.outfile;
+  nfo.seinfo.fasta = nfo.fasta;
+  nfo.seinfo.rep_type = 15;
+  nfo.seinfo.threadno = nfo.threadno;
+
+  se_registerOutputDevice(nfo.space, &nfo.seinfo);
+
+  if(nfo.nomatchfile != NULL) {
+    nfo.seinfo.nomatchdev = fopen(nfo.nomatchfile, "w");
+    /*
+     * a failed open would otherwise drop all unmatched reads silently
+     * and end in fclose(NULL) during cleanup
+     */
+    if (nfo.seinfo.nomatchdev == NULL) {
+      fprintf(stderr, "Couldn't open file '%s' for writing. Exit forced\n",
+          nfo.nomatchfile);
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  if (nfo.threadno > 1){
+    nfo.counter = &counter;
+
+    if(indexreads) {
+      chopsuey = bl_fastxChopIndex(nfo.space, nfo.reads, nfo.threadno);
+    } else {
+      chopsuey = bl_fastaChop(nfo.space, nfo.reads, nfo.threadno);
+    }
+    thnfo = ALLOCMEMORY(nfo.space, NULL, remapping_t, nfo.threadno);
+    threads = ALLOCMEMORY(nfo.space, NULL, pthread_t, nfo.threadno);
+    ch_info.noofseqs = nfo.reads->noofseqs;
+    ch_info.counter = &counter;
+
+    /* the clock thread runs for as long as main holds updatemtx */
+    if (!nfo.mute) {
+      pthread_mutex_init(&updatemtx, NULL);
+      pthread_mutex_lock(&updatemtx);
+      pthread_create(&clockthread, NULL, checkclock, &ch_info);
+    }
+
+    for(i=0; i < nfo.threadno; i++) {
+      NFO("%d reads in thread %d.\n", chopsuey[i]->noofseqs, i);
+    }
+
+    time (&startmatch);
+
+    /* every worker gets its own copy of nfo with its chunk of reads */
+    for(i=0; i < nfo.threadno; i++) {
+      memmove(&thnfo[i], &nfo, sizeof(remapping_t));
+      thnfo[i].reads = chopsuey[i];
+      thnfo[i].threadid = i;
+      pthread_create(&threads[i], NULL, remappingworker, &thnfo[i]);
+    }
+
+    for(i=0; i < nfo.threadno; i++) {
+      pthread_join(threads[i], NULL);
+    }
+
+    if(!nfo.mute) {
+      /*notification via mutex - why use signals?*/
+      pthread_mutex_unlock(&updatemtx);
+      pthread_join(clockthread, NULL);
+    }
+
+    fflush(nfo.seinfo.dev);
+    time (&endmatch);
+    difmatch = difftime (endmatch, startmatch);
+    NFO("threaded remapping has taken %f seconds.\n", difmatch);
+
+    for (i=0; i < nfo.threadno; i++) {
+      bl_fastxDestructSequence(nfo.space, chopsuey[i]);
+      bl_fastxDestructChunkIndex(nfo.space, chopsuey[i]);
+      FREEMEMORY(nfo.space, chopsuey[i]);
+    }
+
+    FREEMEMORY(nfo.space, chopsuey);
+    FREEMEMORY(nfo.space, thnfo);
+    FREEMEMORY(nfo.space, threads);
+  }
+  else {
+    initProgressBarVT();
+    time (&startmatch);
+    bl_remapping(nfo.space, nfo.seq, nfo.reads, nfo.L, nfo.R, &nfo);
+    time (&endmatch);
+    difmatch = difftime (endmatch, startmatch);
+    NFO("remapping has taken %f seconds.\n", difmatch);
+  }
+
+  /* one line per recorded read: status, aligns, extensions, maxdist */
+  if (nfo.stat){
+    for (i = 0; i < nfo.stat->n; i++){
+      fprintf(stderr, "%d\t%u\t%u\t%u\n", nfo.stat->status[i],
+          nfo.stat->aligns[i], nfo.stat->extensions[i],
+          nfo.stat->maxdist[i]);
+    }
+    NFO("stats of %d elements\n", nfo.stat->n);
+  }
+
+  if (nfo.outfile)
+    fclose(nfo.seinfo.dev);
+
+  if(nfo.nomatchfile)
+    fclose(nfo.seinfo.nomatchdev);
+
+  /* clear stuff */
+  bl_fastaDestruct(nfo.space, nfo.reads);
+  FREEMEMORY(nfo.space, nfo.reads);
+
+  destructMultiCharSeq(nfo.space, nfo.seq);
+
+  if (nfo.stat){
+    rm_destructStat(nfo.space, nfo.stat);
+    FREEMEMORY(nfo.space, nfo.stat);
+  }
+
+  FREEMEMORY(nfo.space, nfo.seinfo.mtx);
+  FREEMEMORY(nfo.space, nfo.seinfo.mtx2);
+  FREEMEMORY(nfo.space, nfo.seinfo.mtx3);
+  FREEMEMORY(nfo.space, nfo.seinfo.cmdline);
+
+  bl_matchDestructMatchsplitsiteclusterlist(nfo.space, L, nchr);
+  bl_matchDestructMatchsplitsiteclusterlist(nfo.space, R, nchr);
+  FREEMEMORY(nfo.space, L);
+  FREEMEMORY(nfo.space, R);
+
+  bl_fastaDestruct(nfo.space, nfo.fasta);
+  FREEMEMORY(nfo.space, nfo.fasta);
+  bl_matchfileDestructIndex(nfo.space, index);
+  FREEMEMORY(nfo.space, index);
+
+  if(files) {
+    for(i=0; i < queries->noofvalues; i++) {
+      FREEMEMORY(nfo.space, files[i]);
+    }
+    FREEMEMORY(nfo.space, files);
+  }
+
+  manopt_destructoptionset(&optset);
+  manopt_destructarg(unflagged);
+  FREEMEMORY(nfo.space, unflagged);
+
+  FREEMEMORY(nfo.space, ntcode);
+  FREEMEMORY(nfo.space, version);
+
+  return EXIT_SUCCESS;
+}
+
+
+
+#endif
diff --git a/segemehl/libs/remapping.h b/segemehl/libs/remapping.h
new file mode 100644
index 0000000..97c0c8a
--- /dev/null
+++ b/segemehl/libs/remapping.h
@@ -0,0 +1,146 @@
+#ifndef REMAPPING_H
+#define REMAPPING_H
+
+/**
+ *
+ * remapping.h
+ * remapping of unmapped reads
+ *
+ * @author Christian Otto, christian at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Fri Oct 12 09:55:49 CEST 2012
+ *
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 400 $
+ * Author: $Author: steve $
+ * Date: $Date: 2013-09-11 03:46:50 -0400 (Wed, 11 Sep 2013) $
+ * Id: $Id: remapping.h 400 2013-09-11 07:46:50Z steve $
+ * Url: $URL: http://www2.bioinf.uni-leipzig.de/svn5/segemehl/libs/remapping.h $
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "basic-types.h"
+
+typedef struct { /* seed information of one read; bl_remappingExtractSeed() takes a pointer to it */
+ Uint ustart; /* ustart/uend/ulen: presumably start, end and length of the seed on the query — TODO confirm */
+ Uint uend;
+ Uint ulen;
+ Uint vstart; /* reference position of the seed; the remapping loop derives its first search window from it */
+ Uint chromstart; /* chromstart/chromend/chromidx: presumably bounds and index of the seed's chromosome — TODO confirm */
+ Uint chromend;
+ Uint chromidx;
+ unsigned char strand; /* NOTE(review): looks like 0 denotes the plus strand — confirm */
+} remappingseed_t;
+
+typedef struct { /* candidate split site cluster together with the alignments computed for it */
+ matchsplitsitecluster_t *cluster; /* underlying cluster; NULL for the unspliced pseudo cluster built in remapping.c */
+ Uint type; /* type/chr/clust of the best spliced candidate are carried over into the next extension step */
+ Uint chr;
+ int clust; /* set to -1 for the unspliced pseudo cluster */
+ Uint cnt; /* NOTE(review): not referenced in the code visible here — confirm meaning */
+ MultiCharSeqAlignment **mcsa; /* array of noofmcsa alignments; spliced candidates are accessed as mcsa[0] and mcsa[1] */
+ Uint noofmcsa;
+ int score; /* compared between spliced and unspliced candidates, the higher one wins */
+} remappingclust_t;
+
+typedef enum { /* per-read outcome recorded through rm_addStat() */
+ REMAPPED, /* successfully remapped (bl_remappingReport returned non-zero) */
+ PURGED, /* discarded due to error boundaries (bl_remappingReport returned zero) */
+ NO_SEED, /* no best seed information given */
+ NO_DIST, /* no splice sites in proximity found (no alignment was attempted) */
+ UNSPLICED /* no extension possible (at most one alignment in the result list) */
+} remappingstatus_t;
+
+typedef struct { /* parallel arrays of n entries each; the test driver prints entry i of all four on one line */
+ remappingstatus_t *status; /* outcome of the read */
+ Uint *aligns; /* number of alignment computations counted for the read */
+ Uint *extensions; /* number of accepted spliced or unspliced extensions */
+ Uint *maxdist; /* largest size of the candidate container seen for the read */
+ Uint n; /* number of recorded entries */
+} remappingstat_t;
+
+typedef struct { /* configuration and shared state of a remapping run; defaults are set by rm_setdefault() */
+ void *space; /* handle passed as first argument to the memory macros and library calls */
+ char *remapfile; /* option -r: file with the reads to be remapped */
+ char *outfile; /* option -o: output file */
+ char *nomatchfile; /* option -u: file for reads that stay unmatched */
+ matchsplitsiteclusterlist_t *L; /* L and R: split site cluster lists, one entry per chromosome */
+ matchsplitsiteclusterlist_t *R;
+ MultiCharSeq *seq; /* concatenated database sequences */
+ fasta_t *fasta; /* database sequences */
+ fasta_t *reads; /* reads to remap; every worker thread gets its own chunk */
+ Uint threadno; /* option -t: number of threads */
+ Uint threadid; /* index of the worker thread owning this copy */
+ unsigned char mute; /* option -s: suppresses the progress clock */
+ Uint *counter; /* points to the progress counter shared with the clock thread */
+ int scores[2]; /* scores and indel are passed to getAlignScore() */
+ int indel;
+ int transition; /* NOTE(review): not referenced in the code visible here — confirm meaning */
+ Uint minfragmentalignlen; /* option -Z: query remainders shorter than this are not split again */
+ int minfragmentalignscore; /* option -U: min score of a spliced fragment */
+ Uint minsplicedaligncover; /* option -W: min coverage for spliced transcripts */
+ Uint minsplicedalignscore; /* NOTE(review): no command line option visible, default 5+18 — confirm usage */
+ int accuracy; /* option -A: min percentage of matches per read */
+ Uint maxdist; /* option -M: max number of distant sites to consider, 0 to disable */
+ segemehl_t seinfo; /* output devices, mutexes and command line shared with segemehl */
+ remappingstat_t *stat; /* optional statistics, NULL disables them */
+} remapping_t;
+
+
+/*------------------------------ rm_setdefault -------------------------------
+ *
+ * @brief reset all fields of a remapping_t to their default values
+ * @param info structure to initialise
+ *
+ */
+
+static inline void
+rm_setdefault(remapping_t *info) {
+
+  /* memory handle and file names */
+  info->space = NULL;
+  info->remapfile = NULL;
+  info->outfile = NULL;
+  info->nomatchfile = NULL;
+
+  /* data that is attached later */
+  info->L = NULL;
+  info->R = NULL;
+  info->seq = NULL;
+  info->fasta = NULL;
+  info->reads = NULL;
+  info->counter = NULL;
+  info->stat = NULL;
+
+  /* threading and verbosity */
+  info->threadno = 1;
+  info->threadid = 0;
+  info->mute = 0;
+
+  /* scoring scheme */
+  info->scores[0] = 1;
+  info->scores[1] = -2;
+  info->indel = -2;
+  info->transition = -10;
+
+  /* thresholds */
+  info->minfragmentalignlen = 5;
+  info->minfragmentalignscore = 5;
+  info->minsplicedaligncover = 80;
+  info->minsplicedalignscore = 5+18;
+  info->accuracy = 90;
+  info->maxdist = 100;
+
+  /* embedded segemehl settings */
+  se_setdefault(&info->seinfo);
+}
+
+
+void bl_remapping (void *space, MultiCharSeq *seq, fasta_t *reads, matchsplitsiteclusterlist_t *L, matchsplitsiteclusterlist_t *R, remapping_t *nfo); /* remaps all reads; called directly in the single-threaded case */
+unsigned char bl_remappingReport(void *space, List *res, fasta_t *reads, Uint k, remapping_t *nfo); /* zero result makes the caller count read k as PURGED, non-zero as REMAPPED */
+void bl_remappingReportUnmapped(void *space, fasta_t *reads, Uint k, remapping_t *nfo); /* called for read k only if a no-match device is set */
+unsigned char bl_remappingExtractSeed(void *space, char *desc, Uint desclen, remappingseed_t *seed);
+
+void bl_remappingGetRange(void *space, Container *a, matchsplitsiteclusterlist_t *L, matchsplitsiteclusterlist_t *R,
+ Uint type, Uint chr, Uint start, Uint end, unsigned char strand, Uint maxdist, unsigned char right); /* collects into a the L or R clusters with median in the range given by start and end */
+void bl_remappingGetAdjoint(void *space, Container *a, matchsplitsiteclusterlist_t *L, matchsplitsiteclusterlist_t *R,
+ Uint type, Uint chr, Uint clust, Uint margin, unsigned char strand, Uint maxdist, unsigned char right); /* NOTE(review): clust is Uint here while the caller passes an int — confirm */
+void bl_remappingGetDist(void *space, Container *a, matchsplitsiteclusterlist_t *L, matchsplitsiteclusterlist_t *R, Uint type, Uint chr, Uint clust, unsigned char strand, Uint last, Uint margin, unsigned char right);
+
+remappingclust_t *bl_remappingUnsplicedAlign(void *space, MultiCharSeq *seq, char **seqs, Uint len, Uint chr, Uint start, Uint end, unsigned char strand, remapping_t *nfo); /* result may be NULL, the caller checks for it */
+remappingclust_t *bl_remappingAlign(void *space, char **seqs, Container *dist, remapping_t *nfo); /* best spliced alignment over the candidates in dist; result may be NULL */
+
+void bl_remappingUpdateAlignSeqs(void *space, MultiCharSeq *seq, char **seqs, Uint qrylen, remappingclust_t *cur); /* updates ref & query info in the alignments of cur */
+void bl_remappingUpdateAlignOff(void *space, char **seqs, Uint qrylen, remappingclust_t *cur, Uint offset, Uint right); /* converts relative query positions to absolute ones */
+
+void bl_remappingPrintAlign(FILE *dev, MultiCharSeqAlignment *mcsa, remapping_t *nfo);
+void bl_remappingPrintClust(FILE *dev, remappingclust_t *clust, remapping_t *nfo);
+void bl_remappingDestructClust(void *elem); /* also used as element destructor for bl_containerDestruct() */
+void bl_remappingInitAlign(void *space, MultiCharSeqAlignment *mcsa, Uint subidx, Uint start, Uint len, unsigned char strand);
+
+#endif /* REMAPPING_H */
diff --git a/segemehl/libs/seqclip.c b/segemehl/libs/seqclip.c
new file mode 100644
index 0000000..257076d
--- /dev/null
+++ b/segemehl/libs/seqclip.c
@@ -0,0 +1,686 @@
+
+/*
+ * seqclip.c
+ *
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 24.04.2010 21:24:34 CEST
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <stdint.h>
+#include "biofiles.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "memory.h"
+#include "alignment.h"
+#include "time.h"
+#include "sw.h"
+#include "seqclip.h"
+#include "bitvectoralg.h"
+
+static int clpswscr[]={1,-2}; /* score pair passed to swmatrix()/swtraceback() for the clipping alignments */
+static int edstscr[]={1,0}; /* score pair passed to getSoftClipScores()/getAlignScore() when rating a clipping alignment */
+static char *polyA =
+ "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
+/*---------------------------- bl_seqclipSoft3Prime -----------------------------
+ *
+ * @brief soft clipping of an adapter from the 3' end: the adapter C is
+ *        aligned against the tail of s and the alignment is rated with
+ *        getSoftClipScores
+ * @return length of clipped part, 0 if the alignment is rejected
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_seqclipSoft3Prime(void *space, char *s, Uint len,
+    char *C, Uint clen, Uint minclipacc, Uint polyAlen)
+{
+  Alignment *al;
+  int *M;
+  int mrgn, slack;
+  char *tail;
+  Uint rlen = 0, rmargin, rend, valignlen, adapteracc;
+  int polyAScore = 0, adapterScore = 0, adapterMatchLen = 0;
+
+  /* tail window: adapter length plus 20 percent, limited by the read length */
+  mrgn = MIN((((float)clen)*.2) + clen, len);
+  tail = &s[len-mrgn];
+
+  M = swmatrix(space, C, clen, tail, mrgn, -2, constscr_Nmatch, clpswscr);
+
+  al = ALLOCMEMORY(space, NULL, Alignment, 1);
+  initAlignment(al, C, clen, 0, tail, mrgn, 0);
+
+  swtraceback(space, M, C, clen, tail, mrgn, -2, constscr_Nmatch, clpswscr, al);
+
+  valignlen = getValignlen(al);
+
+  /* the alignment has to end within 4 + 20 percent of its length of the read end */
+  slack = (int)(((double)valignlen)*0.2);
+  if (len > 4 + slack) {
+    rmargin = len - 4 - slack;
+  } else {
+    rmargin = 0;
+  }
+
+  getSoftClipScores(al, polyAlen, edstscr, 0,
+      &polyAScore, &adapterScore, &adapterMatchLen);
+
+  /* accuracy is only evaluated for more than 3 matched adapter positions */
+  adapteracc = 100;
+  if (adapterMatchLen > 3) {
+    adapteracc = (int)(((double)adapterScore/(double)adapterMatchLen)*100);
+  }
+
+  if (len - mrgn + al->voff + valignlen >= rmargin &&
+      (polyAScore >= 5 || adapterScore >= 8) &&
+      adapteracc >= minclipacc)
+  {
+    /* clip the aligned part and everything right of it */
+    rend = 0;
+    if (mrgn > al->voff+ valignlen) {
+      rend = mrgn - al->voff - valignlen;
+    }
+    rlen = getValignlen(al) + rend;
+  }
+
+  wrapAlignment(al);
+  FREEMEMORY(space, M);
+  FREEMEMORY(space, al);
+
+  return rlen;
+}
+
+
+/*---------------------------- bl_seqclipSoft5Prime -----------------------------
+ *
+ * @brief clipping adapter from 5' end: the adapter C is aligned against
+ *        the head of s
+ * @return length of the aligned part of s if the alignment score reaches
+ *         minclipscr, 0 otherwise
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_seqclipSoft5Prime(void *space, char *s, Uint len,
+    char *C, Uint clen, Uint minclipscr)
+{
+  Alignment *al;
+  int *M;
+  int mrgn;
+  int allen;
+
+  /* head window: adapter length plus 20 percent, limited by the read length */
+  mrgn = MIN((((float)clen)*.2) + clen, len);
+
+  M = swmatrix(space, C, clen, s, mrgn, -2, constscr_Nmatch, clpswscr);
+
+  al = ALLOCMEMORY(space, NULL, Alignment, 1);
+  initAlignment(al, C, clen, 0, s, mrgn, 0);
+
+  swtraceback(space, M, C, clen, s, mrgn, -2, constscr_Nmatch, clpswscr, al);
+
+  allen = 0;
+  if (getAlignScore(al, edstscr, 0) >= minclipscr) {
+    allen = getValignlen(al);
+  }
+
+  wrapAlignment(al);
+  FREEMEMORY(space, M);
+  FREEMEMORY(space, al);
+
+  return allen;
+}
+
+/*----------------------------- bl_seqclipPolyA ------------------------------
+ *
+ * @brief clipping polyA tails: the polyA template, optionally followed by
+ *        the adapter clp, is aligned against the second half of s
+ * @param clp adapter appended to the polyA template, may be NULL
+ * @param clen length of clp
+ * @return length of s after clipping; len if nothing is clipped or if s is
+ *         shorter than 10 characters
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_seqclipPolyA(void *space, char *s, Uint len, char *clp, Uint clen)
+{
+  int *M;
+  int mrgn, slack;
+  Uint polyAlen, templen, valignlen;
+  char *polyAseq;
+  char *owned = NULL;
+  Uint rlen = len, rmargin = 0;
+  Alignment *al;
+
+  if (len < 10) return len;
+
+  templen = strlen(polyA);
+
+  if(clp) {
+    /* private copy: template followed by the adapter, 0-terminated */
+    polyAlen = templen + clen;
+    owned = ALLOCMEMORY(space, NULL, char, polyAlen+1);
+    memset(owned, 0, polyAlen+1);
+    memmove(owned, polyA, templen);
+    memmove(&owned[templen], clp, clen);
+    polyAseq = owned;
+  } else {
+    polyAseq = polyA;
+    polyAlen = templen;
+  }
+
+  al = ALLOCMEMORY(space, NULL, Alignment, 1);
+  mrgn =((float)len*.5);
+
+  initAlignment(al, polyAseq, polyAlen, 0, &s[len-mrgn], mrgn, 0);
+
+  M = swmatrix(space, polyAseq, polyAlen, &s[len-mrgn],
+      mrgn, -3, constscr_Nmatch, clpswscr);
+
+  swtraceback(space, M, polyAseq, polyAlen, &s[len-mrgn],
+      mrgn, -3, constscr_Nmatch, clpswscr, al);
+
+  valignlen = getValignlen(al);
+
+  /* the alignment has to end within 4 + 20 percent of its length of the read end */
+  slack = (int)(((double)valignlen)*0.2);
+  rmargin = len > 4 + slack ? len - 4 - slack : 0;
+
+  if(len - mrgn + al->voff + valignlen >= rmargin) {
+    rlen = len - mrgn + al->voff;
+  }
+
+  /*
+   * release everything allocated here: the matrix, the alignment
+   * (formerly leaked) and the private template copy (formerly leaked)
+   */
+  FREEMEMORY(space, M);
+  wrapAlignment(al);
+  FREEMEMORY(space, al);
+  if (owned != NULL) {
+    FREEMEMORY(space, owned);
+  }
+
+  return rlen;
+}
+
+/*--------------------------- bl_seqclipHard3Prime ---------------------------
+ *
+ * @brief hard clipping of 3'prime end
+ * @return length that remains after removing clen characters, 0 if clen
+ *         is not smaller than len
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_seqclipHard3Prime(Uint len, Uint clen) {
+
+  if (clen < len) {
+    return len - clen;
+  }
+
+  /* the clip covers the whole sequence */
+  return 0;
+}
+
+
+/*--------------------------- bl_seqclipHard5Prime ---------------------------
+ *
+ * @brief hard clipping of 5'prime end
+ * @return pointer to s advanced by clen characters; s itself if clen is
+ *         not smaller than len
+ * @author Steve Hoffmann
+ *
+ */
+
+char*
+bl_seqclipHard5Prime(char *s, Uint len, Uint clen) {
+
+  /* a clip that would consume the whole sequence is not applied */
+  if (clen >= len) {
+    return &s[0];
+  }
+
+  return &s[clen];
+}
+
+
+/*----------------------------- bl_seqClipDecode -----------------------------
+ *
+ * @brief find DNA sequence of length len for a code to the base of 5.
+ *        The digits 0..4 stand for A, C, G, T, N and the least
+ *        significant digit is written to position 0; positions without
+ *        a digit keep 'A'. Digits beyond position len-1 are ignored
+ *        (they used to be written behind the end of the buffer).
+ * @return newly allocated, 0-terminated sequence; the caller frees it
+ * @author Steve Hoffmann
+ *
+ */
+
+ char*
+bl_seqclipDecode (Uint code, Uint len)
+{
+  static const char alphabet[] = "ACGTN";
+  Uint i, n;
+  char *seq;
+
+  /* no space handle is in scope here, hence NULL is passed */
+  seq = ALLOCMEMORY(NULL, NULL, char, len+1);
+  memset(seq, 'A', sizeof(char)*len);
+  seq[len] = 0;
+
+  /* stop at len so that codes >= 5^len cannot overrun the buffer */
+  n = code;
+  for(i=0; n > 0 && i < len; i++) {
+    seq[i] = alphabet[n % 5];
+    n = n / 5;
+  }
+
+  return seq;
+}
+
+
+/*---------------------------- bl_seqclipGetCode -----------------------------
+ *
+ * @brief find code for a DNA sequence of length len: position i
+ *        contributes digit * 5^i with the digits A=0, C=1, G=2, T=3 and
+ *        4 for any other character. The weights are computed with
+ *        integer arithmetic (no floating point pow), so the result is
+ *        exact and wraps like any unsigned value if it exceeds Uint.
+ * @return code of the sequence
+ * @author Steve Hoffmann
+ *
+ */
+
+ Uint
+bl_seqclipGetCode (char *seq, Uint len)
+{
+  Uint i, z, sum=0;
+  Uint weight=1; /* 5^i for the current position */
+
+  for(i=0; i < len; i++) {
+    switch(seq[i]) {
+      case 'A':
+        z=0;
+        break;
+      case 'C':
+        z=1;
+        break;
+      case 'G':
+        z=2;
+        break;
+      case 'T':
+        z=3;
+        break;
+      default:
+        z=4;
+        break;
+    }
+    sum += z * weight;
+    weight *= 5;
+  }
+
+  return sum;
+}
+
+
+
+/*--------------------------------- bl_lcsub ---------------------------------
+ *
+ * @brief get the longest common substring of s1 (length l1) and s2
+ *        (length l2) by dynamic programming
+ * @param l receives the index in s1 of the last character of the first
+ *        longest common substring found (0 if there is none)
+ * @param r receives the corresponding index in s2
+ * @return length of the longest common substring
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_lcsub (void *space, char *s1, Uint l1, char *s2, Uint l2, Uint *l, Uint *r)
+{
+  Uint row, col, run;
+  Uint best=0, bestrow=0, bestcol=0;
+  Uint *M;
+
+  M = ALLOCMEMORY(space, NULL, Uint, (l1+1)*(l2+1));
+  memset(M, 0, sizeof(Uint)*((l1+1)*(l2+1)));
+
+  for(row=0; row < l1; row++) {
+    for(col=0; col < l2; col++) {
+
+      /* cells of mismatching characters keep their 0 */
+      if(s1[row] != s2[col]) continue;
+
+      /* extend the common run that ends at the diagonal predecessor */
+      run = 1;
+      if(row != 0 && col != 0) {
+        run = MATRIX2D(M, l2+1, row-1, col-1) + 1;
+      }
+      MATRIX2D(M, l2+1, row, col) = run;
+
+      /* strictly greater: the first longest run is reported */
+      if(run > best) {
+        best = run;
+        bestrow = row;
+        bestcol = col;
+      }
+    }
+  }
+
+  *l = bestrow;
+  *r = bestcol;
+
+  FREEMEMORY(space, M);
+  return best;
+}
+
+
+
+/*---------------------------- bl_Find3PrimeGreedy ----------------------------
+ *
+ * @brief  assemble sequences greedily: every list entry is decoded to a
+ *         sequence of length ws, pairwise suffix/prefix overlaps are
+ *         computed with bl_lcsub, and starting from each of the first
+ *         (at most five) entries a chain is extended to the right and to
+ *         the left by always taking the unused entry with the largest
+ *         overlap of at least minovl = (ws/3)*2. The longest assembled
+ *         string wins.
+ * @param  list  entries whose .b member is a code for bl_seqclipDecode
+ * @param  len   number of entries in list
+ * @param  ws    length of each decoded sequence
+ * @return newly allocated, NUL terminated assembly (caller frees) or NULL
+ *         if no seed could be extended
+ * @author Steve Hoffmann
+ *
+ */
+
+char*
+bl_Find3PrimeGreedy (void *space, PairUint *list, Uint len, Uint ws) {
+
+  Uint i=0, j=0, l, r, k, llen, rlen, start, adapterlen=0,
+       lcs, minovl, last, noofseeds;
+  Uint *Lovl, *Rovl, *Lchain, *Rchain, *use;
+  char *seq1, *seq2;
+  char *adapter = NULL;
+
+  Lovl = ALLOCMEMORY(space, NULL, Uint, len*len);
+  Rovl = ALLOCMEMORY(space, NULL, Uint, len*len);
+  use = ALLOCMEMORY(space, NULL, Uint, len);
+  Lchain = ALLOCMEMORY(space, NULL, Uint, len);
+  Rchain = ALLOCMEMORY(space, NULL, Uint, len);
+
+  memset(Lovl, 0,sizeof(Uint)*(len*len));
+  memset(Rovl, 0,sizeof(Uint)*(len*len));
+  minovl = (ws/3) * 2;
+
+  /* never seed from entries that do not exist (list shorter than five) */
+  noofseeds = (len < 5) ? len : 5;
+
+  /* Rovl[i][j]: a suffix of i equals a prefix of j (j extends i to the
+   * right); Lovl[i][j]: a prefix of i equals a suffix of j (j extends i
+   * to the left). l and r are the end positions reported by bl_lcsub. */
+  for(i=0; i < len; i++) {
+    seq1 = bl_seqclipDecode(list[i].b, ws);
+    for(j=0; j < len; j++) {
+      if(i != j) {
+        seq2 = bl_seqclipDecode(list[j].b, ws);
+        lcs = bl_lcsub(space, seq1, ws, seq2, ws, &l, &r);
+        //lcs is overlap
+        if((l == ws-1 && r == lcs-1) || (r == ws-1 && l == lcs-1)) {
+          if(l > r) {
+            MATRIX2D(Rovl, len, i, j) = lcs;
+          } else {
+            MATRIX2D(Lovl, len, i, j) = lcs;
+          }
+        }
+        FREEMEMORY(space, seq2);
+      }
+    }
+    FREEMEMORY(space, seq1);
+  }
+
+  for(i=0; i < noofseeds; i++) {
+    memset(use, 0, sizeof(Uint)*(len));
+    use[i] = 1;
+    last = i;
+    rlen = 0;
+    /* -1 (i.e. the largest Uint) marks the open end of a chain */
+    Rchain[rlen] = -1;
+
+    /* extend to the right */
+    while(1) {
+      for(l=0, j=0; j < len; j++) {
+        if (MATRIX2D(Rovl, len, last, j) > l &&
+            MATRIX2D(Rovl, len, last, j) >= minovl &&
+            !use[j]) {
+          /* a better candidate releases the one chosen before */
+          if(Rchain[rlen] != -1) {
+            use[Rchain[rlen]] = 0;
+          }
+          Rchain[rlen] = j;
+          l = MATRIX2D(Rovl, len, last, j);
+          use[j] = 1;
+        }
+      }
+      if(Rchain[rlen] == -1) break;
+      last = Rchain[rlen++];
+      Rchain[rlen] = -1;
+    }
+
+    last = i;
+    llen = 0;
+    Lchain[llen] = -1;
+
+    /* extend to the left */
+    while(1) {
+      for(l=0, j=0; j < len; j++) {
+        if (MATRIX2D(Lovl, len, last, j) > l &&
+            MATRIX2D(Lovl, len, last, j) >= minovl &&
+            !use[j]) {
+          if(Lchain[llen] != -1) {
+            use[Lchain[llen]] = 0;
+          }
+          Lchain[llen] = j;
+          l = MATRIX2D(Lovl, len, last, j);
+          use[j] = 1;
+        }
+      }
+      if(Lchain[llen] == -1) break;
+      last = Lchain[llen++];
+      if (llen >= len) break;
+      Lchain[llen] = -1;
+    }
+
+
+    seq2 = ALLOCMEMORY(space, NULL, char, len*ws);
+    memset(seq2, 0, len*ws);
+
+    /* lay out the pieces from the leftmost one; each piece is written
+     * ws-k characters after its predecessor, k being their overlap */
+    start = 0;
+    for(j=llen; j >= 1; j--) {
+      seq1 = bl_seqclipDecode(list[Lchain[j-1]].b, ws);
+
+      if(j==llen) {
+        k = ws;
+      } else {
+        k = MATRIX2D(Lovl, len, Lchain[j-1], Lchain[j]);
+      }
+
+      memmove(&seq2[start+(ws-k)], seq1, ws);
+      start += ws-k;
+      FREEMEMORY(space, seq1);
+    }
+
+    if(Lchain[0] != -1) {
+      /* the seed follows its left neighbour */
+      seq1 = bl_seqclipDecode(list[i].b, ws);
+      k = MATRIX2D(Lovl, len, i, Lchain[0]);
+      memmove(&seq2[start+(ws-k)], seq1, ws);
+      start += ws-k;
+      FREEMEMORY(space, seq1);
+    } else if(Rchain[0] != -1) {
+      /* no left extension: the seed itself is the leftmost piece. Without
+       * it the right chain would start behind leading NUL bytes and the
+       * whole assembly would be seen as an empty string. */
+      seq1 = bl_seqclipDecode(list[i].b, ws);
+      memmove(seq2, seq1, ws);
+      FREEMEMORY(space, seq1);
+    }
+
+    if(Rchain[0] != -1) {
+      seq1 = bl_seqclipDecode(list[Rchain[0]].b, ws);
+      k = MATRIX2D(Rovl, len, i, Rchain[0]);
+      memmove(&seq2[start+(ws-k)], seq1, ws);
+      start += ws-k;
+      FREEMEMORY(space, seq1);
+    }
+
+    for(j=0; j+1 < rlen; j++) {
+      seq1 = bl_seqclipDecode(list[Rchain[j+1]].b, ws);
+      k = MATRIX2D(Rovl, len, Rchain[j], Rchain[j+1]);
+      memmove(&seq2[start+(ws-k)], seq1, ws);
+      start += ws-k;
+      FREEMEMORY(space, seq1);
+    }
+
+    /* keep the longest assembly seen so far */
+    if(adapterlen < strlen(seq2)) {
+      if(adapter) {
+        FREEMEMORY(space, adapter);
+      }
+      adapter = seq2;
+      adapterlen = strlen(seq2);
+    } else {
+      FREEMEMORY(space, seq2);
+    }
+
+    seq2 = NULL;
+  }
+
+  FREEMEMORY(space, Rovl);
+  FREEMEMORY(space, Lovl);
+  FREEMEMORY(space, use);
+  FREEMEMORY(space, Lchain);
+  FREEMEMORY(space, Rchain);
+
+  return adapter;
+}
+
+
+
+/*---------------------- bl_seqclipFind3PrimeUpdateBest ----------------------
+ *
+ * @brief  helper function to update the list of most frequent motives.
+ *         The list is scanned from the top; at the first entry whose count
+ *         (.a) is below value, either the entry of the same code is
+ *         updated in place, or a new entry (value, code) is written there.
+ *         If the overwritten slot was in use the tail is pushed down by
+ *         one and a stale entry with the same code further down is removed.
+ * @param  list   array of len entries; it is reallocated, so the caller
+ *                must continue with the returned pointer
+ * @param  len    number of entries kept in the list
+ * @param  code   motif code (stored in .b)
+ * @param  value  new count of the motif (stored in .a)
+ * @return the (possibly moved) list, sized to len entries again
+ * @author Steve Hoffmann
+ *
+ */
+PairUint *
+bl_seqclipFind3PrimeUpdateBest(PairUint *list, Uint len,
+    Uint code, Uint value) {
+
+  Uint i;
+  unsigned char shift=0;
+
+  for(i=0; i < len; i++) {
+
+    if(shift) {
+      //eliminate old instance
+      if(list[i].b == code) {
+        /* the list temporarily holds len+1 entries here */
+        memmove(&list[i], &list[i+1], sizeof(PairUint)*(len-i));
+        break;
+      }
+      continue;
+    }
+
+    /* entries that rank at least as high as value stay untouched */
+    if(list[i].a >= value) continue;
+
+    //no higher ranking list elem < value -> update and cancel
+    if(list[i].b == code) {
+      list[i].a = value;
+      break;
+    }
+
+    //a higher ranking list elem < value -> push list down if > 0
+    //create a new instance and raise shift flag
+    if(list[i].a > 0) {
+      /* this function has no space argument; NULL is passed in its place
+       * as other callers of ALLOCMEMORY do */
+      list = ALLOCMEMORY(NULL, list, PairUint, len+1);
+      memmove(&list[i+1], &list[i], sizeof(PairUint)*(len-i));
+      shift = 1;
+    }
+
+    list[i].a = value;
+    list[i].b = code;
+
+    /* an empty slot was filled: nothing was pushed down, nothing to clean */
+    if(shift == 0)
+      break;
+  }
+
+  /* drop the surplus entry again (or keep the size if none was added) */
+  list = ALLOCMEMORY(NULL, list, PairUint, len);
+  return list;
+}
+
+/*--------------------------- bl_seqclipFind3Prime ---------------------------
+ *
+ * @brief find the 3 prime adapter
+ * @brief find most common sequence of size ws in frame of fs
+ *
+ * For each of the first `size` sequences of the set, windows of ws
+ * characters are slid over the trailing characters of the read (the frame).
+ * Windows whose shannonentropy() value is below 1.6 are skipped; every other
+ * window is turned into a code with bl_seqclipGetCode and counted in C.
+ * The hundred highest counts are tracked in B through
+ * bl_seqclipFind3PrimeUpdateBest and finally handed to bl_Find3PrimeGreedy,
+ * whose result is returned.
+ *
+ * @param space       passed on to the allocation macros and helpers
+ * @param set         sequences to scan
+ * @param samplesize  if non-zero, the number of sequences scanned is capped
+ *                    at MIN(samplesize, set->noofseqs); the sequences are
+ *                    still taken in order (random sampling is commented out)
+ * @param fs          frame size; if fs >= read length the frame is shortened
+ *                    to read length - 1
+ * @param ws          window size; asserted to be <= fs
+ * @return whatever bl_Find3PrimeGreedy returns for the best list
+ * @author Steve Hoffmann
+ *
+ */
+
+char*
+bl_seqclipFind3Prime (void *space, fasta_t *set, Uint samplesize, Uint fs, int ws)
+{
+
+ char *curseq, *curframe, *curwin;
+ uint16_t *C; // occurrence counter, indexed by window code
+ double curent = 0;
+ Uint i, elem, size, curlen, *tab, curcode=0, bestvalue=0;
+ int curfs, j;
+ //Uint bestcode=0;
+ PairUint *B; // best list: .a holds a count, .b the window code
+
+ //WARN: samplesize > noofseqs
+ if(samplesize) {
+ size = MIN(samplesize, set->noofseqs);
+ srand(time(NULL)); // NOTE(review): rand() is only used in the commented-out block below
+ } else {
+ size = set->noofseqs;
+ }
+
+ assert(ws <= fs);
+ C = ALLOCMEMORY(space, NULL, uint16_t, pow(5, ws)); // one counter per possible code (5^ws)
+ B = ALLOCMEMORY(space, NULL, PairUint, 100+1); // 100 entries plus one spare slot
+ tab = ALLOCMEMORY(space, NULL, Uint, 256);
+ memset(C, 0, sizeof(uint16_t)*pow(5,ws));
+ memset(B, 0, sizeof(PairUint)*101);
+ memset(tab, 0, sizeof(Uint)*256);
+
+ tab['A'] = 1; // character table handed to shannonentropy()
+ tab['C'] = 2;
+ tab['G'] = 3;
+ tab['T'] = 4;
+
+
+ for(i=0; i < size; i++) {
+
+/* too costly! new scheme: sample chunks of larger size
+ if(samplesize) {
+ elem = rand() % set->noofseqs;
+ } else {
+ elem = i;
+ }
+*/
+ elem = i;
+
+ curseq = bl_fastaGetSequence(set, elem);
+ curlen = bl_fastaGetSequenceLength(set, elem);
+
+ // catch case where fs <= curlen is never true
+ if(fs >= curlen) {
+ curfs = curlen-1;
+ } else {
+ curfs = fs;
+ }
+
+ //WARN: fs > readsize
+ curframe = &curseq[curlen-curfs-1]; // frame = the last curfs+1 characters of the read
+
+ for(j=0; j < curfs-ws+2; j++) { // the last window ends on the last character of the read
+ curwin = &curframe[j];
+ curent = shannonentropy(space, curwin, ws, 6, tab);
+
+ if (curent < 1.6) continue; // skip windows below the entropy threshold
+
+ curcode = bl_seqclipGetCode(curwin, ws);
+ C[curcode]++;
+
+ if(B[100-1].a < C[curcode]) { // count exceeds the last entry of the best list
+ B = bl_seqclipFind3PrimeUpdateBest(B, 100, curcode, C[curcode]);
+ }
+
+ if (bestvalue < C[curcode]) {
+ bestvalue = C[curcode];
+ // bestcode = curcode;
+ }
+
+ if(C[curcode] == 65500) // stop counting before the uint16_t counter (max 65535) can wrap
+ break;
+ }
+ if(C[curcode] == 65500)
+ break;
+ }
+/*
+ {
+ Uint k=0, l=0, r=0;
+ char *nextseq;
+ for(k=0; k < 99; k++) {
+ curseq = bl_seqclipDecode(B[k].b, ws);
+ nxtseq = bl_seqclipDecode(B[k+1].b, ws);
+ bl_lcsub(space, curseq, 10, nxtseq, 10, &l, &r);
+ fprintf(stderr, "best sequence[%d]: %s, val:%d\n", k, curseq, B[k].a);
+ fprintf(stderr, "shannon: %f\n", shannonentropy(space, curseq,ws,5,tab));
+ FREEMEMORY(space, curseq);
+ FREEMEMORY(space, nxtseq);
+ }
+
+ curseq = bl_seqclipDecode(bestcode, ws);
+ fprintf(stderr, "best sequence: %s\n", curseq);
+ }
+*/
+
+ curseq = bl_Find3PrimeGreedy(space, B, 100, ws);
+
+
+ FREEMEMORY(space, C);
+ FREEMEMORY(space, B);
+ FREEMEMORY(space, tab);
+
+
+ return curseq;
+}
+
+
diff --git a/segemehl/libs/seqclip.h b/segemehl/libs/seqclip.h
new file mode 100644
index 0000000..93a7777
--- /dev/null
+++ b/segemehl/libs/seqclip.h
@@ -0,0 +1,40 @@
+#ifndef SEQCLIP_H
+#define SEQCLIP_H
+
+/*
+ *
+ * seqclip.h
+ *
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 24.04.2010 22:32:24 CEST
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include "basic-types.h"
+
+
+Uint
+bl_seqclipPolyA(void *space, char *sequence, Uint len, char* clp, Uint clen);
+
+Uint
+bl_seqclipSoft3Prime(void *space, char *sequence, Uint len,
+ char *toClip, Uint clipsize, Uint minclipacc, Uint pAlen);
+
+Uint
+bl_seqclipSoft5Prime(void *space, char *s, Uint len,
+ char *C, Uint clen, Uint minclipscr);
+
+Uint
+bl_seqclipHard3Prime(Uint len, Uint clipsize);
+
+char*
+bl_seqclipHard5Prime(char *s, Uint len, Uint clen);
+
+char*
+bl_seqclipFind3Prime (void *space, fasta_t *set, Uint samplesize, Uint fs, int ws);
+
+
+#endif
diff --git a/segemehl/libs/snvsplines.c b/segemehl/libs/snvsplines.c
new file mode 100644
index 0000000..3027d4e
--- /dev/null
+++ b/segemehl/libs/snvsplines.c
@@ -0,0 +1,389 @@
+
+/*
+ * snvsplines.c
+ *
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 06/18/14 14:37:12 CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include <math.h>
+#include <time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "sort.h"
+#include "alignment.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "matfile.h"
+#include "bitVector.h"
+#include "info.h"
+#include "vtprogressbar.h"
+#include "fileio.h"
+#include "matchfilesfields.h"
+#include "matchfiles.h"
+#include "debug.h"
+#include "evalmatchfiles.h"
+#include "biofiles.h"
+#include "splicesites.h"
+#include "splines.h"
+
+#include <gsl/gsl_statistics.h>
+#include <gsl/gsl_blas.h>
+#include <gsl/gsl_multifit.h>
+#include <gsl/gsl_bspline.h>
+
+
+/*-------------------------------- getcutoff ---------------------------------
+ *
+ * @brief  calculate the cutoff for the calculation and store it in
+ *         stats->cut. The scores stats->s (stats->s_N values, sorted in
+ *         place here) are binned into a histogram, a spline is fitted to
+ *         the histogram (histsplineglm) and its derivatives are scanned for
+ *         the rightmost maximum, the minimum left of it and a negative
+ *         inflection between the two. If minimum and inflection are both
+ *         found the cutoff is the position of the minimum (at most 10.0).
+ *         Otherwise the first local minimum of the raw histogram is used,
+ *         or, as a last resort, the first score t >= 0.5 (step 0.01) at
+ *         which the empirical cdf reaches 0.95.
+ * @param  stats             sample statistics; s, s_N, RP, RP_N are read,
+ *                           cut is written
+ * @param  histofilename     if not NULL: file receiving the histogram
+ * @param  scorefilename     if not NULL: file receiving the sorted scores
+ * @param  cutfilename       if not NULL: file receiving minimum, inflection
+ *                           and cutoff
+ * @param  splinefilename    if not NULL: file receiving the fitted spline
+ * @param  estimatefilename  if not NULL: file receiving the coefficients
+ *         A file that cannot be opened is silently skipped.
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+getcutoff (matchfileSampleStats_t *stats, char *histofilename,
+    char *scorefilename, char *cutfilename, char* splinefilename,
+    char* estimatefilename)
+{
+
+  /* scores at which the empirical cdf is echoed to stderr */
+  static const double probes[] = {
+    0.1, 0.25, 0.33, 0.50, 0.66, 0.75, 0.80, 0.90, 0.95, 0.98, 0.99, 0.999,
+    1.0, 1.5, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5
+  };
+  Uint noofprobes = sizeof(probes)/sizeof(probes[0]);
+
+  Uint nbins = 30;
+  double start = 0.0;
+  double end = 3.0;
+  double p[] = {0.25, 0.50, 0.75};
+  double step = 0.01;
+  double epsilon = 0.000001;
+  double *X = stats->s;
+  double *Xhi, *bins, *y, binsize, *q, maxval, minval;
+  double xi, dyi, dyim1, ddyi, ddyim1, yerr;
+  double yi;
+  double infval = 0;
+  FILE *histofp=NULL, *scorefp = NULL, *cutfp = NULL, *splinefp=NULL, *estimatefp=NULL;
+  Uint n = stats->s_N, m, i, *cnt;
+  splinefit_t fit;
+
+  gsl_bspline_deriv_workspace *dbw = gsl_bspline_deriv_alloc(4);
+  gsl_matrix *dB = gsl_matrix_alloc(7, 5);
+  gsl_matrix *cov = gsl_matrix_alloc(7, 7);
+  gsl_vector *ddv = gsl_vector_alloc(7);
+  gsl_vector *dv = gsl_vector_alloc(7);
+
+  if(histofilename) histofp = fopen(histofilename, "w");
+  if(scorefilename) scorefp = fopen(scorefilename, "w");
+  if(cutfilename) cutfp = fopen(cutfilename, "w");
+  if(splinefilename) splinefp = fopen(splinefilename, "w");
+  if(estimatefilename) estimatefp = fopen(estimatefilename, "w");
+
+
+  /* NOTE(review): secdf is not released in this function - confirm
+   * whether the ecdf module offers a destructor that should be called */
+  ecdf_t * secdf = ecdf_init(X, n);
+  fprintf(stderr, "n=%d, e->n=%d", n , secdf->n);
+  for(i=0; i < noofprobes; i++) {
+    fprintf(stderr, "%f = %f\n", probes[i], ecdf(probes[i], secdf));
+  }
+
+  /* range of 1 - (RP+1)/(RP_N+1) over all entries with RP > 1 */
+  double minrp=100, maxrp=-100;
+  for(i=0; i < 100; i++) {
+    if(stats->RP[i] > 1) {
+      double rest = 1.0-(((double)stats->RP[i]+1.0)/(stats->RP_N[i]+1.0));
+      maxrp = (maxrp < rest) ? rest : maxrp;
+      minrp = (minrp > rest) ? rest : minrp;
+    }
+  }
+
+  fprintf(stderr, "minrp: %f, maxrp:%f\n", minrp, maxrp);
+
+  for(i=0; i < 100; i++) {
+    double ratio = ((double)stats->RP[i] + 1.0)/((double) stats->RP_N[i] + 1.0);
+    fprintf(stderr, "%d\t%d\t%f\t%f\t%f\t%f\t%f\n", stats->RP_N[i], stats->RP[i],
+        ratio, log(ratio), log(1-ratio),
+        log(minrp) - log(1.0-ratio),
+        log(1.0-ratio) - log(maxrp));
+  }
+
+  fprintf(stderr, "\n");
+
+
+  /* fallback cutoff: first t at which the empirical cdf reaches 0.95 */
+  double t = 0.5;
+
+  while(ecdf(t, secdf) < 0.95) t+=0.01;
+
+  fprintf(stderr, "%f = %f\n", t, ecdf(t, secdf));
+
+
+  qsort(X, n, sizeof(double), cmp_dbl_qsort);
+  end = MAX(X[n-1], X[0]);
+
+  Xhi = ALLOCMEMORY(NULL, NULL, double, n);
+  memset(Xhi, 0, sizeof(double)*n);
+
+  /* only scores in (start, end] enter the histogram */
+  for(i=0, m=0; i < n; i++) {
+    if(X[i] > start && X[i] <= end) {
+      Xhi[m] = X[i];
+      m++;
+    }
+    if(scorefp) fprintf(scorefp, "%f\n", X[i]);
+  }
+
+  Uint binmin = 0;
+  fprintf(stderr, "m %d - %d n\n", m, n);
+  cnt = bin(Xhi, m, &bins, &nbins);
+  y = ALLOCMEMORY(NULL, NULL, double, nbins);
+  memset(y, 0, sizeof(double)*nbins);
+  binsize = (bins[nbins-1]-bins[0])/((double)nbins-1.0);
+  fprintf(stderr, "%f - %f / %d = %f\n",bins[nbins-1], bins[0], nbins-1, binsize);
+  for(i=0; i < nbins; i++) {
+    /* shift each bin by half a bin width */
+    bins[i] += binsize/2.0;
+    y[i] = (double)cnt[i];
+    if(histofp) fprintf(histofp, "%f\t%f\n", bins[i], y[i]);
+    if(!binmin) {
+      if(i > 0 && i < nbins-1)
+        NFO("y[%d]=%f (of nbins:%d), %f < %f = %d, %f < %f = %d\n", i, y[i], nbins, y[i-1], y[i], y[i-1] > y[i], y[i], y[i+1], y[i] < y[i+1]);
+      /* first local minimum of the raw counts; the bin left of it is kept */
+      if(i > 0 && i < nbins-1 && cnt[i-1] > cnt[i] && cnt[i] < cnt[i+1]) {
+        binmin = i-1;
+      }
+    }
+  }
+
+  q = quantiles(bins, nbins, p, 3);
+
+  //GSL voodoo
+  histsplineglm(bins, y, nbins, q, 3, &fit);
+
+  if(estimatefp){
+
+    for(i = 0; i < fit.c->size; i++) {
+      fprintf(estimatefp, "%d\t%f\n", i, gsl_vector_get(fit.c, i));
+    }
+  }
+
+
+  if(splinefp) {
+    for (xi = bins[0]; xi <= bins[nbins-1]; xi += 0.1)
+    {
+
+      gsl_bspline_deriv_eval(xi, 0, dB, fit.bw, dbw);
+      gsl_bspline_eval(xi, fit.B, fit.bw);
+      gsl_vector_set(fit.B, 0, 1.0); //set the intercept
+
+      gsl_multifit_linear_est(fit.B, fit.c, cov, &yi, &yerr);
+      fprintf(splinefp, "%f\t%f\n", xi, yi);
+      fprintf(stderr, "%f\t%f\n", xi, exp(yi));
+    }
+  }
+
+
+  //go get the rightmost maximum in spline
+  /* scanning right to left, the first sign change of the first derivative
+   * from positive (at xi-step) to negative (at xi) is kept */
+  maxval = bins[nbins-1];
+
+  for (xi = bins[nbins-1]; xi > bins[0]+step; xi -= step) {
+
+
+    gsl_bspline_deriv_eval(xi, 2, dB, fit.bw, dbw);
+
+    gsl_matrix_get_col (dv, dB, 1);
+    gsl_vector_set(dv, 0, 0.0);
+
+    gsl_multifit_linear_est(dv, fit.c, fit.cov, &dyi, &yerr);
+    gsl_bspline_deriv_eval(xi-step, 2, dB, fit.bw, dbw);
+    gsl_matrix_get_col (dv, dB, 1);
+    gsl_vector_set(dv, 0, 0.0);
+    gsl_multifit_linear_est(dv, fit.c, fit.cov, &dyim1, &yerr);
+
+
+    if(dyi < 0 && dyim1 > 0 && maxval == bins[nbins-1]) {
+      maxval = xi;
+    }
+  }
+
+
+  minval = 0;
+  infval = 0;
+  //go get the minimum before the rightmost maximum
+  /* no break: the last (rightmost) minimum left of maxval wins */
+  for (xi = bins[0]+0.01; xi < maxval; xi += 0.01)
+  {
+
+    double xisave;
+    gsl_bspline_deriv_eval(xi, 2, dB, fit.bw, dbw);
+
+
+    gsl_matrix_get_col(dv, dB, 1);
+    gsl_matrix_get_col(ddv, dB, 2);
+    gsl_vector_set(dv, 0, 0.0);
+    gsl_vector_set(ddv, 0, 0.0);
+
+    gsl_multifit_linear_est(dv, fit.c, fit.cov, &dyi, &yerr);
+    gsl_multifit_linear_est(ddv, fit.c, fit.cov, &ddyi, &yerr);
+    /* in the first round xi-step would be bins[0]; epsilon nudges the
+     * evaluation point just to the right of it */
+    if(xi == bins[0]+0.01) {
+      xisave = xi + epsilon;
+    } else {
+      xisave = xi;
+    }
+    gsl_bspline_deriv_eval(xisave-step, 2, dB, fit.bw, dbw);
+
+    gsl_matrix_get_col (dv, dB, 1);
+    gsl_matrix_get_col(ddv, dB, 2);
+    gsl_vector_set(dv, 0, 0.0);
+    gsl_vector_set(ddv, 0, 0.0);
+
+    gsl_multifit_linear_est(dv, fit.c, fit.cov, &dyim1, &yerr);
+    gsl_multifit_linear_est(ddv, fit.c, fit.cov, &ddyim1, &yerr);
+
+    if (dyi > 0 && dyim1 < 0) {
+      minval = xi;
+    }
+  }
+
+
+
+  if(minval > bins[0]+0.11) {
+    //maximum and minimum found - now find minimum and the inflection point
+    NFO("selected minimum-inflection method (%f)\n", maxval );
+    for (xi = minval; xi < maxval; xi += step) {
+
+      gsl_bspline_deriv_eval(xi, 2, dB, fit.bw, dbw);
+      gsl_matrix_get_col(dv, dB, 1);
+      gsl_matrix_get_col(ddv, dB, 2);
+      gsl_vector_set(dv, 0, 0.0);
+      gsl_vector_set(ddv, 0, 0.0);
+
+      gsl_multifit_linear_est(dv, fit.c, fit.cov, &dyi, &yerr);
+      gsl_multifit_linear_est(ddv, fit.c, fit.cov, &ddyi, &yerr);
+
+      gsl_bspline_deriv_eval(xi-step, 2, dB, fit.bw, dbw);
+      gsl_matrix_get_col (dv, dB, 1);
+      gsl_matrix_get_col(ddv, dB, 2);
+      gsl_vector_set(dv, 0, 0.0);
+      gsl_vector_set(ddv, 0, 0.0);
+
+      gsl_multifit_linear_est(dv, fit.c, fit.cov, &dyim1, &yerr);
+      gsl_multifit_linear_est(ddv, fit.c, fit.cov, &ddyim1, &yerr);
+
+      if (ddyi < 0 && ddyim1 > 0) {
+        //negative inflection
+        infval = xi;
+        break;
+      }
+    }
+  } else {
+    /* no usable minimum: start from the cdf based value (an alternative
+     * inflection-inflection search was disabled upstream) */
+    stats->cut = t;
+  }
+
+
+  NFO("minimum bin: %d\n", binmin);
+  if(minval < bins[0]+0.11 || infval < bins[0]+0.11) {
+    if(binmin) {
+      stats->cut = bins[binmin];
+      NFO("setting cutoff: %f [%f,%f]\n", stats->cut, minval, infval);
+    } else {
+      stats->cut = t;
+      NFO("ATTENTION! NO CUTOFF FOUND! PLEASE CHECK SCORE DISTRIBUTION - SETTING TO %f\n", stats->cut);
+    }
+  } else {
+    // stats->cut = (minval + infval)/2.0;
+
+    stats->cut = (minval + minval)/2.0;
+    stats->cut = MIN(10.0, stats->cut);
+    NFO("setting cutoff: %f [%f,%f]\n", stats->cut, minval, infval);
+  }
+
+  if(cutfp) {
+    fprintf(cutfp, "%f\t", minval);
+    fprintf(cutfp, "%f\t", infval);
+    fprintf(cutfp, "%f\n", stats->cut);
+  }
+
+  /* close what was actually opened: a failed fopen leaves the handle NULL
+   * although the file name is set */
+  if(histofp) fclose(histofp);
+  if(scorefp) fclose(scorefp);
+  if(cutfp) fclose(cutfp);
+  if(splinefp) fclose(splinefp);
+  if(estimatefp) fclose(estimatefp);
+
+  destructSplineFit(&fit);
+
+  gsl_bspline_deriv_free(dbw);
+  gsl_matrix_free(dB);
+  gsl_matrix_free(cov);
+  gsl_vector_free(dv);
+  gsl_vector_free(ddv);
+
+  FREEMEMORY(NULL, bins);
+  FREEMEMORY(NULL, q);
+  FREEMEMORY(NULL, cnt);
+  FREEMEMORY(NULL, y);
+  FREEMEMORY(NULL, Xhi);
+  return ;
+}
diff --git a/segemehl/libs/snvsplines.h b/segemehl/libs/snvsplines.h
new file mode 100644
index 0000000..fbd7944
--- /dev/null
+++ b/segemehl/libs/snvsplines.h
@@ -0,0 +1,51 @@
+
+/*
+ *
+ * snvsplines.h
+ *
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 06/18/14 14:37:44 CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include <math.h>
+#include <time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "sort.h"
+#include "alignment.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "matfile.h"
+#include "bitVector.h"
+#include "info.h"
+#include "vtprogressbar.h"
+#include "fileio.h"
+#include "matchfilesfields.h"
+#include "matchfiles.h"
+#include "debug.h"
+#include "evalmatchfiles.h"
+#include "biofiles.h"
+#include "splicesites.h"
+
+
+/*-------------------------------- getcutoff ---------------------------------
+ *
+ * @brief calculate the cutoff for the calculation
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+getcutoff (matchfileSampleStats_t *stats, char *histofilename,
+ char *scorefilename, char *cutfilename, char* splinefilename,
+ char* estimatefilename);
+
diff --git a/segemehl/libs/sort.c b/segemehl/libs/sort.c
new file mode 100644
index 0000000..d89cc67
--- /dev/null
+++ b/segemehl/libs/sort.c
@@ -0,0 +1,547 @@
+
+/*
+ * sort.c
+ * implementation of various sorting algorithms
+ *
+ * @author Steve Hoffmann
+ * @date Mon 27 Nov 2006
+ *
+ * SVN
+ * Revision of last commit: $Rev: 77 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-11-17 13:16:59 +0100 (Mon, 17 Nov 2008) $
+ *
+ * Id: $Id: sort.c 77 2008-11-17 12:16:59Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/sort.c $
+ */
+
+ #include <math.h>
+ #include <string.h>
+ #include "basic-types.h"
+ #include "memory.h"
+ #include "vstack.h"
+ #include "mathematics.h"
+ #include "sort.h"
+ #include "debug.h"
+
+
+
+/* compare the doubles data[a] and data[x] with a tolerance of FLT_EPSILON:
+ * returns 1 if data[a] is larger, 2 if it is smaller and 0 if the two
+ * differ by no more than the tolerance (or cannot be ordered) */
+Uint cmp_dbl(Uint a, Uint x, void *data, void *info) {
+  double *vals = (double*) data;
+  double diff = (double) vals[a] - vals[x];
+
+  if (diff > FLT_EPSILON) return 1;
+  if (-diff > FLT_EPSILON) return 2;
+
+  return 0;
+}
+
+
+/* compare the floats data[a] and data[x]:
+ * 1 if data[a] is larger, 2 if it is smaller, 0 otherwise */
+Uint cmp_flt(Uint a, Uint x, void *data, void *info) {
+  float *vals = (float*) data;
+
+  if (vals[a] > vals[x]) {
+    return 1;
+  } else if (vals[a] < vals[x]) {
+    return 2;
+  }
+  return 0;
+}
+
+/* compare the ints data[a] and data[x]:
+ * 1 if data[a] is larger, 2 if it is smaller, 0 if both are equal */
+Uint cmp_int(Uint a, Uint x, void *data, void *info) {
+  int *vals = (int*) data;
+  int lhs = vals[a];
+  int rhs = vals[x];
+
+  return (lhs > rhs) ? 1 : ((lhs < rhs) ? 2 : 0);
+}
+
+/* binary search comparator for int arrays: compares data[a] with *key and
+ * returns 1 if the element is larger than the key, 2 if it is smaller,
+ * 0 if both are equal */
+Uint cmp_int_bin(Uint a, void *data, void *key, void *info) {
+  int elem = ((int*) data)[a];
+  int wanted = *((int*) key);
+
+  return (elem > wanted) ? 1 : ((elem < wanted) ? 2 : 0);
+}
+
+
+/* binary search comparator for Uint arrays: compares data[a] with *key and
+ * returns 1 if the element is larger than the key, 2 if it is smaller,
+ * 0 if both are equal */
+Uint cmp_Uint_bin(Uint a, void *data, void *key, void *info) {
+  Uint elem = ((Uint*) data)[a];
+  Uint wanted = *((Uint*) key);
+
+  return (elem > wanted) ? 1 : ((elem < wanted) ? 2 : 0);
+}
+
+/* qsort(3) comparator for Uint values, ascending order */
+int cmp_Uint_qsort(const void *a, const void *b) {
+  Uint lhs = *((Uint*) a);
+  Uint rhs = *((Uint*) b);
+
+  return (lhs > rhs) - (lhs < rhs);
+}
+
+
+/* qsort(3) comparator for int values, ascending order */
+int cmp_int_qsort(const void *a, const void *b) {
+  int lhs = *((int*) a);
+  int rhs = *((int*) b);
+
+  return (lhs > rhs) - (lhs < rhs);
+}
+
+
+/* qsort(3) comparator for char values, ascending order */
+int cmp_char_qsort(const void *a, const void *b) {
+  char lhs = *((char*) a);
+  char rhs = *((char*) b);
+
+  return (lhs > rhs) - (lhs < rhs);
+}
+
+
+/* qsort(3) comparator for double values, ascending order; values that
+ * cannot be ordered compare as equal */
+int cmp_dbl_qsort(const void *a, const void *b) {
+  double lhs = *((double*) a);
+  double rhs = *((double*) b);
+
+  return (lhs > rhs) - (lhs < rhs);
+}
+
+/* comparator on the Uint array arr with the return codes swapped compared
+ * to cmp_int: 2 if arr[i] is larger than arr[j], 1 if it is smaller,
+ * 0 if both are equal */
+Uint cmp_Uint_quickSort(Uint i, Uint j, void *arr, void *nfo) {
+  Uint *counts = (Uint*) arr;
+  Uint lhs = counts[i];
+  Uint rhs = counts[j];
+
+  return (lhs > rhs) ? 2 : ((lhs < rhs) ? 1 : 0);
+}
+
+
+/*---------------------------- binarySearch_left -----------------------------
+ *
+ * @brief  search the leftmost (lowest, smallest) elem that equals key
+ *         if key does not exist the next largest (higher, greater) will
+ *         be returned. cmp(idx, toSearch, key, info) has to return 2 as
+ *         long as the element at idx is smaller than the key; the result
+ *         is the first index for which it does not (size if there is none)
+ * @author Steve Hoffmann
+ *
+ */
+
+
+Uint binarySearch_left(void *toSearch, Uint size, void *key,
+    Uint (*cmp)(Uint, void *, void *, void *),
+    void *info) {
+  int lo=0, hi=size-1, probe=lo, verdict;
+
+  for (; lo <= hi; ) {
+    probe = (lo+hi)/2;
+    verdict = cmp(probe, toSearch, key, info);
+
+#ifdef DBGSORT
+    DBG("mid:%d [left:%d, right:%d]\n", probe, lo, hi);
+    DBG("cmp:%d\n", verdict);
+#endif
+
+    if (verdict != 2) {
+      /* element is not smaller than the key: answer is probe or left of it */
+      hi = probe - 1;
+    } else {
+      /* key is greater */
+      lo = probe + 1;
+    }
+#ifdef DBGSORT
+    DBG("[left:%d,right:%d]\n", lo, hi);
+#endif
+
+  }
+
+  return lo;
+}
+
+/*---------------------------- binarySearch_right ----------------------------
+ *
+ * @brief  search the rightmost (highest, largest) elem that equals key.
+ *         The interval [left, right) is halved until it holds a single
+ *         index; a probe for which cmp returns 1 (element larger than the
+ *         key) becomes the new right border, any other probe the new left
+ *         border. The remaining left border is returned.
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint binarySearch_right(void *toSearch, Uint size, void *key,
+    Uint (*cmp)(Uint, void *, void *, void *),
+    void *info) {
+
+  int lo=0, hi=size, probe=lo, verdict;
+
+  for (; lo+1 < hi; ) {
+    probe = (lo+hi)/2;
+    verdict = cmp(probe, toSearch, key, info);
+
+    if (verdict != 1) {
+      lo = probe;
+    } else {
+      hi = probe;
+    }
+  }
+
+  return lo;
+}
+
+
+/*------------------------------ binarySearch_l ------------------------------
+ *
+ * @brief  will return the rightmost element of a series of equal elements.
+ *         Halves the interval [left, right) until one index is left: a
+ *         probe with cmp == 1 (element larger than the key) moves the right
+ *         border, every other probe moves the left border, which is
+ *         returned in the end.
+ * @author Steve Hoffmann
+ *
+ */
+
+
+Uint binarySearch_l(void *toSearch, Uint size, void *key,
+    Uint (*cmp)(Uint, void *, void *, void *),
+    void *info) {
+  int lo=0, hi=size, probe=lo, verdict;
+
+  for (; lo+1 < hi; ) {
+    probe = (lo+hi)/2;
+    verdict = cmp(probe, toSearch, key, info);
+
+    if (verdict != 1) {
+      lo = probe;
+    } else {
+      hi = probe;
+    }
+  }
+
+  return lo;
+}
+
+
+
+/* binarySearch: look up key in the sorted array toSearch of size elements.
+ * cmp(idx, toSearch, key, info) has to return 1 if the element at idx is
+ * larger than the key, 2 if it is smaller and anything else on a match.
+ * Returns the index of a matching element or size+1 if there is none.
+ * The search interval starts at [0, size]; index size is one past the last
+ * element and is therefore never handed to cmp but treated as larger than
+ * any key (this also covers the empty array). */
+Uint binarySearch(void *toSearch, Uint size, void *key,
+    Uint (*cmp)(Uint, void *, void *, void *),
+    void *info) {
+  int left=0, right=size, mid, res;
+
+  while (left<=right) {
+    mid = (left+right)/2;
+
+    /* do not read past the end of the array */
+    if ((Uint) mid >= size) {
+      right = mid-1;
+      continue;
+    }
+
+    res = cmp(mid, toSearch, key, info);
+
+    if (res == 1)
+      right = mid-1;
+    else
+      if (res == 2)
+        left = mid+1;
+      else
+        return mid;
+  }
+
+  return size+1;
+}
+
+/* quickSort: iterative quicksort that leaves toSort untouched and returns
+ * a newly allocated array of size indices (initialised to 0..size-1)
+ * reordered according to cmp. cmp(i, j, toSort, info) is called with two
+ * element indices; the partition step skips elements from the left while
+ * cmp returns 2 and from the right while it returns 1, i.e. with comparators
+ * like cmp_int (1 = larger, 2 = smaller) the indices end up in ascending
+ * order of their elements. Pending ranges are kept on a VStack of PairSint
+ * instead of the call stack. The caller owns the returned array.
+ */
+Uint *quickSort(void *space, void* toSort, Uint size,
+ Uint (*cmp)(Uint, Uint, void *, void*),
+ void *info) {
+ int left, left2, right, right2;
+ PairSint ins, *lr;
+ Uint i, resc, *sorted, x;
+ VStack vstack;
+
+ sorted = ALLOCMEMORY(space, NULL, Uint, size);
+ for (i=0; i < size; i++) sorted[i]=i;
+ ins.a = 0; // first work item: the whole range [0, size-1]
+ ins.b = size-1;
+ bl_vstackInit(&vstack, 10000, sizeof(PairSint));
+ bl_vstackPush(&vstack, &ins);
+
+ while (!bl_vstackIsEmpty(&vstack)){
+ lr = (PairSint *) bl_vstackPop(&vstack, NULL);
+ left = lr->a;
+ right = lr->b;
+ free(lr); // the popped element is released here by the caller
+ while (left < right) {
+ x=sorted[(left+right)/2]; // pivot: index stored in the middle of the range
+ left2 = left;
+ right2 = right;
+
+ do {
+ while(cmp(sorted[left2], x, toSort, info)==2){
+ left2++;
+ }
+ while(cmp(sorted[right2], x, toSort, info)==1){
+ right2--;
+ }
+
+ if(left2 <= right2) {
+ resc = sorted[right2];
+ sorted[right2]=sorted[left2];
+ sorted[left2]=resc;
+ left2++;
+ right2--;
+ }
+ } while (right2 >= left2);
+
+
+ if ((left2-left) > (right-left2)) { // one part goes on the stack, the other is handled next
+ ins.a = left;
+ ins.b = right2;
+ bl_vstackPush(&vstack, &ins);
+ left = left2;
+ } else {
+ ins.a = left2;
+ ins.b = right;
+ bl_vstackPush(&vstack, &ins);
+ right = right2;
+ }
+ }
+ }
+ bl_vstackDestruct(&vstack, NULL);
+ return sorted;
+}
+
+/* multikey comparator for an array of strings: compares the characters at
+ * position depth of the strings data[a] and data[b]; 1 if the character
+ * of a is larger, 2 if it is smaller, 0 if both are equal */
+Uint
+compareMkstrptr(Uint a, Uint b, Uint depth,
+    void *data, void* info)
+{
+  char **strings = (char**) data;
+  char lhs = strings[a][depth];
+  char rhs = strings[b][depth];
+
+  return (lhs > rhs) ? 1 : ((lhs < rhs) ? 2 : 0);
+}
+
+
+/* multikey comparator for positions within one character array: compares
+ * data[a+depth] with data[b+depth]; 1 if the former is larger, 2 if it is
+ * smaller, 0 if both are equal */
+Uint
+compareMkstr(Uint a, Uint b, Uint depth,
+    void *data, void* info)
+{
+  char *text = (char*) data;
+  char lhs = text[a+depth];
+  char rhs = text[b+depth];
+
+  return (lhs > rhs) ? 1 : ((lhs < rhs) ? 2 : 0);
+}
+
+/*--------------------------------- vecswap ----------------------------------
+ *
+ * swaps the n elements of x starting at i with the n elements starting
+ * at j, pair by pair (needed by quickSortMultiKey); nothing happens for
+ * n <= 0
+ *
+ */
+
+void
+vecswap(int i, int j, int n, Uint *x)
+{
+  int done;
+
+  for(done=0; done < n; done++) {
+    SWAPUINT(x, i, j);
+    i++;
+    j++;
+  }
+}
+
+
+/*---------------------------- quickSortMultikey -----------------------------
+ *
+ * performs a mulitkey quicksort Bentley Sedgewick style
+ *
+ * Implementation of programm ssort1, as described in:
+ * Fast Algorithms for Sorting and Searching Strings
+ * Proc. of the ACM-SIAM Symposium on Discrete Algorithms,
+ * pages 360-369. http://www.cs.princeton.edu/~rs/strings/
+ *
+ * toSort is not modified; the function returns a newly allocated array of
+ * size indices ordered according to cmp (NULL if size is 0), owned by the
+ * caller. cmp(i, j, depth, toSort, info) compares the keys of the elements
+ * i and j at position depth and returns 2 if i is smaller, 1 if it is
+ * larger and 0 on equality. Elements whose key at the current depth equals
+ * the key of sentinel are not sorted any deeper. The recursion of ssort1 is
+ * replaced by a stack of (offset, length, depth) work items.
+ *
+ */
+
+Uint *
+quickSortMultikey (void *space, void* toSort, Uint size,
+    Uint (*cmp)(Uint, Uint, Uint, void *, void*),
+    Uint sentinel, void *info)
+{
+  Sint a, b, c, d, v, n, r;
+  TripleSint ins, *top;
+  Uint *sorted = NULL, offset;
+  Uint depth = 0;
+  VStack vstack;
+
+
+  if (size == 0) return NULL;
+
+  sorted = ALLOCMEMORY(space, NULL, Uint, size);
+
+  for (r=0; r < size; r++) sorted[r]=r;
+  bl_vstackInit(&vstack, 100, sizeof(TripleSint));
+  n = size;
+  offset=0;
+
+  while (1) {
+    /* random pivot, moved to the front of the current range */
+    a = rand() % n;
+    SWAPUINT(sorted, offset, a+offset);
+    v = sorted[offset];
+    a = b = 1;
+    c = d = n-1;
+
+    /* three way partition; keys equal to the pivot are parked at both
+     * ends of the range */
+    for(;;) {
+
+      while(b<=c&&((r=cmp(sorted[b+offset],v,depth,toSort,info))==2||r==0))
+      {
+
+        if (r==0) {
+          SWAPUINT(sorted, a+offset, b+offset);
+          a++;
+        }
+        b++;
+      }
+      while(b<=c&&((r=cmp(sorted[c+offset],v,depth,toSort,info))==1||r==0))
+      {
+
+
+        if (r==0) {
+          SWAPUINT(sorted, c+offset, d+offset);
+          d--;
+        }
+        c--;
+      }
+      if (b > c) break;
+      SWAPUINT(sorted, b+offset, c+offset);
+      b++;
+      c--;
+    }
+    /* move the parked equal keys into the middle */
+    r = MIN(a, (b-a));
+    vecswap(offset, (b-r)+offset, r, sorted);
+    r = MIN((d-c), (n-d-1));
+    vecswap(b+offset, (n-r)+offset, r, sorted);
+    /*sort lesser*/
+    r = b-a;
+    if (r > 1) {
+      ins.a = offset;
+      ins.b = r;
+      ins.c = depth;
+      bl_vstackPush(&vstack, &ins);
+    }
+    /*sort equal*/
+    if ((a+n-d-1) > 1 && cmp(sorted[r+offset], sentinel, depth, toSort, info) != 0)
+      /*if (r > 1 && sorted[r+offset]!=sentinel)*/
+    {
+      ins.a = r+offset;
+      ins.b = (a+n-d-1);
+      ins.c = depth+1;
+      bl_vstackPush(&vstack, &ins);
+    }
+    /*sort greater*/
+    r=d-c;
+    if (r > 1) {
+      ins.a = (n-r)+offset;
+      ins.b = r;
+      ins.c = depth;
+      bl_vstackPush(&vstack, &ins);
+    }
+
+    if (!bl_vstackIsEmpty(&vstack)){
+      top = (TripleSint *) bl_vstackPop(&vstack, NULL);
+      ins = *top;
+      /* release the popped element as quickSort does */
+      free(top);
+      offset = ins.a;
+      /* the next round works on the popped range, not on the whole array */
+      n = ins.b;
+      depth = ins.c;
+    } else {
+      break;
+    }
+  }
+  bl_vstackDestruct(&vstack, NULL);
+  return sorted;
+}
+
+/* qsort(3) comparator for PairUint elements: ascending by member a */
+int
+cmp_PairUint_qsort(const void* a, const void* b) {
+  PairUint *lhs = (PairUint*) a;
+  PairUint *rhs = (PairUint*) b;
+
+  return (lhs->a > rhs->a) - (lhs->a < rhs->a);
+}
+
+/* bsearch(3) comparator: a points to a Uint key, b to a PairUint element
+ * that is matched by its member a */
+int
+cmp_PairUint_bsearch(const void* a, const void* b) {
+  Uint *wanted = (Uint*) a;
+  PairUint *elem = (PairUint*) b;
+
+  return (*wanted > elem->a) - (*wanted < elem->a);
+}
+
+/* qsort(3) comparator for PairLSint elements: ascending by member a */
+int
+cmp_PairLSint_qsort(const void* a, const void* b) {
+  PairLSint *lhs = (PairLSint*) a;
+  PairLSint *rhs = (PairLSint*) b;
+
+  return (lhs->a > rhs->a) - (lhs->a < rhs->a);
+}
+
+/* qsort(3) comparator for PairSint elements: ascending by member a */
+int
+cmp_PairSint_qsort(const void* a, const void* b) {
+  PairSint *lhs = (PairSint*) a;
+  PairSint *rhs = (PairSint*) b;
+
+  return (lhs->a > rhs->a) - (lhs->a < rhs->a);
+}
+
+/* bsearch(3) comparator: a points to a Lint key, b to a PairLSint element
+ * that is matched by its member a */
+int
+cmp_PairLSint_bsearch(const void* a, const void* b) {
+  Lint *wanted = (Lint*) a;
+  PairLSint *elem = (PairLSint*) b;
+
+  return (*wanted > elem->a) - (*wanted < elem->a);
+}
+
+/* bsearch(3) comparator: a points to an int key, b to a PairSint element
+ * that is matched by its member a */
+int
+cmp_PairSint_bsearch(const void* a, const void* b) {
+  int *wanted = (int*) a;
+  PairSint *elem = (PairSint*) b;
+
+  return (*wanted > elem->a) - (*wanted < elem->a);
+}
+
diff --git a/segemehl/libs/sort.h b/segemehl/libs/sort.h
new file mode 100644
index 0000000..baea01a
--- /dev/null
+++ b/segemehl/libs/sort.h
@@ -0,0 +1,50 @@
+#ifndef SORT_H
+#define SORT_H
+
+/*
+ * sort.h
+ * declarations for various sorting algorithms
+ *
+ * @author Steve Hoffmann
+ * @date Mon 27 Nov 2006
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: sort.h 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/sort.h $
+ */
+ #include "basic-types.h"
+
+
+ Uint cmp_dbl(Uint, Uint, void *, void *);
+ Uint cmp_flt(Uint, Uint, void *, void *);
+ Uint cmp_int(Uint, Uint, void *, void *);
+ Uint cmp_int_bin (Uint a, void *, void *, void *);
+ Uint cmp_Uint_bin (Uint, void *, void *, void *);
+ int cmp_Uint_qsort(const void *a, const void *b);
+ int cmp_int_qsort(const void *a, const void *b);
+ int cmp_dbl_qsort(const void *a, const void *b);
+ int cmp_char_qsort(const void *a, const void *b);
+
+ Uint cmp_Uint_quickSort(Uint i, Uint j, void *arr, void *nfo);
+
+ Uint binarySearch_left(void *, Uint, void*, Uint(*cmp)(Uint,void *, void *, void *), void *);
+ Uint binarySearch_right(void *, Uint, void*, Uint(*cmp)(Uint,void *, void *, void *), void *);
+ Uint binarySearch_l(void *, Uint, void*, Uint(*cmp)(Uint,void *, void *, void *), void *);
+ Uint binarySearch_range(void *, Uint, void*, Uint(*cmp)(Uint,void *, void *, void *), void *);
+ Uint binarySearch(void *, Uint, void*, Uint(*cmp)(Uint,void *, void *, void *), void *);
+ Uint *quickSort(void *, void *, Uint, Uint (*cmp)(Uint, Uint, void *, void *), void *);
+ Uint compareMkstr(Uint, Uint, Uint depth, void *, void *);
+ Uint compareMkstrptr(Uint, Uint, Uint depth, void *, void *);
+ Uint *quickSortMultikey(void *, void*, Uint, Uint (*cmp)(Uint, Uint, Uint, void*, void*), Uint, void*);
+ int cmp_PairUint_qsort(const void*, const void*);
+ int cmp_PairUint_bsearch(const void*, const void*);
+ int cmp_PairSint_qsort(const void*, const void*);
+ int cmp_PairSint_bsearch(const void*, const void*);
+ int cmp_PairLSint_bsearch(const void* a, const void* b);
+ int cmp_PairLSint_qsort(const void* a, const void* b);
+#endif
+
diff --git a/segemehl/libs/splicesites.c b/segemehl/libs/splicesites.c
new file mode 100644
index 0000000..f489a53
--- /dev/null
+++ b/segemehl/libs/splicesites.c
@@ -0,0 +1,1251 @@
+
+/*
+ * splicesites.c
+ *
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 06/15/2011 10:56:53 PM CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include <math.h>
+#include <time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "sort.h"
+#include "alignment.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "matfile.h"
+#include "bitVector.h"
+#include "info.h"
+#include "vtprogressbar.h"
+#include "fileio.h"
+#include "matchfilesfields.h"
+#include "matchfiles.h"
+#include "debug.h"
+#include "evalmatchfiles.h"
+#include "list.h"
+#include "biofiles.h"
+#include "splicesites.h"
+#include "matepairs.h"
+
+/*--------------------- bl_matchfileGetDistantSplitSites ---------------------
+ *
+ * @brief  collect the distinct distant sites (edge position and edge
+ *         chromosome index) that the splits of one cross section point to.
+ *         Only splits whose edgetype equals 'type' are looked at; with
+ *         type 'N' every split is looked at. For each distinct site the
+ *         number of splits, the acceptor and donor counts (edgetype 'A'
+ *         counts as acceptor, everything else as donor) and the number of
+ *         splits flagged as trans are accumulated.
+ * @param  space     handed to ALLOCMEMORY
+ * @param  cs        cross section holding the splits
+ * @param  pos       stored in the pos field of every returned record
+ * @param  cidx      stored in the cidx field of every returned record
+ * @param  type      edge type to select, or 'N' for all
+ * @param  noofsites receives the number of returned records
+ * @param  checkptr  always set to 0
+ * @return array with *noofsites records, NULL if no split was selected.
+ *         The array is allocated here and has to be released by the caller.
+ * @author Steve Hoffmann
+ *
+ */
+
+distsplitsites_t*
+bl_matchfileGetDistantSplitSites (void *space, matchfileCross_t *cs, Uint pos,
+    Uint cidx, char type, Uint *noofsites, Uint *checkptr)
+{
+  distsplitsites_t *found = NULL;
+  matchfileSplit_t *split;
+  Uint nfound = 0, splitno, slot;
+
+  for(splitno = 0; splitno < cs->noofsplits; splitno++) {
+    split = &cs->splits[splitno];
+
+    /* skip splits of another type unless all types were requested */
+    if(type != 'N' && split->edgetype != type) {
+      continue;
+    }
+
+    /* linear scan: was this distant site recorded before? */
+    slot = 0;
+    while(slot < nfound && (found[slot].distpos != split->edge ||
+          found[slot].distcidx != split->edgechridx)) {
+      slot++;
+    }
+
+    if(slot == nfound) {
+      /* first occurrence: append a record with all counters reset */
+      found = ALLOCMEMORY(space, found, distsplitsites_t, nfound+1);
+      found[slot].cs = cs;
+      found[slot].pos = pos;
+      found[slot].cidx = cidx;
+      found[slot].distpos = split->edge;
+      found[slot].distcidx = split->edgechridx;
+      found[slot].noofsplits = 0;
+      found[slot].acceptor = 0;
+      found[slot].donor = 0;
+      found[slot].transsplits = 0;
+      found[slot].seen = 0;
+      nfound++;
+    }
+
+    /* account for the current split in its record */
+    if(split->edgetype == 'A') {
+      found[slot].acceptor++;
+    } else {
+      found[slot].donor++;
+    }
+
+    if(split->trans) {
+      found[slot].transsplits++;
+    }
+
+    found[slot].noofsplits++;
+  }
+
+  *checkptr = 0;
+  *noofsites = nfound;
+
+  return found;
+}
+
+
+
+
+
+/*----------------------- bl_matchfileDistSplitSiteCmp -----------------------
+ *
+ * @brief  comparison callback for distant split sites kept in a List. The
+ *         list element addressed by bl_listGetCur(list, no) is compared to
+ *         elem, first by distcidx and then by distpos.
+ * @param  no    number of the list element, translated by bl_listGetCur
+ * @param  list  the List whose data array holds distsplitsites_t records
+ * @param  elem  the distsplitsites_t to compare against
+ * @param  nfo   not used
+ * @return 2 if the list element is smaller than elem, 1 if it is larger,
+ *         0 if both keys are equal
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileDistSplitSiteCmp (Uint no, void *list, void *elem, void *nfo)
+{
+  List *lst = (List*) list;
+  distsplitsites_t *key = (distsplitsites_t*) elem;
+  distsplitsites_t *cand;
+  int at;
+
+  at = bl_listGetCur(list, no);
+  cand = &((distsplitsites_t*) lst->data)[at];
+
+  /* chromosome index is the primary key */
+  if(cand->distcidx != key->distcidx) {
+    return (cand->distcidx < key->distcidx) ? 2 : 1;
+  }
+
+  /* position is the secondary key */
+  if(cand->distpos != key->distpos) {
+    return (cand->distpos < key->distpos) ? 2 : 1;
+  }
+
+  return 0;
+}
+
+
+
+/*---------------------- bl_matchfileDestructSpliceSite ----------------------
+ *
+ * @brief  release the arrays a splice site points to: the per-split-site
+ *         arrays, the array of cross section pointers and all left and
+ *         right edge arrays. Neither the splicesite_t itself nor its
+ *         chromname is released here.
+ * @param  space handed to FREEMEMORY
+ * @param  s     splice site whose arrays are released
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileDestructSpliceSite (void *space, splicesite_t *s)
+{
+  /* arrays with one entry per split site of the cluster */
+  FREEMEMORY(space, s->cs);
+  FREEMEMORY(space, s->splitsites);
+  FREEMEMORY(space, s->noofsplits);
+
+  /* arrays with one entry per left edge */
+  FREEMEMORY(space, s->leftsiteidx);
+  FREEMEMORY(space, s->leftedgeweight);
+  FREEMEMORY(space, s->leftedgeacceptor);
+  FREEMEMORY(space, s->leftedgedonor);
+  FREEMEMORY(space, s->lefttranssplits);
+  FREEMEMORY(space, s->leftmatesupport);
+
+  /* arrays with one entry per right edge */
+  FREEMEMORY(space, s->rightsiteidx);
+  FREEMEMORY(space, s->rightedgeweight);
+  FREEMEMORY(space, s->rightedgeacceptor);
+  FREEMEMORY(space, s->rightedgedonor);
+  FREEMEMORY(space, s->righttranssplits);
+  FREEMEMORY(space, s->rightmatesupport);
+}
+
+
+/*---------------------- bl_matchfileDestructSpliceMap -----------------------
+ *
+ * @brief  release everything a splice map points to: all splice sites,
+ *         the 2*interval+1 rows of both character histograms, the
+ *         histogram arrays and the site array. The splicemap_t itself is
+ *         not released here.
+ * @param  space handed to FREEMEMORY
+ * @param  sm    splice map whose members are released
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileDestructSpliceMap (void *space, splicemap_t *sm)
+{
+  Uint site, row;
+  Uint nrows = (sm->interval)*2+1;
+
+  /* the sites first, the array holding them is released below */
+  site = 0;
+  while(site < sm->noofsplicesites) {
+    bl_matchfileDestructSpliceSite(space, &sm->map[site]);
+    site++;
+  }
+
+  /* rows of the donor and acceptor character histograms */
+  row = 0;
+  while(row < nrows) {
+    FREEMEMORY(space, sm->charhist[row]);
+    FREEMEMORY(space, sm->charhistA[row]);
+    row++;
+  }
+
+  FREEMEMORY(space, sm->charhist);
+  FREEMEMORY(space, sm->charhistA);
+  FREEMEMORY(space, sm->histogram);
+  FREEMEMORY(space, sm->chrcnt);
+  FREEMEMORY(space, sm->map);
+}
+
+
+/*------------------------ bl_matchfileInitSpliceSite ------------------------
+ *
+ * @brief  reset the edge bookkeeping of a splice site: both edge counters
+ *         are set to 0 and all left and right edge arrays to NULL. No
+ *         other member of the site is touched.
+ * @param  s splice site to reset
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileInitSpliceSite (splicesite_t *s)
+{
+  /* no edges yet */
+  s->noofleftsites = 0;
+  s->noofrightsites = 0;
+
+  /* indices of the linked sites */
+  s->leftsiteidx = NULL;
+  s->rightsiteidx = NULL;
+
+  /* per-edge counters */
+  s->leftedgeweight = NULL;
+  s->rightedgeweight = NULL;
+  s->leftedgeacceptor = NULL;
+  s->rightedgeacceptor = NULL;
+  s->leftedgedonor = NULL;
+  s->rightedgedonor = NULL;
+  s->lefttranssplits = NULL;
+  s->righttranssplits = NULL;
+  s->leftmatesupport = NULL;
+  s->rightmatesupport = NULL;
+}
+
+
+/*------------------------- bl_matchfileInitLeftSite -------------------------
+ *
+ * @brief  append one left edge to a splice site: every left edge array is
+ *         grown to noofleftsites+1 entries, entry h of the index array is
+ *         set to pos, entry h of all counter arrays is set to 0 and
+ *         noofleftsites is incremented.
+ * @param  s   splice site that receives the edge
+ * @param  h   slot that is initialised; the only caller visible in this
+ *             file passes h == s->noofleftsites
+ * @param  pos index of the linked splice site, stored in leftsiteidx[h]
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileInitLeftSite (splicesite_t *s, Uint h, Uint pos)
+{
+  /*
+   * NOTE(review): 'space' is neither a parameter nor a local of this
+   * function. The calls below only compile if ALLOCMEMORY does not expand
+   * its first argument or if a file scope 'space' exists - confirm.
+   */
+
+  /* grow all arrays first */
+  s->leftsiteidx = ALLOCMEMORY(space, s->leftsiteidx, Uint,
+      s->noofleftsites+1);
+  s->leftedgeweight = ALLOCMEMORY(space, s->leftedgeweight, Uint,
+      s->noofleftsites+1);
+  s->leftedgeacceptor = ALLOCMEMORY(space, s->leftedgeacceptor, uint16_t,
+      s->noofleftsites+1);
+  s->leftedgedonor = ALLOCMEMORY(space, s->leftedgedonor, uint16_t,
+      s->noofleftsites+1);
+  s->lefttranssplits = ALLOCMEMORY(space, s->lefttranssplits, uint16_t,
+      s->noofleftsites+1);
+  s->leftmatesupport = ALLOCMEMORY(space, s->leftmatesupport, uint16_t,
+      s->noofleftsites+1);
+
+  /* then initialise slot h */
+  s->leftsiteidx[h] = pos;
+  s->leftedgeweight[h] = 0;
+  s->leftedgeacceptor[h] = 0;
+  s->leftedgedonor[h] = 0;
+  s->lefttranssplits[h] = 0;
+  s->leftmatesupport[h] = 0;
+
+  s->noofleftsites++;
+}
+
+
+/*------------------------ bl_matchfileUpdatLeftSite -------------------------
+ *
+ * @brief  mirror left edge h of the site 'right' as a new right edge of
+ *         the site 'left'. The mate support of the edge is looked up with
+ *         bl_matchfileSearchMateLink and stored on both sides. Weight and
+ *         trans split count are copied; the acceptor and donor counts are
+ *         copied crosswise (right edge acceptor = left edge donor and
+ *         vice versa).
+ * @param  space handed to ALLOCMEMORY and bl_matchfileSearchMateLink
+ * @param  map   split map providing the mate map
+ * @param  left  site that receives the new right edge
+ * @param  right site owning left edge h
+ * @param  h     number of the left edge of 'right'
+ * @param  k     index stored in left->rightsiteidx for the new edge
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileUpdatLeftSite (void *space, splitmap_t *map,
+    splicesite_t *left, splicesite_t *right, Uint h, Uint k)
+{
+  Uint slot;
+
+  right->leftmatesupport[h] = bl_matchfileSearchMateLink(space,
+      &(map->matemap), right->median, right->cidx,
+      left->median, left->cidx);
+
+  /* the new edge goes to the end of the right edge arrays */
+  slot = left->noofrightsites;
+
+  left->rightsiteidx = ALLOCMEMORY(space, left->rightsiteidx, Uint,
+      left->noofrightsites+1);
+  left->rightedgeweight = ALLOCMEMORY(space, left->rightedgeweight, Uint,
+      left->noofrightsites+1);
+  left->rightedgeacceptor = ALLOCMEMORY(space, left->rightedgeacceptor,
+      uint16_t, left->noofrightsites+1);
+  left->rightedgedonor = ALLOCMEMORY(space, left->rightedgedonor,
+      uint16_t, left->noofrightsites+1);
+  left->righttranssplits = ALLOCMEMORY(space, left->righttranssplits,
+      uint16_t, left->noofrightsites+1);
+  left->rightmatesupport = ALLOCMEMORY(space, left->rightmatesupport,
+      uint16_t, left->noofrightsites+1);
+
+  left->rightsiteidx[slot] = k;
+  left->rightedgeweight[slot] = right->leftedgeweight[h];
+  /* acceptor and donor swap roles when seen from the other end */
+  left->rightedgeacceptor[slot] = right->leftedgedonor[h];
+  left->rightedgedonor[slot] = right->leftedgeacceptor[h];
+  left->righttranssplits[slot] = right->lefttranssplits[h];
+  left->rightmatesupport[slot] = right->leftmatesupport[h];
+
+  left->noofrightsites++;
+}
+
+
+/*-------------------------- bl_matchfileSpliceMap ---------------------------
+ *
+ * @brief condense the split map to a splice map.
+ * Consecutive entries of the split map are clustered: a new cluster
+ * starts when the chromosome index changes or when the position is
+ * more than 'interval' behind the first entry of the current cluster.
+ * A cluster with at least 'minsplitno' splits becomes a splice site
+ * whose median is the position at which half of the splits are
+ * reached. Distant split sites of accepted clusters are kept in a
+ * sorted list; each new site is linked to earlier sites whose distant
+ * positions fall into the window
+ * [median-ceil(interval/2), median-ceil(interval/2)+interval]
+ * on the same chromosome. Reference characters around the median are
+ * counted in charhist (type 'D') or charhistA (type 'A').
+ * @param space handed to the allocation macros and helpers
+ * @param map the split map (pos, cidx and cs arrays with noofsplits entries)
+ * @param interval maximal extent of a cluster and width of the link window
+ * @param minsplitno minimal number of splits a cluster needs to be kept
+ * @return newly allocated splice map
+ * @author Steve Hoffmann
+ *
+ */
+
+splicemap_t*
+bl_matchfileSpliceMap (void *space, splitmap_t *map, Uint interval,
+ Uint minsplitno)
+{
+
+ Uint i, j, h, sum=0, last=0, k=0, reflen;
+ distsplitsites_t **distsites, *iter, cursplice;
+ Uint *noofdistsites;
+ splicesite_t *s = NULL;
+ splicemap_t *sm;
+ Uint totalsplits, left, cur;
+ Uint atypes, dtypes;
+ Uint pstrands, mstrands;
+
+
+ Uint transsplits;
+ matchfileCross_t **cs;
+ Uint *splitsites;
+ Uint *noofsplits;
+ List distlist;
+
+ Uint idx, check;
+ char *ref = NULL;
+
+ sm = ALLOCMEMORY(space, NULL, splicemap_t, 1);
+ sm->histogram = ALLOCMEMORY(space, NULL, Uint, interval*2+1);
+ sm->chrcnt = ALLOCMEMORY(space, NULL, Uint, interval*2+1);
+ sm->charhist = ALLOCMEMORY(space, NULL, Uint*, interval*2+1);
+ sm->charhistA = ALLOCMEMORY(space, NULL, Uint*, interval*2+1);
+
+ /* NOTE(review): histogram and chrcnt are zeroed here but not updated
+  * anywhere in this function; the code filling histogram is commented
+  * out further below */
+ memset(sm->histogram, 0, sizeof(Uint)*(interval*2+1));
+ memset(sm->chrcnt, 0, sizeof(Uint)*(interval*2+1));
+
+
+ /* NOTE(review): each row has 255 counters and is later indexed with a
+  * character of the reference cast to int - confirm that no value outside
+  * [0,254] can occur */
+ for(i=0; i < interval*2+1; i++) {
+ sm->charhist[i] = ALLOCMEMORY(space, NULL, Uint, 255);
+ sm->charhistA[i] = ALLOCMEMORY(space, NULL, Uint, 255);
+ memset(sm->charhist[i], 0, 255*sizeof(Uint));
+ memset(sm->charhistA[i], 0, 255*sizeof(Uint));
+ }
+
+ bl_listInit(&distlist, 1000, sizeof(distsplitsites_t));
+
+ /*
+ * iter all splits
+ */
+
+#ifdef DBGSPLIT
+ DBG("noofsplits:%d\n", map->noofsplits);
+#endif
+
+ /* the loop runs one step past the last split (i == map->noofsplits) to
+  * flush the final cluster; the || chain below keeps map->cidx[i] and
+  * map->pos[i] from being read in that step */
+ for(i=0; i <= map->noofsplits; i++) {
+
+#ifdef DBGSPLIT
+ DBG("split:%d\n", i);
+#endif
+
+ /*
+ * enter splice site in case
+ * a. first split seen just to set last
+ * b. new chromosome
+ * c. current split i \notin [last,last+interval]
+ * d. i == map->noofsplits is last iteration to
+ * update (nothing shall be inserted to list)
+ */
+
+ if (i==0 || i == map->noofsplits || map->cidx[i] != map->cidx[last] ||
+ map->pos[i] > map->pos[last]+interval) {
+
+
+ if(i > 0) {
+
+ /*
+ * iter all splits in interval [last,i[
+ * and collect them
+ */
+
+ splitsites = ALLOCMEMORY(space, NULL, Uint, i-last+1);
+ noofsplits = ALLOCMEMORY(space, NULL, Uint, i-last+1);
+ distsites = ALLOCMEMORY(space, NULL, distsplitsites_t*, i-last+1);
+ noofdistsites = ALLOCMEMORY(space, NULL, Uint, i-last+1);
+
+ /* NOTE(review): cs is a matchfileCross_t**, so the element type given
+  * here presumably should be matchfileCross_t* - confirm */
+ cs = ALLOCMEMORY(space, NULL, matchfileCross_t**, i-last+1);
+ mstrands = 0;
+ pstrands = 0;
+ atypes = 0;
+ dtypes = 0;
+ totalsplits = 0;
+ transsplits = 0;
+
+ for(j=last; j < i; j++) {
+ cs[j-last] = &map->cs[j];
+ splitsites[j-last] = map->pos[j];
+ noofsplits[j-last] = map->cs[j].noofsplits;
+ totalsplits += map->cs[j].noofsplits;
+
+ /*
+ * store array of distant split sites in array of arrays
+ * and number in array
+ */
+
+ distsites[j-last] = bl_matchfileGetDistantSplitSites (space, &map->cs[j],
+ map->pos[j], map->cidx[j], 'N', &noofdistsites[j-last], &check);
+
+ /* tally strand, edge type and trans flag over all splits of the cluster;
+  * every strand other than '-' counts as plus, every edge type other
+  * than 'D' counts as acceptor */
+ for(h=0; h < map->cs[j].noofsplits; h++) {
+
+ if(map->cs[j].splits[h].strand == '-') {
+ mstrands++;
+ } else {
+ pstrands++;
+ }
+
+ if(map->cs[j].splits[h].edgetype == 'D') {
+ dtypes++;
+ } else {
+ atypes++;
+ }
+
+ if(map->cs[j].splits[h].trans) {
+ transsplits++;
+ }
+ }
+ }
+
+ /*
+ * proceed only if the splicesite qualifies
+ */
+
+ if(totalsplits >= minsplitno) {
+
+#ifdef DBGSPLIT
+ DBG("adding new site with %d splits (min:%d)\n", totalsplits, minsplitno);
+#endif
+
+ /*
+ * alloc new splice site
+ */
+
+ s = ALLOCMEMORY(space, s, splicesite_t, k+1);
+ bl_matchfileInitSpliceSite(&s[k]);
+
+ s[k].cidx = map->cidx[last];
+ s[k].chromname = bl_fastaGetDescription(map->set, s[k].cidx);
+ s[k].start = map->pos[last];
+ s[k].end = map->pos[i-1];
+ /* NOTE(review): the loop above fills i-last entries, the value stored
+  * here is i-last+1 (the allocated size) - confirm which one is meant */
+ s[k].noofsplitsites = i-last+1;
+ s[k].splitsites = splitsites;
+ s[k].noofsplits = noofsplits;
+ s[k].cs = cs;
+ s[k].totalsplits = totalsplits;
+ s[k].atypes = atypes;
+ s[k].dtypes = dtypes;
+ s[k].pstrands = pstrands;
+ s[k].mstrands = mstrands;
+ s[k].transsplits = transsplits;
+ /* majority vote; ties yield type 'D' and strand '+' */
+ s[k].type = (s[k].atypes > s[k].dtypes) ? 'A' : 'D';
+ s[k].strand = (s[k].mstrands > s[k].pstrands) ? '-' : '+';
+
+ /*
+ * get the median split
+ */
+
+ for(sum=0, j=last; j < i; j++) {
+ if(sum < s[k].totalsplits/2) {
+ sum += s[k].noofsplits[j-last];
+ }
+ if(sum >= s[k].totalsplits/2) break;
+ }
+
+ assert(j < i);
+ s[k].median = map->pos[j];
+
+#ifdef DBGSPLIT
+ DBG("new site has median %d\n", s[k].median);
+#endif
+
+ /*
+ * insert dist splits to sorted list
+ * only insert if distpos is still to come
+ * no order on cidx yet
+ * not the case if i == map->noofsplits
+ */
+
+ if(i < map->noofsplits) {
+ for(j=last; j < i; j++) {
+ for(h=0; h < noofdistsites[j-last]; h++) {
+ if(distsites[j-last][h].distcidx != map->cidx[last] ||
+ distsites[j-last][h].distpos >= map->pos[i]) {
+
+ distsites[j-last][h].splicesite = k;
+ bl_listBinarySearchInsert(&distlist, &distsites[j-last][h],
+ bl_matchfileDistSplitSiteCmp, NULL);
+ }
+ }
+ }
+ }
+
+#ifdef DBGSPLIT
+ DBG("checking list sort w/ %d elems.\n",bl_listSize(&distlist));
+ distsplitsites_t *elem;
+ Uint checkdistcidx = 0, checkdistpos = 0;
+ for(j=0; j < bl_listSize(&distlist); j++) {
+ elem = (distsplitsites_t*) bl_listGetElem(&distlist, bl_listGetCur(&distlist, j));
+ fprintf(stderr, "%d=%d->%d(%d)\n", j, elem->pos, elem->distpos, elem->distcidx);
+ assert(elem->distcidx > checkdistcidx || (elem->distcidx == checkdistcidx && elem->distpos >= checkdistpos));
+ checkdistpos = elem->distpos;
+ checkdistcidx = elem->distcidx;
+ }
+#endif
+
+ /*
+ * find and link the donor split sites
+ * note that from distpos interval/2 is substracted
+ * to searching the whole interval
+ */
+
+ cursplice.distpos = (s[k].median > (Uint)ceil((double)interval/2.0)) ?
+ s[k].median - (Uint)ceil((double)interval/2.0) : 0;
+ cursplice.distcidx = map->cidx[last];
+
+ left = binarySearch_left(&distlist, distlist.numofelem, &cursplice,
+ bl_matchfileDistSplitSiteCmp, NULL);
+
+#ifdef DBGSPLIT
+ DBG("search for distpos:%d in list\n", cursplice.distpos);
+ if(left != -1) {
+ elem = (distsplitsites_t*) bl_listGetElem(&distlist, bl_listGetCur(&distlist, left));
+ if(elem) DBG("first elem found %d: %d->%d\n", left, elem->pos, elem->distpos);
+ }
+#endif
+ left = bl_listGetCur(&distlist, left);
+ iter = (distsplitsites_t*) distlist.data;
+
+
+ /* left is a Uint, so the -1 sentinel is compared after conversion to
+  * unsigned; the walk follows the next links of the list nodes */
+ while(left != -1) {
+#ifdef DBGSPLIT
+ DBG("search for distpos:%d in list\n", cursplice.distpos);
+ if(left != -1) {
+ DBG("found:%d distcidx:%d distpos:%d [%d,%d]\n", left, iter[left].distcidx, iter[left].distpos);
+ }
+#endif
+
+ if(iter[left].distcidx == map->cidx[last] &&
+ iter[left].distpos <= cursplice.distpos+interval &&
+ iter[left].distpos >= cursplice.distpos) {
+#ifdef DBGSPLIT
+ DBG("entered:%d in [%d,%d]\n", left, cursplice.distpos-interval, cursplice.distpos+interval);
+#endif
+
+
+ /* reuse the left edge to the same earlier site if there is one */
+ for(h=0; h < s[k].noofleftsites; h++) {
+ if(iter[left].splicesite == s[k].leftsiteidx[h])
+ break;
+ }
+
+ if(h == s[k].noofleftsites) {
+ bl_matchfileInitLeftSite (&s[k], h, iter[left].splicesite);
+ }
+
+ s[k].leftedgeweight[h] += iter[left].noofsplits;
+ s[k].leftedgeacceptor[h] += iter[left].acceptor;
+ s[k].leftedgedonor[h] += iter[left].donor;
+ s[k].lefttranssplits[h] += iter[left].transsplits;
+
+ iter[left].seen = 1;
+
+ /* cur is only needed by the unlink code that is commented out below */
+ cur = left;
+ left = distlist.nodes[left].next;
+ //elem = bl_listUnlink(&distlist, cur, NULL);
+ //FREEMEMORY(space, elem);
+
+ } else {
+#ifdef DBGSPLIT
+ DBG("not entered:%d not in [%d,%d]\n", left, cursplice.distpos, cursplice.distpos+interval);
+#endif
+ break;
+ }
+ }
+
+ /* now update the donor site */
+
+ for(h=0; h < s[k].noofleftsites; h++) {
+ idx = s[k].leftsiteidx[h];
+ bl_matchfileUpdatLeftSite (space, map, &s[idx], &s[k], h, k);
+ }
+
+ idx = bl_fastxFindIDIdx(s[k].chromname, map->set);
+ ref = bl_fastaGetSequence(map->set, idx);
+ reflen = bl_fastaGetSequenceLength(map->set, idx);
+
+ /*
+ * just histograms
+ */
+
+ /* row 'interval' is the median itself, rows interval-h and interval+h
+  * the positions h before and behind it in read direction; for strand
+  * '-' the complemented characters are counted and the offsets mirrored.
+  * At most 10 positions on each side are counted. */
+ if(s[k].type == 'D') {
+ if(s[k].strand=='+') {
+ sm->charhist[interval][(int)ref[s[k].median]]++;
+ } else {
+ sm->charhist[interval][(int)charComplementChar(ref[s[k].median])]++;
+ }
+ for(h=1; h < MIN(interval+1, 11); h++) {
+ if(s[k].strand == '+') {
+ if (s[k].median >= h)
+ sm->charhist[interval-h][(int)ref[s[k].median-h]]++;
+ if (s[k].median + h < reflen)
+ sm->charhist[interval+h][(int)ref[s[k].median+h]]++;
+ } else {
+ if (s[k].median + h < reflen)
+ sm->charhist[interval-h][(int)charComplementChar(ref[s[k].median+h])]++;
+ if (s[k].median >= h)
+ sm->charhist[interval+h][(int)charComplementChar(ref[s[k].median-h])]++;
+ }
+ }
+ }
+
+
+ if(s[k].type == 'A') {
+ if(s[k].strand=='+') {
+ sm->charhistA[interval][(int)ref[s[k].median]]++;
+ } else {
+ sm->charhistA[interval][(int)charComplementChar(ref[s[k].median])]++;
+ }
+ for(h=1; h < MIN(interval+1, 11); h++) {
+ if(s[k].strand == '+') {
+ if (s[k].median >= h)
+ sm->charhistA[interval-h][(int)ref[s[k].median-h]]++;
+ if (s[k].median + h < reflen)
+ sm->charhistA[interval+h][(int)ref[s[k].median+h]]++;
+ } else {
+ if (s[k].median + h < reflen)
+ sm->charhistA[interval-h][(int)charComplementChar(ref[s[k].median+h])]++;
+ if (s[k].median >= h)
+ sm->charhistA[interval+h][(int)charComplementChar(ref[s[k].median-h])]++;
+ }
+ }
+ }
+
+/*
+ sm->histogram[interval] += s[k].noofsplits[j-last];
+ sum = s[k].noofsplits[j-last];
+ for(h=last; h < i; h++) {
+ if(h > j) {
+ sum += s[k].noofsplits[h-last];
+ if (h-j < interval + 1){
+ sm->histogram[interval+(h-j)] += s[k].noofsplits[h-last];
+// chrcnt[interval+(h-j)]++;
+// if (s[k].median + (h-j) < reflen) charhist[interval+(h-j)][(int)ref[s[k].median+(h-j)]]++;
+ }
+ }
+ if(h < j) {
+
+ sum += s[k].noofsplits[h-last];
+ if (j-h < interval + 1){
+ sm->histogram[interval-(j-h)] += s[k].noofsplits[h-last];
+// chrcnt[interval-(j-h)]++;
+// if (s[k].median >= (j-h)) charhist[interval-(j-h)][(int)ref[s[k].median-(j-h)]]++;
+ }
+ }
+ }*/
+ k++;
+ } else {
+
+ /* cluster rejected: the arrays were not handed to a site, drop them */
+ FREEMEMORY(space, splitsites);
+ FREEMEMORY(space, noofsplits);
+ FREEMEMORY(space, cs);
+ }
+
+ /*
+ * since distsites are stored in a list -> destruct array
+ */
+
+ for(j=last; j < i; j++) {
+ if(distsites[j-last])
+ FREEMEMORY(space, distsites[j-last]);
+ }
+ FREEMEMORY(space, distsites);
+ FREEMEMORY(space, noofdistsites);
+ }
+
+ last = i;
+ }
+ }
+
+
+ /* NOTE(review): this walk over the list only assigns iter and has no
+  * other visible effect - presumably a leftover */
+ cur = distlist.first;
+ while(cur != -1) {
+ iter = (distsplitsites_t *)bl_listGetElem(&distlist, cur);
+ cur = distlist.nodes[cur].next;
+ }
+
+ bl_listDestruct(&distlist, NULL);
+
+ sm->map = s;
+ sm->noofsplicesites = k;
+ sm->interval = interval;
+
+
+ return sm;
+}
+
+
+
+
+/*----------------------- bl_matchfileDestructSplitMap -----------------------
+ *
+ * @brief  release the position, chromosome index and cross section arrays
+ *         of a split map (the cross sections are destructed first) and
+ *         destruct its mate map. The three array pointers are NULL
+ *         afterwards; the splitmap_t itself is not released.
+ * @param  space    handed to FREEMEMORY and the destructors
+ * @param  splitmap split map to clean up
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileDestructSplitMap (void *space, splitmap_t *splitmap)
+{
+  if(splitmap->pos) {
+    FREEMEMORY(space, splitmap->pos);
+  }
+  splitmap->pos = NULL;
+
+  if(splitmap->cidx) {
+    FREEMEMORY(space, splitmap->cidx);
+  }
+  splitmap->cidx = NULL;
+
+  if(splitmap->cs) {
+    /* the cross sections own memory of their own */
+    bl_matchfileDestructCross(space, splitmap->cs, splitmap->noofsplits);
+    FREEMEMORY(space, splitmap->cs);
+  }
+  splitmap->cs = NULL;
+
+  bl_matchfileDestructMateBinMap(space, &(splitmap->matemap));
+}
+
+
+/*------------------------- bl_matchfileInitSplitMap -------------------------
+ *
+ * @brief  initialise an empty split map: the given annotation track, match
+ *         files and fasta set are stored, the split arrays are set to NULL
+ *         with a count of 0 and the mate map is initialised.
+ * @param  space    handed to bl_matchfileInitMateBinMap
+ * @param  splitmap split map to initialise
+ * @param  bed      annotation track stored in the map
+ * @param  files    match files stored in the map
+ * @param  set      fasta set stored in the map
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileInitSplitMap (void *space, splitmap_t *splitmap, annotationtrack_t *bed,
+    matchfile_t **files, fasta_t *set)
+{
+  /* references handed in by the caller */
+  splitmap->bed = bed;
+  splitmap->files = files;
+  splitmap->set = set;
+
+  /* no splits recorded yet */
+  splitmap->noofsplits = 0;
+  splitmap->cs = NULL;
+  splitmap->pos = NULL;
+  splitmap->cidx = NULL;
+
+  bl_matchfileInitMateBinMap (space, &(splitmap->matemap));
+}
+
+
+/*------------------------ bl_matchfileAddSplitToMap -------------------------
+ *
+ * @brief  append one cross section to the split map: the three parallel
+ *         arrays cs, pos and cidx grow by one entry, the cross section is
+ *         copied with bl_matchfileCopyCross and the split counter is
+ *         incremented.
+ * @param  space handed to ALLOCMEMORY and bl_matchfileCopyCross
+ * @param  map   split map that receives the entry
+ * @param  cs    cross section to copy
+ * @param  cidx  chromosome index of the entry
+ * @param  pos   position of the entry
+ * @param  ref   not used
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileAddSplitToMap (void *space, splitmap_t *map, matchfileCross_t *cs,
+    Uint cidx, Uint pos, char ref)
+{
+  Uint slot = map->noofsplits;
+
+  map->cs = ALLOCMEMORY(space, map->cs, matchfileCross_t, slot+1);
+  bl_matchfileCopyCross(space, &map->cs[slot], cs);
+
+  map->pos = ALLOCMEMORY(space, map->pos, Uint, slot+1);
+  map->pos[slot] = pos;
+
+  map->cidx = ALLOCMEMORY(space, map->cidx, Uint, slot+1);
+  map->cidx[slot] = cidx;
+
+  map->noofsplits++;
+}
+
+
+
+/*---------------------------- bl_matchfileSplit ----------------------------
+ *
+ * @brief  callback that records one cross section in a split map. The
+ *         cross section is always added to the mate map; it is added to
+ *         the split arrays only if it has at least one split.
+ * @param  space handed to the helpers
+ * @param  fidx  not used
+ * @param  cidx  chromosome index of the cross section
+ * @param  pos   position of the cross section
+ * @param  cs    the cross section
+ * @param  ref   reference character, handed to the helpers
+ * @param  idx   not used
+ * @param  show  not used
+ * @param  nfo   the splitmap_t to fill
+ * @return 1 if nfo is not NULL; otherwise "No map" is written to stderr
+ *         and 0 is returned
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileSplit (void *space, Uint fidx, Uint cidx, Uint pos,
+    matchfileCross_t *cs,
+    char ref, matchfileindex_t *idx, unsigned char show, void *nfo)
+{
+  splitmap_t *map = (splitmap_t*) nfo;
+
+  /*handle groups!*/
+
+  if(!map) {
+    fprintf(stderr, "No map");
+    return 0;
+  }
+
+  bl_matchfileAddToMateBinMap (space, &(map->matemap), cs, cidx, pos, ref);
+
+  if(cs->noofsplits > 0) {
+    bl_matchfileAddSplitToMap(space, map, cs, cidx, pos, ref);
+  }
+
+  return 1;
+}
+
+/*------------------------------- printsplits --------------------------------
+ *
+ * @brief  print one line for a cross section with splits to stdout:
+ *         chromosome, position and number of splits, followed by one
+ *         name:position:count field per distinct split target. The name
+ *         is the first space delimited token of the description of the
+ *         target chromosome. Nothing is printed for a cross section
+ *         without splits.
+ * @param  space handed to the allocation macros
+ * @param  chr   name printed in the first column
+ * @param  pos   position printed in the second column
+ * @param  cs    cross section holding the splits
+ * @param  map   split map providing the fasta set for the descriptions
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+printsplits (void *space, char *chr, Uint pos, matchfileCross_t *cs,
+    splitmap_t *map)
+{
+  Uint splitno, desclen, t, ntargets = 0;
+  char *desc, *token, *owned;
+  matchfileSplit_t *split;
+  Uint *targetpos;
+  char **targetchr;
+  Uint *targetcnt;
+
+  if(!(cs->noofsplits > 0)) {
+    return;
+  }
+
+  printf("%s\t%d\t%d\t", chr, pos, cs->noofsplits);
+
+  /* there cannot be more distinct targets than splits */
+  targetchr = ALLOCMEMORY(space, NULL, char*, cs->noofsplits);
+  targetpos = ALLOCMEMORY(space, NULL, Uint, cs->noofsplits);
+  targetcnt = ALLOCMEMORY(space, NULL, Uint, cs->noofsplits);
+
+  memset(targetchr, 0, sizeof(char*)*cs->noofsplits);
+  memset(targetcnt, 0, sizeof(Uint)*cs->noofsplits);
+  memset(targetpos, 0, sizeof(Uint)*cs->noofsplits);
+
+  for(splitno = 0; splitno < cs->noofsplits; splitno++) {
+    split = &cs->splits[splitno];
+
+    /* private, terminated copy of the description of the target */
+    desclen = bl_fastaGetDescriptionLength(map->set, split->edgechridx);
+    desc = ALLOCMEMORY(space, NULL, char, desclen+1);
+    memmove(desc, bl_fastaGetDescription(map->set, split->edgechridx), desclen);
+    desc[desclen] = 0;
+
+    /* count the split for a target that is already known */
+    for(t = 0; t < ntargets; t++) {
+      if(targetchr[t] && !strcmp(targetchr[t], desc) &&
+          targetpos[t] == split->edge) {
+        targetcnt[t]++;
+        break;
+      }
+    }
+
+    if(t == ntargets) {
+      /* new target, the copy is kept */
+      targetchr[t] = desc;
+      targetpos[t] = split->edge;
+      targetcnt[t] = 1;
+      ntargets++;
+    } else {
+      FREEMEMORY(space, desc);
+    }
+  }
+
+  for(t = 0; t < ntargets; t++) {
+    owned = targetchr[t];
+    /* strtok cuts the copy at the first space */
+    token = strtok(targetchr[t], " ");
+    printf("%s:%d:%d\t", token, targetpos[t], targetcnt[t]);
+    FREEMEMORY(space, owned);
+  }
+
+  printf("\n");
+  FREEMEMORY(space, targetchr);
+  FREEMEMORY(space, targetcnt);
+  FREEMEMORY(space, targetpos);
+}
+
+/*------------------------------ printsplicebed ------------------------------
+ *
+ * @brief  print the edges of a splice map as two bed tracks. A right edge
+ *         whose partner is on the same chromosome and less than 200000
+ *         away goes to 'dev' as "splice", or to 'transdev' as
+ *         "strandsplice" if the edge has trans splits and passes the
+ *         weight thresholds. All other right edges and all left edges
+ *         whose partner is on another chromosome or at least 200000 away
+ *         go to 'transdev' as "distsplice" or "diststrandsplice", again
+ *         subject to the weight thresholds.
+ * @param  space      not used
+ * @param  sm         the splice map
+ * @param  minsplitno minimal edge weight for the thresholded records
+ * @param  title      name used in both track lines
+ * @param  dev        stream for regular splice records
+ * @param  transdev   stream for strand and distant splice records
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+printsplicebed(void *space, splicemap_t *sm, Uint minsplitno, char* title, FILE *dev, FILE *transdev) {
+  Uint i, j;
+  char *type;
+  /* with a fraction of 0.0 the relative weight thresholds always hold */
+  double totalsplitfraction = 0.0;
+  splicesite_t *site, *mate;
+
+  fprintf(dev, "track name=splicesites_%s description=\"splice sites %s\" useScore=0\n", title, title);
+  fprintf(transdev, "track name=transsplicesites_%s description=\"trans splice sites %s\" useScore=0\n", title, title);
+
+#ifdef DBGSPLIT
+  DBG("number of splice sites: %d\n", sm->noofsplicesites);
+#endif
+
+  for(i=0; i < sm->noofsplicesites; i++) {
+    site = &sm->map[i];
+
+    /* edges to sites that come later in the map */
+    for(j=0; j < site->noofrightsites; j++) {
+      mate = &sm->map[site->rightsiteidx[j]];
+
+      if(mate->cidx == site->cidx &&
+          mate->median - site->median < 200000) {
+
+        if(!site->righttranssplits[j]) {
+
+          fprintf(dev, "%s\t%d\t%d\t%s:%d:%d:%d\t%d\t%c\n",
+              site->chromname, site->median,
+              mate->median,
+              "splice",
+              site->rightedgeweight[j],
+              site->totalsplits,
+              mate->totalsplits,
+              site->rightmatesupport[j],
+              site->strand);
+
+        } else if(site->rightedgeweight[j] >= minsplitno &&
+            site->rightedgeweight[j] >=
+            (int)(((double)site->totalsplits)*totalsplitfraction)) {
+
+          fprintf(transdev, "%s\t%d\t%d\t%s:%d:%d:%d\t%d\t%c\n",
+              site->chromname, site->median,
+              mate->median,
+              "strandsplice",
+              site->rightedgeweight[j],
+              site->totalsplits,
+              mate->totalsplits,
+              site->rightmatesupport[j],
+              site->strand);
+        }
+
+      } else {
+
+        type = site->righttranssplits[j] ? "diststrandsplice" : "distsplice";
+
+        if(site->rightedgeweight[j] >= minsplitno &&
+            site->rightedgeweight[j] >=
+            (int)(((double)site->totalsplits)*totalsplitfraction) &&
+            site->rightedgeweight[j] >=
+            (int)(((double)mate->totalsplits)*totalsplitfraction)) {
+
+          fprintf(transdev, "%s\t%d\t%d\t%s:%s:%d:%d:%d:%d\t%d\t%c\n",
+              site->chromname, site->median,
+              site->median,
+              type,
+              mate->chromname,
+              mate->median,
+              site->rightedgeweight[j],
+              site->totalsplits,
+              mate->totalsplits,
+              site->rightmatesupport[j],
+              site->strand);
+        }
+      }
+    }
+
+    /* edges to sites that come earlier in the map; only distant ones are
+     * printed here */
+    for(j=0; j < site->noofleftsites; j++) {
+      mate = &sm->map[site->leftsiteidx[j]];
+
+      if(mate->cidx != site->cidx
+          || site->median - mate->median >= 200000) {
+
+        type = site->lefttranssplits[j] ? "diststrandsplice" : "distsplice";
+
+        if(site->leftedgeweight[j] >= minsplitno &&
+            site->leftedgeweight[j] >=
+            (int)(((double)site->totalsplits)*totalsplitfraction) &&
+            site->leftedgeweight[j] >=
+            (int)(((double)mate->totalsplits)*totalsplitfraction)) {
+
+          fprintf(transdev, "%s\t%d\t%d\t%s:%s:%d:%d:%d:%d\t%d\t%c\n",
+              site->chromname, site->median, site->median,
+              type,
+              mate->chromname,
+              mate->median,
+              site->leftedgeweight[j],
+              site->totalsplits,
+              mate->totalsplits,
+              site->leftmatesupport[j],
+              site->strand);
+        }
+      }
+    }
+
+  }
+}
+
+
+/*----------------------- bl_matchfileSpliceAnnotation -----------------------
+ *
+ * @brief  intersect splicesites with exon annotation. For every splice
+ *         site the track is searched for the leftmost item matching the
+ *         window [median-interval, median+interval]; from there all items
+ *         starting at or before the end of the window are visited. An
+ *         item on the same strand whose start or end is within 'interval'
+ *         of the median receives the attributes alignedsplicesite_type,
+ *         alignedsplicesite_pos, alignedsplicesite_dist and one
+ *         distsplicesite_pos attribute per right and left edge. Sites
+ *         without any edge add nothing.
+ *         The scan stops at the end of the track, so neither a window
+ *         reaching past the last item nor an out-of-range start index
+ *         from the search reads outside track->items.
+ * @param  space handed to the allocation macros and bl_GFFAddAttribute
+ * @param  sm    the splice map
+ * @param  track annotation track whose items are annotated in place
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+bl_matchfileSpliceAnnotation (void *space, splicemap_t* sm, annotationtrack_t *track)
+{
+
+  Uint i, j, left, len;
+  int ldist, rdist;
+  annotationitem_t cur;
+  char *attr;
+  splicesite_t *site, *mate;
+
+  for(i=0; i < sm->noofsplicesites; i++) {
+    site = &sm->map[i];
+
+    /* search key: the window around the median, clipped at 0 */
+    cur.chromname = site->chromname;
+    cur.strand = site->strand;
+    cur.start = (site->median < sm->interval) ? 0 :
+      site->median - sm->interval;
+    cur.end = site->median + sm->interval;
+
+    left = binarySearch_left(track, track->noofitems, &cur,
+        bl_annotationitem_cmp_track, NULL);
+
+    /* the bound has to be tested before the item is read */
+    while(left < track->noofitems && track->items[left].start <= cur.end) {
+
+      if(site->noofleftsites || site->noofrightsites) {
+
+        ldist = (int)site->median - track->items[left].start;
+        rdist = (int)site->median - track->items[left].end;
+
+        if((abs(ldist) <= sm->interval || abs(rdist) <= sm->interval) &&
+            (site->strand == track->items[left].strand)) {
+
+          /* each attribute is sized with a dry run of snprintf */
+          len = snprintf(NULL, 0, "%s \"%c\"", "alignedsplicesite_type",
+              site->type);
+          attr = ALLOCMEMORY(space, NULL, char, len+1);
+          snprintf(attr, len+1, "%s \"%c\"", "alignedsplicesite_type",
+              site->type);
+          bl_GFFAddAttribute(space, &track->items[left], attr, len);
+          FREEMEMORY(space, attr);
+
+          len = snprintf(NULL, 0, "%s %d", "alignedsplicesite_pos", site->median);
+          attr = ALLOCMEMORY(space, NULL, char, len+1);
+          snprintf(attr, len+1, "%s %d", "alignedsplicesite_pos", site->median);
+          bl_GFFAddAttribute(space, &track->items[left], attr, len);
+          FREEMEMORY(space, attr);
+
+          /* the distance to the closer one of the two item borders */
+          len = snprintf(NULL, 0, "%s %d", "alignedsplicesite_dist",
+              (abs(ldist) < abs(rdist)) ? ldist : rdist);
+          attr = ALLOCMEMORY(space, NULL, char, len+1);
+          snprintf(attr, len+1, "%s %d", "alignedsplicesite_dist",
+              (abs(ldist) < abs(rdist)) ? ldist : rdist);
+          bl_GFFAddAttribute(space, &track->items[left], attr, len);
+          FREEMEMORY(space, attr);
+
+          for(j=0; j < site->noofrightsites; j++) {
+            mate = &sm->map[site->rightsiteidx[j]];
+
+            len = snprintf(NULL, 0, "%s \"%s\" %d %d", "distsplicesite_pos",
+                mate->chromname, mate->median, site->rightedgeweight[j]);
+            attr = ALLOCMEMORY(space, NULL, char, len+1);
+            snprintf(attr, len+1, "%s \"%s\" %d %d", "distsplicesite_pos",
+                mate->chromname, mate->median, site->rightedgeweight[j]);
+
+            bl_GFFAddAttribute(space, &track->items[left], attr, len);
+            FREEMEMORY(space, attr);
+          }
+
+          for(j=0; j < site->noofleftsites; j++) {
+            mate = &sm->map[site->leftsiteidx[j]];
+
+            len = snprintf(NULL, 0, "%s \"%s\" %d %d", "distsplicesite_pos",
+                mate->chromname, mate->median, site->leftedgeweight[j]);
+            attr = ALLOCMEMORY(space, NULL, char, len+1);
+            snprintf(attr, len+1, "%s \"%s\" %d %d", "distsplicesite_pos",
+                mate->chromname, mate->median, site->leftedgeweight[j]);
+
+            bl_GFFAddAttribute(space, &track->items[left], attr, len);
+            FREEMEMORY(space, attr);
+          }
+
+        }
+      }
+      left++;
+    }
+  }
+
+
+  return ;
+}
+
+/*------------------------------- printsplice --------------------------------
+ *
+ * @brief  print a splice map to 'dev'. One line per splice site with
+ *         chromosome index, median, number of splits, start and end,
+ *         followed by its left and right edges. For a site without any
+ *         edge the distant sites of its first cross section are printed
+ *         as "unconfirmed"; the temporary array used for that is released
+ *         again. The precision histogram and the donor and acceptor
+ *         character histograms (columns A, C, G, T) follow.
+ *         Side effect: the rows of sm->charhist and sm->charhistA are
+ *         released while they are printed.
+ * @param  space handed to the allocation macros
+ * @param  sm    the splice map
+ * @param  dev   output stream
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+printsplice (void *space, splicemap_t *sm, FILE *dev)
+{
+
+  Uint i, j, noofdistsites, check;
+  distsplitsites_t *distsites;
+
+  for(i=0; i < sm->noofsplicesites; i++) {
+    fprintf(dev, "%d\t%d\t%d\t%d\t%d\t",
+        sm->map[i].cidx, sm->map[i].median,
+        sm->map[i].totalsplits,
+        sm->map[i].start, sm->map[i].end
+        );
+
+    for(j=0; j < sm->map[i].noofleftsites; j++) {
+      fprintf(dev, "left:%d,%d,%d,(acceptor:%d, donor:%d)\t",
+          sm->map[sm->map[i].leftsiteidx[j]].cidx,
+          sm->map[sm->map[i].leftsiteidx[j]].median,
+          sm->map[i].leftedgeweight[j],
+          sm->map[i].leftedgeacceptor[j],
+          sm->map[i].leftedgedonor[j]);
+    }
+
+    for(j=0; j < sm->map[i].noofrightsites; j++) {
+      fprintf(dev, "right:%d,%d,%d,(acceptor:%d, donor:%d)\t",
+          sm->map[sm->map[i].rightsiteidx[j]].cidx,
+          sm->map[sm->map[i].rightsiteidx[j]].median,
+          sm->map[i].rightedgeweight[j],
+          sm->map[i].rightedgeacceptor[j],
+          sm->map[i].rightedgedonor[j]);
+    }
+
+    if(sm->map[i].noofrightsites == 0 && sm->map[i].noofleftsites == 0) {
+      distsites = bl_matchfileGetDistantSplitSites (space, *sm->map[i].cs, 0, 0,
+          'N', &noofdistsites, &check);
+
+      for(j=0; j < noofdistsites; j++) {
+        fprintf(dev, "unconfirmed:%d,%d,%d,(acceptor:%d, donor:%d)\t",
+            distsites[j].distcidx,
+            distsites[j].distpos,
+            distsites[j].noofsplits,
+            distsites[j].acceptor,
+            distsites[j].donor);
+      }
+
+      /* the array belongs to this function, release it to avoid a leak */
+      if(distsites) {
+        FREEMEMORY(space, distsites);
+      }
+    }
+
+    fprintf(dev, "\n");
+  }
+
+  fprintf(dev, "Precision Histogram\n");
+
+  for(i=0; i < sm->interval*2+1; i++) {
+    fprintf(dev, "%u\t%u\n", i, sm->histogram[i]);
+  }
+  fprintf(dev, "Char Histogram Donor\n");
+
+  for(i=0; i < sm->interval*2+1; i++) {
+    fprintf(dev, "%d\t%d\t%d\t%d\t%d\t%d\n", i, sm->chrcnt[i],
+        sm->charhist[i]['A'],
+        sm->charhist[i]['C'],
+        sm->charhist[i]['G'],
+        sm->charhist[i]['T']);
+    /* NOTE(review): the row is released here although
+     * bl_matchfileDestructSpliceMap releases the rows as well - confirm
+     * that FREEMEMORY resets the pointer */
+    FREEMEMORY(space, sm->charhist[i]);
+  }
+
+  fprintf(dev, "Char Histogram Acceptor\n");
+  for(i=0; i < sm->interval*2+1; i++) {
+    fprintf(dev, "%d\t%d\t%d\t%d\t%d\t%d\n", i, sm->chrcnt[i],
+        sm->charhistA[i]['A'],
+        sm->charhistA[i]['C'],
+        sm->charhistA[i]['G'],
+        sm->charhistA[i]['T']);
+    FREEMEMORY(space, sm->charhistA[i]);
+  }
+
+  return ;
+}
+
+
+
diff --git a/segemehl/libs/splicesites.h b/segemehl/libs/splicesites.h
new file mode 100644
index 0000000..5a27d55
--- /dev/null
+++ b/segemehl/libs/splicesites.h
@@ -0,0 +1,34 @@
+#ifndef SPLICESITES_H
+#define SPLICESITES_H
+
+/*
+ *
+ * splicesites.h
+ *
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 06/15/2011 11:49:01 PM CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "basic-types.h"
+#include "matchfiles.h"
+#include "biofiles.h"
+
+/* Public interface of splicesites.c (split map / splice map handling). */
+void bl_matchfileInitSplitMap (void *space, splitmap_t *splitmap, annotationtrack_t *bed, matchfile_t **files, fasta_t*);
+Uint bl_matchfileSplit (void *space, Uint fidx, Uint cidx, Uint pos, matchfileCross_t *cs, char ref, matchfileindex_t *idx, unsigned char show, void *nfo);
+splicemap_t* bl_matchfileSpliceMap (void *space, splitmap_t *map, Uint interval, Uint minsplitno);
+void printsplits (void *space, char *chr, Uint pos, matchfileCross_t *cs, splitmap_t *map);
+void printsplice (void *space, splicemap_t *sm, FILE *out);
+void printsplicebed (void *space, splicemap_t *sm, Uint minsplitno, char *title, FILE *out, FILE *transout);
+void bl_matchfileDestructSpliceMap (void *space, splicemap_t *sm);
+void bl_matchfileDestructSplitMap (void *space, splitmap_t *splitmap);
+/* Returns an array of distant split sites; the number of entries is
+ * written to *noofsites (callers iterate the result up to that count and
+ * read distcidx, distpos, noofsplits, acceptor and donor of each entry).
+ * NOTE(review): ownership of the returned array is not visible from the
+ * header - confirm against the implementation. */
+distsplitsites_t* bl_matchfileGetDistantSplitSites (void *space, matchfileCross_t *cs, Uint pos,
+ Uint cidx, char type, Uint *noofsites, Uint *checkptr);
+void bl_matchfileSpliceAnnotation (void *space, splicemap_t* sm, annotationtrack_t *track);
+
+#endif
diff --git a/segemehl/libs/splines.c b/segemehl/libs/splines.c
new file mode 100644
index 0000000..bf918ef
--- /dev/null
+++ b/segemehl/libs/splines.c
@@ -0,0 +1,214 @@
+
+/*
+ * splines.c
+ *
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 06/18/14 14:06:03 CEST
+ *
+ */
+#include "mathematics.h"
+#include "sort.h"
+#include <float.h>
+#include <string.h>
+#include <limits.h>
+#include "708.h"
+#include <math.h>
+#include <complex.h>
+#include <gsl/gsl_statistics.h>
+#include <gsl/gsl_blas.h>
+#include <gsl/gsl_multifit.h>
+#include <gsl/gsl_rng.h>
+#include <gsl/gsl_randist.h>
+#include <gsl/gsl_bspline.h>
+#include "splines.h"
+
+
+/*---------------------------- destructSplineFit -----------------------------
+ *
+ * @brief  release the four GSL objects a splinefit_t points to (bspline
+ *         workspace, covariance matrix, coefficient vector, basis vector);
+ *         the splinefit_t structure itself is not freed
+ * @param  fit the fit whose members are released
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+destructSplineFit (splinefit_t* fit)
+{
+  gsl_bspline_free(fit->bw);
+  gsl_matrix_free(fit->cov);
+  gsl_vector_free(fit->c);
+  gsl_vector_free(fit->B);
+}
+
+/*------------------------------ histsplineglm -------------------------------
+ *
+ * @brief  fits a cubic B-spline basis (7 coefficients, 5 breakpoints) to a
+ *         histogram; the coefficients are estimated by irls (poisson glm)
+ * @param  myx   x values of the histogram, myn entries; myx[0] and
+ *               myx[myn-1] become the outer breakpoints
+ * @param  myy   y values of the histogram, myn entries
+ * @param  myn   number of entries in myx and myy, must be >= 1
+ * @param  q     inner breakpoints; q[0], q[1] and q[2] are read, so q
+ *               must hold at least three values
+ * @param  myqn  number of entries in q (not evaluated)
+ * @param  fit   receives basis vector B, coefficients c, covariance matrix
+ *               cov and the bspline workspace bw; the caller releases them
+ *               with destructSplineFit
+ * @return always NULL, the result is handed back through fit
+ * @author Steve Hoffmann
+ *
+ */
+
+gsl_vector*
+histsplineglm(double *myx, double *myy, int myn, double* q, Uint myqn,
+    splinefit_t *fit) {
+  const size_t K = 4;
+  const size_t ncoeffs = 7;
+  const size_t nbreak = ncoeffs-2;
+  size_t n = myn;
+  size_t i, j;
+  gsl_bspline_workspace *bw;
+  gsl_vector *B;
+  gsl_vector *c, *w;
+  gsl_vector *y;
+  gsl_vector *breaks;
+  gsl_matrix *X, *cov;
+  gsl_multifit_linear_workspace *mw;
+
+  /* alloc a cubic bspline workspace (k = 4) */
+  bw = gsl_bspline_alloc(K, nbreak);
+  B = gsl_vector_alloc(ncoeffs);
+
+  y = gsl_vector_alloc(n);
+  X = gsl_matrix_alloc(n, ncoeffs);
+  c = gsl_vector_alloc(ncoeffs);
+  breaks = gsl_vector_alloc(nbreak);
+  w = gsl_vector_alloc(n);
+  cov = gsl_matrix_alloc(ncoeffs, ncoeffs);
+  mw = gsl_multifit_linear_alloc(n, ncoeffs);
+
+  /* breakpoints: data range at both ends, q[0..2] in between */
+  gsl_vector_set(breaks, 0, myx[0]);
+  gsl_vector_set(breaks, 1, q[0]);
+  gsl_vector_set(breaks, 2, q[1]);
+  gsl_vector_set(breaks, 3, q[2]);
+  gsl_vector_set(breaks, 4, myx[n-1]);
+
+  gsl_bspline_knots(breaks, bw);
+
+  for (i = 0; i < n; ++i) {
+    gsl_vector_set(y, i, myy[i]);
+    /* initial weights; irls overwrites them in every iteration */
+    gsl_vector_set(w, i, 1.0);
+
+    /* compute B_j(x_i) for all j and store them as row i of X */
+    gsl_bspline_eval(myx[i], B, bw);
+    for (j = 0; j < ncoeffs; ++j) {
+      gsl_matrix_set(X, i, j, gsl_vector_get(B, j));
+    }
+  }
+
+  /* do the fit */
+  irls(X, w, y, c, cov, mw, n, ncoeffs);
+
+  /* these four objects outlive the call and are owned by fit */
+  fit->B = B;
+  fit->c = c;
+  fit->cov = cov;
+  fit->bw = bw;
+
+  gsl_vector_free(breaks);
+  gsl_vector_free(y);
+  gsl_matrix_free(X);
+  gsl_vector_free(w);
+  gsl_multifit_linear_free(mw);
+
+  return NULL;
+
+}
+
+
+/*----------------------------------- irls -----------------------------------
+ *
+ * @brief  iteratively reweighted least squares fit of a poisson family
+ *         function with log link
+ * @param  X       n x ncoeffs design matrix; column 0 is overwritten
+ *                 with 1.0 in every row
+ * @param  w       weight vector of length n; overwritten with the current
+ *                 mean estimate in every iteration
+ * @param  y       response vector of length n
+ * @param  c       receives the ncoeffs fitted coefficients
+ * @param  cov     receives the ncoeffs x ncoeffs covariance matrix
+ * @param  mw      workspace allocated for an n x ncoeffs problem
+ * @param  n       number of observations
+ * @param  ncoeffs number of coefficients
+ * @return c
+ * @author Steve Hoffmann
+ *
+ * The iteration ends when chisq changes by less than 1e-12, when chisq
+ * is no longer finite, or after maxiter refits.
+ *
+ * TODO: implement SVD and matrix methods to remove gsl lib dependency
+ *
+ */
+
+gsl_vector *
+irls(gsl_matrix *X, gsl_vector *w, gsl_vector *y, gsl_vector *c, gsl_matrix *cov,
+    gsl_multifit_linear_workspace *mw, Uint n, Uint ncoeffs) {
+
+  /* upper bound on refits so that the loop is finite in any case */
+  const Uint maxiter = 1000;
+  Uint i, iter;
+  size_t rank;
+  double oldchisq =-1, chisq, yi, mui, zi;
+  gsl_vector *z = gsl_vector_calloc(n);
+  gsl_vector *mu = gsl_vector_calloc(n);
+  /* coefficient column and linear predictor, reused by all iterations */
+  gsl_matrix *B = gsl_matrix_calloc(ncoeffs, 1);
+  gsl_matrix *C = gsl_matrix_calloc(n, 1);
+
+  for(i=0; i < n; i++) {
+    /* init mu with y; zeros are replaced because log(mu) is taken below */
+    yi = gsl_vector_get(y, i);
+    gsl_vector_set(mu, i, (yi == 0) ? 0.01 : yi);
+    /* first column of the design matrix is forced to the constant 1.0 */
+    gsl_matrix_set(X, i, 0, 1.0);
+  }
+
+  //IRLS iteration
+  for(iter=0; iter < maxiter; iter++) {
+
+    for(i=0; i < n; i++) {
+      //calculate Z - log transformed response value
+      yi = gsl_vector_get(y, i);
+      mui = gsl_vector_get(mu, i);
+      zi = log(mui) + (1.0/mui)*(yi-mui);
+      gsl_vector_set(z, i, zi);
+      //calculate W
+      gsl_vector_set(w, i, mui);
+    }
+
+    gsl_multifit_wlinear_svd(X, w, z, GSL_DBL_EPSILON, &rank, c, cov, &chisq, mw);
+
+    /* a NaN/inf chisq can never satisfy the convergence test: give up */
+    if(!isfinite(chisq) || fabs(chisq - oldchisq) < 1e-12) {
+      break;
+    }
+
+    oldchisq = chisq;
+
+    //compute new mu = exp(X*c)
+    for(i=0; i < ncoeffs; i++) {
+      gsl_matrix_set(B, i, 0, gsl_vector_get(c, i));
+    }
+
+    gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1, X, B, 0, C); //C = XB
+
+    for(i=0; i < n; i++) {
+      mui = exp(gsl_matrix_get(C, i, 0));
+      gsl_vector_set(mu, i, mui);
+    }
+  }
+
+  gsl_matrix_free(B);
+  gsl_matrix_free(C);
+  gsl_vector_free(mu);
+  gsl_vector_free(z);
+  return c;
+}
+
diff --git a/segemehl/libs/splines.h b/segemehl/libs/splines.h
new file mode 100644
index 0000000..ca0f704
--- /dev/null
+++ b/segemehl/libs/splines.h
@@ -0,0 +1,51 @@
+#ifndef SPLINES_H
+#define SPLINES_H
+
+/*
+ *
+ * splines.h
+ *
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 06/18/14 14:11:41 CEST
+ *
+ */
+
+#include <gsl/gsl_statistics.h>
+#include <gsl/gsl_blas.h>
+#include <gsl/gsl_multifit.h>
+#include <gsl/gsl_bspline.h>
+
+/*typedef struct
+{
+ size_t k;
+ gsl_matrix *A;
+ gsl_matrix *dB;
+} gsl_bspline_deriv_workspace;
+
+int
+gsl_bspline_deriv_eval(const double x,
+ const size_t nderiv,
+ gsl_matrix *dB,
+ gsl_bspline_workspace *w,
+ gsl_bspline_deriv_workspace *dw);
+
+gsl_bspline_deriv_workspace * gsl_bspline_deriv_alloc(const size_t k);
+void gsl_bspline_deriv_free(gsl_bspline_deriv_workspace *w);
+*/
+
+/* Result of histsplineglm; the members are released by destructSplineFit. */
+typedef struct{
+ gsl_matrix *cov; /* covariance matrix of the coefficients (ncoeffs x ncoeffs) */
+ gsl_vector *B; /* basis function values, scratch vector of length ncoeffs */
+ gsl_vector *c; /* fitted coefficients */
+ gsl_bspline_workspace *bw; /* bspline workspace holding the knots */
+} splinefit_t;
+
+/* poisson-family IRLS fit; writes coefficients to c and returns c */
+gsl_vector * irls(gsl_matrix *X, gsl_vector *w, gsl_vector *y, gsl_vector *c, gsl_matrix *cov,
+ gsl_multifit_linear_workspace *mw, Uint n, Uint ncoeffs);
+/* NOTE(review): not defined in splines.c - presumably provided elsewhere */
+double *quantiles(double *x, Uint n, double* q, Uint k);
+/* fills fit from histogram (myx, myy); always returns NULL */
+gsl_vector *histsplineglm(double *myx, double *myy, int myn, double* q, Uint myqn,
+ splinefit_t *fit) ;
+void destructSplineFit (splinefit_t* fit);
+#endif
diff --git a/segemehl/libs/stack.c b/segemehl/libs/stack.c
new file mode 100644
index 0000000..c363017
--- /dev/null
+++ b/segemehl/libs/stack.c
@@ -0,0 +1,164 @@
+/*
+ This file is part of gdub.
+ (C) 2006 Steve Hoffmann
+
+ gdub is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2, or (at your
+ option) any later version.
+
+ gdub is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with gdub; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+
+ */
+
+/**
+ * stack.c
+ * implementation of a simple stack for int
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Tue Oct 28 10:42:34 CET 2008
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 73 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-10-29 10:03:28 +0100 (Wed, 29 Oct 2008) $
+ * Id: $Id: stack.c 73 2008-10-29 09:03:28Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/stack.c $
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "debug.h"
+#include "stack.h"
+
+/*----------------------------- bl_stackInit -----------------------------------
+ *
+ * @brief  set up an empty stack with room for allocelem elements; the
+ *         program is terminated with exit(-1) if allocelem is not
+ *         positive or the memory cannot be allocated
+ * @param  stack     stack to initialize
+ * @param  allocelem number of elements to reserve, must be > 0
+ * @author Steve Hoffmann
+ *
+ */
+void bl_stackInit(Stack *stack, Lint allocelem) {
+  Stackelement *mem;
+
+  if (allocelem <= 0){
+    DBG("stack.c: Attempt to initialize a stack of size %d. Exit forced.\n",
+        allocelem);
+    exit(-1);
+  }
+
+  mem = (Stackelement *) malloc(sizeof(Stackelement) * allocelem);
+  stack->stackspace = mem;
+  if (mem == NULL){
+    DBG("stack.c: Memory allocation failed. Exit forced.\n", NULL);
+    exit(-1);
+  }
+
+  /* top is the index of the topmost element, -1 marks the empty stack */
+  stack->top = -1;
+  stack->allocelem = allocelem;
+}
+
+/*--------------------------- bl_stackDestruct ---------------------------------
+ *
+ * @brief  free the element storage and reset the stack to the empty
+ *         state (top = -1, no storage), so that bl_stackIsEmpty holds
+ *         and Pop/Top return STACK_NULL_TYPE instead of touching freed
+ *         memory; the Stack structure itself is not freed
+ * @param  stack stack to destruct
+ * @author Steve Hoffmann
+ *
+ */
+void bl_stackDestruct(Stack *stack) {
+  free(stack->stackspace);
+  stack->stackspace = NULL;
+  stack->top = -1;
+  stack->allocelem = 0;
+}
+
+/*---------------------------- bl_stackIsEmpty ---------------------------------
+ *
+ * @brief  tell whether the stack holds no element (top index below 0)
+ * @param  stack stack to inspect
+ * @return non-zero if the stack is empty, 0 otherwise
+ * @author Steve Hoffmann
+ *
+ */
+BOOL bl_stackIsEmpty(Stack *stack) {
+  return !(stack->top >= 0);
+}
+
+
+/*----------------------------- bl_stackPush -----------------------------------
+ *
+ * @brief  pushes an element on the top of the stack; the storage grows
+ *         by BASEINC elements when it is full, and the program is
+ *         terminated with exit(-1) if it cannot be enlarged
+ * @param  stack stack to push to
+ * @param  elem  element to store
+ * @author Steve Hoffmann
+ *
+ */
+void bl_stackPush(Stack *stack, Stackelement elem) {
+  Lint slot = stack->top + 1;
+
+  if(slot >= stack->allocelem) {
+    /* no free slot left: enlarge the storage by BASEINC elements */
+    stack->stackspace = (Stackelement *) realloc(stack->stackspace,
+        sizeof(Stackelement) * (stack->allocelem + BASEINC));
+
+    if (BASEINC <= 0 || stack->stackspace == NULL){
+      DBG("stack.c: Memory reallocation failed. Exit forced.\n", NULL);
+      exit(-1);
+    }
+    stack->allocelem += BASEINC;
+  }
+
+  stack->stackspace[slot] = elem;
+  stack->top = slot;
+}
+
+/*------------------------------ bl_stackPop -----------------------------------
+ *
+ * @brief  removes the top element from the stack and returns it
+ * @param  stack stack to pop from
+ * @return the former top element, or STACK_NULL_TYPE if the stack is
+ *         empty (not distinguishable from a stored STACK_NULL_TYPE)
+ * @author Steve Hoffmann
+ *
+ */
+Stackelement bl_stackPop(Stack *stack){
+  Stackelement elem;
+
+  if(bl_stackIsEmpty(stack)) {
+    return STACK_NULL_TYPE;
+  }
+
+  elem = stack->stackspace[stack->top];
+  stack->top--;
+  return elem;
+}
+
+/*------------------------------ bl_stackTop -----------------------------------
+ *
+ * @brief  returns the top element without removing it
+ * @param  stack stack to inspect
+ * @return the top element, or STACK_NULL_TYPE if the stack is empty
+ * @author Steve Hoffmann
+ *
+ */
+Stackelement bl_stackTop(Stack *stack){
+  if(bl_stackIsEmpty(stack)) {
+    return STACK_NULL_TYPE;
+  }
+
+  return stack->stackspace[stack->top];
+}
+
+/*------------------------------ bl_stackTopN --------------------------------
+ *
+ * @brief  returns the element n positions below the top without
+ *         removing it; n = 0 is the top, n = numofelems - 1 the bottom
+ * @param  stack stack to inspect
+ * @param  n     distance from the top
+ * @return the requested element, or STACK_NULL_TYPE if the stack is
+ *         empty or n is out of range
+ * @author Steve Hoffmann
+ *
+ */
+Stackelement bl_stackTopN(Stack *stack, Lint n){
+  if(n < 0 || n > stack->top || bl_stackIsEmpty(stack)) {
+    return STACK_NULL_TYPE;
+  }
+
+  return stack->stackspace[stack->top - n];
+}
+
+/*------------------------------ bl_stackSize ----------------------------------
+ *
+ * @brief  returns the number of elements currently on the stack
+ * @param  stack stack to inspect
+ * @return top index + 1, i.e. 0 for an empty stack
+ * @author Steve Hoffmann
+ *
+ */
+Lint bl_stackSize(Stack *stack) {
+  Lint count = stack->top;
+
+  count += 1;
+  return count;
+}
+
diff --git a/segemehl/libs/stack.h b/segemehl/libs/stack.h
new file mode 100644
index 0000000..cdadd4a
--- /dev/null
+++ b/segemehl/libs/stack.h
@@ -0,0 +1,71 @@
+/*
+ This file is part of gdub.
+ (C) 2006 Steve Hoffmann
+
+ gdub is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2, or (at your
+ option) any later version.
+
+ gdub is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with gdub; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+
+ */
+
+/**
+ * stack.h
+ * implementation of a simple stack for int
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Tue Oct 28 10:42:34 CET 2008
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 73 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-10-29 10:03:28 +0100 (Wed, 29 Oct 2008) $
+ * Id: $Id: stack.h 73 2008-10-29 09:03:28Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/stack.h $
+ */
+
+#ifndef STACK_H
+#define STACK_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "basic-types.h"
+
+/* value returned by bl_stackPop/Top/TopN when no element is available */
+#define STACK_NULL_TYPE 0
+#define STACKINC 10000
+/* number of elements the storage grows by in bl_stackPush; may be
+ * predefined by the including file, defaults to STACKINC */
+#ifndef BASEINC
+#define BASEINC STACKINC
+#endif
+
+/* type of the elements stored on the stack */
+typedef Lint Stackelement;
+
+typedef struct{
+ Stackelement* stackspace; /* element storage, allocated by bl_stackInit */
+ Lint allocelem; /* number of elements stackspace has room for */
+ Lint top; /* index of the top element, -1 after bl_stackInit (empty) */
+} Stack;
+
+/* exits the program if allocelem <= 0 or the allocation fails */
+void bl_stackInit(Stack* stack, Lint allocelem);
+/* frees the element storage, not the Stack structure */
+void bl_stackDestruct(Stack *stack);
+BOOL bl_stackIsEmpty(Stack *stack);
+/* grows the storage by BASEINC elements when it is full */
+void bl_stackPush(Stack *stack, Stackelement elem);
+Stackelement bl_stackPop(Stack *stack);
+Stackelement bl_stackTop(Stack *stack);
+/* n = 0 addresses the top element */
+Stackelement bl_stackTopN(Stack *stack, Lint n);
+Lint bl_stackSize(Stack *stack);
+
+#endif
diff --git a/segemehl/libs/startsites.c b/segemehl/libs/startsites.c
new file mode 100644
index 0000000..81434e6
--- /dev/null
+++ b/segemehl/libs/startsites.c
@@ -0,0 +1,72 @@
+
+/*
+ * startsites.c
+ * finding start sites
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 07/01/2011 09:53:24 PM CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include <math.h>
+#include <time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "sort.h"
+#include "alignment.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "matfile.h"
+#include "bitVector.h"
+#include "info.h"
+#include "vtprogressbar.h"
+#include "fileio.h"
+#include "matchfilesfields.h"
+#include "matchfiles.h"
+#include "debug.h"
+#include "evalmatchfiles.h"
+#include "list.h"
+#include "biofiles.h"
+#include "startsites.h"
+
+
+
+/*-------------------------- bl_matchfileStartSites --------------------------
+ *
+ * @brief  counts the percentage of starts at a cross section: for cross
+ *         sections with len > 5 and starts < 255 the counter at index
+ *         starts*100/len (truncated) is incremented
+ * @param  cs  cross section providing len and starts
+ * @param  nfo array of Uint counters that is updated
+ * @return always 0; the remaining parameters are not used
+ * @author Steve Hoffmann
+ *
+ */
+
+Uint
+bl_matchfileStartSites(void *space, Uint fidx, Uint cidx, Uint pos,
+    matchfileCross_t *cs, char ref, matchfileSampleStats_t *stats,
+    unsigned char show, void *nfo) {
+
+  Uint *histogram = (Uint *) nfo;
+  int bin;
+
+  if(cs->len <= 5 || cs->starts >= 255) {
+    return 0;
+  }
+
+  bin = (int)((double)(cs->starts*100.0)/cs->len);
+  histogram[bin]++;
+
+  return 0;
+}
+
+/*
+ * prints the len member of the cross section cs, followed by a newline,
+ * to stdout; always returns 0, the remaining parameters are not used
+ */
+Uint
+bl_coverage(void *space, Uint fidx, Uint cidx, Uint pos,
+    matchfileCross_t *cs, char ref, matchfileindex_t *idx,
+    unsigned char show, void *nfo) {
+
+  printf("%d\n", cs->len);
+  return 0;
+}
+
diff --git a/segemehl/libs/startsites.h b/segemehl/libs/startsites.h
new file mode 100644
index 0000000..2d70726
--- /dev/null
+++ b/segemehl/libs/startsites.h
@@ -0,0 +1,47 @@
+
+/*
+ *
+ * startsites.h
+ * get the start sites
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 07/02/2011 01:00:18 AM CEST
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include <math.h>
+#include <time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "sort.h"
+#include "alignment.h"
+#include "stringutils.h"
+#include "basic-types.h"
+#include "mathematics.h"
+#include "matfile.h"
+#include "bitVector.h"
+#include "info.h"
+#include "vtprogressbar.h"
+#include "fileio.h"
+#include "matchfilesfields.h"
+#include "matchfiles.h"
+#include "debug.h"
+#include "evalmatchfiles.h"
+#include "list.h"
+#include "biofiles.h"
+
+/* increments the Uint counter nfo[starts*100/len] for cross sections with
+ * len > 5 and starts < 255; always returns 0 */
+Uint
+bl_matchfileStartSites(void *space, Uint fidx, Uint cidx, Uint pos, 
+ matchfileCross_t *cs, char ref, matchfileSampleStats_t *stats, 
+ unsigned char show, void *nfo);
+/* prints cs->len to stdout; always returns 0 */
+Uint
+bl_coverage(void *space, Uint fidx, Uint cidx, Uint pos, 
+ matchfileCross_t *cs, char ref, matchfileindex_t *idx, 
+ unsigned char show, void *nfo);
+
diff --git a/segemehl/libs/stringutils.c b/segemehl/libs/stringutils.c
new file mode 100644
index 0000000..05d2e0a
--- /dev/null
+++ b/segemehl/libs/stringutils.c
@@ -0,0 +1,446 @@
+/*
+ * stringutils.c
+ * functions to manipulate strings
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: stringutils.c 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/stringutils.c $
+ */
+
+ #include <stdlib.h>
+ #include <stdio.h>
+ #include <string.h>
+ #include <assert.h>
+ #include "stringutils.h"
+ #include "basic-types.h"
+
+
+/*
+ * prints the characters seq[start] .. seq[end] (both inclusive) to stdout;
+ * asserts that end does not exceed strlen(seq)
+ */
+void
+printSubseq(char* seq, Uint start, Uint end) {
+  int idx = start;
+
+  assert(end <= strlen(seq));
+
+  while(idx <= end) {
+    putchar(seq[idx]);
+    idx++;
+  }
+}
+
+/*
+ * reverses the first len characters of str in place and returns str
+ */
+char* strrev(char *str, Uint len){
+  int lo;
+  int hi = len-1;
+  char keep;
+
+  for (lo = 0; lo < hi; lo++, hi--) {
+    keep = str[lo];
+    str[lo] = str[hi];
+    str[hi] = keep;
+  }
+
+  return str;
+}
+
+/*
+ * reentrant tokenizer: returns the next token of s that is delimited by
+ * any character of delim, or NULL if there is none. The token is
+ * terminated in place (s is modified). Pass s == NULL to continue with
+ * the string remembered in *saveptr; *saveptr is set to NULL once the
+ * end of the string has been reached.
+ */
+char* strtok_bl(char *s, char *delim, char **saveptr) {
+  char *token;
+  char *stop;
+
+  /* continue where the previous call stopped */
+  if (s == NULL) {
+    s = *saveptr;
+  }
+  if (s == NULL) {
+    *saveptr = NULL;
+    return NULL;
+  }
+
+  /* skip delims at begin */
+  token = s + strspn(s, delim);
+  if (*token == '\0') {
+    *saveptr = NULL;
+    return NULL;
+  }
+
+  /* locate next delim or end */
+  stop = token + strcspn(token, delim);
+  if (*stop == '\0') {
+    *saveptr = NULL;
+  } else {
+    *stop = '\0';
+    *saveptr = stop + 1;
+  }
+
+  return token;
+}
+
+
+/*
+ * splits the first len characters of toTokens at any character of delim
+ * and returns the tokens as a newly allocated stringset (each token is a
+ * NUL-terminated copy). toTokens itself is not modified. An empty set is
+ * returned if toTokens is NULL or len is 0. The program is terminated
+ * with exit(-1) if the working buffer cannot be allocated.
+ */
+stringset_t* tokensToStringset(void *space, char* delim, char* toTokens,
+    Uint len){
+  Uint toklen;
+  char* token;
+  char* saveptr = NULL;
+  char* buffer;
+  string_t *entry;
+  stringset_t *set;
+
+  set = ALLOCMEMORY(space, NULL, stringset_t, 1);
+  set->noofstrings = 0;
+  set->strings = NULL;
+
+  if (toTokens == NULL || len == 0)
+    return set;
+
+  /* tokenize a private copy because strtok_bl writes terminators */
+  buffer = ALLOCMEMORY(space, NULL, char, len+1);
+  if (buffer == NULL) {
+    fprintf(stderr, "copy tokenstring %.*s to buffer failed.\n",
+        (int) len, toTokens);
+    exit(-1);
+  }
+
+  /* only len characters belong to the input, the terminator is added */
+  memcpy(buffer, toTokens, len);
+  buffer[len] = 0;
+
+  token = strtok_bl(buffer, delim, &saveptr);
+
+  while(token != NULL) {
+
+    toklen = strlen(token);
+    set->noofstrings++;
+    set->strings = ALLOCMEMORY(space, set->strings, string_t, set->noofstrings);
+
+    entry = &set->strings[set->noofstrings-1];
+    entry->str = ALLOCMEMORY(space, NULL, char, toklen+1);
+    memcpy(entry->str, token, toklen);
+    entry->str[toklen] = '\0';
+    entry->len = toklen;
+
+    token = strtok_bl(NULL, delim, &saveptr);
+  }
+
+  FREEMEMORY(space, buffer);
+  return set;
+}
+
+
+/*
+ * returns a newly allocated, NUL-terminated copy of toTrim without its
+ * leading and trailing quote characters (see ISQUOTE); quotes inside the
+ * remaining text are kept. *len is the number of characters to inspect
+ * on entry and the length of the result on return. If nothing but
+ * quotes is found, NULL is returned and *len is set to 0.
+ */
+char* strtrimquote (void *spacetab, char *toTrim, Uint *len) {
+  Uint first = 0;
+  Uint last;
+  Uint keep;
+  char *trimmed;
+
+  /* first character that is not a quote */
+  while(first < *len && ISQUOTE(toTrim[first])) {
+    first++;
+  }
+
+  if(first == *len) {
+    *len = 0;
+    return NULL;
+  }
+
+  /* last character that is not a quote; stops at first at the latest */
+  last = *len - 1;
+  while(ISQUOTE(toTrim[last])) {
+    last--;
+  }
+
+  keep = last - first + 1;
+  trimmed = ALLOCMEMORY(spacetab, NULL, char, keep+1);
+  memmove(trimmed, &toTrim[first], keep);
+  trimmed[keep] = '\0';
+
+  *len = keep;
+  return trimmed;
+}
+
+
+/*
+ * returns a newly allocated, NUL-terminated copy of toTrim without its
+ * leading and trailing whitespace (see ISWHITESPACE); whitespace inside
+ * the remaining text is kept. *len is the number of characters to
+ * inspect on entry and the length of the result on return. If nothing
+ * but whitespace is found, NULL is returned and *len is set to 0.
+ */
+char* strtrim(void *spacetab, char* toTrim, Uint *len) {
+  Uint first = 0;
+  Uint last;
+  Uint keep;
+  char *trimmed;
+
+  /* first character that is not whitespace */
+  while(first < *len && ISWHITESPACE(toTrim[first])) {
+    first++;
+  }
+
+  if(first == *len) {
+    *len = 0;
+    return NULL;
+  }
+
+  /* last character that is not whitespace; stops at first at the latest */
+  last = *len - 1;
+  while(ISWHITESPACE(toTrim[last])) {
+    last--;
+  }
+
+  keep = last - first + 1;
+  trimmed = ALLOCMEMORY(spacetab, NULL, char, keep+1);
+  memmove(trimmed, &toTrim[first], keep);
+  trimmed[keep] = '\0';
+
+  *len = keep;
+  return trimmed;
+}
+
+/*
+ * returns a newly allocated, NUL-terminated copy of str that ends before
+ * the first whitespace character (or after *len characters if there is
+ * none); *len is set to the length of the copy
+ */
+char* strclip(void *spacetab, char* str, Uint *len){
+  Uint cut = 0;
+  char *clipped;
+
+  /* clip string after first whitespace */
+  while (cut < *len && !ISWHITESPACE(str[cut])) {
+    cut++;
+  }
+
+  clipped = ALLOCMEMORY(spacetab, NULL, char, cut + 1);
+  memmove(clipped, str, cut);
+  clipped[cut] = '\0';
+
+  *len = cut;
+  return clipped;
+}
+
+/*
+ * replaces every occurrence of orig among the first len characters of
+ * seq by replace (in place)
+ */
+void strconvert(char *seq, Uint len, char orig, char replace){
+  while (len > 0) {
+    if (*seq == orig) {
+      *seq = replace;
+    }
+    seq++;
+    len--;
+  }
+}
+
+/*
+ * appends the first lenB characters of strB to strA and terminates the
+ * result. strA is handed to ALLOCMEMORY for resizing, so the returned
+ * pointer replaces strA (presumably realloc semantics - confirm against
+ * memory.h). If strB is NULL or empty, strA is returned unchanged; if
+ * strA is NULL or empty, strB itself (not a copy) is returned.
+ */
+char* concat(void *spacetab, char* strA, char* strB, int lenA, int lenB) {
+  int total;
+
+  if(lenB == 0 || strB == NULL) {
+    return strA;
+  }
+  if(lenA == 0 || strA == NULL) {
+    return strB;
+  }
+
+  total = lenA + lenB;
+  strA = ALLOCMEMORY(spacetab, strA, char, total+1);
+  memmove(strA + lenA, strB, lenB);
+  strA[total] = '\0';
+
+  return strA;
+}
+
+/*
+ * like concat, but puts the character delim between strA and strB. If
+ * strB is NULL or empty, strA is returned unchanged; if strA is NULL or
+ * empty, strB itself (not a copy) is returned - no delimiter is added in
+ * either case.
+ */
+char* concatdelim(void *spacetab, char* strA, char* strB, int lenA, int lenB, char delim) {
+  int total;
+
+  if(lenB == 0 || strB == NULL) {
+    return strA;
+  }
+  if(lenA == 0 || strA == NULL) {
+    return strB;
+  }
+
+  /* both parts plus the delimiter */
+  total = lenA + lenB + 1;
+  strA = ALLOCMEMORY(spacetab, strA, char, total+1);
+  strA[lenA] = delim;
+  memmove(strA + lenA + 1, strB, lenB);
+  strA[total] = '\0';
+
+  return strA;
+}
+
+
+/*
+ * frees all strings of the set, the string array and the set itself;
+ * s must not be NULL
+ */
+void destructStringset(void *space, stringset_t *s) {
+  Uint idx = 0;
+
+  if (s->strings != NULL) {
+    while(idx < s->noofstrings) {
+      if(s->strings[idx].str != NULL) {
+        FREEMEMORY(space, s->strings[idx].str);
+      }
+      idx++;
+    }
+
+    FREEMEMORY(space, s->strings);
+  }
+
+  FREEMEMORY(space, s);
+}
+
+
+/*
+ * allocates and returns an empty stringset (no strings, count 0)
+ */
+stringset_t *initStringset(void *space) {
+  stringset_t *fresh = ALLOCMEMORY(space, NULL, stringset_t, 1);
+
+  fresh->strings = NULL;
+  fresh->noofstrings = 0;
+
+  return fresh;
+}
+
+/*
+ * appends string (with length len) to the set; the pointer is stored as
+ * is, the characters are not copied
+ */
+void addString(void *space, stringset_t *set, char *string, Uint len) {
+  Uint slot = set->noofstrings;
+
+  set->noofstrings = slot + 1;
+  set->strings = ALLOCMEMORY(space, set->strings, string_t, set->noofstrings);
+  set->strings[slot].len = len;
+  set->strings[slot].str = string;
+}
+
+
+
+/*-------------------------------- strreverse --------------------------------
+ *    
+ * reverses the first len characters of s in place and returns s
+ * 
+ */
+
+char*
+strreverse(char *s, Uint len)
+{
+  char *lo;
+  char *hi;
+  char keep;
+
+  if (len < 2) {
+    return s;
+  }
+
+  /* swap from both ends towards the middle */
+  for(lo = s, hi = s + (len-1); lo < hi; lo++, hi--) {
+    keep = *lo;
+    *lo = *hi;
+    *hi = keep;
+  }
+
+  return s;
+}
+
+/* -------------------------------- my_itoa  ---------------------------------
+ *    
+ * just in case that there's no itoa: writes value in the given radix
+ * (2..16, digits 0-9a-f) to buffer and returns buffer. Negative values
+ * get a leading '-'. An unsupported radix yields the empty string.
+ *
+ * buffer must have room for all digits, the optional sign and the
+ * terminating NUL.
+ * 
+ */
+
+char*
+my_itoa (int value, char *buffer, Uint radix)
+{
+  const char* base ="0123456789abcdef";
+  unsigned int rest;
+  int i=0;
+
+  /* radix 0/1 cannot terminate, radix > 16 has no digit characters */
+  if (radix < 2 || radix > 16) {
+    buffer[0]='\0';
+    return buffer;
+  }
+
+  /* work on the magnitude as unsigned, so that INT_MIN is covered too */
+  rest = (value < 0) ? 0u - (unsigned int) value : (unsigned int) value;
+
+  /* digits are produced least significant first; runs once for 0 */
+  do {
+    buffer[i++] = base[rest % radix];
+    rest /= radix;
+  } while (rest > 0);
+
+  if (value < 0) {
+    buffer[i++] = '-';
+  }
+  buffer[i] ='\0';
+
+  buffer = strreverse(buffer, i);
+  return buffer;
+}
+
+
+
+/*-------------------------------- attachext ---------------------------------
+ *    
+ * returns a newly allocated string made of the first l characters of str
+ * followed by at most m characters of ext (e.g. a filename and its
+ * extension); neither input is modified
+ * 
+ */
+
+char *
+attachext (void *space, char *str, Uint l, char *ext, Uint m)
+{
+  char *joined = ALLOCMEMORY(space, NULL, char, l+m+1);
+
+  /* strncpy does not terminate when str has l or more characters */
+  strncpy(joined, str, l);
+  joined[l]='\0';
+
+  strncat(joined, ext, m);
+  return joined;
+}
+/*-------------------------------- attachpath --------------------------------
+ *    
+ * returns a newly allocated string made of the first m characters of
+ * path, followed by at most l characters of str and at most n characters
+ * of ext (path, filename and extension - or any other three strings);
+ * none of the inputs is modified
+ * 
+ */
+
+char*
+attachpath (void *space, char *str, Uint l, char *path, Uint m, 
+    char *ext, Uint n)
+{
+  char *joined = ALLOCMEMORY(space, NULL, char, (l+m+n+1));
+
+  /* strncpy does not terminate when path has m or more characters */
+  strncpy(joined, path, m);
+  joined[m] = '\0';
+
+  strncat(joined, str, l);
+  strncat(joined, ext, n);
+  return joined;
+}
+
+/*
+ * compares two md5 digests of 16 bytes each; returns 0 if they are equal
+ * and a non-zero value otherwise. All 16 bytes are compared, a zero byte
+ * inside a digest does not end the comparison.
+ * NOTE(review): assumes both buffers hold full 16 byte binary digests -
+ * confirm that no caller passes a shorter NUL-terminated string.
+ */
+int checkmd5(unsigned char *a, unsigned char *b) {
+  return memcmp(a, b, 16);
+}
+
+/*
+ * writes one line per string of the set to dev: index, the string in
+ * single quotes and its stored length
+ */
+void
+fprintStringset(FILE *dev, stringset_t *set) {
+  Uint i;
+
+  for(i=0; i < set->noofstrings; i++) {
+    /* index and length are Uint, hence %u */
+    fprintf(dev, "%u:'%s' (len:%u)\n", i, 
+        set->strings[i].str, set->strings[i].len);
+  }
+
+}
+
+
+/*
+ * allocates a string holding the single character chr, stores it in
+ * *str and returns it; the caller frees it with free(). If the
+ * allocation fails, *str is set to NULL and NULL is returned.
+ */
+char *
+sprintchar (char **str, char chr) {
+  char *tmp;
+
+  /* two zeroed bytes: the character and its terminator */
+  tmp = calloc(2, 1);
+  if (tmp != NULL) {
+    tmp[0] = chr;
+  }
+
+  *str = tmp;
+  return *str;
+}
+
+/*
+ * allocates a string of MAX_INT_LENGTH+1 bytes holding the decimal
+ * representation of n, stores it in *str and returns it; the caller
+ * frees it with free(). If the allocation fails, *str is set to NULL and
+ * NULL is returned.
+ */
+char *
+sprintint (char **str, int n) {
+  char *tmp;
+
+  tmp = calloc(MAX_INT_LENGTH+1, 1);
+  if (tmp != NULL) {
+    /* bounded by the size of the allocation */
+    snprintf(tmp, MAX_INT_LENGTH+1, "%d", n);
+  }
+
+  *str = tmp;
+  return *str;
+}
+
+/*
+ * allocates a string of MAX_INT_LENGTH+1 bytes holding the decimal
+ * representation of the unsigned value n, stores it in *str and returns
+ * it; the caller frees it with free(). If the allocation fails, *str is
+ * set to NULL and NULL is returned.
+ */
+char *
+sprintUint (char **str, Uint n) {
+  char *tmp;
+
+  tmp = calloc(MAX_INT_LENGTH+1, 1);
+  if (tmp != NULL) {
+    /* bounded by the size of the allocation */
+    snprintf(tmp, MAX_INT_LENGTH+1, "%u", n);
+  }
+
+  *str = tmp;
+  return *str;
+}
+
+/*
+ * allocates a NUL-terminated copy of the first len characters of src,
+ * stores it in *str and returns it; the caller frees it with free(). If
+ * the allocation fails, *str is set to NULL and NULL is returned.
+ */
+char *
+sprintstr (char **str, char *src, Uint len) {
+  char *tmp;
+
+  /* zeroed allocation provides the terminator */
+  tmp = calloc(len+1, 1);
+  if (tmp != NULL) {
+    memmove(tmp, src, len);
+  }
+
+  *str = tmp;
+  return *str;
+}
+
+/*
+ * allocates a string holding flt printed with four decimals ("%.4f"),
+ * stores it in *str and returns it; the caller frees it with free(). If
+ * formatting or the allocation fails, *str is set to NULL and NULL is
+ * returned.
+ */
+char *
+sprintflt (char **str, double flt) {
+  int size;
+  char *tmp = NULL;
+
+  /* first pass only measures; snprintf reports errors as negative */
+  size = snprintf(NULL, 0, "%.4f", flt);
+  if (size >= 0) {
+    tmp = calloc((size_t) size + 2, 1);
+  }
+  if (tmp != NULL) {
+    snprintf(tmp, (size_t) size + 1, "%.4f", flt);
+  }
+
+  *str = tmp;
+  return *str;
+}
+
+
diff --git a/segemehl/libs/stringutils.h b/segemehl/libs/stringutils.h
new file mode 100644
index 0000000..b831524
--- /dev/null
+++ b/segemehl/libs/stringutils.h
@@ -0,0 +1,72 @@
+#ifndef STRINGUTILS_H
+#define STRINGUTILS_H
+
+#include "basic-types.h"
+
+#ifndef ALLOCMEMORY
+ #include "memory.h"
+#endif
+
+#define TAB '\t' /*0x09*/
+#define LF '\n' /*0x0A*/
+#define VT '\v'
+#define FF '\f'
+#define CR '\r' /*0x0D*/
+#define SP ' '
+#define DQT '\"'
+#define SQT '\''
+
+
+#define COPYSTR(M,D,S,L) D=ALLOCMEMORY(M,D,char,L+1); \
+ strncpy(D,S,L);\
+ D[L]='\0'
+
+#define INDENT(s,x,c) {int p; for(p=0;p<x;p++) fprintf(s,"%c",c);}
+#define ISWHITESPACE(C) (C == SP || C == TAB || C == LF \
+ || C == VT || C == CR || C == FF )
+#define ISQUOTE(C) (C== DQT || C==SQT)
+#define SETSTR(X,I) (X)->strings[(I)].str
+#define SETSTRLEN(X,I) (X)->strings[(I)].len
+
+#define APPENDCHAR(S, X, L, Y) (X)=ALLOCMEMORY(S, X, char, L+1);\
+ X[L-1]=Y;\
+ X[L]=0
+
+/* a character string together with its length */
+typedef struct{
+
+ char* str; /* the characters; freed by destructStringset if not NULL */
+ Uint len; /* length recorded for str (without terminator where the set
+ was built by tokensToStringset) */
+
+} string_t;
+
+/* a growable array of strings */
+typedef struct{
+
+ string_t* strings; /* array of noofstrings entries, NULL while empty */
+ Uint noofstrings; /* number of entries in strings */
+
+} stringset_t;
+
+/* sprint*: allocate a new string, store it in *str and return it */
+char * sprintflt (char **str, double flt);
+char * sprintstr (char **str, char *src, Uint len);
+char * sprintUint (char **str, Uint n);
+char * sprintint (char **str, int n);
+char * sprintchar (char **str, char chr);
+/* reentrant tokenizer, modifies its input */
+char * strtok_bl (char *, char *, char **);
+stringset_t *tokensToStringset(void *, char *, char *, Uint);
+stringset_t *initStringset(void *);
+char* strrev(char *str, Uint len);
+/* strtrim, strtrimquote, strclip: return a new string, update the length */
+char* strtrim (void *, char *, Uint *);
+char* strtrimquote (void *, char *, Uint *);
+char* strclip (void *, char *, Uint *);
+void strconvert(char *, Uint, char, char);
+void addString(void *, stringset_t *, char *, Uint);
+void destructStringset(void *, stringset_t *);
+char* concat(void *spacetab, char* strA, char* strB, int lenA, int lenB);
+char* concatdelim(void *spacetab, char* strA, char* strB, int lenA, int lenB, char delim);
+char* strreverse(char*, Uint);
+char* my_itoa(int, char*, Uint);
+char * attachext (void *, char *, Uint, char *, Uint);
+/* defined in stringutils.c; declared here so that callers see the char*
+ * return type instead of an implicit declaration */
+char * attachpath (void *, char *, Uint, char *, Uint, char *, Uint);
+void printSubseq(char *seq, Uint start, Uint end);
+int checkmd5(unsigned char *a, unsigned char *b);
+void fprintStringset(FILE *dev, stringset_t *set);
+#endif
+
diff --git a/segemehl/libs/sufarray/charsequence.c b/segemehl/libs/sufarray/charsequence.c
new file mode 100644
index 0000000..1a00bbe
--- /dev/null
+++ b/segemehl/libs/sufarray/charsequence.c
@@ -0,0 +1,459 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include "memory.h"
+#include "stringutils.h"
+#include "charsequence.h"
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 87 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-11-20 11:24:26 +0100 (Thu, 20 Nov 2008) $
+ *
+ * Id: $Id: charsequence.c 87 2008-11-20 10:24:26Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/sufarray/charsequence.c $
+ */
+
+/* ----------------------------- printSequence -------------------------------
+ *
+ * prints a beautiful CharSequence to a buffer
+ *
+ * One stringset entry is added per sequence position; entry i is the text
+ * starting at s->sequence[i] with the length reported by strlen. The output
+ * buffer then receives a '>' line with s->url, a '>' line with
+ * s->alphabetname and the entries right-aligned in columns of equal width.
+ * Every row starts with a newline, the index of its first entry padded to
+ * five characters, and a tab.
+ *
+ * space: handle passed on to ALLOCMEMORY/FREEMEMORY and the stringset calls
+ * s:     sequence to print
+ * cols:  line width from which the number of entries per row is derived
+ * returns the NUL-terminated buffer; it is not freed here
+ *
+ * NOTE(review): the buffer size uses s->descrlen although s->urllen bytes
+ * are copied -- confirm this cannot overflow when urllen > descrlen.
+ * NOTE(review): c is 0 when cols < width+1, which makes i%c and
+ * noofstrings/c divide by zero -- confirm callers pass a large enough cols.
+ * NOTE(review): buf2 holds 5 chars; row indices with five or more digits
+ * presumably do not fit (depends on my_itoa) -- TODO confirm.
+ */
+
+char *
+printSequence (void *space, CharSequence *s, Uint cols)
+{
+ Uint i, c, k, pos=0, l=0, width=0;
+ char *buf, *buf2;
+ stringset_t *entries;
+
+ entries=initStringset(space);
+ for (i=0; i < s->length; i++)
+ {
+ /*buf = ALLOCMEMORY(space, NULL, char, 32);
+ buf = my_itoa(s->sequence[i], buf, 10);*/
+ addString(space, entries, &s->sequence[i], strlen(&s->sequence[i]));
+ if(SETSTRLEN(entries, i) > width)
+ width = SETSTRLEN(entries, i); /* track the longest entry */
+ }
+
+ /*add spacer*/
+ width++;
+ c = cols / (width+1); /* entries per output row */
+ l = (s->descrlen+2)+(s->namelen+2)+(s->length*(width+1))+((entries->noofstrings/c)*(5+1+1));
+
+ buf = ALLOCMEMORY(space, NULL, char, l);
+ memset(&buf[pos++], '>', 1);
+ memmove(&buf[pos], s->url, s->urllen);
+ pos+=s->urllen;
+ memset(&buf[pos++], '\n', 1);
+ memset(&buf[pos++], '>', 1);
+ memmove(&buf[pos], s->alphabetname, s->namelen);
+ pos+=s->namelen;
+ memset(&buf[pos++], '\n', 1);
+ for (i=0; i < entries->noofstrings; i++) {
+ if((i%c) == 0) { /* start of a new row: newline, padded index, tab */
+ memset(&buf[pos++], '\n', 1);
+ buf2 = ALLOCMEMORY(space, NULL, char, 5);
+ buf2 = my_itoa(i, buf2, 10);
+ memset(&buf[pos], ' ', 5-strlen(buf2));
+ pos += 5-strlen(buf2);
+ memmove(&buf[pos], buf2, strlen(buf2));
+ pos += strlen(buf2);
+ memset(&buf[pos++], '\t', 1);
+ FREEMEMORY(space, buf2);
+ }
+ k = (width-SETSTRLEN(entries,i)); /* left padding for right alignment */
+ memset(&buf[pos], ' ', k);
+ pos += k;
+
+ memmove(&buf[pos], SETSTR(entries, i), SETSTRLEN(entries, i));
+ pos += SETSTRLEN(entries, i);
+ }
+
+ buf[pos]='\0';
+
+ destructStringset(space, entries);
+ return buf;
+}
+
+/* ----------------------------- printAlignment -------------------------------
+ *
+ * prints a beautiful alignment to a buffer
+ *
+ * Two stringsets are built, one entry per position of a and of b (entry i is
+ * the text starting at sequence[i], measured with strlen). The positions of
+ * a are then written row by row into bufa while bufb receives the aligned
+ * entry of b or a '-' when there is none. Whenever a row is complete both
+ * row buffers are appended to the result buffer, followed by an empty line.
+ *
+ * align is read from the back in pairs: align[d-2] is the 1-based position
+ * in a and align[d-1] the 1-based position in b of the next aligned pair; d
+ * starts at size and drops by two after each pair that was written.
+ *
+ * space: handle passed on to ALLOCMEMORY/FREEMEMORY and the stringset calls
+ * align: array of size entries holding the position pairs
+ * a, b:  the two sequences
+ * cols:  line width from which the number of entries per row is derived
+ * returns the NUL-terminated result buffer; it is not freed here
+ *
+ * NOTE(review): the size of bufa uses a->descrlen although a->urllen bytes
+ * are copied -- confirm this cannot overflow when urllen > descrlen.
+ * NOTE(review): c is 0 when cols < width+1, which makes i%c and the
+ * noofstrings/c terms divide by zero -- confirm callers pass a large cols.
+ * NOTE(review): nobuf holds 5 chars; labels with five or more digits
+ * presumably do not fit (depends on my_itoa) -- TODO confirm.
+ * NOTE(review): align[1] is read when d <= 1, i.e. also when size < 2.
+ */
+
+char *
+printAlignment (void *space, int *align, Uint size,
+ CharSequence *a, CharSequence *b, Uint cols)
+{
+ Uint i, c, k, pos1=0, pos2=0, pos3=0, l=0, width=0, m=0, d=size;
+ char *buf, *bufa, *bufb, *nobuf;
+ stringset_t *first, *second;
+
+ first=initStringset(space);
+ for (i=0; i < a->length; i++)
+ {
+ /*buf = ALLOCMEMORY(space, NULL, char, 32);
+ buf = my_itoa(a->sequence[i], buf, 10);*/
+ addString(space, first, &a->sequence[i], strlen(&a->sequence[i]));
+ if(SETSTRLEN(first, i) > width)
+ width = SETSTRLEN(first, i);
+ }
+
+
+ second = initStringset(space);
+ for (i=0; i < b->length; i++)
+ {
+ /*buf = ALLOCMEMORY(space, NULL, char, 32);
+ buf = my_itoa(b->sequence[i], buf, 10);*/
+ addString(space, second, &b->sequence[i], strlen(&b->sequence[i]));
+ if(SETSTRLEN(second, i) > width)
+ width = SETSTRLEN(second, i);
+ }
+
+
+ /*add spacer*/
+ width++;
+ c = cols / (width+1); /* entries per output row */
+ l = (a->descrlen+2)+(a->namelen+2)
+ +(a->length*(width+1)) +((first->noofstrings/c)*(5+1+1));
+ m= (b->length*(width+1)) +((second->noofstrings/c)*(5+1+1));
+
+ bufa = ALLOCMEMORY(space, NULL, char, l);
+ bufb = ALLOCMEMORY(space, NULL, char, m);
+ buf = ALLOCMEMORY(space, NULL, char, (l+m)*2);
+
+ memset(bufa, 0, l);
+ memset(bufb, 0, m);
+ memset(buf, 0, (l+m)*2);
+
+ memset(&bufa[pos1++], '>', 1);
+ memmove(&bufa[pos1], a->url, a->urllen);
+ pos1+=a->urllen;
+ memset(&bufa[pos1++], '\n', 1);
+ memset(&bufa[pos1++], '>', 1);
+ memmove(&bufa[pos1], a->alphabetname, a->namelen);
+ pos1+=a->namelen;
+ memset(&bufa[pos1++], '\n', 1);
+
+ for (i=0; i < first->noofstrings; i++) {
+ if((i%c) == 0) { /* row complete: flush both row buffers, start new row */
+ memset(&bufa[pos1++], '\n', 1);
+ memset(&bufb[pos2++], '\n', 1);
+
+ memmove(&buf[pos3], bufa, pos1);
+ pos3+=pos1;
+ memmove(&buf[pos3], bufb, pos2);
+ pos3+=pos2;
+ memset(&buf[pos3++], '\n', 1);
+
+ pos1 =0;
+ pos2 =0;
+
+ nobuf = ALLOCMEMORY(space, NULL, char, 5);
+ nobuf = my_itoa(i, nobuf, 10); /* row label for a: index of first entry */
+ memset(&bufa[pos1], ' ', 5-strlen(nobuf));
+ pos1 += 5-strlen(nobuf);
+ memmove(&bufa[pos1], nobuf, strlen(nobuf));
+ pos1 += strlen(nobuf);
+
+ if (d > 1)
+ nobuf = my_itoa(align[d-1], nobuf, 10); /* row label for b */
+ else
+ nobuf = my_itoa(align[1], nobuf, 10);
+
+ memset(&bufb[pos2], ' ', 5-strlen(nobuf));
+ pos2 += 5-strlen(nobuf);
+ memmove(&bufb[pos2], nobuf, strlen(nobuf));
+ pos2 += strlen(nobuf);
+
+ memset(&bufa[pos1++], '\t', 1);
+ memset(&bufb[pos2++], '\t', 1);
+
+ FREEMEMORY(space, nobuf);
+ }
+
+ k = (width-SETSTRLEN(first,i)); /* left padding for right alignment */
+ memset(&bufa[pos1], ' ', k);
+ pos1 += k;
+ memmove(&bufa[pos1], SETSTR(first, i), SETSTRLEN(first, i));
+ pos1 += SETSTRLEN(first, i);
+
+ if (d > 1 && align[d-2]-1 == i) { /* position i of a has a partner in b */
+ k = (width-SETSTRLEN(second, align[d-1]-1));
+ memset(&bufb[pos2], ' ', k);
+ pos2 += k;
+ memmove(&bufb[pos2], SETSTR(second, align[d-1]-1),
+ SETSTRLEN(second, align[d-1]-1));
+ pos2 += SETSTRLEN(second, align[d-1]-1);
+ d-=2;
+ } else { /* no partner: pad and write a gap symbol */
+ k = width-1;
+ memset(&bufb[pos2], ' ', k);
+ pos2 += k;
+ memset(&bufb[pos2], '-', 1);
+ pos2++;
+ }
+ }
+
+ memset(&bufa[pos1++], '\n', 1);
+ memset(&bufb[pos2++], '\n', 1);
+
+ memmove(&buf[pos3], bufa, pos1);
+ pos3+=pos1;
+ memmove(&buf[pos3], bufb, pos2);
+ pos3+=pos2;
+ memset(&buf[pos3++], '\n', 1);
+
+
+ buf[pos3]='\0';
+
+ destructStringset(space, first);
+ destructStringset(space, second);
+ FREEMEMORY(space, bufa);
+ FREEMEMORY(space, bufb);
+
+ return buf;
+}
+
+
+
+
+/* ------------------------------ dumpSequence -------------------------------
+ *
+ * Writes the heading "sequence:" to stdout, followed by one line holding
+ * all s->length characters of s->sequence separated by '-'.
+ *
+ */
+
+void
+dumpSequence (CharSequence *s)
+{
+    Uint pos = 0;
+
+    printf("sequence:\n");
+    while (pos < s->length) {
+        printf("%c", s->sequence[pos]);
+        /* separator between characters, none after the last one */
+        if (pos != (s->length-1)) {
+            printf("-");
+        }
+        pos++;
+    }
+    printf("\n");
+}
+
+/*------------------------------- loadSequence -------------------------------
+ *
+ * Loads a sequence from a file written by saveSequence.
+ *
+ * The file starts with a raw CharSequence struct, followed by the
+ * description (descrlen+1 bytes), the alphabet name (namelen+1 bytes), the
+ * url (urllen+1 bytes) and the sequence (length bytes). The info block that
+ * saveSequence may append is not read; info is set to NULL and noofinfo to 0.
+ *
+ * space:    handle passed on to ALLOCMEMORY
+ * filename: path of the file to read
+ * returns a newly allocated CharSequence; the program exits with
+ * EXIT_FAILURE if the file cannot be opened, is too short, or cannot be
+ * closed.
+ *
+ */
+
+CharSequence*
+loadSequence (void *space, char *filename)
+{
+    FILE *infile;
+    char *sequence;
+    CharSequence *s;
+    int failed;
+
+    /* the file holds a raw struct, so it is opened in binary mode */
+    infile = fopen( filename, "rb" );
+    if ( infile == NULL )
+    {
+        fprintf ( stderr, "couldn't open file '%s'; %s\n",
+            filename, strerror(errno) );
+        exit (EXIT_FAILURE);
+    }
+
+    s = initSequence(space);
+    if (fread(s, sizeof(CharSequence), 1, infile) != 1)
+    {
+        fprintf ( stderr, "couldn't read sequence header from file '%s'\n",
+            filename );
+        exit (EXIT_FAILURE);
+    }
+
+    /*
+     * Pointer members read from the file are addresses of the process that
+     * wrote it. Those that are not replaced below must not survive.
+     */
+    s->infolength = NULL;
+    s->map = NULL;
+    s->mapsize = 0;
+
+    s->description = ALLOCMEMORY(space, NULL, char, s->descrlen+1);
+    s->alphabetname = ALLOCMEMORY(space, NULL, char, s->namelen+1);
+    s->url = ALLOCMEMORY(space, NULL, char, s->urllen+1);
+    /* one extra byte so that the sequence is always NUL-terminated */
+    sequence = ALLOCMEMORY(space, NULL, char, s->length+1);
+
+    failed =
+        fread(s->description, sizeof(char), s->descrlen+1, infile) != s->descrlen+1
+        || fread(s->alphabetname, sizeof(char), s->namelen+1, infile) != s->namelen+1
+        || fread(s->url, sizeof(char), s->urllen+1, infile) != s->urllen+1
+        || fread(sequence, sizeof(char), s->length, infile) != s->length;
+    if (failed)
+    {
+        fprintf ( stderr, "couldn't read sequence data from file '%s'\n",
+            filename );
+        exit (EXIT_FAILURE);
+    }
+    sequence[s->length] = '\0';
+
+    s->sequence = sequence;
+    s->noofinfo = 0;
+    s->info = NULL;
+
+    if( fclose(infile) == EOF) /* close input file */
+    {
+        fprintf ( stderr, "couldn't close file '%s'; %s\n",
+            filename, strerror(errno) );
+        exit (EXIT_FAILURE);
+    }
+
+    return s;
+}
+
+/*------------------------------- saveSequence -------------------------------
+ *
+ * Saves a sequence to a file.
+ *
+ * Layout: the raw CharSequence struct, the description (descrlen+1 bytes),
+ * the alphabet name (namelen+1 bytes), the url (urllen+1 bytes), the
+ * sequence (length bytes) and, if s->info is not NULL, length items of
+ * sizeof(Uint) bytes taken from s->info.
+ *
+ * s:        sequence to write; description, alphabetname and url must not
+ *           be NULL
+ * filename: path of the file to create or overwrite
+ * The program exits with EXIT_FAILURE if the file cannot be opened, written
+ * completely, or closed.
+ *
+ * NOTE(review): info is declared char* but is written as length items of
+ * sizeof(Uint) bytes -- confirm that the info buffer is really that large.
+ *
+ */
+
+void
+saveSequence (CharSequence *s, char *filename)
+{
+    FILE *outfile;
+    int failed;
+
+    /* a raw struct is written, so the file is opened in binary mode */
+    outfile = fopen (filename, "wb");
+    if (outfile == NULL)
+    {
+        fprintf ( stderr, "couldn't open file '%s'; %s\n",
+            filename, strerror(errno) );
+        exit (EXIT_FAILURE);
+    }
+
+    failed =
+        fwrite(s, sizeof(CharSequence), 1, outfile) != 1
+        || fwrite(s->description, sizeof(char), s->descrlen+1, outfile) != s->descrlen+1
+        || fwrite(s->alphabetname, sizeof(char), s->namelen+1, outfile) != s->namelen+1
+        || fwrite(s->url, sizeof(char), s->urllen+1, outfile) != s->urllen+1
+        || fwrite(s->sequence, sizeof(char), s->length, outfile) != s->length;
+
+    /* info is NULL for sequences from initSequence/loadSequence; passing a
+     * NULL buffer to fwrite is undefined, so the block is skipped then */
+    if (!failed && s->info != NULL)
+    {
+        failed = fwrite(s->info, sizeof(Uint), s->length, outfile) != s->length;
+    }
+
+    if (failed)
+    {
+        fprintf ( stderr, "couldn't write to file '%s'; %s\n",
+            filename, strerror(errno) );
+        exit (EXIT_FAILURE);
+    }
+
+    if( fclose(outfile) == EOF )
+    {
+        fprintf ( stderr, "couldn't close file '%s'; %s\n",
+            filename, strerror(errno) );
+        exit (EXIT_FAILURE);
+    }
+    return ;
+}
+
+
+
+/*---------------------------- createSequenceHash ----------------------------
+ *
+ * Allocates an array of hashsize CharSequence pointers and clears it to
+ * zero bytes.
+ *
+ * space:    handle passed on to ALLOCMEMORY
+ * hashsize: number of slots
+ * returns the cleared table
+ *
+ */
+
+CharSequence**
+createSequenceHash (void *space, Uint hashsize)
+{
+    CharSequence **table = ALLOCMEMORY(space, NULL, CharSequence*, hashsize);
+
+    memset(table, 0, sizeof(CharSequence*)*hashsize);
+    return table;
+}
+
+
+
+/*----------------------------- findSequenceHash -----------------------------
+ *
+ * find a slot in the sequence hash
+ *
+ * NOTE: currently an empty stub -- it is defined without parameters and
+ * returns immediately without doing anything.
+ *
+ */
+
+void
+findSequenceHash ( )
+{
+ return ;
+}
+
+
+/*---------------------------- lookupSequenceHash ----------------------------
+ *
+ * lookup if a key is already present in a hash table
+ *
+ * NOTE: currently an empty stub -- it is defined without parameters and
+ * returns immediately without doing anything.
+ *
+ */
+
+void
+lookupSequenceHash ( )
+{
+ return ;
+}
+
+
+
+
+
+
+/*------------------------------ resetSequence -------------------------------
+ *
+ * @brief resets (initializes) a Sequence: clears the description, alphabet
+ *        name, sequence and info pointers and sets descrlen, namelen,
+ *        length and urllen to 0. Nothing is freed and no other member
+ *        (the url pointer included) is touched.
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+resetSequence (CharSequence* s)
+{
+    /* pointers */
+    s->description = NULL;
+    s->alphabetname = NULL;
+    s->sequence = NULL;
+    s->info = NULL;
+
+    /* lengths */
+    s->descrlen = 0;
+    s->namelen = 0;
+    s->length = 0;
+    s->urllen = 0;
+}
+
+/*------------------------------- initSequence -------------------------------
+ *
+ * Allocates a sequence struct and initializes every member: all pointers
+ * are NULL, all lengths and clipping values 0 and (with HASHING) quantity
+ * is 1.
+ *
+ * space: handle passed on to ALLOCMEMORY
+ * returns the new struct
+ *
+ */
+
+CharSequence*
+initSequence (void *space)
+{
+    CharSequence *s;
+
+    s=ALLOCMEMORY(space, NULL, CharSequence, 1);
+    s->description=NULL;
+    s->descrlen=0;
+    s->alphabetname=NULL;
+    s->namelen=0;
+    s->sequence=NULL;
+    s->info=NULL;
+    /* noofinfo and infolength used to be left uninitialized */
+    s->noofinfo=0;
+    s->infolength=NULL;
+    s->length=0;
+    s->urllen =0;
+    s->url = NULL;
+    s->map=NULL;
+    s->mapsize=0;
+    s->clip3[0] = 0;
+    s->clip3[1] = 0;
+    s->clip5[0] = 0;
+    s->clip5[1] = 0;
+#ifdef HASHING
+    s->quantity = 1;
+#endif
+
+    return s;
+}
+
+/*
+ * Releases a sequence: the sequence, alphabet name, url and info buffers
+ * are freed when they are not NULL, the description is freed without a
+ * check, and finally the struct itself is freed. The map and infolength
+ * members are not touched.
+ */
+void destructSequence(void *space, CharSequence *sequence) {
+    if (sequence->sequence != NULL) {
+        FREEMEMORY(space, sequence->sequence);
+    }
+
+    FREEMEMORY(space, sequence->description);
+
+    if (sequence->alphabetname != NULL) {
+        FREEMEMORY(space, sequence->alphabetname);
+    }
+    if (sequence->url != NULL) {
+        FREEMEMORY(space, sequence->url);
+    }
+    if (sequence->info != NULL) {
+        FREEMEMORY(space, sequence->info);
+    }
+
+    FREEMEMORY(space, sequence);
+}
+
diff --git a/segemehl/libs/sufarray/charsequence.h b/segemehl/libs/sufarray/charsequence.h
new file mode 100644
index 0000000..3f5f25b
--- /dev/null
+++ b/segemehl/libs/sufarray/charsequence.h
@@ -0,0 +1,320 @@
+ #ifndef INTSEQUENCE_H
+ #define INTSEQUENCE_H
+
+/*
+ * charsequence.h
+ * declaration of char sequence
+ * and functions working on it
+ *
+ * @author Steve Hoffmann
+ * @date Mon 27 Nov 2006
+ *
+ * SVN
+ * Revision of last commit: $Rev: 87 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-11-20 11:24:26 +0100 (Thu, 20 Nov 2008) $
+ *
+ * Id: $Id: charsequence.h 87 2008-11-20 10:24:26Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/sufarray/charsequence.h $
+ */
+
+ #include "basic-types.h"
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include "stringutils.h"
+
+ typedef struct {
+ Uint descrlen; /* length of description; stored with descrlen+1 bytes */
+ Uint namelen; /* length of alphabetname; stored with namelen+1 bytes */
+ Uint urllen; /* length of url; stored with urllen+1 bytes */
+ Uint noofinfo; /* set to 0 by loadSequence; presumably the number of info entries -- TODO confirm */
+ Uint *infolength; /* presumably the lengths of the info entries -- TODO confirm */
+
+ char *description; /*a description*/
+ char *alphabetname; /*the name of the corresponding alphabet*/
+ char *url; /*the name of the sequences url*/
+ char *sequence; /*the sequence itself*/
+ char *info; /*additional information*/
+ Uint length; /* number of characters in sequence */
+
+ Uint clip5[2]; /*clipping information*/
+ Uint clip3[2]; /* clipping information (3' counterpart of clip5, judging by the name) */
+
+ #ifdef HASHING
+ Uint quantity; /* quantity of equal sequences */
+ #endif /* HASHING */
+ Uint *map; /* initialized to NULL by initSequence; meaning not visible here */
+ Uint mapsize; /* initialized to 0 by initSequence; presumably the size of map */
+
+ } CharSequence;
+
+ void destructSequence(void *, CharSequence *);
+ CharSequence* initSequence(void *);
+ void resetSequence(CharSequence *);
+ char* printSequence(void *, CharSequence *, Uint);
+ void dumpSequence(CharSequence *s);
+ void saveSequence (CharSequence *s, char *filename);
+ CharSequence* loadSequence (void *space, char *filename);
+ char * printAlignment (void *, int *, Uint, CharSequence *, CharSequence *,
+ Uint);
+ CharSequence** createSequenceHash(void *, Uint);
+
+ /*
+  * Returns a newly allocated, NUL-terminated reverse complement of the
+  * first len characters of s. a/t and c/g are swapped, in lower and in
+  * upper case; every other character is copied unchanged.
+  */
+ static inline char* charDNAcomplement(void *space, char *s, Uint len) {
+     char *rc = ALLOCMEMORY(space, NULL, char, len+1);
+     Uint out;
+
+     /* out walks the result forwards while s is read backwards */
+     for (out = 0; out < len; out++) {
+         char base = s[len-1-out];
+
+         switch (base) {
+             case 'a': base = 't'; break;
+             case 't': base = 'a'; break;
+             case 'c': base = 'g'; break;
+             case 'g': base = 'c'; break;
+             case 'A': base = 'T'; break;
+             case 'T': base = 'A'; break;
+             case 'C': base = 'G'; break;
+             case 'G': base = 'C'; break;
+             default: break; /* n, N and everything else stay as they are */
+         }
+         rc[out] = base;
+     }
+     rc[len] = '\0';
+     return rc;
+ }
+
+
+ /*
+  * Returns the complement of a single nucleotide character: a/t and c/g
+  * are swapped, in lower and in upper case. Any other character is
+  * returned unchanged.
+  */
+ static inline char charComplementChar(char ch) {
+     switch (ch) {
+         case 'a': return 't';
+         case 't': return 'a';
+         case 'c': return 'g';
+         case 'g': return 'c';
+         case 'A': return 'T';
+         case 'T': return 'A';
+         case 'C': return 'G';
+         case 'G': return 'C';
+         default: break; /* n, N and everything else map to themselves */
+     }
+     return ch;
+ }
+
+/*
+ * Returns a newly allocated, NUL-terminated reverse complement of the first
+ * len characters of s using the IUPAC codes: a/t, c/g, r/y, k/m, b/v and d/h
+ * are swapped, in lower and in upper case. s, w, n and every other character
+ * are copied unchanged.
+ */
+static inline char* charIUPACcomplement(void *space, char *s, Uint len) {
+    char *rc = ALLOCMEMORY(space, NULL, char, len+1);
+    Uint out;
+
+    /* out walks the result forwards while s is read backwards */
+    for (out = 0; out < len; out++) {
+        char sym = s[len-1-out];
+
+        switch (sym) {
+            case 'a': sym = 't'; break;
+            case 't': sym = 'a'; break;
+            case 'c': sym = 'g'; break;
+            case 'g': sym = 'c'; break;
+            case 'r': sym = 'y'; break;
+            case 'y': sym = 'r'; break;
+            case 'k': sym = 'm'; break;
+            case 'm': sym = 'k'; break;
+            case 'b': sym = 'v'; break;
+            case 'd': sym = 'h'; break;
+            case 'h': sym = 'd'; break;
+            case 'v': sym = 'b'; break;
+            case 'A': sym = 'T'; break;
+            case 'T': sym = 'A'; break;
+            case 'C': sym = 'G'; break;
+            case 'G': sym = 'C'; break;
+            case 'R': sym = 'Y'; break;
+            case 'Y': sym = 'R'; break;
+            case 'K': sym = 'M'; break;
+            case 'M': sym = 'K'; break;
+            case 'B': sym = 'V'; break;
+            case 'D': sym = 'H'; break;
+            case 'H': sym = 'D'; break;
+            case 'V': sym = 'B'; break;
+            default: break; /* s, w, n (both cases) and the rest are kept */
+        }
+        rc[out] = sym;
+    }
+    rc[len] = '\0';
+    return rc;
+}
+
+/*
+ * Converts seq in place according to the bisulfite/PARCLIP mode.
+ *
+ * seed != 0 (alphabet is collapsed), only for bisulfite <= 4:
+ *   odd mode  : C --> T   (run 1)
+ *   even mode : G --> A   (run 2)
+ * seed == 0:
+ *   mode 1      : T --> Y   (bisulfite or PARCLIP with 4SG, run 1)
+ *   mode 2      : A --> R   (bisulfite or PARCLIP with 4SG, run 2)
+ *   mode 3 or 5 : C --> Y   (PARCLIP with 4SU, run 1)
+ *   mode 4 or 6 : G --> R   (PARCLIP with 4SU, run 2)
+ * Any other combination leaves seq untouched.
+ */
+static inline void bl_convertBisulfite(char *seq, Uint len, Uint bisulfite, Uint seed) {
+    if (seed) {
+        if (bisulfite > 4) {
+            return;
+        }
+        if (bisulfite % 2 == 1) {
+            strconvert(seq, len, 'C', 'T');
+        } else {
+            strconvert(seq, len, 'G', 'A');
+        }
+        return;
+    }
+
+    switch (bisulfite) {
+        case 1:
+            strconvert(seq, len, 'T', 'Y');
+            break;
+        case 2:
+            strconvert(seq, len, 'A', 'R');
+            break;
+        case 3:
+        case 5:
+            strconvert(seq, len, 'C', 'Y');
+            break;
+        case 4:
+        case 6:
+            strconvert(seq, len, 'G', 'R');
+            break;
+        default:
+            break;
+    }
+}
+
+/*
+ * Undoes the seed == 0 conversion of bl_convertBisulfite in place. The
+ * seed != 0 conversion collapses the alphabet and cannot be undone, which
+ * is why there is no seed parameter.
+ *
+ *   mode 1      : Y --> T   (bisulfite or PARCLIP with 4SG, run 1)
+ *   mode 2      : R --> A   (bisulfite or PARCLIP with 4SG, run 2)
+ *   mode 3 or 5 : Y --> C   (PARCLIP with 4SU, run 1)
+ *   mode 4 or 6 : R --> G   (PARCLIP with 4SU, run 2)
+ * Any other mode leaves seq untouched.
+ */
+static inline void bl_reconvertBisulfite(char *seq, Uint len, Uint bisulfite) {
+    switch (bisulfite) {
+        case 1:
+            strconvert(seq, len, 'Y', 'T');
+            break;
+        case 2:
+            strconvert(seq, len, 'R', 'A');
+            break;
+        case 3:
+        case 5:
+            strconvert(seq, len, 'Y', 'C');
+            break;
+        case 4:
+        case 6:
+            strconvert(seq, len, 'R', 'G');
+            break;
+        default:
+            break;
+    }
+}
+
+ #endif
diff --git a/segemehl/libs/sufarray/falphabet.c b/segemehl/libs/sufarray/falphabet.c
new file mode 100644
index 0000000..1df4697
--- /dev/null
+++ b/segemehl/libs/sufarray/falphabet.c
@@ -0,0 +1,52 @@
+
+/*
+ * falphabet.c
+ * implementations for a flexible alphabet
+ *
+ * SVN
+ * Revision of last commit: $Rev: 39 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-09-02 12:22:02 +0200 (Tue, 02 Sep 2008) $
+ *
+ * Id: $Id: falphabet.c 39 2008-09-02 10:22:02Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/trunk/libs/sufarray/falphabet.c $
+ */
+
+ #include <stdlib.h>
+ #include "basic-types.h"
+ #include "falphabet.h"
+ #include "sort.h"
+ #include "memory.h"
+ #include "debug.h"
+
+ /* Binary search for mapped in alphabet->mapdomain (mapsize entries);
+  * returns whatever binarySearch reports for it. */
+ Uint lookupChar(FAlphabet* alphabet, Uint mapped) {
+     return binarySearch(alphabet->mapdomain, alphabet->mapsize, &mapped,
+             cmp_int_bin, NULL);
+ }
+
+ /* Frees the mapdomain and characters arrays and then the alphabet struct
+  * itself; symbolmap is not released here. */
+ void destructAlphabet (void *space, FAlphabet *alphabet) {
+     FREEMEMORY(space, alphabet->mapdomain);
+     FREEMEMORY(space, alphabet->characters);
+
+     FREEMEMORY(space, alphabet);
+ }
+
+ /* Binary search for ch in alphabet->characters (domainsize entries);
+  * returns whatever binarySearch reports for it. */
+ Uint lookupMapping(FAlphabet* alphabet, Uint ch) {
+     return binarySearch(alphabet->characters, alphabet->domainsize, &ch,
+             cmp_int_bin, NULL);
+ }
+
+ /*
+  * Tells whether the characters a and b are mapped to the same symbol.
+  *
+  * Both characters are looked up with lookupMapping; the results index
+  * f->mapdomain. Returns 1 if the two mapdomain entries are equal and 0
+  * otherwise. If either lookup result is not below f->mapsize the program
+  * is terminated with exit(-1).
+  */
+ Uint cmp_map(FAlphabet* f, Uint a, Uint b){
+     Uint amap = lookupMapping(f, a);
+     Uint bmap = lookupMapping(f, b);
+
+     /* Both indices are checked (the old test looked at amap twice). The
+      * values are unsigned, so only the upper bound is meaningful. */
+     if (amap >= f->mapsize || bmap >= f->mapsize) {
+         DBG("Found char could not be mapped. Exit forced.\n",NULL);
+         exit(-1);
+     }
+
+     return f->mapdomain[amap] == f->mapdomain[bmap];
+ }
diff --git a/segemehl/libs/sufarray/falphabet.h b/segemehl/libs/sufarray/falphabet.h
new file mode 100644
index 0000000..4d497d8
--- /dev/null
+++ b/segemehl/libs/sufarray/falphabet.h
@@ -0,0 +1,40 @@
+ #ifndef FALPHABET_H
+ #define FALPHABET_H
+
+/*
+ * alphabet.h
+ * declarations for a flexible alphabet
+ *
+ * SVN
+ * Revision of last commit: $Rev: 40 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-09-03 10:42:48 +0200 (Wed, 03 Sep 2008) $
+ *
+ * Id: $Id: falphabet.h 40 2008-09-03 08:42:48Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/trunk/libs/sufarray/falphabet.h $
+ */
+
+ #include "basic-types.h"
+
+ typedef struct {
+
+ Uint *characters, /* domainsize entries, searched by lookupMapping */
+ *mapdomain; /* mapsize entries, searched by lookupChar, compared by cmp_map */
+
+ Uint domainsize, /* number of entries in characters */
+ mapsize, /* number of entries in mapdomain */
+
+ mappedwildcards, /* not used in falphabet.c; meaning not visible here */
+ undefsymbol, /* not used in falphabet.c; meaning not visible here */
+ *symbolmap; /* not used and not freed in falphabet.c */
+
+ } FAlphabet;
+
+
+ /*from mapdomain to character*/
+ Uint lookupChar(FAlphabet *, Uint);
+ void destructAlphabet(void *space, FAlphabet *);
+ Uint cmp_map(FAlphabet*, Uint, Uint);
+ Uint lookupMapping(FAlphabet*, Uint);
+
+#endif
diff --git a/segemehl/libs/sufarray/mmchar.c b/segemehl/libs/sufarray/mmchar.c
new file mode 100644
index 0000000..0a4d377
--- /dev/null
+++ b/segemehl/libs/sufarray/mmchar.c
@@ -0,0 +1,174 @@
+
+/*
+ * mmchar.c
+ * implementations for searches manber myers style
+ * on enhanced suffix arrays (type char)
+ *
+ * @author Steve Hoffmann
+ * @email shoffmann at zbh.uni-hamburg.de
+ * @date 12/22/06 19:13:27 CET
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: mmchar.c 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/sufarray/mmchar.c $
+ */
+
+ #include <stdlib.h>
+ #include <stdio.h>
+ #include "basic-types.h"
+ #include "memory.h"
+ #include "mathematics.h"
+ #include "sufarray.h"
+ #include "mmchar.h"
+
+/*---------------------------------- mmleft ----------------------------------
+ *
+ * part of the manber-myers pattern search algorithm
+ *
+ * Returns l if the comparison at index l gives a result <= 0, r+1 if the
+ * comparison at index r gives a result > 0, and otherwise bisects [l,r]
+ * down to the first index whose comparison result is <= 0. h is the offset
+ * at which the first two comparisons start; the bisection restarts each
+ * comparison at the smaller of the offsets reached at the two borders.
+ *
+ */
+
+int
+mmleft(Suffixarray *arr, char *pattern, Uint len, int h, int l, int r)
+{
+    PairSint lo, hi, probe;
+    int mid;
+
+    lo = mmcompare (arr, pattern, len, l, h);
+    if (lo.b <= 0) {
+        return l;
+    }
+
+    hi = mmcompare (arr, pattern, len, r, h);
+    if (hi.b > 0) {
+        return r+1;
+    }
+
+    for (; r > l+1; ) {
+        mid = (l+r)/2;
+        probe = mmcompare (arr, pattern, len, mid, MIN(lo.a, hi.a));
+        if (probe.b > 0) {
+            /* pattern is larger than the suffix at mid: go right */
+            lo.a = probe.a;
+            l = mid;
+        } else {
+            hi.a = probe.a;
+            r = mid;
+        }
+    }
+    return r;
+}
+
+/*--------------------------------- mmright ----------------------------------
+ *
+ * part of the manber-myers pattern search algorithm
+ *
+ * Returns -1 if the comparison at index l gives a result < 0, r if the
+ * comparison at index r gives a result >= 0, and otherwise bisects [l,r]
+ * down to the last index whose comparison result is >= 0. h is the offset
+ * at which the first two comparisons start; the bisection restarts each
+ * comparison at the smaller of the offsets reached at the two borders.
+ *
+ */
+
+int
+mmright (Suffixarray *arr, char *pattern, Uint len, int h, int l, int r)
+{
+    PairSint lo, hi, probe;
+    int mid;
+
+    lo = mmcompare(arr, pattern, len, l, h);
+    if (lo.b < 0) {
+        return -1;
+    }
+
+    hi = mmcompare(arr, pattern, len, r, h);
+    if (hi.b >= 0) {
+        return r;
+    }
+
+    for (; r > l+1; ) {
+        mid = (l+r)/2;
+        probe = mmcompare(arr, pattern, len, mid, MIN(lo.a, hi.a));
+        if (probe.b < 0) {
+            /* pattern is smaller than the suffix at mid: go left */
+            hi.a = probe.a;
+            r = mid;
+        } else {
+            lo.a = probe.a;
+            l = mid;
+        }
+    }
+    return l;
+}
+
+/*-------------------------------- mmcompare ---------------------------------
+ *
+ * manber 'n' myers compare for suffixarrays
+ *
+ * Compares pattern with the suffix at arr->suftab[idx], beginning at offset
+ * start and ending at the shorter of len and the remaining text. In the
+ * result, a is the offset at which the comparison stopped and b is
+ *   -1  pattern character is smaller, or the text ended before the pattern
+ *    1  pattern character is larger
+ *    0  the stop offset equals len
+ *
+ */
+
+PairSint
+mmcompare (Suffixarray *arr, char *pattern, Uint len, int idx, int start)
+{
+    char *suffix = &arr->seq->sequences[arr->suftab[idx]];
+    int t, margin;
+    PairSint res;
+
+    /*length = totallength - relativeposition*/
+    margin = MIN((len),(arr->seq->totallength-(suffix-arr->seq->sequences)));
+
+    for (t = start; t < margin; t++) {
+        if (pattern[t] == suffix[t]) {
+            continue;
+        }
+        res.a = t;
+        res.b = (pattern[t] < suffix[t]) ? -1 : 1;
+        return res;
+    }
+
+    res.a = t;
+    res.b = (t == len) ? 0 : -1;
+    return res;
+}
+
+
+/*--------------------------------- mmsearch ---------------------------------
+ *
+ * search manber'n'myers style
+ *
+ * Runs mmleft and mmright on [l,r] with start offset h. Returns the pair
+ * (left, right) if left <= right and the pair (1, 0) otherwise.
+ *
+ */
+
+PairSint
+mmsearch (Suffixarray *arr, char *pattern, Uint len, Uint h, Uint l, Uint r)
+{
+    PairSint ps;
+    int left = mmleft(arr, pattern, len, h, l, r);
+    int right = mmright(arr, pattern, len, h, l, r);
+
+    /* (1, 0) is the "no match" interval */
+    ps.a = 1;
+    ps.b = 0;
+
+    if (left <= right) {
+        ps.a = left;
+        ps.b = right;
+    }
+
+    return ps;
+}
+
+
diff --git a/segemehl/libs/sufarray/mmchar.h b/segemehl/libs/sufarray/mmchar.h
new file mode 100644
index 0000000..3e39c4a
--- /dev/null
+++ b/segemehl/libs/sufarray/mmchar.h
@@ -0,0 +1,31 @@
+#ifndef MMCHAR_H
+#define MMCHAR_H
+
+/*
+ *
+ * mmchar.h
+ * declaration for searches manber myers style
+ * on enhanced suffix arrays
+ *
+ * @author Steve Hoffmann, shoffmann at zbh.uni-hamburg.de
+ * @company Center for Bioinformatics, Hamburg
+ * @date 12/22/06 19:11:16 CET
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: mmchar.h 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/sufarray/mmchar.h $
+ */
+
+ #include "basic-types.h"
+ #include "sufarray.h"
+
+ int mmleft(Suffixarray *, char*, Uint, int, int, int);
+ int mmright(Suffixarray *, char*, Uint, int, int, int);
+ PairSint mmcompare(Suffixarray *, char*, Uint, int, int);
+ PairSint mmsearch(Suffixarray *, char*, Uint, Uint, Uint, Uint);
+
+#endif
diff --git a/segemehl/libs/sufarray/multicharseq.c b/segemehl/libs/sufarray/multicharseq.c
new file mode 100644
index 0000000..0b500e1
--- /dev/null
+++ b/segemehl/libs/sufarray/multicharseq.c
@@ -0,0 +1,393 @@
+
+/*
+ * multiseq.c
+ * some functions to handle multiseqs (type char)
+ *
+ * @author Steve Hoffmann
+ * @email shoffmann at zbh.uni-hamburg.de
+ * @date 12/15/06 11:42:53 CET
+ *
+ * SVN
+ * Revision of last commit: $Rev: 66 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-10-02 13:38:05 +0200 (Thu, 02 Oct 2008) $
+ *
+ * Id: $Id: multicharseq.c 66 2008-10-02 11:38:05Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/sufarray/multicharseq.c $
+ */
+
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include "basic-types.h"
+ #include "memory.h"
+ #include "debug.h"
+ #include "info.h"
+ #include "charsequence.h"
+ #include "vtprogressbar.h"
+ #include "multicharseq.h"
+ #include "alignment.h"
+ #include "mathematics.h"
+ #include "sort.h"
+
+/*---------------------------- concatCharSequences ----------------------------
+ *
+ * concatenates CharSequences using a given char delimiter
+ * and stores them in a MultiCharSeq container.
+ *
+ * The len sequences in s are copied into one buffer. Each sequence is
+ * followed by delim, the last one by sentinel and a terminating NUL.
+ * markpos[i] is the buffer position of the delimiter (or sentinel) after
+ * sequence i, and ref[i].ref points to s[i]. map collects every character
+ * value that occurs in the buffer, packed to the front of the array, and
+ * mapsize is their number.
+ *
+ * space:    handle passed on to ALLOCMEMORY
+ * s:        array of len sequences; the sequence data is copied, not freed
+ * delim:    separator written between two sequences
+ * sentinel: character written after the last sequence
+ * returns the new container; the program exits with -1 if a sequence
+ * contains a NUL character or the buffer cannot be grown.
+ *
+ */
+
+MultiCharSeq *
+concatCharSequences (void *space, CharSequence **s, Uint len,
+    char delim, char sentinel)
+{
+    char *buf=NULL;
+    char *map = NULL;
+    Uint i, j, k=0,
+         totallength=0,
+         *markpos;
+    MultiCharSeq *mseq;
+
+    mseq = ALLOCMEMORY(space, NULL, MultiCharSeq, 1);
+    markpos = ALLOCMEMORY(space, NULL, Uint, len);
+    mseq->ref = ALLOCMEMORY(space, NULL, SeqReference, len);
+    map = ALLOCMEMORY(space, NULL, char, 257);
+    memset(map, 0, 257);
+    mseq->delim = delim;
+
+    for(i=0; i < len; i++) {
+
+        mseq->ref[i].ref = s[i];
+
+        totallength += (s[i]->length+1);
+        buf = ALLOCMEMORY(space, buf, char, totallength+1);
+        if (buf==NULL) {
+            NFO("allocation of %d bytes failed: exiting\n", totallength);
+            exit(-1);
+        }
+
+        for(j=0; j < s[i]->length; j++) {
+            buf[k] = s[i]->sequence[j];
+            if (buf[k] == '\0'){
+                NFO("invalid character (NUL) in database sequences. Exit forced\n", NULL);
+                exit(-1);
+            }
+            /*
+             * Index through unsigned char: converting a negative char
+             * directly to Uint yields a huge value and writes far outside
+             * the 257 byte map.
+             */
+            map[(unsigned char)buf[k]]=buf[k];
+            k++;
+        }
+        /*separate sequences or finalize*/
+        if (i == (len-1)) {
+            buf[k] = sentinel;
+            map[(unsigned char)buf[k]]=buf[k];
+            markpos[i] = k;
+            k++;
+            buf[k]='\0';
+
+        } else {
+            buf[k] = delim;
+            map[(unsigned char)buf[k]]=buf[k];
+            markpos[i] = k;
+            k++;
+        }
+
+        /*FREEMEMORY(space, s[i]->sequence);*/
+    }
+    mseq->totallength = totallength;
+    mseq->numofsequences = len;
+    mseq->sequences = buf;
+    mseq->markpos = markpos;
+
+    /* pack the used characters to the front of map; i ends up as their count */
+    for(i=0; i < 256; i++) {
+        if(map[i]==0) {
+            j=i+1;
+            while(j<256 && map[j]==0) j++;
+            if (j < 256) {
+                map[i]=map[j];
+                map[j]=0;
+            } else {
+                break;
+            }
+        }
+    }
+
+    map = ALLOCMEMORY(space, map, char, i+1);
+    mseq->map = map;
+    mseq->mapsize = i;
+
+
+    return mseq;
+}
+
+
+
+/*----------------------------- destructMultiSeq -----------------------------
+ *
+ * Releases a MultiCharSeq: the concatenated buffer is freed without a
+ * check, markpos, map and ref are freed when they are not NULL, and finally
+ * the container itself is freed. The referenced CharSequences are left
+ * alone.
+ *
+ */
+
+void
+destructMultiCharSeq (void *space, MultiCharSeq *mseq)
+{
+    FREEMEMORY(space, mseq->sequences);
+
+    if (mseq->markpos != NULL) {
+        FREEMEMORY(space, mseq->markpos);
+    }
+    if (mseq->map != NULL) {
+        FREEMEMORY(space, mseq->map);
+    }
+    if (mseq->ref != NULL) {
+        FREEMEMORY(space, mseq->ref);
+    }
+
+    FREEMEMORY(space, mseq);
+}
+
+
+/*------------------------------- cmp_markpos --------------------------------
+ *
+ * compare function for getMultiSeqIndex
+ *
+ * data is an array of Uint positions and key points to the position that is
+ * looked for. For the probed index a the result is
+ *   0  index a is the answer: data[a] equals the key, or data[a] is larger
+ *      and a is 0 or data[a-1] is smaller than the key
+ *   1  data[a] is larger than the key and data[a-1] is not smaller
+ *   2  data[a] is smaller than the key
+ * info is not used.
+ *
+ */
+
+Uint
+cmp_markpos (Uint a, void *data, void *key, void *info)
+{
+    Uint *marks = (Uint*) data;
+    Uint wanted = *((Uint*) key);
+
+    if (marks[a] < wanted) {
+        return 2;
+    }
+    if (marks[a] == wanted) {
+        return 0;
+    }
+
+    /* marks[a] > wanted: a hit if the previous mark lies before wanted */
+    if (a == 0 || marks[a-1] < wanted) {
+        return 0;
+    }
+    return 1;
+}
+
+/*-------------------------- getMultiSeqIndex --------------------------
+ *
+ * returns index of a sequence in multiseq addressed by a pointer
+ *
+ * ptr has to point into mseq->sequences. With a single sequence the result
+ * is always 0. With fewer than MSEQ_BSEARCH_THRESHOLD sequences the index
+ * comes from binarySearch with cmp_markpos; otherwise markpos is scanned
+ * for the first entry larger than the offset of ptr (numofsequences is
+ * returned if there is none).
+ *
+ * NOTE(review): the binary search is used below the threshold and the
+ * linear scan above it, which looks inverted -- confirm the intent.
+ *
+ */
+
+Uint
+getMultiCharSeqIndex (MultiCharSeq *mseq, char *ptr)
+{
+    Uint offset, idx;
+
+    if (mseq->numofsequences == 1){
+        return 0;
+    }
+
+    offset = (ptr - mseq->sequences);
+
+    if (mseq->numofsequences >= MSEQ_BSEARCH_THRESHOLD) {
+        idx = 0;
+        while (idx < mseq->numofsequences && mseq->markpos[idx] <= offset) {
+            idx++;
+        }
+        return idx;
+    }
+
+    idx = binarySearch(mseq->markpos, mseq->numofsequences, &offset,
+            cmp_markpos, NULL);
+    return idx;
+}
+
+/*------------------------- getMultiCharSeqIdxBounds -------------------------
+ *
+ * @brief return start and end of idx: *start is the position after the
+ *        previous delimiter (0 for the first sequence) and *end is the
+ *        position of the delimiter that follows sequence idx
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+getMultiCharSeqIdxBounds(MultiCharSeq *mseq, Uint idx, Uint *start, Uint *end)
+{
+    if (idx > 0) {
+        *start = mseq->markpos[idx-1]+1;
+    } else {
+        *start = 0;
+    }
+    *end = mseq->markpos[idx];
+}
+
+
+/*---------------------------- nextMultiSeqDelim -----------------------------
+ *
+ * returns the position of the next delimiter in multiseq (ie. end of the
+ * sequence that ptr points into)
+ *
+ */
+
+Uint
+nextMultiSeqDelim (MultiCharSeq *mseq, char *ptr)
+{
+    Uint idx = getMultiCharSeqIndex(mseq,ptr);
+
+    return mseq->markpos[idx];
+}
+
+
+
+/*---------------------------- getMultiSeqRelPos -----------------------------
+ *
+ * returns the relative position of a pointer to multiseq
+ * with respect to the addressed sequence.
+ *
+ * ptr has to point into mseq->sequences. The result is the offset of ptr in
+ * the concatenated buffer minus the start of the sequence it falls into.
+ *
+ * The previous version subtracted the buffer of the original CharSequence
+ * from ptr. Those are two unrelated allocations, because
+ * concatCharSequences copies the sequence data.
+ *
+ */
+
+Uint
+getMultiCharSeqRelPos (MultiCharSeq *mseq, char *ptr)
+{
+    Uint idx, start, end;
+
+    idx = getMultiCharSeqIndex(mseq, ptr);
+    getMultiCharSeqIdxBounds(mseq, idx, &start, &end);
+
+    return (Uint)(ptr - mseq->sequences) - start;
+}
+
+
+/*------------------------------- dumpMultiSeq -------------------------------
+ *
+ * dumps a multiseq to the screen: the first totallength characters of the
+ * concatenated buffer, each followed by '-', and a final newline
+ *
+ */
+
+void
+dumpMultiCharSeq (MultiCharSeq *mseq)
+{
+    Uint pos = 0;
+
+    while (pos < mseq->totallength) {
+        printf("%c-", mseq->sequences[pos]);
+        pos++;
+    }
+
+    printf("\n");
+}
+
+
+/*----------------------------- getCharSequence ------------------------------
+ *
+ * @brief return the CharSequence at index idx, i.e. the pointer stored in
+ *        mseq->ref[idx]
+ * @author Steve Hoffmann
+ *
+ */
+
+CharSequence*
+getCharSequence (MultiCharSeq *mseq, Uint idx)
+{
+    SeqReference *entry = &mseq->ref[idx];
+
+    return (CharSequence*) entry->ref;
+}
+
+
+/*------------------------ initMultiCharSeqAlignment -------------------------
+ *
+ * @brief initialize an alignment in the multichar seq
+ *
+ * The reference window starts loff positions before pos, but not before the
+ * start of the sub-sequence that contains pos. It is len characters long
+ * unless it would run past the end of that sub-sequence, in which case it
+ * is cut there. a->al is allocated and initialized for the query against
+ * this window. a->qrystart, a->floff and a->pass are not set here.
+ *
+ * @return 1 on success, 0 if the window start lies behind the end of the
+ *         sub-sequence (a message is written to stderr and a->al is not
+ *         allocated)
+ *
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+initMultiCharSeqAlignment(
+    void *space, MultiCharSeqAlignment* a, MultiCharSeq *seq, Uint pos,
+    Uint loff, Uint len, unsigned char strand,
+    char *qrydesc, char *query, Uint qrylen)
+{
+    Uint sub_start,
+         sub_end;
+
+    a->subidx = getMultiCharSeqIndex(seq, &seq->sequences[pos]);
+    getMultiCharSeqIdxBounds(seq, a->subidx, &sub_start, &sub_end);
+    a->substart = sub_start;
+    a->subend = sub_end;
+
+    a->refstart = MAX(sub_start, (Lint)pos-loff);
+
+    if(a->refstart > sub_end) {
+        /* the message now names the bound that is actually tested */
+        fprintf(stderr, "refstart > subend: skipping MultiCharSeqAlignment\n");
+        return 0;
+    }
+
+    a->reflen = (sub_end > (Lint)a->refstart + len)?len:(sub_end - a->refstart)+1;
+    a->refseq = &seq->sequences[a->refstart];
+    a->refdesc = ((CharSequence*)seq->ref[a->subidx].ref)->description;
+    a->qrydesc = qrydesc;
+    a->query = query;
+    a->strand = strand;
+    a->qrylen = qrylen;
+
+    a->al = ALLOCMEMORY(space, NULL, Alignment, 1);
+    initAlignment(a->al, query, qrylen, 0, a->refseq, a->reflen, 0);
+
+    return 1;
+}
+
+void
+wrapMultiCharSeqAlignment(void *space, MultiCharSeqAlignment *a) {
+ wrapAlignment(a->al); /* presumably releases the data held by the Alignment -- TODO confirm */
+ FREEMEMORY(space, a->al); /* frees the Alignment struct allocated by the init functions */
+}
+/*----------------------- initMultiCharSeqAlignmentOpt -----------------------
+ *
+ * @brief init a mcsa with query bounds
+ *
+ * Reference: the window starts floff positions before pos, but not before
+ * the start of the sub-sequence that contains pos; a->floff receives the
+ * offset that was actually applied. The window is flen characters long
+ * unless it would run past the end of that sub-sequence.
+ * Query: the bounds [start, end] are widened by maxoff+uloff to the left
+ * and uroff+maxoff to the right and clamped to [0, qrylen]. a->qrystart and
+ * a->qrylen describe the widened part.
+ * a->al is allocated and initialized for the query against the window.
+ *
+ * @return 1 on success, 0 if the window start lies behind the end of the
+ *         sub-sequence (a message is written to stderr and a->al is not
+ *         allocated)
+ *
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+initMultiCharSeqAlignmentOpt(
+    void *space, MultiCharSeqAlignment* a, MultiCharSeq *seq, Uint pos,
+    char *qrydesc, char *query, Uint start, Uint end,
+    Uint qrylen, Uint floff, Uint flen, Uint uloff, Uint uroff, Uint maxoff, unsigned char strand)
+{
+
+    Uint sub_start,
+         sub_end,
+         rlen, qstart, qend, qlen;
+
+    //get bounds and length of reference sequence
+    a->subidx = getMultiCharSeqIndex(seq, &seq->sequences[pos]);
+    getMultiCharSeqIdxBounds(seq, a->subidx, &sub_start, &sub_end);
+    a->substart = sub_start;
+    a->subend = sub_end;
+    a->refstart = MAX(sub_start, (Lint)pos-floff); //maxoff
+    a->floff = pos - a->refstart;
+
+    //this should not happen
+    if(a->refstart > sub_end) {
+        /* the message now names the bound that is actually tested */
+        fprintf(stderr, "refstart > subend: skipping MultiCharSeqAlignment\n");
+        return 0;
+    }
+
+    rlen = flen;
+    a->refseq = &seq->sequences[a->refstart];
+    a->reflen = (sub_end > (Lint)a->refstart + rlen) ? rlen : (sub_end - a->refstart)+1;
+
+
+    //get bounds and length of query sequence
+    qstart = (start > maxoff+uloff) ? start-(maxoff+uloff) : 0;
+    qend = (end + uroff + maxoff < qrylen) ? end + uroff + maxoff : qrylen;
+    qlen = qend - qstart;
+    a->query = query;
+    a->qrystart = qstart;
+    a->qrylen = qlen;
+    a->strand = strand;
+
+    //descriptions
+    a->refdesc = ((CharSequence*)seq->ref[a->subidx].ref)->description;
+    a->qrydesc = qrydesc;
+    //init alignment and return
+    a->al = ALLOCMEMORY(space, NULL, Alignment, 1);
+    initAlignment(a->al, query, qlen, 0, a->refseq, a->reflen, 0);
+
+    return 1;
+}
+
+
diff --git a/segemehl/libs/sufarray/multicharseq.h b/segemehl/libs/sufarray/multicharseq.h
new file mode 100644
index 0000000..008ed24
--- /dev/null
+++ b/segemehl/libs/sufarray/multicharseq.h
@@ -0,0 +1,87 @@
+#ifndef MULTI_SEQ_H
+#define MULTI_SEQ_H
+
+/*
+ * multicharseq.h
+ * declarations for a data structure containing
+ * multiple concatenated character sequences
+ *
+ * @author Steve Hoffmann, shoffmann at zbh.uni-hamburg.de
+ * @company Center for Bioinformatics, Hamburg
+ * @date 12/11/06 15:09:15 CET
+ *
+ * SVN
+ * Revision of last commit: $Rev: 65 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-09-22 00:14:54 +0200 (Mon, 22 Sep 2008) $
+ *
+ * Id: $Id: multicharseq.h 65 2008-09-21 22:14:54Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/sufarray/multicharseq.h $
+ */
+
+
+#include "basic-types.h"
+#include "charsequence.h"
+#include "alignment.h"
+
+#define MSEQ_BSEARCH_THRESHOLD 10
+
+typedef struct { /* untyped handle to the original sequence object */
+ void *ref; /* cast to CharSequence* where it is dereferenced in initMultiCharSeqAlignmentOpt */
+} SeqReference;
+
+
+typedef struct { /* several sequences concatenated into one char array */
+ Uint numofsequences;
+ Uint totallength; /* taken as the number of suffixes by the suffix array code */
+ Uint *markpos; /*markpos[i] is the position of a*/
+ /*separator character between S_i and S_i+1*/
+ char *sequences; /*array of concatenated sequences*/
+ SeqReference *ref; /*ref[i] points to the original sequence*/
+ /*that starts at position markpos[i]*/
+ char *map; /* printed as the alphabet string by constructSufArr */
+ Uint mapsize; /* printed as the alphabet size by constructSufArr */
+ char delim; /* sequence characters equal to delim are skipped when child intervals are filtered */
+
+} MultiCharSeq;
+
+
+typedef struct { /* one query aligned against a window of a MultiCharSeq */
+ Uint subidx; /* index of the sequence containing the hit (from getMultiCharSeqIndex) */
+ Uint substart; /* bounds of that sequence as reported by getMultiCharSeqIdxBounds */
+ Uint subend;
+
+ char *refdesc; /* description of the referenced CharSequence */
+ char *refseq; /* points into seq->sequences at refstart (not a copy) */
+ Uint refstart; /* start of the reference window in the concatenated array */
+ Uint reflen; /* length of the reference window */
+ Uint floff; /* distance between the hit position and refstart */
+
+ char *qrydesc;
+ char *query;
+ Uint qrystart; /* start of the query window */
+ Uint qrylen; /* length of the query window */
+ Alignment *al; /* allocated in initMultiCharSeqAlignmentOpt, freed by wrapMultiCharSeqAlignment */
+ unsigned char strand;
+ char pass; /* NOTE(review): not set by initMultiCharSeqAlignmentOpt - confirm who initialises it */
+
+} MultiCharSeqAlignment;
+
+void dumpMultiCharSeq (MultiCharSeq *);
+MultiCharSeq* concatCharSequences(void *, CharSequence **, Uint, char, char); /* sufarray.c passes (space, seqs, count, (char)126, (char)127) */
+void destructMultiCharSeq(void*, MultiCharSeq *);
+Uint getMultiCharSeqIndex(MultiCharSeq *, char *); /* called with a pointer into ->sequences; result is used as an index into ->ref */
+Uint getMultiCharSeqRelPos(MultiCharSeq *, char *);
+CharSequence* getCharSequence(MultiCharSeq *, Uint idx);
+void getMultiCharSeqIdxBounds(MultiCharSeq *mseq, Uint idx, Uint *start, Uint *end); /* writes the bounds of sequence idx to *start and *end */
+int initMultiCharSeqAlignment(
+ void *space, MultiCharSeqAlignment* a, MultiCharSeq *seq, Uint pos, Uint loff, Uint len,
+ unsigned char strand, char *querydesc, char *query, Uint qlen);
+void wrapMultiCharSeqAlignment(void *space, MultiCharSeqAlignment *a); /* wraps and frees a->al */
+
+int initMultiCharSeqAlignmentOpt( /* returns 1 on success, 0 if the reference window is out of bounds */
+ void *space, MultiCharSeqAlignment* a, MultiCharSeq *seq, Uint pos,
+ char *qrydesc, char *query, Uint start, Uint end,
+ Uint qrylen, Uint floff, Uint flen, Uint uloff, Uint uroff, Uint maxoff, unsigned char strand) ;
+
+#endif
diff --git a/segemehl/libs/sufarray/sufarray.c b/segemehl/libs/sufarray/sufarray.c
new file mode 100644
index 0000000..4011760
--- /dev/null
+++ b/segemehl/libs/sufarray/sufarray.c
@@ -0,0 +1,1542 @@
+
+/*
+ * sufarray.c
+ * implementations for enhanced suffix arrays
+ * for large integer alphabets
+ *
+ * @author Steve Hoffmann
+ * @email shoffmann at zbh.uni-hamburg.de
+ * @date 12/11/06 14:56:57 CET
+ *
+ * SVN
+ * Revision of last commit: $Rev: 74 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-10-29 15:03:04 +0100 (Wed, 29 Oct 2008) $
+ *
+ * Id: $Id: sufarray.c 74 2008-10-29 14:03:04Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/sufarray/sufarray.c $
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include "basic-types.h"
+#include "memory.h"
+#include "mathematics.h"
+#include "sufarray.h"
+#include "charsequence.h"
+#include "falphabet.h"
+#include "stack.h"
+#include "vstack.h"
+#include "sort.h"
+#include "container.h"
+#include "vtprogressbar.h"
+#include "aluruSort.h"
+#include "vqueue.h"
+#include "debug.h"
+#include "md5.h"
+#include "info.h"
+#include "stringutils.h"
+#include "bitArray.h"
+
+unsigned char sl_diskacc = 0; /* set to 1 by readSuffixarray in SUFLINK_DISKACC builds; id() and getSuflink() then read from s->fd instead of memory */
+
+void /* frees data; the signature suggests use as an element destructor callback - confirm at call sites */
+destructinterval(void *space, void *data) {
+ FREEMEMORY(space, data);
+}
+
+
+/*------------------------------ checksuflinks -------------------------------
+ *
+ * @brief integrity check for suflinks: for every non-singleton child interval
+ *        below [i,j], report via DBG when its lcp value is not the lcp value
+ *        of its suffix link interval plus one
+ * @author Steve Hoffmann */
+
+void checksuflinks(Suffixarray *s, Uint i, Uint j){
+  Uint k, childlcp, suflcp, *space = NULL;
+  PairUint* child, childsuf;
+  Container *children;
+  // ignore singletons as initial input
+  if (i == j){
+    return;
+  }
+  children = getChildintervals(space, s, i, j, 0);
+  for (k = 0; k < bl_containerSize(children); k++){
+    child = (PairUint *) bl_containerGet(children, k);
+    // exclude singletons
+    if (child->a == child->b){
+      continue; /* skip only this child: siblings are still checked and the container is freed below */
+    }
+    // check suflink of child
+    childlcp = getlcpval(s, child->a, child->b);
+    childsuf = getSuflink(s, child->a, child->b);
+    suflcp = getlcpval(s, childsuf.a, childsuf.b);
+    if (childlcp != suflcp + 1){
+      DBG("suf[%u, %u, %u]=[%u, %u, %u]\n", child->a, child->b, childlcp,
+          childsuf.a, childsuf.b, suflcp);
+    }
+    // recursively check all children of child
+    checksuflinks(s, child->a, child->b);
+  }
+  bl_containerDestruct(children, NULL);
+  free(children);
+}
+
+/* ------------------------------ cmpCharSequence ----------------------------
+ *
+ * function to compare CharSequences for multikey sort (sort.c)
+ * returns 0 if equal at depth, 1 if a's character is greater, 2 if smaller
+ */
+
+ Uint
+cmpCharSequence (Uint a, Uint b, Uint depth, void *data, void *info)
+{
+ char *s = (char*) data; /* data is the character array being sorted */
+ Uint *end;
+
+ /*quick fix to meet end of multiintsequence criterion*/
+ if (info == NULL) {
+ if(s[b] == (char) 127) { /* b itself points at the character 127 */
+ if (s[a+depth] == (char) 127) {
+ return 0;
+ }
+ return 1;
+ }
+ } else {
+ end = (Uint*) info; /* info, when given, points to an end position */
+ if (*end == b) {
+ if (s[a+depth] == (char) 127) {
+ return 0;
+ }
+ return 1;
+ }
+ }
+
+
+ /*real comparison*/
+ if (s[a+depth] > s[b+depth]) return 1;
+ if (s[a+depth] < s[b+depth]) return 2;
+
+ return 0;
+}
+
+
+
+/* ---------------------------- constructSufArr -----------------------------
+ *
+ * constructs a suffix array over the concatenation of the len sequences in s
+ * should be working in O(n). It uses linear sorting method
+ * introduced by Aluru et al. (multikey quicksort if SUF_MKQUICKSORT is set).
+ * Only suftab, inv_suftab and the md5 key are built; other tables are NULL.
+ */
+
+ Suffixarray*
+constructSufArr(void *space,
+    CharSequence **s,
+    Uint len,
+    FAlphabet* alphabet,
+    unsigned char silent)
+{
+  /* alphabet is not used here; silent != 0 suppresses the progress messages */
+  Uint i, numofsuffixes,
+       *sorted,
+       *inv_suftab;
+  MultiCharSeq *mseq;
+  Suffixarray *arr;
+  unsigned char *temp,
+                *mdfive=NULL;
+
+
+
+  mseq = concatCharSequences(space, s, len, (char)126, (char)127);
+  numofsuffixes = mseq->totallength;
+  mdfive = ALLOCMEMORY(space, NULL, unsigned char, 16);
+  temp = MD5R((unsigned char*)mseq->sequences, numofsuffixes, NULL);
+
+  /* keep a private copy of the key and release MD5R's buffer */
+  memmove(mdfive, temp, 16);
+  FREEMEMORY(space, temp); /* readSuffixarray frees MD5R's result the same way */
+
+  if(!silent) NFO("alphabet of size (%d): %s\n", mseq->mapsize, mseq->map);
+  if(!silent) NFO("size of db sequence: %u\n", numofsuffixes);
+  inv_suftab = ALLOCMEMORY(space, NULL, Uint , numofsuffixes);
+  arr = ALLOCMEMORY(space, NULL, Suffixarray, 1);
+
+  if(!silent) MSG("constructing suftab.\n");
+
+#ifdef SUF_MKQUICKSORT
+  sorted = quickSortMultikey (space, mseq->sequences, numofsuffixes,
+      cmpCharSequence, numofsuffixes-1, NULL);
+#else
+  sorted = alurusort(space, mseq->sequences,
+      &(numofsuffixes));
+#endif
+
+  if (!silent)NFO("constructing inv_suftab (%u).\n", numofsuffixes);
+  for (i=0; i < numofsuffixes; i++) {
+    if (sorted[i] >= numofsuffixes) { fprintf(stderr, "construction error? %u: %u\n",i, sorted[i]); continue; }
+    inv_suftab[sorted[i]]=i; /* only reached for in-range positions: inv_suftab has numofsuffixes entries */
+  }
+  if (!silent)MSG("inv_suftab constructed.\n");
+
+  arr->seq = mseq;
+  arr->numofsuffixes = numofsuffixes;
+  arr->suftab = sorted;
+  arr->inv_suftab = inv_suftab;
+  arr->mdfive = mdfive;
+  arr->lcpctab = NULL;
+  arr->llvtab = NULL;
+
+  arr->id = NULL;
+  arr->idvtab = NULL;
+  arr->chldtab = NULL;
+  arr->bcktab = NULL;
+
+  arr->suflink = NULL;
+  arr->suflink_l = NULL;
+  arr->suflink_r = NULL;
+  arr->llint = 1;
+  return arr;
+}
+
+
+void /* serialises s to filename; exits the process if the file cannot be opened */
+writeSuffixarray(Suffixarray *s, char *filename) {
+ FILE *fp;
+ Uint nmemb,
+ idvmemb,
+ llvmemb;
+ unsigned char flags = 0; /* bit set telling the reader which optional tables follow */
+
+ fp = fopen(filename, "w");
+ if (fp == NULL) {
+ DBG("Couldn't open file %s. Exit forced.\n", filename);
+ exit(-1);
+ }
+
+ if (s->lcpctab != NULL) {
+ flags |= LCP_TAB_STORED;
+ }
+
+ if (s->chldtab != NULL) {
+ flags |= CHLD_TAB_STORED;
+ }
+
+ if(s->suflink != NULL) {
+ flags |= SUFLINK_TAB_STORED;
+ flags |= SUFLINK_COMPRESSED;
+ if(s->llint) flags |= LINT_SUFLINKS;
+ }
+
+ if(s->suflink_l != NULL) { /* NOTE(review): flag is set here but no suflink_l/suflink_r data is written below */
+ flags |= SUFLINK_TAB_STORED;
+ }
+
+ if(s->mdfive != NULL) {
+ flags |= MD5_STORED;
+ }
+
+ nmemb = s->numofsuffixes; /* layout: count, suftab, flags byte, then the optional tables in the order below */
+ fwrite(&nmemb, sizeof(Uint), 1, fp); /* NOTE(review): none of the fwrite results are checked */
+ fwrite(s->suftab, sizeof(Uint), nmemb, fp);
+ fwrite(&flags, sizeof(char), 1, fp);
+
+ if (s->lcpctab != NULL) {
+ fwrite(s->lcpctab, sizeof(char), nmemb, fp);
+ llvmemb = (Uint) s->llvcnt;
+ fwrite(&llvmemb, sizeof(Uint), 1, fp);
+ fwrite(s->llvtab, 2*sizeof(Uint), llvmemb, fp); /* reader uses sizeof(PairUint) per entry */
+ }
+
+ if (s->chldtab != NULL) {
+ fwrite(s->chldtab, sizeof(Uint), nmemb, fp); /* NOTE(review): reader uses sizeof(childtab) per entry - confirm both sizes agree */
+ }
+
+ if (s->suflink != NULL) {
+ fwrite(s->suflink, sizeof(Uint), nmemb, fp);
+ fwrite(s->id, sizeof(char), nmemb, fp);
+ idvmemb = (Uint) s->idvcnt;
+ fwrite(&idvmemb, sizeof(Uint), 1, fp);
+ fwrite(s->idvtab, sizeof(PairLSint), idvmemb, fp);
+ }
+
+ if (s->mdfive != NULL) {
+ fwrite(s->mdfive, sizeof(char), 16, fp); /* 16-byte md5 key of the sequence data */
+ }
+
+ fclose(fp);
+}
+
+
+Suffixarray * /* loads an index written by writeSuffixarray and checks its md5 key against seqs */
+readSuffixarray(void *space,
+ char *idxfilename,
+ CharSequence **seqs,
+ Uint len,
+ unsigned char silent) {
+ FILE *fp;
+ Uint nmemb = 0,
+ idvmemb = 0,
+ llvmemb = 0,
+ numofsuffixes,
+ *suftab = NULL,
+ idvi =0;
+ childtab *chldtab = NULL;
+ unsigned char flags=0,
+ *lcpctab = NULL;
+ unsigned char *mdfive=NULL,
+ *check=NULL;
+ PairUint *llvtab = NULL;
+ PairLSint *idvtab = NULL;
+ PairSint *idvutab = NULL;
+
+ MultiCharSeq *mseq;
+ Suffixarray *s;
+
+#ifdef SUFLINK_MMAP
+ int fd;
+ signed char *id = NULL;
+ long curiopos, offset;
+ struct stat sb;
+ char *suflinkptr;
+ int pagediff_id;
+ int pagediff_sl;
+#elif SUFLINK_DISKACC
+ int fd;
+ off_t off_sl;
+ off_t off_id;
+#else
+ signed char *id = NULL;
+ Uint *suflink = NULL;
+#endif
+
+ mseq = concatCharSequences(space, seqs, len, (char)126, (char)127);
+ numofsuffixes = mseq->totallength;
+
+ fp = fopen(idxfilename, "r");
+ if (fp == NULL) {
+ DBG("Couldn't open file '%s'. Exit forced.\n", idxfilename);
+ exit(-1);
+ }
+ /* NOTE(review): fread results are not checked and nmemb is not compared with numofsuffixes */
+ fread(&nmemb, sizeof(Uint), 1, fp);
+ suftab = ALLOCMEMORY(NULL, NULL, Uint, nmemb);
+ fread(suftab, sizeof(Uint), nmemb, fp);
+ fread(&flags, sizeof(char), 1, fp);
+
+ if (flags & LCP_TAB_STORED) {
+ if (!silent) MSG("reading lcpc/vtab.\n");
+ lcpctab = ALLOCMEMORY(space, NULL, unsigned char, nmemb);
+ fread(lcpctab, sizeof(unsigned char), nmemb, fp);
+
+ fread(&llvmemb, sizeof(Uint), 1, fp);
+ llvtab = ALLOCMEMORY(space, NULL, PairUint, nmemb); /* NOTE(review): sized by nmemb although only llvmemb entries are read */
+ fread(llvtab, sizeof(PairUint), llvmemb, fp);
+ }
+
+ if (flags & CHLD_TAB_STORED) {
+ if(!silent) MSG("reading childtab.\n");
+ chldtab = ALLOCMEMORY(space, NULL, childtab, nmemb);
+ fread(chldtab, sizeof(childtab), nmemb, fp);
+ }
+
+ if ((flags & SUFLINK_TAB_STORED)) {
+ if(!silent) MSG("reading suflinks.\n");
+
+#ifdef SUFLINK_MMAP
+ curiopos = ftell(fp);
+ fd = open(idxfilename, O_RDONLY);
+ if (fd == -1) {
+ perror("open");
+ exit(EXIT_FAILURE);
+ }
+
+ if (fstat(fd, &sb) == -1) {
+ perror("fstat");
+ exit(EXIT_FAILURE);
+ }
+
+ offset = curiopos & ~(sysconf(_SC_PAGE_SIZE) - 1); /* round down to a page boundary for mmap */
+ if (curiopos >= sb.st_size) {
+ fprintf(stderr, "offset is past end of file\n");
+ exit(EXIT_FAILURE);
+ }
+
+ pagediff_sl = curiopos - offset; /* bytes between the page boundary and the table start */
+ suflinkptr = mmap(0, nmemb*sizeof(Uint) + pagediff_sl, PROT_READ, MAP_SHARED, fd, offset);
+
+ if (suflinkptr == MAP_FAILED) {
+ perror("mmap");
+ exit(EXIT_FAILURE);
+ }
+#elif SUFLINK_DISKACC
+ sl_diskacc = 1;
+ off_sl = ftell(fp);
+ fd = open(idxfilename, O_RDONLY);
+#else
+ suflink = ALLOCMEMORY(space, NULL, Uint, nmemb);
+ fread(suflink, sizeof(Uint), nmemb, fp);
+#endif
+
+#ifdef SUFLINK_MMAP
+ offset = (curiopos+(nmemb*sizeof(Uint))) & ~(sysconf(_SC_PAGE_SIZE) - 1);
+ if (curiopos >= sb.st_size) {
+ fprintf(stderr, "offset is past end of file\n");
+ exit(EXIT_FAILURE);
+ }
+
+ pagediff_id = (curiopos+(nmemb*sizeof(Uint))) - offset;
+ id = mmap(0, nmemb*sizeof(signed char) + pagediff_id, PROT_READ, MAP_SHARED, fd, offset);
+
+ if (id == MAP_FAILED) {
+ perror("mmap");
+ exit(EXIT_FAILURE);
+ }
+ fseek(fp, nmemb*(sizeof(Uint)+sizeof(signed char)), SEEK_CUR);
+
+#elif SUFLINK_DISKACC
+ off_id = off_sl+(nmemb*sizeof(Uint));
+ fseek(fp, nmemb*(sizeof(Uint)+sizeof(signed char)), SEEK_CUR);
+#else
+ id = ALLOCMEMORY(space, NULL, signed char, nmemb);
+ fread(id, sizeof(signed char), nmemb, fp);
+#endif
+
+ fread(&idvmemb, sizeof(Uint), 1, fp);
+ idvtab = ALLOCMEMORY(space, NULL, PairLSint, idvmemb);
+ if ((flags & LINT_SUFLINKS)) {
+ if(!silent) MSG("reading lsint id.\n");
+ fread(idvtab, sizeof(PairLSint), idvmemb, fp);
+ } else {
+ idvutab = ALLOCMEMORY(space, NULL, PairSint, idvmemb); /* older layout: read narrow pairs, then widen */
+ if(!silent) MSG("reading uint id.\n");
+ fread(idvutab, sizeof(PairUint), idvmemb, fp);
+ for(idvi=0; idvi < idvmemb; idvi++) {
+ idvtab[idvi].a = idvutab[idvi].a;
+ idvtab[idvi].b = idvutab[idvi].b;
+ }
+ free(idvutab);
+ }
+ }
+
+ if ((flags & MD5_STORED)) {
+ mdfive = ALLOCMEMORY(space, NULL, unsigned char, 16);
+ fread(mdfive, sizeof(unsigned char), 16, fp);
+ }
+
+ s = ALLOCMEMORY(space, NULL, Suffixarray, 1);
+
+ if ((flags & LINT_SUFLINKS))
+ s->llint = 1; else s->llint=0;
+ s->suftab = suftab;
+ s->seq = mseq;
+ s->numofsuffixes = numofsuffixes;
+ s->lcpctab = lcpctab;
+ s->llvtab = llvtab;
+ s->llvcnt = llvmemb;
+ s->inv_suftab=NULL;
+ s->chldtab = chldtab;
+
+#ifdef SUFLINK_MMAP
+ s->suflink = (Uint*) &suflinkptr[pagediff_sl];
+ s->id = &id[pagediff_id];
+ s->pagediff_id = pagediff_id;
+ s->pagediff_sl = pagediff_sl;
+#elif SUFLINK_DISKACC
+ s->fd = fd;
+ s->off_sl = off_sl;
+ s->off_id = off_id;
+#else
+ s->suflink = suflink;
+ s->id = id;
+#endif
+ /* NOTE(review): in SUFLINK_DISKACC builds s->suflink and s->id are never assigned */
+ s->idvtab = idvtab;
+ s->idvcnt = idvmemb;
+ s->mdfive = mdfive;
+
+ if (!silent) NFO("read suffix array '%s' with %u elements.\n",
+ idxfilename, nmemb);
+
+ fclose(fp);
+
+ check = MD5R((unsigned char*)mseq->sequences, numofsuffixes, NULL);
+ //Uint di;
+ //fprintf(stderr, "fasta:");
+ //for(di=0; di < 16; di++) {
+ // fprintf(stderr, "%02x", check[di]);
+ //}
+ //fprintf(stderr, "\nindex:");
+ //for(di=0; di < 16; di++) {
+ // fprintf(stderr, "%02x", mdfive[di]);
+ //}
+ //fprintf(stderr, "\n");
+
+ if (mdfive == NULL) {
+ MSG("warning: index does not contain md5 key.\n");
+ } else {
+ if(checkmd5(check, mdfive) != 0) {
+ MSG("error: db and idx MD5 mismatch. Wrong db?\n");
+ int op = 0; /* int, so that EOF returned by fgetc stays distinguishable */
+ while(op != 'i' && op != 'u' && op != 'a' && op != EOF) {
+ MSG("options: (i)gnore (u)pdate index file (a)bort: ");
+ do {
+ op = fgetc(stdin);
+ } while(op != EOF && ISWHITESPACE(op));
+ }
+ if (op == 'u'){
+ NFO("updating suffix array '%s' on disk.\n", idxfilename);
+ memmove(s->mdfive, check, 16); /* copy the key: check itself is freed below */
+ writeSuffixarray(s, idxfilename);
+ }
+ if (op == 'a' || op == EOF){ /* closed stdin counts as abort instead of prompting forever */
+ exit(-1);
+ }
+ } else {
+ MSG("md5 keys of index and db match.\n");
+ }
+ }
+ FREEMEMORY(space, check);
+
+ return s;
+}
+
+
+
+/*------------------------------ destructSufArr ------------------------------
+ *
+ * destruct a suffix array: releases every table that is set, the
+ * concatenated sequence, and finally arr itself.
+ */
+ void
+destructSufArr (void *space, Suffixarray *arr)
+{
+ FREEMEMORY(space, arr->suftab);
+ if (arr->lcpctab != NULL)
+ FREEMEMORY(space, arr->lcpctab);
+ if (arr->inv_suftab != NULL)
+ FREEMEMORY(space, arr->inv_suftab);
+ if (arr->seq != NULL)
+ destructMultiCharSeq(space, arr->seq);
+ if (arr->idvtab != NULL)
+ FREEMEMORY(space, arr->idvtab);
+ if (arr->suflink != NULL) /* readSuffixarray stores mapping base + pagediff_sl here, so step back to the base */
+#ifdef SUFLINK_MMAP
+ munmap((char*) arr->suflink - arr->pagediff_sl, arr->numofsuffixes*sizeof(Uint) + arr->pagediff_sl);
+#else
+ FREEMEMORY(space, arr->suflink);
+#endif
+ if (arr->chldtab != NULL)
+ FREEMEMORY(space, arr->chldtab);
+ if (arr->mdfive)
+ FREEMEMORY(space, arr->mdfive);
+ if(arr->llvtab != NULL)
+ FREEMEMORY(space, arr->llvtab);
+ if(arr->id != NULL) /* same for id: the stored pointer is mapping base + pagediff_id */
+#ifdef SUFLINK_MMAP
+ munmap((char*) arr->id - arr->pagediff_id, arr->numofsuffixes*sizeof(signed char) + arr->pagediff_id);
+#else
+ FREEMEMORY(space, arr->id);
+#endif
+#ifdef SUFLINK_DISKACC
+ close(arr->fd); /* must happen while arr is still valid */
+#endif
+ FREEMEMORY(space, arr);
+ return ;
+}
+inline Uint
+lcp(Suffixarray *s, Uint i) {
+  /* lcp value at suffix array position i: values below 254 are stored in the
+   * one-byte table lcpctab, anything else is looked up by position in the
+   * sorted overflow table llvtab (a = position, b = lcp value). */
+  Uint key = i;
+  PairUint *hit;
+
+  if (s->lcpctab[i] < 254) {
+    return (Uint) s->lcpctab[i];
+  }
+
+  hit = bsearch(&key, s->llvtab, s->llvcnt, sizeof(PairUint), cmp_PairUint_bsearch);
+  if (hit != NULL) {
+    return hit->b;
+  }
+
+  DBG("lcp '%d' not found. Exit forced.\n", i);
+  exit(-1);
+}
+
+
+inline Uint
+maxlcp(Suffixarray *s) {
+  /* largest lcp value over all positions; lcp() is evaluated once per position */
+  Uint i, cur, max = 0;
+  for(i=0; i < s->numofsuffixes; i++) {
+    cur = lcp(s,i);
+    if (cur > max) max = cur;
+  }
+  return max;
+}
+
+/*------------------------------- constructLcp -------------------------------
+ *
+ * computes the lcp tab from suftab and inv_suftab in O(n).
+ * values below 254 go to lcpctab, larger ones to the overflow table llvtab.
+ */
+
+ void
+constructLcp (void *space, Suffixarray *arr)
+{
+ Uint i, j, k;
+ Uint max = 0;
+ Lint l=0; /* current match length, carried over from the previous text position */
+
+ /*arr->lcptab = ALLOCMEMORY(space, NULL, Uint, arr->numofsuffixes);
+ memset(arr->lcptab, 0, sizeof(Uint)*arr->numofsuffixes);*/
+
+ arr->lcpctab = ALLOCMEMORY(space, NULL, unsigned char, arr->numofsuffixes);
+ arr->llvcnt = 0;
+ arr->llvtab = NULL;
+
+ initProgressBarVT();
+ for(i=0; i < arr->numofsuffixes; i++) { /* i runs over text positions, j is the rank of suffix i */
+ j = arr->inv_suftab[i];
+
+ if (j > 0) {
+ k = arr->suftab[j-1]; /* text position of the suffix ranked directly before suffix i */
+ l=l-1;
+ if (l < 0) l=0;
+
+ while (arr->seq->sequences[i+l] == arr->seq->sequences[k+l]
+ // && arr->seq->sequences[i+l] != (char)126
+ // && arr->seq->sequences[k+l] != (char)126
+ ){
+ l++;
+ }
+
+ /* arr->lcptab[j] = l;*/
+ if (l > max) max = l;
+ if (l < 254) {
+ arr->lcpctab[j] = (char) l;
+ } else {
+ arr->lcpctab[j] = 254; /* marker: the real value is in llvtab */
+ arr->llvtab = ALLOCMEMORY(space, arr->llvtab, PairUint, arr->llvcnt+1); /* grows by one entry per large value */
+ arr->llvtab[arr->llvcnt].a = j;
+ arr->llvtab[arr->llvcnt].b = l;
+ arr->llvcnt++;
+ }
+ }
+ }
+
+ qsort(arr->llvtab, arr->llvcnt, sizeof(PairUint), cmp_PairUint_qsort); /* lcp() finds entries with bsearch */
+ arr->maxlcp=max;
+ arr->lcpctab[0]=0; /* rank 0 is skipped by the loop above */
+
+ return;
+}
+
+inline Lint
+id(Suffixarray *s, Uint i) {
+  /* signed offset stored for position i (computeId stores r-l at l or l-r at r
+   * for an interval [l,r]); read from disk when sl_diskacc is set */
+  PairLSint *retl;
+  signed char ch;
+  Lint vall;
+  ssize_t nbytes;
+
+  if(sl_diskacc) {
+    lseek(s->fd, s->off_id+(i*sizeof(signed char)), SEEK_SET);
+    nbytes = read(s->fd, &ch, sizeof(signed char));
+    /* a short read (0 bytes at end of file) would leave ch uninitialised */
+    if(nbytes != (ssize_t) sizeof(signed char)) {
+      perror("suflink access failed");
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    ch = s->id[i];
+  }
+
+  if(ch != (signed char)-128) {
+    return (Lint) ch;
+  }
+  /* -128 is the escape value: the real offset is in idvtab, keyed by i */
+  vall = (Lint) i;
+  retl = bsearch(&vall, s->idvtab, s->idvcnt, sizeof(PairLSint),
+      cmp_PairLSint_bsearch);
+  if (retl == NULL) {
+    DBG("id '%d' not found. Exit forced.\n", i);
+    exit(-1);
+  }
+
+  return retl->b;
+}
+
+
+inline unsigned char /* nonzero if the lcp at the position stored in chldtab[i] equals the lcp at i */
+isnextlIndex(Suffixarray *s, Uint i) {
+ return (lcp(s,s->chldtab[i].val) == lcp(s,i));
+}
+
+inline unsigned char /* nonzero if the lcp at the position stored in chldtab[i] is greater than the lcp at i */
+isdownIndex(Suffixarray *s, Uint i) {
+ return (lcp(s,s->chldtab[i].val) > lcp(s,i));
+}
+
+inline unsigned char /* nonzero if the lcp at i is greater than the lcp at i+1 */
+isupIndex(Suffixarray *s, Uint i) {
+ return (lcp(s,i) > lcp(s, i+1)); /* NOTE(review): reads position i+1; callers here only guarantee i < numofsuffixes - confirm the table bound */
+}
+
+inline Uint
+getfirstlindex(Suffixarray *s, Uint i, Uint j){
+  /* the whole array [0, n-1] and singleton intervals yield 0 */
+  if (i == j || (i == 0 && j == s->numofsuffixes-1)) {
+    return 0;
+  }
+  /* use the child table value at j when j is an up index and it lies in (i, j] */
+  if (j < s->numofsuffixes && isupIndex(s,j)
+      && s->chldtab[j].val > i && s->chldtab[j].val <= j) {
+    return s->chldtab[j].val;
+  }
+  /* otherwise fall back to the value at i if i is a down index */
+  if (isdownIndex(s,i)) return s->chldtab[i].val;
+  return 0;
+}
+
+inline Uint
+getlcpval(Suffixarray *s, Uint i, Uint j){
+  /* the whole array [0, n-1] and singleton intervals yield 0 */
+  if (i == j || (i == 0 && j == s->numofsuffixes-1)) {
+    return 0;
+  }
+  /* lcp at the child table value of j when j is an up index and it lies in (i, j] */
+  if (j < s->numofsuffixes && isupIndex(s,j)
+      && s->chldtab[j].val > i && s->chldtab[j].val <= j) {
+    return lcp(s, s->chldtab[j].val);
+  }
+  /* otherwise the lcp at the child table value of i if i is a down index */
+  if (isdownIndex(s,i)) return lcp(s, s->chldtab[i].val);
+  return 0;
+}
+
+
+inline void
+addinterval(void *space, Container *c, Uint a, Uint b) {
+  /* appends the interval [a,b] to c, then prints a diagnostic if the last
+   * element of c has a left bound greater than a; space is unused */
+  PairUint ival;
+  PairUint *last;
+
+  ival.a = a;
+  ival.b = b;
+  bl_containerAdd(c, &ival);
+  if (!(bl_containerSize(c) > 0)) return;
+
+  last = (PairUint*) bl_containerGet(c, bl_containerSize(c) - 1);
+  if (last->a > ival.a) printf("check->a: %d, range.a: %d\n", last->a, ival.a);
+  return;
+}
+
+inline Lint* /* child intervals of [i,j] as a malloc'd array of (left, right) pairs; caller frees */
+getChildintervalsArr(void *space,
+ Suffixarray *s,
+ Uint i,
+ Uint j,
+ Uint *noofintervals, /* out: number of pairs written */
+ BOOL checkdelim) { /* if set, children whose next character is seq->delim are left out */
+
+ Lint *c;
+ Uint count=0;
+ Uint i1,
+ i2,
+ lcp =0;
+ unsigned char child;
+
+ /*ALERT -1*/
+ child = (i > 0 || j < s->numofsuffixes-1); /* false only for the interval [0, numofsuffixes-1] */
+ c = (Lint*) malloc(sizeof(Lint)*s->seq->mapsize*2+1); /* NOTE(review): result unchecked; the +1 adds one byte, not one element */
+ if(checkdelim) lcp = getlcpval(s, i, j);
+
+ if(child) {
+ if (i < s->chldtab[j].val && s->chldtab[j].val <=j) {
+ i1 = s->chldtab[j].val;
+ } else {
+ i1 = s->chldtab[i].val;
+ }
+ if(!checkdelim || s->seq->sequences[s->suftab[i] + lcp] != s->seq->delim) {
+ c[count*2] = i; /* first child: [i, i1-1] */
+ c[count*2+1] = i1-1;
+ count++;
+ }
+
+ } else {
+ i1 = i;
+ }
+
+ while(i1 < s->numofsuffixes-1 && isnextlIndex(s,i1) && !isupIndex(s,i1)
+ && s->chldtab[i1].val != 0) {
+ i2 = s->chldtab[i1].val; /* follow the chain of next indices stored in chldtab */
+ if(!checkdelim || s->seq->sequences[s->suftab[i1] + lcp] != s->seq->delim) {
+ c[count*2] = i1;
+ c[count*2+1] = i2-1;
+ count++;
+ }
+ i1 = i2;
+ }
+
+ if(child && (!checkdelim || s->seq->sequences[s->suftab[i1] + lcp] != s->seq->delim)) {
+ c[count*2] = i1; /* last child: [i1, j] */
+ c[count*2+1] = j;
+ count++;
+ }
+
+ *noofintervals = count;
+ return c;
+}
+
+inline Container* /* child intervals of [i,j] as a malloc'd Container of PairUint; caller destructs and frees */
+getChildintervals(void *space,
+ Suffixarray *s,
+ Uint i,
+ Uint j,
+ BOOL checkdelim) { /* if set, children whose next character is seq->delim are left out */
+
+ Container *c;
+ Uint i1,
+ i2,
+ lcp = 0;
+ unsigned char child;
+
+ /*ALERT -1*/
+ child = (i > 0 || j < s->numofsuffixes-1); /* false only for the interval [0, numofsuffixes-1] */
+ c = (Container *) malloc(sizeof(Container)); /* NOTE(review): result unchecked */
+ bl_containerInit(c, 10, sizeof(PairUint));
+ if(checkdelim) lcp = getlcpval(s, i, j);
+
+ if(child) {
+ if (i < s->chldtab[j].val && s->chldtab[j].val <=j) {
+ i1 = s->chldtab[j].val;
+ } else {
+ i1 = s->chldtab[i].val;
+ }
+ if(!checkdelim || s->seq->sequences[s->suftab[i]+lcp] != s->seq->delim) {
+ addinterval(space, c, i, i1-1); /* first child: [i, i1-1] */
+ }
+
+ } else {
+ i1 = i;
+ }
+
+ while(i1 < s->numofsuffixes-1 && isnextlIndex(s,i1) && !isupIndex(s,i1)
+ && s->chldtab[i1].val != 0) {
+ i2 = s->chldtab[i1].val; /* follow the chain of next indices stored in chldtab */
+ if(!checkdelim || s->seq->sequences[s->suftab[i1]+lcp] != s->seq->delim) {
+ addinterval(space, c, i1, i2-1);
+ }
+ i1 = i2;
+ }
+
+ if(child && (!checkdelim || s->seq->sequences[s->suftab[i1]+lcp] != s->seq->delim)) {
+ addinterval(space, c, i1,j); /* last child: [i1, j] */
+ }
+ return c;
+}
+
+
+
+inline PairUint
+getSuflink(Suffixarray *s, Uint i, Uint j) {
+  Uint slidx, base;
+  ssize_t nbytes;
+  Lint off;
+  PairUint link;
+  Lint a, b;
+  /* the suffix link of [i,j] is stored at the interval's first l-index */
+  slidx = getfirstlindex(s, i, j);
+
+  if(sl_diskacc) {
+    lseek(s->fd, s->off_sl+(slidx*sizeof(Uint)), SEEK_SET);
+    nbytes = read(s->fd, &base, sizeof(Uint));
+    if(nbytes != (ssize_t) sizeof(Uint)) { /* also reject short reads, which would leave base incomplete */
+      perror("suflink access failed");
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    base = s->suflink[slidx];
+  }
+  /* base is one boundary of the target interval; id() gives the signed offset to the other */
+  if ((off=id(s, base)) > 0) {
+    a = base;
+    b = off + base;
+  } else {
+    a = base+off;
+    b = base;
+  }
+
+  link.a = a;
+  link.b = b;
+  return link;
+}
+
+inline PairUint
+jumpkSuflinks(Suffixarray *s, Uint i, Uint j, Uint k) {
+  /* follows up to k suffix links starting at [i,j]; stops early as soon as
+   * the current interval has lcp value 0 */
+  PairUint cur;
+  Uint left = k;
+  cur.a = i;
+  cur.b = j;
+  while (left > 0 && getlcpval(s, cur.a, cur.b) > 0) {
+    cur = getSuflink(s, cur.a, cur.b);
+    left--;
+  }
+  return cur;
+}
+
+/*---------------------------- constructchildtab -----------------------------
+ *
+ * @brief performs bottom-up traversals to construct childtable
+ * @author Steve Hoffmann
+ * requires the lcp table (uses lcp()); fills s->chldtab in two passes
+ */
+
+void
+constructchildtab(void *space, Suffixarray *s) {
+ Uint i;
+ Lint lastIndex = -1; /* -1 means: nothing popped since the last assignment */
+ Stack *stack;
+
+ s->chldtab = ALLOCMEMORY(space, NULL, childtab, s->numofsuffixes+1);
+ memset(s->chldtab, 0, s->numofsuffixes*sizeof(childtab)); /* NOTE(review): the extra (+1) element is not zeroed */
+ stack = ALLOCMEMORY(space, NULL, Stack, 1);
+ bl_stackInit(stack, 100000);
+
+ bl_stackPush(stack, 0);
+
+ for(i = 0; i < s->numofsuffixes; i++) /* first pass: values assigned from indices popped off the stack */
+ {
+ while(lcp(s,i) < lcp(s, bl_stackTop(stack))) {
+ lastIndex = bl_stackPop(stack);
+ if(lcp(s,i) <= lcp(s, bl_stackTop(stack)) &&
+ lcp(s,bl_stackTop(stack)) != lcp(s,lastIndex))
+ {
+ s->chldtab[bl_stackTop(stack)].val = lastIndex;
+ }
+ }
+ if (lastIndex != -1) {
+ s->chldtab[i-1].val = lastIndex;
+ lastIndex = -1;
+ }
+ bl_stackPush(stack, i);
+ }
+
+ /*construction of nextlIndex value*/
+ bl_stackDestruct(stack);
+ bl_stackInit(stack, 10000);
+ bl_stackPush(stack, 0);
+
+ for(i = 1; i < s->numofsuffixes; i++) { /* second pass: link positions of equal lcp value */
+ while(lcp(s,i) < lcp(s, bl_stackTop(stack))) {
+ bl_stackPop(stack);
+ }
+ if (lcp(s,i) == lcp(s, bl_stackTop(stack))) {
+ lastIndex = bl_stackPop(stack);
+ s->chldtab[lastIndex].val = i; /* overwrites whatever the first pass stored at lastIndex */
+ }
+ bl_stackPush(stack, i);
+ }
+
+ bl_stackDestruct(stack);
+ FREEMEMORY(space, stack);
+ return;
+}
+
+
+
+/*------------------------------- computeId ----------------------------------
+ *
+ * @brief performs a top down traversal on the tree represented
+ * by the suffix array to compute unique ids for each interval:
+ * for [l,r] either id[l] = r-l or id[r] = l-r; offsets that do not fit
+ * a signed char are marked -128 and stored in idvtab. @author Steve Hoffmann
+ */
+
+void
+computeId (void *space, Suffixarray *s) {
+ Uint i;
+ Lint l,
+ r;
+ Container *c;
+ VQueue vqueue;
+ PairUint ival;
+
+ bl_vqueueInit(&vqueue, 1000, sizeof(PairUint));
+ s->id = ALLOCMEMORY(space, NULL, char, s->numofsuffixes+2);
+ memset(s->id, 0, sizeof(char)*s->numofsuffixes+2);
+
+ s->idvtab = ALLOCMEMORY(space, NULL, PairLSint, 1);
+ s->idvcnt = 1;
+
+ ival.a = 0;
+ ival.b = s->numofsuffixes-1;
+ /* the root interval always goes to the overflow table */
+ s->id[0] = (signed char) -128;
+ s->idvtab[0].a = 0;
+ s->idvtab[0].b = (s->numofsuffixes-1);
+
+ bl_vqueueEnqueue(&vqueue, &ival);
+ while (!bl_vqueueIsEmpty(&vqueue)){
+
+ PairUint *tmp = (PairUint *) bl_vqueueDequeue(&vqueue, NULL);
+ memcpy(&ival, tmp, sizeof(PairUint));
+ free(tmp);
+
+ c = getChildintervals(space, s, ival.a, ival.b, 0);
+ for(i=0; i < bl_containerSize(c); i++){
+
+ l = ((PairUint*)bl_containerGet(c, i))->a;
+ r = ((PairUint*)bl_containerGet(c, i))->b;
+
+ if (l < r) { /* singleton children get no id and are not enqueued */
+ if(s->id[l] == 0) {
+ if (r-l <= 127){
+ s->id[l] = (signed char) (r-l); /* cast the difference, not r alone */
+ } else {
+ s->id[l] = (signed char)-128;
+ s->idvtab = ALLOCMEMORY(space, s->idvtab, PairLSint, s->idvcnt+1);
+ s->idvtab[s->idvcnt].a = l;
+ s->idvtab[s->idvcnt].b = r-l;
+ s->idvcnt = s->idvcnt+1;
+ }
+
+ // id[l] = r;
+ } else if(s->id[r] == 0) { /* left slot already taken: store a negative offset at r */
+
+ if(l-r > -128) {
+ s->id[r] = (signed char) (l-r); /* cast the difference, not l alone */
+ } else {
+ s->id[r] = (signed char)-128;
+ s->idvtab = ALLOCMEMORY(space, s->idvtab, PairLSint, s->idvcnt+1);
+ s->idvtab[s->idvcnt].a = r;
+ s->idvtab[s->idvcnt].b = l-r;
+ s->idvcnt = s->idvcnt+1;
+ }
+ // id[r] = -l;
+ } else {
+ DBG("ID failed id[l]:%d, id[r]:%d\n\n", s->id[l], s->id[r]);
+ exit(-1);
+ }
+ ival.a = l;
+ ival.b = r;
+ bl_vqueueEnqueue(&vqueue, &ival);
+ }
+ }
+ bl_containerDestruct(c, NULL);
+ free(c);
+ }
+ qsort(s->idvtab, s->idvcnt, sizeof(PairLSint), cmp_PairLSint_qsort); /* id() finds entries with bsearch */
+ bl_vqueueDestruct(&vqueue, NULL);
+ return;
+}
+
+
+/*------------------------------- getsufsucc ---------------------------------
+ *
+ * @brief performs a bottom-up traversal of the suffix array collecting
+ * cause(p)-successors for suffix link construction
+ * @author Steve Hoffmann
+ * @return array of numofsuffixes+2 entries (all bits set where unused); caller frees
+ */
+
+Uint *
+getsufsucc(void *space, Suffixarray *s){
+
+ Lint i,
+ lb,
+ //llcp,
+ llb,
+ min1,
+ min2,
+ m;
+
+ Uint *A;
+ Stack *stack,
+ *mstack;
+
+ A = ALLOCMEMORY(space, NULL, Uint, s->numofsuffixes+2);
+ memset(A, 255, sizeof(Uint)*(s->numofsuffixes+2)); /* cover all numofsuffixes+2 entries */
+
+ stack = ALLOCMEMORY(space, NULL, Stack, 1);
+ mstack = ALLOCMEMORY(space, NULL, Stack, 1);
+
+ bl_stackInit(stack, 100000);
+ bl_stackInit(mstack, 100000);
+
+ /*push lcp and lbound*/
+ bl_stackPush(stack, 0);
+ bl_stackPush(stack, 0);
+
+ /* mstack holds (text position, array position) pairs, array position on top */
+ bl_stackPush(mstack, s->suftab[0]);
+ bl_stackPush(mstack, 0);
+
+ for(i = 1; i < s->numofsuffixes; i++) {
+ lb = i-1;
+
+ bl_stackPush(mstack, s->suftab[i-1]);
+ bl_stackPush(mstack, i-1);
+
+ while (lcp(s,i) < bl_stackTop(stack)) {
+ bl_stackPop(stack);
+ //not used: llcp = bl_stackPop(stack);
+ llb = bl_stackPop(stack);
+
+ /*child interval is given by llcp-[llb, i-1]*/
+ /*cycle children here*/
+
+ min1 = s->numofsuffixes+1; /* smallest and second smallest text position seen */
+ min2 = s->numofsuffixes+1;
+ while (!bl_stackIsEmpty(mstack) && llb <= bl_stackTop(mstack) &&
+ bl_stackTop(mstack) <= i-1) {
+ bl_stackPop(mstack);
+ m = bl_stackPop(mstack);
+
+ if (m < min1) {
+ min2 = min1;
+ min1 = m;
+ } else {
+ if (m < min2 && m != min1)
+ min2 = m;
+ }
+ }
+ lb = llb;
+
+ bl_stackPush(mstack, min1);
+ bl_stackPush(mstack, lb);
+ if (id(s, lb) + lb == i-1) { /* record whichever boundary carries the id of [lb, i-1] */
+ A[min2+1] = lb;
+ } else {
+ A[min2+1] = i-1;
+ }
+ }
+
+ if(lcp(s,i) > bl_stackTop(stack)){
+ bl_stackPush(stack, lb);
+ bl_stackPush(stack, lcp(s,i));
+ }
+ }
+
+ bl_stackDestruct(stack);
+ bl_stackDestruct(mstack);
+ FREEMEMORY(space, stack);
+ FREEMEMORY(space, mstack);
+ return A;
+}
+
+
+/*---------------------------- constructsuflinks ----------------------------
+ *
+ * @brief performs a top down traversal on the tree represented
+ * by the suffix array
+ * @author Steve Hoffmann
+ * fills s->suflink (or suflink_l/suflink_r with EXPLICITSUFLINKS) from succ
+ */
+
+void
+constructsuflinks (void *space, Suffixarray *s, Uint *succ) {
+ Uint i,
+ a, b,
+ l, r,
+ max=0, pushes=0;
+ PairUint ab, lr, *tmp;
+ Lint u, v;
+ Uint d,
+ slidx,
+ lidx,
+ *B;
+ Container *c;
+ VStack *vstack;
+
+#ifdef EXPLICITSUFLINKS
+
+ PairUint suflink;
+ Lint m, n;
+
+ s->suflink_l = ALLOCMEMORY(space, NULL, Uint, s->numofsuffixes+1);
+ s->suflink_r = ALLOCMEMORY(space, NULL, Uint, s->numofsuffixes+1);
+ memset(s->suflink_l, 0, s->numofsuffixes*sizeof(Uint));
+ memset(s->suflink_r, 0, s->numofsuffixes*sizeof(Uint));
+#else
+ s->suflink = ALLOCMEMORY(space, NULL, Uint, s->numofsuffixes+1);
+ memset(s->suflink, 0, s->numofsuffixes*sizeof(Uint));
+#endif
+
+ vstack = (VStack *) malloc(sizeof(VStack));
+ bl_vstackInit(vstack, 100000, sizeof(PairUint));
+
+ B = ALLOCMEMORY(space, NULL, Uint, s->maxlcp+1); /* B[d]: id-carrying boundary of the interval last visited at lcp depth d; not initialised */
+
+ ab.a = 0;
+ ab.b = s->numofsuffixes-1;
+ bl_vstackPush(vstack, &ab);
+
+ while(!bl_vstackIsEmpty(vstack)){
+ if(max < bl_vstackSize(vstack)) max = bl_vstackSize(vstack);
+ tmp = (PairUint *) bl_vstackPop(vstack, NULL);
+ a = tmp->a;
+ b = tmp->b;
+ free(tmp);
+
+ c = getChildintervals(space, s, a, b, 0);
+ d = getlcpval(s, a, b);
+
+ if (id(s, a)+a == b) {
+ B[d] = a;
+ } else if (a == b+id(s, b)){
+ B[d] = b;
+ } else {
+ DBG("Id failed. id[a]: %d\n", id(s,a)); /* B[d] keeps its previous content in this case */
+ }
+
+ for (i=0; i < bl_containerSize(c); i++) {
+
+ lr = *((PairUint *) bl_containerGet(c,i));
+ l = lr.a;
+ r = lr.b;
+
+ if(l < r) {
+ bl_vstackPush(vstack, &lr); /* inner interval: visit later */
+ pushes++;
+ } else {
+
+ lidx = s->suftab[l]; /* singleton child: text position of the suffix */
+
+ if((succ[lidx] > 0 && succ[lidx] < s->numofsuffixes-1) &&
+ (abs(id(s, succ[lidx])) < s->numofsuffixes-1)) { /* NOTE(review): abs() takes int while id() returns Lint - confirm no truncation */
+
+ if (id(s, succ[lidx]) > 0) {
+ u = succ[lidx];
+ v = id(s, succ[lidx]) + succ[lidx];
+ } else {
+ u = succ[lidx] + id(s, succ[lidx]);
+ v = succ[lidx];
+ }
+
+ d = getlcpval(s, u, v);
+ slidx = getfirstlindex(s, u, v);
+
+#ifdef EXPLICITSUFLINKS
+
+ if (id(s, B[d-1]) > 0) { /* NOTE(review): no guard against d == 0 on this path, unlike the MAX(1,d) below */
+ m = B[d-1];
+ n = id(s, B[d-1])+B[d-1];
+ } else {
+ m = B[d-1] + id(s, B[d-1]);
+ n = B[d-1];
+ }
+
+ s->suflink_l[slidx] = m;
+ s->suflink_r[slidx] = n;
+ suflink = getSuflink(s, u, v);
+#else
+ d = MAX(1,d);
+ s->suflink[slidx] = B[d-1]; /* link target: the interval recorded one lcp level up */
+#endif
+
+#ifdef SUFLINKDEBUG
+
+ fprintf(stderr, "linking [%d,%d] -> [%d,%d] {%d,%d}\n",
+ u,v,m,n, s->inv_suftab[s->suftab[u]+1],
+ s->inv_suftab[s->suftab[v]+1]);
+
+
+ { Lint w;
+ for(w=0; w < getlcpval(s,u,v)-1; w++) {
+ if(s->seq->sequences[s->suftab[u] + w+1]!= s->seq->sequences[s->suftab[m]+ w]
+ || getlcpval(s, u, v) != getlcpval(s, m, n)+1){
+ DBG("Suffixlink construction failed with %d-[%d,%d] -> %d-[%d,%d]\n", getlcpval(s, u, v), u, v, getlcpval(s, m, n),m, n);
+ exit(-1);
+ }
+ if(s->seq->sequences[s->suftab[v]+w+1]!= s->seq->sequences[s->suftab[n] + w]
+ || getlcpval(s, u, v) != getlcpval(s, m, n)+1){
+ DBG("Suffixlink construction failed with %d-[%d,%d] -> %d-[%d,%d]\n", getlcpval(s, u, v), u, v, getlcpval(s, m, n), m, n);
+ exit(-1);
+ }
+ }
+ }
+#endif
+ }
+ }
+ }
+ bl_containerDestruct(c, NULL);
+ free(c);
+ }
+ bl_vstackDestruct(vstack, NULL);
+ free(vstack);
+ FREEMEMORY(space, B);
+ fprintf(stderr, "suflink construction. pushes: %d, maxstack: %d\n", pushes, max);
+ return;
+}
+
+
+
+Uint
+childCount(void *space,
+    Suffixarray *s,
+    Uint i,
+    Uint j) {
+  /* number of child intervals of [i,j]; walks chldtab like getChildintervals */
+  Uint childcount=0;
+  Uint i1,
+       i2;
+  unsigned char child;
+
+  /*ALERT -1*/
+  child = (i > 0 || j < s->numofsuffixes-1);
+
+  if(child) {
+    if (i < s->chldtab[j].val && s->chldtab[j].val <=j) {
+      i1 = s->chldtab[j].val;
+    } else {
+      i1 = s->chldtab[i].val;
+    }
+    childcount++;
+
+  } else {
+    i1 = i;
+  }
+  /* same bound as in getChildintervals: isupIndex reads position i1+1 */
+  while(i1 < s->numofsuffixes-1 && isnextlIndex(s,i1) && !isupIndex(s,i1)
+      && s->chldtab[i1].val != 0) {
+    i2 = s->chldtab[i1].val;
+    childcount++;
+    i1 = i2;
+  }
+
+  if(child) {
+    childcount++;
+  }
+
+  return childcount;
+}
+
+
+
+/*---------------------------- getCharIntervalArr ----------------------------
+ *
+ * @brief array based variant of getCharInterval: fetches the child
+ *        intervals of [i,j] as a flat array of (left,right) pairs from
+ *        getChildintervalsArr and picks the first child whose left-most
+ *        suffix carries character ch at offset lcp(i,j)
+ *        pos deprecated (write what you like)
+ * @return returns an interval [l,r], empty interval l > r (l=1, r=0)
+ *         if i==j or no child starts with ch
+ * @author Steve Hoffmann
+ *
+ */
+
+
+inline PairUint
+getCharIntervalArr(void *space,
+    Suffixarray *s,
+    Uint i,
+    Uint j,
+    Uint pos,
+    char ch)
+{
+  PairUint hit;
+  Lint *bounds;
+  Uint nchildren = 0;
+  Uint depth;
+  Uint k;
+
+  hit.a = 1;
+  hit.b = 0;
+
+  /* singleton intervals have no children */
+  if (i == j) return hit;
+
+  bounds = getChildintervalsArr(space, s, i, j, &nchildren, 1);
+  depth = getlcpval(s, i, j);
+
+  for (k = 0; k < nchildren; k++) {
+    Lint *pair = &bounds[k*2];
+
+    if (s->seq->sequences[ s->suftab[pair[0]] + depth] != ch) continue;
+
+    hit.a = pair[0];
+    hit.b = pair[1];
+    break;
+  }
+
+  free(bounds);
+  return hit;
+}
+
+
+
+/*----------------------------- getCharInterval ------------------------------
+ *
+ * @brief gets the child interval of [i,j] starting with character ch:
+ *        the children are obtained as a Container of PairUint from
+ *        getChildintervals; the first child whose left-most suffix
+ *        carries ch at offset lcp(i,j) is returned
+ *        pos deprecated (write what you like)
+ * @return returns an interval [l,r], empty interval l > r (l=1, r=0)
+ *         if i==j or no child starts with ch
+ * @author Steve Hoffmann
+ *
+ */
+
+inline PairUint
+getCharInterval(void *space,
+    Suffixarray *s,
+    Uint i,
+    Uint j,
+    Uint pos,
+    char ch)
+{
+  PairUint hit;
+  Container *children;
+  Uint depth;
+  Uint k;
+
+  hit.a = 1;
+  hit.b = 0;
+
+  /* singleton intervals have no children */
+  if (i == j) return hit;
+
+  children = getChildintervals(space, s, i, j, 1);
+  depth = getlcpval(s, i, j);
+
+  for (k = 0; k < bl_containerSize(children); k++) {
+    PairUint *child = (PairUint*) bl_containerGet(children, k);
+
+    if (s->seq->sequences[ s->suftab[child->a] + depth] != ch) continue;
+
+    hit.a = child->a;
+    hit.b = child->b;
+    break;
+  }
+
+  /* the container is owned by this function */
+  bl_containerDestruct(children, NULL);
+  free(children);
+  return hit;
+}
+
+
+
+/*-------------------------------- dumpSufArr --------------------------------
+ *
+ * dumps a suffix array to the screen (stdout), one line per suffix:
+ * rank, suftab entry, lcp value, inv_suftab entry, first character
+ * of the suffix as a number and the suffix itself as a string
+ *
+ */
+
+void
+dumpSufArr (Suffixarray *arr)
+{
+  Uint rank = 0;
+
+  while (rank < arr->numofsuffixes) {
+    const Uint start = arr->suftab[rank];
+    char *suffix = &arr->seq->sequences[start];
+
+    printf("%d \t %d \t %d \t %d \t %d \t %s\n", rank,
+        start,
+        lcp(arr,rank),
+        arr->inv_suftab[rank],
+        *suffix,
+        suffix);
+    rank++;
+  }
+
+  return;
+}
+
+/*--------------------------------- dumplcps ---------------------------------
+ *
+ * for every rank with a non-zero lcp value, prints the lcp length and
+ * the first lcp+1 characters (as numbers) of the two neighbouring
+ * suffixes at ranks i-1 and i
+ *
+ */
+
+void
+dumplcps(Suffixarray *arr) {
+  Uint i, j;
+
+  for(i=0; i < arr->numofsuffixes; i++) {
+    Uint prev, cur;
+
+    if (lcp(arr,i) == 0) continue;
+
+    /* text offsets of the two suffixes that share the prefix */
+    prev = arr->suftab[i-1];
+    cur = arr->suftab[i];
+
+    printf("lcp of suffix %d and %d has length %d\t:\n", i-1, i, lcp(arr,i));
+
+    for(j=0; j <= lcp(arr,i); j++) {
+      printf(" %d ", arr->seq->sequences[prev+j]);
+    }
+    printf("\n");
+
+    for(j=0; j <= lcp(arr,i); j++) {
+      printf(" %d ", arr->seq->sequences[cur+j]);
+    }
+    printf("\n");
+  }
+}
+
+/*-------------------------------- dumplcptab --------------------------------
+ *
+ * prints the lcp value of every rank, one per line, followed by an
+ * empty line
+ *
+ */
+
+void
+dumplcptab(Suffixarray *s) {
+
+  Uint rank = 0;
+
+  while (rank < s->numofsuffixes) {
+    printf("i:%d lcp:%d\n",
+        rank, lcp(s,rank));
+    rank++;
+  }
+
+  printf("\n");
+
+}
+
+
+/*------------------------------- dumpchildtab -------------------------------
+ *
+ * prints for every index of the child table its three type flags
+ * (up, down, nextlIndex) and the stored value, followed by an empty
+ * line
+ *
+ */
+
+void
+dumpchildtab(Suffixarray *s) {
+  Uint i;
+
+  for(i=0; i < s->numofsuffixes; i++) {
+    /* the "up" column reports isupIndex; it used to repeat isnextlIndex */
+    printf("i:%d up:%d, down:%d, nextlIndex:%d := %d\n",
+        i, isupIndex(s,i), isdownIndex(s,i), isnextlIndex(s,i),
+        s->chldtab[i].val);
+  }
+
+  printf("\n");
+}
+
+
+
+
+/*------------------------------- searchSuffix -------------------------------
+ *
+ * @brief looking up a suffix: starting from the full interval
+ *        [0, numofsuffixes-1] the search repeatedly selects the child
+ *        interval for the next unmatched pattern character
+ *        (getCharInterval) and compares the pattern against the suffix
+ *        at the left border of that interval
+ * @param space passed through to getCharInterval
+ * @param arr the suffix array to search in
+ * @param p the pattern
+ * @param len number of pattern characters to match
+ * @return the interval reached once len characters are matched, or the
+ *         empty interval (a=1, b=0) if the pattern does not occur
+ * @author Steve Hoffmann
+ *
+ */
+
+ PairUint
+searchSuffix (void *space, Suffixarray *arr, char *p, Uint len)
+{
+
+ PairUint res, cur;
+ Uint i = 0, ell;
+ char *suf, *q, *qend, *sufend;
+
+ /* res is the empty interval returned on failure */
+ res.a = 1;
+ res.b = 0;
+ /* cur starts as the interval covering the whole suffix table */
+ cur.a = 0;
+ cur.b = arr->numofsuffixes-1;
+
+ /* q walks over the pattern, qend points to its last character */
+ /* NOTE(review): for len == 0 qend points before p - confirm callers never pass 0 */
+ q = p;
+ qend = &q[len-1];
+
+ do {
+
+ /* descend into the child interval that continues with *q */
+ cur = getCharInterval(space, arr, cur.a, cur.b, 0, *q);
+ if (cur.a > cur.b) return res;
+
+ /* a proper interval is compared up to its lcp value, a singleton up to len */
+ if(cur.a < cur.b) {
+ ell = getlcpval(arr, cur.a, cur.b);
+ } else {
+ ell = len;
+ }
+
+ /* compare against the left-most suffix, skipping the i characters already matched */
+ suf = &arr->seq->sequences[arr->suftab[cur.a]];
+ sufend = &suf[MIN(ell, len)-1];
+ suf = &suf[i];
+
+ /* NOTE(review): *q is read before q <= qend is checked, so one character
+ * behind the pattern is read when the pattern is exhausted - presumably
+ * p is expected to be NUL-terminated; confirm */
+ while(*q && suf <= sufend && q <= qend && *suf == *q) {
+ ++q;
+ ++suf;
+ ++i;
+ }
+
+ /* the loop stopped on a real mismatch, not because either end was reached */
+ if(*q && suf <= sufend && q <= qend && *suf != *q) return res;
+
+ } while (i < len);
+
+ return cur;
+}
+
diff --git a/segemehl/libs/sufarray/sufarray.h b/segemehl/libs/sufarray/sufarray.h
new file mode 100644
index 0000000..744b35c
--- /dev/null
+++ b/segemehl/libs/sufarray/sufarray.h
@@ -0,0 +1,124 @@
+#ifndef SUF_ARRAY_H
+#define SUF_ARRAY_H
+
+/*
+ *
+ * sufarray.h
+ * declarations for enhanced suffix arrays
+ * for char alphabets
+ *
+ * @author Steve Hoffmann, shoffmann at zbh.uni-hamburg.de
+ * @company Center for Bioinformatics, Hamburg
+ * @date 12/10/06 22:01:33 CET
+ *
+ * SVN
+ * Revision of last commit: $Rev: 72 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-10-28 18:14:42 +0100 (Tue, 28 Oct 2008) $
+ *
+ * Id: $Id: sufarray.h 72 2008-10-28 17:14:42Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/libs/sufarray/sufarray.h $
+ */
+
+#include <sys/types.h>
+#include "basic-types.h"
+#include "falphabet.h"
+#include "charsequence.h"
+#include "container.h"
+#include "multicharseq.h"
+
+
+/*
+ * single-bit flags (bits 0 to 5) of type unsigned char
+ * NOTE(review): presumably OR-ed into one byte that records which
+ * tables a stored index contains - confirm against readSuffixarray
+ * and writeSuffixarray
+ */
+#define LCP_TAB_STORED ((unsigned char) (1 << 0))
+#define CHLD_TAB_STORED ((unsigned char) (1 << 1))
+#define SUFLINK_TAB_STORED ((unsigned char) (1 << 2))
+#define SUFLINK_COMPRESSED ((unsigned char) (1 << 3))
+#define MD5_STORED ((unsigned char) (1 << 4))
+#define LINT_SUFLINKS ((unsigned char) (1 << 5))
+
+/*
+ * a pointer into an Uint array paired with a position
+ * NOTE(review): not used in the declarations of this header;
+ * the exact meaning of suf/pos has to be taken from its users
+ */
+typedef struct {
+ Uint *suf;
+ Uint pos;
+} suffix_t;
+
+/*
+ * one entry of the child table: a single value per index
+ * the commented-out fields show that up, down and nextlIndex were
+ * once separate members; the kind of an entry is queried with
+ * isupIndex, isdownIndex and isnextlIndex
+ */
+typedef struct {
+ /* int up;
+ int down;
+ int nextlIndex;
+ */
+
+ Uint val;
+} childtab;
+
+
+/*
+ * enhanced suffix array over a multi sequence: the suffix table with
+ * its inverse plus lcp, child, suffix link and id tables
+ * NOTE(review): which tables are filled depends on the construction
+ * path (constructLcp, constructchildtab, constructsuflinks, computeId);
+ * the notes on fields without an original comment are derived from
+ * names and types only and should be confirmed
+ */
+typedef struct {
+ MultiCharSeq *seq; /* the indexed sequences*/
+
+ Uint numofsuffixes; /* number of entries of suftab*/
+ Uint *suftab; /* rank -> text position*/
+ Uint *inv_suftab; /* presumably text position -> rank*/
+ Uint *suflink; /* one link per interval; used without EXPLICITSUFLINKS*/
+ Uint *suflink_l; /* explicit left border of the link target*/
+ Uint *suflink_r; /* explicit right border of the link target*/
+
+ Uint *bwttab; /* burrows-wheeler array*/
+ Uint *lcptab; /* alternative: Abouelhoda et al.*/
+
+ unsigned char *lcpctab; /* nB to store lcp values < 255*/
+ PairUint *llvtab; /* array of 8B to store lcp val >=255*/
+ Uint llvcnt;
+ Uint maxlcp;
+
+ /* NOTE(review): id/idvtab look like the same small-value/overflow scheme
+ * as lcpctab/llvtab, read through id() - confirm in computeId */
+ signed char *id;
+ PairLSint *idvtab;
+ Uint idvcnt;
+
+ childtab *chldtab; /* a child table*/
+ Uint *bcktab; /* the bucket container*/
+
+ unsigned char *mdfive;
+ unsigned char llint;
+#ifdef SUFLINK_MMAP
+ int pagediff_id;
+ int pagediff_sl;
+#endif
+ int fd;
+ off_t off_sl;
+ off_t off_id;
+
+} Suffixarray;
+
+
+
+/* construction, (de)serialisation and destruction */
+Uint getMultiCharSeqIndex (MultiCharSeq *mseq, char *ptr);
+Suffixarray* readSuffixarray(void *, char *, CharSequence **, Uint, unsigned char silent);
+void writeSuffixarray(Suffixarray *s, char *filename);
+Suffixarray* constructSufArr(void *, CharSequence **, Uint, FAlphabet *, unsigned char silent);
+void constructchildtab(void *, Suffixarray *);
+void constructsuflinks(void *, Suffixarray *, Uint *);
+void constructLcp (void *, Suffixarray *);
+void computeId(void*, Suffixarray *);
+Uint* getsufsucc(void *, Suffixarray *);
+void checksuflinks(Suffixarray *s, Uint i, Uint j);
+void destructSufArr (void *, Suffixarray *);
+/* interval navigation: child intervals are returned either as a */
+/* Container of PairUint or as a flat Lint array of (left,right) pairs */
+extern Container* getChildintervals(void *, Suffixarray *, Uint, Uint, BOOL);
+extern Lint* getChildintervalsArr(void *, Suffixarray *, Uint, Uint, Uint *, BOOL);
+extern PairUint getCharInterval(void *, Suffixarray *, Uint, Uint, Uint, char);
+extern PairUint getCharIntervalArr(void *, Suffixarray *, Uint, Uint, Uint, char);
+extern PairUint getSuflink(Suffixarray *, Uint, Uint);
+extern PairUint jumpkSuflinks(Suffixarray *s, Uint i, Uint j, Uint k);
+/* table accessors */
+extern Uint getlcpval(Suffixarray *, Uint, Uint);
+extern Uint getfirstlindex(Suffixarray *, Uint, Uint);
+extern Lint id (Suffixarray *, Uint);
+extern Uint lcp(Suffixarray *s, Uint i);
+extern Uint maxlcp(Suffixarray *s);
+extern unsigned char isnextlIndex(Suffixarray *s, Uint i);
+extern unsigned char isdownIndex(Suffixarray *s, Uint i);
+extern unsigned char isupIndex(Suffixarray *s, Uint i);
+/* jumpkSuflinks is declared a second time here, identical to the one above */
+extern PairUint jumpkSuflinks(Suffixarray *s, Uint i, Uint j, Uint k);
+/* helpers, debugging output and pattern lookup */
+extern void addinterval(void *space, Container *c, Uint a, Uint b);
+void destructinterval(void *space, void *data);
+void dumplcps (Suffixarray *);
+PairUint searchSuffix (void *space, Suffixarray *arr, char *p, Uint len);
+
+#endif
+
diff --git a/segemehl/libs/sufarray/sufmatch.c b/segemehl/libs/sufarray/sufmatch.c
new file mode 100644
index 0000000..7d34ce0
--- /dev/null
+++ b/segemehl/libs/sufarray/sufmatch.c
@@ -0,0 +1,599 @@
+
+/*
+ * sufmatch.c
+ * functions for matching in suffixarrays
+ *
+ * @author Steve Hoffmann
+ * @email shoffmann at zbh.uni-hamburg.de
+ * @date 12/19/06 15:13:18 CET
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: sufmatch.c 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/sufarray/sufmatch.c $
+ */
+
+
+ #include <stdlib.h>
+ #include <stdio.h>
+ #include <string.h>
+ #include <math.h>
+ #include "basic-types.h"
+ #include "memory.h"
+ #include "mathematics.h"
+ #include "sufarray.h"
+ #include "sufmatch.h"
+ #include "mm.h"
+ #include "intsequence.h"
+ #include "list.h"
+ #include "depictseqs.h"
+ #include "gnuplot_i.h"
+ #include "dpalign.h"
+ #include "cantor.h"
+ #include "wurstimbiss.h"
+ #include "falphabet.h"
+
+/*------------------------------- suffixscore --------------------------------
+ *
+ * scoring function for sw alignments of structure sequences:
+ * a match scores 3, a mismatch -2; info is accepted to fit the
+ * scoring callback signature but is not evaluated
+ *
+ */
+
+int
+suffixscore (symtype a, symtype b, void* info)
+{
+  (void) info;
+
+  return (a == b) ? 3 : -2;
+}
+
+
+/*------------------------------- sufSubstring -------------------------------
+ *
+ * retrieve the substring matches in a suffixarray: for each of the
+ * first len-sublen start positions of pattern the window of length
+ * sublen is looked up with mmsearch in the full interval
+ * [0, numofsuffixes-1]
+ *
+ * returns an array of len-sublen intervals that the caller has to
+ * free, or NULL if len <= sublen
+ *
+ */
+
+PairSint*
+sufSubstring (void *space, Suffixarray *arr, Uint *pattern,
+    Uint len, Uint sublen)
+{
+  Uint start, nwindows;
+  PairSint *hits, found;
+
+  if (len <= sublen) return NULL;
+
+  nwindows = len-sublen;
+  hits = ALLOCMEMORY(space, NULL, PairSint, nwindows);
+
+  for (start = 0; start < nwindows; start++) {
+    found = mmsearch(arr, &pattern[start], sublen, 0, 0, arr->numofsuffixes-1);
+    hits[start].a = found.a;
+    hits[start].b = found.b;
+  }
+
+  return hits;
+}
+
+
+/*------------------------------ reportSufmatch ------------------------------
+ *
+ * prints the matches: for every interval in matches whose right
+ * border is at least threshold above its left border, one line per
+ * suffix with the rank and the description of the sequence the
+ * suffix belongs to
+ *
+ */
+
+void
+reportSufmatch (Suffixarray *a, PairSint *matches,
+    Uint len, Uint threshold,
+    IntSequence **s)
+{
+  Uint i, rank;
+
+  for (i=0; i < len; i++) {
+
+    /* skip intervals that are too small */
+    if (matches[i].b < ((matches[i].a)+threshold)) continue;
+
+    /*valid matches*/
+    for (rank=matches[i].a; rank <= matches[i].b; rank++) {
+      Uint seqidx = getMultiSeqIndex(a->seq, &a->seq->sequences[a->suftab[rank]]);
+
+      printf("[%d]:\t %s\n", rank, s[seqidx]->description);
+    }
+  }
+
+  return ;
+}
+
+
+/*---------------------------------- cmp_* -----------------------------------
+ *
+ * compare functions for the C library's qsort and bsearch
+ * 1. cmp_suffixno : used in rankSufmatch to sort sequence numbers
+ * 2. cmp_ranks : used in rankSufmatch to sort sequence ranks
+ * 3. cmp_rank_ptr : used in rankSufmatchList to sort sequence ranks
+ *
+ */
+
+/* qsort comparator: orders Matchtype elements ascending by their blast value */
+int
+cmp_blast(const void *a, const void *b) {
+  const double lhs = (double) ((const Matchtype*)a)->blast;
+  const double rhs = (double) ((const Matchtype*)b)->blast;
+
+  /* 1, -1 or 0 */
+  return (lhs > rhs) - (lhs < rhs);
+}
+
+
+/*
+ * qsort comparator: orders Matchtype elements ascending by swscore;
+ * if both swscores are zero the elements are ordered by count
+ */
+int
+cmp_swscore(const void *a, const void *b) {
+  const Matchtype *lhs = (const Matchtype*)a;
+  const Matchtype *rhs = (const Matchtype*)b;
+  double x, y;
+
+  if (lhs->swscore == 0 && rhs->swscore == 0 && lhs->count != rhs->count) {
+    return (lhs->count > rhs->count) ? 1 : -1;
+  }
+
+  x = (double) lhs->swscore;
+  y = (double) rhs->swscore;
+
+  return (x > y) - (x < y);
+}
+
+/*
+ * qsort comparator: orders Matchtype elements ascending by score;
+ * if both scores are zero the elements are ordered by count
+ */
+int
+cmp_score(const void *a, const void *b) {
+  const Matchtype *lhs = (const Matchtype*)a;
+  const Matchtype *rhs = (const Matchtype*)b;
+  double x, y;
+
+  if (lhs->score == 0 && rhs->score == 0 && lhs->count != rhs->count) {
+    return (lhs->count > rhs->count) ? 1 : -1;
+  }
+
+  x = (double) lhs->score;
+  y = (double) rhs->score;
+
+  return (x > y) - (x < y);
+}
+
+
+/* qsort/bsearch comparator: orders Matchtype elements ascending by id */
+int
+cmp_suffixno(const void *a, const void* b) {
+  const Matchtype *lhs = (const Matchtype*)a;
+  const Matchtype *rhs = (const Matchtype*)b;
+
+  if (lhs->id == rhs->id) return 0;
+
+  return (lhs->id > rhs->id) ? 1 : -1;
+}
+
+
+/* qsort comparator: orders Matchtype elements ascending by count */
+int
+cmp_ranks(const void *a, const void* b) {
+  const Matchtype *lhs = (const Matchtype*)a;
+  const Matchtype *rhs = (const Matchtype*)b;
+
+  if (lhs->count == rhs->count) return 0;
+
+  return (lhs->count > rhs->count) ? 1 : -1;
+}
+
+/*
+ * qsort comparator for arrays of Matchtype pointers: orders the
+ * referenced elements ascending by count
+ */
+int
+cmp_rank_ptr(const void *a, const void* b) {
+  const Matchtype *lhs = *(Matchtype* const*)a;
+  const Matchtype *rhs = *(Matchtype* const*)b;
+
+  if (lhs->count == rhs->count) return 0;
+
+  return (lhs->count > rhs->count) ? 1 : -1;
+}
+
+/*------------------------------ freeMatchtype -------------------------------
+ *
+ * a function to delete Matchtypes in a list: releases the pos and
+ * org arrays of the element and then the element itself
+ *
+ * data has to point to a heap allocated Matchtype whose pos and org
+ * members are either NULL (see initMatchtype) or heap allocated
+ *
+ */
+
+void
+freeMatchtype (void *space, void *data)
+{
+  Matchtype *d = (Matchtype*)data;
+
+  FREEMEMORY(space, d->pos);
+  /* org grows together with pos in scorefilter and has to go as well */
+  FREEMEMORY(space, d->org);
+  FREEMEMORY(space, data);
+}
+
+
+/*---------------------------------- subscr ----------------------------------
+ *
+ * a function that assigns scores from a substitution matrix for matches
+ * and mismatches; info has to point to an imbissinfo whose sub member
+ * holds the matrix of doubles, addressed with a row length of 308
+ *
+ * the looked up value is truncated to int and bounded below by -10
+ *
+ */
+
+int
+subscr (symtype a, symtype b, void *info )
+{
+  imbissinfo *imbiss = (imbissinfo*) info;
+  double *matrix = (double*) imbiss->sub;
+  int score;
+
+  score = MATRIX2D(matrix, 308, a ,b);
+
+  return (score < -10) ? -10 : score;
+}
+
+
+
+/*-------------------------------- getEntropy --------------------------------
+ *
+ * calculates the entropy term of a sequence, given probabilities:
+ * returns the sum of prob[x]*log2(prob[x]) over the l symbols x of
+ * sequence (the sum is not negated, so it is <= 0 for probabilities
+ * in [0,1])
+ *
+ * symbols with probability 0 contribute 0 (the limit of p*log2(p)
+ * for p -> 0) instead of turning the whole sum into NaN
+ *
+ * space is not used; prob is indexed by the symbols of sequence
+ *
+ */
+
+double
+getEntropy(void *space, Uint* sequence, Uint l, double* prob) {
+  Uint i;
+  double sum=0;
+
+  for(i=0; i < l; i++) {
+    const double p = prob[sequence[i]];
+
+    /* 0 * log2(0) would evaluate to 0 * -inf = NaN */
+    if (p == 0) continue;
+
+    sum += p*log2(p);
+  }
+
+  return sum;
+}
+
+
+ /*local alignment*/
+ /*printf("max sw score: %f\n", occ[i-1].swscore);
+
+
+ swres = swmatrix(space, queryseq->sequence, queryseq->length,
+ s[occ[i-1].id]->sequence, s[occ[i-1].id]->length,
+ -5, constscr, swscores);
+
+
+ printf("max sw score: %d\n", swres[arraymax(swres,
+ (queryseq->length+1)*(s[occ[i-1].id]->length+1))]);
+
+ align = swgaplesstraceback(space, swres,
+ queryseq->sequence, queryseq->length,
+ s[occ[k].id]->sequence, s[occ[k].id]->length,
+ //suffixscore,((imbissinfo*)info)->score
+ -5,
+ constscr, swscores,
+ &alignsize);
+
+ if (depictsw) {
+ alignstr = printAlignment(space, align, alignsize,
+ queryseq, s[occ[i-1].id], 80);
+ printf("%s\n", alignstr);
+ FREEMEMORY(space, alignstr);
+ FREEMEMORY(space, align);
+ }*/
+
+
+/*------------------------- selectBlastScoreSWconst --------------------------
+ *
+ * selection callback: sorts the k candidates in m ascending by
+ * cmp_blast, computes a gapless sw score (constscr with
+ * imbiss->swscores) against the query a for at most the 1000 last
+ * (i.e. best) candidates and finally sorts m by cmp_swscore
+ *
+ * candidates among those with fewer than imbiss->minseeds seeds get
+ * an swscore of 0; info has to point to an imbissinfo
+ *
+ */
+
+Matchtype*
+selectBlastScoreSWconst(void *space, Matchtype *m, Uint k,
+    IntSequence *a, IntSequence **s, void *info) {
+
+  imbissinfo *imbiss = (imbissinfo*) info;
+  int *swres;
+  Uint done;
+
+  qsort(m, k, sizeof(Matchtype), cmp_blast);
+
+  /* walk from the end of the sorted array towards its start */
+  for (done = 0; done < k && done < 1000; done++) {
+    Matchtype *cand = &m[k-1-done];
+    IntSequence *subject;
+
+    if (cand->count < imbiss->minseeds) {
+      cand->swscore = 0;
+      continue;
+    }
+
+    subject = s[cand->id];
+    swres = swgapless(space,
+        a->sequence, a->length,
+        subject->sequence, subject->length,
+        constscr, imbiss->swscores
+        /*subscr, info*/
+        );
+
+    /* the best cell of the matrix is the score of the candidate */
+    cand->swscore = swres[arraymax(swres,
+        (a->length+1)*(subject->length+1))];
+
+    FREEMEMORY(space, swres);
+  }
+
+  qsort(m, k, sizeof(Matchtype), cmp_swscore);
+  return m;
+}
+
+
+/*---------------------------- selectScoreSWconst ----------------------------
+ *
+ * selection callback: sorts the k candidates in m ascending by
+ * cmp_score, computes a gapless sw score (constscr with
+ * imbiss->swscores) against the query a for at most the 1000 last
+ * (i.e. best) candidates and finally sorts m by cmp_swscore
+ *
+ * candidates among those with fewer than imbiss->minseeds seeds get
+ * an swscore of 0; info has to point to an imbissinfo
+ *
+ */
+
+Matchtype*
+selectScoreSWconst(void *space, Matchtype *m, Uint k,
+    IntSequence *a, IntSequence **s, void *info) {
+
+  imbissinfo *imbiss = (imbissinfo*) info;
+  int *swres;
+  Uint done;
+
+  qsort(m, k, sizeof(Matchtype), cmp_score);
+
+  /* walk from the end of the sorted array towards its start */
+  for (done = 0; done < k && done < 1000; done++) {
+    Matchtype *cand = &m[k-1-done];
+    IntSequence *subject;
+
+    if (cand->count < imbiss->minseeds) {
+      cand->swscore = 0;
+      continue;
+    }
+
+    subject = s[cand->id];
+    swres = swgapless(space,
+        a->sequence, a->length,
+        subject->sequence, subject->length,
+        constscr, imbiss->swscores
+        /*subscr, info*/
+        );
+
+    /* the best cell of the matrix is the score of the candidate */
+    cand->swscore = swres[arraymax(swres,
+        (a->length+1)*(subject->length+1))];
+
+    FREEMEMORY(space, swres);
+  }
+
+  qsort(m, k, sizeof(Matchtype), cmp_swscore);
+  return m;
+}
+
+
+/*
+ * selection callback: sorts the k candidates in m in place with
+ * cmp_swscore and returns m; space, a, s and info are not used
+ */
+Matchtype*
+selectSW (void *space, Matchtype *m, Uint k,
+    IntSequence *a, IntSequence **s, void *info) {
+
+  const size_t width = sizeof(Matchtype);
+
+  qsort(m, k, width, cmp_swscore);
+
+  return m;
+}
+
+/*
+ * selection callback: sorts the k candidates in m in place with
+ * cmp_blast and returns m; space, a, s and info are not used
+ */
+Matchtype*
+selectBlastScore (void *space, Matchtype *m, Uint k,
+    IntSequence *a, IntSequence **s, void* info) {
+
+  const size_t width = sizeof(Matchtype);
+
+  qsort(m, k, width, cmp_blast);
+
+  return m;
+}
+
+/*
+ * selection callback: sorts the k candidates in m in place with
+ * cmp_score and returns m; space, a, s and info are not used
+ */
+Matchtype*
+selectScore (void *space, Matchtype *m, Uint k,
+    IntSequence *a, IntSequence **s, void* info) {
+
+  const size_t width = sizeof(Matchtype);
+
+  qsort(m, k, width, cmp_score);
+
+  return m;
+}
+
+
+/*------------------------------- scorefilter --------------------------------
+ *
+ * filter callback: registers a seed at query position pos for the
+ * candidate m and scores it
+ *
+ * - appends pos to m->pos and m->org (both grow by one entry)
+ * - sums imbiss->score[] over the len symbols at ptr; the sum is added
+ *   to m->score and m->blast keeps the largest sum seen
+ * - increments imbiss->consensus[pos]
+ *
+ * info has to point to an imbissinfo; a and b are not used
+ * returns the score of this seed, or 0 if it is not positive
+ *
+ */
+
+double
+scorefilter (void *space, Matchtype *m, IntSequence *a, IntSequence *b,
+    Uint *ptr, Uint len, Uint pos, void *info) {
+
+  imbissinfo *imbiss = (imbissinfo*) info;
+  const Uint slot = m->count;
+  double seedscore = 0;
+  double symscore = 0;
+  Uint l;
+
+  /* make room for one more seed position */
+  m->count = slot+1;
+  m->pos = ALLOCMEMORY(space, m->pos, Uint, m->count);
+  m->org = ALLOCMEMORY(space, m->org, Uint, m->count);
+  m->pos[slot] = pos;
+  m->org[slot] = pos;
+
+  for (l=0; l < len; l++) {
+    symscore = imbiss->score[(Uint)ptr[l]];
+    seedscore += symscore;
+    m->score += symscore;
+  }
+
+  if (!(m->blast > seedscore)) {
+    m->blast = seedscore;
+  }
+
+  imbiss->consensus[pos] += (Uint) 1
+    /*((double)imbiss->lambda*sum)*/;
+
+  return seedscore > 0 ? seedscore : 0;
+}
+
+
+
+/*------------------------------ swconstfilter -------------------------------
+ *
+ * filter callback: behaves like scorefilter and, at the moment the
+ * candidate m reaches exactly imbiss->minseeds seeds, additionally
+ * stores the best gapless sw score (constscr with imbiss->swscores)
+ * of a against b in m->swscore
+ *
+ * info has to point to an imbissinfo
+ * returns the value returned by scorefilter
+ *
+ */
+
+double
+swconstfilter(void *space, Matchtype *m, IntSequence *a, IntSequence *b,
+    Uint *ptr, Uint len, Uint pos, void *info) {
+
+  imbissinfo *imbiss = (imbissinfo*) info;
+  int *matrix;
+  double seedscore;
+
+  seedscore = scorefilter(space, m, a, b, ptr, len, pos, info);
+
+  if (m->count != imbiss->minseeds) return seedscore;
+
+  matrix = swgapless(space, a->sequence, a->length,
+      b->sequence, b->length,
+      constscr, imbiss->swscores);
+  m->swscore = matrix[arraymax(matrix, (a->length+1)*(b->length+1))];
+
+  FREEMEMORY(space, matrix);
+
+  return seedscore;
+}
+
+
+
+/*------------------------------ initMatchtype -------------------------------
+ *
+ * init a Matchtype struct: stores id and resets the counters, the
+ * scores and the position arrays (pos and org become NULL)
+ *
+ */
+
+void
+initMatchtype (Matchtype *m, Uint id)
+{
+  /* identity */
+  m->id = id;
+
+  /* no seeds recorded yet */
+  m->count = 0;
+  m->m = 0;
+  m->pos = NULL;
+  m->org = NULL;
+
+  /* all scores start at zero */
+  m->score = 0;
+  m->swscore = 0;
+  m->blast = 0;
+
+  return ;
+}
+
+/*------------------------------- rankSufmatch ------------------------------
+ *
+ * ranks matches
+ * given in an array of PairSint of length len. Sorting is done
+ * by several calls to clib's qsort. For each item of the sorted
+ * array a handler-function is invoked.
+ *
+ * steps:
+ * 1. every suffix of every non-empty interval in matches is mapped to
+ *    its sequence; one Matchtype per hit sequence is collected in occ
+ * 2. for intervals narrower than maxmatches, fltr scores the seed
+ *    (S symbols starting at the suffix, query position i); a return
+ *    value of -1 ends the scan of the current interval
+ * 3. sel reorders the collected candidates
+ * 4. handler is called from the last candidate to the first; a return
+ *    value of -1 stops the reporting
+ *
+ * scores and depictsw are not evaluated by this function
+ *
+ */
+
+void
+rankSufmatch ( void *space,
+    Suffixarray *a,
+    PairSint *matches,
+    Uint len,
+    Uint maxmatches,
+    Uint S,
+    IntSequence **s,
+    Uint noofseqs,
+    double (*fltr)( void *,
+      Matchtype *,
+      IntSequence *,
+      IntSequence *,
+      Uint *,
+      Uint,
+      Uint,
+      void *),
+    Matchtype* (*sel)( void *,
+      Matchtype *,
+      Uint,
+      IntSequence *,
+      IntSequence **,
+      void *),
+    int (*handler)( void *,
+      Matchtype *,
+      IntSequence **,
+      Uint,
+      Uint,
+      void *),
+    IntSequence *queryseq,
+    void *info,
+    double *scores,
+    unsigned char depictsw
+    )
+
+{
+
+  Matchtype key, *cur, *occ=NULL;
+  int i=0, j=0, k=0, l=0, r=0, retval=0;
+  int *hashTable;
+  double t;
+  Uint *ptr;
+
+  /* hashTable[id] is the index of sequence id in occ, -1 while unseen */
+  hashTable = ALLOCMEMORY(space, NULL, int, (noofseqs+1));
+  memset(hashTable, -1, sizeof(int)*(noofseqs+1));
+
+  for(i=0; i < len; i++) {
+    if(matches[i].b >= ((matches[i].a))) {
+
+      for(j=matches[i].a; j <= matches[i].b; j++) {
+        key.id = getMultiSeqIndex(a->seq, &a->seq->sequences[a->suftab[j]]);
+        r = hashTable[key.id];
+
+        if (r == -1) {
+          /* first hit in this sequence: append a fresh candidate */
+          occ = ALLOCMEMORY(space, occ, Matchtype, k+1);
+          hashTable[key.id] = k;
+          initMatchtype(&occ[k], key.id);
+          cur = &occ[k];
+          k++;
+        } else {
+          /* occ may have moved since r was stored, so index it anew */
+          cur = ((Matchtype*)(occ+r));
+        }
+
+        /*score the matches if no < maxmatches*/
+        if ((matches[i].b-matches[i].a) < maxmatches) {
+          /* the stray closing parenthesis on this line is gone */
+          ptr = &a->seq->sequences[a->suftab[j]];
+          t=fltr(space, cur, queryseq, s[cur->id], ptr, S, i, info);
+
+          if (t == -1) break;
+        }
+      }
+    }
+  }
+
+  occ = sel(space, occ, k, queryseq, s, info);
+
+  /* report from the end of the selected order; l counts accepted items */
+  l=0;
+  for (i=k; i > 0; i--) {
+    retval = handler(space, &occ[i-1], s, len, l, info);
+    if (retval) l++;
+    if (retval == -1) break;
+  }
+
+  FREEMEMORY(space, hashTable);
+
+  for(i=0; i < k; i++) {
+    FREEMEMORY(space, occ[i].pos);
+    FREEMEMORY(space, occ[i].org);
+  }
+
+  FREEMEMORY(space, occ);
+}
+
diff --git a/segemehl/libs/sufarray/sufmatch.h b/segemehl/libs/sufarray/sufmatch.h
new file mode 100644
index 0000000..76ea559
--- /dev/null
+++ b/segemehl/libs/sufarray/sufmatch.h
@@ -0,0 +1,110 @@
+
+#ifndef SUF_MATCH
+#define SUF_MATCH
+
+/*
+ *
+ * sufmatch.h
+ * declarations for matching functions in suffixarrays
+ *
+ * @author Steve Hoffmann, shoffmann at zbh.uni-hamburg.de
+ * @company Center for Bioinformatics, Hamburg
+ * @date 12/19/06 15:16:31 CET
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: sufmatch.h 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/sufarray/sufmatch.h $
+ */
+
+ #include "basic-types.h"
+ #include "intsequence.h"
+ #include "sufmatch.h"
+ #include "sufarray.h"
+
+/*
+ * per-sequence match record collected by rankSufmatch
+ * (all members are reset by initMatchtype)
+ */
+typedef struct {
+ Uint id; /* index of the sequence the matches belong to*/
+ Uint count; /* number of seeds, incremented by scorefilter*/
+ Uint *pos; /* count seed positions, appended by scorefilter*/
+ Uint *org; /* filled by scorefilter with the same values as pos*/
+ Uint m; /* set to 0 by initMatchtype*/
+ float score; /* sum of all seed scores*/
+ float swscore; /* best cell of a gapless sw matrix*/
+ double blast; /* largest score of a single seed*/
+} Matchtype;
+
+/* seed lookup and plain reporting */
+PairSint* sufSubstring (void *, Suffixarray *, Uint *, Uint, Uint);
+void reportSufmatch (Suffixarray *, PairSint *, Uint, Uint,
+ IntSequence **);
+
+
+/*
+ * ranking driver: collects one Matchtype per hit sequence, scores
+ * seeds with fltr, orders the candidates with sel and reports them
+ * with handler
+ */
+void
+rankSufmatch ( void *space,
+ Suffixarray *a,
+ PairSint *matches,
+ Uint len,
+ Uint maxmatches,
+ Uint S,
+ IntSequence **s,
+ Uint noofseqs,
+ double (*fltr)( void *,
+ Matchtype *,
+ IntSequence *,
+ IntSequence *,
+ Uint *,
+ Uint,
+ Uint,
+ void *),
+ Matchtype* (*sel)( void *,
+ Matchtype *,
+ Uint,
+ IntSequence *,
+ IntSequence **,
+ void *),
+ int (*handler)( void *,
+ Matchtype *,
+ IntSequence **,
+ Uint,
+ Uint,
+ void *),
+ IntSequence *queryseq,
+ void *info,
+ double *scores,
+ unsigned char depictsw
+);
+
+/* sel callbacks for rankSufmatch: all sort m in place and return it */
+Matchtype*
+selectScoreSWconst(void *space, Matchtype *m, Uint k,
+ IntSequence *a, IntSequence **s, void *info);
+
+Matchtype*
+selectSW (void *space, Matchtype *m, Uint k, IntSequence *a,
+ IntSequence **s, void* info);
+
+Matchtype*
+selectScore (void *space, Matchtype *m, Uint k, IntSequence *a,
+ IntSequence **s, void* info);
+
+/* fltr callbacks for rankSufmatch: both return the score of the seed */
+double
+swconstfilter(void *space, Matchtype *m, IntSequence *a, IntSequence *b,
+ Uint *ptr, Uint len, Uint pos, void *info);
+
+double
+scorefilter(void *space, Matchtype *m, IntSequence *a, IntSequence *b,
+ Uint *ptr, Uint len, Uint pos, void *info);
+
+
+/* further sel callbacks */
+Matchtype*
+selectBlastScore (void *space, Matchtype *m, Uint k,
+ IntSequence *a, IntSequence **s, void* info);
+
+
+Matchtype*
+selectBlastScoreSWconst(void *space, Matchtype *m, Uint k,
+ IntSequence *a, IntSequence **s, void *info);
+
+#endif
+
diff --git a/segemehl/libs/sw.c b/segemehl/libs/sw.c
new file mode 100644
index 0000000..797897a
--- /dev/null
+++ b/segemehl/libs/sw.c
@@ -0,0 +1,1562 @@
+
+/*
+ * sw.c
+ *
+ * local alignments
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 06.02.2010 13:47:30 CET
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "basic-types.h"
+#include "memory.h"
+#include "mathematics.h"
+#include "alignment.h"
+#include "sw.h"
+#include "kdchain.h"
+
+/*---------------------------------- edist -----------------------------------
+ *
+ * evaluation of the edit distance in O(n) space
+ * the function accepts two sequences of symtype and
+ * an m-dimensional substitution matrix
+ *
+ * only one column of MIN(lena,lenb)+1 cells is kept and updated in
+ * place while iterating over the longer sequence
+ *
+ * indel is the cost of an insertion or deletion; a mismatch costs 1
+ * if sub is NULL and MATRIX2D(sub, m, x, y) otherwise; equal symbols
+ * always cost 0
+ *
+ * returns the value of the last cell, i.e. the distance of sa and sb
+ *
+ */
+
+Uint edist(void *space, symtype *sa, Uint lena, symtype *sb, Uint lenb,
+ Uint indel, Uint *sub, Uint m) {
+
+ Uint minlen, maxlen;
+ Uint i, j, r1=0, r2=0;
+ Uint pen;
+ Uint* col;
+ symtype *min;
+ symtype *max;
+
+ /* the column runs along the shorter sequence (+1 for the empty prefix) */
+ minlen = (MIN(lena, lenb)) + 1;
+ maxlen = (MAX(lena, lenb)) + 1;
+ col = ALLOCMEMORY(space, NULL, Uint, minlen);
+
+ /* min is the shorter, max the longer of the two sequences */
+ if(minlen-1 == lena) {
+ min = sa;
+ max = sb;
+ } else {
+ min = sb;
+ max = sa;
+ }
+
+ for(i=0; i < maxlen; i++) {
+ for(j=0; j < minlen; j++) {
+ if (i==0) {
+ /* first column: distance to the empty prefix */
+ /* NOTE(review): initialised with j, i.e. unit cost, not j*indel - confirm */
+ col[j]=j;
+ } else {
+ if (j==0) {
+ /* r1 keeps the old top cell as diagonal for j==1; the top cell grows by 1 */
+ r1 = col[j]++;
+ } else {
+ /* r2 saves the cell of the previous column before it is overwritten */
+ r2 = col[j];
+ if (sub == NULL) {
+ pen = (min[j-1]==max[i-1]) ? 0 : 1;
+ } else {
+ pen = (min[j-1]==max[i-1]) ? 0 :
+ MATRIX2D(sub, m, min[j-1], max[i-1]);
+ }
+ /* minimum of: cell above + indel, diagonal + pen, cell to the left + indel */
+ col[j] = MIN ((col[j-1]+indel),
+ (MIN ((r1 + pen), (col[j]+indel))));
+ /* the saved cell is the diagonal of the next row */
+ r1 = r2;
+ }
+ }
+ }
+ }
+
+ r1 = col[minlen-1];
+ FREEMEMORY(space, col);
+ return r1;
+}
+
+/*------------------------------- constscr_Nmatch ------------------------------
+ *
+ * a function that assigns constant scores for matches and mismatches
+ * given in info[0] and info[1], respectively.
+ * the symbol 'N' on either side counts as a match
+ *
+ */
+
+int
+constscr_Nmatch (symtype a, symtype b, void *info)
+{
+  const int *table = (const int*) info;
+  const int hit = (a == b || a == 'N' || b == 'N');
+
+  return hit ? table[0] : table[1];
+}
+
+
+/*--------------------------------- constscr ----------------------------------
+ *
+ * a function that assigns constant scores for matches and mismatches
+ * given in info[0] and info[1], respectively.
+ * info has to point to (at least) two ints
+ *
+ */
+
+int
+constscr (symtype a, symtype b, void *info)
+{
+  const int *table = (const int*) info;
+
+  return (a == b) ? table[0] : table[1];
+}
+
+
+/*--------------------------------- swgapless ----------------------------------
+ *
+ * smith-waterman local similarity alignment w/o gaps
+ * returns a matrix of size (m+1)*(n+1) where m is length of given sequence a
+ * and n the length of sequence b, respectively. Function expects
+ * a function to calculate a substitution score
+ *
+ * every cell only depends on its diagonal predecessor; row 0 and
+ * column 0 stay zero. The matrix has to be freed by the caller.
+ *
+ */
+
+int*
+swgapless (void *space, symtype *a, Uint m, symtype *b, Uint n,
+    Sint (*sub)(symtype, symtype, void *), void *nfo)
+{
+  int row, col, cols, rows, size;
+  int *matrix;
+
+  rows = m+1;
+  cols = n+1;
+  size = rows*cols;
+
+  matrix = ALLOCMEMORY(space, NULL, int, size);
+  matrix = memset(matrix, 0, sizeof(int)*size);
+
+  for (row=1; row < m+1; row++) {
+    for (col=1; col < n+1; col++) {
+      /* extend the diagonal, but never drop below zero */
+      const int diag = MATRIX2D(matrix, cols, (row-1), (col-1))
+        + sub(a[row-1], b[col-1], nfo);
+
+      MATRIX2D(matrix, cols, row, col) = MAX(0, diag);
+    }
+  }
+
+  return matrix;
+}
+
+
+
+/*--------------------------------- swmatrix ---------------------------------
+ *
+ * smith-waterman local similarity alignment
+ * returns a matrix of size (m+1)*(n+1) where m is length of given sequence a
+ * and n the length of sequence b, respectively. Function expects
+ * a function to calculate a substitution score
+ *
+ * indel is added for every gap step; row 0 and column 0 stay zero.
+ * The matrix has to be freed by the caller.
+ *
+ */
+
+int*
+swmatrix (void *space, symtype *a, Uint m, symtype *b, Uint n, int indel,
+    Sint (*sub)(symtype, symtype, void *), void *nfo)
+{
+  int row, col, cols, rows, size;
+  int *matrix;
+
+  rows = m+1;
+  cols = n+1;
+  size = rows*cols;
+
+  matrix = ALLOCMEMORY(space, NULL, int, size);
+  matrix = memset(matrix, 0, sizeof(int)*size);
+
+  for (row=1; row < m+1; row++) {
+    for (col=1; col < n+1; col++) {
+      /* the three predecessors of the cell */
+      const int up = MATRIX2D(matrix, cols, (row-1), col) + indel;
+      const int left = MATRIX2D(matrix, cols, row, (col-1)) + indel;
+      const int diag = MATRIX2D(matrix, cols, (row-1), (col-1))
+        + sub(a[row-1], b[col-1], nfo);
+
+      MATRIX2D(matrix, cols, row, col) = MAX4(0, up, left, diag);
+    }
+  }
+
+  return matrix;
+}
+
+
+/*------------------------------- swtraceback --------------------------------
+ *
+ * traceback to find optimal local alignment path
+ *
+ * starts at the cell of M selected by arraymax and walks back towards
+ * the origin; M is expected to be the (m+1)*(n+1) matrix of a and b
+ * computed with the same indel, sub and nfo (see swmatrix)
+ *
+ * the edit operations are inserted into al back to front and reversed
+ * at the end (revMeops); al->uoff and al->voff receive the row and
+ * column at which the traceback stopped
+ *
+ */
+
+ void
+swtraceback (void *space, int *M,
+ symtype *a, Uint m, symtype *b, Uint n, int indel,
+ Sint (*sub)(symtype, symtype, void *), void *nfo, Alignment *al)
+{
+ Uint i, j, ncol, cur, start;
+
+ /* split the flat index of the start cell into row i and column j */
+ ncol = (n+1);
+ start = arraymax(M, (m+1)*ncol);
+ i = start / ncol;
+ j = start % ncol;
+
+ al->uoff = 0;
+ al->voff = 0;
+
+ while(i > 0 && j > 0) {
+
+ /* find the predecessor that explains the current cell; */
+ /* order of preference: cell above, cell to the left, diagonal */
+ cur = MATRIX2D(M, ncol, i, j);
+ if (MATRIX2D(M, ncol, i-1, j) + indel == cur){
+ insertEop(al, Insertion);
+ i--;
+ } else {
+ if (MATRIX2D(M, ncol, i, j-1) + indel == cur) {
+ insertEop(al, Deletion);
+ j--;
+ } else {
+ if (MATRIX2D(M, ncol, i-1, j-1)+sub(a[i-1], b[j-1], nfo)
+ == cur){
+ insertEop(al, Replacement);
+ i--; j--;
+ }
+ else {
+ /* no predecessor fits: the local alignment starts here */
+ assert(cur == 0);
+
+ al->uoff = i;
+ al->voff = j;
+
+ revMeops(al);
+ return;
+ }
+ }
+ }
+ }
+
+ /* the border of the matrix was reached */
+ al->uoff = i;
+ al->voff = j;
+ revMeops(al);
+
+ return;
+}
+
+/*------------------------------- splicescore --------------------------------
+ *
+ * @brief consensus splice site score for the dinucleotide (a,b):
+ *        if type and strand are both 0 or both 1, the dinucleotides
+ *        GT and CT score 1; in all other cases AG and AC score 1
+ * @return 1 for a consensus dinucleotide, 0 otherwise
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+splicescore (char a, char b, Uint strand, Uint type)
+{
+  const int first_set = (type == 0 && strand == 0) ||
+    (type == 1 && strand == 1);
+
+  if (first_set) {
+    /* GT or CT */
+    return ((a == 'G' || a == 'C') && b == 'T') ? 1 : 0;
+  }
+
+  /* AG or AC */
+  return (a == 'A' && (b == 'G' || b == 'C')) ? 1 : 0;
+}
+
+
+#define LOCALMULTISPLICECALCOFF
+
+/*------------------------- localmultisplicedmatrixopt --------------------------
+ *
+ * @brief aligning a read a1 and its reverse complement a2 of length m to
+ * noofseqs loci
+ * @author Steve Hoffmann
+ *
+ */
+
+int***
+localmultisplicedmatrixopt (void *space, symtype *a1, symtype *a2, Uint qrylen, Uint *m,
+ symtype **b, Uint *n, Uint *strand, Uint *qstart, Uint *qend, Uint *tstart, Uint *tend, Uint noofseqs, int indel, int trans,
+ Sint (*sub)(symtype, symtype, void *), void *nfo, int ***lv, int ***lr, int ***lc, PairUint **bestscr, char ****KBAND, PairUint *diag){
+
+ int i, j, k, q, cols=0, rows=0, abs, relq, tstartq, tendq, lk, rk;
+ unsigned int ovlrange;
+ PairUint *scr;
+ Uint off,start,margin=50;
+ int ***L, **lmr=NULL, **lmv=NULL, **lmc=NULL, tmp;
+ symtype cura, curb, r1, r2, l1, l2;
+
+#ifdef DEBUGMULTISPLICEOPT
+ int maxk, maxi, maxj, lastpick, maxpick;
+#endif
+
+#ifdef DEBUGKBAND
+ char ***K;
+ K = ALLOCMEMORY(space, NULL, char**, noofseqs);
+#endif
+
+ lmr = ALLOCMEMORY(space, NULL, int*, noofseqs);
+ lmv = ALLOCMEMORY(space, NULL, int*, noofseqs);
+ lmc = ALLOCMEMORY(space, NULL, int*, noofseqs);
+ L = ALLOCMEMORY(space, NULL, int**, noofseqs);
+ scr = ALLOCMEMORY(space, NULL, PairUint, noofseqs);
+ memset(scr, 0, sizeof(PairUint)*noofseqs);
+
+
+ for(k=0; k < noofseqs; k++) {
+#ifdef DEBUGKBAND
+ K[k] = ALLOCMEMORY(space, NULL, char*, m[k]+1);
+#endif
+ cols += n[k] + 1;
+ rows += m[k] + 1;
+ lmr[k] = ALLOCMEMORY(space, NULL, int, m[k]+1);
+ lmv[k] = ALLOCMEMORY(space, NULL, int, m[k]+1);
+ lmc[k] = ALLOCMEMORY(space, NULL, int, m[k]+1);
+ L[k] = ALLOCMEMORY(space, NULL, int*, m[k]+1);
+ for(i=0; i < m[k]+1; i++) {
+#ifdef DEBUGKBAND
+ K[k][i] = ALLOCMEMORY(space, NULL, char, n[k]+1);
+ memset(K[k][i], 0, sizeof(n[k]+1));
+#endif
+ L[k][i]=ALLOCMEMORY(space, NULL, int, n[k]+1);
+ memset(L[k][i], 0, sizeof(int)*(n[k]+1));
+ }
+ memset(lmv[k], 0, sizeof(int)*m[k]+1);
+ memset(lmr[k], 0, sizeof(int)*m[k]+1);
+ memset(lmc[k], 0, sizeof(int)*m[k]+1);
+ }
+
+ for(k=0; k < noofseqs; k++) {
+#ifdef DEBUGMULTISPLICEOPT
+ maxk = 0;
+ maxi = 0;
+ maxj = 0;
+ lastpick = 0;
+ maxpick = 0;
+#endif
+
+ ovlrange =0;
+
+ for(q=0; q < k; q++) {
+#ifdef LOCALMULTISPLICECALCOFF
+ if(strand[q] == 0) {
+ tstartq = qstart[q];
+ tendq = qend[q];
+ } else {
+ tstartq = qrylen - (qstart[q] + m[q]);
+ tendq = tstartq + m[q] - 1;
+ }
+ assert(tstartq == tstart[q] && tendq == tend[q]);
+#else
+ tstartq = tstart[q];
+ tendq = tend[q];
+#endif
+
+ if(tstartq <= tstart[k] && tstart[k] <= tendq && ovlrange < tendq) {
+ ovlrange = tendq;
+ }
+ }
+
+ for (i=1; i < m[k]+1; i++) {
+
+ lmv[k][i] = lmv[k][i-1];
+ lmr[k][i] = lmr[k][i-1];
+ lmc[k][i] = lmc[k][i-1];
+
+#ifdef LOCALMULTISPLICECALCOFF
+ if(strand[k] == 0)
+ abs = qstart[k] + i;
+ else
+ abs = qrylen - qstart[k] - m[k] + i;
+
+ assert(tstart[k] +i == abs);
+#else
+ abs = tstart[k] + i;
+#endif
+
+ start = (diag[k].b > diag[k].a) ? diag[k].b - diag[k].a : 0;
+ lk = (start + i > margin) ? start + i - margin : 1;
+ rk = (start + i + margin < n[k]+1) ? start + i + margin : n[k]+1;
+
+#ifdef NOKBAND
+ for (j=1; j < n[k]+1; j++){
+#else
+ for (j=lk; j < rk; j++){
+#endif
+
+#ifdef DEBUGKBAND
+ if(j >= lk && j <= rk) K[k][i][j] = 1;
+#endif
+ l1 = 0;
+ l2 = 0;
+ r1 = 0;
+ r2 = 0;
+
+ if (strand[k] == 0){
+ off = qstart[k]; //off supports 0!
+ cura = a1[off+i-1];
+ curb = b[k][j-1];
+ if(j+1 < n[k]) {
+ r1 = b[k][j];
+ r2 = b[k][j+1];
+ }
+ if(j > 2) {
+ l1 = b[k][j-3];
+ l2 = b[k][j-2];
+ }
+ } else {
+ off = qend[k]+1; //off should b m -> qend + 1
+ cura = a2[off-i];
+ curb = b[k][n[k]-j];
+ if(j > 2) {
+ l1 = b[k][n[k]-j+1];
+ l2 = b[k][n[k]-j+2];
+ }
+ if(n[k]-j > 1) {
+ r1 = b[k][n[k]-j-2];
+ r2 = b[k][n[k]-j-1];
+ }
+ }
+
+ L[k][i][j] = MAX4(0, L[k][i-1][j]+indel,
+ L[k][i][j-1]+indel,
+ L[k][i-1][j-1]+sub(cura, curb, nfo));
+
+ if(abs <= ovlrange) {
+ for(q=0; q < k; q++) {
+#ifdef LOCALMULTISPLICECALCOFF
+ if(strand[q] == 0) {
+ tstartq = qstart[q];
+ tendq = qend[q];
+ } else {
+ tstartq = qrylen - (qstart[q] + m[q]);
+ tendq = tstartq + m[q] - 1;
+ }
+ assert(tstartq == tstart[q] && tendq == tend[q]);
+#else
+ tstartq = tstart[q];
+ tendq = tend[q];
+#endif
+
+
+ if(tstartq < abs && abs < tendq) {
+
+ relq = abs - tstartq - 1;
+ assert(relq < m[q]);
+
+ tmp = MAX(L[k][i][j],
+ lmv[q][relq] + sub(cura, curb, nfo) + trans + splicescore(l1, l2, strand[k], 1)) ;
+
+#ifdef DEBUGMULTISPLICEOPT
+ if(tmp > L[k][i][j] && tmp >= maxpick) {
+ maxpick = tmp;
+ lastpick = q;
+ }
+#endif
+ L[k][i][j] = tmp;
+ }
+ }
+ }
+
+ if(L[k][i][j] > L[k][scr[k].a][scr[k].b]) {
+ scr[k].a = i;
+ scr[k].b = j;
+ }
+
+ //if strand - acc1, acc2
+ //if strand + don1, don2
+ if (L[k][i][j]
+ + splicescore(r1, r2, strand[k], 0)
+ > lmv[k][i-1]) {
+ lmv[k][i] = L[k][i][j] + splicescore(r1, r2, strand[k], 0);
+ lmr[k][i] = i;
+ lmc[k][i] = j;
+ }
+#ifdef DEBUGMULTISPLICEOPT
+ if(maxk < L[k][i][j] ) {
+ maxk = L[k][i][j];
+ maxi = abs;
+ maxj = j;
+ }
+#endif
+
+
+ }
+ }
+#ifdef DEBUGMULTISPLICEOPT
+ fprintf(stdout, "maximum calculated score on matrix L[%d][%d][%d]=%d, maxtransistion from %d:%d\n", k, maxi, maxj, maxk, lastpick, maxpick);
+#endif
+ }
+
+ *lv = lmv;
+ *lr = lmr;
+ *lc = lmc;
+ *bestscr = scr;
+
+#ifdef DEBUGKBAND
+ *KBAND=K;
+#endif
+
+ return L;
+}
+
+
+/*----------------------- finalizemultisplicedalign ---------------------------
+ *
+ * @brief store the read/reference offsets of a finished alignment fragment.
+ *        On strand 0 the fragment starts at (qstart+u, v) and its edit
+ *        operations, collected back to front, are reversed. Otherwise the
+ *        offsets are mirrored: counted from qend+1 on the read and from n on
+ *        the reference, minus the aligned lengths; the operations are left
+ *        in the order they were collected.
+ *
+ */
+
+static void
+finalizemultisplicedalign (Alignment *al, Uint strand, Uint qstart, Uint qend,
+    Uint m, Uint n, Uint u, Uint v)
+{
+  Uint off;
+
+  if (strand == 0){
+    off = qstart;
+    al->uoff = off+u;
+    al->voff = v;
+    revMeops(al);
+  } else {
+    off = qend+1;
+    assert(off >= u+getUalignlen(al));
+    assert(n >= v+getValignlen(al));
+    al->uoff = off-u-getUalignlen(al);
+    assert(al->uoff == qstart+(m-u-getUalignlen(al)));
+    al->voff = n-v-getValignlen(al);
+  }
+}
+
+
+/*----------------------- localmultisplicedtracebackopt -------------------------
+ *
+ * @brief trace the per-locus score matrices M[k] back into noofseqs split
+ *        alignments al[k]
+ * @author Steve Hoffmann
+ *
+ * The walk starts at the best of the cells scr[k] (row scr[k].a, column
+ * scr[k].b of M[k]). In each step the current score is explained by an
+ * insertion (row-1), a deletion (column-1) or a replacement (diagonal). If
+ * none of them matches, the score must stem from a splice transition: the
+ * loci q < k whose [tstart,tend] interval strictly contains the absolute read
+ * position are searched for an entry lmv[q][.] that reproduces the score, the
+ * fragment on locus k is finalised and the walk continues on that locus at
+ * (lmr, lmc). The walk stops at a zero cell, at the matrix border, or on
+ * locus 0 when no predecessor exists.
+ *
+ * @return NULL unless DEBUGKBAND is defined; with DEBUGKBAND a table of the
+ *         same shape as M in which every visited cell is set to 1
+ *
+ */
+
+char***
+localmultisplicedtracebackopt (void *space, int ***M, symtype *a1, symtype *a2, Uint qrylen, Uint *m,
+    symtype **b, Uint* n, Uint *strand, Uint *qstart, Uint *qend, Uint *tstart, Uint *tend, Uint noofseqs, int indel, int trans,
+    Sint (*sub)(symtype, symtype, void *), void *nfo,
+    Alignment **al, int **lmv, int **lmr, int **lmc, PairUint *scr){
+
+  Uint u=1, v=1, cur, off=0, q, k, xmax=0, maxk, maxu, maxv, abs, relq, tstartq, tendq;
+  int x, y, maxval;
+  char breakiter=0;
+  symtype cura, curb, l1, l2;
+  char ***backtracetable = NULL;
+  /* loop counters are only needed by the debug code paths */
+#if defined(DEBUGKBAND) || defined(DEBUGMULTISPLICEOPT)
+  Uint i;
+#endif
+#ifdef DEBUGMULTISPLICEOPT
+  Uint j;
+#endif
+
+#ifdef DEBUGKBAND
+  backtracetable = ALLOCMEMORY(space, NULL, char**, noofseqs);
+#endif
+
+  maxval = 0;
+  maxu = 0;
+  maxv = 0;
+  maxk = 0;
+
+  for(k=0; k < noofseqs; k++) {
+
+#ifdef DEBUGKBAND
+    backtracetable[k] = ALLOCMEMORY(space, NULL, char*, m[k]+1);
+    for(i=0; i < m[k]+1; i++) {
+      backtracetable[k][i] = ALLOCMEMORY(space, NULL, char, n[k]+1);
+      /* clear the whole row, not just sizeof(Uint) bytes */
+      memset(backtracetable[k][i], 0, sizeof(char)*(n[k]+1));
+    }
+#endif
+    al[k]->uoff = 0;
+    al[k]->voff = 0;
+
+    /* pick the locus whose recorded best cell has the highest score */
+    if(maxval < M[k][scr[k].a][scr[k].b]) {
+      maxk = k;
+      maxu = scr[k].a;
+      maxv = scr[k].b;
+      maxval = M[k][scr[k].a][scr[k].b];
+    }
+
+
+#ifdef DEBUGMULTISPLICEOPT
+    /* debug builds re-scan the full matrix for the maximum */
+    for(i=0; i < m[k]+1; i++) {
+      for(j=0; j < n[k]+1; j++) {
+        if(maxval < M[k][i][j]) {
+          maxk = k;
+          maxu = i;
+          maxv = j;
+          maxval = M[k][i][j];
+        }
+      }
+    }
+#endif
+  }
+
+
+  k = maxk;
+  u = maxu;
+  v = maxv;
+
+  //fprintf(stdout, "\nstarting on seq: %d,%d,%d -> %d \n", k, u, v, M[k][u][v]);
+
+  while(u > 0 && v > 0){
+    cur = M[k][u][v];
+
+#ifdef DEBUGKBAND
+    backtracetable[k][u][v] = 1;
+    fprintf(stdout, "continue with matrix[%d,%d,%d]:%d strand:%d [u'=%d]\n", k, u, v, M[k][u][v], strand[k], qstart[k]+u);
+#endif
+
+
+    if (M[k][u-1][v] + indel == cur){
+      insertEop(al[k], Insertion);
+      assert(u > 0);
+      u--;
+    } else {
+      if (M[k][u][v-1] + indel == cur){
+        insertEop(al[k], Deletion);
+        assert(v > 0);
+        v--;
+      } else {
+        if (M[k][u][v]) {
+          l1 = 0;
+          l2 = 0;
+
+          /* current read/reference symbols and the two reference symbols
+           * preceding the cell (used for the splice site score) */
+          if (strand[k] == 0){
+            off = qstart[k];
+            assert(off+u-1 <= qend[k]);
+            cura = a1[off+u-1];
+            curb = b[k][v-1];
+
+            if(v > 2) {
+              l1 = b[k][v-3];
+              l2 = b[k][v-2];
+            }
+          } else {
+            off = qend[k]+1;
+            assert(off >= u);
+            cura = a2[off-u];
+            curb = b[k][n[k]-v];
+            if(v > 2) {
+              l1 = b[k][n[k]-v+1];
+              l2 = b[k][n[k]-v+2];
+            }
+          }
+
+//        fprintf(stdout, "sub[%c,%c]=%d\n", cura, curb, sub(cura,curb,nfo));
+          if (M[k][u-1][v-1] + sub(cura, curb, nfo) == cur){
+            insertEop(al[k], Replacement);
+            assert(u > 0 && v > 0);
+            u--; v--;
+          } else {
+
+            /* locus 0 has no predecessor to jump to: stop here */
+            if(k==0) {
+              //TODO: do we need to reset offsets?
+              finalizemultisplicedalign(al[k], strand[k], qstart[k], qend[k],
+                  m[k], n[k], u, v);
+
+#ifdef DEBUGMULTISPLICEOPT
+              fprintf(stdout, "exit 1 at k=%d\n", k);
+#endif
+              breakiter=1;
+              break;
+            }
+
+            x = -1;
+            y = -1;
+#ifdef LOCALMULTISPLICECALCOFF
+            if(strand[k] == 0)
+              abs = qstart[k] + u;
+            else
+              abs = qrylen - qstart[k] - m[k] + u;
+#else
+            abs = tstart[k] + u;
+#endif
+            /* scan the predecessors from k-1 down to 0; the last match,
+             * i.e. the one with the smallest index, is kept */
+            for(q=k; q > 0; q--) {
+
+#ifdef LOCALMULTISPLICECALCOFF
+              if(strand[q-1] == 0) {
+                tstartq = qstart[q-1];
+                tendq = qend[q-1];
+              } else {
+                tstartq = qrylen - (qstart[q-1] + m[q-1]);
+                tendq = tstartq + m[q-1]-1;
+              }
+
+              assert(tstartq == tstart[q-1] && tend[q-1] == tendq);
+#else
+              tstartq = tstart[q-1];
+              tendq = tend[q-1];
+#endif
+
+
+              if(tstartq < abs && abs < tendq) {
+
+                relq = abs - tstartq -1;
+                assert(relq < m[q-1]);
+
+                if (lmv[q-1][relq] + splicescore(l1, l2, strand[k], 1) +
+                    sub(cura, curb, nfo) + trans == cur) {
+                  x=q-1;
+                  y=relq;
+                  xmax = lmc[q-1][relq];
+                }
+              }
+            }
+//          fprintf(stderr, "matrix:U[%d,%d+%d=%d]:%d, k:%d, q:%d, x:%d -> [%d,%d]\n", u, cum[k], v, cum[k]+v, cur, k, q, x, lmr[x][u-1], xmax);
+            assert(x > -1);
+            insertEop(al[k], Replacement);
+            assert(u > 0 && v > 0);
+            u--; v--;
+
+            finalizemultisplicedalign(al[k], strand[k], qstart[k], qend[k],
+                m[k], n[k], u, v);
+#ifdef DEBUGMULTISPLICEOPT
+            fprintf(stdout, "transition from k:%d - %d:x : m:%d, u:%d, v:%d, ulen:%d, vlen:%d -> u': %d v':%d \n",
+                k, x, m[k], u, v, getUalignlen(al[k]), getValignlen(al[k]), lmr[x][y], xmax);
+
+            showAlign(al[k], stdout);
+            fprintf(stdout, "\n");
+#endif
+            /* continue on the predecessor locus at its recorded best cell */
+            k = x;
+            u = lmr[x][y];
+            v = xmax;
+
+            assert(u >= 0 && v >= 0);
+          }
+        } else {
+          /* zero cell: the local alignment starts here */
+          //fprintf(stderr, "reversing end\n"); was k>0
+          finalizemultisplicedalign(al[k], strand[k], qstart[k], qend[k],
+              m[k], n[k], u, v);
+#ifdef DEBUGMULTISPLICEOPT
+          fprintf(stdout, "exit 2 at k=%d\n", k);
+#endif
+          breakiter=1;
+          break;
+        }
+      }
+    }
+  }
+
+
+  /* the loop ran into the matrix border: finalise the current fragment */
+  if (!breakiter){
+    finalizemultisplicedalign(al[k], strand[k], qstart[k], qend[k],
+        m[k], n[k], u, v);
+  }
+
+#ifdef DEBUGMULTISPLICEOPT
+  fprintf(stdout, "starting on seq: %d (%d), u:%d, v:%d, uoff:%d, voff:%d, ulen:%d, vlen:%d\nq:%s\n", k, strand[k], u, v, al[k]->uoff, al[k]->voff, getUalignlen(al[k]), getValignlen(al[k]), al[k]->u);
+  showAlign(al[k], stdout);
+#endif
+
+
+  return backtracetable;
+}
+
+
+
+/*------------------------- localmultisplicedmatrix --------------------------
+ *
+ * @brief aligning a read a1 and its reverse complement a2 of length m to
+ * noofseqs loci
+ * @author Steve Hoffmann
+ *
+ * All loci share one flat matrix L with m+1 rows; locus k occupies the
+ * n[k]+1 columns starting at column cum[k]. For every locus k and row i the
+ * arrays lmv[k][i], lmr[k][i] and lmc[k][i] hold a score (including the
+ * splice score of the two reference symbols following the cell) together
+ * with the row and column where it was found; cells of later loci may start
+ * from lmv[q][i-1] of any earlier locus q at the cost of trans plus a
+ * splice score.
+ * L is returned, lmv/lmr/lmc are handed out via lv/lr/lc; none of them is
+ * freed here.
+ *
+ */
+
+int*
+localmultisplicedmatrix (void *space, symtype *a1, symtype *a2, Uint m,
+ symtype **b, Uint *n, Uint *strand, Uint noofseqs, int indel, int trans,
+ Sint (*sub)(symtype, symtype, void *), void *nfo, int ***lv, int ***lr, int ***lc){
+
+ int i, j, k, q, cols=0, rows, size;
+ Uint *cum=NULL;
+// unsigned char transition;
+ int *L, **lmr=NULL, **lmv=NULL, **lmc=NULL;
+ symtype cura, curb, r1, r2, l1, l2;
+
+ rows = m + 1;
+
+ cum = ALLOCMEMORY(space, NULL, Uint, noofseqs);
+ lmr = ALLOCMEMORY(space, NULL, int*, noofseqs);
+ lmv = ALLOCMEMORY(space, NULL, int*, noofseqs);
+ lmc = ALLOCMEMORY(space, NULL, int*, noofseqs);
+
+ // cum[k] is the first column of locus k inside the shared matrix
+ for(k=0; k < noofseqs; k++) {
+ cum[k] = cols;
+ cols += n[k] + 1;
+ lmr[k] = ALLOCMEMORY(space, NULL, int, rows);
+ lmv[k] = ALLOCMEMORY(space, NULL, int, rows);
+ lmc[k] = ALLOCMEMORY(space, NULL, int, rows);
+ memset(lmv[k], 0, sizeof(int)*rows);
+ memset(lmr[k], 0, sizeof(int)*rows);
+ memset(lmc[k], 0, sizeof(int)*rows);
+ }
+
+ size = rows * cols;
+ L = ALLOCMEMORY(space, NULL, int, size);
+ memset(L, 0, sizeof(int)*size);
+
+ for(k=0; k < noofseqs; k++) {
+ for (i=1; i < rows; i++){
+ // start row i with the values carried over from row i-1
+ lmv[k][i] = lmv[k][i-1];
+ lmr[k][i] = lmr[k][i-1];
+ lmc[k][i] = lmc[k][i-1];
+
+ for (j=1; j < n[k]+1; j++){
+ // l1,l2: the two reference symbols before the cell,
+ // r1,r2: the two reference symbols after it (0 if out of range)
+ l1 = 0;
+ l2 = 0;
+ r1 = 0;
+ r2 = 0;
+
+ if (strand[k] == 0){
+ cura = a1[i-1];
+ curb = b[k][j-1];
+ if(j+1 < n[k]) {
+ r1 = b[k][j];
+ r2 = b[k][j+1];
+ }
+ if(j > 2) {
+ l1 = b[k][j-3];
+ l2 = b[k][j-2];
+ }
+ } else {
+ // other strand: a2 and b[k] are both read from their ends
+ cura = a2[m-i];
+ curb = b[k][n[k]-j];
+ if(j > 2) {
+ l1 = b[k][n[k]-j+1];
+ l2 = b[k][n[k]-j+2];
+ }
+ if(n[k]-j > 1) {
+ r1 = b[k][n[k]-j-2];
+ r2 = b[k][n[k]-j-1];
+ }
+ }
+
+ // plain local alignment recurrence within locus k
+ MATRIX2D(L, cols, i, (cum[k] + j)) =
+ MAX4(0,
+ MATRIX2D(L, cols, (i-1), (cum[k] + j)) + indel,
+ MATRIX2D(L, cols, i, (cum[k] + j-1)) + indel,
+ MATRIX2D(L, cols, (i-1), (cum[k] + j-1)) + sub(cura, curb, nfo));
+
+// transition = 0;
+ // alternatively enter the cell by a splice from any earlier locus q
+ for(q=0; q < k; q++) {
+// Uint max=0;
+// max = arraymax(&MATRIX2D(L, cols, lmr[q][i-1], cum[q]), n[q]+1);
+
+// if(lmv[q][i-1] != MATRIX2D(L, cols, lmr[q][i-1], cum[q]+max)) {
+// fprintf(stderr, "k:%d, n[%d]+1:%d, lmv[%d][%d]:%d != matrix[lmr[%d][%d],%d+%d]:%d, lmc[q:%d][i-1:%d]:%d, i:%d, j:%d\n",
+// k, q, n[q]+1, q, i-1, lmv[q][i-1], q, i-1, cum[q], max,
+// MATRIX2D(L, cols, lmr[q][i-1], cum[q]+max), q, i-1, lmc[q][i-1], i, j);
+// return NULL;
+// }
+
+ //if strand[q] != strand[k] splicescore = -lms[q][i-1]
+ MATRIX2D(L, cols, i, (cum[k] + j)) =
+ MAX(
+ MATRIX2D(L, cols, i, (cum[k] + j)),
+ lmv[q][i-1] + sub(cura, curb, nfo) + trans + splicescore(l1, l2, strand[k], 1)) ;
+ }
+
+
+ //if strand - acc1, acc2
+ //if strand + don1, don2
+ // NOTE(review): the comparison uses lmv[k][i-1], not the running
+ // lmv[k][i]; within one row a later, smaller cell that still beats
+ // row i-1 replaces an earlier, larger one - confirm this is intended
+ if (MATRIX2D(L, cols, i, cum[k]+j) + splicescore(r1, r2, strand[k], 0) > lmv[k][i-1]) {
+ lmv[k][i] = MATRIX2D(L, cols, i, cum[k]+j) + splicescore(r1, r2, strand[k], 0);
+ lmr[k][i] = i;
+ lmc[k][i] = j;
+ }
+ }
+ }
+ }
+
+ FREEMEMORY(space, cum);
+ *lv = lmv;
+ *lr = lmr;
+ *lc = lmc;
+ return L;
+}
+
+
+/*----------------------- localmultisplicedtraceback -------------------------
+ *
+ * @brief tracing localmultisplicedmatrix back producing noofseqs split aligns
+ * @author Steve Hoffmann
+ *
+ * M is the flat matrix with m+1 rows in which locus k starts at column
+ * cum[k]. The walk starts at the maximum cell of M and follows insertion,
+ * deletion and replacement steps; when none of them explains the score,
+ * the loci q < k are searched for a splice transition via lmv/lmr/lmc and
+ * the walk continues on that locus.
+ *
+ */
+
+void
+localmultisplicedtraceback (void *space, int *M, symtype *a1, symtype *a2, Uint m,
+ symtype **b, Uint* n, Uint *strand, Uint noofseqs, int indel, int trans,
+ Sint (*sub)(symtype, symtype, void *), void *nfo,
+ Alignment **al, int **lmv, int **lmr, int **lmc){
+
+ Uint u=1, v=1, cur, cols=0, start, p, q, k,
+ *cum, xmax=0;
+ int x;
+ symtype cura, curb, l1, l2;//, r1, r2;
+
+ cum = ALLOCMEMORY(space, NULL, Uint, noofseqs);
+ for(k=0; k < noofseqs; k++) {
+ al[k]->uoff = 0;
+ al[k]->voff = 0;
+ cum[k] = cols;
+ cols += n[k] + 1;
+ }
+
+ // position of the maximum in the flat matrix, split into row p and column q
+ start = arraymax(&MATRIX2D(M, cols, 0, 0), (cols*(m+1)));
+
+ p = start / (cols);
+ q = start % (cols);
+
+ // find the locus k whose column range contains q
+ k=0;
+ while(k+1 < noofseqs && q > cum[k+1]) k++;
+
+ u = p;
+ v = q-cum[k];
+
+// fprintf(stderr, "\nstarting on seq: %d (%d), u:%d, v:%d\n", k, strand[k], u, v);
+ //if(u < m) {
+// for(i=u; i < m; i++) insertEop(al[k], Insertion);
+ //}
+
+ while(u > 0 && v > 0){
+ cur = MATRIX2D(M, cols, u, (cum[k]+v));
+// fprintf(stderr, "continue on %d with matrix[%d,%d+%d=%d]:%d strand:%d\n", k, u, cum[k], v, cum[k]+v, MATRIX2D(M, cols, u, (cum[k]+v)), strand[k]);
+
+ if (MATRIX2D(M, cols, (u-1), (cum[k]+v)) + indel == cur){
+ insertEop(al[k], Insertion);
+ assert(u > 0);
+ u--;
+ } else {
+ if (MATRIX2D(M, cols, u, (cum[k]+v-1)) + indel == cur){
+ insertEop(al[k], Deletion);
+ assert(v > 0);
+ v--;
+ } else {
+ if (MATRIX2D(M, cols, u, (cum[k]+v))) {
+ l1 = 0;
+ l2 = 0;
+ /*
+ r1 = 0;
+ r2 = 0;*/
+
+ if (strand[k] == 0){
+ cura = a1[u-1];
+ curb = b[k][v-1];
+
+ /* not used
+ if(v+1 < n[k]) {
+ r1 = b[k][v];
+ r2 = b[k][v+1];
+ } */
+ if(v > 2) {
+ l1 = b[k][v-3];
+ l2 = b[k][v-2];
+ }
+ } else {
+ cura = a2[m-u];
+ curb = b[k][n[k]-v];
+ if(v > 2) {
+ l1 = b[k][n[k]-v+1];
+ l2 = b[k][n[k]-v+2];
+ }
+ /* not used
+ if(n[k]-v > 1) {
+ r1 = b[k][n[k]-v-2];
+ r2 = b[k][n[k]-v-1];
+ }*/
+ }
+
+// fprintf(stderr, "sub[%c,%c]=%d\n", cura, curb, sub(cura,curb,nfo));
+ if (MATRIX2D(M, cols, (u-1), (cum[k]+v-1)) +
+ sub(cura, curb, nfo) == cur){
+ insertEop(al[k], Replacement);
+ assert(u > 0 && v > 0);
+ u--; v--;
+ } else {
+
+ // locus 0 has no predecessor: store the offsets and stop.
+ // NOTE(review): the k==0 block after the loop runs as well, so on
+ // strand 0 revMeops is called a second time - confirm intended
+ if(k==0) {
+ if (strand[k] == 0){
+ al[k]->uoff = u;
+ al[k]->voff = v;
+ revMeops(al[k]);
+ } else {
+ assert(m >= u+getUalignlen(al[k]));
+ al[k]->uoff = m-u-getUalignlen(al[k]);
+ assert(n[k] >= v+getValignlen(al[k]));
+ al[k]->voff = n[k]-v-getValignlen(al[k]);
+ }
+ break;
+ }
+
+ // search the loci before k for a transition that reproduces cur;
+ // the last match (smallest index) is kept
+ x = -1;
+ for(q=k; q > 0; q--) {
+/* Uint max;
+ max = arraymax(&MATRIX2D(M, cols, lmr[q-1][u-1], cum[q-1]), n[q-1]+1);
+ assert(MATRIX2D(M, cols, lmr[q-1][u-1],cum[q-1]+max)
+ == lmv[q-1][u-1]);
+ fprintf(stderr, "transition: matrix[lmr[%d][%d],%d]=%d, sub[%c,%c]=%d\n", q-1, u-1, max, lmv[q-1][u-1], cura, curb, sub(cura,curb,nfo));
+*/
+ if (
+ //MATRIX2D(M, cols, lmr[q-1][u-1], cum[q-1]+lmc[q-1][u-1]) +
+ lmv[q-1][u-1] + splicescore(l1, l2, strand[k], 1) +
+ sub(cura, curb, nfo) + trans == cur) {
+ x=q-1;
+ xmax = lmc[q-1][u-1];
+ }
+ }
+
+// fprintf(stderr, "matrix:U[%d,%d+%d=%d]:%d, k:%d, q:%d, x:%d -> [%d,%d]\n", u, cum[k], v, cum[k]+v, cur, k, q, x, lmr[x][u-1], xmax);
+
+ assert(x > -1);
+ insertEop(al[k], Replacement);
+ assert(u > 0 && v > 0);
+ u--; v--;
+
+ // the fragment on locus k is complete: store its offsets
+ if (strand[k] == 0){
+ al[k]->uoff = u;
+ al[k]->voff = v;
+ revMeops(al[k]);
+ } else {
+ assert(m >= u+getUalignlen(al[k]));
+ assert(n[k] >= v+getValignlen(al[k]));
+ al[k]->uoff= m-u-getUalignlen(al[k]);
+ al[k]->voff = n[k]-v-getValignlen(al[k]);
+ }
+/*
+ fprintf(stderr, "transition from: m:%d, u:%d, v:%d, ulen:%d, vlen:%d -> u': %d v':%d \n",
+ m, u, v, getUalignlen(al[k]), getValignlen(al[k]), lmr[x][u], xmax);
+
+ showAlign(al[k], stdout);
+ fprintf(stdout, "\n");
+*/
+ // u was already decremented, so lmr[x][u] is the entry of the row
+ // that was tested as lmv[..][u-1] above
+ u = lmr[x][u];
+ v = xmax;
+
+ assert(u >= 0 && v >= 0);
+ k = x;
+ }
+ } else {
+ // zero cell: the local alignment starts here
+ //fprintf(stderr, "reversing end\n");
+ // for k == 0 the else branch is taken on both strands; the block
+ // after the loop then sets the offsets for k == 0 again
+ if (strand[k] == 0 && k > 0){
+ al[k]->uoff = u;
+ al[k]->voff = v;
+ revMeops(al[k]);
+ } else {
+ assert(m >= u+getUalignlen(al[k]));
+ al[k]->uoff = m-u-getUalignlen(al[k]);
+ assert(n[k] >= v+getValignlen(al[k]));
+ al[k]->voff = n[k]-v-getValignlen(al[k]);
+ }
+ break;
+ }
+ }
+ }
+ }
+
+// fprintf(stderr, "starting on seq: %d (%d), u:%d, v:%d\n", k, strand[k], u, v);
+
+ // NOTE(review): only locus 0 is finalised here; if the loop ends at the
+ // matrix border while k > 0, al[k] keeps the offsets 0 and is not
+ // reversed - verify against the callers
+ if (k==0){
+ if (strand[k] == 0){
+ al[k]->uoff = u;
+ al[k]->voff = v;
+ revMeops(al[k]);
+ } else {
+ /* fprintf(stdout, "finalize alignment at u:%d, v:%d, ulen:%d, vlen:%d\n", u, v, getUalignlen(al[k]), getValignlen(al[k]));*/
+ assert(m >= u+getUalignlen(al[k]));
+ al[k]->uoff = m-u-getUalignlen(al[k]);
+ assert(n[k] >= v+getValignlen(al[k]));
+ al[k]->voff = n[k]-v-getValignlen(al[k]);
+ }
+ }
+
+ FREEMEMORY(space, cum);
+ return;
+}
+
+
+/*---------------------------- localsplicedmatrix ----------------------------
+ *
+ * @brief generate local spliced alignment
+ * @author Steve Hoffmann
+ *
+ * One matrix with m+1 rows holds two segments side by side: columns
+ * 0..n1 belong to (a1, b1), columns n1+1..n1+n2+1 to (a2, b2). Cells of
+ * the second segment may additionally continue from the best cell of the
+ * first segment seen in the rows above.
+ * lmv[i] is the best score of segment 1 in rows 0..i and lmr[i] the row
+ * where it occurs; both are returned via lv and lr. Neither the matrix nor
+ * these arrays are freed here.
+ *
+ */
+
+
+int*
+localsplicedmatrix (void *space, symtype *a1, symtype *a2, Uint m,
+    symtype *b1, Uint n1, symtype *b2, Uint n2,
+    Uint strand1, Uint strand2, int indel,
+    Sint (*sub)(symtype, symtype, void *), void *nfo, int **lv, int **lr){
+  int i, j, k, max, cols1, cols2, cols, rows, size;
+  int *L, *lmr=NULL, *lmv=NULL;
+  symtype cura, curb;
+
+  rows = m + 1;
+  cols1 = n1 + 1;
+  cols2 = n2 + 1;
+  cols = cols1 + cols2;
+
+  size = rows * cols;
+  L = ALLOCMEMORY(space, NULL, int, size);
+  memset(L, 0, sizeof(int)*size);
+
+  lmr = ALLOCMEMORY(space, NULL, int, rows);
+  lmv = ALLOCMEMORY(space, NULL, int, rows);
+  memset(lmv, 0, sizeof(int)*rows);
+  memset(lmr, 0, sizeof(int)*rows);
+
+  for (i = 1; i < rows; i++){
+    lmv[i] = lmv[i-1];
+    lmr[i] = lmr[i-1];
+
+    for (j = 1; j < cols1; j++){
+      if (strand1 == 0 || strand1 == strand2){
+        cura = a1[i-1];
+        curb = b1[j-1];
+      }
+      else {
+        cura = a1[m-i];
+        curb = b1[n1-j];
+      }
+
+      MATRIX2D(L, cols, i, j) =
+        MAX4(0,
+            MATRIX2D(L, cols, (i-1), j) + indel,
+            MATRIX2D(L, cols, i, (j-1)) + indel,
+            MATRIX2D(L, cols, (i-1), (j-1)) + sub(cura, curb, nfo));
+
+      /* compare against the running best lmv[i] (which starts out as
+       * lmv[i-1]); comparing against lmv[i-1] let a later, smaller cell of
+       * the same row overwrite a larger one, so that lmv[i] was no longer
+       * the maximum of row lmr[i] as asserted below */
+      if (MATRIX2D(L, cols, i, j) > lmv[i]) {
+        lmv[i] = MATRIX2D(L, cols, i, j);
+        lmr[i] = i;
+      }
+    }
+
+    /* column of the best segment 1 cell available to row i */
+    max = arraymax(&MATRIX2D(L, cols, lmr[i-1], 0), cols1);
+    assert(lmv[i-1] == MATRIX2D(L, cols, lmr[i-1],max));
+
+    for (k = 1; k < cols2; k++){
+      if (strand2 == 0 || strand1 == strand2){
+        cura = a2[i-1];
+        curb = b2[k-1];
+      }
+      else {
+        cura = a2[m-i];
+        curb = b2[n2-k];
+      }
+
+      /* fifth alternative: jump in from the best cell of segment 1 */
+      MATRIX2D(L, cols, i, (cols1 + k)) =
+        MAX5(0,
+            MATRIX2D(L, cols, (i-1), (cols1+k)) + indel,
+            MATRIX2D(L, cols, i, (cols1+k-1)) + indel,
+            MATRIX2D(L, cols, (i-1), (cols1+k-1)) + sub(cura, curb, nfo),
+            MATRIX2D(L, cols, lmr[i-1], max) + sub(cura, curb, nfo));
+    }
+  }
+
+  *lv = lmv;
+  *lr = lmr;
+  return L;
+}
+
+
+/*--------------------------- localsplicedtraceback ---------------------------
+ *
+ * @brief tracing back a local spliced alignment
+ * @author Steve Hoffmann
+ *
+ * M has m+1 rows; columns 0..n1 belong to segment 1 (al1), the following
+ * n2+1 columns to segment 2 (al2). The walk starts at the maximum cell of
+ * M. While in segment 2 a score that no in-segment step explains is
+ * treated as a jump to row lmr[u-1] of segment 1, where the walk continues.
+ *
+ */
+
+void
+localsplicedtraceback (void *space, int *M, symtype *a1, symtype *a2, Uint m,
+ symtype *b1, Uint n1,
+ symtype *b2, Uint n2, Uint strand1, Uint strand2, int indel,
+ Sint (*sub)(symtype, symtype, void *), void *nfo,
+ Alignment *al1, Alignment *al2, int* lmv, int *lmr){
+
+ Uint i=1, j=1, u=1, v=1, cur, cols1, cols2, cols, start, p, q;
+ int max;
+ BOOL seq1;
+ symtype cura, curb;
+
+ al1->uoff = 0;
+ al1->voff = 0;
+ al2->uoff = 0;
+ al2->voff = 0;
+
+ cols1 = n1 + 1;
+ cols2 = n2 + 1;
+ cols = cols1 + cols2;
+
+ // position of the maximum in the flat matrix, split into row p and column q
+ start = arraymax(&MATRIX2D(M, cols, 0, 0), (cols*(m+1)));
+
+ p = start / (cols);
+ q = start % (cols);
+
+ // (i,j) walk segment 1, (u,v) walk segment 2; the unused pair keeps its
+ // initial value 1 so that the loop condition below stays true
+ if (q > cols1) {
+ u = p;
+ v = q-cols1;
+ seq1 = 0;
+ } else {
+ i = p;
+ j = q;
+ seq1 = 1;
+ }
+
+ while(i > 0 && j > 0 && u > 0 && v > 0){
+
+ if (seq1){
+ cur = MATRIX2D(M, cols, i, j);
+
+ if (MATRIX2D(M, cols, (i-1), j) + indel == cur){
+ insertEop(al1, Deletion);
+ i--;
+ } else {
+ if (MATRIX2D(M, cols, i, (j-1)) + indel == cur){
+ insertEop(al1, Insertion);
+ j--;
+ } else {
+ if (MATRIX2D(M, cols, (i), (j))) {
+ if (strand1 == 0 || strand1 == strand2){
+ cura = a1[i-1];
+ curb = b1[j-1];
+ } else {
+ cura = a1[m-i];
+ curb = b1[n1-j];
+ }
+ assert(MATRIX2D(M, cols, (i-1), (j-1))
+ + sub(cura, curb, nfo) == cur);
+ insertEop(al1, Replacement);
+ i--; j--;
+ } else {
+ // zero cell: start of the local alignment in segment 1
+ break;
+ }
+ }
+ }
+ } else {
+
+ cur = MATRIX2D(M, cols, u, (cols1+v));
+ if (MATRIX2D(M, cols, (u-1), (cols1+v)) + indel == cur){
+ insertEop(al2, Deletion);
+ u--;
+ } else {
+ if (MATRIX2D(M, cols, u, (cols1+v-1)) + indel == cur){
+ insertEop(al2, Insertion);
+ v--;
+ } else {
+ if (MATRIX2D(M, cols, u, (cols1+v))) {
+ if (strand2 == 0 || strand1 == strand2){
+ cura = a2[u-1];
+ curb = b2[v-1];
+ } else {
+ cura = a2[m-u];
+ curb = b2[n2-v];
+ }
+ if (MATRIX2D(M, cols, (u-1), (cols1+v-1)) +
+ sub(cura, curb, nfo) == cur){
+ insertEop(al2, Replacement);
+ u--; v--;
+ } else {
+ // the score came from the best cell of segment 1 in row lmr[u-1]
+ max = arraymax(&MATRIX2D(M, cols, lmr[u-1], 0), cols1);
+ assert(MATRIX2D(M, cols, lmr[u-1], max) +
+ sub(cura, curb, nfo) == cur);
+
+ insertEop(al2, Replacement);
+ u--; v--;
+ al2->uoff = u;
+ al2->voff = v;
+
+ if (strand2 == 0 || strand1 == strand2){
+ revMeops(al2);
+ }
+ // u was decremented above, so lmr[u] is the row used for max
+ i = lmr[u];
+ j = max;
+ seq1 = 1;
+ }
+ } else {
+
+ // zero cell: the alignment lies in segment 2 only
+ al2->uoff = u;
+ al2->voff = v;
+ if (strand2 == 0 || strand1 == strand2){
+ revMeops(al2);
+ }
+ break;
+ }
+ }
+ }
+ }
+ }
+ // NOTE(review): if the loop ends in segment 2 because u or v reached 0,
+ // al2 gets neither offsets nor revMeops - verify against the callers
+ if (seq1){
+ al1->uoff = i;
+ al1->voff = j;
+ if (strand1 == 0 || strand1 == strand2){
+ revMeops(al1);
+ }
+ }
+
+ return;
+}
+
+
+/*------------------------------ splicedmatrix -------------------------------
+ *
+ * @brief fill the score matrix of a semi-global spliced alignment: the read
+ *        is aligned to two reference segments stored side by side, and a
+ *        cell of the second segment may continue from the best cell of the
+ *        first segment in the row above
+ * @author Christian Otto
+ *
+ * The matrix has m+1 rows and (n1+1)+(n2+1) columns and is returned to the
+ * caller without being freed here.
+ *
+ */
+
+int*
+splicedmatrix (void *space, symtype *a1, symtype *a2, Uint m,
+    symtype *b1, Uint n1, symtype *b2, Uint n2,
+    Uint strand1, Uint strand2, int indel,
+    Sint (*sub)(symtype, symtype, void *), void *nfo){
+  int row, col, best, width1, width2, width, height, cells;
+  int fwd1, fwd2;
+  int *mat;
+  symtype qch, rch;
+
+  height = m + 1;
+  width1 = n1 + 1;
+  width2 = n2 + 1;
+  width = width1 + width2;
+  cells = height * width;
+
+  mat = ALLOCMEMORY(space, NULL, int, cells);
+  memset(mat, 0, sizeof(int)*cells);
+
+  /* a segment is read front to back unless it lies on the other strand */
+  fwd1 = (strand1 == 0 || strand1 == strand2);
+  fwd2 = (strand2 == 0 || strand1 == strand2);
+
+  for (row = 1; row < height; row++){
+    /* leading gaps: first column of either segment costs row indels */
+    MATRIX2D(mat, width, row, 0) = row * indel;
+    MATRIX2D(mat, width, row, width1) = row * indel;
+
+    /* segment 1 */
+    for (col = 1; col < width1; col++){
+      qch = fwd1 ? a1[row-1] : a1[m-row];
+      rch = fwd1 ? b1[col-1] : b1[n1-col];
+
+      MATRIX2D(mat, width, row, col) =
+        MAX3(MATRIX2D(mat, width, (row-1), col) + indel,
+            MATRIX2D(mat, width, row, (col-1)) + indel,
+            MATRIX2D(mat, width, (row-1), (col-1)) + sub(qch, rch, nfo));
+    }
+
+    /* best column of segment 1 in the previous row: source of a splice */
+    best = arraymax(&MATRIX2D(mat, width, (row-1), 0), width1);
+
+    /* segment 2 */
+    for (col = 1; col < width2; col++){
+      qch = fwd2 ? a2[row-1] : a2[m-row];
+      rch = fwd2 ? b2[col-1] : b2[n2-col];
+
+      MATRIX2D(mat, width, row, (width1 + col)) =
+        MAX4(MATRIX2D(mat, width, (row-1), (width1+col)) + indel,
+            MATRIX2D(mat, width, row, (width1+col-1)) + indel,
+            MATRIX2D(mat, width, (row-1), (width1+col-1)) + sub(qch, rch, nfo),
+            MATRIX2D(mat, width, (row-1), best) + sub(qch, rch, nfo));
+    }
+  }
+
+  return mat;
+}
+
+/*----------------------------- splicedtraceback -----------------------------
+ *
+ * @brief tracing back semi-global spliced alignment
+ * @author Christian Otto
+ *
+ * M has m+1 rows; columns 0..n1 belong to segment 1 (al1), the following
+ * n2+1 columns to segment 2 (al2). The walk starts in the last row at the
+ * better of the two segment maxima. While in segment 2, a score that no
+ * in-segment step explains is a splice: al2 is finalised and the walk
+ * continues in segment 1 at the best column of the row above.
+ *
+ */
+
+void
+splicedtraceback (void *space, int *M,
+    symtype *a1, symtype *a2, Uint m,
+    symtype *b1, Uint n1,
+    symtype *b2, Uint n2,
+    Uint strand1, Uint strand2,
+    int indel, Sint (*sub)(symtype, symtype, void *), void *nfo,
+    Alignment *al1, Alignment *al2){
+
+  Uint i, j, k, cur, cols1, cols2, cols;
+  int max;
+  BOOL seq1;
+  symtype cura, curb;
+
+  al1->uoff = 0;
+  al1->voff = 0;
+  al2->uoff = 0;
+  al2->voff = 0;
+
+  cols1 = n1 + 1;
+  cols2 = n2 + 1;
+  cols = cols1 + cols2;
+  i = m;
+  j = arraymax(&MATRIX2D(M, cols, i, 0), cols1);
+  k = arraymax(&MATRIX2D(M, cols, i, cols1), cols2);
+
+  if (MATRIX2D(M, cols, i, j) > MATRIX2D(M, cols, i, (cols1+k))){
+    seq1 = 1;
+  }
+  else {
+    seq1 = 0;
+  }
+
+  /* only the column index of the segment currently being traced decides
+   * whether the walk goes on. Requiring j > 0 && k > 0 at all times ended
+   * the walk as soon as the inactive index was 0, e.g. right after a splice
+   * out of the first column of segment 2, leaving al1 without operations */
+  while(i > 0 && (seq1 ? j > 0 : k > 0)){
+
+    if (seq1){
+      cur = MATRIX2D(M, cols, i, j);
+      if (MATRIX2D(M, cols, (i-1), j) + indel == cur){
+        insertEop(al1, Deletion);
+        i--;
+      }
+      else {
+        if (MATRIX2D(M, cols, i, (j-1)) + indel == cur){
+          insertEop(al1, Insertion);
+          j--;
+        }
+        else {
+          if (strand1 == 0 || strand1 == strand2){
+            cura = a1[i-1];
+            curb = b1[j-1];
+          }
+          else {
+            cura = a1[m-i];
+            curb = b1[n1-j];
+          }
+          assert(MATRIX2D(M, cols, (i-1), (j-1))
+              + sub(cura, curb, nfo) == cur);
+
+          insertEop(al1, Replacement);
+          i--; j--;
+        }
+      }
+    } else {
+      cur = MATRIX2D(M, cols, i, (cols1+k));
+      if (MATRIX2D(M, cols, (i-1), (cols1+k)) + indel == cur){
+        insertEop(al2, Deletion);
+        i--;
+      } else {
+        if (MATRIX2D(M, cols, i, (cols1+k-1)) + indel == cur){
+          insertEop(al2, Insertion);
+          k--;
+        } else {
+          if (strand2 == 0 || strand1 == strand2){
+            cura = a2[i-1];
+            curb = b2[k-1];
+          } else {
+            cura = a2[m-i];
+            curb = b2[n2-k];
+          }
+          if (MATRIX2D(M, cols, (i-1), (cols1+k-1)) + sub(cura, curb, nfo) == cur){
+            insertEop(al2, Replacement);
+            i--; k--;
+          } else {
+            /* splice: the score stems from the best cell of segment 1 in
+             * the row above */
+            max = arraymax(&MATRIX2D(M, cols, (i-1), 0), cols1);
+            assert(MATRIX2D(M, cols, (i-1), max)
+                + sub(cura, curb, nfo) == cur);
+
+            insertEop(al2, Replacement);
+            i--; k--;
+
+            al2->uoff = i;
+            al2->voff = k;
+            if (strand2 == 0 || strand1 == strand2){
+              revMeops(al2);
+            }
+            j = max;
+            seq1 = 1;
+          }
+        }
+      }
+    }
+  }
+
+  if (seq1){
+    al1->uoff = i;
+    al1->voff = j;
+    if (strand1 == 0 || strand1 == strand2){
+      revMeops(al1);
+    }
+  } else {
+    /* no splice happened: the alignment lies in segment 2 only and has to
+     * be finalised the same way al1 is above */
+    al2->uoff = i;
+    al2->voff = k;
+    if (strand2 == 0 || strand1 == strand2){
+      revMeops(al2);
+    }
+  }
+
+
+  return;
+}
+
+/*------------------------------- swgaplesstraceback -----------------------------
+ *
+ * follow the diagonal upwards from the maximum cell of the (m+1) x (n+1)
+ * matrix M until a zero cell or the matrix border is reached
+ *
+ * returns the visited cells as consecutive (row, column) pairs, last cell
+ * of the alignment first; *alignsize receives the number of ints stored.
+ * NULL is returned if no cell was visited.
+ *
+ */
+
+int*
+swgaplesstraceback (void *space, int *M,
+    symtype *a, Uint m, symtype *b, Uint n,
+    Sint (*sub)(symtype, symtype, void *), void *nfo, int* alignsize)
+{
+  Uint row, col, width, best;
+  int *path = NULL;
+
+  *alignsize = 0;
+  width = (n+1);
+  best = arraymax(M, (m+1)*width);
+  row = best / width;
+  col = best % width;
+
+  for (; row > 0 && col > 0 && MATRIX2D(M, width, row, col) != 0; row--, col--) {
+    /* grow the result by one (row, column) pair */
+    path = ALLOCMEMORY(space, path, int, *alignsize+2);
+    path[*alignsize] = row;
+    path[*alignsize+1] = col;
+    *alignsize += 2;
+  }
+
+  return path;
+}
+
+
+
+
diff --git a/segemehl/libs/sw.h b/segemehl/libs/sw.h
new file mode 100644
index 0000000..f8edd2a
--- /dev/null
+++ b/segemehl/libs/sw.h
@@ -0,0 +1,111 @@
+#ifndef DPALIGN_H
+#define DPALIGN_H
+
+/*
+ *
+ * sw.h
+ * declarations of the dynamic programming alignment routines
+ * (matrix fill functions and their tracebacks)
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 06.02.2010 14:14:53 CET
+ *
+ */
+
+
+
+
+/* NOTE(review): FILE is used below but <stdio.h> is not included here;
+ * presumably one of these headers provides it - verify */
+#include "basic-types.h"
+#include "alignment.h"
+
+/* symbol type of the sequences handed to the alignment routines */
+typedef char symtype;
+
+Uint edist(void *, symtype *, Uint, symtype *, Uint, Uint, Uint *, Uint);
+/* NOTE(review): declared with return type int while the sub callbacks
+ * below are typed Sint - presumably the same type; confirm */
+int constscr (symtype, symtype, void *);
+int constscr_Nmatch (symtype, symtype, void *);
+int* swmatrix (void *, symtype*, Uint, symtype*, Uint, int,
+ Sint (*sub)(symtype, symtype, void *), void *);
+
+/* semi-global spliced alignment of a read against two reference segments;
+ * the returned matrix is traced back with splicedtraceback */
+int* splicedmatrix (void *, symtype*, symtype*, Uint, symtype*, Uint, symtype*, Uint,
+ Uint, Uint, int, Sint (*sub)(symtype, symtype, void *), void *);
+
+int* swgapless (void *, symtype*, Uint, symtype*, Uint,
+ Sint (*sub)(symtype, symtype, void *), void *);
+
+/* returns the cells of the best gapless diagonal as (row, column) pairs;
+ * the last argument receives the number of ints in the returned array */
+int* swgaplesstraceback (void *, int *,
+ symtype *, Uint, symtype *, Uint,
+ Sint (*sub)(symtype, symtype, void *), void *, int*);
+
+void
+swtraceback (void *space, int *M,
+ symtype *a, Uint m, symtype *b, Uint n, int indel,
+ Sint (*sub)(symtype, symtype, void *), void *nfo, Alignment *al);
+
+/* traceback for splicedmatrix; fills al1 (segment 1) and al2 (segment 2) */
+void
+splicedtraceback (void *space, int *M, symtype *a1, symtype *a2, Uint m,
+ symtype *b1, Uint n1, symtype *b2, Uint n2, Uint strand1, Uint strand2, int indel,
+ Sint (*sub)(symtype, symtype, void *), void *nfo, Alignment *al1, Alignment *al2);
+
+
+void
+swtracebacksplit (void *space, int *L,
+ symtype *a, symtype *b, Uint m, symtype *s1, symtype *s2, Uint n, int indel,
+ unsigned char rc,
+ Sint (*sub)(symtype, symtype, void *), void *nfo, Alignment *al1, Alignment *al2, FILE *dev);
+
+
+int*
+swsplitalign (void *space, symtype *a, symtype *b,
+ Uint m, symtype *s1, symtype *s2, Uint n, int indel,
+ unsigned char rc, Sint (*sub)(symtype, symtype, void *), void *nfo);
+
+
+/* traceback for localsplicedmatrix; lmv and lmr are the arrays that
+ * localsplicedmatrix returns via lv and lr */
+void
+localsplicedtraceback (void *space, int *M, symtype *a1, symtype *a2, Uint m, symtype *b1, Uint n1,
+ symtype *b2, Uint n2, Uint strand1, Uint strand2, int indel,
+ Sint (*sub)(symtype, symtype, void *), void *nfo,
+ Alignment *al1, Alignment *al2, int* lmv, int *lmr);
+
+
+/* local spliced alignment of a read against two reference segments */
+int*
+localsplicedmatrix (void *space, symtype *a1, symtype *a2, Uint m,
+ symtype *b1, Uint n1, symtype *b2, Uint n2,
+ Uint strand1, Uint strand2, int indel,
+ Sint (*sub)(symtype, symtype, void *), void *nfo, int **lv, int **lr);
+
+void
+localsplicedtraceback_test (void *space, int *M, symtype *a1, symtype *a2, Uint m, symtype *b1, Uint n1,
+ symtype *b2, Uint n2, Uint strand1, Uint strand2, int indel,
+ Sint (*sub)(symtype, symtype, void *), void *nfo,
+ Alignment *al1, Alignment *al2, int* lmv, int *lmr);
+
+
+int*
+localsplicedmatrix_test (void *space, symtype *a1, symtype *a2, Uint m,
+ symtype *b1, Uint n1, symtype *b2, Uint n2,
+ Uint strand1, Uint strand2, int indel,
+ Sint (*sub)(symtype, symtype, void *), void *nfo, int **lv, int **lr);
+
+/* local spliced alignment of a read against noofseqs loci in one flat
+ * matrix; lv, lr and lc receive per-locus, per-row score/row/column arrays */
+int*
+localmultisplicedmatrix (void *space, symtype *a1, symtype *a2, Uint m,
+ symtype **b, Uint *n, Uint *strand, Uint noofseqs, int indel, int trans,
+ Sint (*sub)(symtype, symtype, void *), void *nfo, int ***lv, int ***lr, int ***lc);
+
+/* variant that returns one matrix per locus */
+int***
+localmultisplicedmatrixopt (void *space, symtype *a1, symtype *a2, Uint qrylen, Uint *m,
+ symtype **b, Uint *n, Uint *strand, Uint *qstart, Uint *qend, Uint *tstart, Uint *tend, Uint noofseqs, int indel, int trans,
+ Sint (*sub)(symtype, symtype, void *), void *nfo, int ***lv, int ***lr, int ***lc, PairUint **scr, char ****, PairUint *diag);
+
+/* traceback for localmultisplicedmatrix; fills one alignment per locus */
+void
+localmultisplicedtraceback (void *space, int *M, symtype *a1, symtype *a2, Uint m,
+ symtype **b, Uint* n, Uint *strand, Uint noofseqs, int indel, int trans,
+ Sint (*sub)(symtype, symtype, void *), void *nfo,
+ Alignment **al, int **lmv, int **lmr, int **lmc);
+
+/* traceback for localmultisplicedmatrixopt; the return value is NULL unless
+ * the library was built with DEBUGKBAND */
+char***
+localmultisplicedtracebackopt (void *space, int ***M, symtype *a1, symtype *a2, Uint qrylen, Uint *m,
+ symtype **b, Uint* n, Uint *strand, Uint *qstart, Uint *qend, Uint *tstart, Uint *tend, Uint noofseqs, int indel, int trans,
+ Sint (*sub)(symtype, symtype, void *), void *nfo,
+ Alignment **al, int **lmv, int **lmr, int **lmc, PairUint *);
+#endif
+
diff --git a/segemehl/libs/vqueue.c b/segemehl/libs/vqueue.c
new file mode 100644
index 0000000..668fb3b
--- /dev/null
+++ b/segemehl/libs/vqueue.c
@@ -0,0 +1,213 @@
+/**
+ * vqueue.c
+ * implementation of a simple queue for objects of defined size
+ *
+ * @author Christian Otto
+ * @email christian at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Mon Oct 13 14:13:08 CEST 2008
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 72 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-10-28 18:14:42 +0100 (Tue, 28 Oct 2008) $
+ * Id: $Id$
+ * Url: $URL$
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "debug.h"
+#include "basic-types.h"
+#include "vqueue.h"
+
+/*----------------------------- bl_vqueueInit ----------------------------------
+ *
+ * @brief initializes a vqueue with room for allocelem elements of
+ *        sizeofelem bytes each; the program is terminated via exit(-1)
+ *        if one of the sizes is not positive or the allocation fails
+ * @param q          queue to initialize
+ * @param allocelem  initial capacity in elements, must be > 0
+ * @param sizeofelem size of a single element in bytes, must be > 0
+ * @author Christian Otto
+ *
+ */
+void bl_vqueueInit(VQueue *q, Lint allocelem, size_t sizeofelem){
+  if (allocelem <= 0){
+    /* cast so the argument matches the conversion whatever Lint is */
+    DBG("vqueue.c: Attempt to initialize a vqueue of size %lld. Exit forced.\n",
+        (long long) allocelem);
+    exit(-1);
+  }
+  /* size_t is unsigned, so zero is the only invalid value */
+  if (sizeofelem == 0){
+    DBG("vqueue.c: Attempt to initialize a vqueue with sizeofelem %lu. "
+        "Exit forced.\n", (unsigned long) sizeofelem);
+    exit(-1);
+  }
+  q->queuespace = malloc(allocelem * sizeofelem);
+  if (q->queuespace == NULL){
+    DBG("vqueue.c: Memory allocation failed. Exit forced.\n", NULL);
+    exit(-1);
+  }
+  q->allocelem = allocelem;
+  q->numofelem = 0;
+  q->enqueueindex = 0;
+  q->dequeueindex = 0;
+  q->sizeofelem = sizeofelem;
+}
+
+/*--------------------------- bl_vqueueDestruct --------------------------------
+ *
+ * @brief releases the storage of a vqueue and resets all of its fields;
+ *        the VQueue object itself is not freed
+ * @param q   queue to destruct
+ * @param rmv optional callback invoked once on every element still in
+ *            the queue, in dequeue order; may be NULL
+ * @author Christian Otto
+ *
+ */
+void bl_vqueueDestruct(VQueue *q, void (*rmv)(void*)){
+  if (rmv != NULL){
+    char *base = (char *) q->queuespace;
+    Lint pos = q->dequeueindex;
+    Lint left;
+    for (left = q->numofelem; left > 0; left--){
+      rmv(base + (pos * q->sizeofelem));
+      /* follow the circular layout: wrap after the last slot */
+      pos = (pos == q->allocelem - 1) ? 0 : pos + 1;
+    }
+  }
+  free(q->queuespace);
+  /* do not leave a dangling pointer behind for a later realloc/free */
+  q->queuespace = NULL;
+  q->allocelem = 0;
+  q->numofelem = 0;
+  q->enqueueindex = 0;
+  q->dequeueindex = 0;
+  q->sizeofelem = 0;
+}
+
+/*---------------------------- bl_vqueueIsEmpty --------------------------------
+ *
+ * @brief tells whether the vqueue currently holds no elements
+ * @return 1 if numofelem is zero, 0 otherwise
+ * @author Christian Otto
+ *
+ */
+BOOL bl_vqueueIsEmpty(VQueue *q){
+  if (q->numofelem == 0){
+    return 1;
+  }
+  return 0;
+}
+
+/*---------------------------- bl_vqueueEnqueue --------------------------------
+ *
+ * @brief copies sizeofelem bytes from elem into the slot at the back of
+ *        the vqueue, growing the queue first if it is full
+ * @param q    queue to append to
+ * @param elem source of the element bytes
+ * @author Christian Otto
+ *
+ */
+void bl_vqueueEnqueue(VQueue *q, void *elem){
+  char *slot;
+  if (q->numofelem == q->allocelem){
+    bl_vqueueResize(q);
+  }
+  slot = (char *) q->queuespace + (q->enqueueindex * q->sizeofelem);
+  memmove(slot, elem, q->sizeofelem);
+  q->numofelem++;
+  /* circular buffer: the write position wraps after the last slot */
+  q->enqueueindex =
+    (q->enqueueindex == q->allocelem - 1) ? 0 : q->enqueueindex + 1;
+}
+
+/*---------------------------- bl_vqueueDequeue --------------------------------
+ *
+ * @brief removes the front element of the vqueue and returns a copy of it
+ * @param q   queue to dequeue from
+ * @param rmv optional callback invoked on the stored element after it
+ *            has been copied; may be NULL
+ * @return newly malloc'ed copy of the element (sizeofelem bytes), or
+ *         NULL if the queue is empty; the program is terminated via
+ *         exit(-1) if the copy cannot be allocated
+ * @author Christian Otto
+ *
+ */
+void* bl_vqueueDequeue(VQueue *q, void (*rmv)(void*)){
+  char *p, *elem;
+  if (bl_vqueueIsEmpty(q)){
+    return NULL;
+  }
+  p = (char *) q->queuespace;
+  elem = (char *) malloc(q->sizeofelem);
+  /* same out-of-memory policy as bl_vqueueInit/bl_vqueueResize */
+  if (elem == NULL){
+    DBG("vqueue.c: Memory allocation failed. Exit forced.\n", NULL);
+    exit(-1);
+  }
+  memmove(elem, p + (q->dequeueindex * q->sizeofelem), q->sizeofelem);
+  if (rmv != NULL){
+    rmv(p + (q->dequeueindex * q->sizeofelem));
+  }
+  q->numofelem--;
+  /* implements circular data structure */
+  if (q->dequeueindex == q->allocelem - 1){
+    q->dequeueindex = 0;
+  }
+  else {
+    q->dequeueindex++;
+  }
+  return elem;
+}
+
+/*---------------------------- bl_vqueueFront ----------------------------------
+ *
+ * @brief gives access to the element that would be dequeued next
+ * @return pointer into the queue storage (not a copy), or NULL if the
+ *         queue is empty
+ * @author Christian Otto
+ *
+ */
+void* bl_vqueueFront(VQueue *q){
+  char *base;
+  if (!bl_vqueueIsEmpty(q)){
+    base = (char *) q->queuespace;
+    return (base + (q->dequeueindex * q->sizeofelem));
+  }
+  return NULL;
+}
+
+/*---------------------------- bl_vqueueFrontN ---------------------------------
+ *
+ * @brief returns the Nth nearest object to the front of the vqueue
+ *        with N = 0,..,numofelem - 1
+ * @param q queue to look into
+ * @param n distance from the front (0 is the front itself)
+ * @return pointer into the queue storage (not a copy), or NULL if the
+ *         queue is empty or n is out of range
+ * @author Christian Otto
+ *
+ */
+void* bl_vqueueFrontN(VQueue *q, Lint n){
+  char *p;
+  Lint pos; /* same type as the indices so large positions are not truncated */
+  if (bl_vqueueIsEmpty(q) || n < 0 || n >= q->numofelem){
+    return NULL;
+  }
+  p = (char *) q->queuespace;
+  /* wrap around the end of the circular buffer */
+  pos = (q->dequeueindex + n) % q->allocelem;
+  return (p + (pos * q->sizeofelem));
+}
+
+/*---------------------------- bl_vqueueResize ---------------------------------
+ *
+ * @brief doubles the capacity of the vqueue; the program is terminated
+ *        via exit(-1) if the reallocation fails
+ * @param q queue to enlarge
+ * @author Christian Otto
+ *
+ */
+void bl_vqueueResize(VQueue *q){
+  char *base;
+  q->queuespace = realloc(q->queuespace, q->sizeofelem * (q->allocelem * 2));
+  if (q->queuespace == NULL){
+    DBG("vqueue.c: Memory reallocation failed. Exit forced.\n", NULL);
+    exit(-1);
+  }
+  base = (char *) q->queuespace;
+  /*
+   * stretch the circle to a line: the slots from dequeueindex up to the
+   * old end are shifted by the old capacity, which opens the gap in
+   * front of them for new elements
+   */
+  if (q->enqueueindex <= q->dequeueindex){
+    Lint tail = q->allocelem - q->dequeueindex;
+    memmove(base + (q->sizeofelem * (q->allocelem + q->dequeueindex)),
+            base + (q->sizeofelem * q->dequeueindex),
+            tail * q->sizeofelem);
+    q->dequeueindex += q->allocelem;
+  }
+  q->allocelem *= 2;
+}
+
+/*------------------------------ bl_vqueueSize ---------------------------------
+ *
+ * @brief reports how many elements are currently stored in the vqueue
+ * @return value of the numofelem counter
+ * @author Christian Otto
+ *
+ */
+Lint bl_vqueueSize(VQueue *q){
+  Lint count = q->numofelem;
+  return count;
+}
diff --git a/segemehl/libs/vqueue.h b/segemehl/libs/vqueue.h
new file mode 100644
index 0000000..170b386
--- /dev/null
+++ b/segemehl/libs/vqueue.h
@@ -0,0 +1,45 @@
+/**
+ * vqueue.h
+ * implementation of a simple queue for objects of defined size
+ *
+ * @author Christian Otto
+ * @email christian at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Mon Oct 13 14:13:08 CEST 2008
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 69 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-10-16 15:10:07 +0200 (Thu, 16 Oct 2008) $
+ * Id: $Id$
+ * Url: $URL$
+ */
+
+#ifndef VQUEUE_H
+#define VQUEUE_H
+
+#include <stdlib.h>
+#include "basic-types.h"
+
+/* circular queue for elements of one fixed byte size */
+typedef struct {
+ void *queuespace; /* storage for allocelem slots of sizeofelem bytes */
+ Lint enqueueindex; /* slot the next enqueued element is written to */
+ Lint dequeueindex; /* slot of the front element (next to be dequeued) */
+ Lint numofelem; /* number of elements currently stored */
+ Lint allocelem; /* capacity in elements */
+ size_t sizeofelem; /* size of one element in bytes */
+} VQueue;
+
+void bl_vqueueInit(VQueue *q, Lint allocelem, size_t sizeofelem);
+void bl_vqueueDestruct(VQueue *q, void (*rmv)(void*));
+BOOL bl_vqueueIsEmpty(VQueue *q);
+void bl_vqueueResize(VQueue *q);
+void bl_vqueueEnqueue(VQueue *q, void *elem);
+void* bl_vqueueDequeue(VQueue *q, void (*rmv)(void*));
+void* bl_vqueueFront(VQueue *q);
+void* bl_vqueueFrontN(VQueue *q, Lint N);
+Lint bl_vqueueSize(VQueue *q);
+
+#endif /* VQUEUE_H */
diff --git a/segemehl/libs/vstack.c b/segemehl/libs/vstack.c
new file mode 100644
index 0000000..ac7be40
--- /dev/null
+++ b/segemehl/libs/vstack.c
@@ -0,0 +1,168 @@
+/**
+ * vstack.c
+ * implementation of a simple stack for objects of defined size
+ *
+ * @author Christian Otto
+ * @email christian at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Fri Oct 10 11:37:36 CEST 2008
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 72 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-10-28 18:14:42 +0100 (Tue, 28 Oct 2008) $
+ * Id: $Id$
+ * Url: $URL$
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "basic-types.h"
+#include "debug.h"
+#include "vstack.h"
+
+/*----------------------------- bl_vstackInit ----------------------------------
+ *
+ * @brief initializes a vstack with room for allocelem elements of
+ *        sizeofelem bytes each; the program is terminated via exit(-1)
+ *        if one of the sizes is not positive or the allocation fails
+ * @param s          stack to initialize
+ * @param allocelem  initial capacity in elements, must be > 0
+ * @param sizeofelem size of a single element in bytes, must be > 0
+ * @author Christian Otto
+ *
+ */
+void bl_vstackInit(VStack *s, Lint allocelem, size_t sizeofelem){
+  if (allocelem <= 0){
+    /* cast so the argument matches the conversion whatever Lint is */
+    DBG("vstack.c: Attempt to initialize a vstack of size %lld. Exit forced.\n",
+        (long long) allocelem);
+    exit(-1);
+  }
+  /* size_t is unsigned, so zero is the only invalid value */
+  if (sizeofelem == 0){
+    DBG("vstack.c: Attempt to initialize a vstack with sizeofelem %lu. "
+        "Exit forced.\n", (unsigned long) sizeofelem);
+    exit(-1);
+  }
+  s->stackspace = malloc(allocelem * sizeofelem);
+  if (s->stackspace == NULL){
+    DBG("vstack.c: Memory allocation failed. Exit forced.\n", NULL);
+    exit(-1);
+  }
+  s->allocelem = allocelem;
+  /* top is the index of the highest element; -1 marks the empty stack */
+  s->top = -1;
+  s->sizeofelem = sizeofelem;
+}
+
+/*--------------------------- bl_vstackDestruct --------------------------------
+ *
+ * @brief releases the storage of a vstack and resets its fields to the
+ *        empty state; the VStack object itself is not freed
+ * @param s   stack to destruct
+ * @param rmv optional callback invoked once on every element still on
+ *            the stack, from bottom to top; may be NULL
+ * @author Christian Otto
+ *
+ */
+void bl_vstackDestruct(VStack *s, void (*rmv)(void*)){
+  Lint i;
+  char *p;
+  if (rmv != NULL){
+    p = (char *) s->stackspace;
+    for(i = 0; i <= s->top; i++){
+      rmv(p + (i * s->sizeofelem));
+    }
+  }
+  free(s->stackspace);
+  /* do not leave a dangling pointer behind for a later realloc/free */
+  s->stackspace = NULL;
+  s->allocelem = 0;
+  /*
+   * -1 is the empty marker used by bl_vstackInit; a value of 0 would make
+   * bl_vstackIsEmpty/bl_vstackSize report one element in freed storage
+   */
+  s->top = -1;
+  s->sizeofelem = 0;
+}
+
+/*---------------------------- bl_vstackIsEmpty --------------------------------
+ *
+ * @brief tells whether the vstack currently holds no elements
+ * @return 1 if top is negative, 0 otherwise
+ * @author Christian Otto
+ *
+ */
+BOOL bl_vstackIsEmpty(VStack *s){
+  if (s->top < 0){
+    return 1;
+  }
+  return 0;
+}
+
+/*----------------------------- bl_vstackPush ----------------------------------
+ *
+ * @brief copies sizeofelem bytes from elem onto the top of the vstack;
+ *        a full stack is first enlarged by BASEINC slots and the program
+ *        is terminated via exit(-1) if that fails
+ * @param s    stack to push onto
+ * @param elem source of the element bytes
+ * @author Christian Otto
+ *
+ */
+void bl_vstackPush(VStack *s, void *elem){
+  char *slot;
+  if (s->top >= s->allocelem - 1){
+    s->stackspace = realloc(s->stackspace,
+                            s->sizeofelem * (s->allocelem + BASEINC));
+    if (BASEINC <= 0 || s->stackspace == NULL){
+      DBG("vstack.c: Memory reallocation failed. Exit forced.\n", NULL);
+      exit(-1);
+    }
+    s->allocelem += BASEINC;
+  }
+  s->top++;
+  slot = (char *) s->stackspace + (s->top * s->sizeofelem);
+  memmove(slot, elem, s->sizeofelem);
+}
+
+/*------------------------------ bl_vstackTop ----------------------------------
+ *
+ * @brief gives access to the element on top of the vstack
+ * @return pointer into the stack storage (not a copy), or NULL if the
+ *         stack is empty
+ * @author Christian Otto
+ *
+ */
+void* bl_vstackTop(VStack *s){
+  char *base;
+  if (!bl_vstackIsEmpty(s)){
+    base = (char *) s->stackspace;
+    return (base + (s->top * s->sizeofelem));
+  }
+  return NULL;
+}
+
+/*------------------------------ bl_vstackTopN ---------------------------------
+ *
+ * @brief gives access to the Nth highest object of the vstack
+ *        with N = 0,..,top (0 is the top element itself)
+ * @return pointer into the stack storage (not a copy), or NULL if the
+ *         stack is empty or n is out of range
+ * @author Christian Otto
+ *
+ */
+void* bl_vstackTopN(VStack *s, Lint n){
+  char *base;
+  if (bl_vstackIsEmpty(s)){
+    return NULL;
+  }
+  if (n < 0 || n > s->top){
+    return NULL;
+  }
+  base = (char *) s->stackspace;
+  return (base + (s->top - n) * s->sizeofelem);
+}
+
+/*------------------------------ bl_vstackPop ----------------------------------
+ *
+ * @brief removes the top element of the vstack and returns a copy of it
+ * @param s   stack to pop from
+ * @param rmv optional callback invoked on the stored element after it
+ *            has been copied; may be NULL
+ * @return newly malloc'ed copy of the element (sizeofelem bytes), or
+ *         NULL if the stack is empty; the program is terminated via
+ *         exit(-1) if the copy cannot be allocated
+ * @author Christian Otto
+ *
+ */
+void* bl_vstackPop(VStack *s, void (*rmv)(void*)){
+  char *p, *elem;
+  if (bl_vstackIsEmpty(s)){
+    return NULL;
+  }
+  p = (char *) s->stackspace;
+  elem = (char *) malloc(s->sizeofelem);
+  /* same out-of-memory policy as bl_vstackInit/bl_vstackPush */
+  if (elem == NULL){
+    DBG("vstack.c: Memory allocation failed. Exit forced.\n", NULL);
+    exit(-1);
+  }
+  memmove(elem, p + (s->top * s->sizeofelem), s->sizeofelem);
+  if (rmv != NULL){
+    rmv(p + (s->top * s->sizeofelem));
+  }
+  s->top--;
+  return elem;
+}
+
+/*------------------------------ bl_vstackSize ---------------------------------
+ *
+ * @brief reports how many elements are currently on the vstack
+ * @return top + 1, i.e. 0 for an empty stack
+ * @author Christian Otto
+ *
+ */
+Lint bl_vstackSize(VStack *s){
+  Lint count = s->top + 1;
+  return count;
+}
diff --git a/segemehl/libs/vstack.h b/segemehl/libs/vstack.h
new file mode 100644
index 0000000..56e6e9e
--- /dev/null
+++ b/segemehl/libs/vstack.h
@@ -0,0 +1,49 @@
+/**
+ * vstack.h
+ * implementation of a simple stack for objects of defined size
+ *
+ * @author Christian Otto
+ * @email christian at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date Fri Oct 10 11:37:36 CEST 2008
+ */
+
+/*
+ * SVN
+ * Revision of last commit: $Rev: 69 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-10-16 15:10:07 +0200 (Thu, 16 Oct 2008) $
+ * Id: $Id$
+ * Url: $URL$
+ */
+
+#ifndef VSTACK_H
+#define VSTACK_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "basic-types.h"
+
+#define VSTACKINC 10000
+
+#ifndef BASEINC
+#define BASEINC VSTACKINC
+#endif
+
+/* stack for elements of one fixed byte size */
+typedef struct{
+ void* stackspace; /* storage for allocelem slots of sizeofelem bytes */
+ Lint allocelem; /* capacity in elements */
+ Lint top; /* index of the top element, -1 after bl_vstackInit */
+ size_t sizeofelem; /* size of one element in bytes */
+} VStack;
+
+void bl_vstackInit(VStack *s, Lint allocelem, size_t sizeofelem);
+void bl_vstackDestruct(VStack *s, void (*rmv)(void*));
+BOOL bl_vstackIsEmpty(VStack *s);
+void bl_vstackPush(VStack *s, void *elem);
+void* bl_vstackTop(VStack *s);
+void* bl_vstackTopN(VStack *s, Lint n);
+void* bl_vstackPop(VStack *s, void (*rmv)(void*));
+Lint bl_vstackSize(VStack *s);
+
+#endif /* VSTACK_H */
diff --git a/segemehl/libs/vtprogressbar.c b/segemehl/libs/vtprogressbar.c
new file mode 100644
index 0000000..01e45fa
--- /dev/null
+++ b/segemehl/libs/vtprogressbar.c
@@ -0,0 +1,84 @@
+
+/*
+ * vtprogressbar.c
+ * implementation for a very simple
+ * progress bar
+ *
+ * @author Steve Hoffmann
+ * @email shoffmann at zbh.uni-hamburg.de
+ * @date 12/05/06 02:11:30 CET
+ *
+ * SVN
+ * Revision of last commit: $Rev: 19 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-05-14 15:43:29 +0200 (Wed, 14 May 2008) $
+ *
+ * Id: $Id: vtprogressbar.c 19 2008-05-14 13:43:29Z steve $
+ * Url: $URL: file:///homes/bierdepot/steve/svn/segemehl/trunk/libs/vtprogressbar.c $
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "basic-types.h"
+#include "vtprogressbar.h"
+
+
+/* writes the sequence ESC [ ? 25 l to stderr */
+void
+cursorInvisible() {
+  const int esc = 27;
+  fprintf(stderr, "%c%c%c%d%c", esc, '[', '?', 25, 'l');
+}
+
+/* writes the sequence ESC [ ? 25 h to stderr */
+void
+cursorVisible() {
+  const int esc = 27;
+  fprintf(stderr, "%c%c%c%d%c", esc, '[', '?', 25, 'h');
+}
+
+
+
+/*---------------------------- initProgressBarVT -----------------------------
+ *
+ * prepares the terminal for the progress bar by writing the two
+ * sequences ESC [ s and ESC [ K to stderr
+ *
+ */
+
+void
+initProgressBarVT ()
+{
+  const int esc = 27;
+
+  fprintf(stderr, "%c%c%c", esc, '[', 's');
+  fprintf(stderr, "%c%c%c", esc, '[', 'K');
+}
+
+/*------------------------------ progressBarVT -------------------------------
+ *
+ * draws one line of the form "[===   ] P%(processed) message c" on stderr
+ * and then writes ESC [ A so that the next call overwrites it.
+ *
+ * message    text printed behind the counters
+ * complete   total amount of work; 0 is treated as 1
+ * processed  amount of work done so far
+ * size       width of the bar in characters
+ *
+ */
+
+void
+progressBarVT (char *message, Uint complete, Uint processed, Uint size)
+{
+  Uint col, filled, percent, phase;
+  double ratio;
+  char spinner;
+
+  if (complete == 0) complete = 1;
+  ratio = ((double) processed)/complete;
+  filled = (Uint) (size * ratio);
+  percent = (Uint) (100 * ratio);
+
+  fprintf(stderr, "[");
+  for(col=0; col < size; col++) {
+    /* cells up to and including index 'filled' are drawn as '=' */
+    if(col > filled) fprintf(stderr," ");
+    else fprintf(stderr, "=");
+  }
+
+  /* the trailing character cycles / \ - depending on processed % 30 */
+  phase = processed % 30;
+  if (phase > 20) spinner = '-';
+  else if (phase > 10) spinner = '\\';
+  else spinner = '/';
+
+  fprintf(stderr,"] %d%c(%d) %s %c\n", percent, '%',
+      processed, message, spinner);
+  fprintf(stderr,"%c%c%c", 27, '[', 'A');
+  return;
+}
+
+
diff --git a/segemehl/libs/vtprogressbar.h b/segemehl/libs/vtprogressbar.h
new file mode 100644
index 0000000..5fc000d
--- /dev/null
+++ b/segemehl/libs/vtprogressbar.h
@@ -0,0 +1,24 @@
+/*
+ * =====================================================================================
+ *
+ * Filename: vtprogressbar.h
+ *
+ * Description: header file for a simple vt100 progressbar
+ *
+ * Version: 1.0
+ * Created: 12/07/06 00:11:54 CET
+ * Revision: none
+ * Compiler: gcc
+ *
+ * Author: Steve Hoffmann (SH), shoffmann at zbh.uni-hamburg.de
+ * Company: Center for Bioinformatics, Hamburg
+ *
+ * =====================================================================================
+ */
+
+
+void progressBarVT (char *message, Uint complete, Uint processed, Uint size);
+void cursorInvisible();
+void cursorVisible();
+void initProgressBarVT();
+
diff --git a/segemehl/libs/zran.c b/segemehl/libs/zran.c
new file mode 100644
index 0000000..3e6a00c
--- /dev/null
+++ b/segemehl/libs/zran.c
@@ -0,0 +1,521 @@
+/* zran.c -- example of zlib/gzip stream indexing and random access
+ * Copyright (C) 2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ Version 1.0 29 May 2005 Mark Adler */
+
+/* Illustrate the use of Z_BLOCK, inflatePrime(), and inflateSetDictionary()
+ for random access of a compressed file. A file containing a zlib or gzip
+ stream is provided on the command line. The compressed stream is decoded in
+ its entirety, and an index built with access points about every SPAN bytes
+ in the uncompressed output. The compressed file is left open, and can then
+ be read randomly, having to decompress on the average SPAN/2 uncompressed
+ bytes before getting to the desired block of data.
+
+ An access point can be created at the start of any deflate block, by saving
+ the starting file offset and bit of that block, and the 32K bytes of
+ uncompressed data that precede that block. Also the uncompressed offset of
+ that block is saved to provide a reference for locating a desired starting
+ point in the uncompressed stream. build_index() works by decompressing the
+ input zlib or gzip stream a block at a time, and at the end of each block
+ deciding if enough uncompressed data has gone by to justify the creation of
+ a new access point. If so, that point is saved in a data structure that
+ grows as needed to accommodate the points.
+
+ To use the index, an offset in the uncompressed data is provided, for which
+ the latest access point at or preceding that offset is located in the index.
+ The input file is positioned to the specified location in the index, and if
+ necessary the first few bits of the compressed data is read from the file.
+ inflate is initialized with those bits and the 32K of uncompressed data, and
+ the decompression then proceeds until the desired offset in the file is
+ reached. Then the decompression continues to read the desired uncompressed
+ data from the file.
+
+ Another approach would be to generate the index on demand. In that case,
+ requests for random access reads from the compressed data would try to use
+ the index, but if a read far enough past the end of the index is required,
+ then further index entries would be generated and added.
+
+ There is some fair bit of overhead to starting inflation for the random
+ access, mainly copying the 32K byte dictionary. So if small pieces of the
+ file are being accessed, it would make sense to implement a cache to hold
+ some lookahead and avoid many calls to extract() for small lengths.
+
+ Another way to build an index would be to use inflateCopy(). That would
+ not be constrained to have access points at block boundaries, but requires
+ more memory per access point, and also cannot be saved to file due to the
+ use of pointers in the state. The approach here allows for storage of the
+ index in a file.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "basic-types.h"
+#include "zlib.h"
+#include "zran.h"
+
+/* Release an index built by build_index(): the point list first, then the
+   access structure itself. A NULL index is accepted and ignored. */
+void free_index(struct access *index)
+{
+    if (index == NULL)
+        return;
+    free(index->list);
+    free(index);
+}
+
+/* Add an entry to the access point list. A NULL index creates a new list
+   with room for eight points; a full list is doubled. The 32K window is
+   stored unrolled: the `left` bytes at the end of `window` come first,
+   followed by the bytes from its start. If out of memory, deallocate the
+   existing list and return NULL; otherwise return the (possibly newly
+   allocated) index. */
+
+struct access *addpoint(struct access *index, int bits,
+    off_t in, off_t out, unsigned left, unsigned char *window)
+{
+    struct point *next;
+    int i, oldsize;
+
+    /* if list is empty, create it (start with eight points) */
+    if (index == NULL) {
+        index = malloc(sizeof(struct access));
+        if (index == NULL) return NULL;
+        index->list = malloc(sizeof(struct point) << 3);
+        /* the allocation has to be checked before the entries are touched */
+        if (index->list == NULL) {
+            free(index);
+            return NULL;
+        }
+
+        for(i=0; i < 8; i++) {
+            index->list[i].out = 0ul;
+            index->list[i].in = 0ul;
+            index->list[i].bits= 0;
+            memset(index->list[i].window, 0, WINSIZE);
+        }
+        index->size = 8;
+        index->have = 0;
+    }
+
+    /* if list is full, make it bigger */
+    else if (index->have == index->size) {
+        oldsize = index->size;
+        index->size <<= 1;
+        next = realloc(index->list, sizeof(struct point) * index->size);
+        if (next == NULL) {
+            free_index(index);
+            return NULL;
+        }
+
+        /* zero the newly added half */
+        for(i=oldsize; i < index->size; i++) {
+            next[i].out = 0;
+            next[i].in = 0;
+            next[i].bits= 0;
+            memset(next[i].window, 0, WINSIZE);
+        }
+        index->list = next;
+    }
+
+    /* fill in entry and increment how many we have */
+    next = index->list + index->have;
+    next->bits = bits;
+    next->in = in;
+    next->out = out;
+    if (left)
+        memcpy(next->window, window + WINSIZE - left, left);
+    if (left < WINSIZE)
+        memcpy(next->window + left, window, WINSIZE - left);
+    index->have++;
+
+    /* return list, possibly reallocated */
+    return index;
+}
+
+/* Make one entire pass through the compressed stream and build an index, with
+ access points about every span bytes of uncompressed output -- span is
+ chosen to balance the speed of random access against the memory requirements
+ of the list, about 32K bytes per access point. Note that data after the end
+ of the first zlib or gzip stream in the file is ignored. build_index()
+ returns the number of access points on success (>= 1), Z_MEM_ERROR for out
+ of memory, Z_DATA_ERROR for an error in the input file, or Z_ERRNO for a
+ file read error. On success, *built points to the resulting index. */
+int build_index(FILE *in, off_t span, struct access **built)
+{
+    int ret;
+    off_t totin, totout;        /* our own total counters to avoid 4GB limit */
+    off_t last;                 /* totout value of last access point */
+    struct access *index;       /* access points being generated */
+    struct point *shrunk;       /* point list after releasing unused entries */
+    z_stream strm;
+    unsigned char input[CHUNK];
+    unsigned char window[WINSIZE];
+
+    memset(window, 0, WINSIZE);
+    /* initialize inflate */
+    strm.zalloc = Z_NULL;
+    strm.zfree = Z_NULL;
+    strm.opaque = Z_NULL;
+    strm.avail_in = 0;
+    strm.next_in = Z_NULL;
+    ret = inflateInit2(&strm, 47);      /* automatic zlib or gzip decoding */
+    if (ret != Z_OK)
+        return ret;
+
+    /* inflate the input, maintain a sliding window, and build an index -- this
+       also validates the integrity of the compressed data using the check
+       information at the end of the gzip or zlib stream */
+    totin = totout = last = 0;
+    index = NULL;               /* will be allocated by first addpoint() */
+    strm.avail_out = 0;
+    do {
+        /* get some compressed data from input file */
+        strm.avail_in = fread(input, 1, CHUNK, in);
+        if (ferror(in)) {
+            ret = Z_ERRNO;
+            goto build_index_error;
+        }
+        if (strm.avail_in == 0) {
+            ret = Z_DATA_ERROR;
+            goto build_index_error;
+        }
+        strm.next_in = input;
+
+        /* process all of that, or until end of stream */
+        do {
+            /* reset sliding window if necessary */
+            if (strm.avail_out == 0) {
+                strm.avail_out = WINSIZE;
+                strm.next_out = window;
+            }
+
+            /* inflate until out of input, output, or at end of block --
+               update the total input and output counters */
+            totin += strm.avail_in;
+            totout += strm.avail_out;
+            ret = inflate(&strm, Z_BLOCK);      /* return at end of block */
+            totin -= strm.avail_in;
+            totout -= strm.avail_out;
+            if (ret == Z_NEED_DICT)
+                ret = Z_DATA_ERROR;
+            if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
+                goto build_index_error;
+            if (ret == Z_STREAM_END)
+                break;
+
+            /* if at end of block, consider adding an index entry (note that if
+               data_type indicates an end-of-block, then all of the
+               uncompressed data from that block has been delivered, and none
+               of the compressed data after that block has been consumed,
+               except for up to seven bits) -- the totout == 0 provides an
+               entry point after the zlib or gzip header, and assures that the
+               index always has at least one access point; we avoid creating an
+               access point after the last block by checking bit 6 of data_type
+             */
+            if ((strm.data_type & 128) && !(strm.data_type & 64) &&
+                (totout == 0 || totout - last > span)) {
+                index = addpoint(index, strm.data_type & 7, totin,
+                                 totout, strm.avail_out, window);
+                if (index == NULL) {
+                    ret = Z_MEM_ERROR;
+                    goto build_index_error;
+                }
+                last = totout;
+            }
+        } while (strm.avail_in != 0);
+    } while (ret != Z_STREAM_END);
+
+    /* clean up and return index (release unused entries in list) */
+    (void)inflateEnd(&strm);
+    /* defensive: the totout == 0 rule above is expected to always record a
+       point, but never dereference a missing index */
+    if (index == NULL)
+        return Z_DATA_ERROR;
+    /* shrink the point list, not the access structure that owns it; if the
+       shrink fails the larger list is still valid and simply kept */
+    if (index->have > 0) {
+        shrunk = realloc(index->list, sizeof(struct point) * index->have);
+        if (shrunk != NULL)
+            index->list = shrunk;
+    }
+    index->size = index->have;
+    *built = index;
+    return index->size;
+
+    /* return error */
+  build_index_error:
+    (void)inflateEnd(&strm);
+    if (index != NULL)
+        free_index(index);
+    return ret;
+}
+
+/* Use the index to read len bytes from offset into buf, return bytes read or
+ negative for error (Z_DATA_ERROR or Z_MEM_ERROR). If data is requested past
+ the end of the uncompressed data, then extract() will return a value less
+ than len, indicating how much was actually read into buf. This function
+ should not return a data error unless the file was modified since the index
+ was generated. extract() may also return Z_ERRNO if there is an error on
+ reading or seeking the input file. */
+int extract(FILE *in, struct access *index, off_t offset,
+ unsigned char *buf, int len)
+{
+ int ret, skip;
+ z_stream strm;
+ struct point *here;
+ unsigned char input[CHUNK];
+ unsigned char discard[WINSIZE];
+
+ /* proceed only if something reasonable to do */
+ if (len < 0)
+ return 0;
+
+ /* find where in stream to start */
+ /* advance while entries remain and the following access point still
+ starts at or before offset; `here` ends on the entry to start from */
+ here = index->list;
+ ret = index->have;
+ while (--ret && here[1].out <= offset)
+ here++;
+
+ /* initialize file and inflate state to start there */
+ strm.zalloc = Z_NULL;
+ strm.zfree = Z_NULL;
+ strm.opaque = Z_NULL;
+ strm.avail_in = 0;
+ strm.avail_out = 0;
+ strm.next_in = Z_NULL;
+ ret = inflateInit2(&strm, -15); /* raw inflate */
+ if (ret != Z_OK)
+ return ret;
+ /* with pending bits the seek goes one byte back so that the byte
+ holding them can be re-read below */
+ ret = fseeko(in, here->in - (here->bits ? 1 : 0), SEEK_SET);
+ /* NOTE(review): the -1 from fseeko is returned unchanged; presumably
+ relied upon to coincide with Z_ERRNO -- confirm against zlib.h */
+ if (ret == -1)
+ goto extract_ret;
+ if (here->bits) {
+ ret = getc(in);
+ if (ret == -1) {
+ ret = ferror(in) ? Z_ERRNO : Z_DATA_ERROR;
+ goto extract_ret;
+ }
+ /* hand the top here->bits bits of that byte to inflate */
+ (void)inflatePrime(&strm, here->bits, ret >> (8 - here->bits));
+ }
+ /* the window saved with the access point serves as dictionary */
+ (void)inflateSetDictionary(&strm, here->window, WINSIZE);
+
+ /* skip uncompressed bytes until offset reached, then satisfy request */
+ /* from here on offset is the number of bytes still to be discarded */
+ offset -= here->out;
+ strm.avail_in = 0;
+ skip = 1; /* while skipping to offset */
+ do {
+ /* define where to put uncompressed data, and how much */
+ if (offset == 0 && skip) { /* at offset now */
+ strm.avail_out = len;
+ strm.next_out = buf;
+ skip = 0; /* only do this once */
+ }
+ if (offset > WINSIZE) { /* skip WINSIZE bytes */
+ strm.avail_out = WINSIZE;
+ strm.next_out = discard;
+ offset -= WINSIZE;
+ }
+ else if (offset != 0) { /* last skip */
+ strm.avail_out = (unsigned)offset;
+ strm.next_out = discard;
+ offset = 0;
+ }
+
+ /* uncompress until avail_out filled, or end of stream */
+ do {
+ /* refill the input buffer only once inflate has consumed it */
+ if (strm.avail_in == 0) {
+ strm.avail_in = fread(input, 1, CHUNK, in);
+ if (ferror(in)) {
+ ret = Z_ERRNO;
+ goto extract_ret;
+ }
+ if (strm.avail_in == 0) {
+ ret = Z_DATA_ERROR;
+ goto extract_ret;
+ }
+ strm.next_in = input;
+ }
+ ret = inflate(&strm, Z_NO_FLUSH); /* normal inflate */
+ if (ret == Z_NEED_DICT)
+ ret = Z_DATA_ERROR;
+ if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
+ goto extract_ret;
+ if (ret == Z_STREAM_END)
+ break;
+ } while (strm.avail_out != 0);
+
+ /* if reach end of stream, then don't keep trying to get more */
+ if (ret == Z_STREAM_END)
+ break;
+
+ /* do until offset reached and requested data read, or stream ends */
+ } while (skip);
+
+ /* compute number of uncompressed bytes read after offset */
+ /* skip still set means the stream ended before offset was reached */
+ ret = skip ? 0 : len - strm.avail_out;
+
+ /* clean up and return bytes read or error */
+ extract_ret:
+ (void)inflateEnd(&strm);
+ return ret;
+}
+
+/* Open filename, build an access index for it with build_index() using the
+   SPAN spacing, and close the file again. On success the value returned by
+   build_index() is stored in *len and the index is returned; it can be
+   released with free_index(). If the file cannot be opened or the index
+   cannot be built, a message is printed to stderr and the program exits
+   with exit(-1). */
+struct access*
+bl_zranGetIndex(char *filename, int *len) {
+
+  struct access* gzindex;
+  FILE *fp;
+  int status;
+
+  fp = fopen(filename, "rb");
+  if (fp == NULL) {
+    fprintf(stderr, "zran: could not open %s for reading\n", filename);
+    exit(-1);
+  }
+
+  /* build index; the file is not needed any more afterwards */
+  status = build_index(fp, SPAN, &gzindex);
+  fclose(fp);
+
+  if (status < 0) {
+    if (status == Z_MEM_ERROR) {
+      fprintf(stderr, "zran: out of memory\n");
+    } else if (status == Z_DATA_ERROR) {
+      fprintf(stderr, "zran: compressed data error in %s\n", filename);
+    } else if (status == Z_ERRNO) {
+      fprintf(stderr, "zran: read error on %s\n", filename);
+    } else {
+      fprintf(stderr, "zran: error %d while building index\n", status);
+    }
+    exit(-1);
+  }
+
+  *len = status;
+  return gzindex;
+}
+
+
+
+/*---------------------------- bl_initgzidxfile ------------------------------
+ *
+ * @brief allocates a reader that fetches the uncompressed data of an
+ *        indexed gz file in chunks
+ * @param fp      file handed to extract() on every refill
+ * @param index   access index for fp
+ * @param offset  uncompressed offset at which reading starts
+ * @param mychunk number of bytes fetched per refill, 0 selects LARGECHUNK
+ * @return the new reader, or NULL if one of the allocations failed
+ * @author Steve Hoffmann
+ *
+ */
+
+struct gzidxfile*
+bl_initgzidxfile(FILE *fp, struct access *index, off_t offset, int mychunk)
+{
+  struct gzidxfile *file;
+
+  if(mychunk == 0) mychunk = LARGECHUNK;
+  file = calloc(1, sizeof(struct gzidxfile));
+  if (file == NULL) return NULL;
+
+  file->fp = fp;
+  file->index = index;
+  file->mychunk = mychunk;
+  /* one extra byte keeps the buffer zero terminated */
+  file->buf = calloc(file->mychunk+1, sizeof(unsigned char));
+  /* the chunk may be very large (LARGECHUNK), so failure is a real case */
+  if (file->buf == NULL) {
+    free(file);
+    return NULL;
+  }
+  file->buf[file->mychunk] = 0;
+  file->curap = offset;
+  file->len = 0;
+  file->pos = file->buf;
+
+  return file;
+}
+
+/* Release the chunk buffer of a reader created by bl_initgzidxfile().
+   The gzidxfile structure itself, the FILE handle and the index are left
+   untouched. A NULL reader is ignored. */
+void
+bl_destructgzidxfile(struct gzidxfile *file) {
+  if (file == NULL) return;
+  free(file->buf);
+  /* both pointers referred to the released buffer */
+  file->buf = NULL;
+  file->pos = NULL;
+}
+
+/*-------------------------------- bl_getgzidxc ------------------------------
+ *
+ * @brief get the next char from an indexed gz stream; the chunk buffer is
+ *        refilled through extract() whenever it is empty or used up
+ * @param f reader created by bl_initgzidxfile
+ * @return the next byte as a non-negative int, or EOF if no more data is
+ *         available or extract() reported an error (a message is printed
+ *         to stderr in the error case)
+ * @author Steve Hoffmann
+ *
+ */
+
+int
+bl_getgzidxc (struct gzidxfile *f)
+{
+
+  if(f->len == 0 || f->pos - f->buf >= f->len) {
+
+    memset(f->buf, 0, f->mychunk+1);
+    f->curap += f->pos - f->buf;
+    /* the consumed bytes are now part of curap; rewinding pos here keeps
+       the position stable when the refill hits EOF or fails, otherwise
+       every further call would add the old chunk length again */
+    f->pos = f->buf;
+    f->len = extract(f->fp, f->index, f->curap, (unsigned char*) f->buf, f->mychunk);
+
+    if (f->len == 0) return EOF;
+    if (f->len < 0) {
+      fprintf (stderr, "zran: extraction failed: %s error \n",
+          f->len == Z_MEM_ERROR ? "out of memory" : "input corrupted");
+      return EOF;
+    }
+  }
+
+  return *f->pos++;
+}
+
+/* Current position of the reader: the offset stored in curap plus the
+   number of bytes already consumed from the chunk buffer. */
+off_t
+bl_ftellgzidx(struct gzidxfile *f) {
+  off_t where = f->curap + (f->pos - f->buf);
+  return where;
+}
+
+#ifdef TESTZRAN
+
+/* Demonstrate the use of build_index() and extract() by processing the file
+   provided on the command line, and then extracting CHUNK bytes from about
+   2/3rds of the way to the last access point of the uncompressed output,
+   and writing that to stdout. Returns 1 on usage, open or index errors and
+   0 otherwise. */
+int main(int argc, char **argv)
+{
+    int len;
+    off_t offset;
+    FILE *in;
+    struct access *index;
+    unsigned char buf[CHUNK];
+
+    /* open input file */
+    if (argc != 2) {
+        fprintf(stderr, "usage: zran file.gz\n");
+        return 1;
+    }
+    in = fopen(argv[1], "rb");
+    if (in == NULL) {
+        fprintf(stderr, "zran: could not open %s for reading\n", argv[1]);
+        return 1;
+    }
+
+    /* build index */
+    len = build_index(in, SPAN, &index);
+    if (len < 0) {
+        fclose(in);
+        switch (len) {
+        case Z_MEM_ERROR:
+            fprintf(stderr, "zran: out of memory\n");
+            break;
+        case Z_DATA_ERROR:
+            fprintf(stderr, "zran: compressed data error in %s\n", argv[1]);
+            break;
+        case Z_ERRNO:
+            fprintf(stderr, "zran: read error on %s\n", argv[1]);
+            break;
+        default:
+            fprintf(stderr, "zran: error %d while building index\n", len);
+        }
+        return 1;
+    }
+    fprintf(stderr, "zran: built index with %d access points\n", len);
+
+    /* use index by reading some bytes from an arbitrary offset */
+    offset = (index->list[index->have - 1].out << 1) / 3;
+    len = extract(in, index, offset, buf, CHUNK);
+    if (len < 0)
+        fprintf(stderr, "zran: extraction failed: %s error\n",
+                len == Z_MEM_ERROR ? "out of memory" : "input corrupted");
+    else {
+        fwrite(buf, 1, len, stdout);
+        /* off_t has no conversion of its own, so cast to match %llu */
+        fprintf(stderr, "zran: extracted %d bytes at %llu\n", len,
+                (unsigned long long) offset);
+    }
+
+    /* clean up and exit */
+    free_index(index);
+    fclose(in);
+    return 0;
+}
+#endif
diff --git a/segemehl/libs/zran.h b/segemehl/libs/zran.h
new file mode 100644
index 0000000..ae1bdd9
--- /dev/null
+++ b/segemehl/libs/zran.h
@@ -0,0 +1,72 @@
+#ifndef ZRAN_H
+#define ZRAN_H
+/*
+ *
+ * zran.h
+ *
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 07/06/2010 03:24:28 PM CEST
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include "zlib.h"
+
+#define SPAN 1048576L /* desired distance between access points */
+#define WINSIZE 32768U /* sliding window size */
+#define CHUNK 16384 /* file input buffer size */
+#define LARGECHUNK 1638400000
+#define MEDIUMCHUNK 1638400
+
+
+/* access point entry: ties an offset in the uncompressed data to the
+ * matching offset (and bit position) in the input file, together with the
+ * WINSIZE bytes of uncompressed data that precede it */
+struct point {
+ off_t out; /* corresponding offset in uncompressed data */
+ off_t in; /* offset in input file of first full byte */
+ int bits; /* number of bits (1-7) from byte at in - 1, or 0 */
+ unsigned char window[WINSIZE]; /* preceding 32K of uncompressed data */
+};
+
+/* access point list: growable array of struct point entries; this is the
+ * index type produced by build_index() and released by free_index() */
+struct access {
+ int have; /* number of list entries filled in */
+ int size; /* number of list entries allocated */
+ struct point *list; /* allocated list */
+};
+
+/* state of a buffered reader on top of an access point index; handled by
+ * bl_initgzidxfile(), bl_getgzidxc(), bl_ftellgzidx() and
+ * bl_destructgzidxfile() */
+struct gzidxfile {
+ FILE *fp; /* input stream (presumably the gzip file the index was built from - TODO confirm) */
+ struct access *index; /* access point list used for this stream */
+ off_t curap; /* offset that bl_ftellgzidx() adds to (pos - buf); presumably the uncompressed offset of buf[0] */
+ int mychunk; /* NOTE(review): meaning not visible in this header - presumably the buffer size in use */
+ unsigned char *buf; /* start of the data buffer */
+ unsigned char *pos; /* read cursor inside buf; bl_getgzidxc() returns *pos++ */
+ int len; /* presumably the number of bytes currently held in buf - TODO confirm */
+};
+
+/* Deallocate an index built by build_index() */
+void free_index(struct access *index);
+
+int extract(FILE *in, struct access *index, off_t offset,
+ unsigned char *buf, int len);
+
+struct access* bl_zranGetIndex(char *filename, int *len);
+
+void bl_destructgzidxfile(struct gzidxfile *file);
+
+off_t bl_ftellgzidx(struct gzidxfile *f);
+
+int build_index(FILE *in, off_t span, struct access **built);
+
+struct gzidxfile* bl_initgzidxfile(FILE *fp, struct access *index, off_t offset, int len);
+
+
+int bl_getgzidxc (struct gzidxfile *f);
+
+
+#endif
diff --git a/segemehl/src/filebintest.c b/segemehl/src/filebintest.c
new file mode 100644
index 0000000..b56bb68
--- /dev/null
+++ b/segemehl/src/filebintest.c
@@ -0,0 +1,135 @@
+
+/*
+ * filebintest.c
+ * segemehl
+ *
+ * Created by Steve Hoffmann on 09.02.10.
+ * Copyright 2010 University Leipzig.
+ * All rights reserved.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <limits.h>
+#include <sys/types.h>
+#include "basic-types.h"
+#include "radixsort.h"
+#include "fileBins.h"
+#include "filebintest.h"
+
+unsigned char mute=0;
+
+/*
+ * getLineColKey
+ *
+ * Extract the integer key stored in column lcknfo->col (0-based) of a line
+ * whose columns are delimited by the character lcknfo->sep.
+ *
+ * @param src  NUL-terminated line; the separator that ends the column is
+ *             overwritten with '\0' while parsing and restored afterwards
+ * @param nfo  pointer to a lineColKeyInfo_t (separator and column index)
+ * @return     0 if the line is empty, the parsed value of the column if the
+ *             column exists (and, for the last column, is non-empty),
+ *             LLONG_MIN otherwise
+ *
+ * The value is parsed with strtoll so that the full LLint range is
+ * available; atol only yields a long and is undefined on overflow.
+ */
+LLint getLineColKey(char *src, void *nfo) {
+
+  lineColKeyInfo_t *lcknfo = (lineColKeyInfo_t*) nfo;
+  char sep = lcknfo->sep;
+  char *cur = src;       /* scan position */
+  char *field = src;     /* first character of the requested column */
+  int sepcnt = 0;        /* separators seen so far */
+  LLint key = LLONG_MIN;
+
+  if (!*cur) return 0;
+
+  do {
+    if (*cur == sep) {
+      sepcnt++;
+
+      /* separator that terminates the requested column: stop here */
+      if(sepcnt == lcknfo->col+1) break;
+      /* separator that precedes the requested column */
+      if(sepcnt == lcknfo->col) {
+        field = cur+1;
+      }
+    }
+  } while (*++cur);
+
+  /* column is followed by a separator: terminate temporarily and parse */
+  if(sepcnt-1 == lcknfo->col) {
+    *cur = '\0';
+    key = strtoll(field, NULL, 10);
+    *cur = sep;
+  }
+
+  /* column is the last one on the line and not empty */
+  if(sepcnt == lcknfo->col && *cur == 0 && field < cur) {
+    key = strtoll(field, NULL, 10);
+  }
+  return key;
+
+}
+
+
+/*
+ * main
+ *
+ * Test driver for the file bin routines: registers noofbins bins (one per
+ * entry of buffer), writes nooflines lines of four random tab separated
+ * integers to every bin, sorts the bins with bl_fileBinsSortLine using
+ * getLineColKey on column 1 as key function, and reports the time taken by
+ * the sort on stderr.
+ *
+ * Returns 0 on success; exits with EXIT_FAILURE if memory cannot be
+ * allocated or a bin cannot be found.
+ */
+int main (int argc, char** argv) {
+  enum { LINEBUFSIZE = 1000 };
+  bl_fileBins_t *myBins;
+  bl_fileBin_t* fb;
+  lineColKeyInfo_t nfo;
+  int i, j, a, b, c, d;
+  char *line;
+  time_t startsuf, endsuf;
+  double difsuf;
+  char *buffer[]={"string1","string2","string3","string4", "string5"};
+
+  int range = 2000000,
+      noofbins = 5,
+      nooflines = 2000;
+
+  unsigned int iseed = (unsigned int)time(NULL);
+  line = malloc(LINEBUFSIZE);
+  nfo.sep='\t';
+  nfo.col=1;
+
+  myBins = calloc(1, sizeof(bl_fileBins_t));
+  if (line == NULL || myBins == NULL) {
+    fprintf(stderr, "main: out of memory. Exit forced.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  bl_fileBinsAdd(NULL, myBins, noofbins, bl_fileBinCClassAssign,
+      buffer, NULL, "bla", 3);
+
+  for(i=0; i < noofbins; i++) {
+    fprintf(stderr, "file %d: name=%s\n", i, myBins->b[i].fname);
+  }
+
+  srand (iseed);
+
+  for(i=0; i < noofbins; i++) {
+    fb = bl_fileBinsFind(NULL, myBins, bl_fileBinsCClassSelect, buffer[i]);
+    if (fb == NULL) {
+      fprintf(stderr, "main: no bin found for %s. Exit forced.\n", buffer[i]);
+      exit(EXIT_FAILURE);
+    }
+    fprintf(stderr, "try to open %s (i:%d)\n", fb->fname, i);
+    bl_fileBinsOpen(NULL, fb, "w");
+
+    for(j=0; j < nooflines; j++) {
+      a = rand()%range;
+      b = rand()%range;
+      c = rand()%range;
+      d = rand()%range;
+
+      /* bounded write; the format is unchanged */
+      snprintf(line, LINEBUFSIZE, "%d \t %d \t %d \t %d%c",
+          a, b, c, d, '\0');
+      bl_fileBinsWriteLn(NULL, fb, line);
+      memset(line, 0, LINEBUFSIZE);
+    }
+    /* NOTE(review): the bin is not closed here (the original call to
+     * bl_fileBinsClose was commented out) - presumably the sort/destruct
+     * routines below take care of it; confirm against fileBins.c */
+  }
+  free(line);
+
+  fprintf(stderr, "start to sort\n");
+  time (&startsuf);
+  bl_fileBinsSortLine(NULL, myBins, 1, "trash.txt", 1, getLineColKey, &nfo);
+  time (&endsuf);
+  difsuf = difftime (endsuf, startsuf);
+  fprintf(stderr, "sort has taken %f seconds.\n", difsuf);
+  bl_fileBinsDestruct(NULL, myBins);
+  free(myBins);
+
+  return 0;
+}
+
diff --git a/segemehl/src/filebintest.h b/segemehl/src/filebintest.h
new file mode 100644
index 0000000..41d3cc9
--- /dev/null
+++ b/segemehl/src/filebintest.h
@@ -0,0 +1,21 @@
+#ifndef FILEBINTEST_H
+#define FILEBINTEST_H
+/*
+ * filebintest.h
+ * segemehl
+ *
+ * Created by Steve Hoffmann on 09.02.10.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ *
+ */
+
+
+/* parameters read by getLineColKey: the column that holds the key and the
+ * character that separates the columns of a line */
+typedef struct lineColKeyInfo_s {
+ Uint col; /* index of the key column (0 selects the first column) */
+ char sep; /* column separator character */
+} lineColKeyInfo_t;
+
+LLint getLineColKey(char *s, void *nfo);
+
+
+#endif
\ No newline at end of file
diff --git a/segemehl/src/genfasta.c b/segemehl/src/genfasta.c
new file mode 100644
index 0000000..cf55161
--- /dev/null
+++ b/segemehl/src/genfasta.c
@@ -0,0 +1,383 @@
+
+/*
+ * genfasta.c
+ * generate random fasta files
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 07.01.2010 19:18:07 CET
+ *
+ */
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "biofiles.h"
+#include "charsequence.h"
+#include "randseqs.h"
+#include "manopt.h"
+#include "mathematics.h"
+
+unsigned char mute=0;
+
+/*
+ * main
+ *
+ * Command line driver of genfasta. Depending on the options it
+ *  - writes the sequences of the genes of a BED annotation (isoforms),
+ *  - simulates sequencing of a gene set (simulate),
+ *  - generates trans splicing events or prints splicing edges,
+ *  - generates a random subject sequence and writes it to the subject file,
+ *  - prints random reads, split reads, mate pairs or bisulfite reads that
+ *    are drawn from the random subject sequence.
+ *
+ * Every output file is checked after fopen; the program exits with -1 if
+ * the reads, subject or mate file cannot be opened for writing.
+ * Returns 0 otherwise.
+ */
+int
+main(int argc, char **argv) {
+
+  manopt_optionset optset;
+  manopt_arg *unflagged;
+  manopt_arg *list;
+  manopt_intconstraint bisulfiteconstraint;
+  annotationtrack_t* mytrack, *transtrack;
+  fasta_t *fasta = NULL;
+  Uint *space = NULL;
+  Uint reflen = 1000000;
+  Uint minreadlen = 75;
+  Uint maxreadlen = 100;
+  Uint minqual = 33;
+  Uint maxqual = 73;
+  Uint mindist = 150;
+  Uint maxdist = 200;
+  Uint polyAlen = 0;
+  Uint alphabetlen = 4;
+  Uint fiveprimelen = 0;
+  Uint threeprimelen = 0;
+  Uint n = 100;
+  Uint i;
+  geneset_t *geneset = NULL, *transset = NULL;
+  double acc = 0.95;
+  double Pmis = 0.8;//CHANGED from 0.6!!!
+  double Pins = 0.1;//CHANGED from 0.2!!!
+  double Pdel = 0.1;//CHANGED from 0.2!!!
+  Uint bisulfite = 0;
+  double rate;
+  char *rates = NULL;
+  char *sequencerates = NULL;
+  char *revcomprates = NULL;
+  char *alphabet = "ACTG";
+  char *sequence = NULL;
+  char *revcomp = NULL;
+  char *subjectfilename = NULL;
+  char *readsfilename = NULL;
+  char *matefilename = NULL;
+  char *fiveprime = NULL;
+  char *threeprime = NULL;
+  unsigned char fastq=0, split=0, splice=0;
+  FILE *subjectdev = NULL;
+  FILE *readsdev = stdout;
+  FILE *matedev = NULL;
+  FILE *dev = stdout;
+  Uint maxchildren = 10;
+  Uint mincisdist = 100;
+  Uint maxcisdist = 1000;
+  double Pcis = 0.9;
+  double Pstrandswitch = 0.5;
+  char *dbfilename = NULL;
+  char *splicefilename = NULL;
+  char spliceedges =0;
+  Uint trans = 0;
+  char simulate = 0;
+  char isoforms = 0;
+  char distsplice = 0;
+  char transtype = 'S';
+  Uint cov = 10;
+  char newseed = 0;
+
+  bisulfiteconstraint.min = 0;
+  bisulfiteconstraint.max = 2;
+
+  /* register the command line options */
+  manopt_initoptionset(&optset, argv[0], NULL,
+      " Generate random fasta sequences and simulated reads\n",
+      " GENFASTA is free software for non-commercial use \n (C) 2008 Bioinformatik Leipzig\n",
+      "0.1" ,
+      " Please report bugs to steve at bioinf.uni-leipzig.de");
+
+  manopt(&optset, REQSTRINGOPT, 0, 'f', "readfile",
+      "path/filename to write the output to", "<file>", NULL, &readsfilename);
+  manopt(&optset, REQSTRINGOPT, 0, 's', "subjectfile",
+      "path/filename to write subject sequence to", "<file>", NULL, &subjectfilename);
+  manopt(&optset, FLAG, 0, 't', "split",
+      "split/spliced reads", NULL, NULL, &split);
+  manopt(&optset, FLAG, 0, 'S', "splice", "generate a set of splice sites",
+      NULL, NULL, &splice);
+  manopt(&optset, REQSTRINGOPT, 0, 'p', "pairs",
+      "path/filename to write mate pair sequences to", NULL, NULL, &matefilename);
+  manopt(&optset, REQUINTOPT, 0, 'l', "minreadlen",
+      "minimum size of queries", "<n>", NULL, &minreadlen);
+  manopt(&optset, REQUINTOPT, 0, 'm', "maxreadlen",
+      "maximum size of queries", "<n>", NULL, &maxreadlen);
+  manopt(&optset, FLAG, 0, 'q', "fastq",
+      "generate fastq reads", NULL, NULL, &fastq);
+  manopt(&optset, REQSTRINGOPT, 0, 'a', "alphabet",
+      "alphabet for the fasta sequences", "<string>", NULL, &alphabet);
+  manopt(&optset, REQINTOPT, 0, 'n', "readnumber",
+      "number of reads to be generated", "<string>", NULL, &n);
+  manopt(&optset, REQINTOPT, 0, 'r', "reflen",
+      "length of reference sequence", "<string>", NULL, &reflen);
+  manopt(&optset, DBLOPT, 0, 'A', "accuracy",
+      "accuracy of reads (1/errorrate)","<float>", NULL, &acc);
+  manopt(&optset, DBLOPT, 0, 'M', "mismatches",
+      "probability of a mismatch for an erroneous site", "<float>", NULL, &Pmis);
+  manopt(&optset, DBLOPT, 0, 'I', "insertions",
+      "probability of insertion for an erroneous site that is not a mismatch", "<float>", NULL, &Pins);
+
+  manopt(&optset, REQUINTOPT, 0, 'B', "biseq",
+      "bisulfite sequencing protocol (0 = no bisulfite, 1 = Lister et al., 2 = Cokus et al.)",
+      "<n>", &bisulfiteconstraint, &bisulfite);
+  manopt(&optset, LISTOPT, 0, 'R', "rates",
+      "methylation rate(s) in reads","<float> [<float>]", NULL, NULL);
+  manopt(&optset, REQSTRINGOPT, 0, '5', "5prime",
+      "add 5' adapter", "<string>", NULL, &fiveprime);
+  manopt(&optset, REQSTRINGOPT, 0, '3', "3prime",
+      "add 3' adapter", "<string>", NULL, &threeprime);
+  manopt(&optset, REQINTOPT, 0, 'T', "polyA",
+      "attach polyA tail", "<n>", NULL, &polyAlen);
+
+  manopt(&optset, REQSTRINGOPT, 0, 'd', "database",
+      "path/filename of database sequences", "<file>", NULL, &dbfilename);
+  manopt(&optset, REQSTRINGOPT, 0, 'b', "splicesites",
+      "path/filename of bed for splice sites", "<file>", NULL, &splicefilename);
+  manopt(&optset, FLAG, 0, 'e', "edges",
+      "extract splice edges", NULL, NULL, &spliceedges);
+  manopt(&optset, REQINTOPT, 0, 'V', "trans",
+      "generate random trans splicing events", NULL, NULL, &trans);
+  manopt(&optset, FLAG, 0, 'D', "dist",
+      "generate dist splice sites", NULL, NULL, &distsplice);
+  manopt(&optset, FLAG, 0, 'G', "simulate",
+      "simulate sequencing", NULL, NULL, &simulate);
+  manopt(&optset, FLAG, 0, 'F', "isoforms",
+      "extract isoform sequences", NULL, NULL, &isoforms);
+  manopt(&optset, REQUINTOPT, 0, 'C', "coverage",
+      "coverage for simulation", "<n>", NULL, &cov);
+  manopt(&optset, FLAG, 0, 'N', "newseed",
+      "init random generator seed with time", NULL, NULL, &newseed);
+
+
+  unflagged = manopt_getopts(&optset, argc, argv);
+  if(unflagged->noofvalues > 1) {
+    manopt_help(&optset, "unknown argument(s)\n");
+  }
+
+
+  /* the deletion probability is whatever mismatches and insertions leave */
+  assert(Pmis+Pins <= 1.0);
+  Pdel = 1.0 - Pmis - Pins;
+
+  if (newseed){
+    srand((unsigned int)time(NULL));
+  }
+
+  if(dbfilename) {
+    fasta = bl_fastxGetSet(space, &dbfilename, 1, 1, 0, 0, 1);
+  }
+
+  if(splicefilename) {
+    mytrack= bl_BEDread (space, splicefilename);
+    geneset = bl_getGeneModelFromBEDtrack(space, mytrack);
+  }
+
+  alphabetlen = strlen(alphabet);
+  if(readsfilename) {
+    readsdev = fopen(readsfilename, "w");
+    if(readsdev == NULL) {
+      fprintf(stderr, "couldn't open %s - exit forced", readsfilename);
+      exit(-1);
+    }
+  }
+
+
+  if(isoforms) {
+    if(!dbfilename || !splicefilename)
+      manopt_help(&optset, "database and splice annotation needed!\n");
+    for(i=0; i < geneset->noofgenes; i++) {
+      sequence = bl_getGeneSequence(space, fasta, &geneset->genes[i]);
+      fprintf(readsdev, ">%s\n%s\n", geneset->genes[i].id, sequence);
+    }
+  }
+
+  if(simulate) {
+    if(!dbfilename || !splicefilename)
+      manopt_help(&optset, "database and splice annotation needed!\n");
+    bl_simulateGeneSetSequencing (space, readsdev, fasta, geneset, minreadlen, cov,
+        alphabet, alphabetlen, minqual, maxqual, acc, Pmis, Pins);
+  }
+
+  if(trans) {
+    if(!splicefilename)
+      manopt_help(&optset, "splice annotation needed!\n");
+    if(distsplice) transtype = 'D';
+    transset = bl_simulateTransSplicing (space, geneset, transtype, trans);
+    transtrack = bl_getTrackFromGeneModel (space, transset);
+    bl_BEDwrite(transtrack, dev);
+  }
+
+  if(spliceedges) {
+    if(!splicefilename)
+      manopt_help(&optset, "splice annotation needed!\n");
+    bl_printSplicingEdges (space, dev, geneset);
+  }
+
+  /* generate the random subject sequence and write it to the subject file */
+  if(subjectfilename) {
+    subjectdev = fopen(subjectfilename, "w");
+    if(subjectdev == NULL) {
+      fprintf(stderr, "couldn't open %s - exit forced", subjectfilename);
+      exit(-1);
+    }
+    sequence = ALLOCMEMORY(space, NULL, char, reflen+1);
+    fprintf(subjectdev, ">subject sequence (len: %d)\n", reflen);
+    for(i=0; i < reflen; i++) {
+      sequence[i]=alphabet[(Uint) RANDINT(alphabetlen-1)];
+      fprintf(subjectdev,"%c", sequence[i]);
+      if (i > 0 && i % 80 == 0) fprintf(subjectdev,"\n");
+    }
+    fprintf(subjectdev,"\n");
+    sequence[reflen] = '\0';
+    fclose(subjectdev);
+  }
+
+  if(fiveprime) {
+    fiveprimelen = strlen(fiveprime);
+    fprintf(stderr,"5prime adapter: %s (%d)", fiveprime, fiveprimelen);
+  }
+  if(threeprime) {
+    threeprimelen = strlen(threeprime);
+    fprintf(stderr,"3prime adapter: %s (%d)", threeprime, threeprimelen);
+  }
+
+
+  /* plain random reads */
+  if(subjectfilename && !matefilename && !split && !bisulfite) {
+
+    bl_fastxPrintRandomReads(readsdev, sequence, reflen,
+        n, minreadlen, maxreadlen, alphabet, alphabetlen, acc,
+        Pmis, Pins, Pdel, fastq, minqual, maxqual,
+        fiveprime, fiveprimelen, threeprime, threeprimelen, polyAlen);
+  }
+
+
+  if(subjectfilename && splice && !matefilename && !bisulfite) {
+    bl_fastxSimulateSpliceSites (space, sequence, reflen, 100, maxchildren,
+        alphabet, alphabetlen, acc,
+        Pmis, Pins, Pdel,
+        minqual, maxqual, Pcis, mincisdist, maxcisdist, Pstrandswitch, 100);
+
+  }
+
+  /* split reads */
+  if(subjectfilename && split && !matefilename && !bisulfite) {
+    fprintf(stderr, "reflen: %d; maxreadlen %d\n", reflen, maxreadlen);
+    bl_fastxPrintRandomSplitReads(readsdev, sequence, reflen,
+        n, minreadlen, maxreadlen, alphabet, alphabetlen, acc,
+        Pmis, Pins, Pdel, fastq, minqual, maxqual,
+        fiveprime, fiveprimelen, threeprime, threeprimelen, polyAlen);
+  }
+
+  /* mate pairs: the mates go to a separate file */
+  if(subjectfilename && matefilename && !bisulfite) {
+    matedev = fopen(matefilename, "w");
+    if(matedev == NULL) {
+      fprintf(stderr, "couldn't open %s - exit forced", matefilename);
+      exit(-1);
+    }
+
+    bl_fastxPrintRandomMatePairs(readsdev, matedev, sequence, reflen,
+        n, minreadlen, maxreadlen, mindist, maxdist,
+        alphabet, alphabetlen, acc,
+        Pmis, Pins, Pdel, fastq, minqual, maxqual,
+        fiveprime, fiveprimelen, threeprime, threeprimelen, polyAlen);
+
+    fclose(matedev);
+  }
+
+  if(subjectfilename && bisulfite){
+    /* reverse complement of subject */
+    revcomp = charDNAcomplement(space, sequence, reflen);
+
+    /* read methylation rates between 0 and 1 (at most two
+     * decimal places are considered) and encode as char value
+     * with 1 as offset due to \0
+     */
+    if (!manopt_isset(&optset, 'R', "rates")){
+      manopt_help(&optset, "please give methylation rate(s) in bisulfite mode\n");
+    }
+    list = manopt_getarg(&optset, 'R', "rates");
+    rates = (char *) malloc(list->noofvalues);
+    for (i = 0; i < list->noofvalues; i++){
+      rate = 100 * atof(list->values[i]);
+      /* round to the nearest integer percentage */
+      rate = (rate >= floor(rate) + 0.5) ? floor(rate) + 1: floor(rate);
+      assert(rate >= 0 && rate <= 100);
+      rates[i] = (char) ((Uint) rate + 1);
+    }
+
+    /*
+     * assign methylation rates to each cytosine
+     * on each strand, other nucleotides have
+     * methylation rate of 1 (=> no conversion)
+     */
+    sequencerates = (char *) malloc(reflen);
+    memset(sequencerates, 101, reflen);
+    revcomprates = (char *) malloc(reflen);
+    memset(revcomprates, 101, reflen);
+    for (i = 0; i < reflen; i++){
+      if (sequence[i] == 'C'){
+        sequencerates[i] = rates[RANDINT(list->noofvalues-1)];
+        printf("%d\t+\tC\t%.2f\n", (i+1), ((double)((int)sequencerates[i]-1))/100);
+      }
+      if (revcomp[i] == 'C'){
+        revcomprates[i] = rates[RANDINT(list->noofvalues-1)];
+        printf("%d\t-\tC\t%.2f\n", reflen-i, ((double)((int)revcomprates[i]-1))/100);
+      }
+    }
+
+    /* Lister et al. protocol */
+    if (bisulfite == 1){
+
+      /* +FW reads */
+      bl_fastxPrintRandomBisulfiteReads(readsdev, sequence, sequencerates, reflen,
+          (int)n/2, minreadlen, maxreadlen, alphabet, alphabetlen, acc,
+          Pmis, Pins, Pdel, fastq, minqual, maxqual,
+          fiveprime, fiveprimelen, threeprime, threeprimelen, polyAlen, "+FW");
+      /* -FW reads */
+      bl_fastxPrintRandomBisulfiteReads(readsdev, revcomp, revcomprates, reflen,
+          n-(int)n/2, minreadlen, maxreadlen, alphabet, alphabetlen, acc,
+          Pmis, Pins, Pdel, fastq, minqual, maxqual,
+          fiveprime, fiveprimelen, threeprime, threeprimelen, polyAlen, "-FW");
+    }
+    /* Cokus et al. protocol (+FW, +RC, -FW, -RC) */
+    else {
+      /* +FW reads */
+      bl_fastxPrintRandomBisulfiteReads(readsdev, sequence, sequencerates, reflen,
+          (int)n/4, minreadlen, maxreadlen, alphabet, alphabetlen, acc,
+          Pmis, Pins, Pdel, fastq, minqual, maxqual,
+          fiveprime, fiveprimelen, threeprime, threeprimelen, polyAlen, "+FW");
+      /* -FW reads */
+      bl_fastxPrintRandomBisulfiteReads(readsdev, revcomp, revcomprates, reflen,
+          (int)n/4, minreadlen, maxreadlen, alphabet, alphabetlen, acc,
+          Pmis, Pins, Pdel, fastq, minqual, maxqual,
+          fiveprime, fiveprimelen, threeprime, threeprimelen, polyAlen, "-FW");
+      /* +RC reads */
+      char *tmp = charDNAcomplement(space, sequence, reflen);
+      char *tmprates = (char *) malloc(reflen);
+      for (i = 0; i < reflen; i++){
+        tmprates[i] = sequencerates[reflen-i-1];
+      }
+      bl_fastxPrintRandomBisulfiteReads(readsdev, tmp, tmprates, reflen,
+          (int)n/4, minreadlen, maxreadlen, alphabet, alphabetlen, acc,
+          Pmis, Pins, Pdel, fastq, minqual, maxqual,
+          fiveprime, fiveprimelen, threeprime, threeprimelen, polyAlen, "+RC");
+      FREEMEMORY(space, tmp);
+      FREEMEMORY(space, tmprates);
+      /* -RC reads */
+      tmp = charDNAcomplement(space, revcomp, reflen);
+      tmprates = (char *) malloc(reflen);
+      for (i = 0; i < reflen; i++){
+        tmprates[i] = revcomprates[reflen-i-1];
+      }
+      /* the last quarter also takes the remainder of n/4 */
+      bl_fastxPrintRandomBisulfiteReads(readsdev, tmp, tmprates, reflen,
+          n-3*((int)n/4), minreadlen, maxreadlen, alphabet, alphabetlen, acc,
+          Pmis, Pins, Pdel, fastq, minqual, maxqual,
+          fiveprime, fiveprimelen, threeprime, threeprimelen, polyAlen, "-RC");
+      FREEMEMORY(space, tmp);
+      FREEMEMORY(space, tmprates);
+    }
+    FREEMEMORY(space, rates);
+    FREEMEMORY(space, sequencerates);
+    FREEMEMORY(space, revcomp);
+    FREEMEMORY(space, revcomprates);
+  }
+
+  if(readsfilename) fclose(readsdev);
+  FREEMEMORY(space, sequence);
+  manopt_destructoptionset(&optset);
+  manopt_destructarg(unflagged);
+  FREEMEMORY(space, unflagged);
+  return 0;
+}
+
diff --git a/segemehl/src/kdmatch.c b/segemehl/src/kdmatch.c
new file mode 100644
index 0000000..635b116
--- /dev/null
+++ b/segemehl/src/kdmatch.c
@@ -0,0 +1,2094 @@
+
+/*
+ * kdmatch.c
+ * routines 4 relaxed alignments
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 11/27/2007 04:08:39 PM CET
+ *
+ * SVN
+ * Revision of last commit: $Rev: 103 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-12-10 15:18:18 +0100 (Wed, 10 Dec 2008) $
+ *
+ * Id: $Id: kdmatch.c 103 2008-12-10 14:18:18Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/src/kdmatch.c $
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <unistd.h>
+#include <float.h>
+#include <math.h>
+#include <assert.h>
+#include "memory.h"
+#include "fileio.h"
+#include "stringutils.h"
+#include "charsequence.h"
+#include "multicharseq.h"
+#include "sufarray.h"
+#include "mmchar.h"
+#include "mathematics.h"
+#include "manout.h"
+#include "biofiles.h"
+#include "vtprogressbar.h"
+#include "karlin.h"
+#include "sort.h"
+#include "basic-types.h"
+#include "bitvectoralg.h"
+#include "bitVector.h"
+#include "kdmatch.h"
+#include "bitArray.h"
+#include "segemehl.h"
+#include "container.h"
+#include "kdchain.h"
+#include "debug.h"
+#include "info.h"
+#include "kdseed.h"
+#include "alignment.h"
+#include "sw.h"
+#include "seqclip.h"
+#include <pthread.h>
+#include "iupac.h"
+
+/*---------------------------- se_kdFindBestMate -----------------------------
+ *
+ * @brief find the 'best' mate from a list of hits to a given hit
+ * @author Christian Otto
+ *
+ * Hits with an edit distance above maxedist are ignored. Among the
+ * remaining hits, one on the same subject as match whose position p is
+ * closest to match->p wins; as long as no hit on the same subject has been
+ * seen, the hit with the smallest edit distance is kept instead.
+ *
+ * The position distance is computed with llabs on LLint values; abs would
+ * truncate the difference to int.
+ *
+ * @param space    passed on to the allocation routines
+ * @param list     candidate hits (both strands)
+ * @param match    the hit a mate is searched for
+ * @param maxedist maximum edit distance of an acceptable candidate
+ * @return a newly initialized match list that holds a copy of the selected
+ *         hit; asserts that at least one acceptable candidate exists
+ */
+
+gmatchlist_t *
+se_kdFindBestMate (void *space, gmatchlist_t *list, gmatch_t *match, Uint maxedist){
+  Uint i, u;
+  int bestedist = -1;
+  unsigned char found = 0;
+  Alignment *al;
+  PairUint best;
+  gmatchlist_t *res;
+
+  res = NULL;
+  memset(&best, 0, sizeof(PairUint));
+
+  for (u = 0; u < 2; u++){
+    for (i = 0; i < list->n[u]; i++){
+      if (list->matches[u][i].edist > maxedist)
+        continue;
+
+      if (list->matches[u][i].subject == match->subject){
+        /* same subject: prefer the hit that is closest to the given match */
+        if (!found || llabs((LLint)list->matches[best.a][best.b].p - match->p) >
+            llabs((LLint)list->matches[u][i].p - match->p)){
+          found = 1;
+          best.a = u;
+          best.b = i;
+          bestedist = list->matches[u][i].edist;
+        }
+      }
+      else {
+        /* other subject: only a fallback while no same-subject hit is known */
+        if (!found && (bestedist == -1 || bestedist > list->matches[u][i].edist)){
+          best.a = u;
+          best.b = i;
+          bestedist = list->matches[u][i].edist;
+        }
+      }
+    }
+  }
+  assert(bestedist != -1);
+
+  /* the result list receives its own copy of the alignment */
+  al = ALLOCMEMORY(space, NULL, Alignment, 1);
+  copyAlignment(al, list->matches[best.a][best.b].al);
+
+  res = bl_gmatchlistInit(space, maxedist, 0);
+  res = se_kdMatchListAdd(res, list->matches[best.a][best.b].subject,
+      list->matches[best.a][best.b].p,
+      list->matches[best.a][best.b].q,
+      list->matches[best.a][best.b].edist,
+      list->matches[best.a][best.b].scr,
+      list->matches[best.a][best.b].i,
+      list->matches[best.a][best.b].j,
+      list->matches[best.a][best.b].evalue,
+      al, best.a, -1, -1, 0, -1, -1, 0, 0);
+
+  return res;
+}
+
+/*-------------------------- se_kdFindBestMatePair ---------------------------
+ *
+ * @brief find the 'best' pair from two lists of hits for query and mate
+ * @author Steve Hoffmann
+ *
+ * Among all combinations of a query hit and a mate hit on the same subject
+ * the one with the smallest distance between the positions p is selected.
+ * If no such combination exists but the counters ucnt and vcnt both equal
+ * one, the first hit on the last visited strand of each list is paired.
+ *
+ * The position distance is computed with llabs on LLint values; abs would
+ * truncate the difference to int.
+ *
+ * @param space        passed on to the allocation routines
+ * @param querylist    hits of the query
+ * @param matelist     hits of the mate
+ * @param maxedist     passed to bl_gmatchlistInit for the new list
+ * @param matemaxedist passed to bl_gmatchlistInit for the new list
+ * @return a new match list holding the selected query hit with the mate
+ *         attached, or NULL if no pair was found; the selected entries of
+ *         querylist and matelist are marked with skip = 1
+ */
+
+gmatchlist_t*
+se_kdFindBestMatePair (void *space, gmatchlist_t *querylist,
+    gmatchlist_t *matelist, Uint maxedist, Uint matemaxedist) {
+
+  Uint u, v, i, j, ucnt=0, vcnt=0, uprime=0, vprime=0;
+  unsigned char found = 0, downstream = 0;
+  Alignment *al;
+  PairUint p,q;
+  gmatchlist_t *list;
+
+  list = NULL;
+  memset(&p, 0, sizeof(PairUint));
+  memset(&q, 0, sizeof(PairUint));
+
+  for(u=0; u < 2; u++) {
+    ucnt += querylist->n[u];
+    for(i=0; i < querylist->n[u]; i++) {
+      uprime = u;
+      for(v=0; v < 2; v++) {
+        vcnt += matelist->n[v];
+        for(j=0; j < matelist->n[v]; j++) {
+          vprime = v;
+          if(querylist->matches[u][i].subject ==
+              matelist->matches[v][j].subject) {
+            /* keep the pair with the smallest distance seen so far */
+            if(!found || llabs((LLint)querylist->matches[p.a][p.b].p -
+                  matelist->matches[q.a][q.b].p) >
+                llabs((LLint)querylist->matches[u][i].p -
+                  matelist->matches[v][j].p))
+            {
+              found = 1;
+              p.a = u;
+              p.b = i;
+              q.a = v;
+              q.b = j;
+            }
+          }
+        }
+      }
+    }
+  }
+
+
+  /* no pair on a common subject: fall back to the single hits */
+  if(!found && ucnt == 1 && vcnt == 1) {
+    p.a = uprime;
+    p.b = 0;
+    q.a = vprime;
+    q.b = 0;
+    found=1;
+  }
+
+
+  if(found) {
+
+    /* the new list receives its own copy of the query alignment */
+    al = ALLOCMEMORY(space, NULL, Alignment, 1);
+    copyAlignment(al, querylist->matches[p.a][p.b].al);
+
+    list = bl_gmatchlistInit(space, maxedist, matemaxedist);
+
+    list = se_kdMatchListAdd(list,
+        querylist->matches[p.a][p.b].subject,
+        querylist->matches[p.a][p.b].p,
+        querylist->matches[p.a][p.b].q,
+        querylist->matches[p.a][p.b].edist,
+        querylist->matches[p.a][p.b].scr,
+        querylist->matches[p.a][p.b].i,
+        querylist->matches[p.a][p.b].j-1,
+        querylist->matches[p.a][p.b].evalue, al, p.a, -1, -1, 0, -1, -1, 0, 0);
+
+    /* downstream is set if the query does not start before the mate */
+    if(querylist->matches[p.a][p.b].p >= matelist->matches[q.a][q.b].p)
+      downstream = 1;
+    else
+      downstream = 0;
+
+    /* ... and its own copy of the mate alignment */
+    al = ALLOCMEMORY(space, NULL, Alignment, 1);
+    copyAlignment(al, matelist->matches[q.a][q.b].al);
+
+    se_kdSetMate(space, &list->matches[p.a][0],
+        matelist->matches[q.a][q.b].subject,
+        matelist->matches[q.a][q.b].p,
+        matelist->matches[q.a][q.b].q,
+        matelist->matches[q.a][q.b].edist,
+        al, downstream, (p.a != q.a));
+
+    if(list->mateminedist > matelist->matches[q.a][q.b].edist) {
+      list->mateminedist = matelist->matches[q.a][q.b].edist;
+    }
+
+    if(list->pairminedist > matelist->matches[q.a][q.b].edist +
+        querylist->matches[p.a][p.b].edist) {
+      list->pairminedist = matelist->matches[q.a][q.b].edist +
+        querylist->matches[p.a][p.b].edist;
+    }
+
+    /* mark both hits as used in the input lists */
+    querylist->matches[p.a][p.b].skip = 1;
+    matelist->matches[q.a][q.b].skip = 1;
+  }
+
+  return list;
+}
+
+
+/*------------------------------ se_kdAlignMate ------------------------------
+ *
+ * @brief find the mate once a sequence was located
+ * @author Steve Hoffmann
+ *
+ * For every hit in list (both strands) two reference windows of at most
+ * maxlen characters are searched for the mate: one that starts at the hit
+ * position p and one that ends at the hit position q. Both entries of seqs
+ * are aligned to both windows with myersbitmatrix; every alignment that
+ * passes the maxedist filter is attached to the hit with se_kdSetMate.
+ * list->mateminedist and list->pairminedist are updated along the way.
+ *
+ * The function always returns 0.
+ *
+ */
+
+
+ Uint
+se_kdAlignMate(void *space, MultiCharSeq *seq, char **seqs, Uint len,
+ gmatchlist_t *list, Uint maxedist,Uint* enctab, bitvector *D, Uint maxlen)
+{
+
+ PairSint mb;
+ Alignment *al;
+ bitvector *peq[2];
+ char *refseq, *upstreamrefseq;
+ Uint u, i, k, p, q;
+ Uint idx, refstart, reflen, upstreamreflen,
+ upstreamrefstart, chrstart, chrend;
+ Uint edist;
+
+ /* one peq table per entry of seqs (released at the end of the function) */
+ peq[0] = getpeq(space, seqs[0], len, seq->map,
+ seq->mapsize, enctab);
+ peq[1] = getpeq(space, seqs[1], len, seq->map,
+ seq->mapsize, enctab);
+
+ /* start with the worst values that are still acceptable */
+ list->mateminedist = maxedist;
+ list->pairminedist = list->minedist+maxedist;
+
+
+ for(u=0; u < 2; u++) {
+ for (i=0; i < list->n[u]; i++) {
+
+ idx = list->matches[u][i].subject;
+ getMultiCharSeqIdxBounds(seq, idx, &chrstart, &chrend);
+
+ p = list->matches[u][i].p;
+ q = list->matches[u][i].q;
+
+ /* window starting at p, clipped at chrend */
+ refstart = p;
+ reflen = (chrend > (Lint)refstart + maxlen)? maxlen :(chrend-refstart);
+ refseq = &seq->sequences[refstart];
+
+ /* window ending at q, clipped at chrstart */
+ upstreamreflen =((Lint)q-maxlen > chrstart)? maxlen :(Lint)q -chrstart;
+ upstreamrefstart = q - upstreamreflen;
+ upstreamrefseq = &seq->sequences[upstreamrefstart];
+
+ for(k=0; k < 2; k++) {
+
+ /* search seqs[k] in the window that starts at p */
+ myersbitmatrix(NULL, seqs[k], len, refseq, reflen,
+ seq->map, seq->mapsize, enctab,
+ len-maxedist, peq[k], &mb, D, reflen);
+
+
+ /* mb.b is filtered against maxedist, mb.a against the window length;
+ * presumably edit distance and end position of the hit - TODO confirm */
+ if (mb.a != -1 && mb.b <= maxedist && mb.a < reflen) {
+ al = ALLOCMEMORY(space, NULL, Alignment, 1);
+
+ initAlignment(al, seqs[k], len, 0, refseq, reflen, 0);
+ bitvectorbacktrack(al, D, reflen, len, mb.a);
+
+ /* mate is attached with the downstream argument set to 1 */
+ edist = se_kdSetMate(space, &list->matches[u][i], idx,
+ refstart+al->voff, refstart+mb.a-1,
+ mb.b, al, 1, (u != k));
+
+ if(list->mateminedist > edist) {
+ list->mateminedist = edist;
+ }
+
+ //pairminedist ...
+ if(list->pairminedist > edist+list->matches[u][i].edist) {
+ list->pairminedist = edist+list->matches[u][i].edist;
+ }
+
+ list->matches[u][i].noofmatematches++;
+ }
+
+ /* search seqs[k] in the window that ends at q */
+ myersbitmatrix(NULL, seqs[k], len, upstreamrefseq, upstreamreflen,
+ seq->map, seq->mapsize, enctab,
+ len-maxedist, peq[k], &mb, D, upstreamreflen);
+
+
+ if (mb.a != -1 && mb.b <= maxedist && mb.a < upstreamreflen) {
+ al = ALLOCMEMORY(space, NULL, Alignment, 1);
+
+ initAlignment(al, seqs[k], len, 0, upstreamrefseq,
+ upstreamreflen, 0);
+ bitvectorbacktrack(al, D, upstreamreflen, len, mb.a);
+
+ /* mate is attached with the downstream argument set to 0 */
+ edist = se_kdSetMate(space, &list->matches[u][i], idx,
+ upstreamrefstart+al->voff, upstreamrefstart+mb.a-1,
+ mb.b, al, 0, (u != k));
+
+ if(list->mateminedist > edist) {
+ list->mateminedist = edist;
+// list->noofmatematches = 0;
+ }// else if(list->mateminedist == mb.b) {
+ // list->noofmatematches++;
+ //}
+ //pairminedist ...
+ if(list->pairminedist > edist+list->matches[u][i].edist) {
+ list->pairminedist = edist+list->matches[u][i].edist;
+ }
+
+
+ list->matches[u][i].noofmatematches++;
+ }
+ }
+ }
+ }
+
+ /* release both peq tables */
+ for(u=0; u < 2; u++) {
+ for(i=0; i < seq->mapsize; i++) {
+ FREEMEMORY(space, peq[u][i]);
+ }
+ FREEMEMORY(space, peq[u]);
+ }
+
+ return 0;
+}
+
+
+
+/*--------------------------- bl_kdUpdateBestSeed ----------------------------
+ *
+ * @brief update the best seed record
+ * @author Steve Hoffmann
+ *
+ * The record is only overwritten if the new seed has more matches than the
+ * one stored in best. The reference position is taken from suffix array
+ * entry l and stored relative to the start of its subject sequence. For
+ * refstrand != 0 the read start is mirrored using the query length.
+ *
+ */
+
+void
+bl_kdUpdateBestSeed (bestseed_t *best, MultiCharSeq *seq, Suffixarray *s, Uint qrylen,
+    Uint readstart, Uint mat, Uint mis, Uint ins, Uint del, Uint l, Uint refstrand)
+{
+
+  Uint refpos, chridx, chrbegin, chrend, seedlen;
+
+  /* locate the subject sequence and make the position relative to it */
+  refpos = s->suftab[l];
+  chridx = getMultiCharSeqIndex(seq, &seq->sequences[refpos]);
+  getMultiCharSeqIdxBounds(seq, chridx, &chrbegin, &chrend);
+  refpos -= chrbegin;
+
+  /* nothing to do unless the new seed has more matches */
+  if(!(best->mat < mat)) {
+    return ;
+  }
+
+  seedlen = mat+mis+ins;
+
+  best->maxintervalconstraint = 0;
+  best->maxevalconstraint = 0;
+  if(refstrand == 0) {
+    best->readstart = readstart;
+  } else {
+    best->readstart = qrylen-readstart-seedlen;
+  }
+  assert(qrylen >= readstart + 1);
+  best->mat= mat;
+  best->mis = mis;
+  best->ins = ins;
+  best->del = del;
+  best->len = seedlen;
+  best->refidx = chridx;
+  best->refpos = refpos;
+  best->refstrand = refstrand;
+
+  return ;
+}
+
+/*--------------------------- se_kdMatchStemAlign ----------------------------
+ *
+ * @brief align the seeds in the matchstem using bv algorithmics
+ * @author Steve Hoffmann
+ *
+ * For both entries of stems/seqs every branch of every stem position is
+ * visited. Branches with an empty suffix interval (l > r), an E-value above
+ * nfo->maxevalue or an interval wider than nfo->M are skipped. For each
+ * suffix of the remaining branches a reference window is set up and either
+ * accepted directly (seed covers the whole query without errors) or
+ * aligned with myersbitmatrix/bitvectorbacktrack. Alignments within the
+ * edit distance limits are stored in the returned match list.
+ *
+ * best is updated with bl_kdUpdateBestSeed for every branch that passes
+ * the filters; its maxevalconstraint/maxintervalconstraint flags are set
+ * while best->mat is still 0.
+ *
+ */
+
+
+gmatchlist_t*
+se_kdMatchStemAlign(void *space, Suffixarray *s, MultiCharSeq *seq,
+ matchstem_t **stems, char **seqs, Uint len, karlin_t *stats,
+ segemehl_t *nfo, Uint *enctab, bitvector* D, bestseed_t *best) {
+
+ Uint u, k, j, l, r, q, i, pos, mat, mis, ins, del;
+ int maxedist, bestedist, scr, skipmargin=0;
+ Uint *check=NULL;
+ Uint checklen=0;
+ double E;
+ bitvector *peq[2];
+ PairSint mb;
+ MultiCharSeqAlignment mcsa;
+ gmatchlist_t *list;
+
+ /* edit distance limit derived from the accuracy (in percent) of nfo */
+ maxedist = bestedist = len - floor(((double)nfo->accuracy * len)/100.);
+ skipmargin = 40*((double)maxedist/100.);
+ list = bl_gmatchlistInit(space, maxedist, 0);
+
+ peq[0] = getpeq(NULL, seqs[0], len, seq->map,
+ seq->mapsize, enctab);
+ peq[1] = getpeq(NULL, seqs[1], len, seq->map,
+ seq->mapsize, enctab);
+
+ for(u = 0; u < 2; u++) {
+ for(i = 0; i < len; i++) {
+ for(q = 0; q < stems[u][i].noofbranches; q++) {
+
+ l = stems[u][i].branches[q].l;
+ r = stems[u][i].branches[q].r;
+ mat = stems[u][i].branches[q].mat;
+ mis = stems[u][i].branches[q].mis;
+ ins = stems[u][i].branches[q].ins;
+ del = stems[u][i].branches[q].del;
+
+ E = kd_getBranchEvalue(stems[u], i, q, len, s->numofsuffixes, stats);
+
+ if(l <= r && E > nfo->maxevalue && best->mat == 0) best->maxevalconstraint=1;
+ if(l <= r && (r-l) <= nfo->M && best->mat == 0) best->maxintervalconstraint=1;
+
+ /* skip empty, insignificant or too wide suffix intervals */
+ if (l > r || E > nfo->maxevalue || (r-l) > nfo->M)
+ continue;
+
+ bl_kdUpdateBestSeed(best, seq, s, len, i, mat, mis, ins, del, l, u);
+
+ for(j = l; j <= r; j++) {
+ pos = s->suftab[j];
+
+ /* seeds with errors or partial seeds get a padded reference window */
+ if(mat != len || mis+ins+del != 0) {
+ initMultiCharSeqAlignment(space, &mcsa, seq, pos,
+ i+maxedist, len+2*(maxedist+1), u, NULL, seqs[u], len);
+ } else {
+ initMultiCharSeqAlignment(space, &mcsa, seq, pos,
+ 0, len, u, NULL, seqs[u], len);
+ }
+
+ /*skip or update identical matches*/
+ /* NOTE(review): mcsa.refstart-skipmargin mixes signed and unsigned
+ * operands and may wrap for windows near position 0 - confirm */
+ for(k = 0; k < checklen; k++)
+ if (check[k] >= mcsa.refstart-skipmargin &&
+ check[k] <= mcsa.refstart+skipmargin)
+ break;
+
+ if (k < checklen) {
+ wrapMultiCharSeqAlignment(space, &mcsa);
+ continue;
+ }
+
+ /* remember the window start so that nearby windows are skipped */
+ check = ALLOCMEMORY(space, check, Uint, checklen+1);
+ check[checklen++]= mcsa.refstart;
+
+ if (mat == len && mis+ins+del == 0) {
+ /* the seed spans the whole query without errors: no alignment needed */
+ scr = kd_getBranchScore(stems[u], i, q);
+
+ for(k=0; k < len; k++) {
+ insertEop(mcsa.al, Replacement);
+ }
+
+ mb.b = getEdist(mcsa.al);
+
+ if (mb.b <= maxedist && mb.b <= bestedist){
+
+ list = se_kdMatchListAdd(list, mcsa.subidx,
+ pos, pos+len-1, mb.b, scr, 0, len-1, E, mcsa.al,
+ u, -1, -1, 0, -1, -1, 0, 0);
+
+ if(nfo->bestonly) bestedist = list->minedist;
+
+ } else {
+ wrapMultiCharSeqAlignment(space, &mcsa);
+ }
+
+ } else {
+
+ myersbitmatrix(NULL, seqs[u], len, mcsa.refseq, mcsa.reflen,
+ seq->map, seq->mapsize, enctab, len-bestedist, peq[u],
+ &mb, D, mcsa.reflen);
+
+ if (mb.a != -1 && mb.b <= maxedist &&
+ mb.b <= bestedist && mb.a < mcsa.reflen) {
+ bitvectorbacktrack(mcsa.al, D, mcsa.reflen, len, mb.a);
+
+ /* the edit distance is recomputed from the backtracked alignment */
+ mb.b = getEdist(mcsa.al);
+
+ if (mb.b <= maxedist && mb.b <= bestedist){
+
+ /*skip or update identical matches*/
+ for(k = 0; k < list->n[u]; k++)
+ if (list->matches[u][k].p == mcsa.refstart+mcsa.al->voff)
+ break;
+
+ if (k < list->n[u]) {
+ /* a match with the same start exists: keep the better one */
+ if (list->matches[u][k].edist <= mb.b){
+ wrapMultiCharSeqAlignment(space, &mcsa);
+ } else {
+ scr = kd_getBranchScore(stems[u], i, q);
+
+ list = se_kdMatchListSet(space, list, mcsa.subidx,
+ mcsa.refstart+mcsa.al->voff,
+ mcsa.refstart+mb.a-1,
+ mb.b, scr, 0, len-1, E, mcsa.al, u, k);
+ }
+ continue;
+ }
+
+ scr = kd_getBranchScore(stems[u], i, q);
+
+ list=se_kdMatchListAdd(list, mcsa.subidx,
+ mcsa.refstart+mcsa.al->voff,
+ mcsa.refstart+mb.a-1, mb.b, scr, 0, len-1, E, mcsa.al,
+ u, -1, -1, 0, -1, -1, 0, 0);
+
+ if(nfo->bestonly) bestedist = list->minedist;
+
+ } else {
+ wrapMultiCharSeqAlignment(space, &mcsa);
+ }
+ } else {
+ wrapMultiCharSeqAlignment(space, &mcsa);
+ }
+ }
+ }
+ }
+ }
+
+ /* release the peq table of this pass and reset the list of seen windows */
+ for(j=0; j < seq->mapsize; j++) {
+ FREEMEMORY(space, peq[u][j]);
+ }
+
+ FREEMEMORY(space, peq[u]);
+ if(check) {
+ FREEMEMORY(space, check);
+ check = NULL;
+ checklen = 0;
+ }
+ }
+
+ return list;
+}
+
+
+
+/*-------------------------------- se_kdFixIn --------------------------------
+ *
+ * @brief try to "fix in" an extra alignment for the unaligned query stretch
+ *        between a fragment and its successor on the read
+ * @author Steve Hoffmann
+ *
+ * Every input fragment a[i] is copied into a newly grown array b. For each
+ * fragment whose aligned query length, aligned reference length and score
+ * reach nfo->minfragmentalignlen / nfo->minfragmentalignscore, all fragments
+ * are scanned for the qualifying predecessor (largest query start below
+ * ustart) and successor (smallest query start above ustart).
+ * If a successor exists on the same subidx and strand and starts beyond
+ * uend+5, the query stretch in between (length flen) is searched in the
+ * reference interval spanned by pos and nextpos with myersbitvector(). On a
+ * hit the window is narrowed, aligned with myersbitmatrix() and
+ * bitvectorbacktrack(), and appended to b if its lengths and score are
+ * strictly greater than the minimum values.
+ *
+ * @param seq        multi sequence holding the reference (seq->sequences)
+ * @param a          input fragments; the array itself is freed before return
+ * @param seqs       query strings, indexed by fragment strand (0 or 1)
+ * @param qrylen     length of the query
+ * @param list       not used in this function
+ * @param scores     score array handed to getAlignScore()
+ * @param indel      indel score handed to getAlignScore()
+ * @param enctab     encoding table handed to the bit vector routines
+ * @param D          bit vector matrix handed to myersbitmatrix() and
+ *                   bitvectorbacktrack()
+ * @param noofaligns in: number of fragments in a; out: number of entries in
+ *                   the returned array
+ * @param nfo        options (accuracy, minfragmentalignlen,
+ *                   minfragmentalignscore)
+ * @return newly allocated fragment array that replaces a
+ *
+ */
+
+MultiCharSeqAlignment*
+se_kdFixIn (MultiCharSeq *seq, MultiCharSeqAlignment *a, char **seqs, Uint qrylen,
+ gmatchlist_t *list, int *scores, int indel,
+ Uint *enctab, bitvector *D, unsigned int *noofaligns, segemehl_t *nfo)
+{
+
+ bitvector *peq[2];
+ PairSint mb;
+ Alignment *al;
+
+ char *fseq[2], *refseq, *refseq2;
+ unsigned int flen, reflen, reflen2, maxedist, w, idx, pos, refpos, u, foff[2], score, chrstart, chrend;
+ PairSint scan[2];
+ char nextstrand='+', prevstrand='+', strand;
+ Uint i, j, k, prevpos, nextidx, nextpos, vlen = 0,
+ ustartj = 0,
+ ulen=0, ustart=0, nextustart=0,
+ prevustart=0, uend=0;
+ MultiCharSeqAlignment *b=NULL;
+ unsigned int nooffragments = *noofaligns;
+ void *space = NULL; /* always NULL here; handed to ALLOCMEMORY/FREEMEMORY and initMultiCharSeqAlignment */
+
+
+ /* k counts the entries written to b; each input fragment is copied first */
+ for(k=0, i=0; i < nooffragments; i++) {
+
+ b = ALLOCMEMORY(space, b, MultiCharSeqAlignment, k+1);
+ memmove(&b[k], &a[i], sizeof(MultiCharSeqAlignment));
+ k++;
+
+ ustart = a[i].al->uoff;
+ ulen = getUalignlen(a[i].al);
+ idx = a[i].subidx;
+ vlen = getValignlen(a[i].al);
+ score = getAlignScore(a[i].al, scores, indel);
+
+ /* pos: reference coordinate of this fragment that is paired with nextpos below */
+ if(a[i].strand == 1) {
+ strand = '-';
+ pos = a[i].refstart + a[i].al->voff + getValignlen(a[i].al) - 1;
+ } else {
+ strand = '+';
+ pos = a[i].refstart + a[i].al->voff;
+ }
+ if (a[i].strand == 1) { /* strand 1: recompute ustart/uend relative to qrylen */
+ uend = qrylen - ustart - 1;
+ ustart = uend - ulen + 1;
+ } else {
+ uend = ustart + ulen - 1;
+ }
+
+
+ if (ulen >= nfo->minfragmentalignlen &&
+ vlen >= nfo->minfragmentalignlen &&
+ score >= nfo->minfragmentalignscore) {
+
+ /* NOTE(review): -1 is stored in Uint/char trackers, apparently as "not found"; 0 means "unset" for prevustart/nextustart */
+ prevpos = -1;
+ nextidx = -1;
+ nextpos = -1;
+ prevstrand = -1;
+ nextstrand = -1;
+ prevustart =0;
+ nextustart = 0;
+
+ /* scan all fragments for the closest qualifying predecessor and successor by query start */
+ for(j=0; j < nooffragments; j++) {
+
+ if(a[j].strand == 1) {
+ ustartj = qrylen - a[j].al->uoff - getUalignlen(a[j].al);
+ } else {
+ ustartj = a[j].al->uoff;
+ }
+
+ if (ustartj < ustart && (!prevustart || ustartj >= prevustart) &&
+ getUalignlen(a[j].al) >= nfo->minfragmentalignlen &&
+ getValignlen(a[j].al) >= nfo->minfragmentalignlen &&
+ getAlignScore(a[j].al, scores, indel) >= nfo->minfragmentalignscore) {
+
+ if(a[j].strand == 0) {
+ prevpos = a[j].refstart + a[j].al->voff + getValignlen(a[j].al) - 1;
+ prevstrand = '+';
+ if(a[j].strand == 1) { /* cannot be true inside the strand == 0 branch */
+ prevstrand = '-';
+ }
+ } else {
+ prevpos = a[j].refstart + a[j].al->voff;
+ prevstrand = '-';
+ if(a[j].strand == 0) { /* cannot be true inside the strand != 0 branch */
+ prevstrand = '+';
+ }
+ }
+ prevustart = ustartj;
+ }
+
+ if (ustartj > ustart && (!nextustart || ustartj <= nextustart) &&
+ getUalignlen(a[j].al) >= nfo->minfragmentalignlen &&
+ getValignlen(a[j].al) >= nfo->minfragmentalignlen &&
+ getAlignScore(a[j].al, scores, indel) >= nfo->minfragmentalignscore) {
+
+ nextidx = a[j].subidx;
+
+ if(a[j].strand == 0) {
+ nextpos = a[j].refstart + a[j].al->voff;
+ nextstrand = '+';
+ } else {
+ nextpos = a[j].refstart + a[j].al->voff + getValignlen(a[j].al) - 1;
+ nextstrand = '-';
+ }
+ nextustart = ustartj;
+ }
+ }
+ /* successor on the same subidx and strand that starts beyond uend+5: try to align the stretch in between */
+ if(nextustart && nextustart > uend+5 && nextidx == idx && strand == nextstrand) {
+
+
+ flen = nextustart - uend - 1;
+ foff[0] = uend+1;
+ foff[1] = qrylen-nextustart;
+ fseq[0] = &seqs[0][foff[0]];
+ fseq[1] = &seqs[1][foff[1]];
+
+ /* reference interval between pos and nextpos; edit budget derived from nfo->accuracy (taken as percent) */
+ refseq = &seq->sequences[MIN(pos,nextpos)];
+ reflen = MAX(pos,nextpos) - MIN(pos,nextpos);
+
+ maxedist = flen - floor(((double)nfo->accuracy * flen)/100.);
+#ifdef DEBUGFIXIN
+ fprintf(stdout, "attempt fix in %u]-[%u into genomic interval %u]-[%u; reflen:%u flen:%u\n",
+ uend, nextustart, pos, nextpos, reflen, flen);
+#endif
+ peq[0] = getpeq(NULL, fseq[0], flen, seq->map, seq->mapsize, enctab);
+ peq[1] = getpeq(NULL, fseq[1], flen, seq->map, seq->mapsize, enctab);
+ /* NOTE(review): both peq tables are built, only peq[w] is used before both are freed */
+ w = a[i].strand;
+ scan[w] = myersbitvector(NULL, fseq[w], flen, refseq, reflen, seq->map,
+ seq->mapsize, enctab, maxedist, peq[w]);
+
+ if(scan[w].a != -1) {
+ getMultiCharSeqIdxBounds(seq, idx, &chrstart, &chrend);
+#ifdef DEBUGFIXIN
+ fprintf(stdout, "found matsch at %u wiff edist %d (strand:%d)\n",
+ scan[w].a, scan[w].b, w);
+#endif
+ refpos = MIN(pos,nextpos);
+ refpos += (scan[w].a > flen) ? scan[w].a-flen : 0;
+ refpos += (scan[w].a > flen && scan[w].a-flen > 100) ? -100 : 0; /* step back 100 positions if the shift above exceeded 100 */
+ refpos = (refpos >= chrstart) ? refpos : chrstart; /* do not start before chrstart */
+ refseq2 = &seq->sequences[refpos];
+ reflen2 = (chrend-(refpos+flen) > 200) ? flen + 200 : chrend-(refpos+flen); /* NOTE(review): unsigned arithmetic, wraps if refpos+flen > chrend -- confirm this cannot happen */
+
+#ifdef DEBUGFIXIN
+ fprintf(stdout, "narrow down to [%u,%u] with length %d\n",
+ refpos, refpos+reflen2, reflen2);
+#endif
+ myersbitmatrix(NULL, fseq[w], flen, refseq2, reflen2,
+ seq->map, seq->mapsize, enctab, flen-maxedist, peq[w], &mb, D, reflen2);
+#ifdef DEGUGFIXIN /* NOTE(review): spelled differently from DEBUGFIXIN -- typo or deliberately disabled? confirm */
+ fprintf(stdout, "aligned matsch at %u wiff edist %d (maxedist:%d, reflen2:%d)\n",
+ mb.a, mb.b, maxedist, reflen2);
+#endif
+ if (mb.a != -1 && mb.b <= maxedist && mb.a < reflen2) {
+ al = ALLOCMEMORY(space, NULL, Alignment, 1);
+
+ initAlignment(al, seqs[w], qrylen, foff[w], refseq2, reflen2, 0);
+ bitvectorbacktrack(al, D, reflen2, flen, mb.a);
+ /* strict '>' here, whereas the fragment checks above use '>=' */
+ if(getUalignlen(al) > nfo->minfragmentalignlen &&
+ getValignlen(al) > nfo->minfragmentalignlen &&
+ getAlignScore(al, scores, indel) > nfo->minfragmentalignscore) {
+
+ b = ALLOCMEMORY(space, b, MultiCharSeqAlignment, k+1);
+
+ initMultiCharSeqAlignment(space, &b[k], seq, refpos,
+ 0, reflen2, w, NULL, seqs[w], qrylen);
+ /* discard b[k].al as left by the init call and install the backtracked alignment */
+ wrapAlignment(b[k].al);
+ FREEMEMORY(space, b[k].al);
+ b[k].al = al;
+ k++;
+#ifdef DEGUBFIXIN /* NOTE(review): spelled differently from DEBUGFIXIN -- typo or deliberately disabled? confirm */
+ showAlign(al, stdout);
+#endif
+ } else {
+ wrapAlignment(al);
+ FREEMEMORY(space, al);
+ }
+ }
+ }
+
+ for(w=0; w < 2; w++) {
+ for(u=0; u < seq->mapsize; u++) {
+ FREEMEMORY(space, peq[w][u]);
+ }
+ FREEMEMORY(space, peq[w]);
+ }
+ }
+
+ if(prevpos && prevstrand == nextstrand && prevstrand != strand) { /* empty body: no effect */
+ //fprintf(stdout, "double check frag\n");
+ }
+ }
+ }
+ /* the input array is released here; callers have to continue with the returned array */
+ FREEMEMORY(space, a);
+
+ *noofaligns = k;
+ return b;
+
+}
+
+
+/*------------------------- se_kdAlignEvalSplitAlign -------------------------
+ *
+ * @brief post processing of the multisplitalignment
+ * @author Steve Hoffmann
+ *
+ * Walks over the noofaligns fragments in a. A fragment is accepted if its
+ * aligned query length, aligned reference length and score reach
+ * nfo->minfragmentalignlen / nfo->minfragmentalignscore, or if its pass flag
+ * is set and ulen >= 8. For every accepted fragment the function
+ *  - adds ulen to *totalcover and the score to *totalscore,
+ *  - determines the qualifying predecessor and successor fragment by query
+ *    start (subidx, reference position and strand character of each),
+ *  - hands a copy of the alignment plus this neighbour information to
+ *    se_kdMatchListAdd(),
+ *  - sets *trans to 1 if strand or subidx differ from the previously
+ *    accepted fragment.
+ *
+ * @param seq        multi sequence (only read in the DEBUGTRANSALIGN dump)
+ * @param a          fragments to evaluate
+ * @param seqs       not used in this function
+ * @param qrylen     length of the query
+ * @param list       match list receiving the accepted fragments
+ * @param totalcover incremented by ulen of every accepted fragment
+ * @param totalscore incremented by the score of every accepted fragment
+ * @param trans      set to 1 on a strand/subidx change, never reset here
+ * @param scores     score array handed to getAlignScore()
+ * @param indel      indel score handed to getAlignScore()
+ * @param enctab     not used in this function
+ * @param D          not used in this function
+ * @param noofaligns number of entries in a
+ * @param nfo        options (accuracy, minfragmentalignlen,
+ *                   minfragmentalignscore)
+ * @return 1 if any fragment (accepted or not) exceeds the accuracy-derived
+ *         edit distance or fewer than two fragments were accepted,
+ *         0 otherwise
+ *
+ */
+
+char
+se_kdAlignEvalSplitAlign (MultiCharSeq *seq, MultiCharSeqAlignment *a, char **seqs,
+ Uint qrylen, gmatchlist_t *list, Uint *totalcover, int *totalscore,
+ unsigned char *trans, int *scores, int indel, Uint *enctab, bitvector *D,
+ unsigned int noofaligns, segemehl_t *nfo)
+{
+
+ Alignment *alcopy;
+
+ unsigned char laststrand=0, purge = 0;
+ char nextstrand='+', prevstrand='+';
+ Uint i, j, k, previdx, prevpos, nextidx, nextpos,
+ lastsubidx =0 , totaledist = 0, ustartj = 0,
+ ulen=0, vlen=0, vstart=0, ustart=0, nextustart=0,
+ prevustart=0, uend=0, edist=0, fragno = 0;
+ int score;
+#ifdef DEBUGTRANSALIGN
+ Uint sub_start, sub_end;
+#endif
+ for(k=0, i=0; i < noofaligns; i++) {
+
+ //remove trailing indels
+ //if(i == cur->nooffragments && clean5prime) {
+ // clean5prime(a[i].al);
+ //}
+
+ ustart = a[i].al->uoff;
+ vstart = a[i].al->voff;
+ ulen = getUalignlen(a[i].al);
+ vlen = getValignlen(a[i].al);
+ score = getAlignScore(a[i].al, scores, indel);
+ edist = getEdist(a[i].al);
+
+
+#ifdef DEBUGTRANSALIGN
+ getMultiCharSeqIdxBounds(seq, a[i].subidx, &sub_start, &sub_end);
+
+ fprintf(stdout, "frag:%d, [%u,%u], off:%d, [%u,%u], voff:%d, strand:%d, score:%d, edist:%d, ulen:%d, vlen:%d\n",
+ i, a[i].qrystart, a[i].qrystart+a[i].qrylen-1, ustart,
+ a[i].refstart-sub_start, a[i].refstart-sub_start+a[i].reflen-1,
+ vstart, a[i].strand, score, edist, ulen, vlen);
+
+ showAlign(a[i].al, stdout);
+#endif
+
+
+ /* flag the whole alignment for purging if this fragment exceeds the accuracy-derived edit distance */
+ if(edist > (ulen - floor(((double)nfo->accuracy * ulen)/100.))) {
+#ifdef DEBUGTRANSALIGN
+ fprintf(stdout, "purging!\n");
+#endif
+ purge = 1;
+ }
+ /* accept fragments reaching the minimum length/score, or flagged 'pass' with ulen >= 8 */
+ if ((ulen >= nfo->minfragmentalignlen &&
+ vlen >= nfo->minfragmentalignlen &&
+ score >= nfo->minfragmentalignscore) || (a[i].pass && ulen >= 8)) {
+
+ *totalcover += ulen;
+ *totalscore += score;
+ totaledist += edist; /* accumulated but not read again in this function */
+ k++;
+
+ alcopy = ALLOCMEMORY(space, NULL, Alignment, 1); /* NOTE(review): 'space' is not declared in this function -- presumably the macro drops it or a file-scope symbol exists; confirm */
+ copyAlignment(alcopy, a[i].al);
+ if (a[i].strand == 1) { /* strand 1: recompute ustart/uend relative to qrylen */
+ uend = qrylen - ustart - 1;
+ ustart = uend - ulen + 1;
+ } else {
+ uend = ustart + ulen - 1;
+ }
+ /* NOTE(review): -1 is stored in Uint/char trackers, apparently as "no neighbour"; 0 means "unset" for prevustart/nextustart */
+ previdx = -1;
+ prevpos = -1;
+ nextidx = -1;
+ nextpos = -1;
+ prevstrand = -1;
+ nextstrand = -1;
+ prevustart =0;
+ nextustart = 0;
+
+ /* scan all fragments for the closest qualifying predecessor and successor by query start */
+ for(j=0; j < noofaligns; j++) {
+
+ if(a[j].strand == 1) {
+ ustartj = qrylen - a[j].al->uoff - getUalignlen(a[j].al);
+ } else {
+ ustartj = a[j].al->uoff;
+ }
+
+ if (ustartj < ustart && (!prevustart || ustartj >= prevustart) &&
+ (getUalignlen(a[j].al) >= nfo->minfragmentalignlen || a[j].pass) &&
+ (getValignlen(a[j].al) >= nfo->minfragmentalignlen || a[j].pass) &&
+ (getAlignScore(a[j].al, scores, indel) >= nfo->minfragmentalignscore || a[j].pass)) {
+
+ previdx = a[j].subidx;
+
+ if(a[j].strand == 0) {
+ prevpos = a[j].refstart + a[j].al->voff + getValignlen(a[j].al) - 1;
+ prevstrand = '+';
+ if(a[j].strand == 1) { /* cannot be true inside the strand == 0 branch */
+ prevstrand = '-';
+ }
+ } else {
+ prevpos = a[j].refstart + a[j].al->voff;
+ prevstrand = '-';
+ if(a[j].strand == 0) { /* cannot be true inside the strand != 0 branch */
+ prevstrand = '+';
+ }
+ }
+ prevustart = ustartj;
+ }
+
+ if (ustartj > ustart && (!nextustart || ustartj <= nextustart) &&
+ (getUalignlen(a[j].al) >= nfo->minfragmentalignlen || a[j].pass) &&
+ (getValignlen(a[j].al) >= nfo->minfragmentalignlen || a[j].pass) &&
+ (getAlignScore(a[j].al, scores, indel) >= nfo->minfragmentalignscore || a[j].pass)) {
+
+ nextidx = a[j].subidx;
+
+ if(a[j].strand == 0) {
+ nextpos = a[j].refstart + a[j].al->voff;
+ nextstrand = '+';
+ } else {
+ nextpos = a[j].refstart + a[j].al->voff + getValignlen(a[j].al) - 1;
+ nextstrand = '-';
+ }
+ nextustart = ustartj;
+ }
+ }
+ /* register the alignment copy together with its neighbour info; fragno is the running number of accepted fragments */
+ list = se_kdMatchListAdd(list, a[i].subidx,
+ a[i].refstart + vstart,
+ a[i].refstart + vstart + vlen - 1,
+ edist, score, ustart, //ustart + ulen - 1,
+ uend, .0, alcopy, a[i].strand,
+ previdx, prevpos, prevstrand,
+ nextidx, nextpos, nextstrand, fragno);
+ /* NOTE(review): 'list' is a by-value parameter; a changed pointer returned above would not reach the caller */
+ fragno++;
+ /* a change of strand or subidx between consecutive accepted fragments sets *trans */
+ if (k > 1 && (laststrand != a[i].strand || lastsubidx != a[i].subidx)) {
+ *trans = 1;
+ }
+
+ laststrand = a[i].strand;
+ lastsubidx = a[i].subidx;
+ } else {
+ //purge =1;
+ }
+ }
+ /* fewer than two accepted fragments also leads to purging */
+ if(fragno < 2) purge = 1;
+
+ return purge;
+}
+
+/*--------------------------- se_kdAlignSplitChain ---------------------------
+ *
+ * @brief align a chain of fragments using local multi spliced alignment
+ * @author Steve Hoffmann
+ *
+ * The chains are sorted with cmp_chainscores; all chains scoring at least
+ * 90% of the best chain are condensed (condenseChain) and re-sorted with
+ * cmp_chainlocality. Only the first chain of that order is aligned: for each
+ * of its fragments a reference window is set up
+ * (initMultiCharSeqAlignmentOpt), the windows are aligned jointly with
+ * localmultisplicedmatrixopt / localmultisplicedtracebackopt, gaps are
+ * post-processed by se_kdFixIn and the result is evaluated by
+ * se_kdAlignEvalSplitAlign. If total score or cover (in percent of qrylen)
+ * stay below nfo->minsplicedalignscore / nfo->minsplicedaligncover, or the
+ * evaluation asks for purging, list[0] is replaced by an empty list.
+ *
+ * @param space      handle passed to the memory macros and helpers
+ * @param chains     chains to choose from; sorted in place
+ * @param noofchains number of entries in chains
+ * @param arr        suffix array handed to condenseChain / showChains
+ * @param seq        multi sequence holding the reference
+ * @param querydesc  description of the query (diagnostics, alignment init)
+ * @param stems      not used in this function
+ * @param seqs       query strings indexed by strand (0 or 1)
+ * @param qrylen     length of the query
+ * @param scores     match/mismatch scores
+ * @param indel      indel score
+ * @param transition transition score handed to the spliced alignment
+ * @param events     not used in this function
+ * @param enctab     encoding table handed to se_kdFixIn
+ * @param D          bit vector matrix handed to se_kdFixIn
+ * @param nfo        options
+ * @return array of noofchains+1 list pointers of which only element 0 is
+ *         initialized; the caller owns the array and list[0]
+ *
+ * Changes compared to the previous version: the error path taken when no
+ * alignment matrix is returned now releases all buffers allocated here
+ * (they were leaked before), and its diagnostic dump prints reflens[i]
+ * characters of each reference window instead of qrylen characters.
+ *
+ */
+
+gmatchlist_t**
+se_kdAlignSplitChain (void *space, branchChain_t *chains, Uint noofchains,
+    Suffixarray *arr, MultiCharSeq *seq, char *querydesc, matchstem_t **stems,
+    char **seqs, Uint qrylen, int *scores, int indel, int transition,
+    spliceevents_t *events, Uint *enctab, bitvector *D, segemehl_t *nfo) {
+
+  Uint k, i, j, q, start, floff = 0, flen =0,
+       maxedist,
+       *strands, *starts, *ends, *tstarts, *tends, *lengths, *reflens, totalcover = 0,
+       sub_start, sub_end;
+
+  unsigned int margin=50, maxmargin=100, noofaligns; //50;
+  branchChain_t *newchains;
+  unsigned char trans=0, purge=0;
+  char **refseqs;
+  char ***K = NULL;
+  PairUint *bestscr;
+  int ***M, **lmr, **lmv, **lmc, totalscore=0;
+  branchChain_t *cur;
+  MultiCharSeqAlignment *a;
+  Alignment **aligns;
+  gmatchlist_t **list=NULL;
+  PairUint *diag;
+#ifdef DEBUGKBAND
+  char ***B;
+#endif
+
+  /* edit distance budget derived from nfo->accuracy (taken as percent) */
+  maxedist = qrylen - floor(((double)nfo->accuracy * qrylen)/100.);
+  list = ALLOCMEMORY(space, NULL, gmatchlist_t*, noofchains+1);
+  list[0] = bl_gmatchlistInit(space, maxedist, 0);
+
+  if(noofchains == 0) return list;
+
+  qsort(chains, noofchains, sizeof(branchChain_t), cmp_chainscores);
+  double maxchainscore = (double) chains[0].score;
+
+  /* k: number of leading chains scoring at least 90% of the best chain */
+  for(k=0; k < noofchains; k++) {
+    if(chains[k].score < maxchainscore*0.9) break;
+  }
+
+#ifdef DEBUGTRANSALIGN
+  fprintf(stdout, "before condensing.\n");
+  showChains(chains, k, arr, stdout, seqs[1], qrylen);
+#endif
+
+  newchains = condenseChain(chains, k, seq, arr);
+
+#ifdef DEBUGTRANSALIGN
+  fprintf(stdout, "after condensing.\n");
+  showChains(newchains, k, arr, stdout, seqs[1], qrylen);
+#endif
+
+  qsort(newchains, k, sizeof(branchChain_t), cmp_chainlocality);
+
+
+#ifdef DEBUGTRANSALIGN
+  fprintf(stdout, "after sorting.\n");
+  showChains(newchains, k, arr, stdout, seqs[1], qrylen);
+#endif
+
+
+  /* only the first chain after sorting is processed */
+  q = 0;
+  floff = 0;
+  flen = 0;
+  trans = 0;
+  purge = 0;
+  totalscore = 0;
+  totalcover = 0;
+
+  cur = &newchains[q];
+
+  /* a single fragment cannot form a split alignment */
+  if(cur->nooffragments <= 1) {
+    wrapChains(space, newchains, k);
+    FREEMEMORY(space, newchains);
+    return list;
+  }
+
+
+  /* NOTE(review): macro name differs from DEBUGTRANSALIGN; the dump below
+   * reads chains[1] and must not be enabled when only one chain exists */
+#ifdef DEBUBTRANSALIGN
+  fprintf(stdout, "nooffrags: %d; scr1:%d scr:2:%d\n", cur->nooffragments,
+      chains[0].score, chains[1].score);
+#endif
+
+  a = ALLOCMEMORY(space, NULL, MultiCharSeqAlignment, cur->nooffragments);
+  reflens = ALLOCMEMORY(space, NULL, Uint, cur->nooffragments);
+  strands = ALLOCMEMORY(space, NULL, Uint, cur->nooffragments);
+  starts = ALLOCMEMORY(space, NULL, Uint, cur->nooffragments);
+  ends = ALLOCMEMORY(space, NULL, Uint, cur->nooffragments);
+  tstarts = ALLOCMEMORY(space, NULL, Uint, cur->nooffragments);
+  tends = ALLOCMEMORY(space, NULL, Uint, cur->nooffragments);
+  lengths = ALLOCMEMORY(space, NULL, Uint, cur->nooffragments);
+  refseqs = ALLOCMEMORY(space, NULL, char*, cur->nooffragments);
+  aligns = ALLOCMEMORY(space, NULL, Alignment*, cur->nooffragments);
+  diag = ALLOCMEMORY(space, NULL, PairUint, cur->nooffragments);
+
+
+
+  /*attention cur is a new chain and includes the beststarts already.
+   * no need for bd[beststart] beyond this call*/
+
+  for(i=0; i < cur->nooffragments; i++) {
+
+    Uint uloff;
+    Uint uroff;
+
+    /* uloff/uroff: uncovered query positions to the left/right neighbour
+     * fragment, or to the query boundary for the first/last fragment */
+    if(i > 0) {
+      uloff = (cur->f[i-1]->end < cur->f[i]->start) ? cur->f[i]->start - cur->f[i-1]->end : 0;
+    } else {
+      uloff = cur->f[i]->start;
+    }
+
+    if(i < cur->nooffragments-1) {
+      uroff = (cur->f[i]->end < cur->f[i+1]->start) ? cur->f[i+1]->start - cur->f[i]->end : 0;
+    } else {
+      uroff = qrylen-cur->f[i]->end;
+    }
+
+    /* floff/flen: offset and length of the reference window for fragment i */
+    if(cur->f[i]->strand) {
+      floff = uroff + MIN(maxedist + margin, maxmargin);
+      flen = floff + (cur->f[i]->end - cur->f[i]->start) + uloff + MIN(maxmargin, maxedist + margin);
+    } else {
+      floff = maxedist + uloff + margin;
+      flen = floff + (cur->f[i]->end - cur->f[i]->start) + uroff + MIN(maxmargin, maxedist + margin);
+    }
+
+#ifdef DEBUGTRANSALIGN
+    fprintf(stdout, "strand:%d floff:%d\tflen:%d\t[%d,%d]->%d\n",
+        cur->f[i]->strand, floff, flen, cur->f[i]->start,
+        cur->f[i]->end, (i < cur->nooffragments-1) ? cur->f[i+1]->start : qrylen);
+#endif
+
+    start = cur->f[i]->substart;
+    initMultiCharSeqAlignmentOpt(space, &a[i], seq, start,
+        querydesc, seqs[cur->f[i]->strand], cur->f[i]->start, cur->f[i]->end,
+        qrylen, floff, flen, uloff, uroff, MIN(maxmargin, maxedist+margin), cur->f[i]->strand);
+
+    a[i].pass = cur->f[i]->pass;
+    aligns[i] = a[i].al;
+    refseqs[i] = a[i].refseq;
+    reflens[i] = a[i].reflen;
+    strands[i] = cur->f[i]->strand;
+    lengths[i] = a[i].qrylen;
+    tstarts[i] = a[i].qrystart;
+    tends[i] = tstarts[i]+lengths[i]-1;
+
+    if(strands[i]==0) {
+      starts[i] = a[i].qrystart;
+      ends[i] = starts[i]+lengths[i]-1;
+    } else {
+      starts[i] = qrylen - (a[i].qrystart + lengths[i]);
+      assert(qrylen >= a[i].qrystart+lengths[i]);
+      ends[i] = starts[i]+lengths[i]-1;
+      assert(ends[i] <= qrylen);
+    }
+
+    if(strands[i]==0) {
+      diag[i].b = a[i].floff;
+    } else {
+      /* guard against unsigned underflow of the subtraction below */
+      if(a[i].reflen >= MIN(maxmargin, maxedist+margin) + uroff + (cur->f[i]->end - cur->f[i]->start) - 1){
+        diag[i].b = a[i].reflen - MIN(maxmargin, maxedist+margin) - uroff - (cur->f[i]->end - cur->f[i]->start) + 1;
+      } else {
+        diag[i].b = 0;
+      }
+    }
+
+    diag[i].a = cur->f[i]->start - a[i].qrystart;
+
+
+//  -DDEBUGMULTISPLICEOPT -DDEBUGTRANSALIGN
+#ifdef DEBUGTRANSALIGN
+    fprintf (stdout, "query sequence of fragment %d [%d->%d]\n", i, starts[i], ends[i]);
+
+    Uint h=0;
+    for(h=0; h < lengths[i]; h++) {
+      if(h && (h%60) == 0) fprintf(stdout, "\n");
+      fprintf(stdout, "%c", a[i].query[starts[i]+h]);
+    }
+    fprintf(stdout,"\n");
+
+    fprintf (stdout, "reference sequence of fragment %d\n", i);
+    for(h=0; h < reflens[i]; h++) {
+      if(h && (h%60) == 0) fprintf(stdout, "\n");
+      fprintf(stdout, "%c", refseqs[i][h]);
+    }
+    fprintf(stdout,"\n");
+
+    fprintf(stdout, "%s\n qrylen:%d, fragment:%d, start:%d, strand:%d, curstart:%d, curend:%d, maxedist:%d mapping to [%d,%d]\n",
+        querydesc, qrylen, i, start, strands[i], starts[i], ends[i], maxedist, a[i].refstart, a[i].refstart+a[i].reflen -1);
+    fprintf(stdout, "\n");
+#endif
+  }
+
+  M = localmultisplicedmatrixopt(space, seqs[0], seqs[1], qrylen, lengths,
+      refseqs, reflens, strands, starts, ends, tstarts, tends, cur->nooffragments, indel, transition,
+      constscr, scores, &lmv, &lmr, &lmc, &bestscr, &K, diag);
+
+
+  if(M == NULL) {
+    fprintf(stderr, "empty matrix returned for seqs: '%s'/'%s' (%d)\n",
+        seqs[0], seqs[1], qrylen);
+
+    for(i=0; i < cur->nooffragments; i++) {
+
+      getMultiCharSeqIdxBounds(seq, a[i].subidx, &sub_start, &sub_end);
+      fprintf(stderr, "fragment %d: %d in %d[%d,%d] '",
+          i, 1 /*arr->suftab[bd[beststart][i]]*/, a[i].subidx, sub_start, sub_end);
+      /* the window holds reflens[i] characters; do not read beyond it */
+      for(j=0; j< reflens[i]; j++) fprintf(stderr, "%c", refseqs[i][j]);
+      fprintf(stderr, "'(%d) strand:%d\n", reflens[i], strands[i]);
+    }
+
+    /* release everything allocated in this function before bailing out.
+     * lmv/lmr/lmc/bestscr/K are left alone: it is not visible here whether
+     * the matrix routine sets them when it fails. cur points into
+     * newchains, so the chains are wrapped last. */
+    for(i=0; i < cur->nooffragments; i++) {
+      wrapMultiCharSeqAlignment(space, &a[i]);
+    }
+
+    wrapChains(space, newchains, k);
+    FREEMEMORY(space, newchains);
+
+    FREEMEMORY(space, reflens);
+    FREEMEMORY(space, refseqs);
+    FREEMEMORY(space, strands);
+    FREEMEMORY(space, starts);
+    FREEMEMORY(space, ends);
+    FREEMEMORY(space, tstarts);
+    FREEMEMORY(space, tends);
+    FREEMEMORY(space, lengths);
+    FREEMEMORY(space, diag);
+    FREEMEMORY(space, aligns);
+    FREEMEMORY(space, a);
+
+    return list;
+  }
+
+#ifdef DEBUGKBAND
+  B =
+#endif
+    localmultisplicedtracebackopt(space, M, seqs[0], seqs[1], qrylen, lengths,
+        refseqs, reflens, strands, starts, ends, tstarts, tends,
+        cur->nooffragments, indel, transition, constscr, scores,
+        aligns, lmv, lmr, lmc, bestscr);
+
+  /* the traceback has filled the alignments; release the matrices */
+  for(i=0; i < cur->nooffragments; i++) {
+    FREEMEMORY(space, lmv[i]);
+    FREEMEMORY(space, lmr[i]);
+    FREEMEMORY(space, lmc[i]);
+
+
+#ifdef DEBUGKBAND
+    Uint uloff;
+    Uint uroff;
+
+    if(i > 0) {
+      uloff = (cur->f[i-1]->end < cur->f[i]->start) ? cur->f[i]->start - cur->f[i-1]->end : 0;
+    } else {
+      uloff = cur->f[i]->start;
+    }
+
+    if(i < cur->nooffragments-1) {
+      uroff = (cur->f[i]->end < cur->f[i+1]->start) ? cur->f[i+1]->start - cur->f[i]->end : 0;
+    } else {
+      uroff = qrylen-cur->f[i]->end;
+    }
+
+    fprintf(stderr, "matrix %d of %d\n", i, cur->nooffragments);
+    fprintf (stderr, "query sequence of fragment %d (%d,%d)[%d->%d], starts:%u ends:%u fragstart:%d fragend:%d, uloff:%d, uroff:%d, floff:%d, reflen:%d\n", i, cur->f[i]->start, cur->f[i]->end, starts[i], ends[i], diag[i].a, diag[i].b, cur->f[i]->start, cur->f[i]->end, uloff, uroff, a[i].floff, a[i].reflen);
+
+    dumprecursionmatrix2D(stderr, M[i], B[i], K[i], lengths[i], reflens[i], &diag[i]);
+#endif
+
+    for(j=0; j < lengths[i]+1; j++) {
+#ifdef DEBUGKBAND
+      FREEMEMORY(space, B[i][j]);
+      FREEMEMORY(space, K[i][j]);
+#endif
+      FREEMEMORY(space, M[i][j]);
+    }
+#ifdef DEBUGKBAND
+    FREEMEMORY(space, B[i]);
+    FREEMEMORY(space, K[i]);
+#endif
+    FREEMEMORY(space, M[i]);
+  }
+#ifdef DEBUGKBAND
+  FREEMEMORY(space, B);
+  FREEMEMORY(space, K);
+#endif
+
+  FREEMEMORY(space, M);
+  FREEMEMORY(space, bestscr);
+  FREEMEMORY(space, diag);
+
+  noofaligns = cur->nooffragments;
+
+  /* se_kdFixIn frees the array passed in and returns its replacement */
+  a = se_kdFixIn(seq, a, seqs, qrylen, list[q], scores, indel, enctab,
+      D, &noofaligns, nfo);
+
+  purge = se_kdAlignEvalSplitAlign(seq, a, seqs, qrylen, list[q], &totalcover,
+      &totalscore, &trans, scores, indel, enctab, D, noofaligns, nfo);
+
+  /* convert the covered query positions into percent of the query length */
+  totalcover *= 100;
+  totalcover /= qrylen;
+
+#ifdef DEBUGTRANSALIGN
+  fprintf(stdout, "qrylen:%d, totalcover %d, totalscore %d, noofchains %d, mincover:%d, mintotalscore:%d \n",
+      qrylen, totalcover, totalscore, noofchains, nfo->minsplicedaligncover, nfo->minsplicedalignscore);
+#endif
+
+
+  if(totalscore >= nfo->minsplicedalignscore &&
+      totalcover >= nfo->minsplicedaligncover /*&& q > 1*/
+      && !purge) {
+    /* alignment accepted: list[q] is kept as filled by the evaluation */
+  } else {
+    /* alignment rejected: hand back an empty list instead */
+    bl_gmatchlistDestruct(space, list[q]);
+    list[q] = bl_gmatchlistInit(space, maxedist, 0);
+  }
+
+  for(i=0; i < noofaligns; i++) {
+    wrapMultiCharSeqAlignment(space, &a[i]);
+  }
+
+  wrapChains(space, newchains, k);
+  FREEMEMORY(space, newchains);
+
+  FREEMEMORY(space, reflens);
+  FREEMEMORY(space, refseqs);
+  FREEMEMORY(space, strands);
+  FREEMEMORY(space, starts);
+  FREEMEMORY(space, ends);
+  FREEMEMORY(space, tstarts);
+  FREEMEMORY(space, tends);
+  FREEMEMORY(space, lengths);
+
+
+  FREEMEMORY(space, lmv);
+  FREEMEMORY(space, lmr);
+  FREEMEMORY(space, lmc);
+  FREEMEMORY(space, aligns);
+  FREEMEMORY(space, a);
+
+  return list;
+}
+
+
+/*------------------------------ se_kdSplitRead ------------------------------
+ *
+ * @brief find the splits of a chimeric reads from matchstem data
+ * @author Steve Hoffmann
+ *
+ * Builds the fragment chains with branchChain(), has them aligned by
+ * se_kdAlignSplitChain() using fixed scores (match 1, mismatch -2,
+ * indel -2, transition -10), releases the chain data and returns the first
+ * list of the array delivered by the alignment step.
+ *
+ */
+
+gmatchlist_t*
+se_kdSplitRead(void *space, Suffixarray *arr, MultiCharSeq *seq,
+    char *querydesc, matchstem_t **stems, char **seqs, Uint len,
+    karlin_t *stats, spliceevents_t *events, Uint *enctab, bitvector *D, segemehl_t *nfo)
+{
+  /* scoring scheme handed to the split alignment */
+  int matchscores[] = {1, -2};
+  int gapscore = -2;
+  int transscore = -10;
+
+  Uint chaincount;
+  Uint selected = 0;               /* index of the list handed back */
+  branchChain_t *chainset;
+  branchfragment_t *frags;
+  gmatchlist_t **lists = NULL;
+  gmatchlist_t *result;
+#ifdef FIXINSMALL
+  Uint fixincount;
+  branchfragment_t **fixins;
+#endif
+
+  chainset = branchChain(space, arr, stems, seqs, len, stats,
+      &chaincount, &frags, nfo->maxsplitevalue);
+
+#ifdef FIXINSMALL
+  fixins = fixinfragments (space, chainset, chaincount, arr, seqs, len, &fixincount);
+#endif
+
+#ifdef DEBUGTRANSALIGN
+  showChains(chainset, chaincount, arr, stdout, seqs[1], len);
+#endif
+
+  lists = se_kdAlignSplitChain (space, chainset, chaincount,
+      arr, seq, querydesc, stems, seqs, len, matchscores, gapscore,
+      transscore, events, enctab, D, nfo);
+
+  /* the chain data is no longer needed once the alignment is done */
+  wrapChains(space, chainset, chaincount);
+#ifdef FIXINSMALL
+  wrapFixinFragments(space, fixins, fixincount);
+  FREEMEMORY(space, fixins);
+#endif
+  FREEMEMORY(space, frags);
+  FREEMEMORY(space, chainset);
+
+  /* pick the list before the array holding it is released */
+  result = lists[selected];
+  FREEMEMORY(space, lists);
+
+  return result;
+}
+
+
+/*--------------------------------- se_clip ----------------------------------
+ *
+ * @brief clipping sequences
+ * @author Steve Hoffmann
+ *
+ * Applies hard clipping and then soft clipping to read number elem, each
+ * only if the respective 5' or 3' option is set in nfo. If the read set
+ * has mates, the mate is clipped with the same parameters.
+ *
+ */
+
+void
+se_clip (void *space, fasta_t *reads, Uint elem, segemehl_t *nfo)
+{
+  int requested;
+
+  /* hard clipping: query first, then the mate if there is one */
+  requested = (nfo->hardclip3Prime || nfo->hardclip5Prime);
+  if(requested) {
+    bl_fastaHardClip(space, reads, elem, nfo->hardclip5Prime,
+        nfo->hardclip3Prime);
+
+    if(bl_fastaHasMate(reads)) {
+      bl_fastaMateHardClip(space, reads, elem, nfo->hardclip5Prime,
+          nfo->hardclip3Prime);
+    }
+  }
+
+  /* soft clipping: same order, evaluated after the hard clipping is done */
+  requested = (nfo->softclip3Prime || nfo->softclip5Prime);
+  if(requested) {
+    bl_fastaSoftClip(space, reads, elem,
+        nfo->softclip5Prime, nfo->softclip5PrimeLen, nfo->minclipscr5,
+        nfo->softclip3Prime, nfo->softclip3PrimeLen, nfo->clipacc, nfo->polyAlen);
+
+    if(bl_fastaHasMate(reads)) {
+      bl_fastaMateSoftClip(space, reads, elem,
+          nfo->softclip5Prime, nfo->softclip5PrimeLen, nfo->minclipscr5,
+          nfo->softclip3Prime, nfo->softclip3PrimeLen, nfo->clipacc, nfo->polyAlen);
+    }
+  }
+
+  return ;
+}
+
+/*----------------------------- se_kdGenomeMatch -----------------------------
+ *
+ * @brief map reads to the genome
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+se_kdGenomeMatch(void *space, Suffixarray *s, fasta_t *reads,
+ segemehl_t *nfo) {
+
+ unsigned char matchflag, matematchflag;
+ matchstatus_t pairStatus = QUERY;
+ char *seqs[2], *mateseqs[2], rep=0;
+ Uint k, i, u, *enctab, dim, wordno, len, matelen=0, jump, maxedist,
+ matemaxedist=0; //, setmatches;
+ karlin_t stats;
+ bitvector *D, *Mv;
+ Gmap map;
+ gread_t read;
+ matchstem_t *stems[2] = {NULL, NULL}, *matestems[2] = {NULL, NULL},
+ *b0[2], *mateb0[2];
+ gmatchlist_t *list=NULL, *matelist=NULL, *templist,
+ *bestpairlist=NULL, *slist=NULL, *slist2=NULL;
+ spliceevents_t *events;
+ bestseed_t best, bestmate;
+ PairSint frag;
+
+ events = ALLOCMEMORY(space, NULL, spliceevents_t, 1);
+ events->noofevents = 0;
+ events->event = NULL;
+
+ enctab = encodetab(nfo->seq->map, nfo->seq->mapsize);
+ dim = reads->maxlen+1000;
+
+ if(bl_fastaHasMate(reads)) {
+ dim += nfo->maxinsertsize;
+ }
+
+ dim += 2*((reads->maxlen-floor(((double)nfo->accuracy*reads->maxlen)/100.))+4);
+ wordno = (reads->maxlen/BITVECTOR_WORDSIZE)+1;
+
+ D = ALLOCMEMORY(space, NULL, bitvector, 2*(dim+1));
+ Mv = &D[dim+1];
+
+ for(i=0; i <= dim; i++) {
+ D[i] = initbitvector(space, wordno*BITVECTOR_WORDSIZE);
+ Mv[i] = initbitvector(space, wordno*BITVECTOR_WORDSIZE);
+ }
+
+ karlinunitcostpp(space, &stats.lambda, &stats.H, &stats.K);
+
+ for (k=0; k < reads->noofseqs; k++) {
+ pairStatus = QUERY;
+ matchflag = 0;
+ matematchflag = 0;
+
+ best.mat = 0;
+ best.mis = 0;
+ best.ins =0;
+ best.del = 0;
+ best.len = 0;
+ best.readstart = 0;
+ best.refidx = 0;
+ best.refpos = 0;
+ best.refstrand = 0;
+ best.maxevalconstraint = 0;
+ best.maxintervalconstraint = 0;
+
+ bestmate.mat = 0;
+ bestmate.mis = 0;
+ bestmate.ins = 0;
+ bestmate.del = 0;
+ bestmate.len = 0;
+ bestmate.readstart = 0;
+ bestmate.refidx = 0;
+ bestmate.refpos = 0;
+ bestmate.refstrand = 0;
+ bestmate.maxevalconstraint = 0;
+ bestmate.maxintervalconstraint = 0;
+
+
+
+ if(!nfo->mute) se_updateProgressBar(k, nfo);
+ se_clip(space, reads, k, nfo);
+
+
+ seqs[0] = bl_fastaGetSequence(reads, k);
+ len = bl_fastaGetSequenceLength(reads, k);
+
+#ifdef HASHING
+ if (bl_fastaGetQuantity(reads, k) == 1){
+ DBG("%u: %s\t%u\n", k, bl_fastaGetSequence(reads, k), bl_fastaGetQuantity(reads, k));
+ }
+
+ // fprintf(nfo->dev,"@%s\n%s\n+\n%s\n",
+ // bl_fastaGetDescription(reads,k), seqs[0], bl_fastaGetQuality(reads,k));
+ // continue;
+ continue;
+#endif
+ // pthread_mutex_lock(nfo->mtx2);
+ // fprintf(nfo->dev, "%s\n", bl_fastaGetDescription(reads,k));
+ // fprintf(nfo->dev, "%s\n", bl_fastaGetMateDescription(reads,k));
+ // pthread_mutex_unlock(nfo->mtx2);
+
+
+ if(len >= nfo->minsize) {
+ seqs[1] = charIUPACcomplement(space, seqs[0], len);
+
+ /* convert for seed search */
+ if (nfo->bisulfite){
+ seqs[0] = ALLOCMEMORY(space, NULL, char, len+1);
+ memmove(seqs[0], bl_fastaGetSequence(reads, k), len+1);
+ bl_convertBisulfite(seqs[0], len, nfo->bisulfite, 1);
+ bl_convertBisulfite(seqs[1], len, nfo->bisulfite, 1);
+ }
+
+ initGmap(&map, nfo->seq, 1);
+ initRead(&read, k);
+
+ if (nfo->jump == 0) {
+ jump = floor(len/75) * 2;
+ jump = (jump > 0) ? jump : 1;
+ jump = MIN(jump, 15); //limit jump to 15
+ } else {
+ jump = nfo->jump;
+ }
+
+ stems[0] = NULL; stems[1] = NULL;
+ b0[0] = NULL; b0[1] = NULL;
+
+ /* restrict search to one strand */
+ for (u = 0; u < 2; u++){
+ /* nfo->strand == 1 : search only on plus strand
+ * => init stems[1] as empty
+ * nfo->strand == 2 : search only on minus strand
+ * => init stems[0] as empty
+ * Note: empty but initialized stems are ignored
+ * in function kdbest
+ */
+ if (nfo->strand == 2 - u){
+ stems[u] = ALLOCMEMORY(space, NULL, matchstem_t, len);
+ for (i = 0; i < len; i++){
+ stems[u][i].branches = NULL;
+ stems[u][i].noofbranches = 0;
+ }
+ }
+ }
+
+ /*
+ * try to find full match, only possible
+ * if there are no more than k_p
+ * unmatchable characters are in read
+ */
+ if (nfo->bestonly && countNonMatchingChars(seqs[0], len) <= nfo->k_p){
+ kdbest(space, s, seqs, len, nfo->s_ext, nfo->p_mis,
+ nfo->Xoff, nfo->k_p, stems, b0);
+ }
+
+ if (stems[0] == NULL){
+ stems[0]=kdseeds(space, s, seqs[0], len, jump, nfo->s_ext, nfo->p_mis,
+ nfo->Xoff, nfo->k_p, b0[0]);
+ }
+ if (stems[1] == NULL){
+ stems[1]=kdseeds(space, s, seqs[1], len, jump, nfo->s_ext, nfo->p_mis,
+ nfo->Xoff, nfo->k_p, b0[1]);
+ }
+
+ /* convert for alignment */
+ if (nfo->bisulfite){
+ FREEMEMORY(space, seqs[1]);
+ memmove(seqs[0], bl_fastaGetSequence(reads, k), len+1);
+ seqs[1] = charIUPACcomplement(space, seqs[0], len);
+ bl_convertBisulfite(seqs[0], len, nfo->bisulfite, 0);
+ bl_convertBisulfite(seqs[1], len, nfo->bisulfite, 0);
+ }
+
+ list = se_kdMatchStemAlign(space, s, nfo->seq, stems, seqs,
+ len, &stats, nfo, enctab, D, &best);
+
+ if (bl_fastaHasMate(reads)) {
+ bestpairlist = NULL;
+
+ mateseqs[0] = bl_fastaGetMate(reads, k);
+ matelen = bl_fastaGetMateLength(reads, k);
+
+ mateseqs[1] = charIUPACcomplement(space, mateseqs[0], matelen);
+
+ /* convert for direct mate alignment */
+ if (nfo->bisulfite){
+ mateseqs[0] = ALLOCMEMORY(space, NULL, char, matelen+1);
+ memmove(mateseqs[0], bl_fastaGetMate(reads, k), matelen+1);
+ bl_convertBisulfite(mateseqs[0], matelen, nfo->bisulfite, 0);
+ bl_convertBisulfite(mateseqs[1], matelen, nfo->bisulfite, 0);
+ }
+
+ if(se_kdMatchListhasMatches(list)) {
+ matemaxedist = matelen - floor(((double)nfo->accuracy * matelen)/100.);
+ se_kdAlignMate(space, nfo->seq, mateseqs, matelen,
+ list, matemaxedist, enctab, D, nfo->maxinsertsize);
+ }
+
+ if (se_kdMatchListhasMatches(list) &&
+ se_kdMatchListhasMates(list)) {
+ /*pair is fully matched*/
+ pairStatus = PAIR;
+ } else {
+ /*try to find mate first*/
+ if (nfo->jump == 0) {
+ jump = floor(matelen/75) * 2;
+ jump = (jump > 0) ? jump : 1;
+ } else {
+ jump = nfo->jump;
+ }
+
+ /* convert for mate seed search */
+ if (nfo->bisulfite){
+ FREEMEMORY(space, mateseqs[1]);
+ memmove(mateseqs[0], bl_fastaGetMate(reads, k), matelen+1);
+ mateseqs[1] = charIUPACcomplement(space, mateseqs[0], matelen);
+ bl_convertBisulfite(mateseqs[0], matelen, nfo->bisulfite, 1);
+ bl_convertBisulfite(mateseqs[1], matelen, nfo->bisulfite, 1);
+ }
+
+ matestems[0] = NULL; matestems[1] = NULL;
+ mateb0[0] = NULL; mateb0[1] = NULL;
+
+ /* restrict search to one strand */
+ for (u = 0; u < 2; u++){
+ /* nfo->strand == 1 : search only on plus strand
+ * => search for mate only on minus strand
+ * => init stems[0] as empty
+ * nfo->strand == 2 : search only on minus strand
+ * => search for mate only on plus strand
+ * => init stems[1] as empty
+ * Note: empty but initialized stems are ignored
+ * in function kdbest
+ */
+ if (nfo->strand == u + 1){
+ matestems[u] = ALLOCMEMORY(space, NULL, matchstem_t, matelen);
+ for (i = 0; i < matelen; i++){
+ matestems[u][i].branches = NULL;
+ matestems[u][i].noofbranches = 0;
+ }
+ }
+ }
+
+ /*
+ * try to find full match, only possible
+ * if there are no more than k_p
+ * unmatchable characters are in read
+ */
+ if (nfo->bestonly && countNonMatchingChars(mateseqs[0], matelen) <= nfo->k_p){
+ kdbest(space, s, mateseqs, matelen, nfo->s_ext, nfo->p_mis,
+ nfo->Xoff, nfo->k_p, matestems, mateb0);
+ }
+
+ if (matestems[0] == NULL){
+ matestems[0]=kdseeds(space, s, mateseqs[0], matelen,
+ jump, nfo->s_ext, nfo->p_mis,
+ nfo->Xoff, nfo->k_p, mateb0[0]);
+ }
+ if (matestems[1] == NULL){
+ matestems[1]=kdseeds(space, s, mateseqs[1], matelen,
+ jump, nfo->s_ext, nfo->p_mis,
+ nfo->Xoff, nfo->k_p, mateb0[1]);
+ }
+
+ /* convert for mate alignment */
+ if (nfo->bisulfite){
+ FREEMEMORY(space, mateseqs[1]);
+ memmove(mateseqs[0], bl_fastaGetMate(reads, k), matelen+1);
+ mateseqs[1] = charIUPACcomplement(space, mateseqs[0], matelen);
+ bl_convertBisulfite(mateseqs[0], matelen, nfo->bisulfite, 0);
+ bl_convertBisulfite(mateseqs[1], matelen, nfo->bisulfite, 0);
+ }
+
+ matelist = se_kdMatchStemAlign(space, s, nfo->seq, matestems,
+ mateseqs, matelen, &stats, nfo, enctab, D, &bestmate);
+
+ maxedist = len - floor(((double)nfo->accuracy * len)/100.);
+
+ se_kdAlignMate(space, nfo->seq, seqs, len, matelist,
+ maxedist, enctab, D, nfo->maxinsertsize);
+
+ if (se_kdMatchListhasMatches(matelist) &&
+ !se_kdMatchListhasMates(matelist) &&
+ !se_kdMatchListhasMatches(list)) {
+ /*query remains unmatched*/
+ pairStatus = MATE;
+ }
+
+ if(!se_kdMatchListhasMatches(matelist) &&
+ se_kdMatchListhasMatches(list)) {
+ /*mate remains unmatched*/
+ pairStatus = QUERY;
+ }
+
+ if(se_kdMatchListhasMatches(list) &&
+ se_kdMatchListhasMatches(matelist) &&
+ !se_kdMatchListhasMates(matelist)) {
+ /*pair not aligned properly but we have hits (long indel!)*/
+ maxedist = len - floor(((double)nfo->accuracy * len)/100.);
+ matemaxedist = matelen - floor(((double)nfo->accuracy * matelen)/100.);
+ bestpairlist = se_kdFindBestMatePair(space, list, matelist, maxedist, matemaxedist);
+ pairStatus = PAIR_INS;
+ }
+
+ if (se_kdMatchListhasMatches(matelist) &&
+ se_kdMatchListhasMates(matelist)) {
+ /*pair is fully matched in reverse order*/
+ templist = list;
+ list = matelist;
+ matelist = templist;
+ pairStatus = PAIR_REV;
+ }
+ }
+ }
+
+ if (nfo->bestonly) {
+ maxedist = list->minedist;
+ if(matelist) matemaxedist = matelist->minedist;
+ } else {
+ maxedist = len - floor(((double)nfo->accuracy * len)/100.);
+ if(matelist) matemaxedist = matelen - floor(((double)nfo->accuracy * matelen)/100.);
+ }
+
+// if(rep) fprintf(nfo->dev, "pair status %d\n", pairStatus);
+
+ matchflag = 0;
+ matematchflag = 0;
+ setReads(&map, &read, 1);
+
+ /*report: single ends, fully matched pairs*/
+ if(!bl_fastaHasMate(reads) || pairStatus == PAIR_REV ||
+ pairStatus == PAIR) {
+
+// if(rep) fprintf(nfo->dev, "PAIR edist:%d mateedist:%d pairedist:%d (Pair:%d, Rev:%d)\n", maxedist, matemaxedist, list->pairminedist, pairStatus == PAIR, pairStatus == PAIR_REV);
+ //if (list->n[0] || list->n[1]) matchflag = 1;
+ se_setMatches(space, &read, list, maxedist, nfo, rep);
+ matchflag = reportMatch(space, &map, reads, nfo, pairStatus, pairStatus == PAIR_REV);
+ se_destructMatches(space, &read);
+ }
+
+ /*report: spliced single ends */
+ if(nfo->split && !bl_fastaHasMate(reads) &&
+ !se_kdMatchListhasMatches(list)) {
+
+// if(rep) fprintf(nfo->dev, "SINGLE\n");
+ slist = se_kdSplitRead(space, s, nfo->seq,
+ bl_fastaGetDescription(reads, k),
+ stems, seqs, len, &stats, events, enctab, D, nfo);
+
+ se_setMatches(space, &read, slist, maxedist, nfo, rep);
+ matchflag = reportMatch(space, &map, reads, nfo, pairStatus, 0);
+ se_destructMatches(space, &read);
+ bl_gmatchlistDestruct(space, slist);
+ }
+
+ /*report: bestpair from two separately calculated match lists*/
+ if(pairStatus == PAIR_INS && bestpairlist) {
+
+// if(rep) fprintf(nfo->dev, "PAIRINS\n");
+ se_setMatches(space, &read, bestpairlist, maxedist, nfo, rep);
+ //matchflag = 1;
+ //matematchflag = 1;
+ matchflag = reportMatch(space, &map, reads, nfo, pairStatus, 0);
+ se_destructMatches(space, &read);
+ bl_gmatchlistDestruct(space, bestpairlist);
+ }
+
+ /*report: spliced unmatched mate pairs*/
+ if(bl_fastaHasMate(reads) &&
+ (pairStatus == MATE || pairStatus == QUERY)) {
+
+// if(rep) fprintf(nfo->dev, "MATE OR QUERY\n");
+ if (nfo->split) {
+
+ slist = NULL;
+ slist2 = NULL;
+
+ if(!se_kdMatchListhasMatches(list)) {
+ slist = se_kdSplitRead(space, s, nfo->seq,
+ bl_fastaGetDescription(reads, k),
+ stems, seqs, len, &stats, events, enctab, D, nfo);
+
+// if(rep) fprintf(nfo->dev, "ATTEMPT SPLICING FOR QUERY %d\n", (slist->n[0] || slist->n[1]) );
+ }
+
+
+
+ if(!se_kdMatchListhasMatches(matelist)) {
+ slist2 = se_kdSplitRead(space, s, nfo->seq,
+ bl_fastaGetMateDescription(reads, k),
+ matestems, mateseqs, matelen, &stats, events, enctab, D, nfo);
+
+// if (rep) fprintf(nfo->dev, "ATTEMPT SPLICING FOR MATE: %d\n", (slist2->n[0] || slist2->n[1]) );
+ }
+
+
+
+ if(slist && se_kdMatchListhasMatches(slist)
+ && (!slist2 || !se_kdMatchListhasMatches(slist2))) {
+ /*spliced query full mate*/
+// if(rep) fprintf(nfo->dev, "SPLICED QUERY FULL MATE\n");
+
+ if (se_kdMatchListhasMatches(matelist)) {
+ /*report the full mate match*/
+// if(rep) fprintf(nfo->dev, "MATELIST HAS MATCHES\n");
+ pairStatus = QUERY_SPL_FULL_MATE;
+
+ /* select best mate to spliced query */
+ if (matelist->n[0] + matelist->n[1] > 1){
+
+ /* get last fragment */
+ frag.a = frag.b = -1;
+ for (u = 0; u < 2; u++){
+ for (i = 0; i < slist->n[u]; i++){
+ if (slist->matches[u][i].fragno ==
+ slist->n[0]+slist->n[1]-1){
+ frag.a = u;
+ frag.b = i;
+ }
+ }
+ }
+ assert(frag.a != -1 && frag.b != -1);
+ bestpairlist = se_kdFindBestMate(space, matelist, &slist->matches[frag.a][frag.b], matemaxedist);
+ bl_gmatchlistDestruct(space, matelist);
+ matelist = bestpairlist;
+ }
+ se_setMatches(space, &read, matelist, matemaxedist, nfo, rep);
+ matematchflag = reportMatch(space, &map, reads, nfo, pairStatus, 1);
+ se_destructMatches(space, &read);
+ /*spliced query no mate*/
+ } else {
+ pairStatus = QUERY_SPL_NO_MATE;
+ }
+
+ se_setMatches(space, &read, slist, maxedist, nfo, rep);
+ matchflag = reportMatch(space, &map, reads, nfo, pairStatus, 0);
+
+ if(matematchflag && matchflag) matchflag = 3;
+ else if(matchflag && !matematchflag) matchflag = 1;
+ else if(!matchflag && matematchflag) matchflag = 2;
+
+ se_destructMatches(space, &read);
+ }
+
+ if(slist2 && se_kdMatchListhasMatches(slist2) &&
+ (!slist || !se_kdMatchListhasMatches(slist))) {
+ /*spliced mate full query*/
+// if(rep) fprintf(nfo->dev, "SPLICED MATE FULL QUERY\n");
+
+ if (se_kdMatchListhasMatches(list)) {
+// if(rep) fprintf(nfo->dev, "QUERY LIST HAS MATCHES\n");
+ pairStatus = MATE_SPL_FULL_QUERY;
+
+ /* select best query to spliced mate */
+ if (list->n[0] + list->n[1] > 1){
+
+ /* get first fragment */
+ frag.a = frag.b = -1;
+ for (u = 0; u < 2; u++){
+ for (i = 0; i < slist2->n[u]; i++){
+ if (slist2->matches[u][i].fragno == 0){
+ frag.a = u;
+ frag.b = i;
+ }
+ }
+ }
+ assert(frag.a != -1 && frag.b != -1);
+ bestpairlist = se_kdFindBestMate(space, list, &slist2->matches[frag.a][frag.b], maxedist);
+ bl_gmatchlistDestruct(space, list);
+ list = bestpairlist;
+ }
+ se_setMatches(space, &read, list, maxedist, nfo, rep);
+ matchflag = reportMatch(space, &map, reads, nfo, pairStatus, 0);
+ se_destructMatches(space, &read);
+ /*spliced mate no query*/
+ } else {
+ pairStatus = MATE_SPL_NO_QUERY;
+ }
+
+ se_setMatches(space, &read, slist2, matemaxedist, nfo, rep);
+ matematchflag = reportMatch(space, &map, reads, nfo, pairStatus, 1);
+
+ if(matematchflag && matchflag) matchflag = 3;
+ else if(matchflag && !matematchflag) matchflag = 1;
+ else if(!matchflag && matematchflag) matchflag = 2;
+
+ se_destructMatches(space, &read);
+ }
+
+ if(slist && se_kdMatchListhasMatches(slist) && slist2 &&
+ se_kdMatchListhasMatches(slist2)) {
+ /*both spliced*/
+// if(rep) fprintf(nfo->dev, "BOTH SPLICED\n");
+ pairStatus = PAIR_SPL;
+
+ se_setMatches(space, &read, slist, maxedist, nfo, rep);
+ matchflag = reportMatch(space, &map, reads, nfo, pairStatus, 0);
+ se_destructMatches(space, &read);
+
+ se_setMatches(space, &read, slist2, matemaxedist, nfo, rep);
+ matematchflag = reportMatch(space, &map, reads, nfo, pairStatus, 1);
+ se_destructMatches(space, &read);
+
+ if(matematchflag && matchflag) matchflag = 3;
+ else if(matchflag && !matematchflag) matchflag = 1;
+ else if(!matchflag && matematchflag) matchflag = 2;
+
+ }
+
+ if((!slist || !se_kdMatchListhasMatches(slist)) && se_kdMatchListhasMatches(matelist)) {
+ pairStatus = MATE;
+
+// if(rep ) fprintf(nfo->dev, "ONLY MATE matemaxedist %d\n", matemaxedist);
+ se_setMatches(space, &read, matelist, matemaxedist, nfo, rep);
+ matematchflag = reportMatch(space, &map, reads, nfo, pairStatus, 1);
+ se_destructMatches(space, &read);
+ if(matematchflag) matchflag = 2; else matchflag = 0;
+ }
+
+ if((!slist2 || !se_kdMatchListhasMatches(slist2)) && se_kdMatchListhasMatches(list)) {
+ pairStatus = QUERY;
+
+// if(rep ) fprintf(nfo->dev, "ONLY QUERY\n");
+ se_setMatches(space, &read, list, maxedist, nfo, rep);
+ matchflag = reportMatch(space, &map, reads, nfo, pairStatus, 0);
+ se_destructMatches(space, &read);
+ if(matchflag) matchflag = 1; else matchflag = 0;
+ }
+
+
+ if(slist) bl_gmatchlistDestruct(space, slist);
+ if(slist2) bl_gmatchlistDestruct(space, slist2);
+
+ } else {
+
+// if(rep ) fprintf(nfo->dev, "DISJOINT\n");
+ matchflag = 0;
+ matematchflag = 0;
+
+ if(list && se_kdMatchListhasMatches(list)) {
+ // matchflag = 1;
+ se_setMatches(space, &read, list, maxedist, nfo, rep);
+ matchflag = reportMatch(space, &map, reads, nfo, pairStatus, 0);
+ se_destructMatches(space, &read);
+ }
+
+ if(matelist && se_kdMatchListhasMatches(matelist)) {
+ // matematchflag = 1;
+ se_setMatches(space, &read, matelist, matemaxedist, nfo, rep);
+ matematchflag = reportMatch(space, &map, reads, nfo, pairStatus, 1);
+ se_destructMatches(space, &read);
+
+ }
+
+ if(matematchflag && matchflag) matchflag = 3;
+ else if(matchflag && !matematchflag) matchflag = 1;
+ else if(!matchflag && matematchflag) matchflag = 2;
+
+
+ }
+ }
+
+ bl_kdMatchstemDestruct(space, stems[0], len);
+ bl_kdMatchstemDestruct(space, stems[1], len);
+ if(matestems[0]) {
+ bl_kdMatchstemDestruct(space, matestems[0], matelen);
+ bl_kdMatchstemDestruct(space, matestems[1], matelen);
+ matestems[0] = NULL;
+ matestems[1] = NULL;
+ }
+
+ bl_gmatchlistDestruct(space, list);
+ if (nfo->bisulfite){
+ FREEMEMORY(space, seqs[0]);
+ }
+ FREEMEMORY(space, seqs[1]);
+
+ if (bl_fastaHasMate(reads)) {
+ if (nfo->bisulfite){
+ FREEMEMORY(space, mateseqs[0]);
+ }
+ FREEMEMORY(space, mateseqs[1]);
+ }
+ if (matelist) {
+ bl_gmatchlistDestruct(space, matelist);
+ matelist = NULL;
+ }
+ }
+
+ bl_kdReportUnmatched(space, reads, k, matchflag, matematchflag, &best, &bestmate, nfo);
+ }
+
+ wrapBitmatrix(space, D, 2*(dim+1));
+ FREEMEMORY(space, D);
+ FREEMEMORY(space, enctab);
+ FREEMEMORY(space, events);
+ return;
+}
+
+
+
+/*
+ * helper of bl_kdReportUnmatched: writes one read to dev, as a FASTA
+ * record (">") if fastq is 0 and as a FASTQ record ("@", followed by
+ * "+desc" and the quality string) otherwise. The header line reads
+ * "desc ef:E;if:I readstart:len refidx:refpos:refstrand", with all
+ * values taken from the given seed.
+ */
+
+static void
+bl_kdDumpUnmatchedRecord (FILE *dev, int fastq, const char *desc,
+    const char *seq, const char *qual, bestseed_t *seed)
+{
+  if (!fastq) {
+    fprintf(dev, ">%s ef:%d;if:%d %d:%d %d:%d:%d\n%s\n",
+        desc,
+        seed->maxevalconstraint, seed->maxintervalconstraint,
+        seed->readstart, seed->len,
+        seed->refidx, seed->refpos, seed->refstrand,
+        seq);
+    return;
+  }
+
+  fprintf(dev, "@%s ef:%d;if:%d %d:%d %d:%d:%d\n%s\n+%s\n%s\n",
+      desc,
+      seed->maxevalconstraint, seed->maxintervalconstraint,
+      seed->readstart, seed->len,
+      seed->refidx, seed->refpos, seed->refstrand,
+      seq, desc, qual);
+}
+
+
+/*--------------------------- bl_kdReportUnmatched ---------------------------
+ *
+ * @brief dump the unmatched sequences to a device
+ * @author Steve Hoffmann
+ *
+ * Does nothing unless nfo->nomatchdev is set. matchflag selects what is
+ * written: without mates the query is dumped iff matchflag == 0; with
+ * mates the query is dumped for matchflag 0 or 2 and the mate for
+ * matchflag 0 or 1 (3 dumps nothing). Output is serialized via
+ * nfo->mtx2 when nfo->threadno > 1. space and matematchflag are unused.
+ */
+
+void
+bl_kdReportUnmatched (void *space, fasta_t *reads, Uint k,
+    unsigned char matchflag, unsigned char matematchflag,
+    bestseed_t *best, bestseed_t *bestmate,
+    segemehl_t *nfo)
+{
+  FILE *dev = nfo->nomatchdev;
+  int paired, fastq, dumpquery, dumpmate;
+
+  /* no device for unmatched reads: nothing to report */
+  if (!dev) {
+    return;
+  }
+
+  paired = bl_fastaHasMate(reads) ? 1 : 0;
+
+  if (paired) {
+    /* 1 = only query matched, 2 = only mate matched, 0 = neither */
+    dumpquery = (matchflag == 0 || matchflag == 2);
+    dumpmate = (matchflag == 0 || matchflag == 1);
+  } else {
+    dumpquery = (matchflag == 0);
+    dumpmate = 0;
+  }
+
+  if (!dumpquery && !dumpmate) {
+    return;
+  }
+
+  if (nfo->threadno > 1) {
+    pthread_mutex_lock(nfo->mtx2);
+  }
+
+  fastq = bl_fastaHasQuality(reads) ? 1 : 0;
+
+  if (dumpquery) {
+    bl_kdDumpUnmatchedRecord(dev, fastq,
+        bl_fastaGetDescription(reads, k),
+        bl_fastaGetSequence(reads, k),
+        fastq ? bl_fastaGetQuality(reads, k) : NULL,
+        best);
+  }
+
+  if (dumpmate) {
+    bl_kdDumpUnmatchedRecord(dev, fastq,
+        bl_fastaGetMateDescription(reads, k),
+        bl_fastaGetMate(reads, k),
+        fastq ? bl_fastaGetMateQuality(reads, k) : NULL,
+        bestmate);
+  }
+
+  fflush(dev);
+
+  if (nfo->threadno > 1) {
+    pthread_mutex_unlock(nfo->mtx2);
+  }
+
+  return;
+}
+
diff --git a/segemehl/src/kdmatch.h b/segemehl/src/kdmatch.h
new file mode 100644
index 0000000..8a96121
--- /dev/null
+++ b/segemehl/src/kdmatch.h
@@ -0,0 +1,107 @@
+#ifndef KDMATCH_H
+#define KDMATCH_H
+
+/*
+ *
+ * kdmatch.h
+ * Declarations for kdmatch
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 12/26/07 02:41:35 CST
+ *
+ * SVN
+ * Revision of last commit: $Rev: 54 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-09-10 22:13:30 +0200 (Wed, 10 Sep 2008) $
+ *
+ * Id: $Id: kdmatch.h 54 2008-09-10 20:13:30Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/trunk/src/kdmatch.h $
+ *
+ */
+
+#include "charsequence.h"
+#include "kdseed.h"
+#include "segemehl.h"
+#include "karlin.h"
+#include "bitVector.h"
+
+/* if S = SCORE(M,X,I,D): MATCHES = M, LEN_Q = M+X+I, LEN_S = M+X+D; each expansion is parenthesized so it nests safely */
+#define SCORE(M, X, I, D)   ((M) - (X) - (I) - (D))
+#define MATCHES(S, X, I, D) ((S) + (X) + (I) + (D))
+#define LEN_Q(S, X, I, D)   ((S) + (X) + (I) + (D) + (X) + (I))
+#define LEN_S(S, X, I, D)   ((S) + (X) + (I) + (D) + (X) + (D))
+
+typedef struct bestseed_s { /* seed record; bl_kdReportUnmatched prints it in the header of unmatched reads */
+ Uint readstart; /* printed as "readstart:len" */
+ Uint mat;
+ Uint mis;
+ Uint ins;
+ Uint del;
+ Uint len;
+ Uint refidx; /* printed as "refidx:refpos:refstrand" */
+ Uint refpos;
+ char refstrand;
+ char maxintervalconstraint; /* printed after "if:" */
+ char maxevalconstraint; /* printed after "ef:" */
+ /* NOTE(review): mat/mis/ins/del look like edit operation counts -- confirm */
+} bestseed_t;
+
+typedef struct split_s { /* NOTE(review): looks like a single split site of a read -- confirm */
+ Uint subidx;
+ char strand;
+ Uint start;
+ Uint end;
+ uint16_t i; /* i, j: meaning not visible in this header; spliceevent_t holds arrays of the same names */
+ uint16_t j;
+} split_t;
+
+typedef struct spliceevent_s { /* pointer members carry the same names as the scalar fields of split_t */
+ uint8_t noofsites; /* presumably the length of the arrays below -- TODO confirm */
+ Uint firstreadid;
+ char *strand;
+ Uint *subidx;
+ Uint *start;
+ Uint *end;
+ uint16_t *i;
+ uint16_t *j;
+} spliceevent_t;
+
+typedef struct spliceevents_s { /* a counter plus a pointer to spliceevent_t records */
+ Uint noofevents; /* presumably the number of entries behind event -- TODO confirm */
+ spliceevent_t *event;
+} spliceevents_t;
+
+
+typedef struct spliceventmapelem_s{ /* note: the tag is spelled "splicevent...", the typedef "spliceevent..." */
+ unsigned char type;
+ uint8_t site; /* presumably an index into the sites of *ptr -- TODO confirm */
+ spliceevent_t *ptr; /* points to a spliceevent_t */
+} spliceeventmapelem_t;
+
+typedef struct spliceeventmap_s{ /* a size field plus a pointer to spliceeventmapelem_t */
+ Uint size; /* presumably the number of elements behind map -- TODO confirm */
+ spliceeventmapelem_t *map;
+}spliceeventmap_t;
+
+void
+se_kdGenomeMatch(void *space, Suffixarray *s, fasta_t *reads,
+ segemehl_t *nfo); /* called by kdseedworker (segemehl.c) with nfo->space, nfo->sarray, nfo->reads and nfo itself */
+/* returns a gmatchlist_t; the caller in kdmatch.c passes stems/sequences arrays with entries [0] and [1] */
+gmatchlist_t*
+se_kdMatchStemAlign(void *space,
+ Suffixarray *s,
+ MultiCharSeq *seq,
+ matchstem_t **stems,
+ char **sequences,
+ Uint len,
+ karlin_t *stats,
+ segemehl_t *nfo,
+ Uint *enctab,
+ bitvector* D, bestseed_t *best);
+/* writes reads that stayed unmatched to nfo->nomatchdev; does nothing when that is NULL */
+ void
+bl_kdReportUnmatched (void *space, fasta_t *reads, Uint k, unsigned char matchflag,
+ unsigned char matematchflag, bestseed_t* , bestseed_t *, segemehl_t *nfo);
+
+#endif
diff --git a/segemehl/src/rsorter.c b/segemehl/src/rsorter.c
new file mode 100644
index 0000000..b6c4871
--- /dev/null
+++ b/segemehl/src/rsorter.c
@@ -0,0 +1,89 @@
+/*
+ * rsorter.c
+ * segemehl
+ *
+ * Created by Steve Hoffmann on 08.02.10.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+#include "radixsort.h"
+#include "basic-types.h"
+
+
+Uint
+uintkey(void *key) {
+  /* key callback handed to bl_radixSort: the element itself is the key */
+  return *((Uint*) key);
+}
+
+/* Demo and timing harness for the radix sort variants; returns EXIT_FAILURE
+ * when the work buffers cannot be allocated or a result is not sorted. */
+int main(int argc, char** argv) {
+  int i, len = 200000000, range = 20000000;
+  unsigned int iseed = (unsigned int)time(NULL);
+  time_t startmatch, endmatch;
+  unsigned int a[]={
+    123,432,654,3123,654,2123,543,131,653,123,
+    533,1141,532,213,2241,824,1124,42,134,411,
+    491,341,1234,527,388,245,1992,654,243,987};
+  Uint *lr, *lr2, *lr3;
+
+  printf("Before radix sort:\n");
+  for(i=0; i < (int) (sizeof a/sizeof a[0]); ++i)
+    printf(" %u", a[i]);
+  putchar('\n');
+
+  bl_radixSort(NULL, a, sizeof(Uint), sizeof a/sizeof(Uint), uintkey, 1);
+  printf("After radix sort:\n");
+  for(i=0; i < (int) (sizeof a/sizeof a[0]); ++i)
+    printf(" %u", a[i]);
+  putchar('\n');
+
+  srand (iseed);
+  lr = malloc(sizeof(Uint)*len);
+  lr2 = malloc(sizeof(Uint)*len);
+  lr3 = malloc(sizeof(Uint)*len);
+  if (lr == NULL || lr2 == NULL || lr3 == NULL) {
+    fprintf(stderr, "could not allocate 3 x %d keys\n", len);
+    free(lr); free(lr2); free(lr3);
+    return EXIT_FAILURE;
+  }
+  for(i=0; i < len; i++) {
+    lr[i] = rand()%range;
+    lr2[i] = lr[i];
+    lr3[i] = lr[i];
+  }
+
+  time (&startmatch);
+  bl_radixSort(NULL, lr, sizeof(Uint), len, uintkey, 16);
+  time (&endmatch);
+  printf("sorting took %f seconds\n", difftime (endmatch, startmatch));
+
+  time (&startmatch);
+  bl_radixSortKeyFirst(NULL, lr2, sizeof(Uint), len, 16);
+  time (&endmatch);
+  printf("sorting took %f seconds\n", difftime (endmatch, startmatch));
+
+  time (&startmatch);
+  bl_radixSortUint(NULL, lr3, len, 16);
+  time (&endmatch);
+  printf("sorting took %f seconds\n", difftime (endmatch, startmatch));
+
+  for(i=1; i < len; i++) {
+    if(lr[i-1]>lr[i]) {
+      printf("lr[%d]=%u > lr[%d]=%u\n", i-1, (unsigned int) lr[i-1], i, (unsigned int) lr[i]);
+      free(lr); free(lr2); free(lr3);
+      return EXIT_FAILURE;   /* a mis-sorted result must not look like success */
+    }
+    assert(lr[i] == lr2[i]);
+    assert(lr[i] == lr3[i]);
+  }
+  free(lr); free(lr2); free(lr3);
+  return 0;
+}
+
diff --git a/segemehl/src/rsorter.h b/segemehl/src/rsorter.h
new file mode 100644
index 0000000..f8ceb44
--- /dev/null
+++ b/segemehl/src/rsorter.h
@@ -0,0 +1,9 @@
+/*
+ * rsorter.h
+ * segemehl
+ *
+ * Created by Steve Hoffmann on 08.02.10.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ *
+ */
+
diff --git a/segemehl/src/segemehl.c b/segemehl/src/segemehl.c
new file mode 100644
index 0000000..b0f9698
--- /dev/null
+++ b/segemehl/src/segemehl.c
@@ -0,0 +1,1023 @@
+
+/*
+ * segemehl.c
+ * segemehl
+ *
+ * @author Steve Hoffmann
+ * @email steve at bioinf.uni-leipzig.de
+ * @date 07/10/2007 02:50:57 PM CEST
+ *
+ * Revision of last commit:
+ * $Rev: 103 $
+ * $Author: steve $
+ * $Date: 2008-12-10 15:18:18 +0100 (Wed, 10 Dec 2008) $
+ *
+ *
+ * $Id: segemehl.c 103 2008-12-10 14:18:18Z steve $
+ * $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/src/segemehl.c $
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <getopt.h>
+#include <limits.h>
+#include <float.h>
+#include <math.h>
+#include "memory.h"
+#include "biofiles.h"
+#include "fileio.h"
+#include "stringutils.h"
+#include "charsequence.h"
+#include "multicharseq.h"
+#include "sufarray.h"
+#include "mmchar.h"
+#include "mathematics.h"
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/times.h>
+#include "vtprogressbar.h"
+#include "manout.h"
+#include <time.h>
+#include "sort.h"
+#include "list.h"
+#include "biofiles.h"
+#include "kdmatch.h"
+#include "debug.h"
+#include "info.h"
+#include <pthread.h>
+#include "citation.h"
+#include "kdseed.h"
+#include "manopt.h"
+#include "segemehl.h"
+#include "manout.h"
+#include "fileBins.h"
+#include "seqclip.h"
+#include "iupac.h"
+#include "merge.h"
+#ifdef HASHING
+#include "hash.h"
+#endif
+
+pthread_mutex_t updatemtx; /* checkclock polls this with pthread_mutex_trylock and stops redrawing once it gets the lock */
+unsigned char mute=0; /* NOTE(review): not referenced in the code visible here; se_updateProgressBar reads nfo->mute instead */
+
+void*
+checkclock(void *args) {
+  checkthreadinfo_t *clk = (checkthreadinfo_t*) args;
+
+  /* wait two seconds before the bar is set up */
+  sleep(2);
+  cursorVisible();
+  initProgressBarVT();
+
+  /* redraw until updatemtx can be acquired; it is not released here */
+  for (;;) {
+    if (pthread_mutex_trylock(&updatemtx) == 0) break;
+    progressBarVT("reads matched.", clk->noofseqs, (*clk->counter), 25);
+  }
+  cursorVisible(); fprintf(stderr, "\n");
+  return NULL;
+}
+
+
+/*--------------------------- se_updateProgressBar ---------------------------
+ *
+ * @brief keeping the user informed ... somehow
+ * @author Steve Hoffmann
+ *
+ */
+
+void
+se_updateProgressBar(Uint k, segemehl_t *nfo) {
+  /* muted: neither draw nor count */
+  if (nfo->mute) return;
+
+  if (nfo->counter != NULL) {
+    /* a counter is attached: only increment it instead of drawing */
+    (*nfo->counter)++;
+    return;
+  }
+  progressBarVT("reads matched.", nfo->reads->noofseqs, k, 25);
+}
+
+void*
+kdseedworker(void *args) {
+  /* pthread-style start routine: args is the segemehl_t of this worker */
+  segemehl_t *job = (segemehl_t*) args;
+
+  se_kdGenomeMatch(job->space, job->sarray, job->reads, job);
+  return NULL;
+}
+
+
+int main(int argc, char** argv) {
+
+ segemehl_t info, *th_info;
+ manopt_arg *unflagged;
+ manopt_optionset optset;
+ manopt_intconstraint threadconstraint;
+ manopt_intconstraint accuracyconstraint;
+ manopt_intconstraint jumpconstraint;
+ manopt_intconstraint bisulfiteconstraint;
+
+ int *space = NULL;
+ int i = 0, j, k, qfbaselen = 0, ch=0;
+ Uint filebinbasenamelen=0, //splitfilebasenamelen=0,
+ clipseqlen3=0, *dmlen=NULL, dmno=0, desclen, headerlen;
+ char oldch, newch, *qfbase, *splitfilebasename = NULL, *filebinbasename=NULL,
+ *version, *clipseq3=NULL, **dms, **header, *desc, *adapter=NULL;
+ unsigned int *suflinktable,
+ counter=0;
+ unsigned char index = 0,
+ brief = 0;
+
+ bl_fileBinDomains_t* bins;
+
+ double difsuf,
+ difmatch;
+ time_t startsuf, endsuf;
+ time_t startmatch, endmatch;
+
+ fasta_t **chopsuey,
+ **reads;
+ pthread_t *threads;
+ pthread_t clockthread;
+ checkthreadinfo_t ch_info;
+ manopt_arg *dbfilenames;
+ threadconstraint.max = 3000;
+ threadconstraint.min = 1;
+ accuracyconstraint.max = 100;
+ accuracyconstraint.min = 0;
+ jumpconstraint.max = INT_MAX;
+ jumpconstraint.min = 0;
+ bisulfiteconstraint.min = 1;
+ bisulfiteconstraint.max = 6;
+
+ se_setdefault(&info);
+ info.cmdline = malloc(strlen(argv[0])+2);
+ strcpy(info.cmdline, argv[0]);
+
+ for(i = 1; i < argc; i++) {
+ info.cmdline = realloc(info.cmdline, strlen(info.cmdline) +
+ strlen(argv[i]) + 10);
+ strcat(info.cmdline," ");
+ strcat(info.cmdline,argv[i]);
+ }
+
+ version = getNiceSVNVersion(VERSION);
+ manopt_initoptionset(&optset, argv[0], NULL,
+ "Heuristic mapping of short sequences\n",
+ "SEGEMEHL is free software for non-commercial use \n (C) 2008 Bioinformatik Leipzig\n",
+ version,
+ "Please report bugs to steve at bioinf.uni-leipzig.de");
+
+ manopt_blockseparator(&optset, "INPUT");
+ manopt(&optset, LISTOPT, 1, 'd', "database",
+ "list of path/filename(s) of database sequence(s)", "<file> [<file>]",
+ NULL, NULL);
+ manopt(&optset, STRINGOPT, 0, 'q', "query",
+ "path/filename of query sequences", "<file>", NULL, &info.queryfilename);
+ manopt(&optset, STRINGOPT, 0, 'p', "mate",
+ "path/filename of mate pair sequences", "<file>", NULL, &info.matefilename);
+ manopt(&optset, REQSTRINGOPT, 0, 'i', "index",
+ "path/filename of db index", "<file>", NULL, &info.idxfilename);
+ manopt(&optset, REQSTRINGOPT, 0, 'j', "index2",
+ "path/filename of second db index", "<file>", NULL, &info.idx2filename);
+ manopt(&optset, REQSTRINGOPT, 0, 'x', "generate",
+ "generate db index and store to disk", "<file>", NULL, &info.idxfilename);
+ manopt(&optset, REQSTRINGOPT, 0, 'y', "generate2",
+ "generate second db index and store to disk", "<file>", NULL, &info.idx2filename);
+ manopt(&optset, STRINGOPT, 0, 'B', "filebins",
+ "file bins with basename <string> for easier data handling",
+ "<string>", NULL, &info.filebinbasename);
+ /*manopt(&optset, REQUINTOPT, 0, 'F', "bisulfite",
+ "bisulfite mapping with methylC-seq/Lister et al. (=1) or bs-seq/Cokus et al. protocol (=2), PAR-CLIP with 4SU (=3) or with 6SG (=4)", "<n>",
+ &bisulfiteconstraint, &info.bisulfiteprotocol);
+ */
+ manopt(&optset, REQUINTOPT, 0, 'F', "bisulfite",
+ "bisulfite mapping with methylC-seq/Lister et al. (=1) or bs-seq/Cokus et al. protocol (=2)", "<n>",
+ &bisulfiteconstraint, &info.bisulfiteprotocol);
+
+ manopt_blockseparator(&optset, "GENERAL");
+ manopt(&optset, REQUINTOPT, 0, 'm', "minsize",
+ "minimum size of queries", "<n>", NULL, &info.minsize);
+ manopt(&optset, FLAG, 0, 's', "silent",
+ "shut up!", NULL, NULL, &info.mute);
+ manopt(&optset, FLAG, 0, 'b', "brief",
+ "brief output", NULL, NULL, &brief);
+ manopt(&optset, FLAG, 0, 'c', "checkidx",
+ "check index", NULL, NULL, &info.check);
+ manopt(&optset, REQUINTOPT, 0, 't', "threads",
+ "start <n> threads", "<n>", &threadconstraint, &info.threadno);
+ manopt(&optset, REQSTRINGOPT, 0, 'o', "outfile",
+ "outputfile", "<string>", NULL, &info.outfile);
+ manopt(&optset, REQSTRINGOPT, 0, 'u', "nomatchfilename",
+ "filename for unmatched reads", "<file>", NULL, &info.nomatchname);
+ manopt_blockseparator(&optset, "SEEDPARAMS");
+ manopt(&optset, REQUINTOPT, 0, 'D', "differences",
+ "search seeds initially with <n> differences", "<n>", NULL, &info.k_p);
+ manopt(&optset, REQUINTOPT, 0, 'J', "jump",
+ "search seeds with jump size <n> (0=automatic)", "<n>", &jumpconstraint, &info.jump);
+ manopt(&optset, REQDBLOPT, 0, 'E', "evalue",
+ "max evalue", "<double>", NULL, &info.maxevalue);
+ manopt(&optset, REQDBLOPT, 0, 'w', "maxsplitevalue",
+ "max evalue for splits", "<double>", NULL, &info.maxsplitevalue);
+ manopt(&optset, REQUINTOPT, 0, 'M', "maxinterval",
+ "maximum width of a suffix array interval, i.e. a query seed will be omitted if it matches more than <n> times", "<n>", NULL, &info.M);
+ manopt(&optset, REQUINTOPT, 0, 'r', "maxout",
+ "maximum number of alignments that will be reported. If set to zero, all alignments will be reported", "<n>", NULL, &info.maxout);
+ manopt(&optset, STRINGOPT, 0, 'S', "splits",
+ "detect split/spliced reads", NULL, NULL, &info.splitfilebasename);
+ manopt(&optset, FLAG, 0, 'K', "SEGEMEHL",
+ "output SEGEMEHL format (needs to be selected for brief)", NULL, NULL, &info.sam);
+ manopt(&optset, FLAG, 0, 'V', "MEOP",
+ "output MEOP field for easier variance calling in SAM (XE:Z:)", NULL, NULL, &info.SAMmeop);
+
+ manopt(&optset, FLAG, 0, 0, "nohead",
+ "do not output header", NULL, NULL, &info.nohead);
+ /*manopt(&optset, FLAG, 0, 'Z', "PAIR",
+ "output pairstatus flag XA:Z: field in SAM", NULL, NULL, &info.SAMpairstat);
+ */
+ manopt_blockseparator(&optset, "SEEDEXTENSIONPARAMS");
+ manopt(&optset, REQUINTOPT, 0, 'e', "extensionscore",
+ "score of a match during extension", "<n>", NULL, &info.s_ext);
+ manopt(&optset, REQUINTOPT, 0, 'n', "extensionpenalty",
+ "penalty for a mismatch during extension", "<n>", NULL, &info.p_mis);
+ manopt(&optset, REQUINTOPT, 0, 'X', "dropoff",
+ "dropoff parameter for extension", "<n>", NULL, &info.Xoff);
+ manopt_blockseparator(&optset, "ALIGNPARAMS");
+ manopt(&optset, REQUINTOPT, 0, 'A', "accuracy",
+ "min percentage of matches per read in semi-global alignment", "<n>", &accuracyconstraint, &info.accuracy);
+ manopt(&optset, REQUINTOPT, 0, 'W', "minsplicecover",
+ "min coverage for spliced transcripts", "<n>", &accuracyconstraint, &info.minsplicedaligncover);
+ manopt(&optset, REQUINTOPT, 0, 'U', "minfragscore",
+ "min score of a spliced fragment", "<n>", NULL, &info.minfragmentalignscore);
+ manopt(&optset, REQUINTOPT, 0, 'Z', "minfraglen",
+ "min length of a spliced fragment", "<n>", NULL, &info.minfragmentalignlen);
+ manopt(&optset, REQDBLOPT, 0, 'l', "splicescorescale",
+ "report spliced alignment with score s only if <f>*s is larger than next best spliced alignment", "<f>", NULL, &info.chainscorescale);
+ manopt(&optset, REQUINTOPT, 0, 'H', "hitstrategy",
+ "report only best scoring hits (=1) or all (=0)", NULL, NULL, &info.bestonly);
+ manopt(&optset, FLAG, 0, 0, "showalign",
+ "show alignments", NULL, NULL, &info.align);
+ manopt(&optset, REQSTRINGOPT, 0, 'P', "prime5",
+ "add 5' adapter", "<string>", NULL, &info.softclip5Prime);
+ manopt(&optset, REQSTRINGOPT, 0, 'Q', "prime3",
+ "add 3' adapter", "<string>", NULL, &info.softclip3Prime);
+ manopt(&optset, REQINTOPT, 0, 'R', "clipacc",
+ "clipping accuracy", "<n>", NULL, &info.clipacc);
+ manopt(&optset, FLAG, 0, 'T', "polyA",
+ "clip polyA tail", NULL, NULL, &info.polyA);
+ manopt(&optset, FLAG, 0, 'Y', "autoclip",
+ "autoclip unknown 3prime adapter", NULL, NULL, &info.autoclip);
+ manopt(&optset, FLAG, 0, 'C', "hardclip",
+ "enable hard clipping", NULL, NULL, &info.hardclip);
+
+ manopt(&optset, FLAG, 0, 'O', "order",
+ "sorts the output by chromsome and position (might take a while!)",
+ "<n>", NULL, &info.order);
+ manopt(&optset, REQINTOPT, 0, 'I', "maxinsertsize",
+ "maximum size of the inserts (paired end)", "<n>", NULL, &info.maxinsertsize);
+ unflagged = manopt_getopts(&optset, argc, argv);
+
+ if (!manopt_isset(&optset, 'F', "bisulfite")){
+ index = manopt_isset(&optset, 'x', NULL);
+
+ if(!(!manopt_isset(&optset, 'i', NULL) ^ !manopt_isset(&optset, 'x', NULL))) {
+ manopt_help(&optset, "please give index filename using -i XOR -x option\n");
+ } else if(unflagged->noofvalues > 1) {
+ manopt_help(&optset, "unknown argument(s)\n");
+ }
+ }
+ else {
+ if (manopt_isset(&optset, 'i', NULL) && manopt_isset(&optset, 'x', NULL)){
+ manopt_help(&optset, "please give C/T index filename using -i XOR -x option\n");
+ }
+ if (manopt_isset(&optset, 'j', NULL) && manopt_isset(&optset, 'y', NULL)){
+ manopt_help(&optset, "please give G/A index filename using -j XOR -y option\n");
+
+ }
+ if (!manopt_isset(&optset, 'i', NULL) && !manopt_isset(&optset, 'x', NULL) &&
+ !manopt_isset(&optset, 'j', NULL) && !manopt_isset(&optset, 'y', NULL)){
+ manopt_help(&optset, "please give C/T and/or G/A index filename using (-i XOR -x) AND/OR (-j XOR -y) option\n");
+ } else if(unflagged->noofvalues > 1) {
+ manopt_help(&optset, "unknown argument(s)\n");
+ }
+
+ if (manopt_isset(&optset, 'q', "query")){
+ if ((manopt_isset(&optset, 'i', NULL) ^ manopt_isset(&optset, 'x', NULL)) &&
+ (manopt_isset(&optset, 'j', NULL) ^ manopt_isset(&optset, 'y', NULL))){
+ info.bisulfitemerging = 1;
+
+
+ if (!manopt_isset(&optset, 'o', "outfile") && !manopt_isset(&optset, 'B', "filebins")){
+ manopt_help(&optset, "bisulfite mapping with two runs may not be used \
+ when output is dumped to stdout.\n");
+ }
+ }
+ if (info.nomatchname != NULL){
+ NFO("Warning: file with unmapped reads may contain reads that are mapped \
+ in one but not in both matching runs.\n", NULL);
+ }
+ /* only required with bisulfite merging but necessary for subsequent analyses in any case */
+ if (manopt_isset(&optset, 'K', "SEGEMEHL")){
+ manopt_help(&optset, "please use SAM format with bisulfite mapping\n");
+ }
+
+ if (manopt_isset(&optset, 'S', "splits")){
+ manopt_help(&optset, "split alignments not yet supported with bisulfite mapping\n");
+ }
+ }
+ }
+
+ if(manopt_isset(&optset, 'U', "minfragscore")){
+ info.minsplicedalignscore = 2*info.minfragmentalignscore;
+ }
+
+ if(manopt_isset(&optset, 'b', "brief")) {
+ info.rep_type = 5;
+ }
+
+ if(manopt_isset(&optset, 'K', "SAM")) {
+ //default
+ } else {
+ info.rep_type = 15;
+ }
+
+ if(manopt_isset(&optset, 'O', "order") && (
+ !manopt_isset(&optset, 'o', "outfile") &&
+ !manopt_isset(&optset, 'B', "filebins"))) {
+ manopt_help(&optset, "option -O, --order may not be used when output is dumped to stdout!\n");
+ }
+
+ if(info.nomatchname != NULL) {
+ info.nomatchdev = fopen(info.nomatchname, "w");
+ if(info.nomatchdev == NULL) {
+ manopt_help(&optset, "could not open file for unmapped reads. Writing privileges set?\n");
+ }
+ }
+
+
+ if(info.queryfilename) {
+ NFO("reading queries in '%s'.\n", info.queryfilename);
+
+#ifdef HASHING
+ info.index = 1;
+#endif
+
+ if(info.index) {
+ info.reads = bl_fastxGetSet(space,
+ &info.queryfilename, 1, 1, 0, 1, info.threadno);
+ } else {
+
+ info.reads = bl_fastxRead(space,
+ info.reads, info.queryfilename, 1, 0, 0, 0, 0, bl_fastxAdd);
+ }
+ NFO("%d query sequences found.\n", info.reads->noofseqs);
+
+#ifdef HASHING
+ if (!info.index){
+ MSG("Hashing without fasta index\n");
+ bl_fastxGetTags(space, info.reads);
+ }
+ else {
+ MSG("Hashing with fasta index\n");
+ bl_fastxGetTags3(space, info.reads);
+ }
+ exit(-1);
+#endif
+
+ if (info.threadno > info.reads->noofseqs) {
+ NFO("more threads than queries. Exit forced\n", NULL);
+ exit(EXIT_FAILURE);
+ }
+
+
+ if(info.reads->noofseqs < 50 && info.autoclip) {
+ NFO("A minimum of 50 queries is reccommended for autoclip.\n", info.reads->noofseqs);
+ MSG("Do you want to proceed with autoclip? (y/n): ");
+ while((ch=getchar()) != 'n' && ch != 'y');
+ if(ch == 'n') {
+ MSG("Do you want to proceed without clipping? (y/n): ");
+ while((ch=getchar()) != 'n' && ch != 'y');
+ if(ch == 'n') exit(EXIT_SUCCESS);
+ else info.autoclip = 0;
+ }
+ }
+
+ if(info.autoclip) {
+ adapter = bl_seqclipFind3Prime(space, info.reads, 100000, 40, 10);
+ NFO("found adapter sequence: '%s'\n", adapter);
+ MSG("Do you want to clip? (y/n): ");
+ while((ch=getchar()) != 'n' && ch != 'y');
+ if(ch == 'n') {
+ MSG("Do you want to proceed without clipping? (y/n): ");
+ while((ch=getchar()) != 'n' && ch != 'y');
+ if(ch == 'n') exit(EXIT_SUCCESS);
+ } else {
+ info.softclip3Prime = adapter;
+ }
+ }
+ }
+
+ if(info.softclip3Prime) {
+ info.softclip3PrimeLen = strlen(info.softclip3Prime);
+ }
+
+ if(info.softclip5Prime) {
+ info.softclip5PrimeLen = strlen(info.softclip5Prime);
+ }
+
+ if(info.queryfilename && info.matefilename) {
+ NFO("reading mates in '%s'.\n", info.matefilename);
+
+ if (info.index) {
+ info.reads = bl_fastxGetMateSet(space, info.reads,
+ &info.matefilename, 1, 1, 0, 1, info.threadno);
+ } else {
+ info.reads =
+ bl_fastxRead(space, info.reads,
+ info.matefilename, 1, 0, 0, 0, 0, bl_fastxAddMate);
+ }
+ NFO("%d mate sequences found.\n", info.reads->noofseqs);
+ }
+
+
+ oldch = newch = ' ';
+ for (k = 0; k < 2; k++){
+ /* reset counter variables */
+ info.counter = 0;
+ counter = 0;
+ info.totallength = 0;
+
+ /* normal matching run */
+ if(!info.bisulfiteprotocol){
+ if (k == 1){
+ break;
+ }
+ initIUPAC(1, 1);
+ info.bisulfite = 0;
+ } else {
+ /* initialize bisulfite matching run */
+ if (k == 0){
+ initIUPAC(2, 1);
+
+ if (manopt_isset(&optset, 'i', NULL) ^ manopt_isset(&optset, 'x', NULL)){
+ info.bisulfiterun = 1;
+ }
+ else {
+ info.bisulfiterun = 2;
+ }
+
+ /* bisulfite binning in case of two matching runs */
+ if (info.bisulfitemerging){
+ /* create domain for each matching run with bins for each thread */
+ if (!info.filebinbasename) {
+ qfbase = bl_basename(info.queryfilename);
+ qfbaselen = strlen(qfbase);
+ filebinbasename = ALLOCMEMORY(space, NULL, char, qfbaselen);
+ memmove(filebinbasename, qfbase, bl_fileprefixlen(qfbase));
+ filebinbasename[bl_fileprefixlen(qfbase)] = 0;
+ info.filebinbasename = filebinbasename;
+ }
+ filebinbasenamelen = strlen(info.filebinbasename);
+
+ NFO("creating bisulfite bins.\n", NULL);
+ info.bins = se_createBisulfiteBins(space, 2, info.threadno, info.filebinbasename, filebinbasenamelen);
+
+ if(info.bins == NULL) {
+ NFO("Could not create bisulfite bins %s*! Exit forced.\n",
+ info.filebinbasename);
+ exit(-1);
+ }
+ }
+ } else {
+ if (manopt_isset(&optset, 'i', NULL) ^ manopt_isset(&optset, 'x', NULL) &&
+ manopt_isset(&optset, 'j', NULL) ^ manopt_isset(&optset, 'y', NULL)){
+ info.bisulfiterun = 2;
+ /* cleanup */
+ destructMultiCharSeq(space, info.seq);
+ bl_fastaDestruct(space, info.fasta);
+ FREEMEMORY(space, info.fasta);
+ }
+ else {
+ break;
+ }
+ }
+
+ if (info.bisulfiterun == 1){
+ oldch = 'C';
+ newch = 'T';
+ index = manopt_isset(&optset, 'x', NULL);
+ } else if (info.bisulfiterun == 2){
+ oldch = 'G';
+ newch = 'A';
+ info.idxfilename = info.idx2filename;
+ index = manopt_isset(&optset, 'y', NULL);
+ }
+ /*
+ * set conversion accordingly to run
+ * in bisulfite and PARCLIP with 4SG
+ * info.bisulfite = 1 in run 1
+ * info.bisulfite = 2 in run 2
+ */
+ info.bisulfite = info.bisulfiterun;
+ /*
+ * adjustment of conversion in PAR-CLIP with 4SU:
+ * info.bisulfite = 3 in run 1
+ * info.bisulfite = 4 in run 2
+ */
+ if (info.bisulfiteprotocol == 3){
+ info.bisulfite = info.bisulfiterun + 2;
+ }
+
+ /*
+ * set strand accordingly in bisulfite with
+ * Lister et al.'s protocol and PARCLIP with 4SU
+ * info.strand == 1 (plus) in run 1
+ * info.strand == 2 (minus) in run 2
+ */
+ if (info.bisulfiteprotocol == 1 ||
+ info.bisulfiteprotocol == 3){
+ info.strand = info.bisulfiterun;
+ }
+ /*
+ * adjustment to PAR-CLIP with 4SG:
+ * info.strand == 2 (minus) in run 1
+ * info.strand == 1 (plus) in run 2
+ */
+ if (info.bisulfiteprotocol == 4){
+ info.strand = 3 - info.bisulfiterun;
+ }
+
+ //NFO("info.bisulfiteprotocol=%d\tinfo.bisulfite=%d\tinfo.strand=%d\tseedconv:%c->%c\n",
+ // info.bisulfiteprotocol, info.bisulfite, info.strand, oldch, newch);
+ //NFO("bisulfite/parclip mapping run %d\n", info.bisulfiterun, oldch, newch);
+ }
+
+ MSG("reading database sequences.\n");
+
+ dbfilenames = manopt_getarg(&optset, 'd', "database");
+ info.fasta = bl_fastxGetSet(space, dbfilenames->values,
+ dbfilenames->noofvalues, 1, 0, 0, 1);
+
+ NFO("%d database sequences found.\n", info.fasta->noofseqs);
+ for(i=0; i < info.fasta->noofseqs; i++) {
+ info.totallength += bl_fastaGetSequenceLength(info.fasta, i);
+ }
+
+ for(i=0; i < info.fasta->noofseqs; i++) {
+ desclen = bl_fastaGetDescriptionLength(info.fasta, i);
+ desc = strclip(space, bl_fastaGetDescription(info.fasta, i), &desclen);
+ FREEMEMORY(space, info.fasta->seqs[i]->description);
+ info.fasta->seqs[i]->description = desc;
+ info.fasta->seqs[i]->descrlen = desclen;
+ }
+
+ NFO("total length of db sequences: %u\n", info.totallength);
+
+ if (info.bisulfiteprotocol){
+ info.seq = concatCharSequences(space, info.fasta->seqs, info.fasta->noofseqs, (char)126, (char)127);
+
+ /* character conversion */
+ for (i=0; i < info.fasta->noofseqs; i++){
+ strconvert(bl_fastaGetSequence(info.fasta, i),
+ bl_fastaGetSequenceLength(info.fasta, i), oldch, newch);
+ }
+ }
+
+ if (!info.bisulfitemerging && !info.bins &&
+ manopt_isset(&optset, 'B', "filebins") &&
+ manopt_isset(&optset, 'q', "query")) {
+
+ if (!info.filebinbasename) {
+ qfbase = bl_basename(info.queryfilename);
+ qfbaselen = strlen(qfbase);
+ filebinbasename = ALLOCMEMORY(space, NULL, char, qfbaselen);
+ memmove(filebinbasename, qfbase, bl_fileprefixlen(qfbase));
+ filebinbasename[bl_fileprefixlen(qfbase)] = 0;
+ info.filebinbasename = filebinbasename;
+ }
+
+ filebinbasenamelen = strlen(info.filebinbasename);
+ info.bins = se_createChromDomains(space, info.fasta, 500, 500,
+ info.filebinbasename, filebinbasenamelen);
+
+ if(info.bins == NULL) {
+ NFO("Could not create bins %s*! Try w/o binning! Exit forced.\n",
+ info.filebinbasename);
+ exit(-1);
+ }
+ }
+
+
+ if(manopt_isset(&optset, 'S', "splits") &&
+ manopt_isset(&optset, 'q', "query")) {
+ /*
+ if (!info.splitfilebasename) {
+ qfbase = bl_basename(info.queryfilename);
+ qfbaselen = strlen(qfbase);
+ splitfilebasename = ALLOCMEMORY(space, NULL, char, qfbaselen+5);
+ memset(splitfilebasename, 0, qfbaselen+5);
+ memmove(splitfilebasename, qfbase, bl_fileprefixlen(qfbase));
+ memmove(&splitfilebasename[bl_fileprefixlen(qfbase)], ".spl", 4);
+ info.splitfilebasename = splitfilebasename;
+ }
+
+ if(!manopt_isset(&optset, 'B', "filebins")) {
+ info.splitdev = fopen(info.splitfilebasename, "w");
+ } else {
+ splitfilebasenamelen = strlen(info.splitfilebasename);
+ info.splitbins = se_createChromDomains(space, info.fasta, 150, 150,
+ info.splitfilebasename, splitfilebasenamelen);
+
+ if(info.splitbins == NULL) {
+ NFO("Could not create splitbins %s*! Try w/o binning! Exit forced.\n",
+ info.splitfilebasename);
+ exit(-1);
+ }
+ }
+ */
+ info.split = 1;
+ }
+
+ if(index) {
+ time (&startsuf);
+ info.sarray = constructSufArr(space, info.fasta->seqs,
+ info.fasta->noofseqs, NULL, mute);
+
+ for(i=0; i < info.fasta->noofseqs; i++) {
+ FREEMEMORY(space, info.fasta->seqs[i]->sequence);
+ info.fasta->seqs[i]->sequence = NULL;
+ }
+
+ if (info.check) {
+ NFO("checking suffixarray %s\n", info.idxfilename);
+ for(i=1; i < info.sarray->numofsuffixes-1; i++) {
+ if(!mute) {
+ progressBarVT("suffixes checked.", info.sarray->numofsuffixes, i, 25);
+ }
+ if (strcmp(&info.sarray->seq->sequences[info.sarray->suftab[i-1]],
+ &info.sarray->seq->sequences[info.sarray->suftab[i]]) > 0) {
+ NFO("suffix array '%s' corrupted! Exit forced.\n", info.idxfilename);
+ exit(-1);
+ }
+ }
+ }
+
+ MSG("constructing lcp.\n");
+ constructLcp(space, info.sarray);
+ MSG("deleting inv_suftab\n");
+ FREEMEMORY(space, info.sarray->inv_suftab);
+ info.sarray->inv_suftab = NULL;
+
+ MSG("constructing child tab.\n");
+ constructchildtab(space, info.sarray);
+ MSG("constructing suffix links.\n");
+ MSG("constructing id.\n");
+ computeId(space, info.sarray);
+ MSG("constructing suflinks - bottom up.\n");
+ suflinktable = getsufsucc(space, info.sarray);
+ MSG("constructing suflinks - top down.\n");
+ constructsuflinks(space, info.sarray, suflinktable);
+ FREEMEMORY(space, suflinktable);
+ time (&endsuf);
+ difsuf = difftime (endsuf, startsuf);
+ NFO("building the suffix array has taken %f seconds.\n", difsuf);
+ NFO("total length of suffix array was %u.\n", info.totallength);
+
+ } else {
+
+ time (&startsuf);
+ NFO("reading suffix array '%s' from disk.\n", info.idxfilename);
+ info.sarray=readSuffixarray(space, info.idxfilename, info.fasta->seqs,
+ info.fasta->noofseqs, mute);
+
+ for(i=0; i < info.fasta->noofseqs; i++) {
+ FREEMEMORY(space, info.fasta->seqs[i]->sequence);
+ info.fasta->seqs[i]->sequence = NULL;
+ }
+
+ time (&endsuf);
+ difsuf = difftime (endsuf, startsuf);
+ NFO("reading the suffix array has taken %f seconds.\n", difsuf);
+ }
+
+ if (info.check) {
+ NFO("checking suffixarray %s\n", info.idxfilename);
+ for(i=1; i < info.sarray->numofsuffixes-1; i++) {
+ if(!mute) {
+ progressBarVT("suffixes checked.", info.sarray->numofsuffixes, i, 25);
+ }
+ if (strcmp(&info.sarray->seq->sequences[info.sarray->suftab[i-1]],
+ &info.sarray->seq->sequences[info.sarray->suftab[i]]) > 0) {
+ NFO("suffix array '%s' corrupted! Exit forced.\n", info.idxfilename);
+ exit(-1);
+ }
+ }
+ checksuflinks(info.sarray, 0, info.sarray->numofsuffixes-1);
+ }
+
+ if(index && info.idxfilename) {
+ NFO("writing suffix array '%s' to disk.\n", info.idxfilename);
+ writeSuffixarray(info.sarray, info.idxfilename);
+ }
+
+ if(info.queryfilename) {
+
+ if (!info.bisulfiteprotocol)
+ info.seq = info.sarray->seq;
+
+ if(!info.bins && k == 0)
+ se_registerOutputDevice(space, &info);
+
+
+ if (info.polyA) {
+ info.polyAlen = MIN(80, info.reads->maxlen);
+ clipseq3 = ALLOCMEMORY(space, NULL, char, info.polyAlen+1);
+ memset(&clipseq3[0], 'A', info.polyAlen);
+ clipseqlen3 = info.polyAlen;
+ clipseq3[info.polyAlen] = 0;
+ info.minclipscr3 = 5;
+ }
+
+ if(info.softclip3Prime) {
+ clipseqlen3 += info.softclip3PrimeLen;
+ clipseq3 = ALLOCMEMORY(space, clipseq3, char, clipseqlen3 +1);
+ memmove(&clipseq3[info.polyAlen], info.softclip3Prime, info.softclip3PrimeLen);
+ clipseq3[clipseqlen3] = 0;
+ //info.minclipscr3 = floor((((float)info.softclip3PrimeLen)*info.clipacc)/100.);
+ info.minclipscr3 = 5;
+ }
+
+
+ // fprintf(stderr, "clipseq:%s\n", clipseq3);
+
+ info.softclip3Prime = clipseq3;
+ info.softclip3PrimeLen = clipseqlen3;
+
+
+ if(info.softclip5Prime) {
+ info.minclipscr5 = floor((((float)info.softclip5PrimeLen)*info.clipacc)/100.);
+ }
+
+ if (info.threadno > 1) {
+
+ info.counter=&counter;
+ NFO("starting %d threads.\n", info.threadno);
+
+ if(info.index) {
+ chopsuey = bl_fastxChopIndex(space, info.reads, info.threadno);
+ } else {
+ chopsuey = bl_fastaChop(space, info.reads, info.threadno);
+ }
+
+ th_info = ALLOCMEMORY(space, NULL, segemehl_t, info.threadno);
+ threads = ALLOCMEMORY(space, NULL, pthread_t, info.threadno);
+ ch_info.noofseqs = info.reads->noofseqs;
+ ch_info.counter = &counter;
+
+ if (!mute && !info.mute) {
+ pthread_mutex_init(&updatemtx, NULL);
+ pthread_mutex_lock(&updatemtx);
+ pthread_create(&clockthread, NULL, checkclock, &ch_info);
+ }
+
+ for(i=0; i < info.threadno; i++) {
+ NFO("%d reads in thread %d.\n", chopsuey[i]->noofseqs, i);
+ }
+
+ time (&startmatch);
+
+ for(i=0; i < info.threadno; i++) {
+ memmove(&th_info[i], &info, sizeof(segemehl_t));
+ th_info[i].reads = chopsuey[i];
+ th_info[i].threadid = i;
+ pthread_create(&threads[i], NULL, kdseedworker, &th_info[i]);
+ }
+
+ for(i=0; i < info.threadno; i++) {
+ pthread_join(threads[i], NULL);
+ }
+
+ if(!mute && !info.mute) {
+ /*notification via mutex - why use signals?*/
+ pthread_mutex_unlock(&updatemtx);
+ pthread_join(clockthread, NULL);
+ }
+
+ fflush(info.dev);
+ time (&endmatch);
+ difmatch = difftime (endmatch, startmatch);
+ NFO("threaded matching w/ suffixarray has taken %f seconds.\n", difmatch);
+
+ for (i=0; i < info.threadno; i++) {
+ bl_fastxDestructSequence(space, chopsuey[i]);
+ bl_fastxDestructChunkIndex(space, chopsuey[i]);
+ FREEMEMORY(space, chopsuey[i]);
+ }
+
+ FREEMEMORY(space, chopsuey);
+ FREEMEMORY(space, th_info);
+ FREEMEMORY(space, threads);
+
+ } else {
+
+ time (&startmatch);
+ initProgressBarVT();
+ se_kdGenomeMatch(info.space, info.sarray, info.reads, &info);
+
+ time (&endmatch);
+ difmatch = difftime (endmatch, startmatch);
+ NFO("matching w/ suffixarray has taken %f seconds.\n", difmatch);
+ }
+ }
+
+ destructSufArr(space, info.sarray);
+ } /* END OF for (k = 0; k < 2; k++) */
+
+ /* merge thread-bins */
+ if (info.bisulfitemerging){
+
+ bl_fileBinDomainsCloseAll(info.bins);
+ bins = NULL;
+
+ /* if no chromosomal binning --> write to output file */
+ if (!manopt_isset(&optset, 'B', "filebins")){
+ se_registerOutputDevice(space, &info);
+ }
+ /* otherwise initialize and write to chromosome bins */
+ else {
+ bins = se_createChromDomains(space, info.fasta, 500, 500,
+ info.filebinbasename, filebinbasenamelen);
+ if(bins == NULL) {
+ NFO("Could not create bins %s*! Try w/o binning! Exit forced.\n",
+ info.filebinbasename);
+ exit(-1);
+ }
+ }
+ if(info.index) {
+ reads = bl_fastxChopIndex(space, info.reads, info.threadno);
+ } else {
+ reads = bl_fastaChop(space, info.reads, info.threadno);
+ }
+ se_mergeBisulfiteBins(space, info.bins, reads, info.dev, bins, 1, info.bestonly);
+
+ for (i=0; i < info.threadno; i++) {
+ bl_fastxDestructSequence(space, reads[i]);
+ bl_fastxDestructChunkIndex(space, reads[i]);
+ FREEMEMORY(space, reads[i]);
+ }
+ FREEMEMORY(space, reads);
+
+ /* destruct bins */
+ bl_fileBinDomainsDestruct(space, info.bins);
+ FREEMEMORY(space, info.bins);
+
+ info.bins = bins;
+ }
+
+ if (info.outfile && !info.bins) {
+ fclose(info.dev);
+
+ if(info.order) {
+ NFO("Sorting file '%s'. Consider option '-B' for faster sorting!\n",
+ info.outfile);
+ /* read and store header information (until first '\n') */
+ header = ALLOCMEMORY(space, NULL, char*, 1);
+ se_storeHeader(space, info.outfile, header, &headerlen);
+ /*
+ * replace header by repeated string to appear again
+ * in first line after sort
+ * NOTE: assumes ascending order in sort (not -r) and
+ * sort order to consider not more than 20 columns
+ */
+ if (headerlen < 40){
+ MSG("Warning: header may not be sorted at begin of file.\n");
+ }
+ bl_freplacestr(info.outfile, "\000\t", 2, 29);
+
+ /* sort file */
+ bl_UnixSort(space, info.outfile, SORT[info.rep_type], SORTDELIM);
+
+ /* write header back to file */
+ bl_freplacestr(info.outfile, *header, headerlen, '\n');
+ FREEMEMORY(space, *header);
+ FREEMEMORY(space, header);
+ }
+ if (info.align && (info.order || info.bisulfitemerging)){
+ NFO("Expanding alignments in '%s'.\n", info.outfile);
+ bl_freplacearr(info.outfile, "\007","\n", 1, EOF);
+ }
+ bl_freplacearr(info.outfile, "\007\010","\n\t", 2, 29);
+ }
+
+ if(info.nomatchname != NULL)
+ fclose(info.nomatchdev);
+
+ // if(info.splitdev != NULL) fclose(info.splitdev);
+
+ if(info.bins) {
+ bl_fileBinDomainsCloseAll(info.bins);
+
+ if(info.order) {
+ bl_fileBinDomainsUnixSort(space, info.bins, SORTBIN[info.rep_type], SORTDELIM);
+ }
+
+ if(info.align && (info.order || info.bisulfitemerging)) {
+ MSG("Expanding alignments in all bins.\n");
+ for(i=0; i < info.bins->noofdomains; i++) {
+ for(j=0; j < info.bins->domain[i].bins.noofbins; j++) {
+ bl_freplacearr(info.bins->domain[i].bins.b[j].fname, "\007",
+ "\n", 1, EOF);
+ }
+ }
+ }
+
+ dmno = bl_fileBinsDomainsGetList(space, info.bins, &dms, &dmlen);
+ header = ALLOCMEMORY(space, NULL, char**, dmno);
+ if(info.rep_type == 15) {
+ for(i=0; i < dmno; i++) {
+ header[i]= se_SAMHeader(space, &dms[i], &dmlen[i], 1, NULL,
+ '\t', '\n', info.order);
+ }
+ } else {
+ for(i=0; i < dmno; i++) {
+ header[i]= se_defaultHeader(space, &info, '\t', '\n');
+ }
+ }
+
+ bl_fileBinDomainsMerge(space, info.bins, info.filebinbasename,
+ filebinbasenamelen, "mat", 3, header, 1);
+
+ for(i=0; i < dmno; i++) {
+ FREEMEMORY(space, header[i]);
+ }
+
+ FREEMEMORY(space, header);
+ FREEMEMORY(space, dms);
+ FREEMEMORY(space, dmlen);
+
+ bl_fileBinDomainsDestruct(space, info.bins);
+ FREEMEMORY(space, info.bins);
+
+ }
+
+ if(info.splitbins) {
+ /*
+ bl_fileBinDomainsCloseAll(info.splitbins);
+
+ if(info.order) {
+ MSG("sorting split reads\n");
+ bl_fileBinDomainsUnixSort(space, info.splitbins, "-k9,9n", SORTDELIM);
+ }
+
+ MSG("merging split reads\n");
+ bl_fileBinDomainsMerge(space, info.splitbins, info.splitfilebasename,
+ splitfilebasenamelen, "spl", 3, NULL, 1);
+
+ bl_fileBinDomainsDestruct(space, info.splitbins);
+ */
+ FREEMEMORY(space, info.splitbins);
+ }
+
+ bl_fastaDestruct(space, info.fasta);
+ FREEMEMORY(space, info.fasta);
+
+ if (info.bisulfiteprotocol)
+ destructMultiCharSeq(space, info.seq);
+
+ manopt_destructoptionset(&optset);
+ manopt_destructarg(unflagged);
+ free(unflagged);
+
+ if(adapter) {
+ FREEMEMORY(space, adapter);
+ }
+
+ if(info.softclip3Prime) {
+ FREEMEMORY(space, info.softclip3Prime);
+ }
+
+ if(info.queryfilename) {
+ bl_fastaDestruct(space, info.reads);
+ FREEMEMORY(space, info.reads);
+ }
+
+ if(filebinbasename)
+ FREEMEMORY(space, filebinbasename);
+
+ if(splitfilebasename)
+ FREEMEMORY(space, splitfilebasename);
+
+ FREEMEMORY(space, info.mtx);
+ FREEMEMORY(space, info.mtx2);
+ FREEMEMORY(space, info.mtx3);
+ FREEMEMORY(space, info.cmdline);
+ FREEMEMORY(space, version);
+
+ NFO("\nGoodbye.\n %s\n", citerand());
+ return EXIT_SUCCESS;
+
+}
diff --git a/segemehl/src/segemehl.h b/segemehl/src/segemehl.h
new file mode 100644
index 0000000..ea21c67
--- /dev/null
+++ b/segemehl/src/segemehl.h
@@ -0,0 +1,225 @@
+#ifndef SEGEMEHL_H
+#define SEGEMEHL_H
+
+/*
+ *
+ * segemehl.h
+ * declarations for threaded segemehl
+ *
+ * @author Steve Hoffmann, steve at bioinf.uni-leipzig.de
+ * @company Bioinformatics, University of Leipzig
+ * @date 01/02/2008 10:12:46 AM CET
+ *
+ * SVN
+ * Revision of last commit: $Rev: 101 $
+ * Author: $Author: steve $
+ * Date: $Date: 2008-12-08 02:29:27 +0100 (Mon, 08 Dec 2008) $
+ *
+ * Id: $Id: segemehl.h 101 2008-12-08 01:29:27Z steve $
+ * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/src/segemehl.h $
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "biofiles.h"
+#include "fileio.h"
+#include "stringutils.h"
+#include "charsequence.h"
+#include "multicharseq.h"
+#include "sufarray.h"
+#include "fileBins.h"
+#include <pthread.h>
+#include "version.h"
+#include "vtprogressbar.h"
+
+#define MAXFILEBINS 50
+
+typedef struct segemehl_s { /* option and run-state record; se_setdefault() assigns the defaults */
+ void *space; /* NULL by default; main() hands it to se_kdGenomeMatch() */
+ char *outfile; /* output file name, NULL by default */
+ char *queryfilename; /* file the query reads are loaded from */
+ char *matefilename; /* file the mate reads are loaded from */
+ char *idxfilename; /* suffix array file that main() reads or writes */
+ char *idx2filename; /* takes the place of idxfilename in bisulfite run 2 */
+ char *dbfilename;
+ char *nomatchname; /* file for unmapped reads; opened as nomatchdev */
+ char *matelessfilename;
+ char *softclip3Prime; /* 3' clip sequence; main() swaps in a polyA run plus this sequence */
+ char *softclip5Prime; /* 5' clip sequence */
+ char *filebinbasename; /* base name of the bin files; derived from the query file name if unset */
+ char *splitfilebasename;
+ FILE *dev; /* output stream, stdout by default */
+ FILE *nomatchdev; /* stream opened for writing on nomatchname */
+ FILE *splitdev;
+ bl_fileBinDomains_t* bins; /* bisulfite or chromosome bin domains, NULL when binning is off */
+ bl_fileBinDomains_t *splitbins;
+ int (*slct)(void *, void *); /* defaults to bl_fileBinsCClassSelect */
+ Suffixarray *sarray; /* suffix array, either constructed or read from idxfilename */
+ MultiCharSeq *seq; /* sarray->seq, or the concatenated db sequences in bisulfite mode */
+ fasta_t *reads; /* query (and mate) sequences */
+ fasta_t *fasta; /* database sequences */
+ char * cmdline; /* freed by main() on exit */
+ Uint hardclip;
+ Uint hardclip3Prime;
+ Uint hardclip5Prime;
+ Uint softclip3PrimeLen; /* strlen of softclip3Prime */
+ Uint softclip5PrimeLen; /* strlen of softclip5Prime */
+ Uint totallength; /* sum of all database sequence lengths */
+ Uint minsize;
+ Uint *mapmatches;
+ Uint *counter; /* points at main()'s counter in threaded runs */
+ Uint M;
+ Uint maxout;
+ Uint jump;
+ Uint s_ext;
+ Uint p_mis;
+ Uint Xoff;
+ Uint k;
+ Uint k_p;
+ Uint rep_type; /* output format; indexes SORT/SORTBIN, 15 selects the SAM header for bins */
+ Uint check; /* nonzero: main() verifies the order of the suffix array */
+ Uint kMis;
+ Uint threadno; /* number of matching threads */
+ Uint threadid; /* index of the worker thread that owns this copy */
+ Uint bestonly;
+ Uint maxinsertsize;
+ Uint polyAlen; /* length of the polyA clip run: MIN(80, reads->maxlen) */
+ Uint minclipscr3;
+ Uint minclipscr5;
+ Uint bisulfiterun; /* current bisulfite matching run: 1 or 2 */
+ Uint bisulfiteprotocol; /* 0 = normal matching; nonzero enables the bisulfite/PAR-CLIP runs */
+ Uint bisulfitemerging; /* nonzero: per-thread bisulfite bins are created and merged after matching */
+ Uint bisulfite; /* conversion mode of the current run (1 to 4) */
+ Uint strand; /* 1 = plus, 2 = minus */
+ Uint minfragmentalignlen;
+ int minfragmentalignscore;
+ Uint minsplicedaligncover;
+ int minsplicedalignscore; /* main() sets 2*minfragmentalignscore when -U is given */
+ unsigned char split; /* set to 1 for -S/--splits together with a query file */
+ unsigned char index; /* nonzero: queries are read with bl_fastxGetSet() instead of bl_fastxRead() */
+ unsigned char bining;
+ unsigned char matchingstat;
+ unsigned char align; /* with order or bisulfitemerging, alignments are expanded after matching */
+ unsigned char mute; /* nonzero: the progress clock thread is not started */
+ unsigned char oldversion;
+ unsigned char showMateless;
+ unsigned char polyA; /* nonzero: a polyA run is put in front of the 3' clip sequence */
+ unsigned char order; /* nonzero: the output is sorted after matching */
+ unsigned char sam;
+ unsigned char autoclip; /* nonzero: main() searches an adapter with bl_seqclipFind3Prime() */
+ unsigned char SAMmeop;
+ unsigned char SAMpairstat;
+ unsigned char nohead;
+ double maxevalue;
+ double maxsplitevalue;
+ int accuracy;
+ int clipacc; /* percentage used to derive minclipscr5 from softclip5PrimeLen */
+ Uint bedist;
+ Uint fusion;
+ double chainscorescale;
+ pthread_mutex_t *mtx; /* allocated and initialised in se_setdefault() */
+ pthread_mutex_t *mtx2; /* allocated and initialised in se_setdefault() */
+ pthread_mutex_t *mtx3; /* allocated and initialised in se_setdefault() */
+} segemehl_t;
+
+typedef struct checkthread_s { /* argument record main() passes to the checkclock thread */
+ Uint noofseqs; /* main() copies info.reads->noofseqs into it */
+ Uint *counter; /* main() points it at its local counter */
+} checkthreadinfo_t;
+
+/*
+ * Assign segemehl's built-in defaults to every member of *info so that no
+ * field is left indeterminate, and allocate and initialise the mutexes
+ * mtx, mtx2 and mtx3 (main() in segemehl.c frees them with FREEMEMORY).
+ * @param info record to overwrite; it is dereferenced without a NULL check
+ */
+inline static void
+se_setdefault(segemehl_t *info) {
+ info->bins = NULL;
+ info->split = 0;
+ info->splitbins = NULL;
+ info->bining = 0;
+ info->slct = bl_fileBinsCClassSelect;
+ info->dev = stdout;
+ info->idxfilename = info->idx2filename = NULL;
+ info->dbfilename = NULL;
+ info->queryfilename = NULL;
+ info->matefilename = info->matelessfilename = NULL; /* matelessfilename had no default */
+ info->splitfilebasename = NULL;
+ info->sarray = NULL;
+ info->maxout = 0;
+ info->nohead = 0;
+ info->seq = NULL;
+ info->fasta = NULL;
+ info->polyAlen = 0;
+ info->reads = NULL;
+ info->outfile = NULL;
+ info->totallength = 0;
+ info->counter = info->mapmatches = NULL; /* both are Uint pointers; mapmatches had no default */
+ info->minsize = 12;
+ info->k_p = 1;
+ info->index = 1; /* main() reads queries through bl_fastxGetSet() while this is set */
+ info->threadno = 1;
+ info->threadid = 0;
+ info->accuracy = 90;
+ info->clipacc = 70;
+ info->maxsplitevalue = 50;
+ info->M = 100;
+ info->jump = 0;
+ info->s_ext = 2;
+ info->p_mis = 4;
+ info->Xoff = 8;
+ info->rep_type = 12;
+ info->k = info->kMis = 0; /* k had no default */
+ info->mute = 0;
+ info->maxinsertsize = 5000;
+ info->matchingstat = 0;
+ info->bestonly = 1;
+ info->check = 0;
+ info->maxevalue = 5;
+ info->space = NULL;
+ info->nomatchname = NULL;
+ info->nomatchdev = NULL;
+ info->splitdev = NULL;
+ info->showMateless = 1;
+ info->bedist = 0;
+ info->autoclip = 0;
+ info->align = 0;
+ info->order = 0;
+ info->sam = 1;
+ info->bisulfiterun = 0;
+ info->bisulfiteprotocol = 0;
+ info->bisulfitemerging = 0;
+ info->bisulfite = 0;
+ info->strand = 0;
+ info->oldversion = 0;
+ info->fusion = 0;
+ info->hardclip = info->hardclip3Prime = info->hardclip5Prime = 0;
+ info->softclip3Prime = info->softclip5Prime = NULL;
+ info->filebinbasename = NULL;
+ info->polyA = 0;
+ info->softclip3PrimeLen = info->softclip5PrimeLen = 0;
+ info->minclipscr3 = info->minclipscr5 = 5;
+ info->SAMmeop = 0;
+ info->SAMpairstat = 1;
+ info->chainscorescale = 1.0;
+ info->minfragmentalignlen = 20;
+ info->minfragmentalignscore = 18;
+ info->minsplicedaligncover = 80;
+ info->cmdline = NULL;
+ info->minsplicedalignscore = 2*18; /* twice minfragmentalignscore, the ratio main() applies for -U */
+ info->mtx = ALLOCMEMORY(space, NULL, pthread_mutex_t, 1); /* NOTE(review): no 'space' is declared in this function; presumably ALLOCMEMORY discards its first argument - confirm */
+ info->mtx2 = ALLOCMEMORY(space, NULL, pthread_mutex_t, 1);
+ info->mtx3 = ALLOCMEMORY(space, NULL, pthread_mutex_t, 1);
+ pthread_mutex_init(info->mtx, NULL);
+ pthread_mutex_init(info->mtx2, NULL);
+ pthread_mutex_init(info->mtx3, NULL);
+}
+
+extern const char *SORT[]; /* indexed by rep_type; main() passes the entry to bl_UnixSort() */
+extern const char *SORTBIN[]; /* indexed by rep_type; main() passes the entry to bl_fileBinDomainsUnixSort() */
+extern const char SORTDELIM; /* passed along with the SORT/SORTBIN entry to both sort calls */
+void se_updateProgressBar(Uint k, segemehl_t *nfo);
+
+#endif
diff --git a/segemehl/src/version.h b/segemehl/src/version.h
new file mode 100644
index 0000000..8909ee4
--- /dev/null
+++ b/segemehl/src/version.h
@@ -0,0 +1,395 @@
+#define VERSION "0.2.0-$Rev: 418 $ ($Date: 2015-01-05 05:17:35 -0500 (Mon, 05 Jan 2015) $)" /* release number followed by the revision and commit date of the last change */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/segemehl.git
More information about the debian-med-commit
mailing list