[med-svn] [dascrubber] 01/01: Imported Upstream version 0~20160601
Afif Elghraoui
afif at moszumanska.debian.org
Thu Oct 20 08:19:11 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository dascrubber.
commit b2bacc891f58950d4e09fcd8aecde21cdebec78a
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Thu Oct 20 00:18:15 2016 -0700
Imported Upstream version 0~20160601
DASqv.c | 619 ++++++++
DAStrim.c | 1866 ++++++++++++++++++++++
DB.c | 1733 +++++++++++++++++++++
DB.h | 417 +++++
LICENSE | 34 +
Makefile | 25 +
QV.c | 1387 +++++++++++++++++
QV.h | 96 ++
README | 47 +
align.c | 5132 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
align.h | 335 ++++
11 files changed, 11691 insertions(+)
diff --git a/DASqv.c b/DASqv.c
new file mode 100644
index 0000000..131e3f1
--- /dev/null
+++ b/DASqv.c
@@ -0,0 +1,619 @@
+ *
+ * Using overlap pile for each read compute estimated intrinsic quality values
+ *
+ * Author: Gene Myers
+ * Date : September 2015
+ *
+ *******************************************************************************************/
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "DB.h"
+#include "align.h"
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#define PATHSEP "/"
+#undef QV_DEBUG
+static char *Usage = "[-v] -c<int> <source:db> <overlaps:las> ...";
+#define MAXQV 50 // Max QV score is 50
+#define MAXQV1 51
+#define MINCOV 2 // To have a score must be covered >= MINCOV in each direction (must be >0)
+#define PARTIAL .20 // Partial terminal segments covering this percentage are scored
+static int QV_DEEP; // # of best diffs to average for QV score
+static int VERBOSE;
+static int TRACE_SPACING; // Trace spacing (from .las file)
+static int TBYTES; // Bytes per trace segment (from .las file)
+static HITS_DB _DB, *DB = &_DB; // Data base
+static int DB_FIRST; // First read of DB to process
+static int DB_LAST; // Last read of DB to process (+1)
+static int DB_PART; // 0 if all, otherwise block #
+static FILE *QV_AFILE; // .qual.anno
+static FILE *QV_DFILE; // .qual.data
+static int64 QV_INDEX; // Current index into .qual.data file
+// Statistics
+static int64 nreads, totlen;
+static int64 qgram[MAXQV1], sgram[MAXQV1];
+// For each pile, calculate QV scores of the aread at tick spacing TRACE_SPACING; the per-tick scores are appended to QV_DFILE and the running end offset QV_INDEX to QV_AFILE
+static void CALCULATE_QVS(int aread, Overlap *ovls, int novl)
+{ static int nmax = 0; // # of ticks the static buffers below can currently hold
+ static int *hist = NULL; // MAXQV1-bin histogram per tick, filled from overlaps without COMP(flags)
+ static int *cist = NULL; // MAXQV1-bin histogram per tick, filled from overlaps with COMP(flags)
+ static uint8 *qvec = NULL; // one QV per tick of the current read
+ static int partial; // NOTE(review): no assignment is visible in this copy, so it keeps its static zero value -- confirm
+ int alen, atick;
+ int *tick, *cick;
+ int i;
+ alen = DB->reads[aread].rlen;
+ atick = (alen + (TRACE_SPACING-1))/TRACE_SPACING; // # of ticks = ceiling(alen/TRACE_SPACING)
+#if defined(QV_DEBUG) // NOTE(review): no matching #endif is visible in this copy
+ printf("AREAD %d",aread);
+ if (novl == 0)
+ printf(" EMPTY");
+ printf("\n");
+ // Allocate or expand data structures for qv calculation as needed
+ if (atick > nmax)
+ { nmax = atick*1.2 + 100; // over-allocate by 20% plus 100 ticks
+ hist = (int *) Realloc(hist,nmax*MAXQV1*sizeof(int),"Allocating histograms");
+ cist = (int *) Realloc(cist,nmax*MAXQV1*sizeof(int),"Allocating histograms");
+ qvec = (uint8 *) Realloc(qvec,nmax*sizeof(uint8),"Allocating QV vector");
+ if (hist == NULL || cist == NULL || qvec == NULL)
+ exit (1);
+ for (i = MAXQV1*nmax-1; i >= 0; i--) // clear every bin of both histograms after (re)allocation
+ hist[i] = cist[i] = 0;
+ }
+ // For every segment, fill histogram of match diffs for every one of the
+ // atick intervals, building separate histograms, hist & cist, for forward
+ // and reverse B-hits
+ for (i = 0; i < novl; i++)
+ { Path *path;
+ uint16 *trace;
+ int *ht;
+ int tlen, abit;
+ int a, b, x;
+ path = &(ovls[i].path);
+ trace = (uint16 *) path->trace;
+ tlen = path->tlen;
+ if (COMP(ovls[i].flags))
+ ht = cist;
+ else
+ ht = hist;
+ b = 0; // index into trace, advanced two entries per tick
+ a = (path->abpos/TRACE_SPACING)*MAXQV1; // offset of the histogram for the tick containing abpos
+ abit = (path->abpos % TRACE_SPACING);
+ if (abit != 0) // overlap starts inside a tick: skip that tick and its trace pair
+ { a += MAXQV1;
+ b += 2;
+ }
+ abit = (path->aepos % TRACE_SPACING);
+ if (abit != 0) // overlap ends inside a tick: keep its trace pair out of the loop below
+ tlen -= 2;
+ while (b < tlen)
+ { x = (int) ((200.*trace[b]) / (TRACE_SPACING + trace[b+1])); // presumably trace[b] = diffs and trace[b+1] = B-segment length -- confirm against align.h
+ if (x > MAXQV)
+ x = MAXQV;
+ ht[a + x] += 1;
+ a += MAXQV1;
+ b += 2;
+ }
+ if (path->aepos == alen && abit >= partial) // trailing partial tick is scored only when the overlap reaches the end of the read; NOTE(review): with partial == 0 this also fires when abit == 0 -- confirm
+ { x = (int) ((200.*trace[tlen]) / (abit + trace[tlen+1])); // same formula with the partial tick length abit in place of TRACE_SPACING
+ if (x > MAXQV)
+ x = MAXQV;
+ ht[a + x] += 1;
+ }
+ }
+ // For every segment, qv score is the maximum of the averages of the QV_DEEP lowest
+ // in the forward and reverse directions (if each is QV_DEEP), or the average
+ // of overlap scores (if between MINCOV and QV_DEEP-1), or MAXQV for a direction with fewer than MINCOV overlaps.
+ // Reset histogram for segment to zeros.
+ tick = hist;
+ cick = cist;
+ for (i = 0; i < atick; i++)
+ { int v, y;
+ int qvn, qvc;
+ int cntn, cntc;
+ int sumn, sumc;
+#ifdef QV_DEBUG // NOTE(review): no matching #endif is visible in this copy
+ { int min, max;
+ printf(" [%5d,%5d]:",i*TRACE_SPACING,(i+1)*TRACE_SPACING);
+ for (v = 0; v <= MAXQV; v++)
+ if (tick[v] > 0)
+ break;
+ min = v;
+ for (v = MAXQV; v >= 0; v--)
+ if (tick[v] > 0)
+ break;
+ max = v;
+ for (v = min; v <= max; v++)
+ if (tick[v] == 1)
+ printf(" %2d",v);
+ else if (tick[v] > 1)
+ printf(" %2d(%d)",v,tick[v]);
+ printf("\n :");
+ for (v = 0; v <= MAXQV; v++)
+ if (cick[v] > 0)
+ break;
+ min = v;
+ for (v = MAXQV; v >= 0; v--)
+ if (cick[v] > 0)
+ break;
+ max = v;
+ for (v = min; v <= max; v++)
+ if (cick[v] == 1)
+ printf(" %2d",v);
+ else if (cick[v] > 1)
+ printf(" %2d(%d)",v,cick[v]);
+ }
+ if (VERBOSE)
+ for (v = 0; v <= MAXQV; v++) // accumulate the raw per-overlap score histogram before the bins are cleared
+ sgram[v] += tick[v] + cick[v];
+ cntn = sumn = 0;
+ for (v = 0; v <= MAXQV; v++) // scan bins from lowest score up, collecting at most QV_DEEP values
+ { y = tick[v];
+ tick[v] = 0;
+ cntn += y;
+ sumn += y*v;
+ if (cntn >= QV_DEEP)
+ { sumn -= (cntn-QV_DEEP)*v; // remove the surplus entries of bin v beyond QV_DEEP
+ cntn = QV_DEEP;
+ break;
+ }
+ }
+ for (v++; v <= MAXQV; v++) // clear the bins the scan above did not reach
+ tick[v] = 0;
+ cntc = sumc = 0;
+ for (v = 0; v <= MAXQV; v++) // same collection for the complemented-overlap histogram
+ { y = cick[v];
+ cick[v] = 0;
+ cntc += y;
+ sumc += y*v;
+ if (cntc >= QV_DEEP)
+ { sumc -= (cntc-QV_DEEP)*v;
+ cntc = QV_DEEP;
+ break;
+ }
+ }
+ for (v++; v <= MAXQV; v++)
+ cick[v] = 0;
+ if (cntn >= MINCOV)
+ qvn = sumn/cntn; // integer average of the collected scores
+ else
+ qvn = MAXQV; // too few overlaps in this direction
+ if (cntc >= MINCOV)
+ qvc = sumc/cntc;
+ else
+ qvc = MAXQV;
+ if (qvn > qvc) // the tick's QV is the larger of the two directional values
+ qvec[i] = (uint8) qvn;
+ else
+ qvec[i] = (uint8) qvc;
+ tick += MAXQV1; // advance both cursors to the next tick's histogram
+ cick += MAXQV1;
+#ifdef QV_DEBUG // NOTE(review): no matching #endif is visible in this copy
+ printf(" >> %2d %2d = %2d <<\n",qvn,qvc,qvec[i]);
+ }
+ // Accumulate qv histogram (if VERBOSE) and append qv's to .qual file
+ if (VERBOSE)
+ { for (i = 0; i < atick; i++)
+ qgram[qvec[i]] += 1;
+ nreads += 1;
+ totlen += alen;
+ }
+ fwrite(qvec,sizeof(uint8),atick,QV_DFILE); // NOTE(review): fwrite results are not checked
+ QV_INDEX += atick;
+ fwrite(&QV_INDEX,sizeof(int64),1,QV_AFILE); // offset just past this read's scores in the data file
+ // Read in each successive pile and call ACTION on it. Read in the traces only if
+ // "trace" is nonzero. Returns the number of overlaps in the largest pile seen.
+static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int trace)
+{ static Overlap *ovls = NULL; // overlaps of the current pile plus one look-ahead record
+ static int omax = 500; // current capacity of ovls
+ static uint16 *paths = NULL; // trace data of the overlaps in ovls, stored back to back
+ static int pmax = 100000; // current capacity of paths, in uint16 elements
+ int64 i, j, novl;
+ int n, a;
+ int pcur;
+ int max;
+ if (ovls == NULL)
+ { ovls = (Overlap *) Malloc(sizeof(Overlap)*omax,"Allocating overlap buffer");
+ if (ovls == NULL)
+ exit (1);
+ }
+ if (trace && paths == NULL)
+ { paths = (uint16 *) Malloc(sizeof(uint16)*pmax,"Allocating path buffer");
+ if (paths == NULL)
+ exit (1);
+ }
+ rewind(input);
+ fread(&novl,sizeof(int64),1,input); // NOTE(review): fread results are not checked here
+ fread(&TRACE_SPACING,sizeof(int),1,input);
+ TBYTES = sizeof(uint8); // NOTE(review): the `if` that the following `else` pairs with is not visible in this copy
+ else
+ TBYTES = sizeof(uint16);
+ Read_Overlap(input,ovls); // prime the look-ahead slot with the first record of the file
+ if (trace)
+ { if (ovls[0].path.tlen > pmax)
+ { pmax = 1.2*(ovls[0].path.tlen)+10000;
+ paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer");
+ if (paths == NULL) exit (1);
+ }
+ fread(paths,TBYTES,ovls[0].path.tlen,input);
+ if (TBYTES == 1) // one-byte traces are expanded in place to uint16
+ { ovls[0].path.trace = paths;
+ Decompress_TraceTo16(ovls);
+ }
+ }
+ else
+ fseek(input,TBYTES*ovls[0].path.tlen,SEEK_CUR); // traces not wanted: skip over them
+ if (ovls[0].aread < DB_FIRST)
+ { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n",
+ Prog_Name,DB_PART);
+ exit (1);
+ }
+ pcur = 0;
+ n = max = 0;
+ for (j = DB_FIRST; j < DB_LAST; j++)
+ { ovls[0] = ovls[n]; // move the look-ahead record (first overlap of the next pile) to slot 0
+ a = ovls[0].aread;
+ if (a != j)
+ n = 0; // read j has no overlaps; ACTION is still called with an empty pile
+ else
+ { if (trace)
+ memmove(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen); // source and destination lie in the same buffer and can overlap, so memmove rather than memcpy
+ n = 1;
+ pcur = ovls[0].path.tlen;
+ while (1)
+ { if (Read_Overlap(input,ovls+n) != 0)
+ { ovls[n].aread = INT32_MAX; // no further record: mark the look-ahead so it matches no later read
+ break;
+ }
+ if (trace)
+ { if (pcur + ovls[n].path.tlen > pmax)
+ { pmax = 1.2*(pcur+ovls[n].path.tlen)+10000;
+ paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer");
+ if (paths == NULL) exit (1);
+ }
+ fread(paths+pcur,TBYTES,ovls[n].path.tlen,input);
+ if (TBYTES == 1)
+ { ovls[n].path.trace = paths+pcur;
+ Decompress_TraceTo16(ovls+n);
+ }
+ }
+ else
+ fseek(input,TBYTES*ovls[n].path.tlen,SEEK_CUR);
+ if (ovls[n].aread != a) // record belongs to a later read: leave it in slot n as the look-ahead
+ break;
+ pcur += ovls[n].path.tlen;
+ n += 1;
+ if (n >= omax)
+ { omax = 1.2*n + 100;
+ ovls = (Overlap *) Realloc(ovls,sizeof(Overlap)*omax,"Expanding overlap buffer");
+ if (ovls == NULL) exit (1);
+ }
+ }
+ if (n >= max)
+ max = n;
+ pcur = 0;
+ for (i = 0; i < n; i++) // paths may have been reallocated, so set every trace pointer only now
+ { ovls[i].path.trace = paths+pcur;
+ pcur += ovls[i].path.tlen;
+ }
+ }
+ ACTION(j,ovls,n);
+ }
+ return (max);
+int main(int argc, char *argv[]) // DASqv driver: for each .las argument, runs CALCULATE_QVS over every pile and writes .qual.anno/.qual.data files under the DB's directory
+{ FILE *input;
+ char *root, *dpwd;
+ char *las, *lpwd;
+ int64 novl;
+ int c, COVERAGE;
+ // Process arguments
+ { int i, j, k;
+ int flags[128];
+ char *eptr;
+ COVERAGE = -1; // sentinel: a negative value below means -c was not supplied
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ switch (argv[i][1])
+ { default:
+ ARG_FLAGS("v")
+ break;
+ case 'c': // NOTE(review): the statement that parses the -c value into COVERAGE is not visible in this copy
+ break;
+ }
+ else
+ argv[j++] = argv[i]; // compact the positional arguments to the front of argv
+ argc = j;
+ VERBOSE = flags['v'];
+ if (argc < 3)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+ if (COVERAGE < 0)
+ { fprintf(stderr,"%s: Must supply -c parameter\n",Prog_Name);
+ exit (1);
+ }
+ else
+ { if (COVERAGE >= 40) // NOTE(review): the QV_DEEP assignment for this branch is not visible in this copy
+ else if (COVERAGE >= 20)
+ QV_DEEP = 5;
+ else if (COVERAGE >= 4) // NOTE(review): likewise, no assignment is visible for this branch
+ else
+ { fprintf(stderr,"%s: Average coverage is too low (< 4X), cannot infer qv's\n",Prog_Name);
+ exit (1);
+ }
+ }
+ }
+ // Open trimmed DB
+ { int status;
+ status = Open_DB(argv[1],DB);
+ if (status < 0)
+ exit (1);
+ if (status == 1)
+ { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ if (DB->part)
+ { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ Trim_DB(DB);
+ }
+ // Initialize statistics gathering
+ if (VERBOSE)
+ { int i;
+ nreads = 0;
+ totlen = 0;
+ for (i = 0; i <= MAXQV; i++)
+ qgram[i] = sgram[i] = 0;
+ printf("\nDASqv -c%d %s",COVERAGE,argv[1]);
+ for (i = 2; i < argc; i++)
+ printf(" %s",argv[i]);
+ printf("\n");
+ }
+ // Determine if overlap block is being processed and if so get first and last read
+ // from .db file
+ dpwd = PathTo(argv[1]);
+ root = Root(argv[1],".db");
+ for (c = 2; c < argc; c++) // one pass per .las argument
+ { las = Root(argv[c],".las");
+ { FILE *dbfile;
+ char buffer[2*MAX_NAME+100];
+ char *p, *eptr;
+ int i, part, nfiles, nblocks, cutoff, all, oindx;
+ int64 size;
+ DB_PART = 0; // defaults: whole DB, all reads
+ DB_FIRST = 0;
+ DB_LAST = DB->nreads;
+ p = rindex(las,'.'); // last '.' in the name; rindex is the legacy BSD spelling of strrchr
+ if (p != NULL)
+ { part = strtol(p+1,&eptr,10);
+ if (*eptr == '\0' && eptr != p+1) // the suffix after the last '.' is entirely numeric: treat it as a block number
+ { dbfile = Fopen(Catenate(dpwd,"/",root,".db"),"r");
+ if (dbfile == NULL)
+ exit (1);
+ if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) // NOTE(review): the failure action of this and the following fscanf/fgets checks is not visible in this copy
+ for (i = 0; i < nfiles; i++)
+ if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL)
+ if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1)
+ if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3)
+ for (i = 1; i <= part; i++)
+ if (fscanf(dbfile,DB_BDATA,&oindx,&DB_FIRST) != 2)
+ if (fscanf(dbfile,DB_BDATA,&oindx,&DB_LAST) != 2)
+ fclose(dbfile);
+ DB_PART = part;
+ *p = '\0'; // strip the ".<part>" suffix from las
+ }
+ }
+ }
+ // Set up preliminary trimming track
+ if (DB_PART > 0)
+ { QV_AFILE = Fopen(Catenate(dpwd,PATHSEP,root,
+ Numbered_Suffix(".",DB_PART,".qual.anno")),"w");
+ QV_DFILE = Fopen(Catenate(dpwd,PATHSEP,root,
+ Numbered_Suffix(".",DB_PART,".qual.data")),"w");
+ }
+ else
+ { QV_AFILE = Fopen(Catenate(dpwd,PATHSEP,root,".qual.anno"),"w");
+ QV_DFILE = Fopen(Catenate(dpwd,PATHSEP,root,".qual.data"),"w");
+ }
+ exit (1); // NOTE(review): the condition guarding this exit is not visible in this copy; as shown it is unconditional
+ { int size, nreads; // this local nreads shadows the file-level statistics counter
+ nreads = DB_LAST - DB_FIRST;
+ size = sizeof(int64);
+ fwrite(&nreads,sizeof(int),1,QV_AFILE); // .anno header: record count, then record size
+ fwrite(&size,sizeof(int),1,QV_AFILE);
+ QV_INDEX = 0;
+ fwrite(&QV_INDEX,sizeof(int64),1,QV_AFILE); // initial offset 0; CALCULATE_QVS appends one offset per read
+ }
+ // Open overlap file
+ lpwd = PathTo(argv[c]);
+ if (DB_PART > 0)
+ input = Fopen(Catenate(lpwd,"/",las,Numbered_Suffix(".",DB_PART,".las")),"r");
+ else
+ input = Fopen(Catenate(lpwd,"/",las,".las"),"r");
+ if (input == NULL)
+ exit (1);
+ free(lpwd);
+ free(las);
+ // Get trace point spacing information
+ fread(&novl,sizeof(int64),1,input); // NOTE(review): fread results are not checked; make_a_pass rewinds and re-reads these two fields
+ fread(&TRACE_SPACING,sizeof(int),1,input);
+ // Process each read pile
+ make_a_pass(input,CALCULATE_QVS,1);
+ fclose(QV_AFILE); // NOTE(review): no fclose(input) is visible in this loop -- confirm
+ fclose(QV_DFILE);
+ }
+ // If verbose output statistics summary to stdout
+ if (VERBOSE)
+ { int i;
+ int64 ssum, qsum;
+ int64 stotal, qtotal;
+ printf("\nInput: ");
+ Print_Number(nreads,7,stdout);
+ printf("reads, ");
+ Print_Number(totlen,12,stdout);
+ printf(" bases\n");
+ stotal = qtotal = 0;
+ for (i = 0; i <= MAXQV; i++)
+ { stotal += sgram[i];
+ qtotal += qgram[i];
+ }
+ printf("\nHistogram of q-values (average %d best)\n",2*QV_DEEP);
+ printf("\n Input QV\n");
+ qsum = qgram[MAXQV];
+ ssum = sgram[MAXQV];
+ printf("\n %2d: %9lld %5.1f%% %9lld %5.1f%%\n\n",
+ MAXQV,sgram[MAXQV],(100.*ssum)/stotal,qgram[MAXQV],(100.*qsum)/qtotal);
+ qtotal -= qsum; // the MAXQV bin is reported on its own line and left out of the cumulative percentages below
+ stotal -= ssum;
+ ssum = qsum = 0;
+ for (i = MAXQV-1; i >= 0; i--) // cumulative percentages, from the highest score downward
+ if (qgram[i] > 0)
+ { ssum += sgram[i];
+ qsum += qgram[i];
+ printf(" %2d: %9lld %5.1f%% %9lld %5.1f%%\n",
+ i,sgram[i],(100.*ssum)/stotal,
+ qgram[i],(100.*qsum)/qtotal);
+ }
+ }
+ // Clean up
+ free(dpwd);
+ free(root);
+ Close_DB(DB);
+ free(Prog_Name);
+ exit (0);
diff --git a/DAStrim.c b/DAStrim.c
new file mode 100644
index 0000000..7932981
--- /dev/null
+++ b/DAStrim.c
@@ -0,0 +1,1866 @@
+ *
+ * Using overlap pile for each read and intrinsic quality values, determine the
+ * high quality segments with interspersed gaps. Any unremoved
+ * adapter sequences are detected and the shorter side trimmed.
+ *
+ * Author: Gene Myers
+ * Date : March 2015
+ *
+ *******************************************************************************************/
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+#include "DB.h"
+#include "align.h"
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#define PATHSEP "/"
+#undef DEBUG_HQ_BLOCKS // Various DEBUG flags (normally all off)
+#undef SHOW_PAIRS
+#define ANNOTATE
+// Command format and global parameter variables
+static char *Usage = " [-v] [-l<int(1000)>] -g<int> -b<int> <source:db> <overlaps:las> ...";
+static int BAD_QV; // qv >= and you are "bad"
+static int GOOD_QV; // qv <= and you are "good"
+static int MIN_LEN; // Minimum segment length to keep
+static int VERBOSE;
+// Good patch constant
+#define MIN_BLOCK 500 // Minimum length of a good patch
+// Gap constants
+#define MIN_COVER 3 // A coverage gap occurs at or below this level
+#define COVER_LEN 400 // An overlap covers a point if it extends COVER_LEN to either side.
+// Wall Constants
+#define MIN_PNT 5 // Minimum # of events in a wall
+#define MAX_SEP 25 // Maximum separation between two events in a wall
+#define AVE_SEP 5. // Maximum average separation between two events in a wall
+// Global Variables (must exist across the processing of each pile)
+ // Read-only
+static int TRACE_SPACING; // Trace spacing (from .las file)
+static HITS_DB _DB, *DB = &_DB; // Data base
+static int DB_FIRST; // First read of DB to process
+static int DB_LAST; // Last read of DB to process (+1)
+static int DB_PART; // 0 if all, otherwise block #
+static int64 *QV_IDX; // qual track index
+static uint8 *QV; // qual track values
+ // Read & Write
+#ifdef ANNOTATE
+static FILE *HQ_AFILE; // .hq.anno
+static FILE *HQ_DFILE; // .hq.data
+static int64 HQ_INDEX; // Current index into .hq.data file as it is being written
+static FILE *HL_AFILE; // .hole.anno
+static FILE *HL_DFILE; // .hole.data
+static int64 HL_INDEX; // Current index into .hole.data file as it is being written
+static FILE *SN_AFILE; // .span.anno
+static FILE *SN_DFILE; // .span.data
+static int64 SN_INDEX; // Current index into .span.data file as it is being written
+static FILE *SP_AFILE; // .split.anno
+static FILE *SP_DFILE; // .split.data
+static int64 SP_INDEX; // Current index into .split.data file as it is being written
+static FILE *AD_AFILE; // .adapt.anno
+static FILE *AD_DFILE; // .adapt.data
+static int64 AD_INDEX; // Current index into .adapt.data file as it is being written
+static FILE *KP_AFILE; // .keep.anno
+static FILE *KP_DFILE; // .keep.data
+static int64 KP_INDEX; // Current index into .keep.data file as it is being written
+static int64 nreads, totlen;
+static int64 nelim, nelimbp;
+static int64 n5trm, n5trmbp;
+static int64 n3trm, n3trmbp;
+static int64 natrm, natrmbp;
+static int64 ngaps, ngapsbp;
+ static int64 nlowq, nlowqbp;
+ static int64 nspan, nspanbp;
+ static int64 nchim, nchimbp;
+// Data Structures
+typedef struct // General read interval [beg..end]
+ { int beg;
+ int end;
+ } Interval;
+ // Coverage events, type (one of 7 below) and position
+#define ADD 0 // leftmost A-position of LA
+#define LFT 1 // ADD position + COVER_LEN of LA (>= 2*COVER_LEN long)
+#define LGP 2 // left end of an HQ-block
+#define CTR 3 // A-center of LA < 2*COVER_LEN long
+#define RGP 4 // right end of an HQ-block
+#define RGT 5 // DEL position - COVER_LEN of LA
+#define DEL 6 // rightmost A-position of LA
+static char Symbol[7] = { 'A', 'L', '[', 'C', ']', 'R', 'D' };
+typedef struct
+ { int type;
+ int pos;
+ } Event;
+ // Wall: there are cnt LFT/RGT events ending in the interval [beg,end] going
+ // from coverage depth cov up to cov+cnt
+typedef struct
+ { int beg;
+ int end;
+ int cnt;
+ int cov;
+ } Wall;
+ *
+ *
+ ********************************************************************************************/
+ // Find "good" blocks of trace point intervals:
+ // 0. A good block must begin and end with an interval <= GOOD_QV
+ // 1. Any stretch all < BAD_QV at least MIN_BLOCK long
+ // 2. Any stretch all <= GOOD_QV at least MIN_BLOCK-TRACE_SPACING long
+ // 3. Any stretch all <= GOOD_QV only 1 interval away from another good patch
+ // Global Inputs: QV, QV_IDX, GOOD_QV, BAD_QV
+ // HQ_BLOCKS[0..*nblk-1] contain the good patches in increasing order across aread.
+ // Parameter aread is input-only, and p_nblk is output-only.
+static Interval *HQ_BLOCKS(int aread, int *p_nblk) // returns a static array (reused by the next call) of *p_nblk good intervals in base coordinates
+{ int nblk;
+ static int *alive = NULL; // alive[k] != 0 once block[k] is accepted as a good patch
+ static Interval *block = NULL; // candidate blocks, in tick units until the final compaction
+ int alen, atick;
+ uint8 *qvec;
+ alen = DB->reads[aread].rlen;
+ atick = (alen + (TRACE_SPACING-1))/TRACE_SPACING; // # of ticks = ceiling(alen/TRACE_SPACING)
+ if (alive == NULL) // first call: size both buffers from the longest read in the DB
+ { int max = DB->maxlen/TRACE_SPACING+2;
+ alive = (int *) Malloc(max*sizeof(int),"Allocating alive vector");
+ block = (Interval *) Malloc(max*sizeof(Interval),"Allocating block vector");
+ if (alive == NULL || block == NULL)
+ exit (1);
+ }
+ qvec = QV + QV_IDX[aread]; // per-tick quality values of this read
+ nblk = 0;
+ // Find all blocks < BAD_QV with either len >= MIN_BLOCK or all <= GOOD_QV in block[0..nblk)
+ // Mark those satisfying 1. or 2. as "alive" (.alv)
+ { int lmost = 0, rmost, thr; // rmost = first tick of the current run, lmost = its last tick that is <= GOOD_QV; NOTE(review): thr is not assigned in the lines visible here -- confirm its initialization
+ int i, in;
+ in = 0; // nonzero while inside a run of ticks all < BAD_QV
+ for (i = 0; i <= atick; i++) // the extra pass at i == atick closes a run that reaches the end of the read
+ { int q, alv;
+ if (i < atick)
+ q = qvec[i];
+ else
+ q = BAD_QV;
+ if (in)
+ { if (q >= BAD_QV) // a bad tick ends the current run
+ { alv = (lmost-rmost >= thr);
+ if (alv) // run is long enough: keep [rmost,lmost+1) as a single alive block
+ { block[nblk].beg = rmost;
+ block[nblk].end = lmost + 1;
+ alive[nblk] = alv;
+ nblk += 1;
+ }
+ else // run too short: split it into its maximal stretches of ticks <= GOOD_QV
+ { int j, k;
+ for (j = rmost; j <= lmost; j = k)
+ { for (k = j+1; k <= lmost; k++)
+ if (qvec[k] > GOOD_QV)
+ break;
+ block[nblk].beg = j;
+ block[nblk].end = k;
+ alive[nblk] = (k-j >= thr); // a stretch is alive only if it spans at least thr ticks
+ nblk += 1;
+ for ( ; k <= lmost; k++) // skip the not-good ticks up to the start of the next stretch
+ if (qvec[k] <= GOOD_QV)
+ break;
+ }
+ }
+ in = 0;
+ }
+ else if (q <= GOOD_QV)
+ lmost = i;
+ }
+ else
+ { if (q <= GOOD_QV) // a run can only start on a good tick
+ { rmost = lmost = i;
+ in = 1;
+ }
+ }
+ }
+ }
+ // Mark as alive all short, all-good blocks that satisfy 3.
+ { int i, j;
+ for (i = 0; i < nblk; i++)
+ if (alive[i])
+ { for (j = i-1; j >= 0 && ! alive[j]; j--) // spread leftwards while the gap to the neighbouring block is exactly one tick
+ if (block[j+1].beg - block[j].end == 1)
+ alive[j] = 1;
+ else
+ break;
+ for (j = i+1; j < nblk && ! alive[j]; j++) // and rightwards likewise
+ if (block[j].beg - block[j-1].end == 1)
+ alive[j] = 1;
+ else
+ break;
+ }
+ }
+ // Remove all blocks that are not alive
+ { int i, j;
+ j = 0;
+ for (i = 0; i < nblk; i++)
+ if (alive[i])
+ { block[j].beg = block[i].beg * TRACE_SPACING; // compact in place and convert ticks to base coordinates
+ block[j].end = block[i].end * TRACE_SPACING;
+ j += 1;
+ }
+ nblk = j;
+ if (nblk > 0 && block[nblk-1].end > alen) // the last tick may be partial: clamp to the read length
+ block[nblk-1].end = alen;
+ }
+ { int i; // prints the surviving blocks to stdout; NOTE(review): no surrounding #ifdef is visible in this copy
+ printf(" %3d:",nblk);
+ for (i = 0; i < nblk; i++)
+ printf(" [%5d,%5d]",block[i].beg,block[i].end);
+ printf("\n");
+ }
+ *p_nblk = nblk;
+ return (block);
+ *
+ *
+ ********************************************************************************************/
+ // Find intervals of LFT/RGT events where no two events are separated by more than
+ // MAX_SEP, the average separation is at most AVE_SEP, and there are at least MIN_PNT
+ // events in the interval. Walls are written starting at next; the return value points just past the last wall written.
+static Wall *wall_detector(int *ev, int b, int e, Wall *next)
+{ int idx;
+ { int i, n, max;
+ double ave;
+ n = e-b;
+ if (n < MIN_PNT) return (next); // Too small: done
+ idx = b;
+ max = -1; // Find the position of the largest separation between
+ for (i = b+1; i < e; i++) // two tips in ev[b..e)
+ if (ev[i] - ev[i-1] > max)
+ { max = ev[i] - ev[i-1];
+ idx = i;
+ }
+ ave = (ev[e-1] - ev[b]) / (n-1.); // Check if the current interval is a wall
+ if (ave <= AVE_SEP && max <= MAX_SEP)
+ { if (max <= 4.*(ave+1.)) // Max separation <= 4*(average separation + 1) ?
+ { next->beg = b; // beg/end are indices into ev (half-open [b,e)); cov is not set here
+ next->end = e;
+ next->cnt = n;
+ return (next+1);
+ }
+ }
+ }
+ next = wall_detector(ev,b,idx,next); // If not then split on the largest separation
+ next = wall_detector(ev,idx,e,next); // and recurse on the two parts
+ return (next);
+ // Find LFT/RGT event walls. Returns a static array holding *anum LFT walls followed by *dnum RGT walls, or NULL (leaving *anum and *dnum untouched) when novl is 0
+static Wall *find_walls(int novl, Event *queue, int *anum, int *dnum)
+{ static int nmax = 0; // # of overlaps the static buffers below are sized for
+ Wall *aptr, *dptr;
+ static Wall *wall = NULL;
+ int ntip;
+ static int *adds = NULL; // positions of LFT events, in queue order
+ static int *dels; // positions of RGT events; points at the second half of the adds allocation
+ if (novl == 0)
+ return (NULL);
+ if (novl > nmax)
+ { nmax = novl*1.2 + 1000;
+ wall = (Wall *) Realloc(wall,sizeof(Wall)*(nmax/MIN_PNT),"Reallocating wall vector"); // a wall needs at least MIN_PNT events, which bounds how many can exist
+ adds = (int *) Realloc(adds,sizeof(int)*2*nmax,"Reallocating add+del vectors");
+ if (wall == NULL || adds == NULL)
+ exit (1);
+ dels = adds + nmax;
+ }
+ // Make separate arrays of add and del tips (LFT and RGT events) in sorted order in
+ // which to seek "walls".
+ { int i, j, x;
+ i = x = 0; // A bit tricky: less than novl tips due to CTR events
+ for (j = 0; x < novl; j++) // that don't generate tips, so analyze events until
+ if (queue[j].type == CTR) // have counted all LA's. Furthermore adds and dels
+ x += 1; // are sorted because queue is sorted.
+ else if (queue[j].type == LFT)
+ { x += 1;
+ adds[i++] = queue[j].pos;
+ }
+ ntip = i;
+ i = 0;
+ for (j = 0; i < ntip; j++) // second scan: collect the same number (ntip) of RGT positions
+ if (queue[j].type == RGT)
+ dels[i++] = queue[j].pos;
+ }
+ // Find LFT walls and RGT walls in [walls,aptr) and [aptr,dptr)
+ aptr = wall_detector(adds,0,ntip,wall);
+ dptr = wall_detector(dels,0,ntip,aptr);
+ // For each wall, determine the coverage of its base with a merged traversal
+ // of the adds and dels arrays
+ { Wall *a, *d;
+ int i, j, x;
+ x = 0; // running count of LFT events seen minus RGT events seen
+ a = wall; // NOTE(review): a and d are dereferenced below even if no wall of that kind was found -- confirm this is harmless
+ d = aptr;;
+ i = j = 0;
+ while (j < ntip)
+ if (i < ntip && adds[i] < dels[j])
+ { if (a->beg == i)
+ a->cov = x; // coverage just before the first event of an LFT wall
+ else if (a->end == i+1)
+ { a += 1; // last event of the wall: move to the next one, without stepping past the final LFT wall
+ if (a >= aptr)
+ a -= 1;
+ }
+ x += 1;
+ i += 1;
+ }
+ else
+ { if (d->beg == j)
+ d->cov = x - d->cnt; // coverage remaining once all cnt events of the RGT wall have passed
+ else if (d->end == j+1)
+ { d += 1;
+ if (d >= dptr)
+ d -= 1;
+ }
+ x -= 1;
+ j += 1;
+ }
+ }
+ // Sneaky, switch beg/end from an index into the adds or dels array, to the actual
+ // coordinate of the event.
+ { Wall *a;
+ for (a = wall; a < aptr; a++)
+ { a->beg = adds[a->beg];
+ a->end = adds[a->end-1]; // end was exclusive as an index; as a coordinate it is the wall's last event
+ }
+ for (a = aptr; a < dptr; a++)
+ { a->beg = dels[a->beg];
+ a->end = dels[a->end-1];
+ }
+ }
+ *anum = aptr-wall;
+ *dnum = dptr-aptr;
+ return (wall);
+ *
+ * COVERAGE ANALYSIS TO FIND ALL HOLES (regions of very low coverage/support)
+ *
+ ********************************************************************************************/
+ // Find intervals for which there are MIN_COVER or fewer LAs that project at least COVER_LEN
+ // bases to the left and right of the interval. These are called holes.
+ // Holes are usually found between HQ-blocks. However occasionally they intersect one or
+ // more blocks and this requires the HQ-blocks be refined as follows:
+ // a. Hole spans an HQ-block:
+ // The block needs to be removed as HQ *if* it is not based on 5 or more LA's
+ // (this usually never happens, 10^-5 or less)
+ // b. Hole is contained in an HQ-block:
+ // The block needs to be split around the hole because one needs to verify that
+ // the left and right regions on each side of a hole actually belong together
+ // (this happens occasionally, ~ 10^-3)
+ // c. Hole overlaps an HQ-block:
+ // If this happens, then the overlap is very small and the block is left unperturbed.
+ // (this worries me a bit, but in all testing it remains so)
+ // Given the above effects, the list of HQ-blocks can be modified by FIND_HOLES.
+static int ESORT(const void *l, const void *r) // qsort comparator for Event records
+{ Event *x = (Event *) l;
+ Event *y = (Event *) r;
+ if (x->pos == y->pos)
+ return (x->type - y->type); // same position: order by type code (ADD < LFT < LGP < CTR < RGP < RGT < DEL)
+ return (x->pos - y->pos); // otherwise ascending position
+static Interval *FIND_HOLES(int aread, Overlap *ovls, int novl,
+ int *p_nhole, Interval **p_block, int *p_nblk)
+{ static int nmax = 0;
+ int nev;
+ static Event *queue = NULL; // Event queue[0..nev)
+ int nhole;
+ static Interval *holes = NULL; // Detected holes[0..nhole)
+ static int pmax;
+ static Interval *cover = NULL; // Coverage at block ends [0..nblk)
+ static Interval *nwblk; // Modified block list [0..nblk')
+ int nblk = *p_nblk; // Initial HQ block list [0..nblk)
+ Interval *block = *p_block;
+ int anum = 0, dnum = 0; // LFT and RGT walls, awall[0..anum) & dwall[0..dnum)
+ Wall *awall, *dwall;
+ if (cover == NULL)
+ { pmax = DB->maxlen/TRACE_SPACING + 2;
+ cover = (Interval *) Malloc(2*pmax*sizeof(Interval),"Allocating patch vector");
+ nwblk = cover + pmax;
+ }
+ if (4*novl + pmax > nmax)
+ { nmax = 4.8*novl + pmax + 100;
+ queue = (Event *) Realloc(queue,(nmax+1)*sizeof(Event),"Allocating event queue");
+ holes = (Interval *) Realloc(holes,(nmax/4)*sizeof(Interval),"Allocating hole vector");
+ if (queue == NULL || holes == NULL)
+ exit (1);
+ }
+ { int i;
+ // For each trimmed overlap: add its events to the queue
+ nev = 0;
+ for (i = 0; i < novl; i++)
+ { queue[nev].type = ADD;
+ queue[nev].pos = ovls[i].path.abpos;
+ nev += 1;
+ queue[nev].type = DEL;
+ queue[nev].pos = ovls[i].path.aepos;
+ nev += 1;
+ if (ovls[i].path.abpos + 2*COVER_LEN + 10 > ovls[i].path.aepos)
+ { queue[nev].type = CTR;
+ queue[nev].pos = (ovls[i].path.abpos + ovls[i].path.aepos) / 2;
+ nev += 1;
+ }
+ else
+ { queue[nev].type = LFT;
+ queue[nev].pos = ovls[i].path.abpos + COVER_LEN;
+ nev += 1;
+ queue[nev].type = RGT;
+ queue[nev].pos = ovls[i].path.aepos - COVER_LEN;
+ nev += 1;
+ }
+ }
+ // For each HQ-block: add its events to the queue
+ for (i = 0; i < nblk; i++)
+ { queue[nev].type = LGP;
+ queue[nev].pos = block[i].beg;
+ nev += 1;
+ queue[nev].type = RGP;
+ queue[nev].pos = block[i].end;
+ nev += 1;
+ }
+ queue[nev].pos = DB->reads[aread].rlen;
+ }
+ // Sort the events
+ qsort(queue,nev,sizeof(Event),ESORT);
+ // Find all LFT and RGT walls
+ awall = find_walls(novl,queue,&anum,&dnum);
+ dwall = awall + anum;
+ { int i;
+ printf("\n");
+ for (i = 0; i < anum; i++)
+ printf(" Add [%5d,%5d] %d %d\n",awall[i].beg,awall[i].end,awall[i].cnt,awall[i].cov);
+ for (i = 0; i < dnum; i++)
+ printf(" Del [%5d,%5d] %d %d\n",dwall[i].beg,dwall[i].end,dwall[i].cnt,dwall[i].cov);
+ printf("\n");
+ }
+ // Move through events in order keeping track of inc, dec, & cnf so that the
+ // invariant stated below holds
+ { int cnf, inc, dec;
+ int cblk;
+ int in;
+ int nbeg, nend = 0;
+ int first, last;
+ int i;
+ in = 1;
+ first = -1;
+ cblk = 0;
+ nhole = 0;
+ inc = dec = cnf = 0;
+ for (i = 0; i < nev; i++)
+ { switch (queue[i].type)
+ { case ADD:
+ inc += 1;
+ break;
+ case LFT:
+ inc -= 1;
+ cnf += 1;
+ break;
+ case LGP:
+ cover[cblk].beg = cnf + inc + dec; // = coverage depth at block[cblk].beg
+ continue;
+ case CTR:
+ inc -= 1;
+ dec += 1;
+ continue;
+ case RGP:
+ cover[cblk].end = cnf + inc + dec; // = coverage depth at block[cblk].end
+ cblk += 1;
+ continue;
+ case RGT:
+ cnf -= 1;
+ dec += 1;
+ break;
+ case DEL:
+ dec -= 1;
+ break;
+ }
+ // For position x = queue[i].pos:
+ // inc = # of LA's between (ADD,LFT] positions
+ // dec = # of LA's between (RGT,DEL] positions
+ // cnf = # of LA's between (LFT,RGT] positions (= # of LAs that project at least
+ // COVER_LEN bases to the right and left of x)
+ printf(" %5d %c: %3d< %3d >%3d %3d\n",
+ queue[i].pos,Symbol[queue[i].type],inc,cnf,dec,dec-inc);
+ // When truncated coverage, cnf, transitions below MIN_COVER(3), note the fact (in = 1)
+ // and record the index first of the event (must be a RGT) and the number of LA's
+ // currently in their (RGT,DEL] interval
+ if (cnf <= MIN_COVER)
+ { if ( ! in)
+ { in = 1;
+ nend = dec;
+ first = i;
+ }
+ }
+ // When truncated coverage transitions above MIN_COVER, we declare it a hole
+ // if interval below MIN_COVER is at least COVER_LEN long, there are at least
+ // 4 LA's that are "ending" at the left (i.e. in (RGT,DEL] interval, and
+ // at least 4 LA's ending at the right.
+ else
+ { if (in && first >= 0 && queue[i].pos - queue[first].pos >= COVER_LEN &&
+ nend >= 4 && inc >= 4)
+ { int lflank, rflank;
+ int dpos, apos;
+ nbeg = inc;
+ last = i;
+ // Need to find the boundaries of the hole. In principle, this is
+ // [dpos + COVER_LEN, apos - COVER_LEN] where dpos = queue[first].pos
+ // and apos = queue[last].pos, i.e. the entry and exit into the low
+ // truncated cover interval. However, walls induced by repeat boundaries
+ // and/or unevenness in the end-points of LA's can cause the above to be
+ // quite far off. So ...
+ // First try the average of the 2nd and 3rd quartile of the nend RGT events
+ // before dpos. The requisite number of events must exist by the definition
+ // of nend. While one is at it determine the index of the first of the
+ // nend RGT events in lflank.
+ { int64 sum;
+ int q1, q3, n;
+ int a, d, k;
+ int acov, dcov;
+ q1 = nend/4;
+ q3 = (3*nend)/4;
+ sum = 0;
+ n = 0;
+ for (lflank = first; n < nend; lflank--)
+ if (queue[lflank].type == RGT || queue[lflank].type == CTR)
+ { if (n >= q1 && n < q3)
+ sum += queue[lflank].pos;
+ n += 1;
+ }
+ dpos = sum/(q3-q1);
+ lflank += 1;
+ printf(" Dev %5d-%3d-%5d -> %5d",queue[lflank].pos,nend,queue[first].pos,dpos);
+ // Second, look for the rightmost RGT-(LFT-)wall that overlaps the left (right)
+ // flank, i.e. queue[lflank,first].pos (queue[last,rflank].pos), and if found
+ // take the average position of the wall.
+ for (d = dnum-1; d >= 0; d--)
+ if (dwall[d].beg <= queue[first].pos)
+ break;
+ if (d >= 0 && dwall[d].end >= queue[lflank].pos)
+ { sum = 0;
+ n = 0;
+ for (k = first; k >= lflank; k--)
+ if (queue[k].type == RGT || queue[k].type == CTR)
+ { if (queue[k].pos < dwall[d].beg)
+ break;
+ if (queue[k].pos <= dwall[d].end)
+ { sum += queue[k].pos;
+ n += 1;
+ }
+ }
+ dpos = sum/n;
+ printf(" [%5d,%5d] -> %4d\n",dwall[d].beg,dwall[d].end,dpos);
+ dcov = dwall[d].cov + dwall[d].cnt;
+ d -= 1;
+ }
+ else
+ { dcov = nend + MIN_COVER;
+ printf(" No wall mapping\n");
+ }
+ // First try on LFT events (replace nend with nbeg, RGT with LFT, before
+ // with after, and dpos with apos, first with last, and lflank with rflank.
+ q1 = nbeg/4;
+ q3 = (3*nbeg)/4;
+ sum = 0;
+ n = 0;
+ for (rflank = last; n < nbeg; rflank++)
+ if (queue[rflank].type == LFT || queue[rflank].type == CTR)
+ { if (n >= q1 && n < q3)
+ sum += queue[rflank].pos;
+ n += 1;
+ }
+ apos = sum/(q3-q1);
+ rflank -= 1;
+ printf(" Aev %5d-%3d-%5d -> %5d",queue[i].pos,nbeg,queue[rflank].pos,apos);
+ // Second look at LFT events.
+ for (a = 0; a < anum; a++)
+ if (awall[a].end >= queue[i].pos)
+ break;
+ if (a < anum && awall[a].beg <= queue[rflank].pos)
+ { sum = 0;
+ n = 0;
+ for (k = i; k <= rflank; k++)
+ if (queue[k].type == LFT || queue[k].type == CTR)
+ { if (queue[k].pos > awall[a].end)
+ break;
+ if (queue[k].pos >= awall[a].beg)
+ { sum += queue[k].pos;
+ n += 1;
+ }
+ }
+ apos = sum/n;
+ printf(" [%5d,%5d] -> %4d\n",awall[a].beg,awall[a].end,apos);
+ acov = awall[a].cov + awall[a].cnt;
+ a += 1;
+ }
+ else
+ { acov = nbeg + MIN_COVER;
+ printf(" No wall mapping\n");
+ }
+ // If apos and dpos are still so close that the implied hole boundaries
+ // are out of order by 50 or more bases, then walk back through ascending
+ // walls (if present) until this is no longer true or there are no more
+ // more walls left. If both left and right options exist, always take
+ // the wall starting at the lower current height.
+ while (apos - dpos < 2*COVER_LEN - 50)
+ { if (d >= 0 && dwall[d].cov >= dcov)
+ if (a < anum && awall[a].cov >= acov)
+ { if (dcov < acov)
+ { dcov = dwall[d].cov + dwall[d].cnt;
+ dpos = dwall[d--].beg;
+ printf(" <- %d\n",dpos);
+ }
+ else
+ { acov = awall[a].cov + awall[a].cnt;
+ apos = awall[a++].end;
+ printf(" -> %d\n",apos);
+ }
+ }
+ else
+ { dcov = dwall[d].cov + dwall[d].cnt;
+ dpos = dwall[d--].beg;
+ printf(" <- %d\n",dpos);
+ }
+ else
+ if (a < anum && awall[a].cov >= acov)
+ { acov = awall[a].cov + awall[a].cnt;
+ apos = awall[a++].end;
+ printf(" -> %d\n",apos);
+ }
+ else
+ {
+ printf(" FAULT\n");
+ break;
+ }
+ }
+ }
+ // Finalize and record the hole boundaries.
+ holes[nhole].beg = dpos + COVER_LEN;
+ holes[nhole].end = apos - COVER_LEN;
+ nhole += 1;
+ }
+ in = 0;
+ }
+ }
+ }
+ // See if the holes remove or split any HQ-blocks and build the revised list
+ // in newblk[0..q).
+ { int i, p, q, x;
+ int lhang, rhang;
+ int reverse;
+ // For each hole in left-to-right order
+ p = q = 0;
+ for (i = 0; i < nhole; i++)
+ { if (holes[i].beg > holes[i].end)
+ { x = holes[i].beg;
+ holes[i].beg = holes[i].end;
+ holes[i].end = x;
+ reverse = 1;
+ }
+ else
+ reverse = 0;
+ // Advance to the next block p that intersects with or is to the right of hole
+ // moving blocks being skipped over to the new block list
+ while (p < nblk && block[p].end <= holes[i].beg)
+ nwblk[q++] = block[p++];
+ printf(" HOLE: %5d [%5d,%5d]\n",
+ aread+1,holes[i].beg,holes[i].end);
+ // While the current block intersects the current hole
+ while (p < nblk && block[p].beg < holes[i].end)
+ { lhang = (holes[i].beg < block[p].beg);
+ rhang = (holes[i].end > block[p].end);
+ if (lhang)
+ { if (rhang)
+ // Hole i contains block p: remove it if coverage <= 4 at both ends
+ { if (block[p].end - block[p].beg >= MIN_BLOCK &&
+ (cover[p].beg > 4 || cover[p].end > 4))
+ nwblk[q++] = block[p];
+ p += 1;
+ printf(" INTERSECT %5d S [%5d,%5d] %3d %3d",
+ aread+1,block[p-1].beg,block[p-1].end,cover[p-1].beg,cover[p-1].end);
+ if (reverse)
+ printf(" REV");
+ printf("\n");
+ }
+ // Hole i intersect the left tip of block p: nothing to do
+ else
+ {
+ printf(" INTERSECT %5d Z %5d [..,%5d] %3d",
+ aread+1,holes[i].end-block[p].beg,holes[i].end,cover[p].beg);
+ if (reverse)
+ printf(" REV");
+ printf("\n");
+ break;
+ }
+ }
+ else
+ if (rhang)
+ // Hole i intersect the right tip of block p: move p to new block list
+ { nwblk[q++] = block[p++];
+ printf(" INTERSECT %5d Z %5d [%5d,..] %3d",
+ aread+1,block[p-1].end-holes[i].beg,holes[i].beg,cover[p-1].end);
+ if (reverse)
+ printf(" REV");
+ printf("\n");
+ }
+ else
+ // Hole i is contained within block p: Break block into two parts at
+ // TRACE_SPACING ticks left and right of hole, and keep each piece
+ // if they are greater than MIN_BLOCK long.
+ { int beg, end;
+ printf(" INTERSECT %5d C %5d [%5d,%5d]",
+ aread+1,holes[i].end-holes[i].beg,block[p].beg,block[p].end);
+ if (reverse)
+ printf(" REV");
+ printf("\n");
+ beg = (holes[i].beg/TRACE_SPACING);
+ end = (holes[i].end-1)/TRACE_SPACING+1;
+ if (beg == end)
+ { beg -= 1; end += 1; }
+ if (beg - block[p].beg >= MIN_BLOCK)
+ { nwblk[q].beg = block[p].beg;
+ nwblk[q++].end = beg;
+ }
+ if (block[p].end - end >= MIN_BLOCK)
+ block[p].beg = end;
+ else
+ p += 1;
+ break;
+ }
+ }
+ }
+ // Remove any remaining blocks to the new list
+ while (p < nblk)
+ nwblk[q++] = block[p++];
+ nblk = q;
+ }
+ // Return the list of holes holes[0..nhole) and the new list of blocks, nwblk[0..nblk)
+ *p_nblk = nblk;
+ *p_block = nwblk;
+ *p_nhole = nhole;
+ return (holes);
+ *
+ *
+ ********************************************************************************************/
+// qsort() comparator for arrays of int: ascending order.
+//   l, r: pointers to the two int elements being compared.
+//   Returns -1, 0, or 1.  The sign is built from comparisons rather than from the
+//   difference x - y, because that subtraction overflows (undefined behaviour) when
+//   the operands have opposite signs and large magnitude.
+static int GSORT(const void *l, const void *r)
+{ int x = *((int *) l);
+ int y = *((int *) r);
+ return ((x > y) - (x < y));
+#define LOWQ 0   // gap_status verdict: 10 or more overlaps cover the whole gap window
+#define SPAN 1   // gap_status verdict: enough B-reads bridge the gap with a small spread of deltas
+#define SPLIT 2   // gap_status verdict: fallback when none of the other three verdicts applies
+#define ADAPT 3   // gap_status verdict: the adaptemer test on complement-straddling pairs succeeded
+// Classify the gap between two consecutive HQ-blocks of the current A-read.
+//   ovls[0..novl): the overlap pile of the A-read.
+//   rblock: the block on the right of the gap; the block on its left is rblock[-1].
+//   Returns LOWQ, ADAPT, SPAN, or SPLIT.
+// The gap window is [lcv,rcv]: the gap widened by COVER_LEN into each neighbouring
+// block, clipped to the block boundaries.  gsort and asort are static work vectors
+// that grow with the largest pile seen and are reused on every call.
+// NOTE(review): the "#ifdef SHOW_PAIRS" lines below have no visible #else/#endif in
+// this copy of the patch; they are reproduced exactly as found, so which statements
+// are debug-only cannot be determined from here.
+static int gap_status(Overlap *ovls, int novl, Interval *rblock)
+{ static int nmax = 0;
+ static int *gsort = NULL; // A-B delta for all B-reads spanning a gap
+ static int *asort = NULL; // Adaptemer position estimate for each complement-straddling pair
+ Interval *lblock = rblock-1;
+ int j;
+ int lft, rgt;
+ int lcv, rcv;
+ int cnt;
+ if (novl > nmax)
+ { nmax = 1.2*novl + 500;
+ gsort = (int *) Realloc(gsort,nmax*sizeof(int),"Allocating gap vector");
+ asort = (int *) Realloc(asort,nmax*sizeof(int),"Allocating adaptemer position vector");
+ if (gsort == NULL || asort == NULL)
+ exit (1);
+ }
+ lft = lblock->end;
+ rgt = rblock->beg;
+ lcv = lft - COVER_LEN;
+ rcv = rgt + COVER_LEN;
+ if (lcv < lblock->beg)
+ lcv = lblock->beg;
+ if (rcv > rblock->end)
+ rcv = rblock->end;
+ printf(" GAP [%5d,%5d]\n",lcv,rcv);
+ // Ten overlaps that cover the whole window are enough to call the gap LOWQ.
+ cnt = 0;
+ for (j = 0; j < novl; j++)
+ if (ovls[j].path.abpos <= lcv && ovls[j].path.aepos >= rcv)
+ { cnt += 1;
+ if (cnt >= 10)
+ break;
+ }
+ if (cnt >= 10)
+ {
+ printf(" LOWQ\n");
+ return (LOWQ);
+ }
+ { int bread, bcomp, blen;
+ int ab, ae;
+ int lcnt, rcnt, scnt, gcnt, acnt;
+ int lidx, ridx, sidx, cidx;
+ int k;
+ lcnt = rcnt = scnt = gcnt = acnt = 0;
+ // Walk the pile one B-read at a time: ovls[j..k) are the overlaps with bread.
+ // sidx/lidx/ridx remember an overlap that spans, anchors left of, or anchors
+ // right of the gap; cidx is the index at which the orientation changes.
+ for (j = 0; j < novl; j = k)
+ { bread = ovls[j].bread;
+ blen = DB->reads[bread].rlen;
+ bcomp = COMP(ovls[j].flags);
+ if (bcomp)
+ cidx = j;
+ lidx = ridx = sidx = -1;
+ for (k = j; k < novl; k++)
+ { if (ovls[k].bread != bread)
+ break;
+ if (COMP(ovls[k].flags) != (uint32) bcomp)
+ { cidx = k;
+ bcomp = COMP(ovls[k].flags);
+ }
+ ab = ovls[k].path.abpos;
+ ae = ovls[k].path.aepos;
+#ifdef SHOW_PAIRS
+ printf("\n %5d [%5d,%5d] %c",bread,ab,ae,COMP(ovls[k].flags)?'c':'n');
+ if (ab <= lcv && ae >= rcv)
+ printf("s");
+ else
+ printf(" ");
+ if (ab <= lcv && ae >= rcv)
+ { sidx = k;
+ continue;
+ }
+ // Duplicate left or right hits were due mainly to low complexity sequence
+ // Just ignore, you want to lose the left or right have that is bad
+ // Duplicate pairs are due entirely to unremoved adapters (palindromes)
+ // Let the complement pair go through, both are equivalent
+ // NOTE(review): the left-anchor tests below read ovls[j].path.bepos (first overlap
+ // of this B-read) while every other term comes from ovls[k] -- confirm intent.
+#ifdef SHOW_PAIRS
+ if (ae >= rcv && ab <= rcv && ab - ovls[k].path.bbpos <= lft - MIN_LEN)
+ printf("r");
+ else
+ printf(" ");
+ if (ab <= lcv && ae >= lcv && ae + (blen-ovls[j].path.bepos) >= rgt + MIN_LEN)
+ printf("l");
+ else
+ printf(" ");
+ if (ae >= rcv && ab <= rcv && ab - ovls[k].path.bbpos <= lft - MIN_LEN)
+ ridx = k;
+ if (ab <= lcv && ae >= lcv && ae + (blen-ovls[j].path.bepos) >= rgt + MIN_LEN)
+ lidx = k;
+ }
+ if (! bcomp)
+ cidx = k;
+#ifdef SHOW_PAIRS
+ printf(" =");
+ if (sidx >= 0)
+ printf(" S");
+ if (lidx >= 0)
+ printf(" L");
+ if (ridx >= 0)
+ printf(" R");
+ if (0 <= lidx && lidx < ridx && (ridx < cidx || lidx >= cidx))
+ printf(" G");
+ if ((0<=ridx && ridx<cidx && cidx<=lidx) || (0<=lidx && lidx<cidx && cidx<=ridx))
+ printf(" A");
+ if (sidx >= 0)
+ scnt += 1;
+ if (lidx >= 0)
+ lcnt += 1;
+ if (ridx >= 0)
+ rcnt += 1;
+ // Left and right anchors in the same orientation: record how much longer the
+ // gap is in A than in B.
+ if (0 <= lidx && lidx < ridx && (ridx < cidx || cidx <= lidx))
+ gsort[gcnt++] = (ovls[ridx].path.abpos - ovls[lidx].path.aepos)
+ - (ovls[ridx].path.bbpos - ovls[lidx].path.bepos);
+ // Left and right anchors in opposite orientations: record a position estimate
+ // for the adaptemer test.
+ if ((0<=ridx && ridx<cidx && cidx<=lidx) || (0<=lidx && lidx<cidx && cidx<=ridx))
+ asort[acnt++] = (((blen-ovls[ridx].path.bbpos) - ovls[lidx].path.bepos)
+ + (ovls[lidx].path.aepos + ovls[ridx].path.abpos))/2;
+ }
+#ifdef SHOW_PAIRS
+ printf("\n");
+ printf(" lcnt = %d scnt = %d(%d) rcnt = %d acnt = %d\n",lcnt,gcnt,scnt,rcnt,acnt);
+ { int64 med, dev = 0;
+ int std, low, hgh;
+ if (lcnt < rcnt)
+ rcnt = lcnt;
+ if (acnt >= .4*rcnt && scnt+gcnt < .3*acnt)
+ { qsort(asort,acnt,sizeof(int),GSORT);
+ med = asort[acnt/2];
+ low = acnt*.25;
+ hgh = acnt*.75;
+ // Accumulate the squared deviations over the middle quartiles (the original
+ // assignment kept only the last term).
+ for (j = low; j <= hgh; j++)
+ dev += (asort[j]-med)*(asort[j]-med);
+ std = sqrt((1.*dev)/acnt);
+ if (std > 200)
+ printf(" Warning: Read %d adaptemer test may be wrong\n",ovls[0].aread);
+ printf(" ADAPT %3d\n",std);
+ return (ADAPT);
+ }
+ // With no gap deltas at all, leave std at 0 rather than indexing an empty
+ // vector and dividing by zero.
+ std = 0;
+ if (gcnt > 0)
+ { qsort(gsort,gcnt,sizeof(int),GSORT);
+ med = gsort[gcnt/2];
+ low = gcnt*.25;
+ hgh = gcnt*.75;
+ for (j = low; j <= hgh; j++)
+ dev += (gsort[j]-med)*(gsort[j]-med);
+ std = sqrt((1.*dev)/gcnt);
+ }
+ if (std < 150 && scnt+gcnt >= 10 && (scnt+gcnt >= .4*rcnt || scnt+gcnt >= 20))
+ {
+ printf(" SPAN %3d %5d\n",std,rgt-lft);
+ return (SPAN);
+ }
+ else
+ {
+ if (rcnt >= 20)
+ printf(" STRONG SPLIT\n");
+ else
+ printf(" WEAK SPLIT\n");
+ if (scnt + gcnt >= 10)
+ printf(" UNCERTAIN %5.1f %5d %3d\n",(scnt+gcnt)/(.01*rcnt),rgt-lft,scnt+gcnt);
+ return (SPLIT);
+ }
+ }
+ }
+static int *GAP_ANALYSIS(Overlap *ovls, int novl, Interval *block, int nblk)   /* Classify every gap between consecutive HQ-blocks.
+ *   ovls[0..novl): overlap pile, handed through to gap_status.
+ *   block[0..nblk): the HQ-blocks of the read.
+ *   Returns a static vector in which status[i], for 1 <= i < nblk, is the gap_status
+ *   verdict for the gap between block[i-1] and block[i]; status[0] is never written.
+ *   The vector is reused, and possibly reallocated, by the next call. */
+{ static int bmax = 0;
+ static int *status = NULL; // Status of gaps between HQ_blocks
+ int i;
+ if (nblk > bmax)   // grow with headroom so that most later calls need no reallocation
+ { bmax = 1.2*nblk + 100;
+ status = (int *) Realloc(status,bmax*sizeof(int),"Allocating status vector");
+ if (status == NULL)
+ exit (1);
+ }
+ printf(" GAPS\n");
+ for (i = 1; i < nblk; i++)   // gap i lies to the left of block i, so there is no gap 0
+ status[i] = gap_status(ovls,novl,block+i);
+ return (status);
+ *
+ * Trim low-quality tips of reads and patch low quality intervals within a sequence
+ * Trim adapter (and associated redundant prefix or suffix)
+ * Break chimers or all unscaffoldable no-coverage gaps of reads
+ *
+ ********************************************************************************************/
+// Analyze all the gaps between the good patches found in the first pass.
+// Consider a hole between two good intervals [lb,le] and [rb,re]. An overlap
+ // is anchored to the left of the hole if abpos <= le-COVER_LEN and aepos >= rb+COVER_LEN
+static void GAPS(int aread, Overlap *ovls, int novl)   /* Per-read driver; main passes it to make_a_pass as the ACTION.
+ *   aread: index of the A-read; ovls[0..novl): its overlap pile.
+ *   Steps: build the HQ-blocks, let FIND_HOLES revise them, classify every gap between
+ *   blocks, then keep the best run of blocks that is not interrupted by an ADAPT gap.
+ *   The global statistics counters are updated only when VERBOSE is set.
+ *   NOTE(review): the #if/#ifdef lines below have no visible #endif in this copy of the
+ *   patch, so which statements are conditional cannot be determined from here.
+ *   NOTE(review): FIND_HOLES can change nblk, and nblk is not compared with 0 again
+ *   before block[0] and block[nblk-1] are read -- confirm it cannot come back as 0. */
+{ int alen;
+ int nblk;
+ Interval *block;
+ int *status;
+ int nhole;
+ Interval *holes;
+#if defined(DEBUG_HQ_BLOCKS) || defined(SHOW_EVENTS) || defined(DEBUG_HOLE_FINDER) || defined(DEBUG_ADAPTER)
+ printf("\n");
+ printf("AREAD %d\n",aread);
+ alen = DB->reads[aread].rlen;
+ if (VERBOSE)
+ { nreads += 1;
+ totlen += alen;
+ }
+ // Partition into HQ-blocks
+ block = HQ_BLOCKS(aread,&nblk);
+ // No blocks? ==> nothing to do
+ if (nblk <= 0)
+ { if (VERBOSE)
+ { nelim += 1;
+ nelimbp += alen;
+ }
+#ifdef ANNOTATE
+ fwrite(&HQ_INDEX,sizeof(int64),1,HQ_AFILE);
+ fwrite(&SN_INDEX,sizeof(int64),1,SN_AFILE);
+ fwrite(&SP_INDEX,sizeof(int64),1,SP_AFILE);
+ fwrite(&AD_INDEX,sizeof(int64),1,AD_AFILE);
+ fwrite(&HL_INDEX,sizeof(int64),1,HL_AFILE);
+ fwrite(&KP_INDEX,sizeof(int64),1,KP_AFILE);
+ return;
+ }
+ // Find holes and modify HQ-blocks if necessary
+ holes = FIND_HOLES(aread,ovls,novl,&nhole,&block,&nblk);
+ if (VERBOSE)
+ { if (block[0].beg > 0)   // bases before the first block count as a 5' trim
+ { n5trm += 1;
+ n5trmbp += block[0].beg;
+ }
+ if (block[nblk-1].end < alen)   // bases after the last block count as a 3' trim
+ { n3trm += 1;
+ n3trmbp += alen - block[nblk-1].end;
+ }
+ }
+ // Determine the status of each gap between a pair of blocks
+ status = GAP_ANALYSIS(ovls,novl,block,nblk);
+#ifdef ANNOTATE
+ { int i;
+ for (i = 0; i < nblk; i++)
+ { fwrite(&(block[i].beg),sizeof(int),1,HQ_DFILE);
+ fwrite(&(block[i].end),sizeof(int),1,HQ_DFILE);
+ if (i > 0)
+ { if (status[i] == SPAN || status[i] == LOWQ)
+ { fwrite(&(block[i-1].end),sizeof(int),1,SN_DFILE);
+ fwrite(&(block[i].beg),sizeof(int),1,SN_DFILE);
+ SN_INDEX += 2*sizeof(int);
+ }
+ else if (status[i] == SPLIT)
+ { fwrite(&(block[i-1].end),sizeof(int),1,SP_DFILE);
+ fwrite(&(block[i].beg),sizeof(int),1,SP_DFILE);
+ SP_INDEX += 2*sizeof(int);
+ }
+ else // status[i] == ADAPT
+ { fwrite(&(block[i-1].end),sizeof(int),1,AD_DFILE);
+ fwrite(&(block[i].beg),sizeof(int),1,AD_DFILE);
+ AD_INDEX += 2*sizeof(int);
+ }
+ }
+ }
+ HQ_INDEX += 2*sizeof(int)*nblk;
+ fwrite(&HQ_INDEX,sizeof(int64),1,HQ_AFILE);
+ fwrite(&SN_INDEX,sizeof(int64),1,SN_AFILE);
+ fwrite(&SP_INDEX,sizeof(int64),1,SP_AFILE);
+ fwrite(&AD_INDEX,sizeof(int64),1,AD_AFILE);
+ for (i = 0; i < nhole; i++)
+ if (holes[i].end - holes[i].beg < 75)   // holes shorter than 75 are written widened by 50 on each side, then restored
+ { holes[i].end += 50;
+ holes[i].beg -= 50;
+ fwrite(&(holes[i].beg),sizeof(int),1,HL_DFILE);
+ fwrite(&(holes[i].end),sizeof(int),1,HL_DFILE);
+ holes[i].end -= 50;
+ holes[i].beg += 50;
+ }
+ else
+ { fwrite(&(holes[i].beg),sizeof(int),1,HL_DFILE);
+ fwrite(&(holes[i].end),sizeof(int),1,HL_DFILE);
+ }
+ HL_INDEX += 2*nhole*sizeof(int);
+ fwrite(&HL_INDEX,sizeof(int64),1,HL_AFILE);
+ }
+ { int cmax, amax, abeg = 0, aend = 0;   // [abeg,aend) is the best run of blocks found so far, amax its score
+ int p, i;
+ amax = 0;
+ p = 0;   // first block of the run currently being scored
+ cmax = block[0].end-block[0].beg;
+ for (i = 1; i < nblk; i++)
+ if (status[i] == ADAPT)   // an ADAPT gap ends the current run and starts a new one at block i
+ { if (cmax > amax)
+ { amax = cmax;
+ abeg = p;
+ aend = i;
+ }
+ p = i;
+ cmax = block[i].end - block[i].beg;
+ }
+ else if (status[i] != SPLIT)   // LOWQ or SPAN: the gap bases are added to the score along with block i
+ cmax += block[i].end - block[i-1].end;
+ else   // SPLIT: only the bases of block i are added
+ cmax += block[i].end - block[i].beg;
+ if (cmax > amax)
+ { amax = cmax;
+ abeg = p;
+ aend = nblk;
+ }
+ printf(" Keeping %d-%d [%d,%d]\n",abeg,aend-1,block[abeg].beg,block[aend-1].end);
+ if (VERBOSE)
+ { if (abeg > 0)
+ { natrm += 1;
+ natrmbp += block[abeg].beg - block[0].beg;
+ }
+ if (aend < nblk)
+ { natrm += 1;
+ natrmbp += (block[nblk-1].end - block[aend-1].end);
+ }
+ for (i = abeg+1; i < aend; i++)
+ { ngaps += 1;
+ ngapsbp += block[i].beg - block[i-1].end;
+ if (status[i] == LOWQ)
+ { nlowq += 1;
+ nlowqbp += block[i].beg - block[i-1].end;
+ }
+ else if (status[i] == SPAN)
+ { nspan += 1;
+ nspanbp += block[i].beg - block[i-1].end;
+ }
+ else // status[i] == SPLIT
+ { nchim += 1;
+ nchimbp += block[i].beg - block[i-1].end;
+ }
+ }
+ }
+ nblk = aend-abeg;   // from here on block[] and status[] are re-based to the kept run
+ block += abeg;
+ status += abeg;
+ holes += abeg;
+ }
+#ifdef ANNOTATE
+ { int i;
+ fwrite(&(block[0].beg),sizeof(int),1,KP_DFILE);
+ for (i = 1; i < nblk; i++)
+ if (status[i] == SPLIT)
+ { fwrite(&(block[i-1].end),sizeof(int),1,KP_DFILE);
+ fwrite(&(block[i].beg),sizeof(int),1,KP_DFILE);
+ KP_INDEX += 2*sizeof(int);
+ }
+ fwrite(&(block[nblk-1].end),sizeof(int),1,KP_DFILE);
+ KP_INDEX += 2*sizeof(int);
+ fwrite(&KP_INDEX,sizeof(int64),1,KP_AFILE);
+ }
+ // Read in each successive pile and call ACTION on it. Read in the traces only if
+ // "trace" is nonzero
+ //   input: the open .las file; it is rewound first, so the pass always starts at the top.
+ //   ACTION(j,ovls,n) is called once for every read j in [DB_FIRST,DB_LAST) with that
+ //     read's n overlaps; n is 0 when the file holds no overlap for j.
+ //   Returns the size of the largest pile seen.
+ // ovls and paths are static buffers that grow on demand and persist across calls. One
+ // overlap of look-ahead is always held: the first overlap of the next pile sits in
+ // ovls[n], with its trace at paths+pcur, when the current pile ends.
+ // NOTE(review): the "if" that chooses between the two tbytes assignments below is not
+ // present in this copy of the patch; those lines are reproduced exactly as found.
+static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int trace)
+{ static Overlap *ovls = NULL;
+ static int omax = 500;
+ static uint16 *paths = NULL;
+ static int pmax = 100000;
+ int64 i, j, novl;
+ int n, a;
+ int pcur;
+ int max;
+ int tbytes;
+ if (ovls == NULL)
+ { ovls = (Overlap *) Malloc(sizeof(Overlap)*omax,"Allocating overlap buffer");
+ if (ovls == NULL)
+ exit (1);
+ }
+ if (trace && paths == NULL)
+ { paths = (uint16 *) Malloc(sizeof(uint16)*pmax,"Allocating path buffer");
+ if (paths == NULL)
+ exit (1);
+ }
+ rewind(input);
+ fread(&novl,sizeof(int64),1,input);
+ fread(&TRACE_SPACING,sizeof(int),1,input);
+ tbytes = sizeof(uint8);
+ else
+ tbytes = sizeof(uint16);
+ // Prime the look-ahead with the first overlap of the file.
+ Read_Overlap(input,ovls);
+ if (trace)
+ { if (ovls[0].path.tlen > pmax)
+ { pmax = 1.2*(ovls[0].path.tlen)+10000;
+ paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer");
+ if (paths == NULL) exit (1);
+ }
+ fread(paths,tbytes,ovls[0].path.tlen,input);
+ if (tbytes == 1)
+ { ovls[0].path.trace = paths;
+ Decompress_TraceTo16(ovls);
+ }
+ }
+ else
+ fseek(input,tbytes*ovls[0].path.tlen,SEEK_CUR);
+ if (ovls[0].aread < DB_FIRST)
+ { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n",
+ Prog_Name,DB_PART);
+ exit (1);
+ }
+ pcur = 0;
+ n = max = 0;
+ for (j = DB_FIRST; j < DB_LAST; j++)
+ { ovls[0] = ovls[n];
+ a = ovls[0].aread;
+ if (a != j)
+ n = 0;
+ else
+ { // Slide the look-ahead trace to the front of the buffer. Source and destination
+ // lie in the same buffer and overlap whenever this trace is longer than the
+ // previous pile's traces combined (or pcur is 0), so memmove is required.
+ if (trace)
+ memmove(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen);
+ n = 1;
+ pcur = ovls[0].path.tlen;
+ while (1)
+ { if (Read_Overlap(input,ovls+n) != 0)
+ { ovls[n].aread = INT32_MAX;
+ break;
+ }
+ if (trace)
+ { if (pcur + ovls[n].path.tlen > pmax)
+ { pmax = 1.2*(pcur+ovls[n].path.tlen)+10000;
+ paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer");
+ if (paths == NULL) exit (1);
+ }
+ fread(paths+pcur,tbytes,ovls[n].path.tlen,input);
+ if (tbytes == 1)
+ { ovls[n].path.trace = paths+pcur;
+ Decompress_TraceTo16(ovls+n);
+ }
+ }
+ else
+ fseek(input,tbytes*ovls[n].path.tlen,SEEK_CUR);
+ if (ovls[n].aread != a)
+ break;
+ pcur += ovls[n].path.tlen;
+ n += 1;
+ if (n >= omax)
+ { omax = 1.2*n + 100;
+ ovls = (Overlap *) Realloc(ovls,sizeof(Overlap)*omax,"Expanding overlap buffer");
+ if (ovls == NULL) exit (1);
+ }
+ }
+ if (n >= max)
+ max = n;
+ // The path buffer may have been reallocated while reading, so the trace pointers
+ // are assigned only now that the pile is complete.
+ pcur = 0;
+ for (i = 0; i < n; i++)
+ { ovls[i].path.trace = paths+pcur;
+ pcur += ovls[i].path.tlen;
+ }
+ }
+ ACTION(j,ovls,n);
+ }
+ return (max);
+// DAStrim entry point.
+//   Options handled here: -v (verbose statistics), -g<int> and -b<int> (good / bad QV
+//   thresholds, both required, good must not exceed bad), -l<int> (minimum retained
+//   segment length, default 1000).  Remaining arguments: the database, then one or
+//   more .las files.
+//   For each .las argument the read range [DB_FIRST,DB_LAST) is determined, the 'qual'
+//   track is loaded, and GAPS is run on every pile through make_a_pass.  With -v a
+//   statistics summary is printed to stdout.
+// NOTE(review): this copy of the patch has lost some lines (the statements controlled by
+// the fscanf/fgets checks, and the #endif / closing lines of the SETUP macro); those
+// stretches are reproduced exactly as found.
+int main(int argc, char *argv[])
+{ FILE *input;
+ char *root, *dpwd;
+ char *las, *lpwd;
+ int64 novl;
+ HITS_TRACK *track;
+ int c;
+ // Process arguments
+ { int i, j, k;
+ int flags[128];
+ char *eptr;
+ ARG_INIT("DAStrim")
+ BAD_QV = -1;
+ GOOD_QV = -1;
+ MIN_LEN = 1000;
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ switch (argv[i][1])
+ { default:
+ ARG_FLAGS("v")
+ break;
+ case 'b':
+ ARG_NON_NEGATIVE(BAD_QV,"Minimum QV score for being considered bad")
+ break;
+ case 'g':
+ ARG_NON_NEGATIVE(GOOD_QV,"Maximum QV score for being considered good")
+ break;
+ case 'l':
+ ARG_POSITIVE(MIN_LEN,"Minimum retained segment length")
+ break;
+ }
+ else
+ argv[j++] = argv[i];
+ argc = j;
+ VERBOSE = flags['v'];
+ if (argc < 3)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+ if (GOOD_QV < 0)
+ { fprintf(stderr,"%s: Must supply -g parameter\n",Prog_Name);
+ exit (1);
+ }
+ if (BAD_QV < 0)
+ { fprintf(stderr,"%s: Must supply -b parameter\n",Prog_Name);
+ exit (1);
+ }
+ if (GOOD_QV > BAD_QV)
+ { fprintf(stderr,"%s: Good QV threshold (%d) > Bad QV threshold (%d) ?\n",
+ Prog_Name,GOOD_QV,BAD_QV);
+ exit (1);
+ }
+ }
+ // Open trimmed DB and the prim-track
+ { int status;
+ status = Open_DB(argv[1],DB);
+ if (status < 0)
+ exit (1);
+ if (status == 1)
+ { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ if (DB->part)
+ { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ Trim_DB(DB);
+ }
+ // Initialize statistics gathering
+ if (VERBOSE)
+ { nreads = 0;
+ totlen = 0;
+ nelim = 0;
+ n5trm = 0;
+ n3trm = 0;
+ natrm = 0;
+ nelimbp = 0;
+ n5trmbp = 0;
+ n3trmbp = 0;
+ natrmbp = 0;
+ ngaps = 0;
+ nlowq = 0;
+ nspan = 0;
+ nchim = 0;
+ ngapsbp = 0;
+ nlowqbp = 0;
+ nspanbp = 0;
+ nchimbp = 0;
+ printf("\nDAStrim -g%d -b%d -l%d %s", GOOD_QV,BAD_QV,MIN_LEN,argv[1]);
+ for (c = 2; c < argc; c++)
+ printf(" %s",argv[c]);
+ printf("\n");
+ }
+ // Determine if overlap block is being processed and if so get first and last read
+ // from .db file
+ dpwd = PathTo(argv[1]);
+ root = Root(argv[1],".db");
+ for (c = 2; c < argc; c++)
+ { las = Root(argv[c],".las");
+ { FILE *dbfile;
+ char buffer[2*MAX_NAME+100];
+ char *p, *eptr;
+ int i, part, nfiles, nblocks, cutoff, all, oindx;
+ int64 size;
+ DB_PART = 0;
+ DB_FIRST = 0;
+ DB_LAST = DB->nreads;
+ // A purely numeric final extension on the .las root is taken as a block number.
+ p = rindex(las,'.');
+ if (p != NULL)
+ { part = strtol(p+1,&eptr,10);
+ if (*eptr == '\0' && eptr != p+1)
+ { dbfile = Fopen(Catenate(dpwd,"/",root,".db"),"r");
+ if (dbfile == NULL)
+ exit (1);
+ if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
+ for (i = 0; i < nfiles; i++)
+ if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL)
+ if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1)
+ if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3)
+ for (i = 1; i <= part; i++)
+ if (fscanf(dbfile,DB_BDATA,&oindx,&DB_FIRST) != 2)
+ if (fscanf(dbfile,DB_BDATA,&oindx,&DB_LAST) != 2)
+ fclose(dbfile);
+ DB_PART = part;
+ *p = '\0';
+ }
+ }
+ }
+ track = Load_Track(DB,"qual");
+ if (track != NULL)
+ { QV_IDX = (int64 *) track->anno;
+ QV = (uint8 *) track->data;
+ }
+ else
+ { fprintf(stderr,"%s: Must have a 'qual' track, run DASqv\n",Prog_Name);
+ exit (1);
+ }
+#ifdef ANNOTATE
+ // Set up QV trimming track
+#define SETUP(AFILE,DFILE,INDEX,anno,data) \
+{ int len, size; \
+ \
+ if (DB_PART > 0) \
+ { AFILE = Fopen(Catenate(dpwd,PATHSEP,root, \
+ Numbered_Suffix(".",DB_PART,anno)),"w"); \
+ DFILE = Fopen(Catenate(dpwd,PATHSEP,root, \
+ Numbered_Suffix(".",DB_PART,data)),"w"); \
+ } \
+ else \
+ { AFILE = Fopen(Catenate(dpwd,PATHSEP,root,anno),"w"); \
+ DFILE = Fopen(Catenate(dpwd,PATHSEP,root,data),"w"); \
+ } \
+ if (AFILE == NULL || DFILE == NULL) \
+ exit (1); \
+ \
+ len = DB_LAST - DB_FIRST; \
+ size = 0; \
+ fwrite(&len,sizeof(int),1,AFILE); \
+ fwrite(&size,sizeof(int),1,AFILE); \
+ INDEX = 0; \
+ fwrite(&INDEX,sizeof(int64),1,AFILE); \
+ SETUP(HQ_AFILE,HQ_DFILE,HQ_INDEX,".hq.anno",".hq.data")
+ SETUP(SN_AFILE,SN_DFILE,SN_INDEX,".span.anno",".span.data")
+ SETUP(SP_AFILE,SP_DFILE,SP_INDEX,".split.anno",".split.data")
+ SETUP(AD_AFILE,AD_DFILE,AD_INDEX,".adapt.anno",".adapt.data")
+ SETUP(HL_AFILE,HL_DFILE,HL_INDEX,".hole.anno",".hole.data")
+ SETUP(KP_AFILE,KP_DFILE,KP_INDEX,".keep.anno",".keep.data")
+ // Open overlap file
+ // The directory must come from the .las argument being processed (argv[c]); using
+ // argv[2] made every later .las file resolve against the first one's directory.
+ lpwd = PathTo(argv[c]);
+ if (DB_PART)
+ input = Fopen(Catenate(lpwd,"/",las,Numbered_Suffix(".",DB_PART,".las")),"r");
+ else
+ input = Fopen(Catenate(lpwd,"/",las,".las"),"r");
+ if (input == NULL)
+ exit (1);
+ free(lpwd);
+ free(las);
+ // Get trace point spacing information
+ fread(&novl,sizeof(int64),1,input);
+ fread(&TRACE_SPACING,sizeof(int),1,input);
+ make_a_pass(input,GAPS,1);
+ // Clean up
+#ifdef ANNOTATE
+ fclose(HQ_AFILE);
+ fclose(HQ_DFILE);
+ fclose(SN_AFILE);
+ fclose(SN_DFILE);
+ fclose(SP_AFILE);
+ fclose(SP_DFILE);
+ fclose(AD_AFILE);
+ fclose(AD_DFILE);
+ fclose(HL_AFILE);
+ fclose(HL_DFILE);
+ fclose(KP_AFILE);
+ fclose(KP_DFILE);
+ }
+ // If verbose output statistics summary to stdout
+ if (VERBOSE)
+ { printf("\nInput: ");
+ Print_Number((int64) nreads,7,stdout);
+ printf(" (100.0%%) reads ");
+ Print_Number(totlen,12,stdout);
+ printf(" (100.0%%) bases\n");
+ printf("Trimmed: ");
+ Print_Number(nelim,7,stdout);
+ printf(" (%5.1f%%) reads ",(100.*nelim)/nreads);
+ Print_Number(nelimbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*nelimbp)/totlen);
+ printf("5' trim: ");
+ Print_Number(n5trm,7,stdout);
+ printf(" (%5.1f%%) reads ",(100.*n5trm)/nreads);
+ Print_Number(n5trmbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*n5trmbp)/totlen);
+ printf("3' trim: ");
+ Print_Number(n3trm,7,stdout);
+ printf(" (%5.1f%%) reads ",(100.*n3trm)/nreads);
+ Print_Number(n3trmbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*n3trmbp)/totlen);
+ printf("Adapter: ");
+ Print_Number(natrm,7,stdout);
+ printf(" (%5.1f%%) reads ",(100.*natrm)/nreads);
+ Print_Number(natrmbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*natrmbp)/totlen);
+ printf("\n");
+ printf("Gaps: ");
+ Print_Number(ngaps,7,stdout);
+ printf(" (%5.1f%%) gaps ",(100.*(ngaps))/nreads);
+ Print_Number(ngapsbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*(ngapsbp))/totlen);
+ printf(" Low QV: ");
+ Print_Number(nlowq,7,stdout);
+ printf(" (%5.1f%%) gaps ",(100.*(nlowq))/nreads);
+ Print_Number(nlowqbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*(nlowqbp))/totlen);
+ printf(" Span'd: ");
+ Print_Number(nspan,7,stdout);
+ printf(" (%5.1f%%) gaps ",(100.*(nspan))/nreads);
+ Print_Number(nspanbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*(nspanbp))/totlen);
+ printf(" Break: ");
+ Print_Number(nchim,7,stdout);
+ printf(" (%5.1f%%) gaps ",(100.*(nchim))/nreads);
+ Print_Number(nchimbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*(nchimbp))/totlen);
+ printf("\n");
+ // The clip count and the percentage both include chimer breaks, so the base total
+ // printed between them has to include nchimbp as well.
+ printf("Clipped: ");
+ Print_Number(n5trm+n3trm+nelim+natrm+nchim,7,stdout);
+ printf(" clips ");
+ Print_Number(n5trmbp+n3trmbp+nelimbp+natrmbp+nchimbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*(n5trmbp+n3trmbp+nelimbp+natrmbp+nchimbp))/totlen);
+ printf("Patched: ");
+ Print_Number(nlowq+nspan,7,stdout);
+ printf(" patches ");
+ Print_Number(nlowqbp+nspanbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*(nlowqbp+nspanbp))/totlen);
+ }
+ free(dpwd);
+ free(root);
+ Close_DB(DB);
+ free(Prog_Name);
+ exit (0);
diff --git a/DB.c b/DB.c
new file mode 100644
index 0000000..b536536
--- /dev/null
+++ b/DB.c
@@ -0,0 +1,1733 @@
+ *
+ * Compressed data base module. Auxiliary routines to open and manipulate a data base for
+ * which the sequence and read information are separated into two separate files, and the
+ * sequence is compressed into 2-bits for each base. Support for tracks of additional
+ * information, and trimming according to the current partition.
+ *
+ * Author : Gene Myers
+ * Date : July 2013
+ * Revised: April 2014
+ *
+ ********************************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <unistd.h>
+#include <dirent.h>
+#include "DB.h"
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#define PATHSEP "/"
+ *
+ *
+ ********************************************************************************************/
+char *Prog_Name;   // program name; passed as the first "%s" of every diagnostic built in this file
+char Ebuffer[1000];   // NOTE(review): presumably the buffer behind EPLACE for EPRINTF messages -- confirm in DB.h
+void *Malloc(int64 size, char *mesg)   /* malloc() wrapper with error reporting.
+ *   size: number of bytes requested.
+ *   mesg: optional context added to the error text (may be NULL).
+ *   Returns the new block, or NULL after reporting "Out of memory" through EPRINTF. */
+{ void *p;
+ if ((p = malloc(size)) == NULL)
+ { if (mesg == NULL)
+ EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name);
+ else
+ EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg);
+ }
+ return (p);
+void *Realloc(void *p, int64 size, char *mesg)   /* realloc() wrapper with error reporting.
+ *   p: block to resize, handed straight to realloc().
+ *   size: new size in bytes.
+ *   mesg: optional context added to the error text (may be NULL).
+ *   Returns the resized block, or NULL after reporting "Out of memory" through EPRINTF. */
+{ if (size <= 0)   // a non-positive request is turned into a request for one byte
+ size = 1;
+ if ((p = realloc(p,size)) == NULL)
+ { if (mesg == NULL)
+ EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name);
+ else
+ EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg);
+ }
+ return (p);
+char *Strdup(char *name, char *mesg)   /* strdup() wrapper with error reporting.
+ *   name: string to copy; a NULL name yields NULL without any report.
+ *   mesg: optional context added to the error text (may be NULL).
+ *   Returns the copy, or NULL after reporting "Out of memory" through EPRINTF. */
+{ char *s;
+ if (name == NULL)
+ return (NULL);
+ if ((s = strdup(name)) == NULL)
+ { if (mesg == NULL)
+ EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name);
+ else
+ EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg);
+ }
+ return (s);
+FILE *Fopen(char *name, char *mode)   /* fopen() wrapper with error reporting.
+ *   name, mode: as for fopen(); if either is NULL the result is NULL and nothing is reported.
+ *   Returns the stream, or NULL after reporting "Cannot open ..." through EPRINTF. */
+{ FILE *f;
+ if (name == NULL || mode == NULL)
+ return (NULL);
+ if ((f = fopen(name,mode)) == NULL)
+ EPRINTF(EPLACE,"%s: Cannot open %s for '%s'\n",Prog_Name,name,mode);
+ return (f);
+char *PathTo(char *name)   /* Return a newly allocated copy of the directory part of name.
+ *   The directory part is everything before the last '/'; "." is returned when name
+ *   contains no '/'.
+ *   Returns NULL when name is NULL or when Strdup fails.
+ *   name is modified temporarily: the last '/' is overwritten and then put back. */
+{ char *path, *find;
+ if (name == NULL)
+ return (NULL);
+ if ((find = rindex(name,'/')) != NULL)
+ { *find = '\0';
+ path = Strdup(name,"Extracting path from");
+ *find = '/';
+ }
+ else
+ path = Strdup(".","Allocating default path");
+ return (path);
+// Return a newly allocated copy of the file-name part of name (the text after the last
+// '/') with a suffix removed.
+//   suffix == NULL: everything from the first '.' of the file name onwards is dropped.
+//   suffix != NULL: the suffix is dropped when the file name ends with it (compared
+//                   case-insensitively) and is longer than it; otherwise the file name
+//                   is copied unchanged.
+// Returns NULL when name is NULL or when Strdup fails.
+// name is modified temporarily and restored to its exact original bytes before returning.
+char *Root(char *name, char *suffix)
+{ char *path, *find, *dot;
+ int epos;
+ char save;
+ if (name == NULL)
+ return (NULL);
+ find = rindex(name,'/');
+ if (find == NULL)
+ find = name;
+ else
+ find += 1;
+ if (suffix == NULL)
+ { dot = strchr(find,'.');
+ if (dot != NULL)
+ *dot = '\0';
+ path = Strdup(find,"Extracting root from");
+ if (dot != NULL)
+ *dot = '.';
+ }
+ else
+ { epos = strlen(find);
+ epos -= strlen(suffix);
+ if (epos > 0 && strcasecmp(find+epos,suffix) == 0)
+ { // The match is case-insensitive, so the byte being overwritten need not equal
+ // suffix[0]; save and restore the real one.
+ save = find[epos];
+ find[epos] = '\0';
+ path = Strdup(find,"Extracting root from");
+ find[epos] = save;
+ }
+ else
+ path = Strdup(find,"Allocating root");
+ }
+ return (path);
+// Build the string path+sep+root+suffix in a static buffer and return it.
+//   Returns NULL when any argument is NULL, or after reporting "Out of memory" through
+//   EPRINTF when the buffer cannot be enlarged.
+//   The buffer is owned by this routine and overwritten by the next call, so the caller
+//   must not free it and must copy the result if it is needed later.
+char *Catenate(char *path, char *sep, char *root, char *suffix)
+{ static char *cat = NULL;
+ static int max = -1;
+ int len;
+ if (path == NULL || root == NULL || sep == NULL || suffix == NULL)
+ return (NULL);
+ len = strlen(path);
+ len += strlen(sep);
+ len += strlen(root);
+ len += strlen(suffix);
+ if (len > max)
+ { // Commit the new buffer and capacity only once realloc has succeeded: on failure
+ // the old buffer is still valid, and max must keep describing it so that a later
+ // call cannot write through a NULL or undersized buffer.
+ int nmax = ((int) (1.2*len)) + 100;
+ char *ncat = (char *) realloc(cat,nmax+1);
+ if (ncat == NULL)
+ { EPRINTF(EPLACE,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root);
+ return (NULL);
+ }
+ cat = ncat;
+ max = nmax;
+ }
+ sprintf(cat,"%s%s%s%s",path,sep,root,suffix);
+ return (cat);
+// Build the string left+<num in decimal>+right in a static buffer and return it.
+//   Returns NULL when left or right is NULL, or after reporting "Out of memory" through
+//   EPRINTF when the buffer cannot be enlarged.
+//   The buffer is owned by this routine and overwritten by the next call, so the caller
+//   must not free it and must copy the result if it is needed later.
+char *Numbered_Suffix(char *left, int num, char *right)
+{ static char *suffix = NULL;
+ static int max = -1;
+ int len;
+ if (left == NULL || right == NULL)
+ return (NULL);
+ len = strlen(left);
+ len += strlen(right) + 40; // 40 bytes of room for the digits of num
+ if (len > max)
+ { // Commit the new buffer and capacity only once realloc has succeeded: on failure
+ // the old buffer is still valid, and max must keep describing it so that a later
+ // call cannot write through a NULL or undersized buffer.
+ int nmax = ((int) (1.2*len)) + 100;
+ char *nsuffix = (char *) realloc(suffix,nmax+1);
+ if (nsuffix == NULL)
+ { EPRINTF(EPLACE,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num);
+ return (NULL);
+ }
+ suffix = nsuffix;
+ max = nmax;
+ }
+ sprintf(suffix,"%s%d%s",left,num,right);
+ return (suffix);
+#define COMMA ','
+/* Print big integers with commas/periods for better readability.
+ *   num: value to print; out: destination stream.
+ *   The digits are written in groups of three separated by COMMA.
+ *   width == 0: no padding.
+ *   width != 0: the leading group is right-justified so that the whole number fills
+ *   width columns; when width is too small for the separators (<= 4, 8 or 12 for one,
+ *   two or three separators) the number is printed without padding.
+ *   Values of 10^9 and above always use three separators, so the leading group is
+ *   whatever remains above 10^9. */
+void Print_Number(int64 num, int width, FILE *out)
+{ if (width == 0)
+ { if (num < 1000ll)
+ fprintf(out,"%lld",num);
+ else if (num < 1000000ll)
+ fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll);
+ else if (num < 1000000000ll)
+ fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll,
+ COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll);
+ else
+ fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll,
+ COMMA,(num%1000000000ll)/1000000ll,
+ COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll);
+ }
+ else
+ { if (num < 1000ll)
+ fprintf(out,"%*lld",width,num);
+ else if (num < 1000000ll)
+ { if (width <= 4)   // no room left for padding once ",ddd" is accounted for
+ fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll);
+ else
+ fprintf(out,"%*lld%c%03lld",width-4,num/1000ll,COMMA,num%1000ll);
+ }
+ else if (num < 1000000000ll)
+ { if (width <= 8)
+ fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll,COMMA,(num%1000000ll)/1000ll,
+ COMMA,num%1000ll);
+ else
+ fprintf(out,"%*lld%c%03lld%c%03lld",width-8,num/1000000ll,COMMA,(num%1000000ll)/1000ll,
+ COMMA,num%1000ll);
+ }
+ else
+ { if (width <= 12)
+ fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll,COMMA,
+ (num%1000000000ll)/1000000ll,COMMA,
+ (num%1000000ll)/1000ll,COMMA,num%1000ll);
+ else
+ fprintf(out,"%*lld%c%03lld%c%03lld%c%03lld",width-12,num/1000000000ll,COMMA,
+ (num%1000000000ll)/1000000ll,COMMA,
+ (num%1000000ll)/1000ll,COMMA,num%1000ll);
+ }
+ }
+/* Return the number of digits, base 10, of num.
+ * The count is 0 whenever num < 1, so zero and negative values report no digits. */
+int Number_Digits(int64 num)
+{ int digit;
+ digit = 0;
+ while (num >= 1)
+ { num /= 10;
+ digit += 1;
+ }
+ return (digit);
+ *
+ *
+ ********************************************************************************************/
+/* Compress a read from one base per byte (values 0-3) into 2 bits per base, in place.
+ *   len: number of bases; s: the read.
+ *   Four consecutive bases are packed into one byte, the first base in the two high bits.
+ *   s[len], s[len+1] and s[len+2] are zeroed first so that a final partial group is
+ *   packed with zero padding; s[len+1] and s[len+2] are restored afterwards, while
+ *   s[len] is left at 0.
+ *   NOTE(review): bytes up to s[len+2] are read and written, so the buffer presumably
+ *   has that much slack beyond the read -- confirm against the callers. */
+void Compress_Read(int len, char *s)
+{ int i;
+ char c, d;
+ char *s0, *s1, *s2, *s3;
+ s0 = s;   // s0..s3 view the same buffer at offsets 0..3, one pointer per base of a group
+ s1 = s0+1;
+ s2 = s1+1;
+ s3 = s2+1;
+ c = s1[len];
+ d = s2[len];
+ s0[len] = s1[len] = s2[len] = 0;
+ for (i = 0; i < len; i += 4)
+ *s++ = (char ) ((s0[i] << 6) | (s1[i] << 4) | (s2[i] << 2) | s3[i]);
+ s1[len] = c;
+ s2[len] = d;
+/* Uncompress a read from 2 bits per base into one base per byte (values 0-3), in place.
+ *   len: number of bases; s: the packed read, in a buffer that also holds the result.
+ *   The packed bytes are expanded from the last one back to the first, each into four
+ *   output bytes, and s[len] is then set to 4 as the end marker.
+ *   NOTE(review): the last group is always expanded to four bytes, so a few bytes past
+ *   the final base are written before s[len] is set -- presumably the buffer has that
+ *   slack; confirm against the callers. */
+void Uncompress_Read(int len, char *s)
+{ int i, tlen, byte;
+ char *s0, *s1, *s2, *s3;
+ char *t;
+ s0 = s;
+ s1 = s0+1;
+ s2 = s1+1;
+ s3 = s2+1;
+ tlen = (len-1)/4;   // index of the last packed byte
+ t = s+tlen;
+ for (i = tlen*4; i >= 0; i -= 4)
+ { byte = *t--;
+ s0[i] = (char) ((byte >> 6) & 0x3);
+ s1[i] = (char) ((byte >> 4) & 0x3);
+ s2[i] = (char) ((byte >> 2) & 0x3);
+ s3[i] = (char) (byte & 0x3);
+ }
+ s[len] = 4;
+/* Convert a read in [0-3] representation to lower-case ascii ('a','c','g','t'), in place.
+ * The input must end with the value 4; that end marker is replaced by '\0'. */
+void Lower_Read(char *s)
+{ static char letter[4] = { 'a', 'c', 'g', 't' };
+ for ( ; *s != 4; s++)
+ *s = letter[(int) *s];
+ *s = '\0';
+void Upper_Read(char *s)   /* Convert a read in [0-3] representation to upper-case ascii ('A','C','G','T'), in place.
+ *   The input must end with the value 4; that end marker is replaced by '\0'. */
+{ static char letter[4] = { 'A', 'C', 'G', 'T' };
+ for ( ; *s != 4; s++)
+ *s = letter[(int) *s];
+ *s = '\0';
+// Convert read in ascii representation to [0-3] representation (end with 4), in place.
+//   'a'/'A' -> 0, 'c'/'C' -> 1, 'g'/'G' -> 2, 't'/'T' -> 3; every other byte -> 0.
+//   The terminating '\0' is replaced by the value 4.
+void Number_Read(char *s)
+{ static char number[128] =
+ { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 0, 0, 0, 2,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 3, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 0, 0, 0, 2,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 3, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ };
+ for ( ; *s != '\0'; s++)
+ { // Bytes >= 0x80 would index outside the 128-entry table (negative where char is
+ // signed); map them to 0 like every other non-nucleotide character.
+ unsigned char c = (unsigned char) *s;
+ *s = (c < 128) ? number[c] : 0;
+ }
+ *s = 4;
+ *
+ *
+ ********************************************************************************************/
+// Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has
+// a part # in it then just the part is opened. The index array is allocated (for all or
+// just the part) and read in.
+// Return status of routine:
+// -1: The DB could not be opened for a reason reported by the routine to EPLACE
+// 0: Open of DB proceeded without mishap
+// 1: Open of DAM proceeded without mishap
+// On failure *db is restored to the value it had on entry.  Every exit, successful or not,
+// passes through the cleanup at the end of the routine so that "root" and "pwd" are released
+// and the '.' that was cut out of "root" to isolate a part number is put back.
+// The index is allocated with one spare record in front (db->reads points at the second
+// record of the allocation), so the allocation itself is always db->reads-1.
+int Open_DB(char* path, HITS_DB *db)
+{ HITS_DB dbcopy;
+  char *root, *pwd, *bptr, *fptr, *cat;
+  int nreads;
+  FILE *index, *dbvis;
+  int status, plen, isdam;
+  int part, cutoff, all;
+  int ufirst, tfirst, ulast, tlast;
+  status = -1;
+  dbcopy = *db;
+  bptr = NULL;                    // The cleanup code inspects bptr, so give it a value first
+  plen = strlen(path);
+  if (plen >= 4 && strcmp(path+(plen-4),".dam") == 0)   // Do not look in front of a short path
+    root = Root(path,".dam");
+  else
+    root = Root(path,".db");
+  pwd = PathTo(path);
+  if (root == NULL || pwd == NULL)
+    goto error;
+  bptr = rindex(root,'.');
+  if (bptr != NULL && bptr[1] != '\0' && bptr[1] != '-')
+    { part = strtol(bptr+1,&fptr,10);
+      if (*fptr != '\0' || part == 0)
+        part = 0;
+      else
+        *bptr = '\0';             // Name ends in ".<part #>": cut it off to get the DB root
+    }
+  else
+    part = 0;
+  isdam = 0;
+  cat = Catenate(pwd,"/",root,".db");
+  if (cat == NULL)
+    goto error;                   // Was a bare return that leaked root and pwd
+  if ((dbvis = fopen(cat,"r")) == NULL)
+    { cat = Catenate(pwd,"/",root,".dam");
+      if (cat == NULL)
+        goto error;               // Was a bare return that leaked root and pwd
+      if ((dbvis = fopen(cat,"r")) == NULL)
+        { EPRINTF(EPLACE,"%s: Could not open database %s\n",Prog_Name,path);
+          goto error;
+        }
+      isdam = 1;
+    }
+  if ((index = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r")) == NULL)
+    goto error1;
+  if (fread(db,sizeof(HITS_DB),1,index) != 1)
+    { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
+      goto error2;
+    }
+  // Scan the stub file: file list, then (if present) the partition parameters and the
+  //   first/last read numbers of the requested block
+  { int p, nblocks, nfiles;
+    int64 size;
+    char fname[MAX_NAME], prolog[MAX_NAME];
+    nblocks = 0;
+    if (fscanf(dbvis,DB_NFILE,&nfiles) != 1)
+      { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+        goto error2;
+      }
+    for (p = 0; p < nfiles; p++)
+      if (fscanf(dbvis,DB_FDATA,&tlast,fname,prolog) != 3)
+        { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+          goto error2;
+        }
+    if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1)
+      if (part == 0)
+        { cutoff = 0;             // Unpartitioned DB opened as a whole: keep everything
+          all = 1;
+        }
+      else
+        { EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n",
+                         Prog_Name,root);
+          goto error2;
+        }
+    else
+      { if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3)
+          { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+            goto error2;
+          }
+        if (part > nblocks)
+          { EPRINTF(EPLACE,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks);
+            goto error2;
+          }
+      }
+    if (part > 0)
+      { for (p = 1; p <= part; p++)
+          if (fscanf(dbvis,DB_BDATA,&ufirst,&tfirst) != 2)
+            { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+              goto error2;
+            }
+        if (fscanf(dbvis,DB_BDATA,&ulast,&tlast) != 2)
+          { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+            goto error2;
+          }
+      }
+    else
+      { ufirst = tfirst = 0;
+        ulast = db->ureads;
+        tlast = db->treads;
+      }
+  }
+  db->trimmed = 0;
+  db->tracks = NULL;
+  db->part = part;
+  db->cutoff = cutoff;
+  db->all = all;
+  db->ufirst = ufirst;
+  db->tfirst = tfirst;
+  nreads = ulast-ufirst;
+  if (part <= 0)
+    { db->reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index");
+      if (db->reads == NULL)
+        goto error2;
+      db->reads += 1;
+      if (fread(db->reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads)
+        { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
+          free(db->reads-1);      // Free the allocation itself, not the advanced pointer
+          goto error2;
+        }
+    }
+  else
+    { HITS_READ *reads;
+      int i, r, maxlen;
+      int64 totlen;
+      reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index");
+      if (reads == NULL)
+        goto error2;
+      reads += 1;
+      fseeko(index,sizeof(HITS_READ)*ufirst,SEEK_CUR);
+      if (fread(reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads)
+        { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
+          free(reads-1);          // Free the allocation itself, not the advanced pointer
+          goto error2;
+        }
+      totlen = 0;                 // For a block, totlen and maxlen describe just the block
+      maxlen = 0;
+      for (i = 0; i < nreads; i++)
+        { r = reads[i].rlen;
+          totlen += r;
+          if (r > maxlen)
+            maxlen = r;
+        }
+      db->maxlen = maxlen;
+      db->totlen = totlen;
+      db->reads = reads;
+    }
+  ((int *) (db->reads))[-1] = ulast - ufirst; // Kludge, need these for DB part
+  ((int *) (db->reads))[-2] = tlast - tfirst;
+  db->nreads = nreads;
+  db->path = Strdup(Catenate(pwd,PATHSEP,root,""),"Allocating Open_DB path");
+  if (db->path == NULL)
+    { free(db->reads-1);          // *db is about to be rolled back, so release the index now
+      goto error2;
+    }
+  db->bases = NULL;
+  db->loaded = 0;
+  status = isdam;
+error2:
+  fclose(index);
+error1:
+  fclose(dbvis);
+error:
+  if (bptr != NULL)
+    *bptr = '.';
+  free(pwd);
+  free(root);
+  if (status < 0)
+    *db = dbcopy;
+  return (status);
+}
+// Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings
+// of the current DB partition. Reallocate smaller memory blocks for the information kept
+// for the retained reads.
+void Trim_DB(HITS_DB *db)
+{ int i, j, r;
+ int allflag, cutoff;
+ int64 totlen;
+ int maxlen, nreads;
+ HITS_TRACK *record;
+ HITS_READ *reads;
+ if (db->trimmed) return;   // Already trimmed
+ if (db->cutoff <= 0 && db->all) return;   // No length cutoff and all reads wanted: returns with db->trimmed left unset
+ cutoff = db->cutoff;
+ if (db->all)   // A read is retained iff (flags & DB_BEST) >= allflag and rlen >= cutoff
+ allflag = 0;
+ else
+ allflag = DB_BEST;
+ reads = db->reads;
+ nreads = db->nreads;
+ for (record = db->tracks; record != NULL; record = record->next)   // Compact every loaded track before the read records themselves
+ if (strcmp(record->name,". at qvs") == 0)
+ { uint16 *table = ((HITS_QV *) record)->table;   // QV pseudo-track: compact its per-read table
+ j = 0;
+ for (i = 0; i < db->nreads; i++)
+ if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff)
+ table[j++] = table[i];
+ }
+ else
+ { int *anno4, size;
+ int64 *anno8;
+ char *anno, *data;
+ size = record->size;
+ data = (char *) record->data;
+ if (data == NULL)   // No data block: anno holds one fixed-size record of "size" bytes per read
+ { anno = (char *) record->anno;
+ j = 0;
+ for (i = r = 0; i < db->nreads; i++, r += size)   // r = byte offset of read i's record, j = byte offset of the next kept slot
+ if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff)
+ { memmove(anno+j,anno+r,size);
+ j += size;
+ }
+ memmove(anno+j,anno+r,size);   // Also carry along the record at index nreads that follows the last read
+ }
+ else if (size == 4)   // anno is an array of 4-byte offsets into data: read i owns data[anno4[i]..anno4[i+1])
+ { int ai;
+ anno4 = (int *) (record->anno);
+ j = anno4[0] = 0;
+ for (i = 0; i < db->nreads; i++)
+ if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff)
+ { ai = anno4[i];   // Saved first: anno4[j+1] below may overwrite anno4[i] when j+1 == i
+ anno4[j+1] = anno4[j] + (anno4[i+1]-ai);
+ memmove(data+anno4[j],data+ai,anno4[i+1]-ai);
+ j += 1;
+ }
+ record->data = Realloc(record->data,anno4[j],NULL);   // Shrink data to the bytes still referenced
+ }
+ else // size == 8
+ { int64 ai;
+ anno8 = (int64 *) (record->anno);
+ j = anno8[0] = 0;
+ for (i = 0; i < db->nreads; i++)
+ if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff)
+ { ai = anno8[i];   // Saved first: anno8[j+1] below may overwrite anno8[i] when j+1 == i
+ anno8[j+1] = anno8[j] + (anno8[i+1]-ai);
+ memmove(data+anno8[j],data+ai,anno8[i+1]-ai);
+ j += 1;
+ }
+ record->data = Realloc(record->data,anno8[j],NULL);   // Shrink data to the bytes still referenced
+ }
+ record->anno = Realloc(record->anno,record->size*(j+1),NULL);   // NOTE(review): j is a byte offset in the data == NULL case but a read count otherwise -- confirm the intended size
+ }
+ totlen = maxlen = 0;
+ for (j = i = 0; i < nreads; i++)   // Compact the read records, recomputing totlen and maxlen over the survivors
+ { r = reads[i].rlen;
+ if ((reads[i].flags & DB_BEST) >= allflag && r >= cutoff)
+ { totlen += r;
+ if (r > maxlen)
+ maxlen = r;
+ reads[j++] = reads[i];
+ }
+ }
+ db->totlen = totlen;
+ db->maxlen = maxlen;
+ db->nreads = j;
+ db->trimmed = 1;
+ if (j < nreads)
+ { db->reads = Realloc(reads-1,sizeof(HITS_READ)*(j+2),NULL);   // reads-1 is passed because db->reads points one record into its allocation
+ db->reads += 1;
+ }
+// The DB has already been trimmed, but a track over the untrimmed DB needs to be loaded.
+// Trim the track by rereading the untrimmed DB index from the file system.
+//   db:     the (trimmed) database; its cutoff and all settings select the reads to keep
+//   track:  the freshly loaded track; its anno (and data) vectors are compacted in place
+//             and then shrunk with Realloc
+//   ispart: non-zero if the track was loaded from a block-specific file, in which case the
+//             untrimmed read count stashed in front of db->reads is used rather than db->ureads
+// Returns 0 on success (including when nothing needs trimming).  On failure a message is
+// placed at EPLACE and the routine leaves via EXIT(1).
+static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart)
+{ int i, j, r;
+  int allflag, cutoff;
+  int ureads;
+  char *root;
+  HITS_READ read;
+  FILE *indx;
+  if (!db->trimmed) return (0);
+  if (db->cutoff <= 0 && db->all) return (0);
+  cutoff = db->cutoff;
+  if (db->all)                    // A read is retained iff (flags & DB_BEST) >= allflag and rlen >= cutoff
+    allflag = 0;
+  else
+    allflag = DB_BEST;
+  root = rindex(db->path,'/') + 2;
+  indx = Fopen(Catenate(db->path,"","",".idx"),"r");
+  if (indx == NULL)               // Do not seek or read on a stream that failed to open
+    EXIT(1);
+  fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*db->ufirst,SEEK_SET);
+  if (ispart)
+    ureads = ((int *) (db->reads))[-1];
+  else
+    ureads = db->ureads;
+  if (strcmp(track->name,". at qvs") == 0)
+    { EPRINTF(EPLACE,"%s: Cannot load QV track after trimming\n",Prog_Name);
+      fclose(indx);
+      EXIT(1);
+    }
+  { int *anno4, size;
+    int64 *anno8;
+    char *anno, *data;
+    size = track->size;
+    data = (char *) track->data;
+    if (data == NULL)             // No data block: anno holds one fixed-size record per read
+      { anno = (char *) track->anno;
+        j = 0;
+        // r is the byte offset of read i's record and must advance by exactly one record
+        //   per read, as in Trim_DB (it used to be advanced a second time inside the loop)
+        for (i = r = 0; i < ureads; i++, r += size)
+          { if (fread(&read,sizeof(HITS_READ),1,indx) != 1)
+              { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
+                fclose(indx);
+                EXIT(1);
+              }
+            if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff)
+              { memmove(anno+j,anno+r,size);
+                j += size;
+              }
+          }
+        memmove(anno+j,anno+r,size);   // Carry along the record that follows the last read
+      }
+    else if (size == 4)           // anno is an array of 4-byte offsets into data
+      { int ai;
+        anno4 = (int *) (track->anno);
+        j = anno4[0] = 0;
+        for (i = 0; i < ureads; i++)
+          { if (fread(&read,sizeof(HITS_READ),1,indx) != 1)
+              { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
+                fclose(indx);
+                EXIT(1);
+              }
+            if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff)
+              { ai = anno4[i];
+                anno4[j+1] = anno4[j] + (anno4[i+1]-ai);
+                memmove(data+anno4[j],data+ai,anno4[i+1]-ai);
+                j += 1;
+              }
+          }
+        track->data = Realloc(track->data,anno4[j],NULL);
+      }
+    else // size == 8
+      { int64 ai;
+        anno8 = (int64 *) (track->anno);
+        j = anno8[0] = 0;
+        for (i = 0; i < ureads; i++)
+          { if (fread(&read,sizeof(HITS_READ),1,indx) != 1)
+              { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
+                fclose(indx);
+                EXIT(1);
+              }
+            if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff)
+              { ai = anno8[i];
+                anno8[j+1] = anno8[j] + (anno8[i+1]-ai);
+                memmove(data+anno8[j],data+ai,anno8[i+1]-ai);
+                j += 1;
+              }
+          }
+        track->data = Realloc(track->data,anno8[j],NULL);
+      }
+    track->anno = Realloc(track->anno,track->size*(j+1),NULL);
+  }
+  fclose(indx);
+  return (0);
+}
+// Shut down an open 'db' by freeing all associated space, including tracks and QV structures,
+// and any open file pointers. The record pointed at by db however remains (the user
+// supplied it and so should free it).
+// Track names are released along with each track's anno and data vectors, matching what
+// Close_Track does for a single track.
+void Close_DB(HITS_DB *db)
+{ HITS_TRACK *t, *p;
+  if (db->loaded)
+    free(((char *) (db->bases)) - 1);   // The in-memory sequence block starts one byte before db->bases
+  else if (db->bases != NULL)
+    fclose((FILE *) db->bases);         // Otherwise bases, if set, is the open .bps stream
+  if (db->reads != NULL)
+    free(db->reads-1);                  // db->reads points one record into its allocation
+  free(db->path);
+  Close_QVs(db);                        // Removes the QV pseudo-track, if any, from the list
+  for (t = db->tracks; t != NULL; t = p)
+    { p = t->next;
+      free(t->name);
+      free(t->anno);
+      free(t->data);
+      free(t);
+    }
+}
+// Return the size in bytes of the memory occupied by a given DB: the HITS_DB record, the
+// read index (with its 2 spare records), the path string, the sequence block, the QV
+// pseudo-track if it heads the track list, and every other track's record, name, anno
+// vector and data block.
+int64 sizeof_DB(HITS_DB *db)
+{ int64 s;
+  HITS_TRACK *t;                  // Cursor over the track list
+  s = sizeof(HITS_DB)
+    + sizeof(HITS_READ)*(db->nreads+2)
+    + strlen(db->path)+1
+    + (db->totlen+db->nreads+4);
+  t = db->tracks;
+  if (t != NULL && strcmp(t->name,". at qvs") == 0)
+    { HITS_QV *q = (HITS_QV *) t;
+      s += sizeof(HITS_QV)
+         + sizeof(uint16) * db->nreads
+         + q->ncodes * sizeof(QVcoding)
+         + 6;
+      t = t->next;
+    }
+  for (; t != NULL; t = t->next)
+    { s += sizeof(HITS_TRACK)
+         + strlen(t->name)+1
+         + t->size * (db->nreads+1);
+      if (t->data != NULL)        // The last anno entry is the total length of the data block
+        { if (t->size == 8)
+            s += sizeof(int)*((int64 *) t->anno)[db->nreads];
+          else // t->size == 4
+            s += sizeof(int)*((int *) t->anno)[db->nreads];
+        }
+    }
+  return (s);
+}
+ *
+ *
+ ********************************************************************************************/
+HITS_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry" (reset to NULL by Close_QVs)
+HITS_QV *Active_QV; // QV pseudo-track of Active_DB, set by Load_QVentry; becomes invalid after closing
+// Load the QV coding schemes for db (the whole DB, or just the block that is open) and put
+// a QV pseudo-track named ". at qvs" at the front of db's track list.  The pseudo-track holds
+// the open .qvs stream, one coding scheme per source file spanning the reads, and a table
+// giving the index of the coding scheme used by each read.
+// Returns 0 if the QVs are (or already were) loaded, and -1 if the .qvs file cannot be
+// opened.  Any other failure is reported to EPLACE and the routine leaves via EXIT(1) after
+// releasing everything it acquired.  "ncodes" always counts the coding schemes actually
+// read so far, so the cleanup never touches an uninitialized entry.
+int Load_QVs(HITS_DB *db)
+{ FILE *quiva, *istub, *indx;
+  char *root;
+  uint16 *table;
+  HITS_QV *qvtrk;
+  QVcoding *coding, *nx;
+  int ncodes = 0;
+  if (db->tracks != NULL && strcmp(db->tracks->name,". at qvs") == 0)
+    return (0);
+  if (db->trimmed)
+    { EPRINTF(EPLACE,"%s: Cannot load QVs after trimming the DB\n",Prog_Name);
+      EXIT(1);
+    }
+  if (db->reads[db->nreads-1].coff < 0)
+    { if (db->part > 0)
+        { EPRINTF(EPLACE,"%s: All QVs for this block have not been added to the DB!\n",Prog_Name);
+          EXIT(1);
+        }
+      else
+        { EPRINTF(EPLACE,"%s: All QVs for this DB have not been added!\n",Prog_Name);
+          EXIT(1);
+        }
+    }
+  // Open .qvs, .idx, and .db files
+  quiva = Fopen(Catenate(db->path,"","",".qvs"),"r");
+  if (quiva == NULL)
+    return (-1);
+  istub = NULL;
+  indx = NULL;
+  table = NULL;
+  coding = NULL;
+  qvtrk = NULL;
+  root = rindex(db->path,'/');
+  if (root[1] == '.')             // Hidden-file naming: the stub is <dir>/<root>.db, without the '.'
+    { *root = '\0';
+      istub = Fopen(Catenate(db->path,"/",root+2,".db"),"r");
+      *root = '/';
+    }
+  else
+    istub = Fopen(Catenate(db->path,"","",".db"),"r");
+  if (istub == NULL)
+    goto error;
+  { int first, last, nfiles;
+    char prolog[MAX_NAME], fname[MAX_NAME];
+    int i, j;
+    if (fscanf(istub,DB_NFILE,&nfiles) != 1)
+      { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+        goto error;
+      }
+    if (db->part > 0)
+      { int pfirst, plast;
+        int fbeg, fend;
+        int n, k;
+        // NB: the function-level indx is used here (no local redeclaration), so that the
+        //   cleanup code can see and close the stream if anything below fails
+        // Determine first how many and which files span the block (fbeg to fend)
+        pfirst = db->ufirst;
+        plast = pfirst + db->nreads;
+        first = 0;
+        for (fbeg = 0; fbeg < nfiles; fbeg++)
+          { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
+              { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+                goto error;
+              }
+            if (last > pfirst)
+              break;
+            first = last;
+          }
+        for (fend = fbeg+1; fend <= nfiles; fend++)
+          { if (last >= plast)
+              break;
+            if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
+              { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+                goto error;
+              }
+            first = last;
+          }
+        indx = Fopen(Catenate(db->path,"","",".idx"),"r");
+        ncodes = fend-fbeg;
+        coding = (QVcoding *) Malloc(sizeof(QVcoding)*ncodes,"Allocating coding schemes");
+        table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices");
+        if (indx == NULL || coding == NULL || table == NULL)
+          { ncodes = 0;
+            goto error;
+          }
+        // Carefully get the first coding scheme (its offset is most likely in a HITS_RECORD
+        // in .idx that is *not* in memory). Get all the other coding schemes normally and
+        // assign the tables # for each read in the block in "tables".
+        rewind(istub);
+        (void) fscanf(istub,DB_NFILE,&nfiles);
+        first = 0;
+        for (n = 0; n < fbeg; n++)
+          { (void) fscanf(istub,DB_FDATA,&last,fname,prolog);
+            first = last;
+          }
+        for (n = fbeg; n < fend; n++)
+          { (void) fscanf(istub,DB_FDATA,&last,fname,prolog);
+            i = n-fbeg;
+            if (first < pfirst)
+              { HITS_READ read;
+                fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*first,SEEK_SET);
+                if (fread(&read,sizeof(HITS_READ),1,indx) != 1)
+                  { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
+                    ncodes = i;
+                    goto error;
+                  }
+                fseeko(quiva,read.coff,SEEK_SET);
+                nx = Read_QVcoding(quiva);
+                if (nx == NULL)
+                  { ncodes = i;
+                    goto error;
+                  }
+                coding[i] = *nx;
+              }
+            else
+              { fseeko(quiva,db->reads[first-pfirst].coff,SEEK_SET);
+                nx = Read_QVcoding(quiva);
+                if (nx == NULL)
+                  { ncodes = i;
+                    goto error;
+                  }
+                coding[i] = *nx;
+                db->reads[first-pfirst].coff = ftello(quiva);
+              }
+            j = first-pfirst;     // Clip this file's read range to the block
+            if (j < 0)
+              j = 0;
+            k = last-pfirst;
+            if (k > db->nreads)
+              k = db->nreads;
+            while (j < k)
+              table[j++] = (uint16) i;
+            first = last;
+          }
+        fclose(indx);
+        indx = NULL;
+      }
+    else
+      { // Load in coding scheme for each file, adjust .coff of first read in the file, and
+        // record which table each read uses
+        ncodes = nfiles;
+        coding = (QVcoding *) Malloc(sizeof(QVcoding)*nfiles,"Allocating coding schemes");
+        table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices");
+        if (coding == NULL || table == NULL)
+          { ncodes = 0;           // No coding scheme has been read yet
+            goto error;
+          }
+        first = 0;
+        for (i = 0; i < nfiles; i++)
+          { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
+              { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+                ncodes = i;       // Only the first i coding schemes have been read
+                goto error;
+              }
+            fseeko(quiva,db->reads[first].coff,SEEK_SET);
+            nx = Read_QVcoding(quiva);
+            if (nx == NULL)
+              { ncodes = i;
+                goto error;
+              }
+            coding[i] = *nx;
+            db->reads[first].coff = ftello(quiva);
+            for (j = first; j < last; j++)
+              table[j] = (uint16) i;
+            first = last;
+          }
+      }
+    // Allocate and fill in the HITS_QV record and add it to the front of the
+    // track list
+    qvtrk = (HITS_QV *) Malloc(sizeof(HITS_QV),"Allocating QV pseudo-track");
+    if (qvtrk == NULL)
+      goto error;
+    qvtrk->name = Strdup(". at qvs","Allocating QV pseudo-track name");
+    if (qvtrk->name == NULL)
+      goto error;
+    qvtrk->next = db->tracks;
+    db->tracks = (HITS_TRACK *) qvtrk;
+    qvtrk->ncodes = ncodes;
+    qvtrk->table = table;
+    qvtrk->coding = coding;
+    qvtrk->quiva = quiva;
+  }
+  fclose(istub);
+  return (0);
+error:
+  if (qvtrk != NULL)
+    free(qvtrk);
+  if (table != NULL)
+    free(table);
+  if (coding != NULL)
+    { int i;
+      for (i = 0; i < ncodes; i++)
+        Free_QVcoding(coding+i);
+      free(coding);
+    }
+  if (indx != NULL)
+    fclose(indx);
+  if (istub != NULL)
+    fclose(istub);
+  fclose(quiva);
+  EXIT(1);
+}
+// Close the QV stream, free the QV pseudo track and all associated memory.  Does nothing
+// beyond clearing Active_DB if the first track of db is not the QV pseudo-track.
+void Close_QVs(HITS_DB *db)
+{ HITS_TRACK *track;
+  HITS_QV *qvtrk;
+  int i;
+  Active_DB = NULL;               // Force Load_QVentry to re-validate on its next call
+  track = db->tracks;
+  if (track != NULL && strcmp(track->name,". at qvs") == 0)
+    { qvtrk = (HITS_QV *) track;
+      for (i = 0; i < qvtrk->ncodes; i++)
+        Free_QVcoding(qvtrk->coding+i);
+      free(qvtrk->coding);
+      free(qvtrk->table);
+      fclose(qvtrk->quiva);
+      db->tracks = track->next;   // Unlink before releasing the record
+      free(track->name);          // The name was Strdup'd when the pseudo-track was created
+      free(track);
+    }
+  return;
+}
+ *
+ *
+ ********************************************************************************************/
+// Return status of track:
+// 1: Track is for trimmed DB
+// 0: Track is for untrimmed DB
+// -1: Track is not the right size of DB either trimmed or untrimmed
+// -2: Could not find the track
+int Check_Track(HITS_DB *db, char *track, int *kind)   // *kind is set to MASK_TRACK or CUSTOM_TRACK from the size field of the .anno header
+{ FILE *afile;
+ int tracklen, size, ispart;
+ int ureads, treads;
+ afile = NULL;
+ if (db->part > 0)   // For a block, first look for a block-specific .anno file
+ { afile = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r");
+ ispart = 1;
+ }
+ if (afile == NULL)   // Otherwise (or failing that) use the .anno file for the whole DB
+ { afile = fopen(Catenate(db->path,".",track,".anno"),"r");
+ ispart = 0;
+ }
+ if (afile == NULL)
+ return (-2);
+ if (fread(&tracklen,sizeof(int),1,afile) != 1)   // Header word 1: number of reads the track covers
+ { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track);   // NB: reports to stderr and calls exit directly rather than using EPRINTF/EXIT
+ exit (1);
+ }
+ if (fread(&size,sizeof(int),1,afile) != 1)   // Header word 2: size field
+ { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track);
+ exit (1);
+ }
+ if (size == 0)
+ *kind = MASK_TRACK;
+ else if (size > 0)
+ *kind = CUSTOM_TRACK;
+ else
+ { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track);
+ exit (1);
+ }
+ fclose(afile);
+ if (ispart)   // Block file: compare against the block's read counts stashed in front of db->reads
+ { ureads = ((int *) (db->reads))[-1];
+ treads = ((int *) (db->reads))[-2];
+ }
+ else
+ { ureads = db->ureads;
+ treads = db->treads;
+ }
+ if (tracklen == ureads)   // The untrimmed count is tested first, so it wins if both counts are equal
+ return (0);
+ else if (tracklen == treads)
+ return (1);
+ else
+ return (-1);
+// If track is not already in the db's track list, then allocate all the storage for it,
+// read it in from the appropriate file, add it to the track list, and return a pointer
+// to the newly created HITS_TRACK record. If the track does not exist or cannot be
+// opened for some reason, then NULL is returned.
+// For a block, a block-specific .anno/.data pair is preferred over the pair for the whole
+// DB.  A new track is linked in at the front of the list, or directly behind the QV
+// pseudo-track if that is at the front.  On any failure after the .anno file is open, a
+// message is placed at EPLACE and everything acquired here (including the track name) is
+// released exactly once before leaving via EXIT(NULL).
+HITS_TRACK *Load_Track(HITS_DB *db, char *track)
+{ FILE *afile, *dfile;
+  int tracklen, size;
+  int nreads, ispart;
+  int treads, ureads;
+  void *anno;
+  void *data;
+  char *name;
+  HITS_TRACK *record;
+  if (track[0] == '.')
+    { EPRINTF(EPLACE,"%s: Track name, '%s', cannot begin with a .\n",Prog_Name,track);
+      EXIT(NULL);                 // Reject the name rather than carry on after reporting it
+    }
+  for (record = db->tracks; record != NULL; record = record->next)
+    if (strcmp(record->name,track) == 0)
+      return (record);
+  afile = NULL;
+  if (db->part)
+    { afile = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r");
+      ispart = 1;
+    }
+  if (afile == NULL)
+    { afile = fopen(Catenate(db->path,".",track,".anno"),"r");
+      ispart = 0;
+    }
+  if (afile == NULL)
+    { EPRINTF(EPLACE,"%s: Track '%s' does not exist\n",Prog_Name,track);
+      return (NULL);
+    }
+  dfile = NULL;
+  anno = NULL;
+  data = NULL;
+  record = NULL;
+  if (ispart)
+    name = Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".data");
+  else
+    name = Catenate(db->path,".",track,".data");
+  if (name == NULL)
+    goto error;
+  dfile = fopen(name,"r");        // May legitimately be NULL: a track need not have a .data file
+  if (fread(&tracklen,sizeof(int),1,afile) != 1)
+    { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
+      goto error;
+    }
+  if (fread(&size,sizeof(int),1,afile) != 1)
+    { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
+      goto error;
+    }
+  if (size < 0)
+    { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
+      goto error;
+    }
+  if (size == 0)                  // A size field of 0 stands for 8-byte anno entries
+    size = 8;
+  if (ispart)
+    { ureads = ((int *) (db->reads))[-1];
+      treads = ((int *) (db->reads))[-2];
+    }
+  else
+    { ureads = db->ureads;
+      treads = db->treads;
+    }
+  if (db->trimmed)
+    { if (tracklen != treads && tracklen != ureads)
+        { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track);
+          goto error;
+        }
+      if ( ! ispart && db->part > 0)   // Whole-DB file but only a block is open: skip to the block
+        { if (tracklen == treads)
+            fseeko(afile,size*db->tfirst,SEEK_CUR);
+          else
+            fseeko(afile,size*db->ufirst,SEEK_CUR);
+        }
+    }
+  else
+    { if (tracklen != ureads)
+        { if (tracklen == treads)
+            EPRINTF(EPLACE,"%s: Track '%s' is for a trimmed DB !\n",Prog_Name,track);
+          else
+            EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track);
+          goto error;
+        }
+      if ( ! ispart && db->part > 0)
+        fseeko(afile,size*db->ufirst,SEEK_CUR);
+    }
+  nreads = tracklen;
+  anno = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector");
+  if (anno == NULL)
+    goto error;
+  if (dfile != NULL)
+    { int64 *anno8, off8, dlen;
+      int *anno4, off4;
+      int i;
+      if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1))
+        { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
+          goto error;
+        }
+      if (size == 4)
+        { anno4 = (int *) anno;
+          off4 = anno4[0];
+          if (off4 != 0)          // Rebase the offsets so that the loaded data begins at 0
+            { for (i = 0; i <= nreads; i++)
+                anno4[i] -= off4;
+              fseeko(dfile,off4,SEEK_SET);
+            }
+          dlen = anno4[nreads];
+          data = (void *) Malloc(dlen,"Allocating Track Data Vector");
+        }
+      else
+        { anno8 = (int64 *) anno;
+          off8 = anno8[0];
+          if (off8 != 0)          // Rebase the offsets so that the loaded data begins at 0
+            { for (i = 0; i <= nreads; i++)
+                anno8[i] -= off8;
+              fseeko(dfile,off8,SEEK_SET);
+            }
+          dlen = anno8[nreads];
+          data = (void *) Malloc(dlen,"Allocating Track Data Vector");
+        }
+      if (data == NULL)
+        goto error;
+      if (dlen > 0)
+        { if (fread(data,dlen,1,dfile) != 1)
+            { EPRINTF(EPLACE,"%s: Track '%s' data file is junk\n",Prog_Name,track);
+              goto error;
+            }
+        }
+      fclose(dfile);
+      dfile = NULL;
+    }
+  else
+    { if (fread(anno,size,nreads,afile) != (size_t) nreads)
+        { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
+          goto error;
+        }
+      data = NULL;
+    }
+  fclose(afile);
+  afile = NULL;                   // Closed: the cleanup code must not close it a second time
+  record = (HITS_TRACK *) Malloc(sizeof(HITS_TRACK),"Allocating Track Record");
+  if (record == NULL)
+    goto error;
+  record->name = Strdup(track,"Allocating Track Name");
+  if (record->name == NULL)
+    goto error;
+  record->data = data;
+  record->anno = anno;
+  record->size = size;
+  if (db->trimmed && tracklen != treads)
+    { if (Late_Track_Trim(db,record,ispart))
+        goto error;
+    }
+  if (db->tracks != NULL && strcmp(db->tracks->name,". at qvs") == 0)
+    { record->next = db->tracks->next;
+      db->tracks->next = record;
+    }
+  else
+    { record->next = db->tracks;
+      db->tracks = record;
+    }
+  return (record);
+error:
+  if (record != NULL)
+    { free(record->name);         // NULL here if the Strdup itself was what failed
+      free(record);
+    }
+  if (data != NULL)
+    free(data);
+  if (anno != NULL)
+    free(anno);
+  if (dfile != NULL)
+    fclose(dfile);
+  if (afile != NULL)
+    fclose(afile);
+  EXIT(NULL);
+}
+void Close_Track(HITS_DB *db, char *track)   // Unlink the first track named 'track' from db's list and free its anno, data, name and record; no effect if absent
+{ HITS_TRACK *record, *prev;
+ prev = NULL;
+ for (record = db->tracks; record != NULL; record = record->next)
+ { if (strcmp(record->name,track) == 0)
+ { free(record->anno);
+ free(record->data);
+ free(record->name);
+ if (prev == NULL)   // Match is the head of the list
+ db->tracks = record->next;
+ else
+ prev->next = record->next;
+ free(record);
+ return;
+ }
+ prev = record;
+ }
+ return;
+ *
+ *
+ ********************************************************************************************/
+// Allocate and return a buffer big enough for the largest read in 'db', leaving room
+// for an initial delimiter character.  The pointer returned is one byte into the
+// allocation of db->maxlen+4 bytes, so the block itself must be released as (buffer-1).
+// If the allocation fails the routine leaves via EXIT(NULL).
+char *New_Read_Buffer(HITS_DB *db)
+{ char *read;
+  read = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer");
+  if (read == NULL)
+    EXIT(NULL);
+  return (read+1);
+}
+// Load into 'read' the i'th read in 'db'. As an upper case ASCII string if ascii is 2, as a
+// lower-case ASCII string if ascii is 1, and as a numeric string over 0(A), 1(C), 2(G), and
+// 3(T) otherwise.
+// **NB**, the byte before read will be set to a delimiter character!
+int Load_Read(HITS_DB *db, int i, char *read, int ascii)
+{ FILE *bases = (FILE *) db->bases;
+ int64 off;
+ int len, clen;
+ HITS_READ *r = db->reads;
+ if (i >= db->nreads)   // Only the upper bound is checked here
+ { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name);
+ EXIT(1);
+ }
+ if (bases == NULL)   // First use: open the .bps file and remember the stream in db->bases
+ { bases = Fopen(Catenate(db->path,"","",".bps"),"r");
+ if (bases == NULL)
+ EXIT(1);
+ db->bases = (void *) bases;
+ }
+ off = r[i].boff;
+ len = r[i].rlen;
+ if (ftello(bases) != off)   // Seek only when the stream is not already positioned at the read
+ fseeko(bases,off,SEEK_SET);
+ clen = COMPRESSED_LEN(len);
+ if (clen > 0)
+ { if (fread(read,clen,1,bases) != 1)
+ { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name);
+ EXIT(1);
+ }
+ }
+ Uncompress_Read(len,read);   // Expands in place to one base per byte, terminated by 4
+ if (ascii == 1)
+ { Lower_Read(read);
+ read[-1] = '\0';   // Delimiter in front matches the terminator of the chosen representation
+ }
+ else if (ascii == 2)
+ { Upper_Read(read);
+ read[-1] = '\0';
+ }
+ else
+ read[-1] = 4;
+ return (0);
+// Load into 'read' the bases [beg,end) of the i'th read in 'db', in the representation
+// selected by ascii exactly as for Load_Read.  Only the packed bytes covering the interval
+// are fetched, so the requested bases start beg%4 positions into the buffer: the pointer
+// returned is where they begin, and it is not, in general, equal to 'read'.  The byte just
+// before the returned position is set to a delimiter character.  On any failure a message
+// is placed at EPLACE and the routine leaves via EXIT(NULL), as Load_Read does with EXIT(1).
+char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii)
+{ FILE *bases = (FILE *) db->bases;
+  int64 off;
+  int len, clen;
+  int bbeg, bend;
+  HITS_READ *r = db->reads;
+  if (i >= db->nreads)
+    { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name);
+      EXIT(NULL);
+    }
+  if (bases == NULL)              // First use: open the .bps file and remember the stream
+    { bases = Fopen(Catenate(db->path,"","",".bps"),"r");
+      if (bases == NULL)
+        EXIT(NULL);
+      db->bases = (void *) bases;
+    }
+  bbeg = beg/4;                   // Packed bytes [bbeg,bend) cover bases [beg,end)
+  bend = (end-1)/4+1;
+  off = r[i].boff + bbeg;
+  len = end - beg;
+  if (ftello(bases) != off)
+    fseeko(bases,off,SEEK_SET);
+  clen = bend-bbeg;
+  if (clen > 0)
+    { if (fread(read,clen,1,bases) != 1)
+        { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name);
+          EXIT(NULL);
+        }
+    }
+  Uncompress_Read(4*clen,read);
+  read += beg%4;                  // Skip the bases of the first packed byte that precede beg
+  read[len] = 4;                  // Cut off the bases of the last packed byte that follow end
+  if (ascii == 1)
+    { Lower_Read(read);
+      read[-1] = '\0';
+    }
+  else if (ascii == 2)
+    { Upper_Read(read);
+      read[-1] = '\0';
+    }
+  else
+    read[-1] = 4;
+  return (read);
+}
+ *
+ *
+ ********************************************************************************************/
+// Allocate and return a buffer of 5 vectors big enough for the largest read in 'db'.
+// The 5 vectors are consecutive slices, db->maxlen bytes each, of a single block that
+// entry[0] points at.  If either allocation fails, whatever was obtained is released and
+// the routine leaves via EXIT(NULL).
+char **New_QV_Buffer(HITS_DB *db)
+{ char **entry;
+  char *qvs;
+  int i;
+  qvs = (char *) Malloc(db->maxlen*5,"Allocating New QV Buffer");
+  entry = (char **) Malloc(sizeof(char *)*5,"Allocating New QV Buffer");
+  if (qvs == NULL || entry == NULL)
+    { free(qvs);                  // free(NULL) is a no-op, so no need to test which one failed
+      free(entry);
+      EXIT(NULL);
+    }
+  for (i = 0; i < 5; i++)
+    entry[i] = qvs + i*db->maxlen;
+  return (entry);
+}
+// Load into entry the QV streams for the i'th read from db. The parameter ascii applies to
+// the DELTAG stream (entry[1]) as described for Load_Read.
+int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii)
+{ HITS_READ *reads;
+ FILE *quiva;
+ int rlen;
+ if (db != Active_DB)   // Different db than on the previous call: re-validate and cache its QV pseudo-track
+ { if (db->tracks == NULL || strcmp(db->tracks->name,". at qvs") != 0)
+ { EPRINTF(EPLACE,"%s: QV's are not loaded (Load_QVentry)\n",Prog_Name);
+ EXIT(1);
+ }
+ Active_QV = (HITS_QV *) db->tracks;
+ Active_DB = db;
+ }
+ if (i >= db->nreads)   // Only the upper bound is checked here
+ { EPRINTF(EPLACE,"%s: Index out of bounds (Load_QVentry)\n",Prog_Name);
+ EXIT(1);
+ }
+ reads = db->reads;
+ quiva = Active_QV->quiva;
+ rlen = reads[i].rlen;
+ fseeko(quiva,reads[i].coff,SEEK_SET);
+ if (Uncompress_Next_QVentry(quiva,entry,Active_QV->coding+Active_QV->table[i],rlen))   // table[i] selects the coding scheme for read i
+ EXIT(1);
+ if (ascii != 1)   // ascii == 1 keeps the stream as decoded (presumably lower-case letters -- confirm against QV.c)
+ { char *deltag = entry[1];
+ if (ascii != 2)   // Numeric form wanted
+ { char x = deltag[rlen];   // Number_Read needs a '\0' terminator: plant one temporarily at deltag[rlen]
+ deltag[rlen] = '\0';
+ Number_Read(deltag);
+ deltag[rlen] = x;   // Put the saved byte back, overwriting the 4 that Number_Read wrote there
+ }
+ else
+ { int j;
+ int u = 'A'-'a';   // Offset that takes a lower-case letter to its upper-case form
+ for (j = 0; j < rlen; j++)
+ deltag[j] = (char) (deltag[j]+u);
+ }
+ }
+ return (0);
+ *
+ *
+ ********************************************************************************************/
+// Allocate a block big enough for all the uncompressed sequences, read them into it,
+// reset the 'off' in each read record to be its in-memory offset, and set the
+// bases pointer to point at the block after closing the bases file. If ascii is
+// 1 the reads are converted to lower-case ascii, if it is any other non-zero value to
+// upper-case ascii, otherwise the reads are left
+// as numeric strings over 0(A), 1(C), 2(G), and 3(T).
+int Read_All_Sequences(HITS_DB *db, int ascii)
+{ FILE *bases;
+ int nreads = db->nreads;
+ HITS_READ *reads = db->reads;
+ void (*translate)(char *s);
+ char *seq;
+ int64 o, off;
+ int i, len, clen;
+ bases = Fopen(Catenate(db->path,"","",".bps"),"r");
+ if (bases == NULL)
+ EXIT(1);
+ seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Sequence Reads");   // One terminator per read plus 4 spare bytes
+ if (seq == NULL)
+ { fclose(bases);
+ EXIT(1);
+ }
+ *seq++ = 4;   // Delimiter in front of the first read; seq now points one byte into the block
+ if (ascii == 1)
+ translate = Lower_Read;
+ else
+ translate = Upper_Read;   // Only called below when ascii is non-zero
+ o = 0;
+ for (i = 0; i < nreads; i++)
+ { len = reads[i].rlen;
+ off = reads[i].boff;
+ if (ftello(bases) != off)   // Seek only when the stream is not already positioned at the read
+ fseeko(bases,off,SEEK_SET);
+ clen = COMPRESSED_LEN(len);
+ if (clen > 0)
+ { if (fread(seq+o,clen,1,bases) != 1)
+ { EPRINTF(EPLACE,"%s: Read of .bps file failed (Read_All_Sequences)\n",Prog_Name);
+ free(seq);   // NOTE(review): seq was advanced by one above, so this is not the pointer Malloc returned -- confirm
+ fclose(bases);
+ EXIT(1);
+ }
+ }
+ Uncompress_Read(len,seq+o);
+ if (ascii)
+ translate(seq+o);
+ reads[i].boff = o;   // boff now is the read's offset within the in-memory block
+ o += (len+1);   // +1 for the terminator that follows each read
+ }
+ reads[nreads].boff = o;   // Also set in the record just past the last read
+ fclose(bases);
+ db->bases = (void *) seq;
+ db->loaded = 1;
+ return (0);
+// Call 'actor' once for every file that makes up the database or dam named by 'path':
+// first for the stub file, with extension "db" or "dam", and then for each auxiliary file
+// found in the same directory whose name is the root followed by a '.', with everything
+// after that '.' as the extension.  The stub is matched exactly if possible and otherwise
+// case-insensitively, in which case the root takes on the case found in the directory.
+// Returns 0 on success, and -1 if the directory cannot be opened or holds no stub for the
+// root (a message is placed at EPLACE).  Leaves via EXIT(1) if the root or the directory
+// part of the path could not be obtained.
+int List_DB_Files(char *path, void actor(char *path, char *extension))
+{ int status, plen, rlen, dlen;
+  char *root, *pwd, *name;
+  int isdam;
+  DIR *dirp;
+  struct dirent *dp;
+  status = 0;
+  pwd = PathTo(path);
+  plen = strlen(path);
+  if (plen >= 4 && strcmp(path+(plen-4),".dam") == 0)   // Do not look in front of a short path
+    root = Root(path,".dam");
+  else
+    root = Root(path,".db");
+  if (root == NULL || pwd == NULL)
+    { free(pwd);
+      free(root);
+      EXIT(1);
+    }
+  rlen = strlen(root);            // Only now is root known not to be NULL
+  if ((dirp = opendir(pwd)) == NULL)
+    { EPRINTF(EPLACE,"%s: Cannot open directory %s (List_DB_Files)\n",Prog_Name,pwd);
+      status = -1;
+      goto error;
+    }
+  isdam = 0;
+  while ((dp = readdir(dirp)) != NULL) // Get case dependent root name (if necessary)
+    { name = dp->d_name;
+      if (strcmp(name,Catenate("","",root,".db")) == 0)
+        break;
+      if (strcmp(name,Catenate("","",root,".dam")) == 0)
+        { isdam = 1;
+          break;
+        }
+      if (strcasecmp(name,Catenate("","",root,".db")) == 0)
+        { strncpy(root,name,rlen);     // Exactly rlen characters: root's terminator stays in place
+          break;
+        }
+      if (strcasecmp(name,Catenate("","",root,".dam")) == 0)
+        { strncpy(root,name,rlen);
+          isdam = 1;
+          break;
+        }
+    }
+  if (dp == NULL)
+    { EPRINTF(EPLACE,"%s: Cannot find %s (List_DB_Files)\n",Prog_Name,pwd);
+      status = -1;
+      closedir(dirp);
+      goto error;
+    }
+  if (isdam)
+    actor(Catenate(pwd,"/",root,".dam"),"dam");
+  else
+    actor(Catenate(pwd,"/",root,".db"),"db");
+  rewinddir(dirp); // Report each auxiliary file
+  while ((dp = readdir(dirp)) != NULL)
+    { name = dp->d_name;
+      dlen = strlen(name);
+#ifdef HIDE_FILES
+      if (name[0] != '.')         // Auxiliary files are hidden: step over the leading '.'
+        continue;
+      dlen -= 1;
+      name += 1;
+#endif
+      if (dlen < rlen+1)
+        continue;
+      if (name[rlen] != '.')
+        continue;
+      if (strncmp(name,root,rlen) != 0)
+        continue;
+      actor(Catenate(pwd,PATHSEP,name,""),name+(rlen+1));
+    }
+  closedir(dirp);
+error:
+  free(pwd);
+  free(root);
+  return (status);
+}
+void Print_Read(char *s, int width)   // Print read s to stdout, starting a new line after every 'width' symbols and ending with a newline
+{ int i;
+ if (s[0] < 4)   // First byte below 4: treated as the numeric representation, ended by 4
+ { for (i = 0; s[i] != 4; i++)
+ { if (i%width == 0 && i != 0)
+ printf("\n");
+ printf("%d",s[i]);
+ }
+ printf("\n");
+ }
+ else   // Otherwise treated as an ascii string ended by '\0'; NOTE(review): an empty numeric read (s[0] == 4) also lands here -- confirm
+ { for (i = 0; s[i] != '\0'; i++)
+ { if (i%width == 0 && i != 0)
+ printf("\n");
+ printf("%c",s[i]);
+ }
+ printf("\n");
+ }
diff --git a/DB.h b/DB.h
new file mode 100644
index 0000000..a7b8636
--- /dev/null
+++ b/DB.h
@@ -0,0 +1,417 @@
+ *
+ * Compressed data base module. Auxiliary routines to open and manipulate a data base for
+ * which the sequence and read information are separated into two separate files, and the
+ * sequence is compressed into 2-bits for each base. Support for tracks of additional
+ * information, and trimming according to the current partition. Eventually will also
+ * support compressed quality information.
+ *
+ * Author : Gene Myers
+ * Date : July 2013
+ * Revised: April 2014
+ *
+ ********************************************************************************************/
+#ifndef _HITS_DB
+#define _HITS_DB
+#include <stdio.h>
+#include "QV.h"
+#define HIDE_FILES // Auxiliary DB files start with a . so they are "hidden"
+ // Undefine if you don't want this
+// For interactive applications where it is inappropriate to simply exit with an error
+// message to standard error, define the constant INTERACTIVE. If set, then error
+// messages are put in the global variable Ebuffer and the caller of a DB routine
+// can decide how to deal with the error.
+// DB, QV, or alignment routines that can encounter errors function as before in
+// non-INTERACTIVE mode by exiting after printing an error message to stderr. In
+// INTERACTIVE mode the routines place a message at EPLACE and return an error
+// value. For such routines that were previously void, they are now int, and
+// return 1 if an error occurred, 0 otherwise.
+#ifdef INTERACTIVE
+
+// INTERACTIVE: error text is formatted into Ebuffer and the routine returns x
+
+#define EPRINTF sprintf
+#define EPLACE  Ebuffer
+#define EXIT(x) return (x)
+
+#else // BATCH
+
+// BATCH: error text is printed to stderr and the program exits
+
+#define EPRINTF fprintf
+#define EPLACE  stderr
+#define EXIT(x) exit (1)
+
+#endif
+typedef unsigned char uint8;
+typedef unsigned short uint16;
+typedef unsigned int uint32;
+typedef unsigned long long uint64;
+typedef signed char int8;
+typedef signed short int16;
+typedef signed int int32;
+typedef signed long long int64;
+typedef float float32;
+typedef double float64;
+ *
+ *
+ ********************************************************************************************/
+extern char *Prog_Name; // Name of program
+extern char Ebuffer[];
+#define SYSTEM_ERROR \
+ { EPRINTF(EPLACE,"%s: System error, read failed!\n",Prog_Name); \
+ exit (2); \
+ }
+#define ARG_INIT(name) \
+ Prog_Name = Strdup(name,""); \
+ for (i = 0; i < 128; i++) \
+ flags[i] = 0;
+#define ARG_FLAGS(set) \
+ for (k = 1; argv[i][k] != '\0'; k++) \
+ { if (index(set,argv[i][k]) == NULL) \
+ { fprintf(stderr,"%s: -%c is an illegal option\n",Prog_Name,argv[i][k]); \
+ exit (1); \
+ } \
+ flags[(int) argv[i][k]] = 1; \
+ }
+#define ARG_POSITIVE(var,name) \
+ var = strtol(argv[i]+2,&eptr,10); \
+ if (*eptr != '\0' || argv[i][2] == '\0') \
+ { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n", \
+ Prog_Name,argv[i][1],argv[i]+2); \
+ exit (1); \
+ } \
+ if (var <= 0) \
+ { fprintf(stderr,"%s: %s must be positive (%d)\n",Prog_Name,name,var); \
+ exit (1); \
+ }
+#define ARG_NON_NEGATIVE(var,name) \
+ var = strtol(argv[i]+2,&eptr,10); \
+ if (*eptr != '\0' || argv[i][2] == '\0') \
+ { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n", \
+ Prog_Name,argv[i][1],argv[i]+2); \
+ exit (1); \
+ } \
+ if (var < 0) \
+ { fprintf(stderr,"%s: %s must be non-negative (%d)\n",Prog_Name,name,var); \
+ exit (1); \
+ }
+#define ARG_REAL(var) \
+ var = strtod(argv[i]+2,&eptr); \
+ if (*eptr != '\0' || argv[i][2] == '\0') \
+ { fprintf(stderr,"%s: -%c '%s' argument is not a real number\n", \
+ Prog_Name,argv[i][1],argv[i]+2); \
+ exit (1); \
+ }
+ *
+ *
+ ********************************************************************************************/
+// The following general utilities return NULL if any of their input pointers are NULL, or if they
+// could not perform their function (in which case they also print an error to stderr).
+void *Malloc(int64 size, char *mesg); // Guarded versions of malloc, realloc
+void *Realloc(void *object, int64 size, char *mesg); // and strdup, that output "mesg" to
+char *Strdup(char *string, char *mesg); // stderr if out of memory
+FILE *Fopen(char *path, char *mode); // Open file path for "mode"
+char *PathTo(char *path); // Return path portion of file name "path"
+char *Root(char *path, char *suffix); // Return the root name, excluding suffix, of "path"
+// Catenate returns concatenation of path.sep.root.suffix in a *temporary* buffer
+// Numbered_Suffix returns concatenation of left.<num>.right in a *temporary* buffer
+char *Catenate(char *path, char *sep, char *root, char *suffix);
+char *Numbered_Suffix(char *left, int num, char *right);
+// DB-related utilities
+void Print_Number(int64 num, int width, FILE *out); // Print readable big integer
+int Number_Digits(int64 num); // Return # of digits in printed number
+#define COMPRESSED_LEN(len) (((len)+3) >> 2)
+void Compress_Read(int len, char *s); // Compress read in-place into 2-bit form
+void Uncompress_Read(int len, char *s); // Uncompress read in-place into numeric form
+void Print_Read(char *s, int width);
+void Lower_Read(char *s); // Convert read from numbers to lowercase letters (0-3 to acgt)
+void Upper_Read(char *s); // Convert read from numbers to uppercase letters (0-3 to ACGT)
+void Number_Read(char *s); // Convert read from letters to numbers
+ *
+ *
+ ********************************************************************************************/
+#define DB_QV 0x03ff // Mask for 3-digit quality value
+#define DB_CSS 0x0400 // This is the second or later of a group of reads from a given insert
+#define DB_BEST 0x0800 // This is the longest read of a given insert (may be the only 1)
+// Per-read index record (one per read, held in the 'reads' array of a HITS_DB).
+
+typedef struct
+  { int     origin; //  Well #
+    int     rlen;   //  Length of the sequence (Last pulse = fpulse + rlen)
+    int     fpulse; //  First pulse
+    int64   boff;   //  Offset (in bytes) of compressed read in 'bases' file, or offset of
+                    //    uncompressed bases in memory block
+    int64   coff;   //  Offset (in bytes) of compressed quiva streams in 'quiva' file
+    int     flags;  //  QV of read + flags above
+  } HITS_READ;
+// A track can be of 3 types:
+// data == NULL: there are nreads 'anno' records of size 'size'.
+// data != NULL && size == 4: anno is an array of nreads+1 int's and data[anno[i]..anno[i+1])
+// contains the variable length data
+// data != NULL && size == 8: anno is an array of nreads+1 int64's and data[anno[i]..anno[i+1])
+// contains the variable length data
+typedef struct _track
+  { struct _track *next;  //  Link to next track
+    char          *name;  //  Symbolic name of track
+    int            size;  //  Size in bytes of anno records
+    void          *anno;  //  over [0,nreads]: read i annotation: int, int64, or 'size' records
+    void          *data;  //     data[anno[i] .. anno[i+1]-1] is data if data != NULL
+  } HITS_TRACK;
+// The information for accessing QV streams is in a HITS_QV record that is a "pseudo-track"
+// named ".@qvs" and is always the first track record in the list (if present). Since normal
+// track names cannot begin with a . (this is enforced), this pseudo-track is never confused
+// with a normal track.
+typedef struct
+ { struct _track *next;
+ char *name;
+ int ncodes; // # of coding tables
+ QVcoding *coding; // array [0..ncodes-1] of coding schemes (see QV.h)
+ uint16 *table; // for i in [0,db->nreads-1]: read i should be decompressed with
+ // scheme coding[table[i]]
+ FILE *quiva; // the open file pointer to the .qvs file
+ } HITS_QV;
+// The DB record holds all information about the current state of an active DB including an
+// array of HITS_READS, one per read, and a linked list of HITS_TRACKs the first of which
+// is always a HITS_QV pseudo-track (if the QVs have been loaded).
+typedef struct
+ { int ureads; // Total number of reads in untrimmed DB
+ int treads; // Total number of reads in trimmed DB
+ int cutoff; // Minimum read length in block (-1 if not yet set)
+ int all; // Consider multiple reads from a given well
+ float freq[4]; // frequency of A, C, G, T, respectively
+ // Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed)
+ int maxlen; // length of maximum read (initially over all DB)
+ int64 totlen; // total # of bases (initially over all DB)
+ int nreads; // # of reads in actively loaded portion of DB
+ int trimmed; // DB has been trimmed by cutoff/all
+ int part; // DB block (if > 0), total DB (if == 0)
+ int ufirst; // Index of first read in block (without trimming)
+ int tfirst; // Index of first read in block (with trimming)
+ // In order to avoid forcing users to have to rebuild all their DBs to accommodate
+ // the addition of fields for the size of the actively loaded trimmed and untrimmed
+ // blocks, an additional read record is allocated in "reads" when a DB is loaded into
+ // memory (reads[-1]) and the two desired fields are crammed into the first two
+ // integer spaces of the record.
+ char *path; // Root name of DB for .bps, .qvs, and tracks
+ int loaded; // Are reads loaded in memory?
+ void *bases; // file pointer for bases file (to fetch reads from),
+ // or memory pointer to uncompressed block of all sequences.
+ HITS_READ *reads; // Array [-1..nreads] of HITS_READ
+ HITS_TRACK *tracks; // Linked list of loaded tracks
+ } HITS_DB;
+ *
+ *
+ ********************************************************************************************/
+#define MAX_NAME 10000 // Longest file name or fasta header line
+#define DB_NFILE "files = %9d\n" // number of files
+#define DB_FDATA " %9d %s %s\n" // last read index + 1, fasta prolog, file name
+#define DB_NBLOCK "blocks = %9d\n" // number of blocks
+#define DB_PARAMS "size = %10lld cutoff = %9d all = %1d\n" // block size, len cutoff, all in well
+#define DB_BDATA " %9d %9d\n" // First read index (untrimmed), first read index (trimmed)
+ *
+ *
+ ********************************************************************************************/
+ // Suppose DB is the name of an original database. Then there will be files .DB.idx, .DB.bps,
+ // .DB.qvs, and files .DB.<track>.anno and DB.<track>.data where <track> is a track name
+ // (not containing a . !).
+ // A DAM is basically a DB except that:
+ // 1. there are no QV's, instead .coff points to the '\0' terminated fasta header of the read
+ // in the .<dam>.hdr file
+ // 2. .origin contains the contig # of the read within a fasta entry (assembly sequences
+ // contain N-separated contigs), and .fpulse the first base of the contig in the
+ // fasta entry
+ // Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has
+ // a part # in it then just the part is opened. The index array is allocated (for all or
+ // just the part) and read in.
+ // Return status of routine:
+ // -1: The DB could not be opened for a reason reported by the routine to EPLACE
+ // 0: Open of DB proceeded without mishap
+ // 1: Open of DAM proceeded without mishap
+int Open_DB(char *path, HITS_DB *db);
+ // Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings
+ // of the current DB partition. Reallocate smaller memory blocks for the information kept
+ // for the retained reads.
+void Trim_DB(HITS_DB *db);
+ // Shut down an open 'db' by freeing all associated space, including tracks and QV structures,
+ // and any open file pointers. The record pointed at by db however remains (the user
+ // supplied it and so should free it).
+void Close_DB(HITS_DB *db);
+ // Return the size in bytes of the given DB
+int64 sizeof_DB(HITS_DB *db);
+ // If QV pseudo track is not already in db's track list, then load it and set it up.
+ // The database must not have been trimmed yet. -1 is returned if a .qvs file is not
+ // present, and 1 is returned if an error (reported to EPLACE) occured and INTERACTIVE
+ // is defined. Otherwise a 0 is returned.
+int Load_QVs(HITS_DB *db);
+ // Remove the QV pseudo track, all space associated with it, and close the .qvs file.
+void Close_QVs(HITS_DB *db);
+ // Look up the file and header in the file of the indicated track. Return:
+ // 1: Track is for trimmed DB
+ // 0: Track is for untrimmed DB
+ // -1: Track is not the right size of DB either trimmed or untrimmed
+ // -2: Could not find the track
+ // In addition, if opened (0 or 1 returned), then kind points at an integer indicating
+ // the type of track as follows:
+ // CUSTOM 0 => a custom track
+ // MASK 1 => a mask track
+#define CUSTOM_TRACK 0
+#define MASK_TRACK 1
+int Check_Track(HITS_DB *db, char *track, int *kind);
+ // If track is not already in the db's track list, then allocate all the storage for it,
+ // read it in from the appropriate file, add it to the track list, and return a pointer
+ // to the newly created HITS_TRACK record. If the track does not exist or cannot be
+ // opened for some reason, then NULL is returned if INTERACTIVE is defined. Otherwise
+ // the routine prints an error message to stderr and exits if an error occurs, and returns
+ // with NULL only if the track does not exist.
+HITS_TRACK *Load_Track(HITS_DB *db, char *track);
+ // If track is on the db's track list, then it is removed and all storage associated with it
+ // is freed.
+void Close_Track(HITS_DB *db, char *track);
+ // Allocate and return a buffer big enough for the largest read in 'db'.
+ // **NB** call free(x-1), where x is the value returned, to release the buffer, as both a
+ // *prefix* and a suffix '\0'(4)-byte are needed by the alignment algorithms. If memory
+ // cannot be allocated then return NULL if INTERACTIVE is defined, or print an error to
+ // stderr and exit otherwise.
+char *New_Read_Buffer(HITS_DB *db);
+ // Load into 'read' the i'th read in 'db'. As a lower case ascii string if ascii is 1, an
+ // upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T)
+ // otherwise. A '\0' (or 4) is prepended and appended to the string so it has a delimiter
+ // for traversals in either direction. A non-zero value is returned if an error occurred
+ // and INTERACTIVE is defined.
+int Load_Read(HITS_DB *db, int i, char *read, int ascii);
+ // Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the
+ // the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii
+ // string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string
+ // over 0(A), 1(C), 2(G), and 3(T) otherwise. A '\0' (or 4) is prepended and appended to
+ // the string holding the substring so it has a delimeter for traversals in either direction.
+ // A NULL pointer is returned if an error occured and INTERACTIVE is defined.
+char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii);
+ // Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur
+ // in the database. If cannot allocate memory then return NULL if INTERACTIVE is defined,
+ // or print error to stderr and exit otherwise.
+#define DEL_QV 0 // The deletion QVs are x[DEL_QV] if x is the buffer returned by New_QV_Buffer
+#define DEL_TAG 1 // The deleted characters
+#define INS_QV 2 // The insertion QVs
+#define SUB_QV 3 // The substitution QVs
+#define MRG_QV 4 // The merge QVs
+char **New_QV_Buffer(HITS_DB *db);
+ // Load into 'entry' the 5 QV vectors for i'th read in 'db'. The deletion tag or characters
+ // are converted to a numeric or upper/lower case ascii string as per ascii. Return with
+ // a zero, except when an error occurs and INTERACTIVE is defined in which case return wtih 1.
+int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii);
+ // Allocate a block big enough for all the uncompressed sequences, read them into it,
+ // reset the 'off' in each read record to be its in-memory offset, and set the
+ // bases pointer to point at the block after closing the bases file. If ascii is
+ // 1 then the reads are converted to lowercase ascii, if 2 then uppercase ascii, and
+ // otherwise the reads are left as numeric strings over 0(A), 1(C), 2(G), and 3(T).
+ // Return with a zero, except when an error occurs and INTERACTIVE is defined in which
+ // case return wtih 1.
+int Read_All_Sequences(HITS_DB *db, int ascii);
+ // For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. all
+ // those of the form "prefix/[.]root.part" and call actor with the complete path to each file
+ // pointed at by path, and the suffix of the path by extension. The . precedes the root
+ // name if the defined constant HIDE_FILES is set. Always the first call is with the
+ // path "prefix/root.[db|dam]" and extension "db" or "dam". There will always be calls for
+ // "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and
+ // so this routine gives one a way to know all the tracks associated with a given DB.
+ // -1 is returned if the path could not be found, and 1 is returned if an error (reported
+ // to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned.
+int List_DB_Files(char *path, void actor(char *path, char *extension));
+#endif // _HITS_DB
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..9aa819c
--- /dev/null
@@ -0,0 +1,34 @@
+ Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved.
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+ · Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+ · Redistributions in binary form must reproduce the above copyright notice, this
+ list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+ · The name of EWM may not be used to endorse or promote products derived from
+ this software without specific prior written permission.
+ For any issues regarding this software and its use, contact EWM at:
+ Eugene W. Myers Jr.
+ Bautzner Str. 122e
+ 01099 Dresden
+ Email: gene.myers at gmail.com
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..851972d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,25 @@
+DEST_DIR = ~/bin
+CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing
+ALL = DASqv DAStrim
+all: $(ALL)
+DASqv: DASqv.c align.c align.h DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DASqv DASqv.c align.c DB.c QV.c -lm
+DAStrim: DAStrim.c align.c align.h DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DAStrim DAStrim.c align.c DB.c QV.c -lm
+ rm -f $(ALL)
+ rm -fr *.dSYM
+ rm -f scrubber.tar.gz
+ cp $(ALL) $(DEST_DIR)
+ make clean
+ tar -zcf scrubber.tar.gz README Makefile *.h *.c
diff --git a/QV.c b/QV.c
new file mode 100644
index 0000000..cdb6a63
--- /dev/null
+++ b/QV.c
@@ -0,0 +1,1387 @@
+ *
+ * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on
+ * the histogram of values occurring in a given file. The two low complexity streams
+ * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevalent
+ * character.
+ *
+ * Author: Gene Myers
+ * Date: Jan 18, 2014
+ * Modified: July 25, 2014
+ *
+ ********************************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <unistd.h>
+#include "DB.h"
+#undef DEBUG
+#define MIN_BUFFER 1000
+#define HUFF_CUTOFF 16 // This cannot be larger than 16 !
+ *
+ * Endian flipping routines
+ *
+ ********************************************************************************************/
+static int LittleEndian; // Little-endian machine ?
+ // Referred by: Decode & Decode_Run
+static int Flip; // Flip endian of all coded shorts and ints
+ // Referred by: Decode & Decode_Run & Read_Scheme
+// Record in Flip whether coded shorts and ints must be endian-flipped, and set
+//   LittleEndian by testing whether the lowest-addressed byte of a 32-bit word holds
+//   its least significant byte.
+
+static void Set_Endian(int flip)
+{ uint32 x = 3;
+  uint8 *b = (uint8 *) (&x);
+
+  Flip         = flip;
+  LittleEndian = (b[0] == 3);
+}
+// Reverse, in place, the order of the 4 bytes starting at w.
+
+static void Flip_Long(void *w)
+{ uint8 *v = (uint8 *) w;
+  uint8  x;
+
+  x    = v[0];
+  v[0] = v[3];
+  v[3] = x;
+  x    = v[1];
+  v[1] = v[2];
+  v[2] = x;
+}
+// Swap, in place, the 2 bytes starting at w.
+
+static void Flip_Short(void *w)
+{ uint8 *v = (uint8 *) w;
+  uint8  x;
+
+  x    = v[0];
+  v[0] = v[1];
+  v[1] = x;
+}
+ *
+ * Routines for computing a Huffman Encoding Scheme
+ *
+ ********************************************************************************************/
+typedef struct
+ { int type; // 0 => normal, 1 => normal but has long codes, 2 => truncated
+ uint32 codebits[256]; // If type = 2, then code 255 is the special code for
+ int codelens[256]; // non-Huffman exceptions
+ int lookup[0x10000]; // Lookup table (just for decoding)
+ } HScheme;
+typedef struct _HTree
+ { struct _HTree *lft, *rgt;
+ uint64 count;
+ } HTree;
+  // Establish heap property from node s down (1 is root, siblings of n are 2n and 2n+1)
+  //   assuming s is the only perturbation in the tree.  The heap is a min-heap on count:
+  //   the item at s sinks until neither child has a smaller count.  When the two
+  //   children have equal counts the right one is preferred.
+
+static void Reheap(int s, HTree **heap, int hsize)
+{ HTree *item;
+  int    hole, kid;
+
+  item = heap[s];
+  hole = s;
+  for (kid = 2*hole; kid <= hsize; kid = 2*hole)
+    { //  Only look at the right child when it actually lies inside the heap
+      if (kid < hsize && heap[kid+1]->count <= heap[kid]->count)
+        kid += 1;
+      if (item->count <= heap[kid]->count)
+        break;
+      heap[hole] = heap[kid];
+      hole = kid;
+    }
+  if (hole != s)
+    heap[hole] = item;
+}
+  // Given a Huffman tree build a table of codes from it: the low-order codelens[s] bits
+  //   of codebits[s] contain the code for symbol s.  A node whose rgt link is NULL is a
+  //   leaf whose lft field holds the symbol value cast to a pointer.  Each step down the
+  //   tree appends one bit to the code: 0 towards lft and 1 towards rgt.
+
+static void Build_Table(HTree *node, int code, int len, uint32 *codebits, int *codelens)
+{ if (node->rgt == NULL)
+    { uint64 symbol = (uint64) (node->lft);
+      codebits[symbol] = code;
+      codelens[symbol] = len;
+    }
+  else
+    { code <<= 1;
+      len   += 1;
+      Build_Table(node->lft,code,len,codebits,codelens);
+      Build_Table(node->rgt,code+1,len,codebits,codelens);
+    }
+}
+  // For the non-zero symbols in hist, compute a huffman tree over them, and then
+  //   build a table of the codes.  If inscheme is not NULL, then place all symbols
+  //   with code 255 or with more than HUFF_CUTOFF bits in the encoding by inscheme
+  //   as a single united entity, whose code signals that the value of these symbols
+  //   occur explicitly in 8 (values) or 16 (run lengths) bits following the code.
+  //   All the symbols in this class will have the same entry in the code table and
+  //   255 is always in this class.
+  //
+  //   Returns a newly allocated scheme, or NULL if the allocation failed.  The scheme
+  //   type is 2 when inscheme is given, otherwise 1 if some code is longer than
+  //   HUFF_CUTOFF bits and 0 if none is.  The lookup table of the scheme is not filled in.
+  //
+  //   NOTE(review): if hist has no non-zero entry and inscheme is NULL then value stays 0
+  //   and node+(range-1) addresses before the node array -- callers presumably always
+  //   supply a non-empty histogram; confirm.
+
+static HScheme *Huffman(uint64 *hist, HScheme *inscheme)
+{ HScheme *scheme;
+  HTree   *heap[259];
+  HTree    node[512];
+  int      hsize;
+  HTree   *lft, *rgt;
+  int      value, range;
+  int      i;
+
+  scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record");
+  if (scheme == NULL)
+    return (NULL);
+
+  hsize = 0;                        //  Load heap
+  value = 0;
+  if (inscheme != NULL)
+    { node[0].count = 0;            //  node[0] is the united class, labelled 255
+      node[0].lft   = (HTree *) (uint64) 255;
+      node[0].rgt   = NULL;
+      heap[++hsize] = node+(value++);
+    }
+  for (i = 0; i < 256; i++)
+    if (hist[i] > 0)
+      { if (inscheme != NULL && (inscheme->codelens[i] > HUFF_CUTOFF || i == 255))
+          node[0].count += hist[i];
+        else
+          { node[value].count = hist[i];
+            node[value].lft   = (HTree *) (uint64) i;
+            node[value].rgt   = NULL;
+            heap[++hsize] = node+(value++);
+          }
+      }
+
+  for (i = hsize/2; i >= 1; i--)    //  Establish heap property
+    Reheap(i,heap,hsize);
+
+  range = value;                    //  Merge pairs with smallest count until have a tree
+  for (i = 1; i < value; i++)
+    { lft = heap[1];
+      heap[1] = heap[hsize--];
+      Reheap(1,heap,hsize);
+      rgt = heap[1];
+      node[range].lft = lft;
+      node[range].rgt = rgt;
+      node[range].count = lft->count + rgt->count;
+      heap[1] = node+(range++);
+      Reheap(1,heap,hsize);
+    }
+
+  for (i = 0; i < 256; i++)         //  Build the code table
+    { scheme->codebits[i] = 0;
+      scheme->codelens[i] = 0;
+    }
+
+  Build_Table(node+(range-1),0,0,scheme->codebits,scheme->codelens);
+
+  if (inscheme != NULL)             //  Set scheme type and if truncated (2), map truncated codes
+    { scheme->type = 2;             //    to code and length for 255
+      for (i = 0; i < 255; i++)
+        if (inscheme->codelens[i] > HUFF_CUTOFF || scheme->codelens[i] > HUFF_CUTOFF)
+          { scheme->codelens[i] = scheme->codelens[255];
+            scheme->codebits[i] = scheme->codebits[255];
+          }
+    }
+  else
+    { scheme->type = 0;
+      for (i = 0; i < 256; i++)
+        { if (scheme->codelens[i] > HUFF_CUTOFF)
+            scheme->type = 1;
+        }
+    }
+
+  return (scheme);
+}
+#ifdef DEBUG
+  // For debug, show the coding table: for every symbol with a non-zero code length print
+  //   the symbol, its code length, and its code in binary.  Symbols sharing the special
+  //   code of a type 2 scheme (that of symbol 255) are marked with "***".  If hist is not
+  //   NULL also print the total number of bytes needed to code the histogram, where each
+  //   occurrence of a specially coded symbol costs infosize extra bits.
+
+static void Print_Table(HScheme *scheme, uint64 *hist, int infosize)
+{ uint64 total_bits;
+  uint32 specval, mask, code, *bits;
+  int    speclen, clen, *lens;
+  int    i, k;
+
+  total_bits = 0;
+  bits = scheme->codebits;
+  lens = scheme->codelens;
+  if (scheme->type == 2)
+    { specval = bits[255];
+      speclen = lens[255];
+    }
+  else
+    specval = speclen = 0x7fffffff;   //  Matches no real code/length pair
+
+  printf("\nCode Table:\n");
+  for (i = 0; i < 256; i++)
+    if (lens[i] > 0)
+      { clen = lens[i];
+        mask = (1 << clen);
+        code = bits[i];
+        printf(" %3d: %2d ",i,clen);
+        for (k = 0; k < clen; k++)
+          { mask >>= 1;
+            if (code & mask)
+              printf("1");
+            else
+              printf("0");
+          }
+        if (code == specval && clen == speclen)
+          { printf(" ***");
+            if (hist != NULL)
+              total_bits += (clen+infosize)*hist[i];
+          }
+        else if (hist != NULL)
+          total_bits += clen*hist[i];
+        printf("\n");
+      }
+  if (hist != NULL)                   //  total_bits is unsigned, so print it with %llu
+    printf("\nTotal Bytes = %llu\n",(total_bits-1)/8+1);
+}
+ // For debug, show the histogram
+static void Print_Histogram(uint64 *hist)
+{ int i, low, hgh;
+ uint64 count;
+ for (hgh = 255; hgh >= 0; hgh--)
+ if (hist[hgh] != 0)
+ break;
+ for (low = 0; low < 256; low++)
+ if (hist[low] != 0)
+ break;
+ count = 0;
+ for (i = low; i <= hgh; i++)
+ count += hist[i];
+ for (i = hgh; i >= low; i--)
+ printf(" %3d: %8llu %5.1f%%\n",i,hist[i],(hist[i]*100.)/count);
+ *
+ * Read and Write Huffman Schemes
+ *
+ ********************************************************************************************/
+  // Write the code table to out: one byte for the scheme type, then for each of the 256
+  //   symbols one byte for its code length followed, only if that length is non-zero, by
+  //   the 4 bytes of its code bits.  The lookup table is not written.
+
+static void Write_Scheme(HScheme *scheme, FILE *out)
+{ int     i;
+  uint8   x;
+  uint32 *bits;
+  int    *lens;
+
+  lens = scheme->codelens;
+  bits = scheme->codebits;
+
+  x = (uint8) (scheme->type);
+  fwrite(&x,1,1,out);
+
+  for (i = 0; i < 256; i++)
+    { x = (uint8) (lens[i]);
+      fwrite(&x,1,1,out);
+      if (x > 0)
+        fwrite(bits+i,sizeof(uint32),1,out);
+    }
+}
+  // Allocate and read a code table from in, and return a pointer to it.  The layout read
+  //   is a type byte, then for each of the 256 symbols a length byte followed, only if the
+  //   length is non-zero, by 4 bytes of code bits (flipped afterwards if Flip is set).
+  //   The 16-bit decoding lookup table is then filled in from the codes.  On any read
+  //   failure a message is sent to EPLACE, the scheme is freed, and NULL is returned.
+  //   NULL is also returned if the scheme cannot be allocated.
+
+static HScheme *Read_Scheme(FILE *in)
+{ HScheme *scheme;
+  int     *look, *lens;
+  uint32  *bits, base;
+  int      i, j, powr;
+  uint8    x;
+
+  scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record");
+  if (scheme == NULL)
+    return (NULL);
+
+  lens = scheme->codelens;
+  bits = scheme->codebits;
+  look = scheme->lookup;
+
+  if (fread(&x,1,1,in) != 1)
+    { EPRINTF(EPLACE,"Could not read scheme type byte (Read_Scheme)\n");
+      free(scheme);
+      return (NULL);
+    }
+  scheme->type = x;
+  for (i = 0; i < 256; i++)
+    { if (fread(&x,1,1,in) != 1)
+        { EPRINTF(EPLACE,"Could not read length of %d'th code (Read_Scheme)\n",i);
+          free(scheme);             //  Release the scheme on this error path too
+          return (NULL);
+        }
+      lens[i] = x;
+      if (x > 0)
+        { if (fread(bits+i,sizeof(uint32),1,in) != 1)
+            { EPRINTF(EPLACE,"Could not read bit encoding of %d'th code (Read_Scheme)\n",i);
+              free(scheme);
+              return (NULL);
+            }
+        }
+      else
+        bits[i] = 0;
+    }
+
+  if (Flip)
+    { for (i = 0; i < 256; i++)
+        Flip_Long(bits+i);
+    }
+
+  //  Every 16-bit value whose leading lens[i] bits equal the code of i decodes to i.
+  //    NOTE(review): assumes lens[i] <= 16, otherwise the shift count is negative -- confirm
+  //    that schemes read here never carry longer codes.
+
+  for (i = 0; i < 256; i++)
+    { if (lens[i] > 0)
+        { base = (bits[i] << (16-lens[i]));
+          powr = (1 << (16-lens[i]));
+          for (j = 0; j < powr; j++)
+            look[base+j] = i;
+        }
+    }
+
+  return (scheme);
+}
+ *
+ * Encoders and Decoders
+ *
+ ********************************************************************************************/
+  // Encode read[0..rlen-1] according to scheme and write to out.  Codes are packed from
+  //   the high-order end of 32-bit words.  For a type 2 scheme, a symbol that maps to the
+  //   special code (that of symbol 255) is followed by its explicit 8-bit value.
+
+static void Encode(HScheme *scheme, FILE *out, uint8 *read, int rlen)
+{ uint32  x, c, ocode;
+  int     n, k, olen, llen;
+  int    *nlens;
+  uint32 *nbits;
+  uint32  nspec;
+  int     nslen;
+
+  nlens = scheme->codelens;
+  nbits = scheme->codebits;
+
+  if (scheme->type == 2)
+    { nspec = nbits[255];
+      nslen = nlens[255];
+    }
+  else
+    nspec = nslen = 0x7fffffff;     //  Matches no real code/length pair
+
+  //  Append the low-order L bits of C to the output: ocode holds the olen bits pending
+  //    in the current word and llen records olen as it was before this append.
+
+#define OCODE(L,C)                          \
+{ int    len  = olen + (L);                 \
+  uint32 code = (C);                        \
+                                            \
+  llen = olen;                              \
+  if (len >= 32)                            \
+    { olen   = len-32;                      \
+      ocode |= (code >> olen);              \
+      fwrite(&ocode,sizeof(uint32),1,out);  \
+      if (olen > 0)                         \
+        ocode = (code << (32-olen));        \
+      else                                  \
+        ocode = 0;                          \
+    }                                       \
+  else                                      \
+    { olen   = len;                         \
+      ocode |= (code << (32-olen));         \
+    }                                       \
+}
+
+  llen  = 0;
+  olen  = 0;
+  ocode = 0;
+  for (k = 0; k < rlen; k++)
+    { x = read[k];
+      n = nlens[x];
+      c = nbits[x];
+      OCODE(n,c);
+      if (c == nspec && n == nslen)
+        OCODE(8,x);
+    }
+
+  if (olen > 0)                              //  Tricky: must pad so decoder does not read past
+    { fwrite(&ocode,sizeof(uint32),1,out);   //    last integer in the coded output.
+      if (llen > 16 && olen > llen)
+        fwrite(&ocode,sizeof(uint32),1,out);
+    }
+  else if (llen > 16)
+    fwrite(&ocode,sizeof(uint32),1,out);
+}
+  // Encode read[0..rlen-1] according to non-rchar table neme, and run-length table reme for
+  //   runs of rchar characters.  Write to out.  The stream is coded as alternating
+  //   (possibly zero-length) run lengths of rchar and single non-rchar symbols.  A run
+  //   length that maps to reme's code for 255 is followed by the explicit length in 16
+  //   bits, and a symbol that maps to neme's special code (type 2 only) is followed by
+  //   its explicit 8-bit value.  Relies on the OCODE macro defined for Encode.
+
+static void Encode_Run(HScheme *neme, HScheme *reme, FILE *out, uint8 *read, int rlen, int rchar)
+{ uint32  x, c, ocode;
+  int     n, h, k, olen, llen;
+  int    *nlens, *rlens;
+  uint32 *nbits, *rbits;
+  uint32  nspec, rspec;
+  int     nslen, rslen;
+
+  nlens = neme->codelens;
+  nbits = neme->codebits;
+  rlens = reme->codelens;
+  rbits = reme->codebits;
+
+  if (neme->type == 2)
+    { nspec = nbits[255];
+      nslen = nlens[255];
+    }
+  else
+    nspec = nslen = 0x7fffffff;     //  Matches no real code/length pair
+
+  rspec = rbits[255];
+  rslen = rlens[255];
+
+  llen  = 0;
+  olen  = 0;
+  ocode = 0;
+  k     = 0;
+  while (k < rlen)
+    { h = k;                        //  Code the length k-h of the run of rchar starting at h
+      while (k < rlen && read[k] == rchar)
+        k += 1;
+      if (k-h >= 255)
+        x = 255;
+      else
+        x = k-h;
+      n = rlens[x];
+      c = rbits[x];
+      OCODE(n,c);
+      if (c == rspec && n == rslen)
+        OCODE(16,k-h);
+      if (k < rlen)                 //  Code the non-rchar symbol that ended the run
+        { x = read[k];
+          n = nlens[x];
+          c = nbits[x];
+          OCODE(n,c);
+          if (c == nspec && n == nslen)
+            OCODE(8,x);
+          k += 1;
+        }
+    }
+
+  if (olen > 0)                     //  Pad as in Encode so the decoder never reads past the end
+    { fwrite(&ocode,sizeof(uint32),1,out);
+      if (llen > 16 && olen > llen)
+        fwrite(&ocode,sizeof(uint32),1,out);
+    }
+  else if (llen > 16)
+    fwrite(&ocode,sizeof(uint32),1,out);
+}
+  // Read and decode from in, the next rlen symbols into read according to scheme.
+  //   Returns 0 on success and 1 (after a message to EPLACE) if the input runs out.
+  //
+  //   icode is a 64-bit shift register: each 32-bit word read from in is placed in its
+  //   low-order half (ipart) and bits are consumed by shifting left.  xpart addresses
+  //   bits 32..47 of icode, the next 16 bits to decode via the lookup table, and cpart
+  //   addresses bits 40..47, the explicit 8-bit value that follows a signal code.  ilen
+  //   is the number of unconsumed bits in the low half and n is the number of bits the
+  //   previous step used, consumed at the start of the next GET/GETFLIP.
+
+static int Decode(HScheme *scheme, FILE *in, char *read, int rlen)
+{ int    *look, *lens;
+  int     signal, ilen;
+  uint64  icode;
+  uint32 *ipart;
+  uint16 *xpart;
+  uint8  *cpart;
+  int     j, n, c;
+
+  if (LittleEndian)
+    { ipart = ((uint32 *) (&icode));
+      xpart = ((uint16 *) (&icode)) + 2;
+      cpart = ((uint8  *) (&icode)) + 5;
+    }
+  else
+    { ipart = ((uint32 *) (&icode)) + 1;
+      xpart = ((uint16 *) (&icode)) + 1;
+      cpart = ((uint8  *) (&icode)) + 2;
+    }
+
+  if (scheme->type == 2)
+    signal = 255;
+  else
+    signal = 256;                   //  No symbol equals 256, so nothing is ever a signal
+  lens = scheme->codelens;
+  look = scheme->lookup;
+
+  //  Consume n bits, reading the next 32-bit word from in when fewer than n remain
+
+#define GET                                                         \
+  if (n > ilen)                                                     \
+    { icode <<= ilen;                                               \
+      if (fread(ipart,sizeof(uint32),1,in) != 1)                    \
+        { EPRINTF(EPLACE,"Could not read more bits (Decode)\n");    \
+          return (1);                                               \
+        }                                                           \
+      ilen    = n-ilen;                                             \
+      icode <<= ilen;                                               \
+      ilen    = 32-ilen;                                            \
+    }                                                               \
+  else                                                              \
+    { icode <<= n;                                                  \
+      ilen   -= n;                                                  \
+    }
+
+  //  As GET, but endian-flip each word as it is read
+
+#define GETFLIP                                                     \
+  if (n > ilen)                                                     \
+    { icode <<= ilen;                                               \
+      if (fread(ipart,sizeof(uint32),1,in) != 1)                    \
+        { EPRINTF(EPLACE,"Could not read more bits (Decode)\n");    \
+          return (1);                                               \
+        }                                                           \
+      Flip_Long(ipart);                                             \
+      ilen    = n-ilen;                                             \
+      icode <<= ilen;                                               \
+      ilen    = 32-ilen;                                            \
+    }                                                               \
+  else                                                              \
+    { icode <<= n;                                                  \
+      ilen   -= n;                                                  \
+    }
+
+  n     = 16;
+  ilen  = 0;
+  icode = 0;
+  if (Flip)
+    for (j = 0; j < rlen; j++)
+      { GETFLIP
+        c = look[*xpart];
+        n = lens[c];
+        if (c == signal)
+          { GETFLIP
+            c = *cpart;
+            n = 8;
+          }
+        read[j] = (char) c;
+      }
+  else
+    for (j = 0; j < rlen; j++)
+      { GET
+        c = look[*xpart];
+        n = lens[c];
+        if (c == signal)
+          { GET
+            c = *cpart;
+            n = 8;
+          }
+        read[j] = (char) c;
+      }
+
+  return (0);
+}
+  // Read and decode from in, the next rlen symbols into read according to non-rchar scheme
+  //   neme, and the rchar runlength scheme reme.  The input alternates run lengths of rchar
+  //   (decoded with reme, where code 255 signals an explicit 16-bit length) and single
+  //   non-rchar symbols (decoded with neme, where for a type 2 scheme code 255 signals an
+  //   explicit 8-bit value).  Returns 0 on success and 1 if the input runs out.  Relies on
+  //   the GET and GETFLIP macros defined for Decode.
+  //   NOTE(review): a decoded run length is written without checking it against rlen, so
+  //   the input is presumably trusted to be well-formed -- confirm.
+
+static int Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read,
+                      int rlen, int rchar)
+{ int    *nlook, *nlens;
+  int    *rlook, *rlens;
+  int     nsignal, ilen;
+  uint64  icode;
+  uint32 *ipart;
+  uint16 *xpart;
+  uint8  *cpart;
+  int     j, n, c, k;
+
+  if (LittleEndian)
+    { ipart = ((uint32 *) (&icode));
+      xpart = ((uint16 *) (&icode)) + 2;
+      cpart = ((uint8  *) (&icode)) + 5;
+    }
+  else
+    { ipart = ((uint32 *) (&icode)) + 1;
+      xpart = ((uint16 *) (&icode)) + 1;
+      cpart = ((uint8  *) (&icode)) + 2;
+    }
+
+  if (neme->type == 2)
+    nsignal = 255;
+  else
+    nsignal = 256;                  //  No symbol equals 256, so nothing is ever a signal
+  nlens = neme->codelens;
+  nlook = neme->lookup;
+
+  rlens = reme->codelens;
+  rlook = reme->lookup;
+
+  n     = 16;
+  ilen  = 0;
+  icode = 0;
+  if (Flip)
+    for (j = 0; j < rlen; j++)
+      { GETFLIP
+        c = rlook[*xpart];
+        n = rlens[c];
+        if (c == 255)
+          { GETFLIP
+            c = *xpart;
+            n = 16;
+          }
+        for (k = 0; k < c; k++)
+          read[j++] = (char) rchar;
+
+        if (j < rlen)
+          { GETFLIP
+            c = nlook[*xpart];
+            n = nlens[c];
+            if (c == nsignal)
+              { GETFLIP
+                c = *cpart;
+                n = 8;
+              }
+            read[j] = (char) c;
+          }
+      }
+  else
+    for (j = 0; j < rlen; j++)
+      { GET
+        c = rlook[*xpart];
+        n = rlens[c];
+        if (c == 255)
+          { GET
+            c = *xpart;
+            n = 16;
+          }
+        for (k = 0; k < c; k++)
+          read[j++] = (char) rchar;
+
+        if (j < rlen)
+          { GET
+            c = nlook[*xpart];
+            n = nlens[c];
+            if (c == nsignal)
+              { GET
+                c = *cpart;
+                n = 8;
+              }
+            read[j] = (char) c;
+          }
+      }
+
+  return (0);
+}
+ *
+ * Histogrammers
+ *
+ ********************************************************************************************/
+  //  Count the symbols of stream[0..rlen-1]: hist[c] is incremented once for every
+  //    occurrence of byte value c.
+
+static void Histogram_Seqs(uint64 *hist, uint8 *stream, int rlen)
+{ int pos;
+
+  pos = 0;
+  while (pos < rlen)
+    { hist[stream[pos]] += 1;
+      pos += 1;
+    }
+}
+  //  Histogram run lengths of symbol runChar in stream[0..rlen-1] into run.  Starting at
+  //    each scan position the (possibly empty) run of runChar is measured and run[length]
+  //    is incremented, with all lengths of 256 or more counted in run[255].  The symbol
+  //    that ends a run is then stepped over.
+
+static void Histogram_Runs(uint64 *run, uint8 *stream, int rlen, int runChar)
+{ int pos, len;
+
+  for (pos = 0; pos < rlen; pos++)     //  the pos++ steps over the symbol ending the run
+    { len = 0;
+      while (pos < rlen && stream[pos] == runChar)
+        { pos += 1;
+          len += 1;
+        }
+      run[len < 256 ? len : 255] += 1;
+    }
+}
+ *
+ * Reader
+ *
+ ********************************************************************************************/
+  //  Shared line buffer of the reader.  Read holds up to 5 lines, line j starting at
+  //    Read+j*Rmax.  Nline is the running line number used in error messages.
+
+static char *Read  = NULL;   //  Referred by:  QVentry, Read_Lines, QVcoding_Scan,
+static int   Rmax  = -1;     //                Compress_Next_QVentry
+static int   Nline;          //  Referred by:  QVcoding_Scan
+
+  //  Return a pointer to the first line of the most recently read entry
+
+char *QVentry()
+{ char *first = Read;
+
+  return (first);
+}
+
+  //  Set and get the line counter used for error reporting
+
+void Set_QV_Line(int line)
+{ Nline = line;
+  return;
+}
+
+int Get_QV_Line()
+{ int current = Nline;
+
+  return (current);
+}
+  //  Read the next nlines lines of input into the shared buffer Read: nlines is 1 when a
+  //    single header line is expected and 5 when the 5 QV/fasta lines of an entry are
+  //    expected.  Line j is placed at Read+j*Rmax.  Every line must have the same length as
+  //    the first, and that length (not counting the newline) is returned.  If end-of-file
+  //    is hit before anything is read then -1 is returned.  Any error ends in EXIT(-2).
+
+int Read_Lines(FILE *input, int nlines)
+{ int   i, rlen;
+  int   tmax;
+  char *tread;
+  char *other;
+
+  if (Read == NULL)
+    { tmax  = MIN_BUFFER;
+      tread = (char *) Malloc(5*tmax,"Allocating QV entry read buffer");
+      if (tread == NULL)
+        EXIT(-2);
+      Rmax = tmax;
+      Read = tread;
+    }
+
+  Nline += 1;
+  if (fgets(Read,Rmax,input) == NULL)
+    return (-1);
+
+  //  Grow the buffer until the whole first line, newline included, has been read.  The
+  //    test on rlen keeps Read[rlen-1] in bounds should fgets hand back an empty string.
+
+  rlen = strlen(Read);
+  while (rlen == 0 || Read[rlen-1] != '\n')
+    { tmax  = ((int) (1.4*Rmax)) + MIN_BUFFER;   //  cast the product, not just the 1.4
+      tread = (char *) Realloc(Read,5*tmax,"Reallocating QV entry read buffer");
+      if (tread == NULL)
+        EXIT(-2);
+      Rmax = tmax;
+      Read = tread;
+      if (fgets(Read+rlen,Rmax-rlen,input) == NULL)
+        { EPRINTF(EPLACE,"Line %d: Last line does not end with a newline !\n",Nline);
+          EXIT(-2);
+        }
+      rlen += strlen(Read+rlen);
+    }
+
+  //  The remaining lines must fit the buffer sized by the first and match its length
+
+  other = Read;
+  for (i = 1; i < nlines; i++)
+    { other += Rmax;
+      Nline += 1;
+      if (fgets(other,Rmax,input) == NULL)
+        { EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline);
+          EXIT(-2);
+        }
+      if (rlen != (int) strlen(other))
+        { EPRINTF(EPLACE,"Line %d: Lines for an entry are not the same length\n",Nline);
+          EXIT(-2);
+        }
+    }
+
+  return (rlen-1);
+}
+ *
+ * Tag compression and decompression routines
+ *
+ ********************************************************************************************/
+  //  Compact tags in place: keep tags[k] only for those k in [0,rlen) with qvs[k] != rchar,
+  //    preserving their order.  A '\0' is written after the last kept symbol and the number
+  //    of symbols kept is returned.
+
+static int Pack_Tag(char *tags, char *qvs, int rlen, int rchar)
+{ char *out;
+  int   src;
+
+  out = tags;
+  src = 0;
+  while (src < rlen)
+    { if (qvs[src] != rchar)
+        *out++ = tags[src];
+      src += 1;
+    }
+  *out = '\0';
+
+  return ((int) (out - tags));
+}
+  //  Return how many of the symbols qvs[0..rlen-1] differ from rchar
+
+static int Packed_Length(char *qvs, int rlen, int rchar)
+{ int pos, kept;
+
+  kept = 0;
+  for (pos = rlen-1; pos >= 0; pos--)
+    kept += (qvs[pos] != rchar);
+
+  return (kept);
+}
+  //  Expand tags in place from its packed length clen to length rlen: the i'th packed char
+  //    moves to position k where qvs[k] is the i'th non-rchar symbol of qvs, and every
+  //    position k with qvs[k] == rchar receives 'n'.  The scan runs from the back so that
+  //    packed chars are read before they can be overwritten.  No terminator is written.
+
+static void Unpack_Tag(char *tags, int clen, char *qvs, int rlen, int rchar)
+{ int src, dst;
+
+  src = clen;
+  dst = rlen;
+  while (dst-- > 0)
+    tags[dst] = (qvs[dst] == rchar) ? 'n' : tags[--src];
+}
+ *
+ * Statistics Scan and Scheme creation and write
+ *
+ ********************************************************************************************/
+ // Read up to the next num entries, or until eof, from the .quiva file on input and add
+ // their symbols to the frequency tables below. Each entry is also copied to temp if temp != NULL.
+ // The number of entries read is returned; any error ends in EXIT(-1).
+static uint64 delHist[256], insHist[256], mrgHist[256], subHist[256], delRun[256], subRun[256];
+static uint64 totChar;
+static int delChar, subChar;
+ // The tables above are referred to by: QVcoding_Scan, Create_QVcoding
+int QVcoding_Scan(FILE *input, int num, FILE *temp)
+{ char *slash;
+ int rlen;
+ int i, r; // r counts the entries successfully scanned
+ // Zero the symbol histograms; the run-length histograms start at 1 for every length
+ bzero(delHist,sizeof(uint64)*256);
+ bzero(mrgHist,sizeof(uint64)*256);
+ bzero(insHist,sizeof(uint64)*256);
+ bzero(subHist,sizeof(uint64)*256);
+ for (i = 0; i < 256; i++)
+ delRun[i] = subRun[i] = 1;
+ totChar = 0;
+ delChar = -1; // -1 = run char not yet determined
+ subChar = -1;
+ // Make a sweep through the .quiva entries, histogramming the relevant things
+ // and figuring out the run chars for the deletion and substitution streams
+ r = 0;
+ for (i = 0; i < num; i++)
+ { int well, beg, end, qv;
+ rlen = Read_Lines(input,1); // the entry's header line
+ if (rlen == -2)
+ EXIT(-1);
+ if (rlen < 0) // -1: end of file, stop scanning
+ break;
+ if (rlen == 0 || Read[0] != '@')
+ { EPRINTF(EPLACE,"Line %d: Header in quiva file is missing\n",Nline);
+ EXIT(-1);
+ }
+ slash = index(Read+1,'/'); // the fields checked below follow the first '/'
+ if (slash == NULL)
+ { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n",
+ Prog_Name,Nline);
+ EXIT(-1);
+ }
+ if (sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) != 4) // values are only validated, not used
+ { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n",
+ Prog_Name,Nline);
+ EXIT(-1);
+ }
+ if (temp != NULL)
+ fputs(Read,temp);
+ rlen = Read_Lines(input,5); // the entry's 5 data lines, line j at Read+j*Rmax
+ if (rlen < 0)
+ { if (rlen == -1)
+ EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline);
+ EXIT(-1);
+ }
+ if (temp != NULL)
+ { fputs(Read,temp);
+ fputs(Read+Rmax,temp);
+ fputs(Read+2*Rmax,temp);
+ fputs(Read+3*Rmax,temp);
+ fputs(Read+4*Rmax,temp);
+ }
+ Histogram_Seqs(delHist,(uint8 *) (Read),rlen); // line 0: deletion QVs
+ Histogram_Seqs(insHist,(uint8 *) (Read+2*Rmax),rlen); // line 2: insertion QVs
+ Histogram_Seqs(mrgHist,(uint8 *) (Read+3*Rmax),rlen); // line 3: merge QVs
+ Histogram_Seqs(subHist,(uint8 *) (Read+4*Rmax),rlen); // line 4: substitution QVs
+ if (delChar < 0) // take as run char the deletion QV at the first 'n'/'N' of line 1
+ { int k;
+ char *del = Read+Rmax;
+ for (k = 0; k < rlen; k++)
+ if (del[k] == 'n' || del[k] == 'N')
+ { delChar = Read[k];
+ break;
+ }
+ }
+ if (delChar >= 0)
+ Histogram_Runs( delRun,(uint8 *) (Read),rlen,delChar);
+ totChar += rlen;
+ if (subChar < 0) // once 100000 symbols are seen, pick the most frequent substitution QV
+ { if (totChar >= 100000)
+ { int k;
+ subChar = 0;
+ for (k = 1; k < 256; k++)
+ if (subHist[k] > subHist[subChar])
+ subChar = k;
+ }
+ }
+ if (subChar >= 0)
+ Histogram_Runs( subRun,(uint8 *) (Read+4*Rmax),rlen,subChar);
+ r += 1;
+ }
+ return (r);
+  //  Using the statistics in the global stat tables filled by QVcoding_Scan, create the
+  //    Huffman schemes for every stream and return them in a statically allocated QVcoding.
+  //    If lossy is set, then the insertion and merge histograms are first collapsed so that
+  //    a lossy table is built for those two streams.  If a scheme cannot be built, every
+  //    scheme built so far is freed and the routine ends in EXIT(NULL).
+
+QVcoding *Create_QVcoding(int lossy)
+{ static QVcoding coding;
+
+  HScheme *delScheme, *insScheme, *mrgScheme, *subScheme;
+  HScheme *dRunScheme, *sRunScheme;
+
+  delScheme  = NULL;
+  dRunScheme = NULL;
+  insScheme  = NULL;
+  mrgScheme  = NULL;
+  subScheme  = NULL;
+  sRunScheme = NULL;
+
+  //  Check whether using a substitution run char is a win.  The first test keeps
+  //    subHist[subChar] from being read when no run char was ever chosen.
+
+  if (subChar < 0 || totChar < 200000 || subHist[subChar] < .5*totChar)
+    subChar = -1;
+
+  //  If lossy encoding is enabled then fold the insertion counts into every 2nd value and
+  //    the merge counts into every 4th value.
+
+  if (lossy)
+    { int k;
+
+      for (k = 0; k < 256; k += 2)
+        { insHist[k] += insHist[k+1];
+          insHist[k+1] = 0;
+        }
+
+      for (k = 0; k < 256; k += 4)
+        { mrgHist[k] += mrgHist[k+1];
+          mrgHist[k] += mrgHist[k+2];
+          mrgHist[k] += mrgHist[k+3];
+          mrgHist[k+1] = 0;
+          mrgHist[k+2] = 0;
+          mrgHist[k+3] = 0;
+        }
+    }
+
+  //  Build a Huffman scheme for each stream entity from the histograms.  When the first
+  //    scheme has a non-zero type, a second one is derived from it and the first is freed.
+
+#define SCHEME_MACRO(meme,hist,label,bits)      \
+  scheme = Huffman( (hist), NULL);              \
+  if (scheme == NULL)                           \
+    goto error;                                 \
+  if (scheme->type)                             \
+    { (meme) = Huffman( (hist), scheme);        \
+      free(scheme);                             \
+      if ((meme) == NULL)                       \
+        goto error;                             \
+    }                                           \
+  else                                          \
+    (meme) = scheme;
+
+#ifdef DEBUG
+
+#define MAKE_SCHEME(meme,hist,label,bits)       \
+  SCHEME_MACRO(meme,hist,label,bits)            \
+  printf("\n%s\n", (label) );                   \
+  Print_Histogram( (hist));                     \
+  Print_Table( (meme), (hist), (bits));
+
+#else
+
+#define MAKE_SCHEME(meme,hist,label,bits)       \
+  SCHEME_MACRO(meme,hist,label,bits)
+
+#endif
+
+  { HScheme *scheme;
+
+    if (delChar < 0)
+      { MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs", 8);
+        dRunScheme = NULL;
+      }
+    else
+      { delHist[delChar] = 0;
+        MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs less run char", 8);
+        MAKE_SCHEME(dRunScheme,delRun, "Histogram of Deletion Runs QVs", 16);
+#ifdef DEBUG
+        printf("\nRun char is '%c'\n",delChar);
+#endif
+      }
+
+#ifdef DEBUG
+    { int    k;
+      uint64 count;
+
+      count = 0;
+      for (k = 0; k < 256; k++)
+        count += delHist[k];
+      printf("\nDelTag will require %lld bytes\n",(long long) (count/4));
+    }
+#endif
+
+    MAKE_SCHEME(insScheme,insHist, "Hisotgram of Insertion QVs", 8);
+    MAKE_SCHEME(mrgScheme,mrgHist, "Hisotgram of Merge QVs", 8);
+
+    if (subChar < 0)
+      { MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs", 8);
+        sRunScheme = NULL;
+      }
+    else
+      { subHist[subChar] = 0;
+        MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs less run char", 8);
+        MAKE_SCHEME(sRunScheme,subRun, "Histogram of Substitution Run QVs", 16);
+#ifdef DEBUG
+        printf("\nRun char is '%c'\n",subChar);
+#endif
+      }
+  }
+
+  //  Setup endian handling
+
+  Set_Endian(0);
+
+  coding.delScheme  = delScheme;
+  coding.insScheme  = insScheme;
+  coding.mrgScheme  = mrgScheme;
+  coding.subScheme  = subScheme;
+  coding.dRunScheme = dRunScheme;
+  coding.sRunScheme = sRunScheme;
+  coding.delChar    = delChar;
+  coding.subChar    = subChar;
+  coding.prefix     = NULL;
+  coding.flip       = 0;
+
+  return (&coding);
+
+  //  Failure: release whatever was built (free of a NULL pointer does nothing)
+
+error:
+  free(delScheme);
+  free(dRunScheme);
+  free(insScheme);
+  free(mrgScheme);
+  free(subScheme);
+  free(sRunScheme);
+  EXIT(NULL);
+}
+  //  Write the encoding scheme 'coding' to 'output': first the endian key, the two run
+  //    chars (256 standing for "none"), and the length and text of the header prefix, then
+  //    the Huffman scheme tables in the order deletion, deletion-run (only if there is a
+  //    deletion run char), insertion, merge, substitution, substitution-run (only if there
+  //    is a substitution run char).
+
+void Write_QVcoding(FILE *output, QVcoding *coding)
+{
+  //  Write out the endian key, run chars, and prefix (if not NULL)
+
+  { uint16 half;
+    int    len;
+
+    half = 0x33cc;
+    fwrite(&half,sizeof(uint16),1,output);
+
+    if (coding->delChar < 0)
+      half = 256;
+    else
+      half = (uint16) (coding->delChar);
+    fwrite(&half,sizeof(uint16),1,output);
+
+    if (coding->subChar < 0)
+      half = 256;
+    else
+      half = (uint16) (coding->subChar);
+    fwrite(&half,sizeof(uint16),1,output);
+
+    //  Create_QVcoding leaves prefix NULL: write that as a prefix of length 0 rather than
+    //    handing NULL to strlen.
+
+    if (coding->prefix == NULL)
+      len = 0;
+    else
+      len = strlen(coding->prefix);
+    fwrite(&len,sizeof(int),1,output);
+    if (len > 0)
+      fwrite(coding->prefix,1,len,output);
+  }
+
+  //  Write out the scheme tables
+
+  Write_Scheme(coding->delScheme,output);
+  if (coding->delChar >= 0)
+    Write_Scheme(coding->dRunScheme,output);
+  Write_Scheme(coding->insScheme,output);
+  Write_Scheme(coding->mrgScheme,output);
+  Write_Scheme(coding->subScheme,output);
+  if (coding->subChar >= 0)
+    Write_Scheme(coding->sRunScheme,output);
+}
+  //  Read an encoding scheme from 'input' (the layout written by Write_QVcoding) and return
+  //    it in a statically allocated QVcoding.  The first 16-bit word is the endian key: if
+  //    it does not read back as 0x33cc then every multi-byte value is byte-flipped.  If
+  //    anything cannot be read, all storage allocated so far is freed and the routine ends
+  //    in EXIT(NULL).
+
+QVcoding *Read_QVcoding(FILE *input)
+{ static QVcoding coding;
+
+  //  Start from an empty object so that the error exit can free unconditionally
+
+  coding.prefix     = NULL;
+  coding.delScheme  = NULL;
+  coding.dRunScheme = NULL;
+  coding.insScheme  = NULL;
+  coding.mrgScheme  = NULL;
+  coding.subScheme  = NULL;
+  coding.sRunScheme = NULL;
+
+  //  Read endian key, run chars, and short name common to all headers
+
+  { uint16 half;
+    int    len;
+
+    if (fread(&half,sizeof(uint16),1,input) != 1)
+      { EPRINTF(EPLACE,"Could not read flip byte (Read_QVcoding)\n");
+        EXIT(NULL);
+      }
+    coding.flip = (half != 0x33cc);
+
+    if (fread(&half,sizeof(uint16),1,input) != 1)
+      { EPRINTF(EPLACE,"Could not read deletion char (Read_QVcoding)\n");
+        EXIT(NULL);
+      }
+    if (coding.flip)
+      Flip_Short(&half);
+    coding.delChar = half;
+    if (coding.delChar >= 256)        //  256 on file means "no run char"
+      coding.delChar = -1;
+
+    if (fread(&half,sizeof(uint16),1,input) != 1)
+      { EPRINTF(EPLACE,"Could not read substitution char (Read_QVcoding)\n");
+        EXIT(NULL);
+      }
+    if (coding.flip)
+      Flip_Short(&half);
+    coding.subChar = half;
+    if (coding.subChar >= 256)
+      coding.subChar = -1;
+
+    //  Read the short name common to all headers
+
+    if (fread(&len,sizeof(int),1,input) != 1)
+      { EPRINTF(EPLACE,"Could not read header name length (Read_QVcoding)\n");
+        EXIT(NULL);
+      }
+    if (coding.flip)
+      Flip_Long(&len);
+    if (len < 0)                      //  a damaged file must not size the allocation
+      { EPRINTF(EPLACE,"Header name length is negative (Read_QVcoding)\n");
+        EXIT(NULL);
+      }
+    coding.prefix = (char *) Malloc(len+1,"Allocating header prefix");
+    if (coding.prefix == NULL)
+      EXIT(NULL);
+    if (len > 0)
+      { if (fread(coding.prefix,len,1,input) != 1)
+          { EPRINTF(EPLACE,"Could not read header name (Read_QVcoding)\n");
+            goto error;
+          }
+      }
+    coding.prefix[len] = '\0';
+  }
+
+  //  Setup endian handling
+
+  Set_Endian(coding.flip);
+
+  //  Read the Huffman schemes used to compress the data
+
+  coding.delScheme = Read_Scheme(input);
+  if (coding.delScheme == NULL)
+    goto error;
+  if (coding.delChar >= 0)
+    { coding.dRunScheme = Read_Scheme(input);
+      if (coding.dRunScheme == NULL)
+        goto error;
+    }
+  coding.insScheme = Read_Scheme(input);
+  if (coding.insScheme == NULL)
+    goto error;
+  coding.mrgScheme = Read_Scheme(input);
+  if (coding.mrgScheme == NULL)
+    goto error;
+  coding.subScheme = Read_Scheme(input);
+  if (coding.subScheme == NULL)
+    goto error;
+  if (coding.subChar >= 0)
+    { coding.sRunScheme = Read_Scheme(input);
+      if (coding.sRunScheme == NULL)
+        goto error;
+    }
+
+  return (&coding);
+
+  //  Failure: release everything read so far (free of a NULL pointer does nothing)
+
+error:
+  free(coding.delScheme);
+  free(coding.dRunScheme);
+  free(coding.insScheme);
+  free(coding.mrgScheme);
+  free(coding.subScheme);
+  free(coding.sRunScheme);
+  free(coding.prefix);
+  coding.prefix = NULL;
+  EXIT(NULL);
+}
+  //  Free all the auxiliary storage associated with the encoding argument: the schemes of
+  //    the four streams, the header prefix, and each run-length scheme when the matching
+  //    run char is set.  The QVcoding object itself is not freed.
+
+void Free_QVcoding(QVcoding *coding)
+{ int hasDelRun, hasSubRun;
+
+  hasDelRun = (coding->delChar >= 0);
+  hasSubRun = (coding->subChar >= 0);
+
+  if (hasSubRun)
+    free(coding->sRunScheme);
+  if (hasDelRun)
+    free(coding->dRunScheme);
+
+  free(coding->delScheme);
+  free(coding->insScheme);
+  free(coding->mrgScheme);
+  free(coding->subScheme);
+  free(coding->prefix);
+}
+ *
+ * Encode/Decode (w.r.t. coding) next entry from input and write to output
+ *
+ ********************************************************************************************/
+int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy) // returns the entry length; errors end in EXIT(-1)
+{ int rlen, clen; // rlen = length of each line, clen = # of deletion tags actually stored
+ // Get all 5 streams, compress each with its scheme, and output in the order: deletion QVs, deletion tags, insertion, merge, substitution
+ rlen = Read_Lines(input,5);
+ if (rlen < 0)
+ { if (rlen == -1)
+ EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline);
+ EXIT (-1);
+ }
+ if (coding->delChar < 0) // no run char: every deletion QV is coded and every tag is kept
+ { Encode(coding->delScheme, output, (uint8 *) Read, rlen);
+ clen = rlen;
+ }
+ else // run char: run-length code the QVs and keep only tags at non-run positions
+ { Encode_Run(coding->delScheme, coding->dRunScheme, output,
+ (uint8 *) Read, rlen, coding->delChar);
+ clen = Pack_Tag(Read+Rmax,Read,rlen,coding->delChar);
+ }
+ Number_Read(Read+Rmax); // NOTE(review): presumably maps the tag letters to numeric codes - confirm in DB.c
+ Compress_Read(clen,Read+Rmax); // NOTE(review): presumably packs the codes into COMPRESSED_LEN(clen) bytes - confirm in DB.c
+ fwrite(Read+Rmax,1,COMPRESSED_LEN(clen),output);
+ if (lossy) // clear the low bit of each insertion QV and the low 2 bits of each merge QV
+ { uint8 *insert = (uint8 *) (Read+2*Rmax);
+ uint8 *merge = (uint8 *) (Read+3*Rmax);
+ int k;
+ for (k = 0; k < rlen; k++)
+ { insert[k] = (uint8) ((insert[k] >> 1) << 1);
+ merge[k] = (uint8) (( merge[k] >> 2) << 2);
+ }
+ }
+ Encode(coding->insScheme, output, (uint8 *) (Read+2*Rmax), rlen);
+ Encode(coding->mrgScheme, output, (uint8 *) (Read+3*Rmax), rlen);
+ if (coding->subChar < 0) // substitution QVs: plain or run-length coded, as for deletions
+ Encode(coding->subScheme, output, (uint8 *) (Read+4*Rmax), rlen);
+ else
+ Encode_Run(coding->subScheme, coding->sRunScheme, output,
+ (uint8 *) (Read+4*Rmax), rlen, coding->subChar);
+ return (rlen);
+  //  Decode the 5 streams of the next entry from input into entry[0..4], each of length
+  //    rlen: entry[0] deletion QVs, entry[1] deletion tags, entry[2] insertion QVs,
+  //    entry[3] merge QVs, and entry[4] substitution QVs.  Returns 0 on success; any read
+  //    or decode failure ends in EXIT(1).
+
+int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen)
+{ int clen, tlen;
+
+  //  Deletion QVs: plain or run-length coded.  With a run char only the tags at non-run
+  //    positions were stored, so count them.
+
+  if (coding->delChar < 0)
+    { if (Decode(coding->delScheme, input, entry[0], rlen))
+        EXIT(1);
+      clen = rlen;
+    }
+  else
+    { if (Decode_Run(coding->delScheme, coding->dRunScheme, input,
+                     entry[0], rlen, coding->delChar))
+        EXIT(1);
+      clen = Packed_Length(entry[0],rlen,coding->delChar);
+    }
+
+  //  Deletion tags: read the packed bytes and expand them to clen lower-case letters
+
+  tlen = COMPRESSED_LEN(clen);
+  if (tlen > 0)
+    { if (fread(entry[1],tlen,1,input) != 1)
+        { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry)\n");
+          EXIT(1);
+        }
+    }
+  Uncompress_Read(clen,entry[1]);
+  Lower_Read(entry[1]);
+  if (coding->delChar >= 0)
+    Unpack_Tag(entry[1],clen,entry[0],rlen,coding->delChar);
+
+  //  Insertion, merge, and substitution QVs
+
+  if (Decode(coding->insScheme, input, entry[2], rlen))
+    EXIT(1);
+
+  if (Decode(coding->mrgScheme, input, entry[3], rlen))
+    EXIT(1);
+
+  if (coding->subChar < 0)
+    { if (Decode(coding->subScheme, input, entry[4], rlen))
+        EXIT(1);
+    }
+  else
+    { if (Decode_Run(coding->subScheme, coding->sRunScheme, input,
+                     entry[4], rlen, coding->subChar))
+        EXIT(1);
+    }
+
+  return (0);
+}
diff --git a/QV.h b/QV.h
new file mode 100644
index 0000000..532b2f4
--- /dev/null
+++ b/QV.h
@@ -0,0 +1,96 @@
+ *
+ * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on
+ * the histogram of values occurring in a given file. The two low complexity streams
+ * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevalent
+ * character.
+ *
+ * Author: Gene Myers
+ * Date: Jan 18, 2014
+ * Modified: July 25, 2014
+ *
+ ********************************************************************************************/
+#include <stdio.h>
+ // The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or
+ // batch version of the routines in this library are compiled. In batch mode, routines
+ // print an error message and exit. In interactive mode, the routines place the error
+ // message in EPLACE (also defined in DB.h) and return an error value, typically NULL
+ // if the routine returns a pointer, and an unusual integer value if the routine returns
+ // an integer.
+ // Below when an error return is described, one should understand that this value is returned
+ // only if the routine was compiled in INTERACTIVE mode.
+ // A PacBio compression scheme, as produced by Create_QVcoding or Read_QVcoding
+typedef struct
+ { void *delScheme; // Huffman scheme for deletion QVs
+ void *insScheme; // Huffman scheme for insertion QVs
+ void *mrgScheme; // Huffman scheme for merge QVs
+ void *subScheme; // Huffman scheme for substitution QVs
+ void *dRunScheme; // Huffman scheme for deletion run lengths (if delChar >= 0)
+ void *sRunScheme; // Huffman scheme for substitution run lengths (if subChar >= 0)
+ int delChar; // If >= 0, run-encoded deletion value; -1 if there is none
+ int subChar; // If >= 0, run-encoded substitution value; -1 if there is none
+ int flip; // Need to flip multi-byte integers
+ char *prefix; // Header line prefix
+ } QVcoding;
+ // Read the next nlines of input, and QVentry returns a pointer to the first line if needed.
+ // If end-of-input is encountered before any further input, -1 is returned. If there is
+ // an error then -2 is returned. Otherwise the length of the line(s) read is returned.
+int Read_Lines(FILE *input, int nlines);
+char *QVentry();
+ // Get and set the line counter for error reporting
+void Set_QV_Line(int line);
+int Get_QV_Line();
+ // Read up to the next num entries or until eof from the .quiva file on input and record
+ // frequency statistics. Copy these entries to the temporary file temp if != NULL.
+ // If there is an error then -1 is returned, otherwise the number of entries read.
+int QVcoding_Scan(FILE *input, int num, FILE *temp);
+ // Given QVcoding_Scan has been called at least once, create an encoding scheme based on
+ // the accumulated statistics and return a pointer to it. The returned encoding object
+ // is *statically allocated within the routine. If lossy is set then use a lossy scaling
+ // for the insertion and merge streams. If there is an error, then NULL is returned.
+QVcoding *Create_QVcoding(int lossy);
+ // Read/write a coding scheme to input/output. The encoding object returned by the reader
+ // is *statically* allocated within the routine. If an error occurs while reading then
+ // NULL is returned.
+QVcoding *Read_QVcoding(FILE *input);
+void Write_QVcoding(FILE *output, QVcoding *coding);
+ // Free all the auxiliary storage associated with coding (but not the object itself!)
+void Free_QVcoding(QVcoding *coding);
+ // Assuming the file pointer is positioned just beyond an entry header line, read the
+ // next set of 5 QV lines, compress them according to 'coding', and output. If lossy
+ // is set then the scheme is a lossy one. A negative value is returned if an error
+ // occurred, and the sequence length otherwise.
+int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy);
+ // Assuming the input is positioned just beyond the compressed encoding of an entry header,
+ // read the set of compressed encodings for the ensuing 5 QV vectors, decompress them,
+ // and place their decompressed values into entry which is a 5 element array of character
+ // pointers. The parameter rlen, computed from the preceding header line, critically
+ // provides the length of each of the 5 vectors. A non-zero value is returned only if an
+ // error occurred.
+int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen);
+#endif // _QV_COMPRESSOR
diff --git a/README b/README
new file mode 100644
index 0000000..b68a35f
--- /dev/null
+++ b/README
@@ -0,0 +1,47 @@
+*** PLEASE GO TO THE DAZZLER BLOG (https://dazzlerblog.wordpress.com) FOR TYPESET ***
+ The Dazzler Scrubbing Suite: DAS SCRUBBER
+ Author: Gene Myers
+ First: October 12, 2014
+ Recent: March 27, 2016
+ The goal of scrubbing is to produce a set of edited reads that are guaranteed to
+(a) be continuous stretches of the underlying genome (i.e. no unremoved adapters
+and no chimers), and (b) have no very low quality stretches (i.e. the error rate
+never exceeds some reasonable maximum, 20% or so in the case of Pacbio data). The
+secondary goal of scrubbing is to do so with the minimum removal of data and splitting
+of reads.
+ The "DAS" suite will consist of a pipeline of several programs that will accomplish
+the task of scrubbing. At this time, we are releasing the first program which assigns
+intrinsic quality values to every trace point interval of a read.
+1. DASqv [-v] -c<int> <source:db> <overlaps:las>
+ DASqv takes as input a database <source> and the local alignments, <overlaps>, for
+said database or a block thereof. Note carefully that <source> must always refer to
+the entire DB, only <overlaps> can involve a block number.
+ Using the local alignment-pile for each A-read, DASqv produces a QV value for each
+complete segment of TRACE_SPACING bases (e.g. 100bp, the -s parameter to daligner).
+The quality value is the average percentile of the best 25-50% of the alignment matches
+covering the segment, depending on the coverage estimate -c. One must set the -c parameter
+to the expected coverage of the genome in question. All quality values over 50 are
+clipped to 50.
+ The quality values are written to a .qual track, that can be viewed by calling
+DBdump with the -i option set ("i" for "intrinsic QV").
+ The -v option prints out a histogram of the segment alignment matches, and of the quality
+values produced. This histogram is useful in assessing, for a given data set, what
+constitutes the thresholds -g and -b, to be used by downstream commands, for what is
+definitely a good segment and what is definitely a bad segment.
+2. DAStrim -- soon !
diff --git a/align.c b/align.c
new file mode 100644
index 0000000..82de72a
--- /dev/null
+++ b/align.c
@@ -0,0 +1,5132 @@
+ *
+ * Fast alignment discovery and trace generation along with utilities for displaying alignments
+ * Based on previously unpublished ideas from 2005, subsequently refined in 2013-14. Basic
+ * idea is to keep a dynamically selected interval of the f.r. waves from my 1986 O(nd) paper.
+ * A recent cool idea is to not record all the details of an alignment while discovering it
+ * but simply record trace points through which the optimal alignment passes every 100bp,
+ * allowing rapid recomputation of the alignment details between trace points.
+ *
+ * Author : Gene Myers
+ * First : June 2013
+ * Current: June 1, 2014
+ *
+ ********************************************************************************************/
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include <limits.h>
+#include "DB.h"
+#include "align.h"
+#undef DEBUG_PASSES // Show forward / backward extension termini for Local_Alignment
+#undef DEBUG_POINTS // Show trace points
+#undef DEBUG_WAVE // Show waves of Local_Alignment
+#undef SHOW_MATCH_WAVE // For waves of Local_Alignment also show # of matches
+#undef SHOW_TRAIL // Show trace at the end of forward and reverse passes
+#undef SHOW_TPS // Show trace points as they are encountered in a wave
+#undef DEBUG_EXTEND // Show waves of Extend_Until_Overlap
+#undef DEBUG_ALIGN // Show division points of Compute_Trace
+#undef DEBUG_SCRIPT // Show trace additions for Compute_Trace
+#undef DEBUG_AWAVE // Show F/R waves of Compute_Trace
+#undef SHOW_TRACE // Show full trace for Print_Alignment
+#undef WAVE_STATS
+* *
+* Working Storage Abstraction *
+* *
+typedef struct // Hidden from the user, working space for each thread
+ { int vecmax; // current size of vector, as last passed to Realloc by enlarge_vector
+ void *vector; // DP vector storage
+ int celmax; // current capacity of cells
+ void *cells; // trace cells (used as an array of Pebble by forward_wave)
+ int pntmax; // current size of points, as last passed to Realloc by enlarge_points
+ void *points; // point vector storage
+ int tramax; // current size of trace, as last passed to Realloc by enlarge_trace
+ void *trace; // trace vector storage
+ } _Work_Data;
+  //  Allocate a work data block with all four of its buffers empty (NULL, capacity 0).
+  //    The buffers are grown on demand by the enlarge_* routines.  If the block cannot be
+  //    allocated the routine ends in EXIT(NULL).
+
+Work_Data *New_Work_Data()
+{ _Work_Data *work;
+
+  work = (_Work_Data *) Malloc(sizeof(_Work_Data),"Allocating work data block");
+  if (work == NULL)
+    EXIT(NULL);
+
+  work->vecmax = 0;
+  work->vector = NULL;
+  work->pntmax = 0;
+  work->points = NULL;
+  work->tramax = 0;
+  work->trace  = NULL;
+  work->celmax = 0;
+  work->cells  = NULL;
+
+  return ((Work_Data *) work);
+}
+  //  Grow work->vector to 1.2*newmax + 10000 (in the size units Realloc takes) and record
+  //    the new size in work->vecmax.  Returns 0; a failed Realloc ends in EXIT(1) with the
+  //    old buffer left in place.
+
+static int enlarge_vector(_Work_Data *work, int newmax)
+{ int   limit;
+  void *grown;
+
+  limit = ((int) (newmax*1.2)) + 10000;
+  grown = Realloc(work->vector,limit,"Enlarging DP vector");
+  if (grown == NULL)
+    EXIT(1);
+
+  work->vector = grown;
+  work->vecmax = limit;
+  return (0);
+}
+  //  Grow work->points to 1.2*newmax + 10000 (in the size units Realloc takes) and record
+  //    the new size in work->pntmax.  Returns 0; a failed Realloc ends in EXIT(1) with the
+  //    old buffer left in place.
+
+static int enlarge_points(_Work_Data *work, int newmax)
+{ int   limit;
+  void *grown;
+
+  limit = ((int) (newmax*1.2)) + 10000;
+  grown = Realloc(work->points,limit,"Enlarging point vector");
+  if (grown == NULL)
+    EXIT(1);
+
+  work->points = grown;
+  work->pntmax = limit;
+  return (0);
+}
+  //  Grow work->trace to 1.2*newmax + 10000 (in the size units Realloc takes) and record
+  //    the new size in work->tramax.  Returns 0; a failed Realloc ends in EXIT(1) with the
+  //    old buffer left in place.
+
+static int enlarge_trace(_Work_Data *work, int newmax)
+{ int   limit;
+  void *grown;
+
+  limit = ((int) (newmax*1.2)) + 10000;
+  grown = Realloc(work->trace,limit,"Enlarging trace vector");
+  if (grown == NULL)
+    EXIT(1);
+
+  work->trace  = grown;
+  work->tramax = limit;
+  return (0);
+}
+  //  Release a work data block together with whichever of its four buffers were allocated
+
+void Free_Work_Data(Work_Data *ework)
+{ _Work_Data *block = (_Work_Data *) ework;
+
+  //  free of a NULL pointer does nothing, so buffers that were never grown need no test
+
+  free(block->points);
+  free(block->trace);
+  free(block->cells);
+  free(block->vector);
+  free(block);
+}
+* *
+* *
+ // Absolute/Fixed Parameters
+#define BVEC uint64 // Can be uint32 if PATH_LEN <= 32
+#define TRIM_LEN 15 // Report as the tip, the last wave maximum for which the last
+ // 2*TRIM_LEN edits are prefix-positive at rate ave_corr*f(bias)
+ // (max value is 20)
+#define PATH_LEN 60 // Follow the last PATH_LEN columns/edges (max value is 63)
+ // Derivative fixed parameters
+#define PATH_TOP 0x1000000000000000ll // Must be 1 << PATH_LEN
+#define PATH_INT 0x0fffffffffffffffll // Must be PATH_TOP-1
+#define TRIM_MASK 0x7fff // Must be (1 << TRIM_LEN) - 1
+#define TRIM_MLAG 200 // How far can last trim point be behind best point
+#define WAVE_LAG 30 // How far can worst point be behind the best point
+static double Bias_Factor[10] = { .690, .690, .690, .690, .780,
+ .850, .900, .933, .966, 1.000 };
+ // Adjustable parameters, as set up by New_Align_Spec
+typedef struct
+ { double ave_corr; // ave_corr argument of New_Align_Spec
+ int trace_space; // trace_space argument of New_Align_Spec
+ float freq[4]; // copy of the 4 values in the freq argument of New_Align_Spec
+ int ave_path; // PATH_LEN * (1 - Bias_Factor[bias] * (1 - ave_corr)), truncated to int
+ int16 *score; // score table filled by set_table; start of the block that Free_Align_Spec frees
+ int16 *table; // second table filled by set_table; points TRIM_MASK+1 entries into score's block
+ } _Align_Spec;
+/* Fill in bit table: TABLE[x] = 1 iff the alignment modeled by x (1 = match, 0 = mismatch)
+ has a non-negative score for every suffix of the alignment under the scoring scheme
+ where match = MATCH and mismatch = -1. MATCH is set so that an alignment with TRIM_PCT
+ matches has zero score ( (1-TRIM_PCT) / TRIM_PCT ). */
+#define FRACTION 1000 // Implicit fractional part of scores, i.e. score = x/FRACTION
+typedef struct // Parameters handed down the recursion of set_table
+ { int mscore; // amount added to the score for a 1-bit (match)
+ int dscore; // amount subtracted from the score for a 0-bit (mismatch)
+ int16 *table; // receives score minus the running maximum, per bit pattern
+ int16 *score; // receives the final score, per bit pattern
+ } Table_Bits;
+  //  Recursively enumerate every TRIM_LEN-bit pattern.  prefix holds the bits chosen so
+  //    far (bit levels 0..bit-1), score their score, and max the largest score seen at any
+  //    earlier level.  Appending a 0 subtracts dscore and appending a 1 adds mscore.  When
+  //    all TRIM_LEN bits are chosen, score[prefix] receives the final score and
+  //    table[prefix] the final score minus max.
+
+static void set_table(int bit, int prefix, int score, int max, Table_Bits *parms)
+{ int peak;
+
+  if (bit >= TRIM_LEN)
+    { parms->score[prefix] = (int16) score;
+      parms->table[prefix] = (int16) (score-max);
+      return;
+    }
+
+  peak = (score > max) ? score : max;
+  set_table(bit+1,(prefix<<1),    score - parms->dscore,peak,parms);
+  set_table(bit+1,(prefix<<1) | 1,score + parms->mscore,peak,parms);
+}
+/* Create an alignment specification record including path tip tables & values.
+   ave_corr, trace_space and the 4 values of freq are copied into the record.  The sum
+   freq[0]+freq[3], folded to be at most .5, selects the entry of Bias_Factor that scales
+   ave_path and the match score of the trim tables; a sum below .2 is capped, with a
+   warning on stderr.  If either allocation fails, whatever was allocated is freed and
+   the routine ends in EXIT(NULL).                                                        */
+
+Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq)
+{ _Align_Spec *spec;
+  Table_Bits   parms;
+  double       match;
+  int          bias;
+
+  spec = (_Align_Spec *) Malloc(sizeof(_Align_Spec),"Allocating alignment specification");
+  if (spec == NULL)
+    EXIT(NULL);
+
+  spec->ave_corr    = ave_corr;
+  spec->trace_space = trace_space;
+  spec->freq[0]     = freq[0];
+  spec->freq[1]     = freq[1];
+  spec->freq[2]     = freq[2];
+  spec->freq[3]     = freq[3];
+
+  match = freq[0] + freq[3];
+  if (match > .5)
+    match = 1.-match;
+  bias = (int) ((match+.025)*20.-1.);
+  if (match < .2)
+    { fprintf(stderr,"Warning: Base bias worse than 80/20%% ! (New_Align_Spec)\n");
+      fprintf(stderr," Capping bias at this ratio.\n");
+      bias = 3;
+    }
+
+  spec->ave_path = (int) (PATH_LEN * (1. - Bias_Factor[bias] * (1. - ave_corr)));
+  parms.mscore   = (int) (FRACTION * Bias_Factor[bias] * (1. - ave_corr));
+  parms.dscore   = FRACTION - parms.mscore;
+
+  //  One block holds both tables: score first, table right behind it
+
+  parms.score = (int16 *) Malloc(sizeof(int16)*(TRIM_MASK+1)*2,"Allocating trim table");
+  if (parms.score == NULL)
+    { free(spec);
+      EXIT(NULL);
+    }
+  parms.table = parms.score + (TRIM_MASK+1);
+
+  set_table(0,0,0,0,&parms);
+
+  spec->table = parms.table;
+  spec->score = parms.score;
+
+  return ((Align_Spec *) spec);
+}
+  //  Release an alignment specification and its trim tables.  Only score is freed because
+  //    table points into the same allocation.
+
+void Free_Align_Spec(Align_Spec *espec)
+{ _Align_Spec *record;
+
+  record = (_Align_Spec *) espec;
+  free(record->score);
+  free(record);
+}
+  //  Return the ave_corr value stored in the specification
+
+double Average_Correlation(Align_Spec *espec)
+{ _Align_Spec *record = (_Align_Spec *) espec;
+
+  return (record->ave_corr);
+}
+  //  Return the trace_space value stored in the specification
+
+int Trace_Spacing(Align_Spec *espec)
+{ _Align_Spec *record = (_Align_Spec *) espec;
+
+  return (record->trace_space);
+}
+  //  Return a pointer to the 4 frequencies stored inside the specification (not a copy)
+
+float *Base_Frequencies(Align_Spec *espec)
+{ _Align_Spec *record = (_Align_Spec *) espec;
+
+  return (record->freq);
+}
+* *
+* LOCAL ALIGNMENT FINDER: forward_/reverse_wave and Local_Alignment *
+* *
+#ifdef WAVE_STATS
+static int64 MAX, TOT, NWV;
+static int64 RESTARTS;
+void Init_Stats() // Reset the wave statistics counters (compiled only under WAVE_STATS)
+{ MAX = TOT = NWV = 0; // NOTE(review): RESTARTS is printed by Print_Stats but is not reset here - confirm that is intended
+void Print_Stats() // Print the wave statistics counters to stdout
+{ printf("\nMax = %lld Ave = %.1f # = %lld\n",MAX,(1.*TOT)/NWV,NWV); // the average is TOT/NWV, computed in floating point
+ printf("\nRestarts = %lld\n",RESTARTS);
+#ifdef DEBUG_WAVE
+static void print_wave(int *V, int *M, int low, int hgh, int besta)
+{ int k, bestk;
+ (void) M;
+ printf(" [%6d,%6d]: ",low,hgh);
+ for (k = low; k <= hgh; k++)
+ { if (besta == V[k])
+ bestk = k;
+ // printf(" %3d",(V[k]+k)/2);
+ printf(" %3d",besta-V[k]);
+ }
+ printf(" : %d (%d,%d)\n",besta,(besta+bestk)/2,(besta-bestk)/2);
+ printf(" ");
+ for (k = low; k <= hgh; k++)
+ printf(" %3d",M[k]);
+ printf("\n");
+ fflush(stdout);
+/* At each furthest reaching point, keep a-coordinate of point (V), bitvector
+ recording the last TRIM_LEN columns of the implied alignment (T), and the
+ # of matches (1-bits) in the bitvector (M). */
+typedef struct // One trace cell; forward_wave keeps these in the work->cells array
+ { int ptr; // set to -1 for the cells of the initial wave; presumably the index of a predecessor cell - confirm
+ int diag; // diagonal k the cell was created on
+ int diff; // set to 0 for the cells of the initial wave; presumably a difference count - confirm
+ int mark; // set from na when the cell is created; presumably the next trace point coordinate - confirm
+ } Pebble;
+static int VectorEl = 6*sizeof(int) + sizeof(BVEC);
+/* Forward pass of the local alignment finder.
+
+   Starting on anti-diagonal mida, furthest-reaching points are kept for every diagonal in
+   [*mind,maxd] and pushed forward one wave per additional difference.  The band may grow by
+   one diagonal on each side per wave but never beyond [minp,maxp].  Extension stops when a
+   snake runs into the code 4 in either sequence (recorded through aclip/bclip; presumably
+   the end-of-read marker) or when the last point whose recent window holds at least
+   PATH_AVE matches lags more than TRIM_MLAG behind the best point.
+
+   Per diagonal k the wave keeps: V[k] the anti-diagonal reached, T[k] the bit-vector of
+   recent columns, M[k] the number of 1-bits in it, HA[k]/HB[k] the index of the last A/B
+   pebble on the path, and NA[k]/NB[k] the next A/B trace-point coordinate to be crossed.
+
+   On return apath holds the end point (aepos,bepos), diffs, tlen and its trace of
+   (diffs,length) pairs; bpath holds the matching B-side values (begin coordinates when the
+   alignment is complemented, end coordinates otherwise).  *mind is set to the diagonal on
+   which the selected path left the mid-line.  Returns 0, or exits through EXIT(1) when the
+   work arrays cannot be enlarged.                                                           */
+static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath,
+                        int *mind, int maxd, int mida, int minp, int maxp)
+{ char *aseq  = align->aseq;
+  char *bseq  = align->bseq;
+  Path *apath = align->path;
+
+  int     hgh, low, dif;
+  int     vlen, vmin, vmax;
+  int    *V, *M;
+  int    *_V, *_M;
+  BVEC   *T;
+  BVEC   *_T;
+
+  int    *HA, *HB;
+  int    *_HA, *_HB;
+  int    *NA, *NB;
+  int    *_NA, *_NB;
+  Pebble *cells;
+  int     avail, cmax, boff;
+
+  int     TRACE_SPACE = spec->trace_space;
+  int     PATH_AVE    = spec->ave_path;
+  int16  *SCORE       = spec->score;
+  int16  *TABLE       = spec->table;
+
+  int     besta, besty;
+  int     trima, trimy, trimd;
+  int     trimha, trimhb;
+  int     morea, morey, mored;
+  int     moreha, morehb;
+  int     more, morem, lasta;
+  int     aclip, bclip;
+
+  hgh = maxd;
+  low = *mind;
+  dif = 0;
+
+  //  Carve work->vector into the seven per-diagonal arrays, centering [low,hgh] in each
+
+  { int span, wing;
+
+    span = (hgh-low)+1;
+    vlen = work->vecmax/VectorEl;
+    wing = (vlen - span)/2;
+    vmin = low - wing;
+    vmax = hgh + wing;
+
+    _V  = ((int *) work->vector);
+    _M  = _V + vlen;
+    _HA = _M + vlen;
+    _HB = _HA + vlen;
+    _NA = _HB + vlen;
+    _NB = _NA + vlen;
+    _T  = ((BVEC *) (_NB + vlen));
+
+    V  = _V-vmin;
+    M  = _M-vmin;
+    HA = _HA-vmin;
+    HB = _HB-vmin;
+    NA = _NA-vmin;
+    NB = _NB-vmin;
+    T  = _T-vmin;
+
+    cells = (Pebble *) (work->cells);
+    cmax  = work->celmax;
+    avail = 0;
+
+    if (COMP(align->flags))
+      boff = align->blen % TRACE_SPACE;
+    else
+      boff = 0;
+  }
+
+  /* Compute 0-wave starting from mid-line */
+
+  more  = 1;
+  aclip =  INT32_MAX;
+  bclip = -INT32_MAX;
+
+  besta  = trima  = morea = lasta = mida;
+  besty  = trimy  = morey = (mida-hgh) >> 1;
+  trimd  = mored  = 0;
+  trimha = moreha = 0;
+  trimhb = morehb = 1;
+  morem  = -1;
+
+  { int   k;
+    char *a;
+
+    a = aseq + hgh;
+    for (k = hgh; k >= low; k--)
+      { int     y, c, d;
+        int     ha, hb;
+        int     na, nb;
+        Pebble *pb;
+
+        y = (mida-k) >> 1;
+
+        if (avail >= cmax-1)
+          { cmax  = ((int) (avail*1.2)) + 10000;
+            cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
+            if (cells == NULL)
+              EXIT(1);
+            work->celmax = cmax;
+            work->cells  = (void *) cells;
+          }
+
+        //  A trace points are the multiples of TRACE_SPACE in the A coordinate y+k:
+        //    start the A chain at the trace point at or before the mid-line point
+
+        na = ((y+k)/TRACE_SPACE)*TRACE_SPACE;
+#ifdef SHOW_TPS
+        printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout);
+#endif
+        pb = cells+avail;
+        pb->ptr  = -1;
+        pb->diag = k;
+        pb->diff = 0;
+        pb->mark = na;
+        ha  = avail++;
+        na += TRACE_SPACE;
+
+        //  B trace points are offset by boff in the B coordinate y
+
+        nb = ((y+(TRACE_SPACE-boff))/TRACE_SPACE-1)*TRACE_SPACE+boff;
+#ifdef SHOW_TPS
+        printf(" B %d: %d,%d,0,%d\n",avail,-1,k,nb); fflush(stdout);
+#endif
+        pb = cells+avail;
+        pb->ptr  = -1;
+        pb->diag = k;
+        pb->diff = 0;
+        pb->mark = nb;
+        hb  = avail++;
+        nb += TRACE_SPACE;
+
+        //  Slide down the diagonal while the sequences match
+
+        while (1)
+          { c = bseq[y];
+            if (c == 4)
+              { more = 0;
+                if (bclip < k)
+                  bclip = k;
+                break;
+              }
+            d = a[y];
+            if (c != d)
+              { if (d == 4)
+                  { more  = 0;
+                    aclip = k;
+                  }
+                break;
+              }
+            y += 1;
+          }
+        c = (y << 1) + k;
+
+        //  Drop a pebble for every trace point the snake crossed
+
+        while (y+k >= na)
+          { if (avail >= cmax)
+              { cmax  = ((int) (avail*1.2)) + 10000;
+                cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
+                if (cells == NULL)
+                  EXIT(1);
+                work->celmax = cmax;
+                work->cells  = (void *) cells;
+              }
+#ifdef SHOW_TPS
+            printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout);
+#endif
+            pb = cells+avail;
+            pb->ptr  = ha;
+            pb->diag = k;
+            pb->diff = 0;
+            pb->mark = na;
+            ha  = avail++;
+            na += TRACE_SPACE;
+          }
+        while (y >= nb)
+          { if (avail >= cmax)
+              { cmax  = ((int) (avail*1.2)) + 10000;
+                cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
+                if (cells == NULL)
+                  EXIT(1);
+                work->celmax = cmax;
+                work->cells  = (void *) cells;
+              }
+#ifdef SHOW_TPS
+            printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout);
+#endif
+            pb = cells+avail;
+            pb->ptr  = hb;
+            pb->diag = k;
+            pb->diff = 0;
+            pb->mark = nb;
+            hb  = avail++;
+            nb += TRACE_SPACE;
+          }
+
+        if (c > besta)
+          { besta  = trima = lasta = c;
+            besty  = trimy = y;
+            trimha = ha;
+            trimhb = hb;
+          }
+
+        V[k]  = c;
+        T[k]  = PATH_INT;
+        M[k]  = PATH_LEN;
+        HA[k] = ha;
+        HB[k] = hb;
+        NA[k] = na;
+        NB[k] = nb;
+
+        a -= 1;
+      }
+  }
+
+  //  Some diagonal hit a boundary: clip the band there and remember the best clipped point
+
+  if (more == 0)
+    { if (bseq[besty] != 4 && aseq[besta - besty] != 4)
+        more = 1;
+      if (hgh >= aclip)
+        { hgh = aclip-1;
+          if (morem <= M[aclip])
+            { morem  = M[aclip];
+              morea  = V[aclip];
+              morey  = (morea - aclip)/2;
+              moreha = HA[aclip];
+              morehb = HB[aclip];
+            }
+        }
+      if (low <= bclip)
+        { low = bclip+1;
+          if (morem <= M[bclip])
+            { morem  = M[bclip];
+              morea  = V[bclip];
+              morey  = (morea - bclip)/2;
+              moreha = HA[bclip];
+              morehb = HB[bclip];
+            }
+        }
+      aclip =  INT32_MAX;
+      bclip = -INT32_MAX;
+    }
+
+#ifdef DEBUG_WAVE
+  printf("\nFORWARD WAVE:\n");
+  print_wave(V,M,low,hgh,besta);
+#endif
+
+  /* Compute successive waves until no furthest reaching points remain */
+
+  while (more && lasta >= besta - TRIM_MLAG)
+    { int     k, n;
+      int     ua, ub;
+      BVEC    t;
+      int     am, ac, ap;
+      char   *a;
+
+      low -= 1;
+      hgh += 1;
+
+      //  Band reached the edge of the arrays: enlarge if nearly full, then re-center.
+      //    The copy order depends on the direction each array moves so that no array
+      //    overwrites a neighbor that has not been moved yet.
+
+      if (low <= vmin || hgh >= vmax)
+        { int   span, wing;
+          int64 move;
+          int64 vd, md, had, hbd, nad, nbd, td;
+
+          span = (hgh-low)+1;
+          if (.8*vlen < span)
+            { if (enlarge_vector(work,vlen*VectorEl))
+                EXIT(1);
+
+              move = ((void *) _V) - work->vector;
+              vlen = work->vecmax/VectorEl;
+
+              _V  = (int *) work->vector;
+              _M  = _V + vlen;
+              _HA = _M + vlen;
+              _HB = _HA + vlen;
+              _NA = _HB + vlen;
+              _NB = _NA + vlen;
+              _T  = ((BVEC *) (_NB + vlen));
+            }
+          else
+            move = 0;
+
+          wing = (vlen - span)/2;
+
+          vd  = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move);
+          md  = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move);
+          had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move);
+          hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move);
+          nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move);
+          nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move);
+          td  = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move);
+
+          if (vd < 0)
+            memmove( _V+wing,  ((void *) ( V+low)) - move, span*sizeof(int));
+          if (md < 0)
+            memmove( _M+wing,  ((void *) ( M+low)) - move, span*sizeof(int));
+          if (had < 0)
+            memmove(_HA+wing,  ((void *) (HA+low)) - move, span*sizeof(int));
+          if (hbd < 0)
+            memmove(_HB+wing,  ((void *) (HB+low)) - move, span*sizeof(int));
+          if (nad < 0)
+            memmove(_NA+wing,  ((void *) (NA+low)) - move, span*sizeof(int));
+          if (nbd < 0)
+            memmove(_NB+wing,  ((void *) (NB+low)) - move, span*sizeof(int));
+          if (td < 0)
+            memmove( _T+wing,  ((void *) ( T+low)) - move, span*sizeof(BVEC));
+
+          if (td > 0)
+            memmove( _T+wing,  ((void *) ( T+low)) - move, span*sizeof(BVEC));
+          if (nbd > 0)
+            memmove(_NB+wing,  ((void *) (NB+low)) - move, span*sizeof(int));
+          if (nad > 0)
+            memmove(_NA+wing,  ((void *) (NA+low)) - move, span*sizeof(int));
+          if (hbd > 0)
+            memmove(_HB+wing,  ((void *) (HB+low)) - move, span*sizeof(int));
+          if (had > 0)
+            memmove(_HA+wing,  ((void *) (HA+low)) - move, span*sizeof(int));
+          if (md > 0)
+            memmove( _M+wing,  ((void *) ( M+low)) - move, span*sizeof(int));
+          if (vd > 0)
+            memmove( _V+wing,  ((void *) ( V+low)) - move, span*sizeof(int));
+
+          vmin = low-wing;
+          vmax = hgh+wing;
+
+          V  =  _V-vmin;
+          M  =  _M-vmin;
+          HA = _HA-vmin;
+          HB = _HB-vmin;
+          NA = _NA-vmin;
+          NB = _NB-vmin;
+          T  =  _T-vmin;
+        }
+
+      //  New boundary diagonals inherit the next trace points of their neighbor;
+      //    the band is not allowed to grow past [minp,maxp]
+
+      if (low >= minp)
+        { NA[low] = NA[low+1];
+          NB[low] = NB[low+1];
+          V[low]  = -1;
+        }
+      else
+        low += 1;
+
+      if (hgh <= maxp)
+        { NA[hgh] = NA[hgh-1];
+          NB[hgh] = NB[hgh-1];
+          V[hgh]  = am = -1;
+        }
+      else
+        am = V[--hgh];
+
+      dif += 1;
+
+      ac = V[hgh+1] = V[low-1] = -1;
+      a  = aseq + hgh;
+      t  = PATH_INT;
+      n  = PATH_LEN;
+      ua = ub = -1;
+      for (k = hgh; k >= low; k--)
+        { int     y, m;
+          int     ha, hb;
+          int     c, d;
+          BVEC    b;
+          Pebble *pb;
+
+          //  ap, ac, am = previous-wave values on diagonals k+1, k, k-1.  Take the move
+          //    that reaches furthest and inherit that diagonal's window and pebble chains
+          //    (t, n, ua, ub hold the pre-update values of diagonal k+1).
+
+          ap = ac;
+          ac = am;
+          am = V[d = k-1];
+
+          if (ac < am)
+            if (am < ap)
+              { c  = ap+1;
+                m  = n;
+                b  = t;
+                ha = ua;
+                hb = ub;
+              }
+            else
+              { c  = am+1;
+                m  = M[d];
+                b  = T[d];
+                ha = HA[d];
+                hb = HB[d];
+              }
+          else
+            if (ac < ap)
+              { c  = ap+1;
+                m  = n;
+                b  = t;
+                ha = ua;
+                hb = ub;
+              }
+            else
+              { c  = ac+2;
+                m  = M[k];
+                b  = T[k];
+                ha = HA[k];
+                hb = HB[k];
+              }
+
+          //  Shift a 0 (the difference) into the window, dropping its oldest column
+
+          if ((b & PATH_TOP) != 0)
+            m -= 1;
+          b <<= 1;
+
+          y = (c-k) >> 1;
+          while (1)
+            { c = bseq[y];
+              if (c == 4)
+                { more = 0;
+                  if (bclip < k)
+                    bclip = k;
+                  break;
+                }
+              d = a[y];
+              if (c != d)
+                { if (d == 4)
+                    { more  = 0;
+                      aclip = k;
+                    }
+                  break;
+                }
+              y += 1;
+              if ((b & PATH_TOP) == 0)
+                m += 1;
+              b = (b << 1) | 1;
+            }
+          c = (y << 1) + k;
+
+          //  Cross trace points.  NA[k]/NB[k] may have been inherited from a neighbor, so a
+          //    pebble is only dropped when the chain does not already carry that mark; the
+          //    next trace point advances either way.
+
+          while (y+k >= NA[k])
+            { if (cells[ha].mark < NA[k])
+                { if (avail >= cmax)
+                    { cmax  = ((int) (avail*1.2)) + 10000;
+                      cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),
+                                                       "Reallocating trace cells");
+                      if (cells == NULL)
+                        EXIT(1);
+                      work->celmax = cmax;
+                      work->cells  = (void *) cells;
+                    }
+#ifdef SHOW_TPS
+                  printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout);
+#endif
+                  pb = cells+avail;
+                  pb->ptr  = ha;
+                  pb->diag = k;
+                  pb->diff = dif;
+                  pb->mark = NA[k];
+                  ha = avail++;
+                }
+              NA[k] += TRACE_SPACE;
+            }
+
+          while (y >= NB[k])
+            { if (cells[hb].mark < NB[k])
+                { if (avail >= cmax)
+                    { cmax  = ((int) (avail*1.2)) + 10000;
+                      cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),
+                                                       "Reallocating trace cells");
+                      if (cells == NULL)
+                        EXIT(1);
+                      work->celmax = cmax;
+                      work->cells  = (void *) cells;
+                    }
+#ifdef SHOW_TPS
+                  printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout);
+#endif
+                  pb = cells+avail;
+                  pb->ptr  = hb;
+                  pb->diag = k;
+                  pb->diff = dif;
+                  pb->mark = NB[k];
+                  hb = avail++;
+                }
+              NB[k] += TRACE_SPACE;
+            }
+
+          //  New best point; it becomes the trim point only if its recent columns pass
+          //    both the match-count and the TABLE/SCORE tests
+
+          if (c > besta)
+            { besta = c;
+              besty = y;
+              if (m >= PATH_AVE)
+                { lasta = c;
+                  if (TABLE[b & TRIM_MASK] >= 0)
+                    if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0)
+                      { trima  = c;
+                        trimy  = y;
+                        trimd  = dif;
+                        trimha = ha;
+                        trimhb = hb;
+                      }
+                }
+            }
+
+          t  = T[k];
+          n  = M[k];
+          ua = HA[k];
+          ub = HB[k];
+          V[k]  = c;
+          T[k]  = b;
+          M[k]  = m;
+          HA[k] = ha;
+          HB[k] = hb;
+
+          a -= 1;
+        }
+
+      if (more == 0)
+        { if (bseq[besty] != 4 && aseq[besta-besty] != 4)
+            more = 1;
+          if (hgh >= aclip)
+            { hgh = aclip-1;
+              if (morem <= M[aclip])
+                { morem  = M[aclip];
+                  morea  = V[aclip];
+                  morey  = (morea - aclip)/2;
+                  mored  = dif;
+                  moreha = HA[aclip];
+                  morehb = HB[aclip];
+                }
+            }
+          if (low <= bclip)
+            { low = bclip+1;
+              if (morem <= M[bclip])
+                { morem  = M[bclip];
+                  morea  = V[bclip];
+                  morey  = (morea - bclip)/2;
+                  mored  = dif;
+                  moreha = HA[bclip];
+                  morehb = HB[bclip];
+                }
+            }
+          aclip =  INT32_MAX;
+          bclip = -INT32_MAX;
+        }
+
+      //  Drop boundary diagonals that lag more than WAVE_LAG behind the best point
+
+      n = besta - WAVE_LAG;
+      while (hgh >= low)
+        if (V[hgh] < n)
+          hgh -= 1;
+        else
+          { while (V[low] < n)
+              low += 1;
+            break;
+          }
+
+#ifdef WAVE_STATS
+      k = (hgh-low)+1;
+      if (k > MAX)
+        MAX = k;
+      TOT += k;
+      NWV += 1;
+#endif
+
+#ifdef DEBUG_WAVE
+      print_wave(V,M,low,hgh,besta);
+#endif
+    }
+
+  //  Read the traces out of the pebble chains ending at the selected point
+
+  { uint16 *atrace = (uint16 *) apath->trace;
+    uint16 *btrace = (uint16 *) bpath->trace;
+    int     atlen, btlen;
+    int     trimx;
+    int     a, b, k, h;
+    int     d, e;
+
+    //  A point clipped at a sequence boundary takes precedence over the trim point
+
+    if (morem >= 0)
+      { trimx  = morea-morey;
+        trimy  = morey;
+        trimd  = mored;
+        trimha = moreha;
+        trimhb = morehb;
+      }
+    else
+      trimx = trima-trimy;
+
+    atlen = btlen = 0;
+
+    //  Reverse the A chain in place so it can be walked from the mid-line outwards
+
+    a = -1;
+    for (h = trimha; h >= 0; h = b)
+      { b = cells[h].ptr;
+        cells[h].ptr = a;
+        a = h;
+      }
+    h = a;
+
+    k = cells[h].diag;
+    b = (mida-k)/2;
+    e = 0;
+#ifdef SHOW_TRAIL
+    printf("  A path = (%5d,%5d)\n",(mida+k)/2,b); fflush(stdout);
+#endif
+    for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
+      { k = cells[h].diag;
+        a = cells[h].mark - k;
+        d = cells[h].diff;
+        atrace[atlen++] = (uint16) (d-e);     //  differences in the segment
+        atrace[atlen++] = (uint16) (a-b);     //  B-length of the segment
+#ifdef SHOW_TRAIL
+        printf("     %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,a-b); fflush(stdout);
+#endif
+        b = a;
+        e = d;
+      }
+    if (b+k != trimx)
+      { atrace[atlen++] = (uint16) (trimd-e);
+        atrace[atlen++] = (uint16) (trimy-b);
+#ifdef SHOW_TRAIL
+        printf("           (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout);
+#endif
+      }
+    else if (b != trimy)
+      { atrace[atlen-1] = (uint16) (atrace[atlen-1] + (trimy-b));
+        atrace[atlen-2] = (uint16) (atrace[atlen-2] + (trimd-e));
+#ifdef SHOW_TRAIL
+        printf("         @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout);
+#endif
+      }
+
+    //  Same for the B chain; lengths are measured along A
+
+    a = -1;
+    for (h = trimhb; h >= 0; h = b)
+      { b = cells[h].ptr;
+        cells[h].ptr = a;
+        a = h;
+      }
+    h = a;
+
+    k = cells[h].diag;
+    b = (mida+k)/2;
+    e = 0;
+    low = k;           //  diagonal on which the selected path left the mid-line
+#ifdef SHOW_TRAIL
+    printf("  B path = (%5d,%5d)\n",b,(mida-k)/2); fflush(stdout);
+#endif
+    for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
+      { k = cells[h].diag;
+        a = cells[h].mark + k;
+        d = cells[h].diff;
+        btrace[btlen++] = (uint16) (d-e);
+        btrace[btlen++] = (uint16) (a-b);
+#ifdef SHOW_TRAIL
+        printf("     %4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,a-b); fflush(stdout);
+#endif
+        b = a;
+        e = d;
+      }
+    if (b-k != trimy)
+      { btrace[btlen++] = (uint16) (trimd-e);
+        btrace[btlen++] = (uint16) (trimx-b);
+#ifdef SHOW_TRAIL
+        printf("           (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimx-b); fflush(stdout);
+#endif
+      }
+    else if (b != trimx)
+      { btrace[btlen-1] = (uint16) (btrace[btlen-1] + (trimx-b));
+        btrace[btlen-2] = (uint16) (btrace[btlen-2] + (trimd-e));
+#ifdef SHOW_TRAIL
+        printf("         @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimx-b); fflush(stdout);
+#endif
+      }
+
+    apath->aepos = trimx;
+    apath->bepos = trimy;
+    apath->diffs = trimd;
+    apath->tlen  = atlen;
+    if (COMP(align->flags))
+      { bpath->abpos = align->blen - apath->bepos;
+        bpath->bbpos = align->alen - apath->aepos;
+      }
+    else
+      { bpath->aepos = apath->bepos;
+        bpath->bepos = apath->aepos;
+      }
+    bpath->diffs = trimd;
+    bpath->tlen  = btlen;
+  }
+
+  *mind = low;
+  return (0);
+}
+/*** Reverse Wave ***/
+
+/* Reverse pass of the local alignment finder: the mirror image of forward_wave.
+
+   Starting on anti-diagonal mida over diagonals [mind,maxd], furthest-reaching points are
+   pushed towards the start of both sequences (V values decrease), one wave per additional
+   difference, within the diagonal limits [minp,maxp].  Both sequence pointers are biased by
+   -1 so that index y addresses the symbol just before position y.  Termination, clipping
+   at the code 4, and trim-point selection follow the same rules as the forward pass with
+   all comparisons reversed.
+
+   The pass expects apath and bpath to hold the result of the forward pass.  Its trace
+   pairs (diffs,length) are prepended in front of the existing traces (atlen/btlen count
+   downwards from 0); a first partial segment is merged into the first forward pair when
+   one exists.  On return the begin coordinates, diffs, tlen and trace pointers of both
+   paths are updated.  Returns 0, or exits through EXIT(1) when the work arrays cannot be
+   enlarged.                                                                              */
+static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath,
+                        int mind, int maxd, int mida, int minp, int maxp)
+{ char *aseq  = align->aseq - 1;
+  char *bseq  = align->bseq - 1;
+  Path *apath = align->path;
+
+  int     hgh, low, dif;
+  int     vlen, vmin, vmax;
+  int    *V, *M;
+  int    *_V, *_M;
+  BVEC   *T;
+  BVEC   *_T;
+
+  int    *HA, *HB;
+  int    *_HA, *_HB;
+  int    *NA, *NB;
+  int    *_NA, *_NB;
+  Pebble *cells;
+  int     avail, cmax, boff;
+
+  int     TRACE_SPACE = spec->trace_space;
+  int     PATH_AVE    = spec->ave_path;
+  int16  *SCORE       = spec->score;
+  int16  *TABLE       = spec->table;
+
+  int     besta, besty;
+  int     trima, trimy, trimd;
+  int     trimha, trimhb;
+  int     morea, morey, mored;
+  int     moreha, morehb;
+  int     more, morem, lasta;
+  int     aclip, bclip;
+
+  hgh = maxd;
+  low = mind;
+  dif = 0;
+
+  //  Carve work->vector into the seven per-diagonal arrays, centering [low,hgh] in each
+
+  { int span, wing;
+
+    span = (hgh-low)+1;
+    vlen = work->vecmax/VectorEl;
+    wing = (vlen - span)/2;
+    vmin = low - wing;
+    vmax = hgh + wing;
+
+    _V  = ((int *) work->vector);
+    _M  = _V + vlen;
+    _HA = _M + vlen;
+    _HB = _HA + vlen;
+    _NA = _HB + vlen;
+    _NB = _NA + vlen;
+    _T  = ((BVEC *) (_NB + vlen));
+
+    V  = _V-vmin;
+    M  = _M-vmin;
+    HA = _HA-vmin;
+    HB = _HB-vmin;
+    NA = _NA-vmin;
+    NB = _NB-vmin;
+    T  = _T-vmin;
+
+    cells = (Pebble *) (work->cells);
+    cmax  = work->celmax;
+    avail = 0;
+
+    if (COMP(align->flags))
+      boff = align->blen % TRACE_SPACE;
+    else
+      boff = 0;
+  }
+
+  /* Compute 0-wave starting from mid-line */
+
+  more  = 1;
+  aclip = -INT32_MAX;
+  bclip =  INT32_MAX;
+
+  besta  = trima  = morea = lasta = mida;
+  besty  = trimy  = morey = (mida-hgh) >> 1;
+  trimd  = mored  = 0;
+  trimha = moreha = 0;
+  trimhb = morehb = 1;
+  morem  = -1;
+
+  { int   k;
+    char *a;
+
+    a = aseq + low;
+    for (k = low; k <= hgh; k++)
+      { int     y, c, d;
+        int     ha, hb;
+        int     na, nb;
+        Pebble *pb;
+
+        y = (mida-k) >> 1;
+
+        if (avail >= cmax-1)
+          { cmax  = ((int) (avail*1.2)) + 10000;
+            cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
+            if (cells == NULL)
+              EXIT(1);
+            work->celmax = cmax;
+            work->cells  = (void *) cells;
+          }
+
+        //  Next A trace point strictly below the mid-line point (same form as nb below,
+        //    with the A coordinate y+k and no offset).  The first pebble of each chain
+        //    records the mid-line point itself rather than a trace point.
+
+        na = ((y+k+TRACE_SPACE-1)/TRACE_SPACE-1)*TRACE_SPACE;
+#ifdef SHOW_TPS
+        printf(" A %d: -1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout);
+#endif
+        pb = cells+avail;
+        pb->ptr  = -1;
+        pb->diag = k;
+        pb->diff = 0;
+        pb->mark = y+k;
+        ha  = avail++;
+
+        nb = ((y+(TRACE_SPACE-boff)-1)/TRACE_SPACE-1)*TRACE_SPACE+boff;
+#ifdef SHOW_TPS
+        printf(" B %d: -1,%d,0,%d\n",avail,k,nb+TRACE_SPACE); fflush(stdout);
+#endif
+        pb = cells+avail;
+        pb->ptr  = -1;
+        pb->diag = k;
+        pb->diff = 0;
+        pb->mark = y;
+        hb  = avail++;
+
+        //  Slide up the diagonal while the sequences match
+
+        while (1)
+          { c = bseq[y];
+            if (c == 4)
+              { more = 0;
+                if (bclip > k)
+                  bclip = k;
+                break;
+              }
+            d = a[y];
+            if (c != d)
+              { if (d == 4)
+                  { more  = 0;
+                    aclip = k;
+                  }
+                break;
+              }
+            y -= 1;
+          }
+        c = (y << 1) + k;
+
+        //  Drop a pebble for every trace point the snake crossed
+
+        while (y+k <= na)
+          { if (avail >= cmax)
+              { cmax  = ((int) (avail*1.2)) + 10000;
+                cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
+                if (cells == NULL)
+                  EXIT(1);
+                work->celmax = cmax;
+                work->cells  = (void *) cells;
+              }
+#ifdef SHOW_TPS
+            printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout);
+#endif
+            pb = cells+avail;
+            pb->ptr  = ha;
+            pb->diag = k;
+            pb->diff = 0;
+            pb->mark = na;
+            ha  = avail++;
+            na -= TRACE_SPACE;
+          }
+        while (y <= nb)
+          { if (avail >= cmax)
+              { cmax  = ((int) (avail*1.2)) + 10000;
+                cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
+                if (cells == NULL)
+                  EXIT(1);
+                work->celmax = cmax;
+                work->cells  = (void *) cells;
+              }
+#ifdef SHOW_TPS
+            printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout);
+#endif
+            pb = cells+avail;
+            pb->ptr  = hb;
+            pb->diag = k;
+            pb->diff = 0;
+            pb->mark = nb;
+            hb  = avail++;
+            nb -= TRACE_SPACE;
+          }
+
+        if (c < besta)
+          { besta  = trima = lasta = c;
+            besty  = trimy = y;
+            trimha = ha;
+            trimhb = hb;
+          }
+
+        V[k]  = c;
+        T[k]  = PATH_INT;
+        M[k]  = PATH_LEN;
+        HA[k] = ha;
+        HB[k] = hb;
+        NA[k] = na;
+        NB[k] = nb;
+
+        a += 1;
+      }
+  }
+
+  //  Some diagonal hit a boundary: clip the band there and remember the best clipped point
+
+  if (more == 0)
+    { if (bseq[besty] != 4 && aseq[besta - besty] != 4)
+        more = 1;
+      if (low <= aclip)
+        { low = aclip+1;
+          if (morem <= M[aclip])
+            { morem  = M[aclip];
+              morea  = V[aclip];
+              morey  = (morea - aclip)/2;
+              moreha = HA[aclip];
+              morehb = HB[aclip];
+            }
+        }
+      if (hgh >= bclip)
+        { hgh = bclip-1;
+          if (morem <= M[bclip])
+            { morem  = M[bclip];
+              morea  = V[bclip];
+              morey  = (morea - bclip)/2;
+              moreha = HA[bclip];
+              morehb = HB[bclip];
+            }
+        }
+      aclip = -INT32_MAX;
+      bclip =  INT32_MAX;
+    }
+
+#ifdef DEBUG_WAVE
+  printf("\nREVERSE WAVE:\n");
+  print_wave(V,M,low,hgh,besta);
+#endif
+
+  /* Compute successive waves until no furthest reaching points remain */
+
+  while (more && lasta <= besta + TRIM_MLAG)
+    { int     k, n;
+      int     ua, ub;
+      BVEC    t;
+      int     am, ac, ap;
+      char   *a;
+
+      low -= 1;
+      hgh += 1;
+
+      //  Band reached the edge of the arrays: enlarge if nearly full, then re-center.
+      //    The copy order depends on the direction each array moves so that no array
+      //    overwrites a neighbor that has not been moved yet.
+
+      if (low <= vmin || hgh >= vmax)
+        { int   span, wing;
+          int64 move, vd, md, had, hbd, nad, nbd, td;
+
+          span = (hgh-low)+1;
+          if (.8*vlen < span)
+            { if (enlarge_vector(work,vlen*VectorEl))
+                EXIT(1);
+
+              move = ((void *) _V) - work->vector;
+              vlen = work->vecmax/VectorEl;
+
+              _V  = (int *) work->vector;
+              _M  = _V + vlen;
+              _HA = _M + vlen;
+              _HB = _HA + vlen;
+              _NA = _HB + vlen;
+              _NB = _NA + vlen;
+              _T  = ((BVEC *) (_NB + vlen));
+            }
+          else
+            move = 0;
+
+          wing = (vlen - span)/2;
+
+          vd  = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move);
+          md  = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move);
+          had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move);
+          hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move);
+          nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move);
+          nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move);
+          td  = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move);
+
+          if (vd < 0)
+            memmove( _V+wing,  ((void *) ( V+low)) - move, span*sizeof(int));
+          if (md < 0)
+            memmove( _M+wing,  ((void *) ( M+low)) - move, span*sizeof(int));
+          if (had < 0)
+            memmove(_HA+wing,  ((void *) (HA+low)) - move, span*sizeof(int));
+          if (hbd < 0)
+            memmove(_HB+wing,  ((void *) (HB+low)) - move, span*sizeof(int));
+          if (nad < 0)
+            memmove(_NA+wing,  ((void *) (NA+low)) - move, span*sizeof(int));
+          if (nbd < 0)
+            memmove(_NB+wing,  ((void *) (NB+low)) - move, span*sizeof(int));
+          if (td < 0)
+            memmove( _T+wing,  ((void *) ( T+low)) - move, span*sizeof(BVEC));
+
+          if (td > 0)
+            memmove( _T+wing,  ((void *) ( T+low)) - move, span*sizeof(BVEC));
+          if (nbd > 0)
+            memmove(_NB+wing,  ((void *) (NB+low)) - move, span*sizeof(int));
+          if (nad > 0)
+            memmove(_NA+wing,  ((void *) (NA+low)) - move, span*sizeof(int));
+          if (hbd > 0)
+            memmove(_HB+wing,  ((void *) (HB+low)) - move, span*sizeof(int));
+          if (had > 0)
+            memmove(_HA+wing,  ((void *) (HA+low)) - move, span*sizeof(int));
+          if (md > 0)
+            memmove( _M+wing,  ((void *) ( M+low)) - move, span*sizeof(int));
+          if (vd > 0)
+            memmove( _V+wing,  ((void *) ( V+low)) - move, span*sizeof(int));
+
+          vmin = low-wing;
+          vmax = hgh+wing;
+
+          V  =  _V-vmin;
+          M  =  _M-vmin;
+          HA = _HA-vmin;
+          HB = _HB-vmin;
+          NA = _NA-vmin;
+          NB = _NB-vmin;
+          T  =  _T-vmin;
+        }
+
+      //  New boundary diagonals inherit the next trace points of their neighbor;
+      //    the band is not allowed to grow past [minp,maxp]
+
+      if (low >= minp)
+        { NA[low] = NA[low+1];
+          NB[low] = NB[low+1];
+          V[low]  = ap = INT32_MAX;
+        }
+      else
+        ap = V[++low];
+
+      if (hgh <= maxp)
+        { NA[hgh] = NA[hgh-1];
+          NB[hgh] = NB[hgh-1];
+          V[hgh]  = INT32_MAX;
+        }
+      else
+        hgh -= 1;
+
+      dif += 1;
+
+      ac = V[hgh+1] = V[low-1] = INT32_MAX;
+      a  = aseq + low;
+      t  = PATH_INT;
+      n  = PATH_LEN;
+      ua = ub = -1;
+      for (k = low; k <= hgh; k++)
+        { int     y, m;
+          int     ha, hb;
+          int     c, d;
+          BVEC    b;
+          Pebble *pb;
+
+          //  am, ac, ap = previous-wave values on diagonals k-1, k, k+1.  Take the move
+          //    that reaches furthest back and inherit that diagonal's window and pebble
+          //    chains (t, n, ua, ub hold the pre-update values of diagonal k-1).
+
+          am = ac;
+          ac = ap;
+          ap = V[d = k+1];
+
+          if (ac > ap)
+            if (ap > am)
+              { c  = am-1;
+                m  = n;
+                b  = t;
+                ha = ua;
+                hb = ub;
+              }
+            else
+              { c  = ap-1;
+                m  = M[d];
+                b  = T[d];
+                ha = HA[d];
+                hb = HB[d];
+              }
+          else
+            if (ac > am)
+              { c  = am-1;
+                m  = n;
+                b  = t;
+                ha = ua;
+                hb = ub;
+              }
+            else
+              { c  = ac-2;
+                m  = M[k];
+                b  = T[k];
+                ha = HA[k];
+                hb = HB[k];
+              }
+
+          //  Shift a 0 (the difference) into the window, dropping its oldest column
+
+          if ((b & PATH_TOP) != 0)
+            m -= 1;
+          b <<= 1;
+
+          y = (c-k) >> 1;
+          while (1)
+            { c = bseq[y];
+              if (c == 4)
+                { more = 0;
+                  if (bclip > k)
+                    bclip = k;
+                  break;
+                }
+              d = a[y];
+              if (c != d)
+                { if (d == 4)
+                    { more  = 0;
+                      aclip = k;
+                    }
+                  break;
+                }
+              y -= 1;
+              if ((b & PATH_TOP) == 0)
+                m += 1;
+              b = (b << 1) | 1;
+            }
+          c = (y << 1) + k;
+
+          //  Cross trace points.  NA[k]/NB[k] may have been inherited from a neighbor, so a
+          //    pebble is only dropped when the chain does not already carry that mark; the
+          //    next trace point retreats either way.
+
+          while (y+k <= NA[k])
+            { if (cells[ha].mark > NA[k])
+                { if (avail >= cmax)
+                    { cmax  = ((int) (avail*1.2)) + 10000;
+                      cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),
+                                                       "Reallocating trace cells");
+                      if (cells == NULL)
+                        EXIT(1);
+                      work->celmax = cmax;
+                      work->cells  = (void *) cells;
+                    }
+#ifdef SHOW_TPS
+                  printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout);
+#endif
+                  pb = cells+avail;
+                  pb->ptr  = ha;
+                  pb->diag = k;
+                  pb->diff = dif;
+                  pb->mark = NA[k];
+                  ha = avail++;
+                }
+              NA[k] -= TRACE_SPACE;
+            }
+
+          while (y <= NB[k])
+            { if (cells[hb].mark > NB[k])
+                { if (avail >= cmax)
+                    { cmax  = ((int) (avail*1.2)) + 10000;
+                      cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),
+                                                       "Reallocating trace cells");
+                      if (cells == NULL)
+                        EXIT(1);
+                      work->celmax = cmax;
+                      work->cells  = (void *) cells;
+                    }
+#ifdef SHOW_TPS
+                  printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout);
+#endif
+                  pb = cells+avail;
+                  pb->ptr  = hb;
+                  pb->diag = k;
+                  pb->diff = dif;
+                  pb->mark = NB[k];
+                  hb = avail++;
+                }
+              NB[k] -= TRACE_SPACE;
+            }
+
+          //  New best point; it becomes the trim point only if its recent columns pass
+          //    both the match-count and the TABLE/SCORE tests
+
+          if (c < besta)
+            { besta = c;
+              besty = y;
+              if (m >= PATH_AVE)
+                { lasta = c;
+                  if (TABLE[b & TRIM_MASK] >= 0)
+                    if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0)
+                      { trima  = c;
+                        trimy  = y;
+                        trimd  = dif;
+                        trimha = ha;
+                        trimhb = hb;
+                      }
+                }
+            }
+
+          t  = T[k];
+          n  = M[k];
+          ua = HA[k];
+          ub = HB[k];
+          V[k]  = c;
+          T[k]  = b;
+          M[k]  = m;
+          HA[k] = ha;
+          HB[k] = hb;
+
+          a += 1;
+        }
+
+      if (more == 0)
+        { if (bseq[besty] != 4 && aseq[besta - besty] != 4)
+            more = 1;
+          if (low <= aclip)
+            { low = aclip+1;
+              if (morem <= M[aclip])
+                { morem  = M[aclip];
+                  morea  = V[aclip];
+                  morey  = (morea - aclip)/2;
+                  mored  = dif;
+                  moreha = HA[aclip];
+                  morehb = HB[aclip];
+                }
+            }
+          if (hgh >= bclip)
+            { hgh = bclip-1;
+              if (morem <= M[bclip])
+                { morem  = M[bclip];
+                  morea  = V[bclip];
+                  morey  = (morea - bclip)/2;
+                  mored  = dif;
+                  moreha = HA[bclip];
+                  morehb = HB[bclip];
+                }
+            }
+          aclip = -INT32_MAX;
+          bclip =  INT32_MAX;
+        }
+
+      //  Drop boundary diagonals that lag more than WAVE_LAG behind the best point
+
+      n = besta + WAVE_LAG;
+      while (hgh >= low)
+        if (V[hgh] > n)
+          hgh -= 1;
+        else
+          { while (V[low] > n)
+              low += 1;
+            break;
+          }
+
+#ifdef WAVE_STATS
+      k = (hgh-low)+1;
+      if (k > MAX)
+        MAX = k;
+      TOT += k;
+      NWV += 1;
+#endif
+
+#ifdef DEBUG_WAVE
+      print_wave(V,M,low,hgh,besta);
+#endif
+    }
+
+  //  Read the traces out of the pebble chains and prepend them to the forward traces
+
+  { uint16 *atrace = (uint16 *) apath->trace;
+    uint16 *btrace = (uint16 *) bpath->trace;
+    int     atlen, btlen;
+    int     trimx;
+    int     a, b, k, h;
+    int     d, e;
+
+    //  A point clipped at a sequence boundary takes precedence over the trim point
+
+    if (morem >= 0)
+      { trimx  = morea-morey;
+        trimy  = morey;
+        trimd  = mored;
+        trimha = moreha;
+        trimhb = morehb;
+      }
+    else
+      trimx = trima-trimy;
+
+    atlen = btlen = 0;
+
+    //  Reverse the A chain in place so it can be walked from the mid-line outwards
+
+    a = -1;
+    for (h = trimha; h >= 0; h = b)
+      { b = cells[h].ptr;
+        cells[h].ptr = a;
+        a = h;
+      }
+    h = a;
+
+    k = cells[h].diag;
+    b = cells[h].mark - k;
+    e = 0;
+#ifdef SHOW_TRAIL
+    printf("  A path = (%5d,%5d)\n",b+k,b); fflush(stdout);
+#endif
+
+    //  The mid-line point is not on an A trace point: the first reverse segment is partial.
+    //    It is merged into the first forward pair, or stored as its own pair when the
+    //    forward trace is empty.
+
+    if ((b+k)%TRACE_SPACE != 0)
+      { h = cells[h].ptr;
+        if (h < 0)
+          { a = trimy;
+            d = trimd;
+          }
+        else
+          { k = cells[h].diag;
+            a = cells[h].mark - k;
+            d = cells[h].diff;
+          }
+#ifdef SHOW_TRAIL
+        printf("    +%4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout);
+#endif
+        if (apath->tlen == 0)
+          { atrace[--atlen] = (uint16) (b-a);
+            atrace[--atlen] = (uint16) (d-e);
+          }
+        else
+          { atrace[1] = (uint16) (atrace[1] + (b-a));
+            atrace[0] = (uint16) (atrace[0] + (d-e));
+          }
+        b = a;
+        e = d;
+      }
+
+    if (h >= 0)
+      { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
+          { k = cells[h].diag;
+            a = cells[h].mark - k;
+            atrace[--atlen] = (uint16) (b-a);     //  length lands at the higher index
+            d = cells[h].diff;
+            atrace[--atlen] = (uint16) (d-e);     //  differences at the lower index
+#ifdef SHOW_TRAIL
+            printf("     %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout);
+#endif
+            b = a;
+            e = d;
+          }
+        if (b+k != trimx)
+          { atrace[--atlen] = (uint16) (b-trimy);
+            atrace[--atlen] = (uint16) (trimd-e);
+#ifdef SHOW_TRAIL
+            printf("           (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout);
+#endif
+          }
+        else if (b != trimy)
+          { atrace[atlen+1] = (uint16) (atrace[atlen+1] + (b-trimy));
+            atrace[atlen]   = (uint16) (atrace[atlen]   + (trimd-e));
+#ifdef SHOW_TRAIL
+            printf("         @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout);
+#endif
+          }
+      }
+
+    //  Same for the B chain; lengths are measured along A
+
+    a = -1;
+    for (h = trimhb; h >= 0; h = b)
+      { b = cells[h].ptr;
+        cells[h].ptr = a;
+        a = h;
+      }
+    h = a;
+
+    k = cells[h].diag;
+    b = cells[h].mark + k;
+    e = 0;
+#ifdef SHOW_TRAIL
+    printf("  B path = (%5d,%5d)\n",b,b-k); fflush(stdout);
+#endif
+
+    if ((b-k)%TRACE_SPACE != boff)
+      { h = cells[h].ptr;
+        if (h < 0)
+          { a = trimx;
+            d = trimd;
+          }
+        else
+          { k = cells[h].diag;
+            a = cells[h].mark + k;
+            d = cells[h].diff;
+          }
+#ifdef SHOW_TRAIL
+        printf("    +%4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,b-a); fflush(stdout);
+#endif
+        if (bpath->tlen == 0)
+          { btrace[--btlen] = (uint16) (b-a);     //  length
+            btrace[--btlen] = (uint16) (d-e);     //  differences, as in every other pair
+          }
+        else
+          { btrace[1] = (uint16) (btrace[1] + (b-a));
+            btrace[0] = (uint16) (btrace[0] + (d-e));
+          }
+        b = a;
+        e = d;
+      }
+
+    if (h >= 0)
+      { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
+          { k = cells[h].diag;
+            a = cells[h].mark + k;
+            btrace[--btlen] = (uint16) (b-a);
+            d = cells[h].diff;
+            btrace[--btlen] = (uint16) (d-e);
+#ifdef SHOW_TRAIL
+            printf("     %4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,b-a); fflush(stdout);
+#endif
+            b = a;
+            e = d;
+          }
+        if (b-k != trimy)
+          { btrace[--btlen] = (uint16) (b-trimx);
+            btrace[--btlen] = (uint16) (trimd-e);
+#ifdef SHOW_TRAIL
+            printf("           (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimx); fflush(stdout);
+#endif
+          }
+        else if (b != trimx)
+          { btrace[btlen+1] = (uint16) (btrace[btlen+1] + (b-trimx));
+            btrace[btlen]   = (uint16) (btrace[btlen]   + (trimd-e));
+#ifdef SHOW_TRAIL
+            printf("         @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimx); fflush(stdout);
+#endif
+          }
+      }
+
+    //  atlen/btlen are <= 0 here: they give how far each trace grew to the left
+
+    apath->abpos = trimx;
+    apath->bbpos = trimy;
+    apath->diffs = apath->diffs + trimd;
+    apath->tlen  = apath->tlen  - atlen;
+    apath->trace = atrace + atlen;
+    if (COMP(align->flags))
+      { bpath->aepos = align->blen - apath->bbpos;
+        bpath->bepos = align->alen - apath->abpos;
+      }
+    else
+      { bpath->abpos = apath->bbpos;
+        bpath->bbpos = apath->abpos;
+      }
+    bpath->diffs = bpath->diffs + trimd;
+    bpath->tlen  = bpath->tlen  - btlen;
+    bpath->trace = btrace + btlen;
+  }
+
+  return (0);
+}
+/* Find the longest local alignment between aseq and bseq through (xcnt,ycnt)
+ See associated .h file for the precise definition of the interface.
+Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec,
+ int low, int hgh, int anti, int lbord, int hbord)
+{ _Work_Data *work = ( _Work_Data *) ework;
+ _Align_Spec *spec = (_Align_Spec *) espec;
+ Path *apath, *bpath;
+ int minp, maxp;
+ int selfie;
+ { int alen, blen;
+ int maxtp, wsize;
+ alen = align->alen;
+ blen = align->blen;
+ if (hgh-low >= 7500)
+ wsize = VectorEl*(hgh-low+1);
+ else
+ wsize = VectorEl*10000;
+ if (wsize >= work->vecmax)
+ if (enlarge_vector(work,wsize))
+ if (alen < blen)
+ maxtp = 2*(blen/spec->trace_space+2);
+ else
+ maxtp = 2*(alen/spec->trace_space+2);
+ wsize = 4*maxtp*sizeof(uint16) + sizeof(Path);
+ if (wsize > work->pntmax)
+ if (enlarge_points(work,wsize))
+ apath = align->path;
+ bpath = (Path *) work->points;
+ apath->trace = ((uint16 *) (bpath+1)) + maxtp;
+ bpath->trace = ((uint16 *) apath->trace) + 2*maxtp;
+ }
+ printf("\n");
+ selfie = (align->aseq == align->bseq);
+ if (lbord < 0)
+ { if (selfie && low >= 0)
+ minp = 1;
+ else
+ minp = -INT32_MAX;
+ }
+ else
+ minp = low-lbord;
+ if (hbord < 0)
+ { if (selfie && hgh <= 0)
+ maxp = -1;
+ else
+ maxp = INT32_MAX;
+ }
+ else
+ maxp = hgh+hbord;
+ if (forward_wave(work,spec,align,bpath,&low,hgh,anti,minp,maxp))
+ printf("F1 (%d,%d) ~ %d => (%d,%d) %d\n",
+ (2*anti+(low+hgh))/4,(anti-(low+hgh))/4,hgh-low,
+ apath->aepos,apath->bepos,apath->diffs);
+ if (reverse_wave(work,spec,align,bpath,low,low,anti,minp,maxp))
+ printf("R1 (%d,%d) => (%d,%d) %d\n",
+ (anti+low)/2,(anti-low)/2,apath->abpos,apath->bbpos,apath->diffs);
+ if (COMP(align->flags))
+ { uint16 *trace = (uint16 *) bpath->trace;
+ uint16 p;
+ int i, j;
+ i = bpath->tlen-2;
+ j = 0;
+ while (j < i)
+ { p = trace[i];
+ trace[i] = trace[j];
+ trace[j] = p;
+ p = trace[i+1];
+ trace[i+1] = trace[j+1];
+ trace[j+1] = p;
+ i -= 2;
+ j += 2;
+ }
+ }
+ { uint16 *trace = (uint16 *) apath->trace;
+ int a, h;
+ printf("\nA-path (%d,%d)->(%d,%d)",apath->abpos,apath->bbpos,apath->aepos,apath->bepos);
+ printf(" %c\n",(COMP(align->flags) ? 'c' : 'n'));
+ a = apath->bbpos;
+ for (h = 1; h < apath->tlen; h += 2)
+ { int dif = trace[h-1];
+ int del = trace[h];
+ a += del;
+ printf(" %d / %d (%d)\n",dif,del,a);
+ }
+ }
+ { uint16 *trace = (uint16 *) bpath->trace;
+ int a, h;
+ printf("\nB-path (%d,%d)->(%d,%d)",bpath->abpos,bpath->bbpos,bpath->aepos,bpath->bepos);
+ printf(" %c [%d,%d]\n",(COMP(align->flags) ? 'c' : 'n'),align->blen,align->alen);
+ a = bpath->bbpos;
+ for (h = 1; h < bpath->tlen; h += 2)
+ { int dif = trace[h-1];
+ int del = trace[h];
+ a += del;
+ printf(" %d / %d (%d)\n",dif,del,a);
+ }
+ }
+ return (bpath);
+* *
+* *
+/* Bytes of work->vector needed per diagonal by forward_extend: four int arrays
+   (V, M, HA, NA) plus one BVEC array (T).                                       */
+static int VectorEn = 4*sizeof(int) + sizeof(BVEC);
+static int forward_extend(_Work_Data *work, _Align_Spec *spec, Alignment *align,
+ int midd, int mida, int minp, int maxp)
+{ char *aseq = align->aseq;
+ char *bseq = align->bseq;
+ Path *apath = align->path;
+ int hgh, low, dif;
+ int vlen, vmin, vmax;
+ int *V, *M;
+ int *_V, *_M;
+ BVEC *T;
+ BVEC *_T;
+ int *HA, *NA;
+ int *_HA, *_NA;
+ Pebble *cells;
+ int avail, cmax;
+ int TRACE_SPACE = spec->trace_space;
+ int PATH_AVE = spec->ave_path;
+ int16 *SCORE = spec->score;
+ int16 *TABLE = spec->table;
+ int besta, besty;
+ int trima, trimy, trimd;
+ int trimha;
+ int morea, morey, mored;
+ int moreha;
+ int more, morem, lasta;
+ int aclip, bclip;
+ hgh = midd;
+ low = midd;
+ dif = 0;
+ { int span, wing;
+ span = (hgh-low)+1;
+ vlen = work->vecmax/VectorEn;
+ wing = (vlen - span)/2;
+ vmin = low - wing;
+ vmax = hgh + wing;
+ _V = ((int *) work->vector);
+ _M = _V + vlen;
+ _HA = _M + vlen;
+ _NA = _HA + vlen;
+ _T = ((BVEC *) (_NA + vlen));
+ V = _V-vmin;
+ M = _M-vmin;
+ HA = _HA-vmin;
+ NA = _NA-vmin;
+ T = _T-vmin;
+ cells = (Pebble *) (work->cells);
+ cmax = work->celmax;
+ avail = 0;
+ }
+ /* Compute 0-wave starting from mid-line */
+ more = 1;
+ aclip = INT32_MAX;
+ bclip = -INT32_MAX;
+ besta = trima = morea = lasta = mida;
+ besty = trimy = morey = (mida-hgh) >> 1;
+ trimd = mored = 0;
+ trimha = moreha = 0;
+ morem = -1;
+ { int k;
+ char *a;
+ a = aseq + hgh;
+ for (k = hgh; k >= low; k--)
+ { int y, c, d;
+ int ha, na;
+ Pebble *pb;
+ y = (mida-k) >> 1;
+ if (avail >= cmax-1)
+ { cmax = ((int) (avail*1.2)) + 10000;
+ cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
+ if (cells == NULL)
+ EXIT(1);
+ work->celmax = cmax;
+ work->cells = (void *) cells;
+ }
+#ifdef SHOW_TPS
+ printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout);
+ pb = cells+avail;
+ pb->ptr = -1;
+ pb->diag = k;
+ pb->diff = 0;
+ pb->mark = na;
+ ha = avail++;
+ na += TRACE_SPACE;
+ while (1)
+ { c = bseq[y];
+ if (c == 4)
+ { more = 0;
+ if (bclip < k)
+ bclip = k;
+ break;
+ }
+ d = a[y];
+ if (c != d)
+ { if (d == 4)
+ { more = 0;
+ aclip = k;
+ }
+ break;
+ }
+ y += 1;
+ }
+ c = (y << 1) + k;
+ while (y+k >= na)
+ { if (avail >= cmax)
+ { cmax = ((int) (avail*1.2)) + 10000;
+ cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
+ if (cells == NULL)
+ EXIT(1);
+ work->celmax = cmax;
+ work->cells = (void *) cells;
+ }
+#ifdef SHOW_TPS
+ printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout);
+ pb = cells+avail;
+ pb->ptr = ha;
+ pb->diag = k;
+ pb->diff = 0;
+ pb->mark = na;
+ ha = avail++;
+ na += TRACE_SPACE;
+ }
+ if (c > besta)
+ { besta = trima = lasta = c;
+ besty = trimy = y;
+ trimha = ha;
+ }
+ V[k] = c;
+ T[k] = PATH_INT;
+ M[k] = PATH_LEN;
+ HA[k] = ha;
+ NA[k] = na;
+ a -= 1;
+ }
+ }
+ if (more == 0)
+ { if (bseq[besty] != 4 && aseq[besta - besty] != 4)
+ more = 1;
+ if (hgh >= aclip)
+ { hgh = aclip-1;
+ if (morem <= M[aclip])
+ { morem = M[aclip];
+ morea = V[aclip];
+ morey = (morea - aclip)/2;
+ moreha = HA[aclip];
+ }
+ }
+ if (low <= bclip)
+ { low = bclip+1;
+ if (morem <= M[bclip])
+ { morem = M[bclip];
+ morea = V[bclip];
+ morey = (morea - bclip)/2;
+ moreha = HA[bclip];
+ }
+ }
+ aclip = INT32_MAX;
+ bclip = -INT32_MAX;
+ }
+#ifdef DEBUG_WAVE
+ printf("\nFORWARD WAVE:\n");
+ print_wave(V,M,low,hgh,besta);
+ /* Compute successive waves until no furthest reaching points remain */
+ while (more && lasta >= besta - TRIM_MLAG)
+ { int k, n;
+ int ua;
+ BVEC t;
+ int am, ac, ap;
+ char *a;
+ if (low <= vmin || hgh >= vmax)
+ { int span, wing;
+ int64 move;
+ int64 vd, md, had, nad, td;
+ span = (hgh-low)+1;
+ if (.8*vlen < span)
+ { if (enlarge_vector(work,vlen*VectorEn))
+ EXIT(1);
+ move = ((void *) _V) - work->vector;
+ vlen = work->vecmax/VectorEn;
+ _V = (int *) work->vector;
+ _M = _V + vlen;
+ _HA = _M + vlen;
+ _NA = _HA + vlen;
+ _T = ((BVEC *) (_NA + vlen));
+ }
+ else
+ move = 0;
+ wing = (vlen - span)/2;
+ vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move);
+ md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move);
+ had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move);
+ nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move);
+ td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move);
+ if (vd < 0)
+ memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int));
+ if (md < 0)
+ memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int));
+ if (had < 0)
+ memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int));
+ if (nad < 0)
+ memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int));
+ if (td < 0)
+ memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC));
+ if (td > 0)
+ memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC));
+ if (nad > 0)
+ memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int));
+ if (had > 0)
+ memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int));
+ if (md > 0)
+ memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int));
+ if (vd > 0)
+ memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int));
+ vmin = low-wing;
+ vmax = hgh+wing;
+ V = _V-vmin;
+ M = _M-vmin;
+ HA = _HA-vmin;
+ NA = _NA-vmin;
+ T = _T-vmin;
+ }
+ if (low > minp)
+ { low -= 1;
+ NA[low] = NA[low+1];
+ V[low] = -1;
+ }
+ if (hgh < maxp)
+ { hgh += 1;
+ NA[hgh] = NA[hgh-1];
+ V[hgh] = am = -1;
+ }
+ else
+ am = V[hgh];
+ dif += 1;
+ ac = V[hgh+1] = V[low-1] = -1;
+ a = aseq + hgh;
+ t = PATH_INT;
+ n = PATH_LEN;
+ ua = -1;
+ for (k = hgh; k >= low; k--)
+ { int y, m;
+ int ha;
+ int c, d;
+ BVEC b;
+ Pebble *pb;
+ ap = ac;
+ ac = am;
+ am = V[d = k-1];
+ if (ac < am)
+ if (am < ap)
+ { c = ap+1;
+ m = n;
+ b = t;
+ ha = ua;
+ }
+ else
+ { c = am+1;
+ m = M[d];
+ b = T[d];
+ ha = HA[d];
+ }
+ else
+ if (ac < ap)
+ { c = ap+1;
+ m = n;
+ b = t;
+ ha = ua;
+ }
+ else
+ { c = ac+2;
+ m = M[k];
+ b = T[k];
+ ha = HA[k];
+ }
+ if ((b & PATH_TOP) != 0)
+ m -= 1;
+ b <<= 1;
+ y = (c-k) >> 1;
+ while (1)
+ { c = bseq[y];
+ if (c == 4)
+ { more = 0;
+ if (bclip < k)
+ bclip = k;
+ break;
+ }
+ d = a[y];
+ if (c != d)
+ { if (d == 4)
+ { more = 0;
+ aclip = k;
+ }
+ break;
+ }
+ y += 1;
+ if ((b & PATH_TOP) == 0)
+ m += 1;
+ b = (b << 1) | 1;
+ }
+ c = (y << 1) + k;
+ while (y+k >= NA[k])
+ { if (cells[ha].mark < NA[k])
+ { if (avail >= cmax)
+ { cmax = ((int) (avail*1.2)) + 10000;
+ cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),
+ "Reallocating trace cells");
+ if (cells == NULL)
+ EXIT(1);
+ work->celmax = cmax;
+ work->cells = (void *) cells;
+ }
+#ifdef SHOW_TPS
+ printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout);
+ pb = cells+avail;
+ pb->ptr = ha;
+ pb->diag = k;
+ pb->diff = dif;
+ pb->mark = NA[k];
+ ha = avail++;
+ }
+ }
+ if (c > besta)
+ { besta = c;
+ besty = y;
+ if (m >= PATH_AVE)
+ { lasta = c;
+ if (TABLE[b & TRIM_MASK] >= 0)
+ if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0)
+ { trima = c;
+ trimy = y;
+ trimd = dif;
+ trimha = ha;
+ }
+ }
+ }
+ t = T[k];
+ n = M[k];
+ ua = HA[k];
+ V[k] = c;
+ T[k] = b;
+ M[k] = m;
+ HA[k] = ha;
+ a -= 1;
+ }
+ if (more == 0)
+ { if (bseq[besty] != 4 && aseq[besta-besty] != 4)
+ more = 1;
+ if (hgh >= aclip)
+ { hgh = aclip-1;
+ if (morem <= M[aclip])
+ { morem = M[aclip];
+ morea = V[aclip];
+ morey = (morea - aclip)/2;
+ mored = dif;
+ moreha = HA[aclip];
+ }
+ }
+ if (low <= bclip)
+ { low = bclip+1;
+ if (morem <= M[bclip])
+ { morem = M[bclip];
+ morea = V[bclip];
+ morey = (morea - bclip)/2;
+ mored = dif;
+ moreha = HA[bclip];
+ }
+ }
+ aclip = INT32_MAX;
+ bclip = -INT32_MAX;
+ }
+ n = besta - WAVE_LAG;
+ while (hgh >= low)
+ if (V[hgh] < n)
+ hgh -= 1;
+ else
+ { while (V[low] < n)
+ low += 1;
+ break;
+ }
+#ifdef WAVE_STATS
+ k = (hgh-low)+1;
+ if (k > MAX)
+ MAX = k;
+ TOT += k;
+ NWV += 1;
+#ifdef DEBUG_WAVE
+ print_wave(V,M,low,hgh,besta);
+ }
+ { uint16 *atrace = (uint16 *) apath->trace;
+ int atlen;
+ int trimx;
+ int a, b, k, h;
+ int d, e;
+ if (morem >= 0)
+ { trimx = morea-morey;
+ trimy = morey;
+ trimd = mored;
+ trimha = moreha;
+ }
+ else
+ trimx = trima-trimy;
+ atlen = 0;
+ a = -1;
+ for (h = trimha; h >= 0; h = b)
+ { b = cells[h].ptr;
+ cells[h].ptr = a;
+ a = h;
+ }
+ h = a;
+ k = cells[h].diag;
+ b = (mida-k)/2;
+ e = 0;
+#ifdef SHOW_TRAIL
+ printf(" A path = (%5d,%5d)\n",(mida+k)/2,b); fflush(stdout);
+ for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
+ { k = cells[h].diag;
+ a = cells[h].mark - k;
+ d = cells[h].diff;
+ atrace[atlen++] = (uint16) (d-e);
+ atrace[atlen++] = (uint16) (a-b);
+#ifdef SHOW_TRAIL
+ printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,a-b); fflush(stdout);
+ b = a;
+ e = d;
+ }
+ if (b+k != trimx)
+ { atrace[atlen++] = (uint16) (trimd-e);
+ atrace[atlen++] = (uint16) (trimy-b);
+#ifdef SHOW_TRAIL
+ printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout);
+ }
+ else if (b != trimy)
+ { atrace[atlen-1] = (uint16) (atrace[atlen-1] + (trimy-b));
+ atrace[atlen-2] = (uint16) (atrace[atlen-2] + (trimd-e));
+#ifdef SHOW_TRAIL
+ printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout);
+ }
+ apath->aepos = trimx;
+ apath->bepos = trimy;
+ apath->diffs = trimd;
+ apath->tlen = atlen;
+ }
+ return (0);
+static int reverse_extend(_Work_Data *work, _Align_Spec *spec, Alignment *align,
+ int midd, int mida, int minp, int maxp) /* reverse_extend:
+   Starting on diagonal midd at anti-diagonal mida, advance wave by wave toward smaller
+   anti-diagonals, one more difference per wave, never widening the band of diagonals
+   beyond [minp,maxp].  A diagonal stops being followed once either sequence yields the
+   value 4 on it.  On exit the path of 'align' receives abpos, bbpos, diffs, tlen and a
+   new trace pointer; the (diff,b-length) trace pairs are stored below the trace pointer
+   that was passed in.  Returns 0; a failed Realloc or enlarge_vector leaves through
+   EXIT(1) (macro defined outside this block).
+   NOTE(review): this listing looks incomplete (see the notes on 'na' and on the NA[k]
+   loop below) -- check against the upstream file before relying on it. */
+{ char *aseq = align->aseq - 1; /* both sequences are addressed through pointers shifted down by one */
+ char *bseq = align->bseq - 1;
+ Path *apath = align->path;
+ int hgh, low, dif; /* current band of diagonals [low,hgh] and the wave's difference count */
+ int vlen, vmin, vmax; /* slots per vector and the diagonal range the vectors currently cover */
+ int *V, *M; /* V[k]: anti-diagonal reached on diagonal k; M[k]: counter kept alongside T[k] */
+ int *_V, *_M; /* underscore names are the raw vector starts, plain names are re-based by vmin */
+ BVEC *T; /* T[k]: bit history of the path on diagonal k (1 shifted in per matched symbol) */
+ BVEC *_T;
+ int *HA, *NA; /* HA[k]: index of the latest pebble on k's path; NA[k]: next trace mark for k */
+ int *_HA, *_NA;
+ Pebble *cells; /* growable array of trace-point records */
+ int avail, cmax; /* records in use / records allocated */
+ int TRACE_SPACE = spec->trace_space;
+ int PATH_AVE = spec->ave_path;
+ int16 *SCORE = spec->score;
+ int16 *TABLE = spec->table;
+ int besta, besty; /* smallest anti-diagonal reached so far and its b-index */
+ int trima, trimy, trimd; /* last point that passed the TABLE/SCORE trimming test */
+ int trimha;
+ int morea, morey, mored; /* end point kept from a diagonal that ran into the value 4 */
+ int moreha;
+ int more, morem, lasta;
+ int aclip, bclip;
+ hgh = midd;
+ low = midd;
+ dif = 0;
+ { int span, wing;
+ span = (hgh-low)+1;
+ vlen = work->vecmax/VectorEn; /* presumably VectorEn is the byte cost of one diagonal across all five vectors -- TODO confirm */
+ wing = (vlen - span)/2; /* slack left on each side of the band */
+ vmin = low - wing;
+ vmax = hgh + wing;
+ _V = ((int *) work->vector); /* the five vectors are laid out back to back in work->vector */
+ _M = _V + vlen;
+ _HA = _M + vlen;
+ _NA = _HA + vlen;
+ _T = ((BVEC *) (_NA + vlen));
+ V = _V-vmin; /* re-base so each vector can be indexed directly by diagonal number */
+ M = _M-vmin;
+ HA = _HA-vmin;
+ NA = _NA-vmin;
+ T = _T-vmin;
+ cells = (Pebble *) (work->cells);
+ cmax = work->celmax;
+ avail = 0;
+ }
+ more = 1;
+ aclip = -INT32_MAX; /* sentinels meaning "no diagonal clipped yet" */
+ bclip = INT32_MAX;
+ besta = trima = morea = lasta = mida;
+ besty = trimy = morey = (mida-hgh) >> 1;
+ trimd = mored = 0;
+ trimha = moreha = 0;
+ morem = -1; /* stays negative until some clipped diagonal has been recorded */
+ { int k;
+ char *a;
+ a = aseq + low; /* a[y] is the a-symbol paired with bseq[y] on diagonal k */
+ for (k = low; k <= hgh; k++)
+ { int y, c, d;
+ int ha, na; /* NOTE(review): na is read below but no assignment to it is visible in this listing -- confirm against upstream */
+ Pebble *pb;
+ y = (mida-k) >> 1; /* b-index of the starting point on diagonal k */
+ if (avail >= cmax-1) /* grow the record array before it fills up */
+ { cmax = ((int) (avail*1.2)) + 10000;
+ cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
+ if (cells == NULL)
+ EXIT(1);
+ work->celmax = cmax;
+ work->cells = (void *) cells;
+ }
+#ifdef SHOW_TPS
+ printf(" A %d: -1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout);
+ pb = cells+avail; /* first record of the chain: no predecessor, zero differences */
+ pb->ptr = -1;
+ pb->diag = k;
+ pb->diff = 0;
+ pb->mark = y+k; /* a-index of the starting point */
+ ha = avail++;
+ while (1) /* slide backward along the diagonal while the symbols agree */
+ { c = bseq[y];
+ if (c == 4) /* b yields 4: stop and remember the lowest such diagonal */
+ { more = 0;
+ if (bclip > k)
+ bclip = k;
+ break;
+ }
+ d = a[y];
+ if (c != d)
+ { if (d == 4) /* a yields 4: stop and remember this diagonal */
+ { more = 0;
+ aclip = k;
+ }
+ break;
+ }
+ y -= 1;
+ }
+ c = (y << 1) + k; /* anti-diagonal of the point reached */
+ while (y+k <= na) /* add one record per trace mark that the slide passed */
+ { if (avail >= cmax)
+ { cmax = ((int) (avail*1.2)) + 10000;
+ cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
+ if (cells == NULL)
+ EXIT(1);
+ work->celmax = cmax;
+ work->cells = (void *) cells;
+ }
+#ifdef SHOW_TPS
+ printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout);
+ pb = cells+avail;
+ pb->ptr = ha;
+ pb->diag = k;
+ pb->diff = 0;
+ pb->mark = na;
+ ha = avail++;
+ na -= TRACE_SPACE;
+ }
+ if (c < besta) /* a smaller anti-diagonal means a longer extension in this direction */
+ { besta = trima = lasta = c;
+ besty = trimy = y;
+ trimha = ha;
+ }
+ V[k] = c;
+ T[k] = PATH_INT;
+ M[k] = PATH_LEN;
+ HA[k] = ha;
+ NA[k] = na;
+ a += 1;
+ }
+ }
+ if (more == 0) /* at least one diagonal ran into the value 4 */
+ { if (bseq[besty] != 4 && aseq[besta - besty] != 4) /* the best point itself is not at a 4: keep going */
+ more = 1;
+ if (low <= aclip) /* drop the clipped diagonals from the band, keeping the end point with the largest M */
+ { low = aclip+1;
+ if (morem <= M[aclip])
+ { morem = M[aclip];
+ morea = V[aclip];
+ morey = (morea - aclip)/2;
+ moreha = HA[aclip];
+ }
+ }
+ if (hgh >= bclip)
+ { hgh = bclip-1;
+ if (morem <= M[bclip])
+ { morem = M[bclip];
+ morea = V[bclip];
+ morey = (morea - bclip)/2;
+ moreha = HA[bclip];
+ }
+ }
+ aclip = -INT32_MAX;
+ bclip = INT32_MAX;
+ }
+#ifdef DEBUG_WAVE
+ printf("\nREVERSE WAVE:\n");
+ print_wave(V,M,low,hgh,besta);
+ while (more && lasta <= besta + TRIM_MLAG) /* one pass per additional difference */
+ { int k, n;
+ int ua;
+ BVEC t;
+ int am, ac, ap;
+ char *a;
+ if (low <= vmin || hgh >= vmax) /* band touches the edge of the covered range: re-center, enlarging if needed */
+ { int span, wing;
+ int64 move, vd, md, had, nad, td;
+ span = (hgh-low)+1;
+ if (.8*vlen < span) /* band fills more than 80% of the slots */
+ { if (enlarge_vector(work,vlen*VectorEn))
+ EXIT(1);
+ move = ((void *) _V) - work->vector; /* byte distance from the new buffer start to the old one */
+ vlen = work->vecmax/VectorEn;
+ _V = (int *) work->vector;
+ _M = _V + vlen;
+ _HA = _M + vlen;
+ _NA = _HA + vlen;
+ _T = ((BVEC *) (_NA + vlen));
+ }
+ else
+ move = 0;
+ wing = (vlen - span)/2;
+ vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); /* signed byte shift each vector's live span must make */
+ md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move);
+ had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move);
+ nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move);
+ td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move);
+ if (vd < 0) /* downward shifts run in V, M, HA, NA, T order; upward shifts in the reverse order -- presumably so no span is overwritten before it is moved */
+ memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int));
+ if (md < 0)
+ memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int));
+ if (had < 0)
+ memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int));
+ if (nad < 0)
+ memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int));
+ if (td < 0)
+ memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC));
+ if (td > 0)
+ memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC));
+ if (nad > 0)
+ memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int));
+ if (had > 0)
+ memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int));
+ if (md > 0)
+ memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int));
+ if (vd > 0)
+ memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int));
+ vmin = low-wing;
+ vmax = hgh+wing;
+ V = _V-vmin;
+ M = _M-vmin;
+ HA = _HA-vmin;
+ NA = _NA-vmin;
+ T = _T-vmin;
+ }
+ if (low > minp) /* widen the band by one diagonal below, unless already at minp */
+ { low -= 1;
+ NA[low] = NA[low+1];
+ V[low] = ap = INT32_MAX; /* INT32_MAX marks a diagonal with no point yet */
+ }
+ else
+ ap = V[low];
+ if (hgh < maxp) /* and by one diagonal above, unless already at maxp */
+ { hgh += 1;
+ NA[hgh] = NA[hgh-1];
+ V[hgh] = INT32_MAX;
+ }
+ dif += 1;
+ ac = V[hgh+1] = V[low-1] = INT32_MAX;
+ a = aseq + low;
+ t = PATH_INT;
+ n = PATH_LEN;
+ ua = -1;
+ for (k = low; k <= hgh; k++)
+ { int y, m;
+ int ha;
+ int c, d;
+ BVEC b;
+ Pebble *pb;
+ am = ac; /* am, ac, ap: previous-wave values on diagonals k-1, k, k+1 (V[k-1] itself is already overwritten) */
+ ac = ap;
+ ap = V[d = k+1];
+ if (ac > ap) /* choose the predecessor: k-1 (state saved in n, t, ua), k+1, or k itself */
+ if (ap > am)
+ { c = am-1;
+ m = n;
+ b = t;
+ ha = ua;
+ }
+ else
+ { c = ap-1;
+ m = M[d];
+ b = T[d];
+ ha = HA[d];
+ }
+ else
+ if (ac > am)
+ { c = am-1;
+ m = n;
+ b = t;
+ ha = ua;
+ }
+ else
+ { c = ac-2;
+ m = M[k];
+ b = T[k];
+ ha = HA[k];
+ }
+ if ((b & PATH_TOP) != 0) /* the bit about to be shifted past PATH_TOP was set, so m loses one */
+ m -= 1;
+ b <<= 1; /* shift in a 0 for the difference just taken */
+ y = (c-k) >> 1;
+ while (1) /* slide backward while the symbols agree, shifting a 1 into b per step */
+ { c = bseq[y];
+ if (c == 4)
+ { more = 0;
+ if (bclip > k)
+ bclip = k;
+ break;
+ }
+ d = a[y];
+ if (c != d)
+ { if (d == 4)
+ { more = 0;
+ aclip = k;
+ }
+ break;
+ }
+ y -= 1;
+ if ((b & PATH_TOP) == 0)
+ m += 1;
+ b = (b << 1) | 1;
+ }
+ c = (y << 1) + k;
+ while (y+k <= NA[k]) /* NOTE(review): as listed, nothing in this loop changes y, k or NA[k] -- a line may be missing here; confirm against upstream */
+ { if (cells[ha].mark > NA[k]) /* record only if this path's latest record has not reached the mark yet */
+ { if (avail >= cmax)
+ { cmax = ((int) (avail*1.2)) + 10000;
+ cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),
+ "Reallocating trace cells");
+ if (cells == NULL)
+ EXIT(1);
+ work->celmax = cmax;
+ work->cells = (void *) cells;
+ }
+#ifdef SHOW_TPS
+ printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout);
+ pb = cells+avail;
+ pb->ptr = ha;
+ pb->diag = k;
+ pb->diff = dif;
+ pb->mark = NA[k];
+ ha = avail++;
+ }
+ }
+ if (c < besta) /* new furthest point */
+ { besta = c;
+ besty = y;
+ if (m >= PATH_AVE) /* only points whose counter m meets the spec threshold refresh lasta */
+ { lasta = c;
+ if (TABLE[b & TRIM_MASK] >= 0) /* table lookups on the low bits of b decide whether this is a valid trim point */
+ if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0)
+ { trima = c;
+ trimy = y;
+ trimd = dif;
+ trimha = ha;
+ }
+ }
+ }
+ t = T[k]; /* save the previous-wave state of k for use as "k-1" in the next iteration */
+ n = M[k];
+ ua = HA[k];
+ V[k] = c;
+ T[k] = b;
+ M[k] = m;
+ HA[k] = ha;
+ a += 1;
+ }
+ if (more == 0) /* same clipping treatment as after the 0-wave, now also recording dif */
+ { if (bseq[besty] != 4 && aseq[besta - besty] != 4)
+ more = 1;
+ if (low <= aclip)
+ { low = aclip+1;
+ if (morem <= M[aclip])
+ { morem = M[aclip];
+ morea = V[aclip];
+ morey = (morea - aclip)/2;
+ mored = dif;
+ moreha = HA[aclip];
+ }
+ }
+ if (hgh >= bclip)
+ { hgh = bclip-1;
+ if (morem <= M[bclip])
+ { morem = M[bclip];
+ morea = V[bclip];
+ morey = (morea - bclip)/2;
+ mored = dif;
+ moreha = HA[bclip];
+ }
+ }
+ aclip = -INT32_MAX;
+ bclip = INT32_MAX;
+ }
+ n = besta + WAVE_LAG; /* diagonals lagging the best point by more than WAVE_LAG are trimmed off both ends */
+ while (hgh >= low)
+ if (V[hgh] > n)
+ hgh -= 1;
+ else
+ { while (V[low] > n)
+ low += 1;
+ break;
+ }
+#ifdef WAVE_STATS
+ k = (hgh-low)+1;
+ if (k > MAX)
+ MAX = k;
+ TOT += k;
+ NWV += 1;
+#ifdef DEBUG_WAVE
+ print_wave(V,M,low,hgh,besta);
+ }
+ { uint16 *atrace = (uint16 *) apath->trace;
+ int atlen;
+ int trimx;
+ int a, b, k, h;
+ int d, e;
+ if (morem >= 0) /* a clipped diagonal was recorded: its end point takes precedence over the trim point */
+ { trimx = morea-morey;
+ trimy = morey;
+ trimd = mored;
+ trimha = moreha;
+ }
+ else
+ trimx = trima-trimy;
+ atlen = 0;
+ a = -1;
+ for (h = trimha; h >= 0; h = b) /* reverse the ptr links so the chain can be walked from its first record */
+ { b = cells[h].ptr;
+ cells[h].ptr = a;
+ a = h;
+ }
+ h = a;
+ k = cells[h].diag;
+ b = cells[h].mark - k; /* b-index of the starting point */
+ e = 0;
+#ifdef SHOW_TRAIL
+ printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout);
+ if ((b+k)%TRACE_SPACE != 0) /* start is not on a trace mark: emit the partial first segment */
+ { h = cells[h].ptr;
+ if (h < 0)
+ { a = trimy;
+ d = trimd;
+ }
+ else
+ { k = cells[h].diag;
+ a = cells[h].mark - k;
+ d = cells[h].diff;
+ }
+#ifdef SHOW_TRAIL
+ printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout);
+ atrace[--atlen] = (uint16) (b-a); /* pairs are written at negative offsets: b-length above, diff count below */
+ atrace[--atlen] = (uint16) (d-e);
+ b = a;
+ e = d;
+ }
+ if (h >= 0)
+ { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
+ { k = cells[h].diag;
+ a = cells[h].mark - k;
+ atrace[--atlen] = (uint16) (b-a);
+ d = cells[h].diff;
+ atrace[--atlen] = (uint16) (d-e);
+#ifdef SHOW_TRAIL
+ printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout);
+ b = a;
+ e = d;
+ }
+ if (b+k != trimx) /* end point lies beyond the last record: add a final pair */
+ { atrace[--atlen] = (uint16) (b-trimy);
+ atrace[--atlen] = (uint16) (trimd-e);
+#ifdef SHOW_TRAIL
+ printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout);
+ }
+ else if (b != trimy) /* same a-index but different b-index: fold the remainder into the last pair */
+ { atrace[atlen+1] = (uint16) (atrace[atlen+1] + (b-trimy));
+ atrace[atlen] = (uint16) (atrace[atlen] + (trimd-e));
+#ifdef SHOW_TRAIL
+ printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout);
+ }
+ }
+ apath->abpos = trimx;
+ apath->bbpos = trimy;
+ apath->diffs = trimd;
+ apath->tlen = - atlen; /* atlen is <= 0 here, so this is the number of values written */
+ apath->trace = atrace + atlen; /* point at the first value written */
+ }
+ return (0);
+/* Find the longest local alignment between aseq and bseq through (xcnt,ycnt)
+ See associated .h file for the precise definition of the interface. */
+int Find_Extension(Alignment *align, Work_Data *ework, Align_Spec *espec,
+ int diag, int anti, int lbord, int hbord, int prefix) /* Find_Extension:
+   Extend an alignment from the point on diagonal 'diag' and anti-diagonal 'anti', i.e.
+   a-index (anti+diag)/2 and b-index (anti-diag)/2.  If 'prefix' is non-zero the
+   extension runs backward (reverse_extend) and the given point becomes the end of the
+   path; otherwise it runs forward (forward_extend) and the point becomes the start.
+   lbord/hbord limit the band to diagonals [diag-lbord,diag+hbord]; a negative value
+   means no limit on that side.  The result is left in align->path.  Returns 0; failures
+   leave through EXIT(1) (macro defined outside this block).
+   Fix: the a and b coordinates of the given point were stored swapped. */
+{ _Work_Data *work = ( _Work_Data *) ework;
+ _Align_Spec *spec = (_Align_Spec *) espec;
+ Path *apath;
+ int minp, maxp;
+ { int alen, blen;
+ int maxtp, wsize;
+ alen = align->alen;
+ blen = align->blen;
+ wsize = VectorEn*10000; /* make sure the wave vectors start with room for 10000 slots */
+ if (wsize >= work->vecmax)
+ if (enlarge_vector(work,wsize))
+ EXIT(1);
+ if (alen < blen) /* bound on the number of trace values, taken from the longer sequence */
+ maxtp = 2*(blen/spec->trace_space+2);
+ else
+ maxtp = 2*(alen/spec->trace_space+2);
+ wsize = 2*maxtp*sizeof(uint16);
+ if (wsize > work->pntmax)
+ if (enlarge_points(work,wsize))
+ EXIT(1);
+ apath = align->path;
+ apath->trace = ((uint16 *) work->points) + maxtp; /* middle of the buffer: the reverse pass writes below it, the forward pass at and above it */
+ }
+ printf("\n");
+ if (lbord < 0)
+ minp = -INT32_MAX;
+ else
+ minp = diag-lbord;
+ if (hbord < 0)
+ maxp = INT32_MAX;
+ else
+ maxp = diag+hbord;
+ if (prefix)
+ { if (reverse_extend(work,spec,align,diag,anti,minp,maxp))
+ EXIT(1);
+ apath->aepos = (anti+diag)/2; /* a-index = y+diag with y = (anti-diag)/2, matching the extend routines and the printf below */
+ apath->bepos = (anti-diag)/2;
+ printf("E1 (%d,%d) => (%d,%d) %d\n",
+ (anti+diag)/2,(anti-diag)/2,apath->abpos,apath->bbpos,apath->diffs);
+ }
+ else
+ { if (forward_extend(work,spec,align,diag,anti,minp,maxp))
+ EXIT(1);
+ apath->abpos = (anti+diag)/2; /* same correction for the forward case */
+ apath->bbpos = (anti-diag)/2;
+ printf("F1 (%d,%d) => (%d,%d) %d\n",
+ (anti+diag)/2,(anti-diag)/2,apath->aepos,apath->bepos,apath->diffs);
+ }
+ { uint16 *trace = (uint16 *) apath->trace; /* dump the resulting path and its trace pairs to stdout */
+ int a, h;
+ printf("\nA-path (%d,%d)->(%d,%d)",apath->abpos,apath->bbpos,apath->aepos,apath->bepos);
+ printf(" %c\n",(COMP(align->flags) ? 'c' : 'n'));
+ a = apath->bbpos;
+ for (h = 1; h < apath->tlen; h += 2)
+ { int dif = trace[h-1];
+ int del = trace[h];
+ a += del; /* running b-index after each trace segment */
+ printf(" %d / %d (%d)\n",dif,del,a);
+ }
+ }
+ return (0);
+/* ---- Overlap records: reading, writing, trace packing, printing, checking, flipping ---- */
+static int64 PtrSize = sizeof(void *); /* number of leading bytes of an Overlap that are skipped on read/write */
+static int64 OvlIOSize = sizeof(Overlap) - sizeof(void *); /* bytes of an Overlap actually transferred to/from a file */
+int Read_Overlap(FILE *input, Overlap *ovl) /* Read one overlap record from 'input' into *ovl.
+   The first PtrSize bytes of *ovl are left untouched (presumably the in-memory trace
+   pointer -- confirm against the Overlap declaration); the remaining OvlIOSize bytes
+   are filled from the file.  Returns 1 if the record could not be read, else 0. */
+{ if (fread( ((char *) ovl) + PtrSize, OvlIOSize, 1, input) != 1)
+ return (1);
+ return (0);
+int Read_Trace(FILE *input, Overlap *ovl, int tbytes) /* Read the trace of *ovl from 'input'.
+   Reads tbytes*tlen bytes into the buffer ovl->path.trace already points to; nothing
+   is allocated here.  Nothing is read when tbytes or tlen is not positive.
+   Returns 1 if the read fails, else 0. */
+{ if (tbytes > 0 && ovl->path.tlen > 0)
+ { if (fread(ovl->path.trace, tbytes*ovl->path.tlen, 1, input) != 1)
+ return (1);
+ }
+ return (0);
+void Write_Overlap(FILE *output, Overlap *ovl, int tbytes) /* Write *ovl to 'output'.
+   The record is written without its first PtrSize bytes, followed by tlen trace
+   entries of tbytes bytes each when the trace pointer is not NULL.
+   NOTE(review): the fwrite results are not checked, so write errors go unreported. */
+{ fwrite( ((char *) ovl) + PtrSize, OvlIOSize, 1, output);
+ if (ovl->path.trace != NULL)
+ fwrite(ovl->path.trace,tbytes,ovl->path.tlen,output);
+void Compress_TraceTo8(Overlap *ovl) /* Narrow the trace of *ovl in place from uint16 entries
+   to uint8 entries.  Each value is simply cast to uint8; no range check is made. */
+{ uint16 *t16 = (uint16 *) ovl->path.trace;
+ uint8 *t8 = (uint8 *) ovl->path.trace; /* same buffer, viewed with the narrower element type */
+ int j;
+ for (j = 0; j < ovl->path.tlen; j++) /* ascending order: the narrow write at j never passes the wide read at j */
+ t8[j] = (uint8) (t16[j]);
+void Decompress_TraceTo16(Overlap *ovl) /* Widen the trace of *ovl in place from uint8 entries
+   to uint16 entries.  The buffer must already be large enough for tlen uint16 values;
+   this function does not allocate. */
+{ uint16 *t16 = (uint16 *) ovl->path.trace;
+ uint8 *t8 = (uint8 *) ovl->path.trace; /* same buffer, viewed with the narrower element type */
+ int j;
+ for (j = ovl->path.tlen-1; j >= 0; j--) /* descending order so no unread narrow entry is overwritten */
+ t16[j] = t8[j];
+void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent) /* Print a textual
+   description of *ovl to 'output', every line indented by 'indent' blanks: the two read
+   numbers (the b-read wrapped in c(...) when COMP(flags) is set), the aligned intervals
+   with the difference count, and, if the trace pointer is not NULL, the trace pairs as
+   "diffs/running b-position".  tbytes == 1 selects uint8 trace entries, any other value
+   uint16 entries. */
+{ int i;
+ fprintf(output,"%*s%d vs. ",indent,"",ovl->aread);
+ if (COMP(ovl->flags))
+ fprintf(output,"c(%d)\n",ovl->bread);
+ else
+ fprintf(output,"%d\n",ovl->bread);
+ fprintf(output,"%*s [%d,%d] vs [%d,%d] w. %d diffs\n",indent,"",
+ ovl->path.abpos,ovl->path.aepos,ovl->path.bbpos,ovl->path.bepos,ovl->path.diffs);
+ if (tbytes == 1)
+ { uint8 *trace = (uint8 *) (ovl->path.trace);
+ if (trace != NULL)
+ { int p = ovl->path.bbpos + trace[1]; /* NOTE(review): trace[0] and trace[1] are read without checking tlen >= 2 */
+ fprintf(output,"%*sTrace: %3d/%5d",indent,"",trace[0],p);
+ for (i = 3; i < ovl->path.tlen; i += 2) /* odd slots hold b-lengths, the preceding even slots diff counts */
+ { if (i%10 == 0) /* NOTE(review): i is always odd here, so this line break is never taken */
+ fprintf(output,"\n%*s",indent+6,"");
+ p += trace[i];
+ fprintf(output," %3d/%5d",trace[i-1],p);
+ }
+ fprintf(output,"\n");
+ }
+ }
+ else
+ { uint16 *trace = (uint16 *) (ovl->path.trace);
+ if (trace != NULL)
+ { int p = ovl->path.bbpos + trace[1]; /* NOTE(review): same unguarded read as in the 8-bit branch */
+ fprintf(output,"%*sTrace: %3d/%5d",indent,"",trace[0],p);
+ for (i = 3; i < ovl->path.tlen; i += 2)
+ { if (i%10 == 0) /* NOTE(review): never true for odd i, as above */
+ fprintf(output,"\n%*s",indent+6,"");
+ p += trace[i];
+ fprintf(output," %3d/%5d",trace[i-1],p);
+ }
+ fprintf(output,"\n");
+ }
+ }
+int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname) /* Sanity-check the
+   trace of *ovl against its interval for trace spacing 'tspace'.  Two tests are made:
+   the number of trace values must match the number of tspace-panels spanned by
+   [abpos,aepos], and bbpos plus the sum of the odd-indexed trace values must equal
+   bepos.  Returns 1 on the first failed test (reporting it through EPRINTF with 'fname'
+   when 'verbose' is set), else 0. */
+{ int i, p;
+ if (((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace)*2 != ovl->path.tlen-2) /* two values per panel */
+ { if (verbose)
+ EPRINTF(EPLACE," %s: Wrong number of trace points\n",fname);
+ return (1);
+ }
+ p = ovl->path.bbpos;
+ if (tspace <= TRACE_XOVR) /* at or below TRACE_XOVR the trace is read as 8-bit entries, above it as 16-bit */
+ { uint8 *trace8 = (uint8 *) ovl->path.trace;
+ for (i = 1; i < ovl->path.tlen; i += 2)
+ p += trace8[i];
+ }
+ else
+ { uint16 *trace16 = (uint16 *) ovl->path.trace;
+ for (i = 1; i < ovl->path.tlen; i += 2)
+ p += trace16[i];
+ }
+ if (p != ovl->path.bepos)
+ { if (verbose)
+ EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname);
+ return (1);
+ }
+ return (0);
+void Flip_Alignment(Alignment *align, int full) /* Exchange the roles of the a- and b-sequence
+   in *align, in place: the sequence pointers and lengths are swapped and the path
+   coordinates are rewritten to match.  When COMP(flags) is set the coordinates are also
+   mirrored against the sequence lengths.  If 'full' is non-zero the trace is rewritten
+   too; it is read here as an array of int (not as uint16 pairs). */
+{ char *aseq = align->aseq;
+ char *bseq = align->bseq;
+ int alen = align->alen;
+ int blen = align->blen;
+ Path *path = align->path;
+ int comp = COMP(align->flags);
+ int *trace = (int *) path->trace;
+ int tlen = path->tlen;
+ int i, j, p;
+ if (comp)
+ { p = path->abpos; /* new a-interval is the old b-interval mirrored in blen, and vice versa */
+ path->abpos = blen - path->bepos;
+ path->bepos = alen - p;
+ p = path->aepos;
+ path->aepos = blen - path->bbpos;
+ path->bbpos = alen - p;
+ if (full)
+ { alen += 2; /* lengths are bumped by 2 only for the remapping below and restored afterwards */
+ blen += 2;
+ for (i = 0; i < tlen; i++) /* negative and non-negative entries are remapped with opposite signs */
+ if ((p = trace[i]) < 0)
+ trace[i] = alen + p;
+ else
+ trace[i] = p - blen;
+ i = tlen-1;
+ j = 0;
+ while (j < i) /* reverse the order of the trace entries */
+ { p = trace[i];
+ trace[i] = trace[j];
+ trace[j] = p;
+ i -= 1;
+ j += 1;
+ }
+ alen -= 2;
+ blen -= 2;
+ }
+ }
+ else
+ { p = path->abpos; /* same orientation: just swap a- and b-coordinates */
+ path->abpos = path->bbpos;
+ path->bbpos = p;
+ p = path->aepos;
+ path->aepos = path->bepos;
+ path->bepos = p;
+ if (full)
+ for (i = 0; i < tlen; i++) /* the sign of an entry says which sequence it refers to, so negate each */
+ trace[i] = - (trace[i]);
+ }
+ align->aseq = bseq;
+ align->bseq = aseq;
+ align->alen = blen;
+ align->blen = alen;
+/* ---- Sequence complementing and alignment printing ---- */
+/* Complement the sequence in fragment aseq. The operation does the
+ complementation/reversal in place. Calling it a second time on a
+ given fragment restores it to its original state. */
+void Complement_Seq(char *aseq, int len) /* Reverse aseq[0..len-1] in place while replacing
+   every value v by 3-v.  Elements are expected to be in the range 0..3 for the result
+   to stay in that range. */
+{ char *s, *t;
+ int c;
+ s = aseq;
+ t = aseq + (len-1);
+ while (s < t) /* walk inward from both ends, swapping the complemented values */
+ { c = 3 - *s;
+ *s++ = (char) (3 - *t);
+ *t-- = (char) c;
+ }
+ if (s == t) /* odd length: the middle element stays in place and is only complemented */
+ *s = (char) (3 - *s);
+/* Print an alignment to file between a and b given in trace (unpacked).
+ Prefix gives the length of the initial prefix of a that is unaligned. */
+static char ToL[8] = { 'a', 'c', 'g', 't', '.', '[', ']', '-' }; /* lower-case output letters: 0-3 bases, 4 '.', 5 '[', 6 ']', 7 '-' */
+static char ToU[8] = { 'A', 'C', 'G', 'T', '.', '[', ']', '-' }; /* same table with upper-case bases */
+int Print_Alignment(FILE *file, Alignment *align, Work_Data *ework,
+ int indent, int width, int border, int upper, int coord) /* Print_Alignment:
+   Write the alignment in *align to 'file' as rows of three text lines (a-sequence,
+   match/difference markers, b-sequence), 'width' columns per row and every line
+   indented by 'indent' blanks.  Up to 'border' unaligned symbols are shown before and
+   after the aligned part.  'upper' selects upper- or lower-case letters.  If 'coord' is
+   positive each row is prefixed by its starting coordinates in a field of that width.
+   Each completed row ends with the percentage of difference columns in it.
+   The trace is read as an array of int: an entry -p adds a dash in the a-row once the
+   a-position reaches p, an entry p adds a dash in the b-row once the b-position reaches p.
+   Returns 0 (immediately, printing nothing, when the trace pointer is NULL); a failed
+   enlarge_vector leaves through EXIT(1) (macro defined outside this block). */
+{ _Work_Data *work = (_Work_Data *) ework;
+ int *trace = align->path->trace;
+ int tlen = align->path->tlen;
+ char *Abuf, *Bbuf, *Dbuf; /* row buffers: a-symbols, b-symbols, marker line */
+ int i, j, o; /* current a-position, current b-position, columns buffered in the row */
+ char *a, *b;
+ char mtag, dtag; /* marker characters for equal / unequal columns */
+ int prefa, prefb;
+ int aend, bend;
+ int comp, blen;
+ int sa, sb; /* coordinates printed at the start of the current row */
+ int match, diff; /* column counts of the current row, used for the percentage */
+ char *N2A;
+ if (trace == NULL) return (0);
+#ifdef SHOW_TRACE
+ fprintf(file,"\nTrace:\n");
+ for (i = 0; i < tlen; i++)
+ fprintf(file," %3d\n",trace[i]);
+ o = sizeof(char)*3*(width+1); /* three buffers of width+1 characters each */
+ if (o > work->vecmax)
+ if (enlarge_vector(work,o))
+ EXIT(1);
+ if (upper)
+ N2A = ToU;
+ else
+ N2A = ToL;
+ Abuf = (char *) work->vector;
+ Bbuf = Abuf + (width+1);
+ Dbuf = Bbuf + (width+1);
+ aend = align->path->aepos;
+ bend = align->path->bepos;
+ comp = COMP(align->flags);
+ blen = align->blen;
+ Abuf[width] = Bbuf[width] = Dbuf[width] = '\0';
+ /* buffer/output next column */
+#define COLUMN(x,y) \
+{ int u, v; \
+ if (o >= width) \
+ { fprintf(file,"\n"); \
+ fprintf(file,"%*s",indent,""); \
+ if (coord > 0) \
+ { if (sa < aend) \
+ fprintf(file," %*d",coord,sa); \
+ else \
+ fprintf(file," %*s",coord,""); \
+ fprintf(file," %s\n",Abuf); \
+ fprintf(file,"%*s %*s %s\n",indent,"",coord,"",Dbuf); \
+ fprintf(file,"%*s",indent,""); \
+ if (sb < bend) \
+ if (comp) \
+ fprintf(file," %*d",coord,blen-sb); \
+ else \
+ fprintf(file," %*d",coord,sb); \
+ else \
+ fprintf(file," %*s",coord,""); \
+ fprintf(file," %s",Bbuf); \
+ } \
+ else \
+ { fprintf(file," %s\n",Abuf); \
+ fprintf(file,"%*s %s\n",indent,"",Dbuf); \
+ fprintf(file,"%*s %s",indent,"",Bbuf); \
+ } \
+ fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); \
+ o = 0; \
+ sa = i-1; \
+ sb = j-1; \
+ match = diff = 0; \
+ } \
+ u = (x); \
+ v = (y); \
+ if (u == 4 || v == 4) \
+ Dbuf[o] = ' '; \
+ else if (u == v) \
+ Dbuf[o] = mtag; \
+ else \
+ Dbuf[o] = dtag; \
+ Abuf[o] = N2A[u]; \
+ Bbuf[o] = N2A[v]; \
+ o += 1; \
+ a = align->aseq - 1;
+ b = align->bseq - 1;
+ o = 0;
+ i = j = 1;
+ prefa = align->path->abpos;
+ prefb = align->path->bbpos;
+ if (prefa > border) /* show at most 'border' symbols of each unaligned prefix */
+ { i = prefa-(border-1);
+ prefa = border;
+ }
+ if (prefb > border)
+ { j = prefb-(border-1);
+ prefb = border;
+ }
+ sa = i-1;
+ sb = j-1;
+ mtag = ':';
+ dtag = ':';
+ while (prefa > prefb) /* the longer prefix is printed alone (code 4 in the other row) until both have equal length */
+ { COLUMN(a[i],4)
+ i += 1;
+ prefa -= 1;
+ }
+ while (prefb > prefa)
+ { COLUMN(4,b[j])
+ j += 1;
+ prefb -= 1;
+ }
+ while (prefa > 0) /* remaining prefix symbols are printed side by side */
+ { COLUMN(a[i],b[j])
+ i += 1;
+ j += 1;
+ prefa -= 1;
+ }
+ mtag = '[';
+ if (prefb > 0) /* a '[' column marks the start of the aligned part after a printed prefix */
+ COLUMN(5,5)
+ mtag = '|';
+ dtag = '*';
+ match = diff = 0;
+ { int p, c; /* Output columns of alignment til reach trace end */
+ for (c = 0; c < tlen; c++)
+ if ((p = trace[c]) < 0)
+ { p = -p;
+ while (i != p) /* paired columns up to a-position p */
+ { COLUMN(a[i],b[j])
+ if (a[i] == b[j])
+ match += 1;
+ else
+ diff += 1;
+ i += 1;
+ j += 1;
+ }
+ COLUMN(7,b[j]) /* dash in the a-row against b[j] */
+ j += 1;
+ diff += 1;
+ }
+ else
+ { while (j != p) /* paired columns up to b-position p */
+ { COLUMN(a[i],b[j])
+ if (a[i] == b[j])
+ match += 1;
+ else
+ diff += 1;
+ i += 1;
+ j += 1;
+ }
+ COLUMN(a[i],7) /* a[i] against a dash in the b-row */
+ i += 1;
+ diff += 1;
+ }
+ p = align->path->aepos;
+ while (i <= p) /* paired columns after the last trace entry, up to aepos */
+ { COLUMN(a[i],b[j])
+ if (a[i] == b[j])
+ match += 1;
+ else
+ diff += 1;
+ i += 1;
+ j += 1;
+ }
+ }
+ { int c; /* Output remaining column including unaligned suffix */
+ mtag = ']';
+ if (a[i] != 4 && b[j] != 4 && border > 0)
+ COLUMN(6,6)
+ mtag = ':';
+ dtag = ':';
+ c = 0;
+ while (c < border && (a[i] != 4 || b[j] != 4)) /* up to 'border' suffix columns, stopping once both sequences yield 4 */
+ { if (a[i] != 4)
+ if (b[j] != 4)
+ { COLUMN(a[i],b[j])
+ i += 1;
+ j += 1;
+ }
+ else
+ { COLUMN(a[i],4)
+ i += 1;
+ }
+ else
+ { COLUMN(4,b[j])
+ j += 1;
+ }
+ c += 1;
+ }
+ }
+ /* Print remainder of buffered col.s */
+ fprintf(file,"\n");
+ fprintf(file,"%*s",indent,"");
+ if (coord > 0)
+ { if (sa < aend)
+ fprintf(file," %*d",coord,sa);
+ else
+ fprintf(file," %*s",coord,"");
+ fprintf(file," %.*s\n",o,Abuf); /* %.*s prints only the o columns buffered in this partial row */
+ fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf);
+ fprintf(file,"%*s",indent,"");
+ if (sb < bend)
+ if (comp)
+ fprintf(file," %*d",coord,blen-sb);
+ else
+ fprintf(file," %*d",coord,sb);
+ else
+ fprintf(file," %*s",coord,"");
+ fprintf(file," %.*s",o,Bbuf);
+ }
+ else
+ { fprintf(file," %.*s\n",o,Abuf);
+ fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf);
+ fprintf(file,"%*s %.*s",indent,"",o,Bbuf);
+ }
+ if (diff+match > 0) /* no percentage when the last row holds no aligned columns */
+ fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match));
+ else
+ fprintf(file,"\n");
+ fflush(file);
+ return (0);
+int Print_Reference(FILE *file, Alignment *align, Work_Data *ework,
+ int indent, int block, int border, int upper, int coord) /* Print_Reference:
+   Write the alignment in *align to 'file' in the same three-line row format as
+   Print_Alignment, but a row is ended when the a-position i satisfies i%block == 1
+   (and is not the first position shown), rather than after a fixed number of columns.
+   The row buffers are enlarged on demand.  'indent', 'border', 'upper' and 'coord' are
+   used exactly as the corresponding values in the row printing code below shows:
+   indentation, number of unaligned symbols shown on each side, letter case, and width
+   of the coordinate field (none when not positive).
+   Returns 0 (immediately, printing nothing, when the trace pointer is NULL); a failed
+   enlarge_vector leaves through EXIT(1) (macro defined outside this block). */
+{ _Work_Data *work = (_Work_Data *) ework;
+ int *trace = align->path->trace;
+ int tlen = align->path->tlen;
+ char *Abuf, *Bbuf, *Dbuf; /* row buffers: a-symbols, b-symbols, marker line */
+ int i, j, o; /* current a-position, current b-position, columns buffered in the row */
+ char *a, *b;
+ char mtag, dtag; /* marker characters for equal / unequal columns */
+ int prefa, prefb;
+ int aend, bend;
+ int comp, blen;
+ int sa, sb, s0; /* row-start coordinates; s0 is the first a-position shown */
+ int match, diff;
+ char *N2A;
+ int vmax; /* capacity of each of the three row buffers */
+ if (trace == NULL) return (0);
+#ifdef SHOW_TRACE
+ fprintf(file,"\nTrace:\n");
+ for (i = 0; i < tlen; i++)
+ fprintf(file," %3d\n",trace[i]);
+ vmax = work->vecmax/3; /* work->vector is split into three equal buffers */
+ o = sizeof(char)*6*(block+1);
+ if (o > vmax)
+ { if (enlarge_vector(work,3*o))
+ EXIT(1);
+ vmax = work->vecmax/3;
+ }
+ Abuf = (char *) work->vector;
+ Bbuf = Abuf + vmax;
+ Dbuf = Bbuf + vmax;
+ if (upper)
+ N2A = ToU;
+ else
+ N2A = ToL;
+ aend = align->path->aepos;
+ bend = align->path->bepos;
+ comp = COMP(align->flags);
+ blen = align->blen;
+#define BLOCK(x,y) \
+{ int u, v; \
+ if (i%block == 1 && i != s0 && x < 4 && o > 0) \
+ { fprintf(file,"\n"); \
+ fprintf(file,"%*s",indent,""); \
+ if (coord > 0) \
+ { if (sa < aend) \
+ fprintf(file," %*d",coord,sa); \
+ else \
+ fprintf(file," %*s",coord,""); \
+ fprintf(file," %.*s\n",o,Abuf); \
+ fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); \
+ fprintf(file,"%*s",indent,""); \
+ if (sb < bend) \
+ if (comp) \
+ fprintf(file," %*d",coord,blen-sb); \
+ else \
+ fprintf(file," %*d",coord,sb); \
+ else \
+ fprintf(file," %*s",coord,""); \
+ fprintf(file," %.*s",o,Bbuf); \
+ } \
+ else \
+ { fprintf(file," %.*s\n",o,Abuf); \
+ fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); \
+ fprintf(file,"%*s %.*s",indent,"",o,Bbuf); \
+ } \
+ fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); \
+ o = 0; \
+ sa = i-1; \
+ sb = j-1; \
+ match = diff = 0; \
+ } \
+ u = (x); \
+ v = (y); \
+ if (u == 4 || v == 4) \
+ Dbuf[o] = ' '; \
+ else if (u == v) \
+ Dbuf[o] = mtag; \
+ else \
+ Dbuf[o] = dtag; \
+ Abuf[o] = N2A[u]; \
+ Bbuf[o] = N2A[v]; \
+ o += 1; \
+ if (o >= vmax) \
+ { if (enlarge_vector(work,3*o)) \
+ EXIT(1); \
+ vmax = work->vecmax/3; \
+ memmove(work->vector+2*vmax,Dbuf,o); \
+ memmove(work->vector+vmax,Bbuf,o); \
+ memmove(work->vector,Abuf,o); \
+ Abuf = (char *) work->vector; \
+ Bbuf = Abuf + vmax; \
+ Dbuf = Bbuf + vmax; \
+ } \
+ a = align->aseq - 1;
+ b = align->bseq - 1;
+ o = 0;
+ i = j = 1;
+ prefa = align->path->abpos;
+ prefb = align->path->bbpos;
+ if (prefa > border) /* show at most 'border' symbols of each unaligned prefix */
+ { i = prefa-(border-1);
+ prefa = border;
+ }
+ if (prefb > border)
+ { j = prefb-(border-1);
+ prefb = border;
+ }
+ s0 = i;
+ sa = i-1;
+ sb = j-1;
+ mtag = ':';
+ dtag = ':';
+ while (prefa > prefb) /* the longer prefix is printed alone (code 4 in the other row) until both have equal length */
+ { BLOCK(a[i],4)
+ i += 1;
+ prefa -= 1;
+ }
+ while (prefb > prefa)
+ { BLOCK(4,b[j])
+ j += 1;
+ prefb -= 1;
+ }
+ while (prefa > 0) /* remaining prefix symbols are printed side by side */
+ { BLOCK(a[i],b[j])
+ i += 1;
+ j += 1;
+ prefa -= 1;
+ }
+ mtag = '[';
+ if (prefb > 0) /* a '[' column marks the start of the aligned part after a printed prefix */
+ BLOCK(5,5)
+ mtag = '|';
+ dtag = '*';
+ match = diff = 0;
+ { int p, c; /* Output columns of alignment til reach trace end */
+ for (c = 0; c < tlen; c++)
+ if ((p = trace[c]) < 0)
+ { p = -p;
+ while (i != p) /* paired columns up to a-position p */
+ { BLOCK(a[i],b[j])
+ if (a[i] == b[j])
+ match += 1;
+ else
+ diff += 1;
+ i += 1;
+ j += 1;
+ }
+ BLOCK(7,b[j]) /* dash in the a-row against b[j] */
+ j += 1;
+ diff += 1;
+ }
+ else
+ { while (j != p) /* paired columns up to b-position p */
+ { BLOCK(a[i],b[j])
+ if (a[i] == b[j])
+ match += 1;
+ else
+ diff += 1;
+ i += 1;
+ j += 1;
+ }
+ BLOCK(a[i],7) /* a[i] against a dash in the b-row */
+ i += 1;
+ diff += 1;
+ }
+ p = align->path->aepos;
+ while (i <= p) /* paired columns after the last trace entry, up to aepos */
+ { BLOCK(a[i],b[j])
+ if (a[i] == b[j])
+ match += 1;
+ else
+ diff += 1;
+ i += 1;
+ j += 1;
+ }
+ }
+ { int c; /* Output remaining column including unaligned suffix */
+ mtag = ']';
+ if (a[i] != 4 && b[j] != 4 && border > 0)
+ BLOCK(6,6)
+ mtag = ':';
+ dtag = ':';
+ c = 0;
+ while (c < border && (a[i] != 4 || b[j] != 4)) /* up to 'border' suffix columns, stopping once both sequences yield 4 */
+ { if (a[i] != 4)
+ if (b[j] != 4)
+ { BLOCK(a[i],b[j])
+ i += 1;
+ j += 1;
+ }
+ else
+ { BLOCK(a[i],4)
+ i += 1;
+ }
+ else
+ { BLOCK(4,b[j])
+ j += 1;
+ }
+ c += 1;
+ }
+ }
+ /* Print remainder of buffered col.s */
+ fprintf(file,"\n");
+ fprintf(file,"%*s",indent,"");
+ if (coord > 0)
+ { if (sa < aend)
+ fprintf(file," %*d",coord,sa);
+ else
+ fprintf(file," %*s",coord,"");
+ fprintf(file," %.*s\n",o,Abuf); /* %.*s prints only the o columns buffered in this last row */
+ fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf);
+ fprintf(file,"%*s",indent,"");
+ if (sb < bend)
+ if (comp)
+ fprintf(file," %*d",coord,blen-sb);
+ else
+ fprintf(file," %*d",coord,sb);
+ else
+ fprintf(file," %*s",coord,"");
+ fprintf(file," %.*s",o,Bbuf);
+ }
+ else
+ { fprintf(file," %.*s\n",o,Abuf);
+ fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf);
+ fprintf(file,"%*s %.*s",indent,"",o,Bbuf);
+ }
+ if (diff+match > 0) /* no percentage when the last row holds no aligned columns */
+ fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match));
+ else
+ fprintf(file,"\n");
+ fflush(file);
+ return (0);
+/* Print an ASCII representation of the overlap in align between fragments
+ a and b to given file. */
+static inline void repchar(FILE *file, int symbol, int rep)   // Write the character 'symbol' to 'file' 'rep' times; writes nothing when rep <= 0
+{ while (rep-- > 0)
+ fputc(symbol,file);
+void Alignment_Cartoon(FILE *file, Alignment *align, int indent, int coord)   // Print a 4-line ASCII cartoon of the overlap described by 'align' on 'file'
+{ int alen = align->alen;   // every output line starts with 'indent' blanks; 'coord' is the field width used for coordinates
+ int blen = align->blen;
+ Path *path = align->path;
+ int comp = COMP(align->flags);   // comp > 0 selects the '<' glyphs for the B line further down
+ int w;
+ fprintf(file,"%*s",indent,"");   // Line 1: sizes of the unaligned prefix and suffix of A, each printed only when non-empty
+ if (path->abpos > 0)
+ fprintf(file," %*d ",coord,path->abpos);
+ else
+ fprintf(file,"%*s",coord+5,"");
+ if (path->aepos < alen)
+ fprintf(file,"%*s%d",coord+8,"",alen-path->aepos);
+ fprintf(file,"\n");
+ fprintf(file,"%*s",indent,"");   // Line 2: read A, drawn with '=' for unaligned ends and '-' for the aligned interval
+ if (path->abpos > 0)
+ { fprintf(file,"A ");
+ w = Number_Digits((int64) path->abpos);   // presumably the printed width of abpos -- confirm against Number_Digits
+ repchar(file,' ',coord-w);
+ repchar(file,'=',w+3);
+ fputc('+',file);
+ repchar(file,'-',coord+5);
+ }
+ else
+ { fprintf(file,"A %*s",coord+4,"");
+ repchar(file,'-',coord+5);
+ }
+ if (path->aepos < alen)
+ { fputc('+',file);
+ w = Number_Digits((int64) (alen-path->aepos));
+ repchar(file,'=',w+2);
+ fputc('>',file);
+ repchar(file,' ',w);
+ }
+ else
+ { fputc('>',file);
+ repchar(file,' ',coord+3);
+ }
+ { int asub, bsub;   // Finish line 2 with the difference rate over the two aligned interval lengths
+ asub = path->aepos - path->abpos;
+ bsub = path->bepos - path->bbpos;
+ fprintf(file," dif/(len1+len2) = %d/(%d+%d) = %5.2f%%\n",
+ path->diffs,asub,bsub,(200.*path->diffs)/(asub+bsub));   // NOTE(review): no guard here for asub+bsub == 0
+ }
+ { int sym1e, sym2e;   // Line 3: read B; the end glyphs depend on comp
+ int sym1p, sym2p;
+ if (comp > 0)
+ { sym1p = '<'; sym2p = '-'; sym1e = '<'; sym2e = '='; }
+ else
+ { sym1p = '-'; sym2p = '>'; sym1e = '='; sym2e = '>'; }
+ fprintf(file,"%*s",indent,"");
+ if (path->bbpos > 0)
+ { fprintf(file,"B ");
+ w = Number_Digits((int64) path->bbpos);
+ repchar(file,' ',coord-w);
+ fputc(sym1e,file);
+ repchar(file,'=',w+2);
+ fputc('+',file);
+ repchar(file,'-',coord+5);
+ }
+ else
+ { fprintf(file,"B ");
+ repchar(file,' ',coord+3);
+ fputc(sym1p,file);
+ repchar(file,'-',coord+5);
+ }
+ if (path->bepos < blen)
+ { fprintf(file,"+");
+ w = Number_Digits((int64) (blen-path->bepos));
+ repchar(file,'=',w+2);
+ fprintf(file,"%c\n",sym2e);
+ }
+ else
+ fprintf(file,"%c\n",sym2p);
+ }
+ fprintf(file,"%*s",indent,"");   // Line 4: sizes of the unaligned prefix and suffix of B, each printed only when non-empty
+ if (path->bbpos > 0)
+ fprintf(file," %*d ",coord,path->bbpos);
+ else
+ fprintf(file,"%*s",coord+5,"");
+ if (path->bepos < blen)
+ fprintf(file,"%*s%d",coord+8,"",blen-path->bepos);
+ fprintf(file,"\n");
+ fflush(file);
+/* *
+* O(ND) trace algorithm *
+* */
+static void print_awave(int *V, int low, int hgh)   // Debug helper: print V[low..hgh] on one stdout line, preceded by the index range
+{ int k;
+ printf(" [%6d,%6d]: ",low,hgh);
+ for (k = low; k <= hgh; k++)
+ printf(" %3d",V[k]);
+ printf("\n");
+ fflush(stdout);   // flush so the dump appears immediately
+static int depth = 0;   // Left margin (in columns) of the debug printf's; dandc_nd adds 2 around its recursive calls
+typedef struct   // State shared by the trace routines in this file
+ { int *Stop; // Ongoing stack of alignment indels
+ char *Aabs, *Babs; // Absolute base of A and B sequences
+ int **PVF, **PHF; // List of waves for iterative np algorithms
+ int mida, midb; // mid point division for mid-point algorithms
+ int *VF, *VB; // Forward/Reverse waves for nd algorithms
+ // (defunct: were used for O(nd) algorithms)
+ } Trace_Waves;
+static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave)   // Recursively align A[0..M) with B[0..N), pushing one entry per indel onto wave->Stop; returns the difference count found
+{ int x, y;
+ int D;
+ printf("%*s %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N);
+ if (M <= 0)   // A is empty: all N symbols of B are insertions
+ { x = (wave->Aabs-A)-1;   // insertions are pushed as the negative value -((A-Aabs)+1)
+ for (y = 1; y <= N; y++)
+ { *wave->Stop++ = x;
+ printf("%*s *I %ld(%ld)\n",depth,"",y+(B-wave->Babs),(A-wave->Aabs)+1);
+ }
+ return (N);
+ }
+ if (N <= 0)   // B is empty: all M symbols of A are deletions
+ { y = (B-wave->Babs)+1;   // deletions are pushed as the positive value (B-Babs)+1
+ for (x = 1; x <= M; x++)
+ { *wave->Stop++ = y;
+ printf("%*s *D %ld(%ld)\n",depth,"",x+(A-wave->Aabs),(B-wave->Babs)+1);
+ }
+ return (M);
+ }
+ { int *VF = wave->VF;
+ int *VB = wave->VB;
+ int flow; // fhgh == D !
+ int blow, bhgh;
+ char *a;
+ y = 0;   // Initial forward wave: length of the common prefix of A and B
+ if (N < M)
+ while (y < N && B[y] == A[y])
+ y += 1;
+ else
+ { while (y < M && B[y] == A[y])
+ y += 1;
+ if (y >= M && N == M)   // equal lengths and every symbol matched: no differences
+ return (0);
+ }
+ flow = 0;
+ VF[0] = y;
+ VF[-1] = -2;
+ x = N-M;   // Initial reverse wave: scan the common suffix; a = A-x lines up the last symbols of A and B
+ a = A-x;
+ y = N-1;
+ if (N > M)
+ while (y >= x && B[y] == a[y])
+ y -= 1;
+ else
+ while (y >= 0 && B[y] == a[y])
+ y -= 1;
+ blow = bhgh = -x;
+ VB += x;
+ VB[blow] = y;
+ VB[blow-1] = N+1;
+ for (D = 1; 1; D += 1)   // Each round extends the forward wave, then the reverse wave, until the two cross
+ { int k, r;
+ int am, ac, ap;
+ // Forward wave
+ flow -= 1;
+ am = ac = VF[flow-1] = -2;
+ a = A + D;
+ x = M - D;
+ for (k = D; k >= flow; k--)
+ { ap = ac;
+ ac = am+1;
+ am = VF[k-1];
+ if (ac < am)   // y = max(ap,ac,am)
+ if (ap < am)
+ y = am;
+ else
+ y = ap;
+ else
+ if (ap < ac)
+ y = ac;
+ else
+ y = ap;
+ if (blow <= k && k <= bhgh)   // k is also covered by the reverse wave: test for a crossing
+ { r = VB[k];
+ if (y > r)
+ { D = (D<<1)-1;   // crossing found in the forward step: the count becomes 2D-1
+ if (ap > r)
+ y = ap;
+ else if (ac > r)
+ y = ac;
+ else
+ y = r+1;
+ x = k+y;
+ goto OVERLAP2;
+ }
+ }
+ if (N < x)   // slide along matching symbols
+ while (y < N && B[y] == a[y])
+ y += 1;
+ else
+ while (y < x && B[y] == a[y])
+ y += 1;
+ VF[k] = y;
+ a -= 1;
+ x += 1;
+ }
+ print_awave(VF,flow,D);
+ // Reverse Wave
+ bhgh += 1;
+ blow -= 1;
+ am = ac = VB[blow-1] = N+1;
+ a = A + bhgh;
+ x = -bhgh;
+ for (k = bhgh; k >= blow; k--)
+ { ap = ac+1;
+ ac = am;
+ am = VB[k-1];
+ if (ac > am)   // y = min(ap,ac,am)
+ if (ap > am)
+ y = am;
+ else
+ y = ap;
+ else
+ if (ap > ac)
+ y = ac;
+ else
+ y = ap;
+ if (flow <= k && k <= D)   // k is also covered by the forward wave: test for a crossing
+ { r = VF[k];
+ if (y <= r)
+ { D = (D << 1);   // crossing found in the reverse step: the count becomes 2D
+ if (ap <= r)
+ y = ap;
+ else if (ac <= r)
+ y = ac;
+ else
+ y = r;
+ x = k+y;
+ goto OVERLAP2;
+ }
+ }
+ y -= 1;
+ if (x > 0)   // slide backwards along matching symbols
+ while (y >= x && B[y] == a[y])
+ y -= 1;
+ else
+ while (y >= 0 && B[y] == a[y])
+ y -= 1;
+ VB[k] = y;
+ a -= 1;
+ x += 1;
+ }
+ print_awave(VB,blow,bhgh);
+ }
+ }
+ printf("%*s (%d,%d) @ %d\n",depth,"",x,y,D);   // NOTE(review): the OVERLAP2 label targeted by both goto's is not visible in this excerpt -- presumably it sits just before this line
+ fflush(stdout);
+ if (D > 1)   // more than one difference: split at (x,y) and solve the two parts recursively
+ {
+ depth += 2;
+ dandc_nd(A,x,B,y,wave);
+ dandc_nd(A+x,M-x,B+y,N-y,wave);
+ depth -= 2;
+ }
+ else if (D == 1)   // exactly one difference: push it directly; the M == N case (substitution) pushes nothing
+ { if (M > N)
+ { *wave->Stop++ = (B-wave->Babs)+y+1;
+ printf("%*s D %ld(%ld)\n",depth,"",(A-wave->Aabs)+x,(B-wave->Babs)+y+1);
+ }
+ else if (M < N)
+ { *wave->Stop++ = (wave->Aabs-A)-x-1;
+ printf("%*s I %ld(%ld)\n",depth,"",(B-wave->Babs)+y,(A-wave->Aabs)+x+1);
+ }
+ else
+ printf("%*s %ld S %ld\n",depth,"",(wave->Aabs-A)+x,(B-wave->Babs)+y);
+ }
+ return (D);
+static int Compute_Trace_ND_ALL(Alignment *align, Work_Data *ework)   // Compute the trace of align->path with dandc_nd, using buffers from 'ework'; returns 0 (EXIT(1) if a buffer cannot be enlarged)
+{ _Work_Data *work = (_Work_Data *) ework;
+ Trace_Waves wave;
+ int L, D;
+ int asub, bsub;
+ Path *path;
+ int *trace;
+ path = align->path;
+ asub = path->aepos-path->abpos;
+ bsub = path->bepos-path->bbpos;
+ if (asub < bsub)   // L = max(asub,bsub)
+ L = bsub;
+ else
+ L = asub;
+ L *= sizeof(int);   // bytes requested for the trace buffer
+ if (L > work->tramax)
+ if (enlarge_trace(work,L))
+ EXIT(1);
+ trace = wave.Stop = ((int *) work->trace);   // indels are pushed from the start of the trace buffer
+ D = 2*(path->diffs + 4)*sizeof(int);   // bytes requested for the two wave vectors
+ if (D > work->vecmax)
+ if (enlarge_vector(work,D))
+ EXIT(1);
+ D = (path->diffs+3)/2;
+ wave.VF = ((int *) work->vector) + (D+1);   // VF and VB point into the vector buffer, offset so that negative indices can be used
+ wave.VB = wave.VF + (2*D+1);
+ wave.Aabs = align->aseq;
+ wave.Babs = align->bseq;
+ path->diffs = dandc_nd(align->aseq+path->abpos,path->aepos-path->abpos,   // path->diffs is overwritten with the count dandc_nd returns
+ align->bseq+path->bbpos,path->bepos-path->bbpos,&wave);
+ path->trace = trace;
+ path->tlen = wave.Stop - trace;   // number of entries pushed
+ return (0);
+/* *
+* O(NP) tracing algorithms *
+* */
+/* Iterative O(np) algorithm for finding the alignment between two substrings (specified
+ by a Path record). The variation includes handling substitutions and guarantees
+ to find left-most alignments so that low complexity runs are always aligned in
+ the same way. */
+static int ToA[4] = { 'a', 'c', 'g', 't' };   // Letters for the numeric base codes 0..3, used by the debug dumps
+static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode)   // Align A[0..M) with B[0..N) by successive waves, push the indels on wave->Stop, and return D + |M-N|
+{ int **PVF = wave->PVF;   // 'mode' picks the traceback variant: UPPERMOST, LOWERMOST, or (anything else) GREEDIEST
+ int **PHF = wave->PHF;
+ int D;
+ int del = M-N;   // waves are computed until entry 'del' reaches N
+ { int *F0, *F1, *F2;
+ int *HF;
+ int low, hgh;
+ int posl, posh;
+ printf("\n BASE %ld,%ld: %d vs %d\n",A-wave->Aabs,B-wave->Babs,M,N);
+ printf(" A = ");
+ for (D = 0; D < M; D++)
+ printf("%c",ToA[(int) A[D]]);
+ printf("\n");
+ printf(" B = ");
+ for (D = 0; D < N; D++)
+ printf("%c",ToA[(int) B[D]]);
+ printf("\n");
+ if (del >= 0)   // the initial index range [low,hgh] spans 0 and del
+ { low = 0;
+ hgh = del;
+ }
+ else
+ { low = del;
+ hgh = 0;
+ }
+ posl = -INT32_MAX;   // posl/posh bound how far low/hgh may grow; they are tightened only for self comparisons
+ posh = INT32_MAX;
+ if (wave->Aabs == wave->Babs)   // A and B are taken from the same base sequence
+ { if (B == A)
+ { EPRINTF(EPLACE,"Error: self comparison starts on diagonal 0 (Compute_Trace)\n");
+ EXIT(-1);   // NOTE(review): callers in this file test iter_np's result with < 0
+ }
+ else if (B < A)
+ posl = (B-A)+1;
+ else
+ posh = (B-A)-1;
+ }
+ F1 = PVF[-2];   // rows -2 and -1 serve as the "previous" waves of the first rounds
+ F0 = PVF[-1];
+ for (D = low-1; D <= hgh+1; D++)
+ F1[D] = F0[D] = -2;
+ F0[0] = -1;
+ low += 1;
+ hgh -= 1;
+ for (D = 0; 1; D += 1)   // one wave per round; the loop ends when F0[del] >= N
+ { int k, i, j;
+ int am, ac, ap;
+ char *a;
+ F2 = F1;
+ F1 = F0;
+ F0 = PVF[D];
+ HF = PHF[D];
+ if ((D & 0x1) == 0)   // the range widens by one on each side on even rounds only, within posl/posh
+ { if (low > posl)
+ low -= 1;
+ if (hgh < posh)
+ hgh += 1;
+ }
+ F0[hgh+1] = F0[low-1] = -2;
+#define FS_MOVE(mdir,pdir) \
+ ac = F1[k]+1; \
+ if (ac < am) \
+ if (ap < am) \
+ { HF[k] = mdir; \
+ j = am; \
+ } \
+ else \
+ { HF[k] = pdir; \
+ j = ap; \
+ } \
+ else \
+ if (ap < ac) \
+ { HF[k] = 0; \
+ j = ac; \
+ } \
+ else \
+ { HF[k] = pdir; \
+ j = ap; \
+ } \
+ \
+ if (N < i) \
+ while (j < N && B[j] == a[j]) \
+ j += 1; \
+ else \
+ while (j < i && B[j] == a[j]) \
+ j += 1; \
+ F0[k] = j;
+ j = -2;   // first the entries above del, from hgh downwards
+ a = A + hgh;
+ i = M - hgh;
+ for (k = hgh; k > del; k--)
+ { ap = j+1;
+ am = F2[k-1];
+ FS_MOVE(-1,4)
+ a -= 1;
+ i += 1;
+ }
+ j = -2;   // then the entries below del, from low upwards
+ a = A + low;
+ i = M - low;
+ for (k = low; k < del; k++)
+ { ap = F2[k+1]+1;
+ am = j;
+ FS_MOVE(2,1)
+ a += 1;
+ i -= 1;
+ }
+ ap = F0[del+1]+1;   // last, the entry at the k left by the loop above (presumably k == del)
+ am = j;
+ FS_MOVE(2,4)
+ print_awave(F0,low,hgh);
+ print_awave(HF,low,hgh);
+ if (F0[del] >= N)
+ break;
+ }
+ }
+ { int k, h, m, e, c;   // Traceback over the HF links recorded by FS_MOVE
+ int ap = (wave->Aabs-A)-1;
+ int bp = (B-wave->Babs)+1;
+ PHF[0][0] = 3;   // 3 marks the end of a link chain for the loops below
+ c = N;
+ k = del;
+ e = PHF[D][k];
+ PHF[D][k] = 3;
+ if (mode == UPPERMOST)
+ while (e != 3)
+ { h = k+e;   // decode link e: 2/4 go to k-1/k+1 in the same row; 0 stays on k, one row back; -1/1 go to k-1/k+1, two rows back
+ if (e > 1)
+ h -= 3;
+ else if (e == 0)
+ D -= 1;
+ else
+ D -= 2;
+ if (h < k) // => e = -1 or 2, UPPERMOST
+ { char *a;
+ a = A + k;
+ if (k < 0)
+ m = -k;
+ else
+ m = 0;
+ if (PVF[D][h] <= c)
+ c = PVF[D][h]-1;
+ while (c >= m && a[c] == B[c])
+ c -= 1;
+ if (e == -1) // => edge is 2, others are 1, and 0
+ { if (c <= PVF[D+2][k+1])
+ { e = 4;
+ h = k+1;
+ D = D+2;
+ }
+ else if (c == PVF[D+1][k])
+ { e = 0;
+ h = k;
+ D = D+1;
+ }
+ else
+ PVF[D][h] = c+1;
+ }
+ else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise)
+ { if (k == del)
+ m = D;
+ else
+ m = D-2;
+ if (c <= PVF[m][k+1])
+ { if (k == del)
+ e = 4;
+ else
+ e = 1;
+ h = k+1;
+ D = m;
+ }
+ else if (c == PVF[D-1][k])
+ { e = 0;
+ h = k;
+ D = D-1;
+ }
+ else
+ PVF[D][h] = c+1;
+ }
+ }
+ m = PHF[D][h];   // store the link just followed in the cell reached, and continue with that cell's previous link
+ PHF[D][h] = e;
+ e = m;
+ k = h;
+ }
+ else if (mode == LOWERMOST)
+ while (e != 3)
+ { h = k+e;   // same link decoding as in the UPPERMOST loop
+ if (e > 1)
+ h -= 3;
+ else if (e == 0)
+ D -= 1;
+ else
+ D -= 2;
+ if (h > k) // => e = 1 or 4, LOWERMOST
+ { char *a;
+ a = A + k;
+ if (k < 0)
+ m = -k;
+ else
+ m = 0;
+ if (PVF[D][h] < c)
+ c = PVF[D][h];
+ while (c >= m && a[c] == B[c])
+ c -= 1;
+ if (e == 1) // => edge is 2, others are 1, and 0
+ { if (c < PVF[D+2][k-1])
+ { e = 2;
+ h = k-1;
+ D = D+2;
+ }
+ else if (c == PVF[D+1][k])
+ { e = 0;
+ h = k;
+ D = D+1;
+ }
+ else
+ PVF[D][h] = c--;
+ }
+ else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise)
+ { if (k == del)
+ m = D;
+ else
+ m = D-2;
+ if (c < PVF[m][k-1])
+ { if (k == del)
+ e = 2;
+ else
+ e = -1;
+ h = k-1;
+ D = m;
+ }
+ else if (c == PVF[D-1][k])
+ { e = 0;
+ h = k;
+ D = D-1;
+ }
+ else
+ PVF[D][h] = c--;
+ }
+ }
+ m = PHF[D][h];
+ PHF[D][h] = e;
+ e = m;
+ k = h;
+ }
+ else // mode == GREEDIEST
+ while (e != 3)
+ { h = k+e;   // follow the recorded links unchanged, only rewriting them for the second pass
+ if (e > 1)
+ h -= 3;
+ else if (e == 0)
+ D -= 1;
+ else
+ D -= 2;
+ m = PHF[D][h];
+ PHF[D][h] = e;
+ e = m;
+ k = h;
+ }
+ k = D = 0;   // Second pass: follow the links rewritten above, starting from cell (0,0), and emit the indels
+ e = PHF[D][k];
+ while (e != 3)
+ { h = k-e;
+ c = PVF[D][k];
+ if (e > 1)
+ h += 3;
+ else if (e == 0)
+ D += 1;
+ else
+ D += 2;
+ if (h > k)
+ printf(" D %d(%d)\n",(c-k)-(ap-1),c+bp);
+ else if (h < k)
+ printf(" I %d(%d)\n",c+(bp-1),(c+k)-ap);
+ else
+ printf(" %d S %d\n",(c+k)-(ap+1),c+(bp-1));
+ if (h > k)   // step to a higher k: push an entry built from bp = (B-Babs)+1
+ *wave->Stop++ = bp+c;
+ else if (h < k)   // step to a lower k: push an entry built from ap = -((A-Aabs)+1); h == k pushes nothing
+ *wave->Stop++ = ap-(c+k);
+ k = h;
+ e = PHF[D][h];
+ }
+ }
+ return (D + abs(del));
+static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode)   // Compute waves as iter_np does, then walk back about half of the differences and store the point reached in wave->mida / wave->midb; returns 0
+{ int **PVF = wave->PVF;
+ int **PHF = wave->PHF;
+ int D;
+ int del = M-N;   // waves are computed until entry 'del' reaches N
+ { int *F0, *F1, *F2;
+ int *HF;
+ int low, hgh;
+ int posl, posh;
+ printf("\n%*s BASE %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N);
+ printf("%*s A = ",depth,"");
+ for (D = 0; D < M; D++)
+ printf("%c",ToA[(int) A[D]]);
+ printf("\n");
+ printf("%*s B = ",depth,"");
+ for (D = 0; D < N; D++)
+ printf("%c",ToA[(int) B[D]]);
+ printf("\n");
+ if (del >= 0)   // the initial index range [low,hgh] spans 0 and del
+ { low = 0;
+ hgh = del;
+ }
+ else
+ { low = del;
+ hgh = 0;
+ }
+ posl = -INT32_MAX;   // posl/posh bound how far low/hgh may grow; they are tightened only for self comparisons
+ posh = INT32_MAX;
+ if (wave->Aabs == wave->Babs)   // A and B are taken from the same base sequence
+ { if (B == A)
+ { EPRINTF(EPLACE,"Error: self comparison starts on diagonal 0 (Compute_Trace)\n");
+ EXIT(1);   // NOTE(review): callers in this file treat any non-zero result of middle_np as failure
+ }
+ else if (B < A)
+ posl = (B-A)+1;
+ else
+ posh = (B-A)-1;
+ }
+ F1 = PVF[-2];   // rows -2 and -1 serve as the "previous" waves of the first rounds
+ F0 = PVF[-1];
+ for (D = low-1; D <= hgh+1; D++)
+ F1[D] = F0[D] = -2;
+ F0[0] = -1;
+ low += 1;
+ hgh -= 1;
+ for (D = 0; 1; D += 1)   // one wave per round; the loop ends when F0[del] >= N
+ { int k, i, j;
+ int am, ac, ap;
+ char *a;
+ F2 = F1;
+ F1 = F0;
+ F0 = PVF[D];
+ HF = PHF[D];
+ if ((D & 0x1) == 0)   // the range widens by one on each side on even rounds only, within posl/posh
+ { if (low > posl)
+ low -= 1;
+ if (hgh < posh)
+ hgh += 1;
+ }
+ F0[hgh+1] = F0[low-1] = -2;
+ j = -2;   // first the entries above del, from hgh downwards (FS_MOVE is the macro defined inside iter_np)
+ a = A + hgh;
+ i = M - hgh;
+ for (k = hgh; k > del; k--)
+ { ap = j+1;
+ am = F2[k-1];
+ FS_MOVE(-1,4)
+ a -= 1;
+ i += 1;
+ }
+ j = -2;   // then the entries below del, from low upwards
+ a = A + low;
+ i = M - low;
+ for (k = low; k < del; k++)
+ { ap = F2[k+1]+1;
+ am = j;
+ FS_MOVE(2,1)
+ a += 1;
+ i -= 1;
+ }
+ ap = F0[del+1]+1;   // last, the entry at the k left by the loop above (presumably k == del)
+ am = j;
+ FS_MOVE(2,4)
+ print_awave(F0,low,hgh);
+ print_awave(HF,low,hgh);
+ if (F0[del] >= N)
+ break;
+ }
+ }
+ { int k, h, m, e, c;   // Partial traceback: each loop below runs d - d/2 steps, where d = D + |del|
+ int d, f;
+ d = D + abs(del);
+ c = N;
+ k = del;
+ if (mode == UPPERMOST)
+ for (f = d/2; d > f; d--)
+ { e = PHF[D][k];
+ h = k+e;   // decode link e: 2/4 go to k-1/k+1 in the same row; 0 stays on k, one row back; -1/1 go to k-1/k+1, two rows back
+ if (e > 1)
+ h -= 3;
+ else if (e == 0)
+ D -= 1;
+ else
+ D -= 2;
+ if (h < k) // => e = -1 or 2, UPPERMOST
+ { char *a;
+ a = A + k;
+ if (k < 0)
+ m = -k;
+ else
+ m = 0;
+ if (PVF[D][h] <= c)
+ c = PVF[D][h]-1;
+ while (c >= m && a[c] == B[c])
+ c -= 1;
+ if (e == -1) // => edge is 2, others are 1, and 0
+ { if (c <= PVF[D+2][k+1])
+ { e = 4;
+ h = k+1;
+ D = D+2;
+ }
+ else if (c == PVF[D+1][k])
+ { e = 0;
+ h = k;
+ D = D+1;
+ }
+ else
+ PVF[D][h] = c+1;
+ }
+ else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise)
+ { if (k == del)
+ m = D;
+ else
+ m = D-2;
+ if (c <= PVF[m][k+1])
+ { if (k == del)
+ e = 4;
+ else
+ e = 1;
+ h = k+1;
+ D = m;
+ }
+ else if (c == PVF[D-1][k])
+ { e = 0;
+ h = k;
+ D = D-1;
+ }
+ else
+ PVF[D][h] = c+1;
+ }
+ }
+ k = h;
+ }
+ else if (mode == LOWERMOST)
+ for (f = d/2; d > f; d--)
+ { e = PHF[D][k];
+ h = k+e;   // same link decoding as in the UPPERMOST loop
+ if (e > 1)
+ h -= 3;
+ else if (e == 0)
+ D -= 1;
+ else
+ D -= 2;
+ if (h > k) // => e = 1 or 4, LOWERMOST
+ { char *a;
+ a = A + k;
+ if (k < 0)
+ m = -k;
+ else
+ m = 0;
+ if (PVF[D][h] < c)
+ c = PVF[D][h];
+ while (c >= m && a[c] == B[c])
+ c -= 1;
+ if (e == 1) // => edge is 2, others are 1, and 0
+ { if (c < PVF[D+2][k-1])
+ { e = 2;
+ h = k-1;
+ D = D+2;
+ }
+ else if (c == PVF[D+1][k])
+ { e = 0;
+ h = k;
+ D = D+1;
+ }
+ else
+ PVF[D][h] = c--;
+ }
+ else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise)
+ { if (k == del)
+ m = D;
+ else
+ m = D-2;
+ if (c < PVF[m][k-1])
+ { if (k == del)
+ e = 2;
+ else
+ e = -1;
+ h = k-1;
+ D = m;
+ }
+ else if (c == PVF[D-1][k])
+ { e = 0;
+ h = k;
+ D = D-1;
+ }
+ else
+ PVF[D][h] = c--;
+ }
+ }
+ k = h;
+ }
+ else // mode == GREEDIEST
+ for (f = d/2; d > f; d--)
+ { e = PHF[D][k];
+ h = k+e;   // follow the recorded links unchanged
+ if (e > 1)
+ h -= 3;
+ else if (e == 0)
+ D -= 1;
+ else
+ D -= 2;
+ k = h;
+ }
+ wave->midb = (B-wave->Babs) + PVF[D][k];   // the point reached, expressed as offsets from Babs / Aabs
+ wave->mida = (A-wave->Aabs) + k + PVF[D][k];
+ }
+ return (0);
+* *
+* *
+int Compute_Trace_ALL(Alignment *align, Work_Data *ework)   // Compute the trace of align->path with a single GREEDIEST iter_np call over the whole aligned interval; returns 0
+{ _Work_Data *work = (_Work_Data *) ework;
+ Trace_Waves wave;
+ Path *path;
+ char *aseq, *bseq;
+ int M, N, D;
+ path = align->path;
+ aseq = align->aseq;
+ bseq = align->bseq;
+ M = path->aepos-path->abpos;   // lengths of the aligned intervals of A and B
+ N = path->bepos-path->bbpos;
+ { int64 s;
+ int d;
+ int dmax;
+ int **PVF, **PHF;
+ if (M < N)   // s = max(M,N)
+ s = N;
+ else
+ s = M;
+ s *= sizeof(int);   // bytes requested for the trace buffer
+ if (s > work->tramax)
+ if (enlarge_trace(work,s))
+ EXIT(1);
+ dmax = path->diffs - abs(M-N);
+ s = (dmax+3)*2*((M+N+3)*sizeof(int) + sizeof(int *));   // bytes for 2*(dmax+3) rows of M+N+3 ints plus one row pointer each
+ if (s > 256000000)   // wave storage would exceed 256,000,000 bytes: use the ND routine instead
+ return (Compute_Trace_ND_ALL(align,ework));
+ if (s > work->vecmax)
+ if (enlarge_vector(work,s))
+ EXIT(1);
+ wave.PVF = PVF = ((int **) (work->vector)) + 2;   // +2 so that rows -2 and -1 can be addressed
+ wave.PHF = PHF = PVF + (dmax+3);
+ s = M+N+3;   // s is now the row length in ints
+ PVF[-2] = ((int *) (PHF + (dmax+1))) + (N+1);   // int rows start after both pointer tables; offset by N+1, presumably so negative wave indices fall inside the row
+ for (d = -1; d <= dmax; d++)
+ PVF[d] = PVF[d-1] + s;
+ PHF[-2] = PVF[dmax] + s;
+ for (d = -1; d <= dmax; d++)
+ PHF[d] = PHF[d-1] + s;
+ }
+ wave.Stop = ((int *) work->trace);   // indels are pushed from the start of the trace buffer
+ wave.Aabs = aseq;
+ wave.Babs = bseq;
+ D = iter_np(aseq+path->abpos,M,bseq+path->bbpos,N,&wave,GREEDIEST);
+ if (D < 0)   // negative result signals an error inside iter_np
+ EXIT(1);
+ path->diffs = D;
+ path->trace = work->trace;
+ path->tlen = wave.Stop - ((int *) path->trace);   // number of entries pushed
+ return (0);
+int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int mode)   // Compute the trace of align->path segment by segment, cutting A every 'trace_spacing' and B by the lengths stored in the incoming trace; returns 0
+{ _Work_Data *work = (_Work_Data *) ework;
+ Trace_Waves wave;
+ Path *path;
+ char *aseq, *bseq;
+ uint16 *points;
+ int tlen;
+ int ab, bb;
+ int ae, be;
+ int diffs;
+ path = align->path;
+ aseq = align->aseq;
+ bseq = align->bseq;
+ tlen = path->tlen;
+ points = (uint16 *) path->trace;   // on entry the trace is read as tlen uint16 values, taken in pairs
+ { int64 s;
+ int d;
+ int M, N;
+ int dmax, nmax;
+ int **PVF, **PHF;
+ M = path->aepos-path->abpos;
+ N = path->bepos-path->bbpos;
+ if (M < N)   // s = bytes for max(M,N) ints
+ s = N*sizeof(int);
+ else
+ s = M*sizeof(int);
+ if (s > work->tramax)
+ if (enlarge_trace(work,s))
+ EXIT(1);
+ nmax = 0;
+ dmax = 0;
+ for (d = 1; d < tlen; d += 2)   // dmax = largest even-indexed value, nmax = largest odd-indexed value
+ { if (points[d-1] > dmax)
+ dmax = points[d-1];
+ if (points[d] > nmax)
+ nmax = points[d];
+ }
+ if (tlen <= 1)   // no complete pair: size the rows for the whole B interval
+ nmax = N;
+ s = (dmax+3)*2*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *));   // bytes for 2*(dmax+3) rows of trace_spacing+nmax+3 ints plus one row pointer each
+ if (s > work->vecmax)
+ if (enlarge_vector(work,s))
+ EXIT(1);
+ wave.PVF = PVF = ((int **) (work->vector)) + 2;   // +2 so that rows -2 and -1 can be addressed
+ wave.PHF = PHF = PVF + (dmax+3);
+ s = trace_spacing+nmax+3;   // s is now the row length in ints
+ PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1);   // int rows start after both pointer tables, offset by nmax+1
+ for (d = -1; d <= dmax; d++)
+ PVF[d] = PVF[d-1] + s;
+ PHF[-2] = PVF[dmax] + s;
+ for (d = -1; d <= dmax; d++)
+ PHF[d] = PHF[d-1] + s;
+ }
+ wave.Stop = (int *) (work->trace);   // indels are pushed from the start of the trace buffer
+ wave.Aabs = aseq;
+ wave.Babs = bseq;
+ { int i, d;
+ diffs = 0;
+ ab = path->abpos;
+ ae = (ab/trace_spacing)*trace_spacing;   // abpos rounded down to a multiple of trace_spacing
+ bb = path->bbpos;
+ tlen -= 2;   // the last pair is not used in the loop: the final segment ends at (aepos,bepos) instead
+ for (i = 1; i < tlen; i += 2)
+ { ae = ae + trace_spacing;
+ be = bb + points[i];   // odd-indexed values give the B length of each segment
+ d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode);
+ if (d < 0)
+ EXIT(1);
+ diffs += d;
+ ab = ae;
+ bb = be;
+ }
+ ae = path->aepos;
+ be = path->bepos;
+ d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode);
+ if (d < 0)
+ EXIT(1);
+ diffs += d;
+ }
+ path->trace = work->trace;   // the path now refers to the computed trace and the summed difference count
+ path->tlen = wave.Stop - ((int *) path->trace);
+ path->diffs = diffs;
+ return (0);
+int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing, int mode)   // Compute the trace of align->path by aligning between the mid-points that middle_np finds in successive trace_spacing panels; returns 0
+{ _Work_Data *work = (_Work_Data *) ework;
+ Trace_Waves wave;
+ Path *path;
+ char *aseq, *bseq;
+ uint16 *points;
+ int tlen;
+ int ab, bb;
+ int ae, be;
+ int diffs;
+ path = align->path;
+ aseq = align->aseq;
+ bseq = align->bseq;
+ tlen = path->tlen;
+ points = (uint16 *) path->trace;   // on entry the trace is read as tlen uint16 values, taken in pairs
+ { int64 s;
+ int d;
+ int M, N;
+ int dmax, nmax;
+ int **PVF, **PHF;
+ M = path->aepos-path->abpos;
+ N = path->bepos-path->bbpos;
+ if (M < N)   // s = bytes for max(M,N) ints
+ s = N*sizeof(int);
+ else
+ s = M*sizeof(int);
+ if (s > work->tramax)
+ if (enlarge_trace(work,s))
+ EXIT(1);
+ nmax = 0;
+ dmax = 0;
+ for (d = 1; d < tlen; d += 2)   // dmax = largest even-indexed value, nmax = largest odd-indexed value
+ { if (points[d-1] > dmax)
+ dmax = points[d-1];
+ if (points[d] > nmax)
+ nmax = points[d];
+ }
+ if (tlen <= 1)   // no complete pair: size the rows for the whole B interval
+ nmax = N;
+ s = (dmax+3)*4*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *));   // twice the request made by Compute_Trace_PTS for the same dmax/nmax
+ if (s > work->vecmax)
+ if (enlarge_vector(work,s))
+ EXIT(1);
+ wave.PVF = PVF = ((int **) (work->vector)) + 2;   // +2 so that rows -2 and -1 can be addressed
+ wave.PHF = PHF = PVF + (dmax+3);
+ s = trace_spacing+nmax+3;   // s is now the row length in ints
+ PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1);   // int rows start after both pointer tables, offset by nmax+1
+ for (d = -1; d <= dmax; d++)
+ PVF[d] = PVF[d-1] + s;
+ PHF[-2] = PVF[dmax] + s;
+ for (d = -1; d <= dmax; d++)
+ PHF[d] = PHF[d-1] + s;
+ }
+ wave.Stop = ((int *) work->trace);   // indels are pushed from the start of the trace buffer
+ wave.Aabs = aseq;
+ wave.Babs = bseq;
+ { int i, d;
+ int as, bs;   // (as,bs): start of the next stretch to align, i.e. the previous mid-point
+ int af, bf;   // (af,bf): mid-point of the current panel
+ diffs = 0;
+ ab = as = af = path->abpos;
+ ae = (ab/trace_spacing)*trace_spacing;   // abpos rounded down to a multiple of trace_spacing
+ bb = bs = bf = path->bbpos;
+ tlen -= 2;   // the last pair is not used in the loop: the final panel ends at (aepos,bepos) instead
+ for (i = 1; i < tlen; i += 2)
+ { ae = ae + trace_spacing;
+ be = bb + points[i];   // odd-indexed values give the B length of each panel
+ if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode))   // find the mid-point of this panel
+ EXIT(1);
+ af = wave.mida;
+ bf = wave.midb;
+ d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode);   // align from the previous mid-point to this one
+ if (d < 0)
+ EXIT(1);
+ diffs += d;
+ ab = ae;
+ bb = be;
+ as = af;
+ bs = bf;
+ }
+ ae = path->aepos;
+ be = path->bepos;
+ if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode))   // mid-point of the final panel
+ EXIT(1);
+ af = wave.mida;
+ bf = wave.midb;
+ d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode);
+ if (d < 0)
+ EXIT(1);
+ diffs += d;
+ as = af;
+ bs = bf;
+ d = iter_np(aseq+af,ae-as,bseq+bf,be-bs,&wave,mode);   // closing stretch, from the last mid-point to (aepos,bepos); plain assignment so the previous count is not added to diffs twice and a negative error return stays negative
+ if (d < 0)
+ EXIT(1);
+ diffs += d;
+ }
+ path->trace = work->trace;   // the path now refers to the computed trace and the summed difference count
+ path->tlen = wave.Stop - ((int *) path->trace);
+ path->diffs = diffs;
+ return (0);
+int Compute_Trace_IRR(Alignment *align, Work_Data *ework, int mode)   // Compute the trace of align->path segment by segment, where the incoming trace gives an (A length, B length) pair per segment; returns 0
+{ _Work_Data *work = (_Work_Data *) ework;
+ Trace_Waves wave;
+ Path *path;
+ char *aseq, *bseq;
+ uint16 *points;
+ int tlen;
+ int ab, bb;
+ int ae, be;
+ int diffs;
+ path = align->path;
+ aseq = align->aseq;
+ bseq = align->bseq;
+ tlen = path->tlen;
+ points = (uint16 *) path->trace;   // on entry the trace is read as tlen uint16 values, taken in pairs
+ { int64 s;
+ int d;
+ int M, N;
+ int mmax, nmax, dmax;
+ int **PVF, **PHF;
+ M = path->aepos-path->abpos;
+ N = path->bepos-path->bbpos;
+ if (M < N)   // s = bytes for max(M,N) ints
+ s = N*sizeof(int);
+ else
+ s = M*sizeof(int);
+ if (s > work->tramax)
+ if (enlarge_trace(work,s))
+ EXIT(1);
+ nmax = mmax = 0;
+ for (d = 0; d < tlen; d += 2)   // mmax / nmax = largest A / B segment length; NOTE(review): reads points[d+1], so tlen is presumably even
+ { if (points[d] > mmax)
+ mmax = points[d];
+ if (points[d+1] > nmax)
+ nmax = points[d+1];
+ }
+ if (tlen <= 1)
+ { mmax = M;
+ nmax = N;
+ }
+ if (mmax > nmax)   // dmax = min(mmax,nmax)
+ dmax = nmax;
+ else
+ dmax = mmax;
+ s = (dmax+3)*2*((mmax+nmax+3)*sizeof(int) + sizeof(int *));   // bytes for 2*(dmax+3) rows of mmax+nmax+3 ints plus one row pointer each
+ if (s > work->vecmax)
+ if (enlarge_vector(work,s))
+ EXIT(1);
+ wave.PVF = PVF = ((int **) (work->vector)) + 2;   // +2 so that rows -2 and -1 can be addressed
+ wave.PHF = PHF = PVF + (dmax+3);
+ s = mmax+nmax+3;   // s is now the row length in ints
+ PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1);   // int rows start after both pointer tables, offset by nmax+1
+ for (d = -1; d <= dmax; d++)
+ PVF[d] = PVF[d-1] + s;
+ PHF[-2] = PVF[dmax] + s;
+ for (d = -1; d <= dmax; d++)
+ PHF[d] = PHF[d-1] + s;
+ }
+ wave.Stop = (int *) (work->trace);   // indels are pushed from the start of the trace buffer
+ wave.Aabs = aseq;
+ wave.Babs = bseq;
+ { int i, d;
+ diffs = 0;
+ ab = path->abpos;
+ bb = path->bbpos;
+ for (i = 0; i < tlen; i += 2)   // one iter_np call per (A length, B length) pair
+ { ae = ab + points[i];
+ be = bb + points[i+1];
+ d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode);
+ if (d < 0)
+ EXIT(1);
+ diffs += d;
+ ab = ae;
+ bb = be;
+ }
+ }
+ path->trace = work->trace;   // the path now refers to the computed trace and the summed difference count
+ path->tlen = wave.Stop - ((int *) path->trace);
+ path->diffs = diffs;
+ return (0);
diff --git a/align.h b/align.h
new file mode 100644
index 0000000..e937b68
--- /dev/null
+++ b/align.h
@@ -0,0 +1,335 @@
+ *
+ * Local alignment module. Routines for finding local alignments given a seed position,
+ * representing such an l.a. with its interval and a set of pass-thru points, so that
+ * a detailed alignment can be efficiently computed on demand.
+ *
+ * All routines work on a numeric representation of DNA sequences, i.e. 0 for A, 1 for C,
+ * 2 for G, and 3 for T.
+ *
+ * Author: Gene Myers
+ * Date : July 2013
+ *
+ ********************************************************************************************/
+#ifndef _A_MODULE
+#define _A_MODULE
+#include "DB.h"
+#define TRACE_XOVR 125 // If the trace spacing is not more than this value, then can
+ // and do compress traces pts to 8-bit unsigned ints
+/*** INTERACTIVE vs BATCH version
+ The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or
+ batch version of the routines in this library are compiled. In batch mode, routines
+ print an error message and exit. In interactive mode, the routines place the error
+ message in EPLACE (also defined in DB.h) and return an error value, typically NULL
+ if the routine returns a pointer, and an unusual integer value if the routine returns
+ an integer.
+ Below when an error return is described, one should understand that this value is returned
+ only if the routine was compiled in INTERACTIVE mode.
+ Coordinates are *between* characters where 0 is the tick just before the first char,
+ 1 is the tick between the first and second character, and so on. Our data structure
+ is called a Path, referring to its conceptualization in an edit graph.
+ A local alignment is specified by the point '(abpos,bbpos)' at which its path in
+ the underlying edit graph starts, and the point '(aepos,bepos)' at which it ends.
+ In other words A[abpos+1..aepos] is aligned to B[bbpos+1..bepos] (assuming X[1] is
+ the *first* character of X).
+ There are 'diffs' differences in an optimal local alignment between the beginning and
+ end points of the alignment (if computed by Compute_Trace), or nearly so (if computed
+ by Local_Alignment).
+ Optionally, a Path can have additional information about the exact nature of the
+ aligned substrings if the field 'trace' is not NULL. Trace points to either an
+ array of integers (if computed by a Compute_Trace routine), or an array of unsigned
+ short integers (if computed by Local_Alignment).
+ If computed by Local_Alignment 'trace' points at a list of 'tlen' (always even) short
+ values:
+ d_0, b_0, d_1, b_1, ... d_n-1, b_n-1, d_n, b_n
+ to be interpreted as follows. The alignment from (abpos,bbpos) to (aepos,bepos)
+ passes through the n trace points for i in [1,n]:
+ (a_i,b_i) where a_i = floor(abpos/TS)*TS + i*TS
+ and b_i = bbpos + (b_0 + b_1 + ... + b_i-1)
+ where also let a_0,b_0 = abpos,bbpos and a_(n+1),b_(n+1) = aepos,bepos. That is, the
+ interior (i.e. i != 0 and i != n+1) trace points pass through every TS'th position of
+ the aread where TS is the "trace spacing" employed when finding the alignment (see
+ New_Align_Spec). Typically TS is 100. Then d_i is the number of differences in the
+ portion of the alignment between (a_i,b_i) and (a_i+1,b_i+1). These trace points allow
+ the Compute_Trace routines to efficiently compute the exact alignment between the two
+ reads by efficiently computing exact alignments between consecutive pairs of trace points.
+ Moreover, the diff values give one an idea of the quality of the alignment along every
+ segment of TS symbols of the aread.
+ If computed by a Compute_Trace routine, 'trace' points at a list of 'tlen' integers
+ < i1, i2, ... in > that encodes an exact alignment as follows. A negative number j
+ indicates that a dash should be placed before A[-j] and a positive number k indicates
+ that a dash should be placed before B[k], where A and B are the two sequences of the
+ overlap. The indels occur in the trace in the order in which they occur along the
+ alignment. For a good example of how to "decode" a trace into an alignment, see the
+ code for the routine Print_Alignment.
+typedef struct
+ { void *trace; // NULL, or uint16 trace-point pairs (Local_Alignment), or int exact trace (Compute_Trace)
+ int tlen; // Number of values in 'trace'
+ int diffs; // Number of differences in the alignment
+ int abpos, bbpos; // Start point of the path in the edit graph
+ int aepos, bepos; // End point of the path in the edit graph
+ } Path;
+ An alignment is modeled by an Alignment record, which in addition to a *pointer* to a
+ 'path', gives pointers to the A and B sequences, their lengths, and indicates whether
+ the B-sequence needs to be complemented ('comp' non-zero if so). The 'trace' pointer
+ of the 'path' subrecord can be either NULL, a list of pass-through points, or an exact
+ trace depending on what routines have been called on the record.
+ One can (1) compute a trace, with Compute_Trace, either from scratch if 'path.trace' = NULL,
+ or using the sequence of pass-through points in trace, (2) print an ASCII representation
+ of an alignment, or (3) reverse the roles of A and B, and (4) complement a sequence
+ (which is a reversible process).
+ If the alignment record shows the B sequence as complemented, *** THEN IT IS THE
+ RESPONSIBILITY OF THE CALLER *** to make sure that bseq points at a complement of
+ the sequence before calling Compute_Trace or Print_Alignment. Complement_Seq complements
+ the sequence a of length n. The operation does the complementation/reversal in place.
+ Calling it a second time on a given fragment restores it to its original state.
+#define COMP(x) ((x) & 0x1)
+#define COMP_FLAG 0x1
+typedef struct
+ { Path *path; /* Pointer to the path (interval, diffs, and optional trace) */
+ uint32 flags; /* Pipeline status and complementation flags */
+ char *aseq; /* Pointer to A sequence */
+ char *bseq; /* Pointer to B sequence */
+ int alen; /* Length of A sequence */
+ int blen; /* Length of B sequence */
+ } Alignment;
+void Complement_Seq(char *a, int n);
+ /* Many routines like Local_Alignment, Compute_Trace, and Print_Alignment need working
+ storage that is more efficiently reused with each call, rather than being allocated anew
+ with each call. Each *thread* can create a Work_Data object with New_Work_Data and this
+ object holds and retains the working storage for routines of this module between calls
+ to the routines. If enough memory for a Work_Data is not available then NULL is returned.
+ Free_Work_Data frees a Work_Data object and all working storage held by it.
+ */
+ typedef void Work_Data;
+ Work_Data *New_Work_Data();
+ void Free_Work_Data(Work_Data *work);
+ /* Local_Alignment seeks local alignments of a quality determined by a number of parameters.
+ These are coded in an Align_Spec object that can be created with New_Align_Spec and
+ freed with Free_Align_Spec when no longer needed. There are 4 essential parameters:
+ ave_corr: the average correlation (1 - 2*error_rate) for the sought alignments. For Pacbio
+ data we set this to .70 assuming an average of 15% error in each read.
+ trace_space: the spacing interval for keeping trace points and segment differences (see
+ description of 'trace' for Paths above)
+ freq[4]: a 4-element vector where freq[0] = frequency of A, f(A), freq[1] = f(C),
+ freq[2] = f(G), and freq[3] = f(T). This vector is part of the header
+ of every HITS database (see DB.h).
+ If an alignment cannot reach the boundary of the d.p. matrix with this condition (i.e.
+ overlap), then the last/first 30 columns of the alignment are guaranteed to be
+ suffix/prefix positive at correlation ave_corr * g(freq) where g is an empirically
+ measured function that increases from 1 as the entropy of freq decreases. If memory is
+ unavailable or the freq distribution is too skewed then NULL is returned.
+ You can get back the original parameters used to create an Align_Spec with the simple
+ utility functions below.
+ */
+ typedef void Align_Spec;
+ Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq);
+ void Free_Align_Spec(Align_Spec *spec);
+ int Trace_Spacing (Align_Spec *spec);
+ double Average_Correlation(Align_Spec *spec);
+ float *Base_Frequencies (Align_Spec *spec);
+ /* Local_Alignment finds the longest significant local alignment between the sequences in
+ 'align' subject to:
+ (a) the alignment criterion given by the Align_Spec 'spec',
+ (b) it passes through one of the points (anti+k)/2,(anti-k)/2 for k in [low,hgh] within
+ the underlying dynamic programming matrix (i.e. the points on diagonals low to hgh
+ on anti-diagonal anti or anti-1 (depending on whether the diagonal is odd or even)),
+ (c) if lbord >= 0, then the alignment is always above diagonal low-lbord, and
+ (d) if hbord >= 0, then the alignment is always below diagonal hgh+hbord.
+ The path record of 'align' has its 'trace' filled from the point of view of an overlap
+ between the aread and the bread. In addition a Path record from the point of view of the
+ bread versus the aread is returned by the function, with this Path's 'trace' filled in
+ appropriately. The space for the returned path and the two 'trace's are in the working
+ storage supplied by the Work_Data packet and this space is reused with each call, so if
+ one wants to retain the bread-path and the two trace point sequences, then they must be
+ copied to user-allocated storage before calling the routine again. NULL is returned in
+ the event of an error.
+ Find_Extension is a variant of Local_Alignment that simply finds a local alignment that
+ either ends (if prefix is non-zero) or begins (if prefix is zero) at the point
+ ((anti+diag)/2,(anti-diag)/2). All other parameters are as before. It returns a non-zero
+ value only when INTERACTIVE is on and it cannot allocate the memory it needs.
+ Only the path and trace with respect to the aread is returned. This routine is experimental
+ and may not persist in later versions of the code.
+ */
+ Path *Local_Alignment(Alignment *align, Work_Data *work, Align_Spec *spec,
+ int low, int hgh, int anti, int lbord, int hbord);
+ int Find_Extension(Alignment *align, Work_Data *work, Align_Spec *spec, // experimental !!
+ int diag, int anti, int lbord, int hbord, int prefix);
+ /* Given a legitimate Alignment object, Compute_Trace_X computes an exact trace for the alignment.
+ If 'path.trace' is non-NULL, then it is assumed to be a sequence of pass-through points
+ and diff levels computed by Local_Alignment. In either case 'path.trace' is set
+ to point at an integer array within the storage of the Work_Data packet encoding an
+ exact optimal trace from the start to end points. If the trace is needed beyond the
+ next call to a routine that sets it, then it should be copied to an array allocated
+ and managed by the caller.
+ Compute_Trace_ALL does not require a sequence of pass-through points, as it computes the
+ best alignment between (path->abpos,path->bbpos) and (path->aepos,path->bepos) in the
+ edit graph between the sequences. Compute_Trace_PTS computes a trace by computing the
+ trace between successive pass through points. It is much, much faster than Compute_Trace_ALL
+ but at the tradeoff of not necessarily being optimal as pass-through points are not all
+ perfect. Compute_Trace_MID computes a trace by computing the trace between the mid-points
+ of alignments between two adjacent pairs of pass through points. It is generally twice as
+ slow as Compute_Trace_PTS, but it produces nearer optimal alignments. All these routines
+ return 1 if an error occurred and 0 otherwise.
+ */
+#define LOWERMOST -1 // Possible modes for "mode" parameter below
+#define GREEDIEST 0
+#define UPPERMOST 1
+ int Compute_Trace_ALL(Alignment *align, Work_Data *work);
+ int Compute_Trace_PTS(Alignment *align, Work_Data *work, int trace_spacing, int mode);
+ int Compute_Trace_MID(Alignment *align, Work_Data *work, int trace_spacing, int mode);
+ /* Compute_Trace_IRR (IRR for IRRegular) computes a trace for the given alignment where
+ it assumes the spacing between trace points between both the A and B read varies, and
+ further assumes that the A-spacing is given in the short integers normally occupied by
+ the differences in the alignment between the trace points. This routine is experimental
+ and may not persist in later versions of the code.
+ */
+ int Compute_Trace_IRR(Alignment *align, Work_Data *work, int mode); // experimental !!
+ /* Alignment_Cartoon prints an ASCII representation of the overlap relationship between the
+ two reads of 'align' to the given 'file' indented by 'indent' spaces. Coord controls
+ the display width of numbers, it must be not less than the width of any number to be
+ displayed.
+ If the alignment trace is an exact trace, then one can ask Print_Alignment to print an
+ ASCII representation of the alignment 'align' to the file 'file'. Indent the display
+ by "indent" spaces and put "width" columns per line in the display. Show "border"
+ characters of sequence on each side of the aligned region. If upper is non-zero then
+ display bases in upper case. If coord is greater than 0, then the positions of the
+ first character in A and B in the given row is displayed with a field width given by
+ coord's value.
+ Print_Reference is like Print_Alignment but rather than printing exactly "width" columns
+ per segment, it prints "block" characters of the A sequence in each segment. This results
+ in segments of different lengths, but is convenient when looking at two alignments involving
+ A as segments are guaranteed to cover the same interval of A in a segment.
+ Both Print routines return 1 if an error occurred (not enough memory), and 0 otherwise.
+ Flip_Alignment modifies align so the roles of A and B are reversed. If full is off then
+ the trace is ignored, otherwise the trace must be to a full alignment trace and this trace
+ is also appropriately inverted.
+ */
+ void Alignment_Cartoon(FILE *file, Alignment *align, int indent, int coord);
+ int Print_Alignment(FILE *file, Alignment *align, Work_Data *work,
+ int indent, int width, int border, int upper, int coord);
+ int Print_Reference(FILE *file, Alignment *align, Work_Data *work,
+ int indent, int block, int border, int upper, int coord);
+ void Flip_Alignment(Alignment *align, int full);
+ Externally, between modules an Alignment is modeled by an "Overlap" record, which
+ (a) replaces the pointers to the two sequences with their ID's in the HITS data bases,
+ (b) does not contain the length of the 2 sequences (must fetch from DB), and
+ (c) contains its path as a subrecord rather than as a pointer (indeed, typically the
+ corresponding Alignment record points at the Overlap's path sub-record). The trace pointer
+ is always to a sequence of trace points and can be either compressed (uint8) or
+ uncompressed (uint16). One can read and write binary records of an "Overlap".
+typedef struct {
+ Path path; /* Path: begin- and end-point of alignment + diffs */
+ uint32 flags; /* Pipeline status and complementation flags */
+ int aread; /* Id # of A sequence */
+ int bread; /* Id # of B sequence */
+} Overlap;
+ /* Read_Overlap reads the next Overlap record from stream 'input', not including the trace
+ (if any), and without modifying 'ovl's trace pointer. Read_Trace reads the ensuing trace
+ into the memory pointed at by the trace field of 'ovl'. It is assumed to be big enough to
+ accommodate the trace where each value takes 'tbytes' bytes (1 if uint8 or 2 if uint16).
+ Write_Overlap writes 'ovl' to stream 'output' followed by its trace vector (if any) that
+ occupies 'tbytes' bytes per value.
+ Print_Overlap prints an ASCII version of the contents of 'ovl' to stream 'output'
+ where the trace occupies 'tbytes' per value and the print out is indented from the left
+ margin by 'indent' spaces.
+ Compress_TraceTo8 converts a trace of 16-bit values to 8-bit values in place, and
+ Decompress_TraceTo16 does the reverse conversion.
+ Check_Trace_Points checks that the number of trace points is correct and that the sum
+ of the b-read displacements equals the b-read alignment interval, assuming the trace
+ spacing is 'tspace'. It reports an error message if there is a problem and 'verbose'
+ is non-zero. The 'ovl' came from the file named 'fname'.
+ */
+ int Read_Overlap(FILE *input, Overlap *ovl);
+ int Read_Trace(FILE *innput, Overlap *ovl, int tbytes);
+ void Write_Overlap(FILE *output, Overlap *ovl, int tbytes);
+ void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent);
+ void Compress_TraceTo8(Overlap *ovl);
+ void Decompress_TraceTo16(Overlap *ovl);
+ int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname);
+#endif // _A_MODULE
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/dascrubber.git
More information about the debian-med-commit
mailing list