[med-svn] [dascrubber] 01/05: New upstream version 0~20180108
Afif Elghraoui
afif at moszumanska.debian.org
Sun Feb 4 18:55:46 UTC 2018
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository dascrubber.
commit 717ed4840fa40ce73cc1007fafbd76863278a991
Author: Afif Elghraoui <afif at debian.org>
Date: Sun Feb 4 13:51:29 2018 -0500
New upstream version 0~20180108
---
DASedit.c | 21 ++-
DASmap.c | 6 +-
DASpatch.c | 44 +++---
DASqv.c | 28 ++--
DASrealign.c | 23 +--
DAStrim.c | 497 ++++++++++++++++++++++++++++++++++++++++-------------------
DB.c | 325 +++++++++++++++++++++++++++++---------
DB.h | 208 ++++++++++++++++++++-----
REPqv.c | 4 +-
REPtrim.c | 89 ++++++-----
align.c | 128 +++++++++++----
align.h | 13 +-
12 files changed, 990 insertions(+), 396 deletions(-)
diff --git a/DASedit.c b/DASedit.c
index 63e7a6f..2106401 100644
--- a/DASedit.c
+++ b/DASedit.c
@@ -82,7 +82,7 @@ static char *Usage = "[-v] [-x<int>] <source:db> <target:db>";
// Read-only
-static HITS_DB _DB, *DB = &_DB; // Data base
+static DAZZ_DB _DB, *DB = &_DB; // Data base
static int64 *TRIM_IDX;
static int *TRIM;
@@ -191,7 +191,7 @@ static int Load_Model(int *patch, char *target, int depth)
}
int main(int argc, char *argv[])
-{ HITS_READ *reads;
+{ DAZZ_READ *reads;
int nreads;
int64 boff;
@@ -368,7 +368,7 @@ int main(int argc, char *argv[])
reads = DB->reads;
nreads = DB->nreads;
- { HITS_TRACK *track;
+ { DAZZ_TRACK *track;
int i;
track = Load_Track(DB,"trim");
@@ -498,7 +498,12 @@ int main(int argc, char *argv[])
for (i = 0; i < STACK_SIZE; i++)
BSTACK[i] = NULL;
- target = New_Read_Buffer(DB);
+ { int ml = DB->maxlen;
+ DB->maxlen = 1.5*ml + 10000;
+ target = New_Read_Buffer(DB);
+ DB->maxlen = ml;
+ }
+
aseq = BSTACK[0] = New_Read_Buffer(DB);
segfate[0] = GOOD_LAST;
@@ -756,8 +761,8 @@ int main(int argc, char *argv[])
{ int i, s;
int bi, gb;
int tlen, first;
- HITS_DB NB;
- HITS_READ newrec;
+ DAZZ_DB NB;
+ DAZZ_READ newrec;
#ifdef DEBUG_INDEX
int ni;
#endif
@@ -772,7 +777,7 @@ int main(int argc, char *argv[])
NB.totlen = ntot;
NB.maxlen = nmax;
- fwrite(&NB,sizeof(HITS_DB),1,IDX_FILE);
+ fwrite(&NB,sizeof(DAZZ_DB),1,IDX_FILE);
#ifdef DEBUG_INDEX
printf("\nINDEXING\n");
@@ -807,7 +812,7 @@ int main(int argc, char *argv[])
newrec.flags = (reads[i].flags & DB_QV) | DB_BEST;
if (segfate[bi] > TRIMMED)
newrec.flags |= DB_CSS;
- fwrite(&newrec,sizeof(HITS_READ),1,IDX_FILE);
+ fwrite(&newrec,sizeof(DAZZ_READ),1,IDX_FILE);
boff += COMPRESSED_LEN(tlen);
if (s == GOOD)
diff --git a/DASmap.c b/DASmap.c
index 83adfa9..5177a0e 100644
--- a/DASmap.c
+++ b/DASmap.c
@@ -57,7 +57,7 @@ int next_read(File_Iterator *it)
if (fgets(nbuffer,MAX_BUFFER,it->input) == NULL)
{ if (feof(it->input))
return (1);
- SYSTEM_ERROR;
+ SYSTEM_READ_ERROR;
}
if ((eol = index(nbuffer,'\n')) == NULL)
{ fprintf(stderr,"%s: Line %d in read list is longer than %d chars!\n",
@@ -77,8 +77,8 @@ int next_read(File_Iterator *it)
}
int main(int argc, char *argv[])
-{ HITS_DB _db, *db = &_db;
- HITS_TRACK *map;
+{ DAZZ_DB _db, *db = &_db;
+ DAZZ_TRACK *map;
int reps, *pts;
int input_pts;
diff --git a/DASpatch.c b/DASpatch.c
index 53294b5..ad7aec0 100644
--- a/DASpatch.c
+++ b/DASpatch.c
@@ -58,7 +58,7 @@ static int ANCHOR_THRESH;
static int TRACE_SPACING; // Trace spacing (from .las file)
-static HITS_DB _DB, *DB = &_DB; // Data base
+static DAZZ_DB _DB, *DB = &_DB; // Data base
static int DB_FIRST; // First read of DB to process
static int DB_LAST; // Last read of DB to process (+1)
static int DB_PART; // 0 if all, otherwise block #
@@ -549,11 +549,9 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
else
tbytes = sizeof(uint16);
- if (novl <= 0)
- return (0);
-
- Read_Overlap(input,ovls);
- if (trace)
+ if (Read_Overlap(input,ovls) != 0)
+ ovls[0].aread = INT32_MAX;
+ else if (trace)
{ if (ovls[0].path.tlen > pmax)
{ pmax = 1.2*(ovls[0].path.tlen)+10000;
paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer");
@@ -627,6 +625,12 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
ACTION(j,ovls,n);
}
+ if (ovls[n].aread < INT32_MAX)
+ { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n",
+ Prog_Name,DB_PART);
+ exit (1);
+ }
+
return (max);
}
@@ -635,9 +639,8 @@ int main(int argc, char *argv[])
char *root, *dpwd;
char *las, *lpwd;
int64 novl;
- HITS_TRACK *track;
+ DAZZ_TRACK *track;
int c;
- int COVERAGE;
// Process arguments
@@ -714,19 +717,12 @@ int main(int argc, char *argv[])
if (extra == 4*sizeof(int))
{ fread(&GOOD_QV,sizeof(int),1,afile);
fread(&BAD_QV,sizeof(int),1,afile);
- fread(&COVERAGE,sizeof(int),1,afile);
+ fread(&HGAP_MIN,sizeof(int),1,afile);
fread(&HGAP_MIN,sizeof(int),1,afile);
}
- else if (extra == 3*sizeof(int))
- { fread(&GOOD_QV,sizeof(int),1,afile);
- fread(&BAD_QV,sizeof(int),1,afile);
- fread(&COVERAGE,sizeof(int),1,afile);
- HGAP_MIN = 0;
- }
- else if (extra == 2*sizeof(int))
+ else if (extra == 2*sizeof(int) || extra == 3*sizeof(int))
{ fread(&GOOD_QV,sizeof(int),1,afile);
fread(&BAD_QV,sizeof(int),1,afile);
- COVERAGE = -1;
HGAP_MIN = 0;
}
else
@@ -802,19 +798,19 @@ int main(int argc, char *argv[])
if (dbfile == NULL)
exit (1);
if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
for (i = 0; i < nfiles; i++)
if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
for (i = 1; i <= part; i++)
if (fscanf(dbfile,DB_BDATA,&oindx,&DB_FIRST) != 2)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
if (fscanf(dbfile,DB_BDATA,&oindx,&DB_LAST) != 2)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
fclose(dbfile);
DB_PART = part;
*p = '\0';
@@ -849,7 +845,7 @@ int main(int argc, char *argv[])
// Open overlap file
- lpwd = PathTo(argv[2]);
+ lpwd = PathTo(argv[c]);
if (DB_PART)
input = Fopen(Catenate(lpwd,"/",las,Numbered_Suffix(".",DB_PART,".las")),"r");
else
diff --git a/DASqv.c b/DASqv.c
index 96b3d6f..806a5db 100644
--- a/DASqv.c
+++ b/DASqv.c
@@ -38,7 +38,7 @@ static int HGAP_MIN; // Under this length do not process for HGAP
static int TRACE_SPACING; // Trace spacing (from .las file)
static int TBYTES; // Bytes per trace segment (from .las file)
-static HITS_DB _DB, *DB = &_DB; // Data base
+static DAZZ_DB _DB, *DB = &_DB; // Data base
static int DB_FIRST; // First read of DB to process
static int DB_LAST; // Last read of DB to process (+1)
static int DB_PART; // 0 if all, otherwise block #
@@ -306,11 +306,9 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
else
TBYTES = sizeof(uint16);
- if (novl <= 0)
- return (0);
-
- Read_Overlap(input,ovls);
- if (trace)
+ if (Read_Overlap(input,ovls) != 0)
+ ovls[0].aread = INT32_MAX;
+ else if (trace)
{ if (ovls[0].path.tlen > pmax)
{ pmax = 1.2*(ovls[0].path.tlen)+10000;
paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer");
@@ -384,6 +382,12 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
ACTION(j,ovls,n);
}
+ if (ovls[n].aread < INT32_MAX)
+ { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n",
+ Prog_Name,DB_PART);
+ exit (1);
+ }
+
return (max);
}
@@ -509,19 +513,19 @@ int main(int argc, char *argv[])
if (dbfile == NULL)
exit (1);
if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
for (i = 0; i < nfiles; i++)
if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
for (i = 1; i <= part; i++)
if (fscanf(dbfile,DB_BDATA,&oindx,&DB_FIRST) != 2)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
if (fscanf(dbfile,DB_BDATA,&oindx,&DB_LAST) != 2)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
fclose(dbfile);
DB_PART = part;
*p = '\0';
diff --git a/DASrealign.c b/DASrealign.c
index 69b7a20..dd7329f 100644
--- a/DASrealign.c
+++ b/DASrealign.c
@@ -70,8 +70,8 @@ static int TBYTES; // Bytes per trace segment (from .las file)
static int SMALL; // Trace points can fit in a byte
static int MIN_LEN; // Minimum piece length
-static HITS_DB _ADB, *ADB = &_ADB; // A-read database
-static HITS_DB _BDB, *BDB = &_BDB; // B-read database
+static DAZZ_DB _ADB, *ADB = &_ADB; // A-read database
+static DAZZ_DB _BDB, *BDB = &_BDB; // B-read database
static int ADB_ofirst, ADB_olast;
static int BDB_ofirst, BDB_olast;
@@ -536,7 +536,7 @@ static void EXTENDER(int aread, Overlap *ovls, int novl)
return;
if (work == NULL)
- { spec = New_Align_Spec(.70,100,ADB->freq);
+ { spec = New_Align_Spec(.70,100,ADB->freq,1);
work = New_Work_Data();
ralign->path = &rpath;
falign->path = &fpath;
@@ -897,11 +897,9 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
SMALL = 0;
}
- if (novl <= 0)
- return (0);
-
- Read_Overlap(input,ovls);
- if (trace)
+ if (Read_Overlap(input,ovls) != 0)
+ ovls[0].aread = INT32_MAX;
+ else if (trace)
{ if (ovls[0].path.tlen > pmax)
{ pmax = 1.2*(ovls[0].path.tlen)+10000;
paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer");
@@ -971,12 +969,18 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
ACTION(j,ovls,n);
}
+ if (ovls[n].aread < INT32_MAX)
+ { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n",
+ Prog_Name,ADB->part);
+ exit (1);
+ }
+
return (max);
}
int main(int argc, char *argv[])
-{ HITS_TRACK *map1, *map2;
+{ DAZZ_TRACK *map1, *map2;
// Process arguments
@@ -1047,7 +1051,6 @@ int main(int argc, char *argv[])
}
Trim_DB(BDB);
-
Read_All_Sequences(BDB,0);
}
AFIRST = ADB->tfirst;
diff --git a/DAStrim.c b/DAStrim.c
index ef01807..18a0c35 100644
--- a/DAStrim.c
+++ b/DAStrim.c
@@ -25,20 +25,24 @@
#define PATHSEP "/"
#endif
-#undef DEBUG_HQ_BLOCKS // Various DEBUG flags (normally all off)
+#undef DEBUG_HQ_BLOCKS // Various DEBUG flags (normally all off)
#undef SHOW_EVENTS
-#undef DEBUG_HOLE_FINDER
-#undef DEBUG_GAP_STATUS
+#undef DEBUG_HOLE_FINDER
+#undef DEBUG_GAP_STATUS
#undef SHOW_PAIRS
-#undef DEBUG_SUMMARY
+#undef DEBUG_PATCHING
+#undef DEBUG_SUMMARY
-#undef ANNOTATE // Output annotation tracks for DaViewer
+#undef DEBUG_CLASS
+
+#define ANNOTATE // Output annotation tracks for DaViewer
// Command format and global parameter variables
static char *Usage = " [-v] -g<int> -b<int> <source:db> <overlaps:las> ...";
+static int COVERAGE; // estimated coverage
static int BAD_QV; // qv >= and you are "bad"
static int GOOD_QV; // qv <= and you are "good"
static int HGAP_MIN; // less than this length do not process for HGAP
@@ -66,6 +70,8 @@ static int AdSuf = ADSUF;
#define COVER_LEN 400 // An overlap covers a point if it extends COVER_LEN to either side.
#define ANCHOR_MATCH .25 // Delta in trace interval at both ends of patch must be < this %.
+#define MIN_OVERLAP 900 // Was COVER_LEN, too small?
+
// Wall Constants
#define MIN_PNT 5 // Minimum # of events in a wall
@@ -79,7 +85,7 @@ static int AdSuf = ADSUF;
static int TRACE_SPACING; // Trace spacing (from .las file)
-static HITS_DB _DB, *DB = &_DB; // Data base
+static DAZZ_DB _DB, *DB = &_DB; // Data base
static int DB_FIRST; // First read of DB to process
static int DB_LAST; // Last read of DB to process (+1)
static int DB_PART; // 0 if all, otherwise block #
@@ -707,9 +713,9 @@ static int FIND_HOLES(int aread, Overlap *ovls, int novl, Interval *block, int n
printf(" Dev %5d-%3d-%5d -> %5d",queue[lflank].pos,nend,queue[first].pos,dpos);
#endif
- // Second, look for the rightmost RGT-(LFT-)wall that overlaps the left (right)
+ // Second, look for the leftmost RGT-(LFT-)wall that overlaps the left (right)
// flank, i.e. queue[lflank,first].pos (queue[last,rflank].pos), and if found
- // take the average position of the wall.
+ // take the average position of the flank position within the wall.
for (d = dnum-1; d >= 0; d--)
if (dwall[d].beg <= queue[first].pos)
@@ -728,7 +734,7 @@ static int FIND_HOLES(int aread, Overlap *ovls, int novl, Interval *block, int n
}
dpos = sum/n;
#ifdef DEBUG_HOLE_FINDER
- printf(" [%5d,%5d] -> %4d\n",dwall[d].beg,dwall[d].end,dpos);
+ printf(" Map [%5d,%5d] -> %4d\n",dwall[d].beg,dwall[d].end,dpos);
#endif
dcov = dwall[d].cov + dwall[d].cnt;
d -= 1;
@@ -779,7 +785,7 @@ static int FIND_HOLES(int aread, Overlap *ovls, int novl, Interval *block, int n
}
apos = sum/n;
#ifdef DEBUG_HOLE_FINDER
- printf(" [%5d,%5d] -> %4d\n",awall[a].beg,awall[a].end,apos);
+ printf(" Map [%5d,%5d] -> %4d\n",awall[a].beg,awall[a].end,apos);
#endif
acov = awall[a].cov + awall[a].cnt;
a += 1;
@@ -804,14 +810,14 @@ static int FIND_HOLES(int aread, Overlap *ovls, int novl, Interval *block, int n
{ dcov = dwall[d].cov + dwall[d].cnt;
dpos = dwall[d--].beg;
#ifdef DEBUG_HOLE_FINDER
- printf(" <- %d\n",dpos);
+ printf(" Push <- %d\n",dpos);
#endif
}
else
{ acov = awall[a].cov + awall[a].cnt;
apos = awall[a++].end;
#ifdef DEBUG_HOLE_FINDER
- printf(" -> %d\n",apos);
+ printf(" Push -> %d\n",apos);
#endif
}
}
@@ -819,7 +825,7 @@ static int FIND_HOLES(int aread, Overlap *ovls, int novl, Interval *block, int n
{ dcov = dwall[d].cov + dwall[d].cnt;
dpos = dwall[d--].beg;
#ifdef DEBUG_HOLE_FINDER
- printf(" <- %d\n",dpos);
+ printf(" Push <- %d\n",dpos);
#endif
}
else
@@ -827,7 +833,7 @@ static int FIND_HOLES(int aread, Overlap *ovls, int novl, Interval *block, int n
{ acov = awall[a].cov + awall[a].cnt;
apos = awall[a++].end;
#ifdef DEBUG_HOLE_FINDER
- printf(" -> %d\n",apos);
+ printf(" Push -> %d\n",apos);
#endif
}
else
@@ -1022,6 +1028,9 @@ typedef struct
{ int lidx; // left LA index
int ridx; // right LA index
int delta; // Difference between A-gap and B-gap
+ int soft; // 0 (soft) = pair is not close to gap border or adjacent gap border on both sides
+ // 1 (anciliary) = pair is not close to gap border but is to adjacent gap border
+ // 2 (hard) = pair is close to gap border on both sides
} Spanner;
typedef struct
@@ -1174,14 +1183,103 @@ static Patch *compute_patch(int lft, int rgt, Overlap *lov, Overlap *rov)
return (&ans);
}
+// Examine the spanning pairs for a gap. Group those with sufficient density
+// i.e. with 20 + 10% of the last one. If test == 1, keep groups that have at
+// least 4 members, and are either 60% hard or at least 8 hard pairs, but trim
+// away any extremal non-hard pairs. If test == 0, keep the largest group that has
+// at least 4 members, and is either 60% not soft or at least 8 non-soft pairs,
+// but trim awya ny extermal soft pairs. If "move" is non-zero then compress gsort
+// accordingly, return the size of gsort after trimming in all instances.
+
+static int analyze_gap_pairs(int gsize, Spanner *gsort, Overlap *ovls,
+ int gcnt, int scnt, int test, int move)
+{ int j, l, c, x, w;
+ int bord, soft, keeper;
+ int ncnt, biggest;
+
+ (void) ovls;
+
+ biggest = 0;
+ ncnt = 0;
+#ifdef SHOW_PAIRS
+ if (move)
+ printf(" Gsort: %d\n",gsize);
+#endif
+ c = gsize - gsort[0].delta;
+ w = 0;
+ for (j = 0; j <= gcnt; j++)
+ { l = c;
+ if (j >= gcnt)
+ bord = 1;
+ else
+ { c = gsize - gsort[j].delta;
+ if (l < 0)
+ bord = (l-c >= 20-.1*c);
+ else
+ bord = (l-c >= 20+.1*l);
+ }
+ if (bord)
+ { soft = 0;
+ for (x = w; x < j; x++)
+ soft += (gsort[x].soft <= test);
+ keeper = (j-w >= 4 && (soft < .4*(j-w) || (j-w)-soft >= 8));
+ if (test == 0)
+ { if (keeper && j-w > biggest)
+ { biggest = j-w;
+ ncnt = 0;
+ }
+ else
+ keeper = 0;
+ }
+#ifdef SHOW_PAIRS
+ if (move)
+ { printf("----\n");
+ for (x = w; x < j; x++)
+ { printf(" %3d: %5d %5d",x,gsort[x].delta,gsize-gsort[x].delta);
+ printf(" %5d",ovls[gsort[x].lidx].bread);
+ if (gsort[x].soft == 0)
+ printf(" @");
+ else if (gsort[x].soft == 1)
+ printf(" #");
+ else
+ printf(" ");
+ if (!keeper)
+ printf(" X");
+ printf("\n");
+ }
+ }
+#endif
+ if (keeper)
+ { for (x = w; x < j; x++)
+ if (gsort[x].soft > test)
+ break;
+ for (w = j; gsort[w-1].soft <= test; w--)
+ ;
+ if (move)
+ while (x < w)
+ gsort[ncnt++] = gsort[x++];
+ else
+ ncnt += w-x;
+ }
+ w = j;
+ }
+ }
+ if (move)
+ for (x = gcnt; x < gcnt+scnt; x++)
+ gsort[ncnt++] = gsort[x];
+ else
+ ncnt += scnt;
+
+ return (ncnt);
+}
+
// Categorize each gap and if appropriate return the best patch for each
static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rblock,
int *p_lft, int *p_rgt)
{ static int nmax = 0;
- static Spanner *gsort = NULL; // A-B delta and idx-pair for all B-reads spanning a gap
- static int *asort = NULL; // A-B delta for all B-reads spanning a gap
+ static Spanner *gsort = NULL; // A-B delta and idx-pair for all B-reads spanning a gap
static int ANCHOR_THRESH;
@@ -1197,8 +1295,7 @@ static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rbloc
{ if (novl > nmax)
{ nmax = 1.2*novl + 500;
gsort = (Spanner *) Realloc(gsort,nmax*sizeof(Spanner),"Allocating gap vector");
- asort = (int *) Realloc(asort,nmax*sizeof(int),"Allocating adaptemer position vector");
- if (gsort == NULL || asort == NULL)
+ if (gsort == NULL)
exit (1);
ANCHOR_THRESH = ANCHOR_MATCH * TRACE_SPACING;
}
@@ -1257,30 +1354,33 @@ static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rbloc
#endif
}
- { int bread, bcomp, blen;
+ { int bread, bcomp, blen, blast;
int ab, ae;
+ int lstack[10], ltop;
+ int rstack[10], rtop;
int lcnt, rcnt, scnt, gcnt, acnt;
- int lidx, ridx, sidx, cidx;
+ int lidx, ridx, sidx, Lidx, Ridx;
int k;
// Find LA pairs or LAs spanning the gap flank [lcv,rcv]
+ bread = -1;
lcnt = rcnt = scnt = gcnt = acnt = 0;
for (j = 0; j < novl; j = k)
- { bread = ovls[j].bread;
+ { blast = bread;
+ bread = ovls[j].bread;
blen = DB->reads[bread].rlen;
bcomp = COMP(ovls[j].flags);
- if (bcomp)
- cidx = j;
+ Lidx = lidx;
+ Ridx = ridx;
+ ltop = rtop = 0;
lidx = ridx = sidx = -1; // For all LA's with same b-read
for (k = j; k < novl; k++)
{ if (ovls[k].bread != bread)
break;
if (COMP(ovls[k].flags) != (uint32) bcomp) // Note when b switches orientation
- { cidx = k;
- bcomp = COMP(ovls[k].flags);
- }
+ break;
ab = ovls[k].path.abpos;
ae = ovls[k].path.aepos;
@@ -1293,47 +1393,82 @@ static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rbloc
#endif
// Is LA a spanner, left-partner, or right partner
+ // A partner is hard if end=point is within COVER_LEN of the gap boundary
+ // Record rigthmost/leftmost left/right hard partners (if any)
if (ab <= lcv && ae >= rcv)
{ sidx = k;
lidx = ridx = -1;
+ ltop = rtop = 0;
continue;
}
+ if (sidx >= 0)
+ continue;
+
+ if (ae >= rcv && ab > lft)
+ { if (rtop < 10)
+ rstack[rtop++] = k;
#ifdef SHOW_PAIRS
- if (ae >= rcv && ab <= rcv && ab - ovls[k].path.bbpos <= lft - COVER_LEN)
- printf("r");
- else
- printf(" ");
- if (ab <= lcv && ae >= lcv && ae + (blen-ovls[j].path.bepos) >= rgt + COVER_LEN)
- printf("l");
- else
- printf(" ");
-#endif
-
- if (ae >= rcv && ab <= rcv && ab - ovls[k].path.bbpos <= lft - COVER_LEN)
- ridx = k;
- if (ab <= lcv && ae >= lcv && ae + (blen-ovls[j].path.bepos) >= rgt + COVER_LEN)
- lidx = k;
- }
- if (! bcomp)
- cidx = k;
+ printf("r");
+#endif
+ if (ab <= rcv && ridx < 0)
+ { ridx = k;
+#ifdef SHOW_PAIRS
+ printf("+");
+#endif
+ }
+ }
+ if (ab <= lcv && ae < rgt)
+ { if (ltop < 10)
+ lstack[ltop++] = k;
#ifdef SHOW_PAIRS
- printf(" =");
- if (sidx >= 0)
- printf(" S");
- if (lidx >= 0)
- printf(" L");
- if (ridx >= 0)
- printf(" R");
- if (0 <= lidx && lidx < ridx && (ridx < cidx || lidx >= cidx))
- printf(" G");
- if ((0<=ridx && ridx<cidx && cidx<=lidx) || (0<=lidx && lidx<cidx && cidx<=ridx))
- printf(" A");
+ printf("l");
#endif
+ if (ae >= lcv)
+ { lidx = k;
+#ifdef SHOW_PAIRS
+ printf("+");
+#endif
+ }
+ }
+ }
- // Add spanning LA or spanning pair to gsort list. Add contra pairs to asort list.
+ // Check for a hard contra pair and if found add
+ // Then check for a spanner and if so then add to gsort list.
+ // Then check for a spanning pair: use hard pair if available, otherwise
+ // use tightest pair and term it anciliary if endpoints are within an adjacent
+ // gap boundary, or soft otherwise.
+ // Finally, if left or right hard (but unpaired) then record as a conflict if
+ // projection extends MIN_OVERLAP past the other side.
+
+ if (blast == bread)
+ { if (ridx < 0)
+ { if (lidx >= 0 && Ridx >= 0 && Lidx < 0)
+ { acnt += 1;
+ if (Ridx >= 0 && ovls[Ridx].path.abpos - ovls[Ridx].path.bbpos
+ <= lft - MIN_OVERLAP)
+ rcnt -= 1;
+#ifdef SHOW_PAIRS
+ printf(" = A");
+#endif
+ continue;
+ }
+ }
+ else
+ { if (lidx < 0 && Ridx < 0 && Lidx >= 0)
+ { acnt += 1;
+ if (Lidx >= 0 && ovls[Lidx].path.aepos + (blen-ovls[Lidx].path.bepos)
+ >= rgt + MIN_OVERLAP)
+ lcnt -= 1;
+#ifdef SHOW_PAIRS
+ printf(" = A");
+#endif
+ continue;
+ }
+ }
+ }
if (sidx >= 0)
{ gsort[gcnt].delta = DB->maxlen;
@@ -1341,102 +1476,148 @@ static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rbloc
gsort[gcnt].ridx = sidx;
gcnt += 1;
scnt += 1;
+#ifdef SHOW_PAIRS
+ printf(" = S");
+#endif
+ continue;
}
- else
- { if (lidx >= 0)
- lcnt += 1;
- if (ridx >= 0)
- rcnt += 1;
- if (0 <= lidx && lidx < ridx && (ridx < cidx || cidx <= lidx))
- { gsort[gcnt].delta = (ovls[ridx].path.abpos - ovls[lidx].path.aepos)
- - (ovls[ridx].path.bbpos - ovls[lidx].path.bepos);
- gsort[gcnt].lidx = lidx;
- gsort[gcnt].ridx = ridx;
- gcnt += 1;
+
+ if (ltop > 0 && rtop > 0)
+ { int lok, rok, x;
+
+ if (lidx < 0 || ridx < 0)
+ { int dif, bst;
+ int x, y;
+
+ bst = 0x7fffffff;
+ for (ltop--; ltop >= 0; ltop--)
+ { x = lstack[ltop];
+ for (rtop--; rtop >= 0; rtop--)
+ { y = rstack[rtop];
+ dif = (ovls[y].path.abpos - ovls[x].path.aepos)
+ - (ovls[y].path.bbpos - ovls[x].path.bepos);
+ dif = abs(dif);
+ if (dif < bst)
+ { bst = dif;
+ lidx = x;
+ ridx = y;
+ dif = (ovls[ridx].path.abpos - ovls[lidx].path.aepos)
+ - (ovls[ridx].path.bbpos - ovls[lidx].path.bepos);
+#ifdef SHOW_PAIRS
+ printf(" C(%d,%d = %d)",x,y,dif);
+#endif
+ }
+ }
+ }
+ }
+
+ lok = 2;
+ if (ovls[lidx].path.aepos < lcv)
+ { x = ovls[lidx].path.aepos;
+ lok = (lblock > FirstB && x <= lblock->beg && x >= lblock[-1].end - COVER_LEN);
+ }
+ rok = 2;
+ if (ovls[ridx].path.abpos > rcv)
+ { x = ovls[ridx].path.abpos;
+ rok = (rblock < LastB && x >= rblock->end && x <= rblock[1].beg + COVER_LEN);
}
- else if ((0<=ridx && ridx<cidx && cidx<=lidx) || (0<=lidx && lidx<cidx && cidx<=ridx))
- asort[acnt++] = (((blen-ovls[ridx].path.bbpos) - ovls[lidx].path.bepos)
- + (ovls[lidx].path.aepos + ovls[ridx].path.abpos))/2;
+
+ if (lok >= 2 && rok >= 2)
+ gsort[gcnt].soft = 2;
+ else if (lok >= 1 && rok >= 1)
+ gsort[gcnt].soft = 1;
+ else
+ gsort[gcnt].soft = 0;
+ gsort[gcnt].delta = (ovls[ridx].path.abpos - ovls[lidx].path.aepos)
+ - (ovls[ridx].path.bbpos - ovls[lidx].path.bepos);
+ gsort[gcnt].lidx = lidx;
+ gsort[gcnt].ridx = ridx;
+ gcnt += 1;
+#ifdef SHOW_PAIRS
+ printf(" = G%d",gsort[gcnt-1].delta);
+#endif
+ continue;
}
- }
+ if (ridx >= 0 && ovls[ridx].path.abpos - ovls[ridx].path.bbpos <= lft - MIN_OVERLAP)
+ { rcnt += 1;
#ifdef SHOW_PAIRS
- printf("\n");
+ printf(" = R");
+#endif
+ }
+ if (lidx >= 0 && ovls[lidx].path.aepos + (blen-ovls[lidx].path.bepos) >= rgt + MIN_OVERLAP)
+ { lcnt += 1;
+#ifdef SHOW_PAIRS
+ printf(" = L");
#endif
+ }
+ }
-#ifdef DEBUG_GAP_STATUS
- printf(" lcnt = %d scnt = %d(%d) rcnt = %d acnt = %d\n",lcnt,gcnt-scnt,scnt,rcnt,acnt);
+#ifdef SHOW_PAIRS
+ printf("\n");
#endif
- { int64 med, dev;
- int std, avg;
- int low, hgh;
+ { int ccnt, ocnt;
if (lcnt < rcnt)
- rcnt = lcnt;
+ ccnt = lcnt;
+ else
+ ccnt = rcnt;
+
+ // Analyze pair list gsort: if standard analysis (only hard pairs count) does not yield
+ // a span, then consider anciliary pair spanners (rarely makes a difference but does
+ // save a few.
+
+ qsort(gsort,gcnt,sizeof(Spanner),GSORT);
+ gcnt -= scnt;
+
+ ocnt = gcnt;
+ gcnt = analyze_gap_pairs(rgt-lft,gsort,ovls,gcnt,scnt,1,0);
+ if (scnt < 4 && gcnt < 10 && gcnt < ccnt)
+ {
+#ifdef SHOW_PAIRS
+ printf(" SPECIAL\n");
+#endif
+ gcnt = analyze_gap_pairs(rgt-lft,gsort,ovls,ocnt,scnt,0,1);
+#ifdef SHOW_PAIRS
+ if (gcnt >= 10 || gcnt >= ccnt)
+ printf(" SWITCH\n");
+#endif
+ }
+ else
+ analyze_gap_pairs(rgt-lft,gsort,ovls,ocnt,scnt,1,1);
+
+#ifdef DEBUG_GAP_STATUS
+ printf(" lcnt = %d gcnt = %d scnt = %d rcnt = %d acnt = %d\n",
+ lcnt,gcnt-scnt,scnt,rcnt,acnt);
+#endif
// Lots of contra pairs and less spanning support, call it an adaptamer gap.
- if (acnt >= .4*rcnt && gcnt < .3*acnt)
+ if (acnt >= .3*ccnt && gcnt < acnt)
{
#ifdef DEBUG_GAP_STATUS
- qsort(asort,acnt,sizeof(int),ASORT);
- med = asort[acnt/2];
- low = acnt*.25;
- hgh = acnt*.75;
- dev = 0;
- for (j = low; j <= hgh; j++)
- dev += (asort[j]-med)*(asort[j]-med);
- std = sqrt((1.*dev)/acnt);
- if (std > 200)
- printf(" UNCERTAIN, std. dev. large\n");
printf(" ADAPT %3d\n",std);
#endif
return (ADAPT);
}
- // Examine the spanning pairs for the gap and compute average and deviation
- // of gap deltas for second and third quartile
-
- qsort(gsort,gcnt,sizeof(Spanner),GSORT);
- gcnt -= scnt;
+ // If there is insufficient evidence for a span, then split.
- if (gcnt >= 4)
- { med = gsort[gcnt/2].delta;
- low = gcnt*.25;
- hgh = gcnt*.75;
- dev = 0;
- avg = 0;
- for (j = low; j <= hgh; j++)
- { dev += (gsort[j].delta-med)*(gsort[j].delta-med);
- avg += gsort[j].delta;
- }
- std = sqrt((1.*dev)/gcnt);
- avg = avg/((hgh-low)+1);
- low = avg-4*std;
- hgh = avg+4*std;
- }
- else
- std = 0;
-
- // If the pairing gap deviation is too large or there are too few pairs or
- // most potential partners are not paired, then be safe and split it.
-
- gcnt += scnt;
- if (std >= 150 || gcnt < 10 || (gcnt < .4*rcnt && gcnt < 20))
+ if (scnt < 4 && gcnt < 10 && gcnt <= ccnt)
{
#ifdef DEBUG_GAP_STATUS
- if (rcnt >= 20)
+ if (ccnt >= 20)
printf(" STRONG SPLIT\n");
else
printf(" WEAK SPLIT\n");
if (gcnt >= 10)
- printf(" UNCERTAIN %5.1f %5d %3d\n",(scnt+gcnt)/(1.*rcnt),rgt-lft,gcnt);
+ printf(" UNCERTAIN %5.1f %5d %3d\n",gcnt/(1.*ccnt),rgt-lft,gcnt);
#endif
return (SPLIT);
}
- // Otherwise consider the gap linkable and try to find a viable patch, declaring a split
+ // Otherwise consider the gap spannable and try to find a viable patch, declaring a split
// iff all patch attemtps fail
else
@@ -1506,7 +1687,7 @@ static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rbloc
{ lidx = gsort[j].lidx;
ridx = gsort[j].ridx;
-#ifdef DEBUG_GAP_STATUS
+#ifdef DEBUG_PATCHING
if (lidx != ridx)
printf(" %5d [%5d,%5d] [%5d,%5d] %4d",
ovls[lidx].bread,ovls[lidx].path.abpos,ovls[lidx].path.aepos,
@@ -1520,17 +1701,17 @@ static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rbloc
if (can != NULL)
{
-#ifdef DEBUG_GAP_STATUS
+#ifdef DEBUG_PATCHING
printf(" %d",can->end-can->beg);
#endif
if (can->anc <= ANCHOR_THRESH)
{ ncand += 1;
-#ifdef DEBUG_GAP_STATUS
+#ifdef DEBUG_PATCHING
printf(" AA %d %d(%d)",can->anc,can->bad,can->avg);
#endif
}
}
-#ifdef DEBUG_GAP_STATUS
+#ifdef DEBUG_PATCHING
printf("\n");
#endif
}
@@ -1542,7 +1723,7 @@ static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rbloc
{ int x, best, nlft, nrgt;
int nanchor, ntry;
-#ifdef DEBUG_GAP_STATUS
+#ifdef DEBUG_PATCHING
printf(" NOT ENOUGH\n");
#endif
@@ -1569,7 +1750,7 @@ static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rbloc
for (j = 0; j < gcnt; j++)
if (eval_lft_anchor(x,ovls+gsort[j].lidx) <= ANCHOR_THRESH)
nanchor += 1;
-#ifdef DEBUG_GAP_STATUS
+#ifdef DEBUG_PATCHING
printf(" %5d: %3d\n",x,nanchor);
#endif
if (nanchor > best)
@@ -1577,7 +1758,7 @@ static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rbloc
nlft = x;
}
}
-#ifdef DEBUG_GAP_STATUS
+#ifdef DEBUG_PATCHING
printf(" %5d->%5d\n",lft,nlft);
#endif
@@ -1597,7 +1778,7 @@ static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rbloc
for (j = 0; j < gcnt; j++)
if (eval_rgt_anchor(x,ovls+gsort[j].ridx) <= ANCHOR_THRESH)
nanchor += 1;
-#ifdef DEBUG_GAP_STATUS
+#ifdef DEBUG_PATCHING
printf(" %5d: %3d\n",x,nanchor);
#endif
if (nanchor > best)
@@ -1605,7 +1786,7 @@ static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rbloc
nrgt = x;
}
}
-#ifdef DEBUG_GAP_STATUS
+#ifdef DEBUG_PATCHING
printf(" %5d->%5d\n",rgt,nrgt);
#endif
@@ -1628,7 +1809,7 @@ static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rbloc
{ lidx = gsort[j].lidx;
ridx = gsort[j].ridx;
-#ifdef DEBUG_GAP_STATUS
+#ifdef DEBUG_PATCHING
if (lidx != ridx)
printf(" %5d [%5d,%5d] [%5d,%5d] %4d",
ovls[lidx].bread,ovls[lidx].path.abpos,ovls[lidx].path.aepos,
@@ -1643,18 +1824,18 @@ static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rbloc
if (can != NULL)
{
-#ifdef DEBUG_GAP_STATUS
+#ifdef DEBUG_PATCHING
printf(" %d",can->end-can->beg);
#endif
if (can->anc <= ANCHOR_THRESH)
{ ncand += 1;
-#ifdef DEBUG_GAP_STATUS
+#ifdef DEBUG_PATCHING
printf(" AA %d %d(%d)",can->anc,can->bad,can->avg);
#endif
}
}
}
-#ifdef DEBUG_GAP_STATUS
+#ifdef DEBUG_PATCHING
printf("\n");
#endif
}
@@ -1674,7 +1855,7 @@ static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rbloc
*p_rgt = rgt;
#ifdef DEBUG_GAP_STATUS
- printf(" SPAN %3d %5d: PATCHABLE\n",std,rgt-lft);
+ printf(" SPAN %5d: PATCHABLE\n",rgt-lft);
#endif
return (SPAN);
}
@@ -1682,11 +1863,11 @@ static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rbloc
}
}
-static int *GAP_ANALYSIS(Overlap *ovls, int novl, Interval *block, int *p_nblk)
+static int *GAP_ANALYSIS(int aread, Overlap *ovls, int novl, Interval *block, int *p_nblk)
{ static int bmax = 0;
static int *status = NULL; // Status of gaps between HQ_blocks
-#ifdef DEBUG_SUMMARY
+#if defined(DEBUG_SUMMARY) || defined(DEBUG_CLASS)
static char *status_string[4] = { "LOWQ", "SPAN", "SPLIT", "ADAPT" };
#endif
@@ -1694,6 +1875,8 @@ static int *GAP_ANALYSIS(Overlap *ovls, int novl, Interval *block, int *p_nblk)
int i, j;
int slft = 0, srgt = 0;
+ (void) aread;
+
nblk = *p_nblk;
if (nblk > bmax)
{ bmax = 1.2*nblk + 100;
@@ -1733,6 +1916,11 @@ static int *GAP_ANALYSIS(Overlap *ovls, int novl, Interval *block, int *p_nblk)
for (i = 1; i < nblk; i++)
printf(" %s [%d,%d]",status_string[status[i]],block[i].beg,block[i].end);
#endif
+
+#ifdef DEBUG_CLASS
+ for (i = 1; i < nblk; i++)
+ printf("AREAD %d %s [%d,%d]\n",aread,status_string[status[i]],block[i-1].end,block[i].beg);
+#endif
*p_nblk = nblk;
return (status);
@@ -1759,11 +1947,7 @@ static void GAPS(int aread, Overlap *ovls, int novl)
Interval *block;
int *status;
-#if defined(DEBUG_HQ_BLOCKS) || defined(DEBUG_HOLE_FINDER)
- printf("\n");
- printf("AREAD %d\n",aread);
-#endif
-#if defined(DEBUG_GAP_STATUS) || defined(DEBUG_SUMMARY)
+#if defined(DEBUG_HQ_BLOCKS) || defined(DEBUG_HOLE_FINDER) || defined(DEBUG_GAP_STATUS) || defined(DEBUG_SUMMARY)
printf("\n");
printf("AREAD %d\n",aread);
#endif
@@ -1803,7 +1987,7 @@ static void GAPS(int aread, Overlap *ovls, int novl)
// Determine the status of each gap between a pair of blocks
if (nblk > 0)
- status = GAP_ANALYSIS(ovls,novl,block,&nblk);
+ status = GAP_ANALYSIS(aread,ovls,novl,block,&nblk);
// No blocks? ==> nothing to do
@@ -2010,11 +2194,9 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
else
tbytes = sizeof(uint16);
- if (novl <= 0)
- return (0);
-
- Read_Overlap(input,ovls);
- if (trace)
+ if (Read_Overlap(input,ovls) != 0)
+ ovls[0].aread = INT32_MAX;
+ else if (trace)
{ if (ovls[0].path.tlen > pmax)
{ pmax = 1.2*(ovls[0].path.tlen)+10000;
paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer");
@@ -2088,6 +2270,12 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
ACTION(j,ovls,n);
}
+ if (ovls[n].aread < INT32_MAX)
+ { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n",
+ Prog_Name,DB_PART);
+ exit (1);
+ }
+
return (max);
}
@@ -2096,9 +2284,8 @@ int main(int argc, char *argv[])
char *root, *dpwd;
char *las, *lpwd;
int64 novl;
- HITS_TRACK *track;
+ DAZZ_TRACK *track;
int c;
- int COVERAGE;
// Process arguments
@@ -2260,19 +2447,19 @@ int main(int argc, char *argv[])
if (dbfile == NULL)
exit (1);
if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
for (i = 0; i < nfiles; i++)
if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
for (i = 1; i <= part; i++)
if (fscanf(dbfile,DB_BDATA,&oindx,&DB_FIRST) != 2)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
if (fscanf(dbfile,DB_BDATA,&oindx,&DB_LAST) != 2)
- SYSTEM_ERROR
+ SYSTEM_READ_ERROR
fclose(dbfile);
DB_PART = part;
*p = '\0';
@@ -2321,7 +2508,7 @@ int main(int argc, char *argv[])
// Open overlap file
- lpwd = PathTo(argv[2]);
+ lpwd = PathTo(argv[c]);
if (DB_PART)
input = Fopen(Catenate(lpwd,"/",las,Numbered_Suffix(".",DB_PART,".las")),"r");
else
diff --git a/DB.c b/DB.c
index 69060d0..9b5c85f 100644
--- a/DB.c
+++ b/DB.c
@@ -41,6 +41,24 @@ char Ebuffer[1000];
#endif
+int Count_Args(char *var)
+{ int cnt, lev;
+ char *s;
+
+ cnt = 1;
+ lev = 0;
+ for (s = var; *s != '\0'; s++)
+ if (*s == ',')
+ { if (lev == 0)
+ cnt += 1;
+ }
+ else if (*s == '(')
+ lev += 1;
+ else if (*s == ')')
+ lev -= 1;
+ return (cnt);
+}
+
void *Malloc(int64 size, char *mesg)
{ void *p;
@@ -382,7 +400,7 @@ void Number_Arrow(char *s)
********************************************************************************************/
-// Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has
+// Open the given database or dam, "path" into the supplied DAZZ_DB record "db". If the name has
// a part # in it then just the part is opened. The index array is allocated (for all or
// just the part) and read in.
// Return status of routine:
@@ -390,8 +408,8 @@ void Number_Arrow(char *s)
// 0: Open of DB proceeded without mishap
// 1: Open of DAM proceeded without mishap
-int Open_DB(char* path, HITS_DB *db)
-{ HITS_DB dbcopy;
+int Open_DB(char* path, DAZZ_DB *db)
+{ DAZZ_DB dbcopy;
char *root, *pwd, *bptr, *fptr, *cat;
int nreads;
FILE *index, *dbvis;
@@ -437,7 +455,7 @@ int Open_DB(char* path, HITS_DB *db)
if ((index = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r")) == NULL)
goto error1;
- if (fread(db,sizeof(HITS_DB),1,index) != 1)
+ if (fread(db,sizeof(DAZZ_DB),1,index) != 1)
{ EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
goto error2;
}
@@ -505,28 +523,28 @@ int Open_DB(char* path, HITS_DB *db)
nreads = ulast-ufirst;
if (part <= 0)
- { db->reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index");
+ { db->reads = (DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*(nreads+2),"Allocating Open_DB index");
if (db->reads == NULL)
goto error2;
db->reads += 1;
- if (fread(db->reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads)
+ if (fread(db->reads,sizeof(DAZZ_READ),nreads,index) != (size_t) nreads)
{ EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
free(db->reads-1);
goto error2;
}
}
else
- { HITS_READ *reads;
+ { DAZZ_READ *reads;
int i, r, maxlen;
int64 totlen;
- reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index");
+ reads = (DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*(nreads+2),"Allocating Open_DB index");
if (reads == NULL)
goto error2;
reads += 1;
- fseeko(index,sizeof(HITS_READ)*ufirst,SEEK_CUR);
- if (fread(reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads)
+ fseeko(index,sizeof(DAZZ_READ)*ufirst,SEEK_CUR);
+ if (fread(reads,sizeof(DAZZ_READ),nreads,index) != (size_t) nreads)
{ EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
free(reads-1);
goto error2;
@@ -580,13 +598,13 @@ error:
// of the current DB partition. Reallocate smaller memory blocks for the information kept
// for the retained reads.
-void Trim_DB(HITS_DB *db)
+void Trim_DB(DAZZ_DB *db)
{ int i, j, r;
int allflag, cutoff;
int64 totlen;
int maxlen, nreads;
- HITS_TRACK *record;
- HITS_READ *reads;
+ DAZZ_TRACK *record;
+ DAZZ_READ *reads;
if (db->trimmed) return;
@@ -603,7 +621,7 @@ void Trim_DB(HITS_DB *db)
for (record = db->tracks; record != NULL; record = record->next)
if (strcmp(record->name,". at qvs") == 0)
- { uint16 *table = ((HITS_QV *) record)->table;
+ { uint16 *table = ((DAZZ_QV *) record)->table;
j = 0;
for (i = 0; i < db->nreads; i++)
@@ -675,7 +693,7 @@ void Trim_DB(HITS_DB *db)
db->trimmed = 1;
if (j < nreads)
- { db->reads = Realloc(reads-1,sizeof(HITS_READ)*(j+2),NULL);
+ { db->reads = Realloc(reads-1,sizeof(DAZZ_READ)*(j+2),NULL);
db->reads += 1;
}
}
@@ -683,12 +701,12 @@ void Trim_DB(HITS_DB *db)
// The DB has already been trimmed, but a track over the untrimmed DB needs to be loaded.
// Trim the track by rereading the untrimmed DB index from the file system.
-static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart)
+static int Late_Track_Trim(DAZZ_DB *db, DAZZ_TRACK *track, int ispart)
{ int i, j, r;
int allflag, cutoff;
int ureads;
char *root;
- HITS_READ read;
+ DAZZ_READ read;
FILE *indx;
if (!db->trimmed) return (0);
@@ -703,7 +721,7 @@ static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart)
root = rindex(db->path,'/') + 2;
indx = Fopen(Catenate(db->path,"","",".idx"),"r");
- fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*db->ufirst,SEEK_SET);
+ fseeko(indx,sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*db->ufirst,SEEK_SET);
if (ispart)
ureads = ((int *) (db->reads))[-1];
else
@@ -725,7 +743,7 @@ static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart)
{ anno = (char *) track->anno;
j = r = 0;
for (i = r = 0; i < ureads; i++, r += size)
- { if (fread(&read,sizeof(HITS_READ),1,indx) != 1)
+ { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1)
{ EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
fclose(indx);
EXIT(1);
@@ -744,7 +762,7 @@ static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart)
anno4 = (int *) (track->anno);
j = anno4[0] = 0;
for (i = 0; i < ureads; i++)
- { if (fread(&read,sizeof(HITS_READ),1,indx) != 1)
+ { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1)
{ EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
fclose(indx);
EXIT(1);
@@ -764,7 +782,7 @@ static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart)
anno8 = (int64 *) (track->anno);
j = anno8[0] = 0;
for (i = 0; i < ureads; i++)
- { if (fread(&read,sizeof(HITS_READ),1,indx) != 1)
+ { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1)
{ EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
fclose(indx);
EXIT(1);
@@ -789,8 +807,8 @@ static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart)
// and any open file pointers. The record pointed at by db however remains (the user
// supplied it and so should free it).
-void Close_DB(HITS_DB *db)
-{ HITS_TRACK *t, *p;
+void Close_DB(DAZZ_DB *db)
+{ DAZZ_TRACK *t, *p;
if (db->loaded)
free(((char *) (db->bases)) - 1);
@@ -813,19 +831,19 @@ void Close_DB(HITS_DB *db)
// Return the size in bytes of the memory occupied by a given DB
-int64 sizeof_DB(HITS_DB *db)
+int64 sizeof_DB(DAZZ_DB *db)
{ int64 s;
- HITS_TRACK *t;
+ DAZZ_TRACK *t;
- s = sizeof(HITS_DB)
- + sizeof(HITS_READ)*(db->nreads+2)
+ s = sizeof(DAZZ_DB)
+ + sizeof(DAZZ_READ)*(db->nreads+2)
+ strlen(db->path)+1
+ (db->totlen+db->nreads+4);
t = db->tracks;
if (t != NULL && strcmp(t->name,". at qvs") == 0)
- { HITS_QV *q = (HITS_QV *) t;
- s += sizeof(HITS_QV)
+ { DAZZ_QV *q = (DAZZ_QV *) t;
+ s += sizeof(DAZZ_QV)
+ sizeof(uint16) * db->nreads
+ q->ncodes * sizeof(QVcoding)
+ 6;
@@ -833,7 +851,7 @@ int64 sizeof_DB(HITS_DB *db)
}
for (; t != NULL; t = t->next)
- { s += sizeof(HITS_TRACK)
+ { s += sizeof(DAZZ_TRACK)
+ strlen(t->name)+1
+ t->size * (db->nreads+1);
if (t->data != NULL)
@@ -854,14 +872,14 @@ int64 sizeof_DB(HITS_DB *db)
*
********************************************************************************************/
-HITS_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry"
-HITS_QV *Active_QV; // Becomes invalid after closing
+DAZZ_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry"
+DAZZ_QV *Active_QV; // Becomes invalid after closing
-int Load_QVs(HITS_DB *db)
+int Load_QVs(DAZZ_DB *db)
{ FILE *quiva, *istub, *indx;
char *root;
uint16 *table;
- HITS_QV *qvtrk;
+ DAZZ_QV *qvtrk;
QVcoding *coding, *nx;
int ncodes = 0;
@@ -956,7 +974,7 @@ int Load_QVs(HITS_DB *db)
goto error;
}
- // Carefully get the first coding scheme (its offset is most likely in a HITS_RECORD
+ // Carefully get the first coding scheme (its offset is most likely in a DAZZ_RECORD
// in .idx that is *not* in memory). Get all the other coding schemes normally and
// assign the tables # for each read in the block in "tables".
@@ -974,10 +992,10 @@ int Load_QVs(HITS_DB *db)
i = n-fbeg;
if (first < pfirst)
- { HITS_READ read;
+ { DAZZ_READ read;
- fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*first,SEEK_SET);
- if (fread(&read,sizeof(HITS_READ),1,indx) != 1)
+ fseeko(indx,sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*first,SEEK_SET);
+ if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1)
{ EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
ncodes = i;
goto error;
@@ -1050,17 +1068,17 @@ int Load_QVs(HITS_DB *db)
}
}
- // Allocate and fill in the HITS_QV record and add it to the front of the
+ // Allocate and fill in the DAZZ_QV record and add it to the front of the
// track list
- qvtrk = (HITS_QV *) Malloc(sizeof(HITS_QV),"Allocating QV pseudo-track");
+ qvtrk = (DAZZ_QV *) Malloc(sizeof(DAZZ_QV),"Allocating QV pseudo-track");
if (qvtrk == NULL)
goto error;
qvtrk->name = Strdup(". at qvs","Allocating QV pseudo-track name");
if (qvtrk->name == NULL)
goto error;
qvtrk->next = db->tracks;
- db->tracks = (HITS_TRACK *) qvtrk;
+ db->tracks = (DAZZ_TRACK *) qvtrk;
qvtrk->ncodes = ncodes;
qvtrk->table = table;
qvtrk->coding = coding;
@@ -1091,16 +1109,16 @@ error:
// Close the QV stream, free the QV pseudo track and all associated memory
-void Close_QVs(HITS_DB *db)
-{ HITS_TRACK *track;
- HITS_QV *qvtrk;
+void Close_QVs(DAZZ_DB *db)
+{ DAZZ_TRACK *track;
+ DAZZ_QV *qvtrk;
int i;
Active_DB = NULL;
track = db->tracks;
if (track != NULL && strcmp(track->name,". at qvs") == 0)
- { qvtrk = (HITS_QV *) track;
+ { qvtrk = (DAZZ_QV *) track;
for (i = 0; i < qvtrk->ncodes; i++)
Free_QVcoding(qvtrk->coding+i);
free(qvtrk->coding);
@@ -1125,7 +1143,7 @@ void Close_QVs(HITS_DB *db)
// -1: Track is not the right size of DB either trimmed or untrimmed
// -2: Could not find the track
-int Check_Track(HITS_DB *db, char *track, int *kind)
+int Check_Track(DAZZ_DB *db, char *track, int *kind)
{ FILE *afile;
int tracklen, size, ispart;
int ureads, treads;
@@ -1181,10 +1199,10 @@ int Check_Track(HITS_DB *db, char *track, int *kind)
// If track is not already in the db's track list, then allocate all the storage for it,
// read it in from the appropriate file, add it to the track list, and return a pointer
-// to the newly created HITS_TRACK record. If the track does not exist or cannot be
+// to the newly created DAZZ_TRACK record. If the track does not exist or cannot be
// opened for some reason, then NULL is returned.
-HITS_TRACK *Load_Track(HITS_DB *db, char *track)
+DAZZ_TRACK *Load_Track(DAZZ_DB *db, char *track)
{ FILE *afile, *dfile;
int tracklen, size;
int nreads, ispart;
@@ -1192,7 +1210,7 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track)
void *anno;
void *data;
char *name;
- HITS_TRACK *record;
+ DAZZ_TRACK *record;
if (track[0] == '.')
{ EPRINTF(EPLACE,"%s: Track name, '%s', cannot begin with a .\n",Prog_Name,track);
@@ -1340,7 +1358,7 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track)
fclose(afile);
- record = (HITS_TRACK *) Malloc(sizeof(HITS_TRACK),"Allocating Track Record");
+ record = (DAZZ_TRACK *) Malloc(sizeof(DAZZ_TRACK),"Allocating Track Record");
if (record == NULL)
goto error;
record->name = Strdup(track,"Allocating Track Name");
@@ -1379,8 +1397,161 @@ error:
EXIT (NULL);
}
-void Close_Track(HITS_DB *db, char *track)
-{ HITS_TRACK *record, *prev;
+// Assumming file pointer for afile is correctly positioned at the start of a extra item,
+// and aname is the name of the .anno file, decode the value present and places it in
+// extra if extra->nelem == 0, otherwise reduce the value just read into extra according
+// according the to the directive given by 'accum'. Leave the read poinrt at the next
+// extra or end-of-file.
+// Returns:
+// 1 if at the end of file,
+// 0 if item was read and folded correctly,
+// -1 if there was a system IO or allocation error (if interactive), and
+// -2 if the new value could not be reduced into the currenct value of extra (interactive)
+
+int Read_Extra(FILE *afile, char *aname, DAZZ_EXTRA *extra)
+{ int vtype, nelem, accum, slen;
+ char *name;
+ void *value;
+
+#define EREAD(v,s,n,file,ret) \
+ { if (fread(v,s,n,file) != (size_t) n) \
+ { if (ferror(file)) \
+ fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \
+ else if (ret) \
+ return (1); \
+ else \
+ fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,aname); \
+ EXIT(-1); \
+ } \
+ }
+
+ EREAD(&vtype,sizeof(int),1,afile,1)
+ EREAD(&nelem,sizeof(int),1,afile,0)
+ EREAD(&accum,sizeof(int),1,afile,0)
+ EREAD(&slen,sizeof(int),1,afile,0)
+
+ if (extra == NULL)
+ { if (fseeko(afile,slen+8*nelem,SEEK_CUR) < 0)
+ { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name);
+ EXIT(-1);
+ }
+ return (0);
+ }
+
+ name = (char *) Malloc(slen+1,"Allocating extra name");
+ value = Malloc(8*nelem,"Allocating extra value");
+ if (name == NULL || value == NULL)
+ EXIT(-1);
+
+ EREAD(name,1,slen,afile,0);
+ EREAD(value,8,nelem,afile,0);
+ name[slen] = '\0';
+
+ if (extra->nelem == 0)
+ { extra->vtype = vtype;
+ extra->nelem = nelem;
+ extra->accum = accum;
+ extra->name = name;
+ extra->value = value;
+ return (0);
+ }
+
+ if (vtype != extra->vtype)
+ { fprintf(stderr,"%s: Type of extra %s does not agree with previous .anno block files\n",
+ Prog_Name,name);
+ goto error;
+ }
+ if (nelem != extra->nelem)
+ { fprintf(stderr,"%s: Length of extra %s does not agree with previous .anno block files\n",
+ Prog_Name,name);
+ goto error;
+ }
+ if (accum != extra->accum)
+ { fprintf(stderr,"%s: Reduction indicator of extra %s does not agree with",Prog_Name,name);
+ fprintf(stderr," previos .anno block files\n");
+ goto error;
+ }
+ if (strcmp(name,extra->name) != 0)
+ { fprintf(stderr,"%s: Expecting extra %s in .anno block file, not %s\n",
+ Prog_Name,extra->name,name);
+ goto error;
+ }
+
+ if (vtype == DB_INT)
+ { int64 *ival = (int64 *) value;
+ int64 *eval = (int64 *) (extra->value);
+ int j;
+
+ if (accum == DB_EXACT)
+ { for (j = 0; j < nelem; j++)
+ if (eval[j] != ival[j])
+ { fprintf(stderr,"%s: Value of extra %s doe not agree",Prog_Name,name);
+ fprintf(stderr," with previous .anno block files\n");
+ goto error;
+ }
+ }
+ else
+ { for (j = 0; j < nelem; j++)
+ eval[j] += ival[j];
+ }
+ }
+
+ else
+ { double *ival = (double *) value;
+ double *eval = (double *) (extra->value);
+ int j;
+
+ if (accum == DB_EXACT)
+ { for (j = 0; j < nelem; j++)
+ if (eval[j] != ival[j])
+ { fprintf(stderr,"%s: Value of extra %s doe not agree",Prog_Name,name);
+ fprintf(stderr," with previous .anoo block files\n");
+ goto error;
+ }
+ }
+ else
+ { for (j = 0; j < nelem; j++)
+ eval[j] += ival[j];
+ }
+ }
+
+ free(value);
+ free(name);
+ return (0);
+
+error:
+ free(value);
+ free(name);
+ EXIT(1);
+}
+
+// Write extra record to end of file afile and advance write pointer
+// If interactive, then return non-zero on error, if bash, then print
+// and halt if an error
+
+int Write_Extra(FILE *afile, DAZZ_EXTRA *extra)
+{ int slen;
+
+#define EWRITE(v,s,n,file) \
+ { if (fwrite(v,s,n,file) != (size_t) n) \
+ { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \
+ EXIT(1); \
+ } \
+ }
+
+ EWRITE(&(extra->vtype),sizeof(int),1,afile)
+ FWRITE(&(extra->nelem),sizeof(int),1,afile)
+ FWRITE(&(extra->accum),sizeof(int),1,afile)
+ slen = strlen(extra->name);
+ FWRITE(&slen,sizeof(int),1,afile)
+ FWRITE(extra->name,1,slen,afile)
+ FWRITE(extra->value,8,extra->nelem,afile)
+
+ return (0);
+}
+
+void Close_Track(DAZZ_DB *db, char *track)
+{ DAZZ_TRACK *record, *prev;
prev = NULL;
for (record = db->tracks; record != NULL; record = record->next)
@@ -1410,7 +1581,7 @@ void Close_Track(HITS_DB *db, char *track)
// Allocate and return a buffer big enough for the largest read in 'db', leaving room
// for an initial delimiter character
-char *New_Read_Buffer(HITS_DB *db)
+char *New_Read_Buffer(DAZZ_DB *db)
{ char *read;
read = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer");
@@ -1425,11 +1596,11 @@ char *New_Read_Buffer(HITS_DB *db)
//
// **NB**, the byte before read will be set to a delimiter character!
-int Load_Read(HITS_DB *db, int i, char *read, int ascii)
+int Load_Read(DAZZ_DB *db, int i, char *read, int ascii)
{ FILE *bases = (FILE *) db->bases;
int64 off;
int len, clen;
- HITS_READ *r = db->reads;
+ DAZZ_READ *r = db->reads;
if (i >= db->nreads)
{ EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name);
@@ -1472,14 +1643,14 @@ int Load_Read(HITS_DB *db, int i, char *read, int ascii)
// and as a numeric string otherwise.
//
-HITS_DB *Arrow_DB = NULL; // Last db/arw used by "Load_Arrow"
+DAZZ_DB *Arrow_DB = NULL; // Last db/arw used by "Load_Arrow"
FILE *Arrow_File = NULL; // Becomes invalid after closing
-int Load_Arrow(HITS_DB *db, int i, char *read, int ascii)
+int Load_Arrow(DAZZ_DB *db, int i, char *read, int ascii)
{ FILE *arrow;
int64 off;
int len, clen;
- HITS_READ *r = db->reads;
+ DAZZ_READ *r = db->reads;
if (i >= db->nreads)
{ EPRINTF(EPLACE,"%s: Index out of bounds (Load_Arrow)\n",Prog_Name);
@@ -1519,12 +1690,12 @@ int Load_Arrow(HITS_DB *db, int i, char *read, int ascii)
return (0);
}
-char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii)
+char *Load_Subread(DAZZ_DB *db, int i, int beg, int end, char *read, int ascii)
{ FILE *bases = (FILE *) db->bases;
int64 off;
int len, clen;
int bbeg, bend;
- HITS_READ *r = db->reads;
+ DAZZ_READ *r = db->reads;
if (i >= db->nreads)
{ EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name);
@@ -1578,7 +1749,7 @@ char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii)
// Allocate and return a buffer of 5 vectors big enough for the largest read in 'db'
-char **New_QV_Buffer(HITS_DB *db)
+char **New_QV_Buffer(DAZZ_DB *db)
{ char **entry;
char *qvs;
int i;
@@ -1595,8 +1766,8 @@ char **New_QV_Buffer(HITS_DB *db)
// Load into entry the QV streams for the i'th read from db. The parameter ascii applies to
// the DELTAG stream as described for Load_Read.
-int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii)
-{ HITS_READ *reads;
+int Load_QVentry(DAZZ_DB *db, int i, char **entry, int ascii)
+{ DAZZ_READ *reads;
FILE *quiva;
int rlen;
@@ -1605,7 +1776,7 @@ int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii)
{ EPRINTF(EPLACE,"%s: QV's are not loaded (Load_QVentry)\n",Prog_Name);
EXIT(1);
}
- Active_QV = (HITS_QV *) db->tracks;
+ Active_QV = (DAZZ_QV *) db->tracks;
Active_DB = db;
}
if (i >= db->nreads)
@@ -1655,10 +1826,10 @@ int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii)
// non-zero then the reads are converted to ACGT ascii, otherwise the reads are left
// as numeric strings over 0(A), 1(C), 2(G), and 3(T).
-int Read_All_Sequences(HITS_DB *db, int ascii)
+int Read_All_Sequences(DAZZ_DB *db, int ascii)
{ FILE *bases;
int nreads = db->nreads;
- HITS_READ *reads = db->reads;
+ DAZZ_READ *reads = db->reads;
void (*translate)(char *s);
char *seq;
@@ -1713,6 +1884,16 @@ int Read_All_Sequences(HITS_DB *db, int ascii)
return (0);
}
+// For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. all
+// those of the form "prefix/[.]root.part" and call actor with the complete path to each file
+// pointed at by path, and the suffix of the path by extension. The . proceeds the root
+// name if the defined constant HIDE_FILES is set. Always the first call is with the
+// path "prefix/root.[db|dam]" and extension "db" or "dam". There will always be calls for
+// "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and
+// so this routine gives one a way to know all the tracks associated with a given DB.
+// -1 is returned if the path could not be found, and 1 is returned if an error (reported
+// to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned.
+
int List_DB_Files(char *path, void actor(char *path, char *extension))
{ int status, plen, rlen, dlen;
char *root, *pwd, *name;
@@ -1750,19 +1931,9 @@ int List_DB_Files(char *path, void actor(char *path, char *extension))
{ isdam = 1;
break;
}
- if (strcasecmp(name,Catenate("","",root,".db")) == 0)
- { strncpy(root,name,rlen);
- break;
- }
- if (strcasecmp(name,Catenate("","",root,".dam")) == 0)
- { strncpy(root,name,rlen);
- isdam = 1;
- break;
- }
}
if (dp == NULL)
- { EPRINTF(EPLACE,"%s: Cannot find %s (List_DB_Files)\n",Prog_Name,pwd);
- status = -1;
+ { status = -1;
closedir(dirp);
goto error;
}
diff --git a/DB.h b/DB.h
index dc281de..ff41e9f 100644
--- a/DB.h
+++ b/DB.h
@@ -12,9 +12,9 @@
*
********************************************************************************************/
-#ifndef _HITS_DB
+#ifndef _DAZZ_DB
-#define _HITS_DB
+#define _DAZZ_DB
#include <stdio.h>
@@ -74,11 +74,6 @@ extern char Ebuffer[];
#endif
-#define SYSTEM_ERROR \
- { EPRINTF(EPLACE,"%s: System error, read failed!\n",Prog_Name); \
- exit (2); \
- }
-
#define ARG_INIT(name) \
Prog_Name = Strdup(name,""); \
for (i = 0; i < 128; i++) \
@@ -125,6 +120,108 @@ extern char Ebuffer[];
exit (1); \
}
+
+/*******************************************************************************************
+ *
+ * GUARDED BATCH IO MACROS
+ *
+ ********************************************************************************************/
+
+ // Utilitieis
+
+int Count_Args(char *arg);
+
+#define SYSTEM_READ_ERROR \
+ { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \
+ exit (2); \
+ }
+
+#define SYSTEM_WRITE_ERROR \
+ { fprintf(stderr,"%s: System error, write failed!\n",Prog_Name); \
+ exit (2); \
+ }
+
+#define SYSTEM_CLOSE_ERROR \
+ { fprintf(stderr,"%s: System error, file close failed!\n",Prog_Name); \
+ exit (2); \
+ }
+
+ // Output
+
+#define FWRITE(v,s,n,file) \
+ { if (fwrite(v,s,n,file) != (size_t) n) \
+ SYSTEM_WRITE_ERROR \
+ }
+
+#define FPRINTF(file,...) \
+ { if (fprintf(file,__VA_ARGS__) < 0) \
+ SYSTEM_WRITE_ERROR \
+ }
+
+#define PRINTF(...) \
+ { if (printf(__VA_ARGS__) < 0) \
+ SYSTEM_WRITE_ERROR \
+ }
+
+#define FPUTS(x,file) \
+ { if (fputs(x,file) == EOF) \
+ SYSTEM_WRITE_ERROR \
+ }
+
+ // Close
+
+#define FCLOSE(file) \
+ { if (fclose(file) != 0) \
+ SYSTEM_CLOSE_ERROR \
+ }
+
+ // Input
+
+#define FREAD(v,s,n,file) \
+ { if (fread(v,s,n,file) != (size_t) n) \
+ { if (ferror(file)) \
+ SYSTEM_READ_ERROR \
+ else \
+ { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \
+ exit (1); \
+ } \
+ } \
+ }
+
+#define FSCANF(file,...) \
+ { if (fscanf(file,__VA_ARGS__) != Count_Args(#__VA_ARGS__)-1) \
+ { if (ferror(file)) \
+ SYSTEM_READ_ERROR \
+ else \
+ { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \
+ exit (1); \
+ } \
+ } \
+ }
+
+#define FGETS(v,n,file) \
+ { if (fgets(v,n,file) == NULL) \
+ { if (ferror(file)) \
+ SYSTEM_READ_ERROR \
+ else \
+ { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \
+ exit (1); \
+ } \
+ } \
+ }
+
+#define FSEEKO(file,p,d) \
+ { if (fseeko(file,p,d) < 0) \
+ SYSTEM_READ_ERROR \
+ }
+
+#define FTELLO(file) \
+ ( { int x = ftello(file); \
+ if (x < 0) \
+ SYSTEM_READ_ERROR \
+ ; x; \
+ } )
+
/*******************************************************************************************
*
* UTILITIES
@@ -193,7 +290,7 @@ typedef struct
// Offset (in bytes) of scaffold header string in '.hdr' file (DAM)
// 4 compressed shorts containing snr info if an arrow DB.
int flags; // QV of read + flags above (DB only)
- } HITS_READ;
+ } DAZZ_READ;
// A track can be of 3 types:
// data == NULL: there are nreads 'anno' records of size 'size'.
@@ -208,9 +305,31 @@ typedef struct _track
int size; // Size in bytes of anno records
void *anno; // over [0,nreads]: read i annotation: int, int64, or 'size' records
void *data; // data[anno[i] .. anno[i+1]-1] is data if data != NULL
- } HITS_TRACK;
+ } DAZZ_TRACK;
+
+// The tailing part of a .anno track file can contain meta-information produced by the
+// command that produced the track. For example, the coverage, or good/bad parameters
+// for trimming, or even say a histogram of QV values. Each item is an array of 'nelem'
+// 64-bit ints or floats ('vtype' = DB_INT or DB_REAL), has a 'name' string that
+// describes it, and an indicator as to whether the values should be equal accross all
+// block tracks, or summed accross all block tracks (by Catrack). 'value' points at the
+// array of values
+
+#define DB_INT 0
+#define DB_REAL 1
-// The information for accessing QV streams is in a HITS_QV record that is a "pseudo-track"
+#define DB_EXACT 0
+#define DB_SUM 1
+
+typedef struct
+ { int vtype; // INT64 or FLOAST64
+ int nelem; // >= 1
+ int accum; // EXACT, SUM
+ char *name;
+ void *value;
+ } DAZZ_EXTRA;
+
+// The information for accessing QV streams is in a DAZZ_QV record that is a "pseudo-track"
// named ". at qvs" and is always the first track record in the list (if present). Since normal
// track names cannot begin with a . (this is enforced), this pseudo-track is never confused
// with a normal track.
@@ -223,11 +342,11 @@ typedef struct
uint16 *table; // for i in [0,db->nreads-1]: read i should be decompressed with
// scheme coding[table[i]]
FILE *quiva; // the open file pointer to the .qvs file
- } HITS_QV;
+ } DAZZ_QV;
// The DB record holds all information about the current state of an active DB including an
-// array of HITS_READS, one per read, and a linked list of HITS_TRACKs the first of which
-// is always a HITS_QV pseudo-track (if the QVs have been loaded).
+// array of DAZZ_READS, one per read, and a linked list of DAZZ_TRACKs the first of which
+// is always a DAZZ_QV pseudo-track (if the QVs have been loaded).
typedef struct
{ int ureads; // Total number of reads in untrimmed DB
@@ -257,9 +376,9 @@ typedef struct
int loaded; // Are reads loaded in memory?
void *bases; // file pointer for bases file (to fetch reads from),
// or memory pointer to uncompressed block of all sequences.
- HITS_READ *reads; // Array [-1..nreads] of HITS_READ
- HITS_TRACK *tracks; // Linked list of loaded tracks
- } HITS_DB;
+ DAZZ_READ *reads; // Array [-1..nreads] of DAZZ_READ
+ DAZZ_TRACK *tracks; // Linked list of loaded tracks
+ } DAZZ_DB;
/*******************************************************************************************
@@ -294,7 +413,7 @@ typedef struct
// contain N-separated contigs), and .fpulse the first base of the contig in the
// fasta entry
- // Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has
+ // Open the given database or dam, "path" into the supplied DAZZ_DB record "db". If the name has
// a part # in it then just the part is opened. The index array is allocated (for all or
// just the part) and read in.
// Return status of routine:
@@ -302,34 +421,34 @@ typedef struct
// 0: Open of DB proceeded without mishap
// 1: Open of DAM proceeded without mishap
-int Open_DB(char *path, HITS_DB *db);
+int Open_DB(char *path, DAZZ_DB *db);
// Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings
// of the current DB partition. Reallocate smaller memory blocks for the information kept
// for the retained reads.
-void Trim_DB(HITS_DB *db);
+void Trim_DB(DAZZ_DB *db);
// Shut down an open 'db' by freeing all associated space, including tracks and QV structures,
// and any open file pointers. The record pointed at by db however remains (the user
// supplied it and so should free it).
-void Close_DB(HITS_DB *db);
+void Close_DB(DAZZ_DB *db);
// Return the size in bytes of the given DB
-int64 sizeof_DB(HITS_DB *db);
+int64 sizeof_DB(DAZZ_DB *db);
// If QV pseudo track is not already in db's track list, then load it and set it up.
// The database must not have been trimmed yet. -1 is returned if a .qvs file is not
// present, and 1 is returned if an error (reported to EPLACE) occured and INTERACTIVE
// is defined. Otherwise a 0 is returned.
-int Load_QVs(HITS_DB *db);
+int Load_QVs(DAZZ_DB *db);
// Remove the QV pseudo track, all space associated with it, and close the .qvs file.
-void Close_QVs(HITS_DB *db);
+void Close_QVs(DAZZ_DB *db);
// Look up the file and header in the file of the indicated track. Return:
// 1: Track is for trimmed DB
@@ -344,28 +463,47 @@ void Close_QVs(HITS_DB *db);
#define CUSTOM_TRACK 0
#define MASK_TRACK 1
-int Check_Track(HITS_DB *db, char *track, int *kind);
+int Check_Track(DAZZ_DB *db, char *track, int *kind);
// If track is not already in the db's track list, then allocate all the storage for it,
// read it in from the appropriate file, add it to the track list, and return a pointer
- // to the newly created HITS_TRACK record. If the track does not exist or cannot be
+ // to the newly created DAZZ_TRACK record. If the track does not exist or cannot be
// opened for some reason, then NULL is returned if INTERACTIVE is defined. Otherwise
// the routine prints an error message to stderr and exits if an error occurs, and returns
// with NULL only if the track does not exist.
-HITS_TRACK *Load_Track(HITS_DB *db, char *track);
+DAZZ_TRACK *Load_Track(DAZZ_DB *db, char *track);
+
+ // Assumming file pointer for afile is correctly positioned at the start of a extra item,
+ // and aname is the name of the .anno file, decode the value present and places it in
+ // extra if extra->nelem == 0, otherwise reduce the value just read into extra according
+ // according the to the directive given by 'accum'. Leave the read poinrt at the next
+ // extra or end-of-file.
+ // Returns:
+ // 1 if at the end of file,
+ // 0 if item was read and folded correctly,
+ // -1 if there was a system IO or allocation error (if interactive), and
+ // -2 if the new value could not be reduced into the currenct value of extra (interactive)
+
+int Read_Extra(FILE *afile, char *aname, DAZZ_EXTRA *extra);
+
+// Write extra record to end of file afile and advance write pointer
+// If interactive, then return non-zero on error, if bash, then print
+// and halt if an error
+
+int Write_Extra(FILE *afile, DAZZ_EXTRA *extra);
// If track is on the db's track list, then it is removed and all storage associated with it
// is freed.
-void Close_Track(HITS_DB *db, char *track);
+void Close_Track(DAZZ_DB *db, char *track);
// Allocate and return a buffer big enough for the largest read in 'db'.
// **NB** free(x-1) if x is the value returned as *prefix* and suffix '\0'(4)-byte
// are needed by the alignment algorithms. If cannot allocate memory then return NULL
// if INTERACTIVE is defined, or print error to stderr and exit otherwise.
-char *New_Read_Buffer(HITS_DB *db);
+char *New_Read_Buffer(DAZZ_DB *db);
// Load into 'read' the i'th read in 'db'. As a lower case ascii string if ascii is 1, an
// upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T)
@@ -373,12 +511,12 @@ char *New_Read_Buffer(HITS_DB *db);
// for traversals in either direction. A non-zero value is returned if an error occured
// and INTERACTIVE is defined.
-int Load_Read(HITS_DB *db, int i, char *read, int ascii);
+int Load_Read(DAZZ_DB *db, int i, char *read, int ascii);
// Exactly the same as Load_Read, save the arrow information is loaded, not the DNA sequence,
// and there is only a choice between numeric (0) or ascii (1);
-int Load_Arrow(HITS_DB *db, int i, char *read, int ascii);
+int Load_Arrow(DAZZ_DB *db, int i, char *read, int ascii);
// Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the
// the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii
@@ -387,7 +525,7 @@ int Load_Arrow(HITS_DB *db, int i, char *read, int ascii);
// the string holding the substring so it has a delimeter for traversals in either direction.
// A NULL pointer is returned if an error occured and INTERACTIVE is defined.
-char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii);
+char *Load_Subread(DAZZ_DB *db, int i, int beg, int end, char *read, int ascii);
// Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur
// in the database. If cannot allocate memory then return NULL if INTERACTIVE is defined,
@@ -399,13 +537,13 @@ char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii);
#define SUB_QV 3 // The substitution QVs
#define MRG_QV 4 // The merge QVs
-char **New_QV_Buffer(HITS_DB *db);
+char **New_QV_Buffer(DAZZ_DB *db);
// Load into 'entry' the 5 QV vectors for i'th read in 'db'. The deletion tag or characters
// are converted to a numeric or upper/lower case ascii string as per ascii. Return with
// a zero, except when an error occurs and INTERACTIVE is defined in which case return wtih 1.
-int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii);
+int Load_QVentry(DAZZ_DB *db, int i, char **entry, int ascii);
// Allocate a block big enough for all the uncompressed sequences, read them into it,
// reset the 'off' in each read record to be its in-memory offset, and set the
@@ -415,7 +553,7 @@ int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii);
// Return with a zero, except when an error occurs and INTERACTIVE is defined in which
// case return wtih 1.
-int Read_All_Sequences(HITS_DB *db, int ascii);
+int Read_All_Sequences(DAZZ_DB *db, int ascii);
// For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. all
// those of the form "prefix/[.]root.part" and call actor with the complete path to each file
@@ -429,4 +567,4 @@ int Read_All_Sequences(HITS_DB *db, int ascii);
int List_DB_Files(char *path, void actor(char *path, char *extension));
-#endif // _HITS_DB
+#endif // _DAZZ_DB
diff --git a/REPqv.c b/REPqv.c
index 2b51f4a..f183e7b 100644
--- a/REPqv.c
+++ b/REPqv.c
@@ -25,7 +25,7 @@ static char *Usage = "<source:db> ...";
#define MAXQV 50 // Max QV score is 50
#define MAXQV1 51
-static HITS_DB _DB, *DB = &_DB; // Data base
+static DAZZ_DB _DB, *DB = &_DB; // Data base
static int64 *QV_IDX; // qual track index
static uint8 *QV; // qual track values
@@ -48,7 +48,7 @@ int main(int argc, char *argv[])
{ int status;
char *root;
int i, bval, gval, cover, hgap_min;
- HITS_TRACK *track;
+ DAZZ_TRACK *track;
int64 nreads, totlen;
int64 qgram[MAXQV1];
int64 qsum, qtotal;
diff --git a/REPtrim.c b/REPtrim.c
index 5e74674..831c677 100644
--- a/REPtrim.c
+++ b/REPtrim.c
@@ -30,10 +30,9 @@ static char *Usage = "<source:db> ...";
#define ADSUF 4 // Gap is due to adaptemer, trim suffix interval to right
-
// Global Variables (must exist across the processing of each pile)
-static HITS_DB _DB, *DB = &_DB; // Data base
+static DAZZ_DB _DB, *DB = &_DB; // Data base
static int64 *TRIM_IDX; // trim track index
static int *TRIM; // trim track values
@@ -57,7 +56,7 @@ int main(int argc, char *argv[])
char *root;
int i, a, tb, te;
int alen;
- HITS_TRACK *track;
+ DAZZ_TRACK *track;
int64 nreads, totlen;
int64 nelim, nelimbp;
int64 n5trm, n5trmbp;
@@ -67,7 +66,8 @@ int main(int argc, char *argv[])
int64 nlowq, nlowqbp;
int64 nspan, nspanbp;
int64 nchim, nchimbp;
- int BAD_QV, GOOD_QV, COVERAGE, HGAP_MIN;
+ int rlog, blog;
+ int BAD_QV, GOOD_QV, HGAP_MIN;
status = Open_DB(argv[c],DB);
if (status < 0)
@@ -104,25 +104,17 @@ int main(int argc, char *argv[])
if (extra == 4*sizeof(int))
{ fread(&GOOD_QV,sizeof(int),1,afile);
fread(&BAD_QV,sizeof(int),1,afile);
- fread(&COVERAGE,sizeof(int),1,afile);
+ fread(&HGAP_MIN,sizeof(int),1,afile);
fread(&HGAP_MIN,sizeof(int),1,afile);
}
- else if (extra == 3*sizeof(int))
- { fread(&GOOD_QV,sizeof(int),1,afile);
- fread(&BAD_QV,sizeof(int),1,afile);
- fread(&COVERAGE,sizeof(int),1,afile);
- HGAP_MIN = 0;
- }
- else if (extra == 2*sizeof(int))
+ else if (extra == 3*sizeof(int) || extra == 2*sizeof(int))
{ fread(&GOOD_QV,sizeof(int),1,afile);
fread(&BAD_QV,sizeof(int),1,afile);
- COVERAGE = -1;
HGAP_MIN = 0;
}
else
{ GOOD_QV = -1;
BAD_QV = -1;
- COVERAGE = -1;
HGAP_MIN = 0;
}
fclose(afile);
@@ -216,10 +208,35 @@ int main(int argc, char *argv[])
printf(" [-H%d]",HGAP_MIN);
printf(" %s\n\n",root);
+ { int64 mult;
+
+ rlog = 0;
+ mult = 1;
+ while (mult <= nreads || mult <= ngaps)
+ { mult *= 10;
+ rlog += 1;
+ }
+ if (rlog <= 3)
+ rlog = 3;
+ else
+ rlog += (rlog-1)/3;
+
+ blog = 0;
+ mult = 1;
+ while (mult <= totlen)
+ { mult *= 10;
+ blog += 1;
+ }
+ if (blog <= 3)
+ blog = 3;
+ else
+ blog += (blog-1)/3;
+ }
+
printf(" Input: ");
- Print_Number((int64) nreads,7,stdout);
+ Print_Number((int64) nreads,rlog,stdout);
printf(" (100.0%%) reads ");
- Print_Number(totlen,12,stdout);
+ Print_Number(totlen,blog,stdout);
printf(" (100.0%%) bases");
if (HGAP_MIN > 0)
{ printf(" (another ");
@@ -229,69 +246,69 @@ int main(int argc, char *argv[])
printf("\n");
printf(" Trimmed: ");
- Print_Number(nelim,7,stdout);
+ Print_Number(nelim,rlog,stdout);
printf(" (%5.1f%%) reads ",(100.*nelim)/nreads);
- Print_Number(nelimbp,12,stdout);
+ Print_Number(nelimbp,blog,stdout);
printf(" (%5.1f%%) bases\n",(100.*nelimbp)/totlen);
printf(" 5' trim: ");
- Print_Number(n5trm,7,stdout);
+ Print_Number(n5trm,rlog,stdout);
printf(" (%5.1f%%) reads ",(100.*n5trm)/nreads);
- Print_Number(n5trmbp,12,stdout);
+ Print_Number(n5trmbp,blog,stdout);
printf(" (%5.1f%%) bases\n",(100.*n5trmbp)/totlen);
printf(" 3' trim: ");
- Print_Number(n3trm,7,stdout);
+ Print_Number(n3trm,rlog,stdout);
printf(" (%5.1f%%) reads ",(100.*n3trm)/nreads);
- Print_Number(n3trmbp,12,stdout);
+ Print_Number(n3trmbp,blog,stdout);
printf(" (%5.1f%%) bases\n",(100.*n3trmbp)/totlen);
if (natrm > 0)
{ printf(" Adapter: ");
- Print_Number(natrm,7,stdout);
+ Print_Number(natrm,rlog,stdout);
printf(" (%5.1f%%) reads ",(100.*natrm)/nreads);
- Print_Number(natrmbp,12,stdout);
+ Print_Number(natrmbp,blog,stdout);
printf(" (%5.1f%%) bases\n",(100.*natrmbp)/totlen);
}
printf("\n");
printf(" Gaps: ");
- Print_Number(ngaps,7,stdout);
+ Print_Number(ngaps,rlog,stdout);
printf(" (%5.1f%%) gaps ",(100.*(ngaps))/nreads);
- Print_Number(ngapsbp,12,stdout);
+ Print_Number(ngapsbp,blog,stdout);
printf(" (%5.1f%%) bases\n",(100.*(ngapsbp))/totlen);
printf(" Low QV: ");
- Print_Number(nlowq,7,stdout);
+ Print_Number(nlowq,rlog,stdout);
printf(" (%5.1f%%) gaps ",(100.*(nlowq))/nreads);
- Print_Number(nlowqbp,12,stdout);
+ Print_Number(nlowqbp,blog,stdout);
printf(" (%5.1f%%) bases\n",(100.*(nlowqbp))/totlen);
printf(" Span'd: ");
- Print_Number(nspan,7,stdout);
+ Print_Number(nspan,rlog,stdout);
printf(" (%5.1f%%) gaps ",(100.*(nspan))/nreads);
- Print_Number(nspanbp,12,stdout);
+ Print_Number(nspanbp,blog,stdout);
printf(" (%5.1f%%) bases\n",(100.*(nspanbp))/totlen);
printf(" Break: ");
- Print_Number(nchim,7,stdout);
+ Print_Number(nchim,rlog,stdout);
printf(" (%5.1f%%) gaps ",(100.*(nchim))/nreads);
- Print_Number(nchimbp,12,stdout);
+ Print_Number(nchimbp,blog,stdout);
printf(" (%5.1f%%) bases\n",(100.*(nchimbp))/totlen);
printf("\n");
printf(" Clipped: ");
- Print_Number(n5trm+n3trm+nelim+nchim,7,stdout);
+ Print_Number(n5trm+n3trm+nelim+nchim,rlog,stdout);
printf(" clips ");
- Print_Number(n5trmbp+n3trmbp+nelimbp+nchimbp,12,stdout);
+ Print_Number(n5trmbp+n3trmbp+nelimbp+nchimbp,blog,stdout);
printf(" (%5.1f%%) bases\n",(100.*(n5trmbp+n3trmbp+nelimbp+nchimbp))/totlen);
printf(" Patched: ");
- Print_Number(nlowq+nspan,7,stdout);
+ Print_Number(nlowq+nspan,rlog,stdout);
printf(" patches ");
- Print_Number(nlowqbp+nspanbp,12,stdout);
+ Print_Number(nlowqbp+nspanbp,blog,stdout);
printf(" (%5.1f%%) bases\n",(100.*(nlowqbp+nspanbp))/totlen);
free(root);
diff --git a/align.c b/align.c
index d4fdfe8..e408eb9 100644
--- a/align.c
+++ b/align.c
@@ -161,6 +161,7 @@ static double Bias_Factor[10] = { .690, .690, .690, .690, .780,
typedef struct
{ double ave_corr;
int trace_space;
+ int reach;
float freq[4];
int ave_path;
int16 *score;
@@ -196,7 +197,7 @@ static void set_table(int bit, int prefix, int score, int max, Table_Bits *parms
/* Create an alignment specification record including path tip tables & values */
-Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq)
+Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq, int reach)
{ _Align_Spec *spec;
Table_Bits parms;
double match;
@@ -208,6 +209,7 @@ Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq)
spec->ave_corr = ave_corr;
spec->trace_space = trace_space;
+ spec->reach = reach;
spec->freq[0] = freq[0];
spec->freq[1] = freq[1];
spec->freq[2] = freq[2];
@@ -257,6 +259,9 @@ int Trace_Spacing(Align_Spec *espec)
float *Base_Frequencies(Align_Spec *espec)
{ return (((_Align_Spec *) espec)->freq); }
+int Overlap_If_Possible(Align_Spec *espec)
+{ return (((_Align_Spec *) espec)->reach); }
+
/****************************************************************************************\
* *
@@ -343,6 +348,7 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
int TRACE_SPACE = spec->trace_space;
int PATH_AVE = spec->ave_path;
+ int REACH = spec->reach;
int16 *SCORE = spec->score;
int16 *TABLE = spec->table;
@@ -874,7 +880,7 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
int a, b, k, h;
int d, e;
- if (morem >= 0)
+ if (morem >= 0 && REACH)
{ trimx = morea-morey;
trimy = morey;
trimd = mored;
@@ -1004,6 +1010,7 @@ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
int TRACE_SPACE = spec->trace_space;
int PATH_AVE = spec->ave_path;
+ int REACH = spec->reach;
int16 *SCORE = spec->score;
int16 *TABLE = spec->table;
@@ -1527,7 +1534,7 @@ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
int a, b, k, h;
int d, e;
- if (morem >= 0)
+ if (morem >= 0 && REACH)
{ trimx = morea-morey;
trimy = morey;
trimd = mored;
@@ -4194,7 +4201,10 @@ static int ToA[4] = { 'a', 'c', 'g', 't' };
#endif
-static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode)
+static char *TP_Align =
+ "Bad alignment between trace points (Compute_Trace), source DB likely incorrect";
+
+static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode, int dmax)
{ int **PVF = wave->PVF;
int **PHF = wave->PHF;
int D;
@@ -4226,17 +4236,21 @@ static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode)
hgh = 0;
}
- posl = -INT32_MAX;
- posh = INT32_MAX;
+ posl = -dmax;
+ posh = dmax;
if (wave->Aabs == wave->Babs)
{ if (B == A)
- { EPRINTF(EPLACE,"Error: self comparison starts on diagonal 0 (Compute_Trace)\n");
+ { EPRINTF(EPLACE,"%s: self comparison starts on diagonal 0 (Compute_Trace)\n",Prog_Name);
EXIT(-1);
}
else if (B < A)
- posl = (B-A)+1;
+ { if ((B-A)+1 > posl)
+ posl = (B-A)+1;
+ }
else
- posh = (B-A)-1;
+ { if ((B-A)-1 < posh)
+ posh = (B-A)-1;
+ }
}
F1 = PVF[-2];
@@ -4254,6 +4268,11 @@ static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode)
int am, ac, ap;
char *a;
+ if (D > dmax)
+ { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Align);
+ EXIT(-1);
+ }
+
F2 = F1;
F1 = F0;
F0 = PVF[D];
@@ -4523,7 +4542,7 @@ static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode)
return (D + abs(del));
}
-static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode)
+static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode, int dmax)
{ int **PVF = wave->PVF;
int **PHF = wave->PHF;
int D;
@@ -4555,17 +4574,21 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode
hgh = 0;
}
- posl = -INT32_MAX;
- posh = INT32_MAX;
+ posl = -dmax;
+ posh = dmax;
if (wave->Aabs == wave->Babs)
{ if (B == A)
- { EPRINTF(EPLACE,"Error: self comparison starts on diagonal 0 (Compute_Trace)\n");
+ { EPRINTF(EPLACE,"%s: self comparison starts on diagonal 0 (Compute_Trace)\n",Prog_Name);
EXIT(1);
}
else if (B < A)
- posl = (B-A)+1;
+ { if ((B-A)+1 > posl)
+ posl = (B-A)+1;
+ }
else
- posh = (B-A)-1;
+ { if ((B-A)-1 < posh)
+ posh = (B-A)-1;
+ }
}
F1 = PVF[-2];
@@ -4583,6 +4606,11 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode
int am, ac, ap;
char *a;
+ if (D > dmax)
+ { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Align);
+ EXIT(-1);
+ }
+
F2 = F1;
F1 = F0;
F0 = PVF[D];
@@ -4795,14 +4823,20 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode
* *
\****************************************************************************************/
+static char *TP_Error = "Trace point out of bounds (Compute_Trace), source DB likely incorrect";
+
int Compute_Trace_ALL(Alignment *align, Work_Data *ework)
{ _Work_Data *work = (_Work_Data *) ework;
Trace_Waves wave;
Path *path;
char *aseq, *bseq;
+ int alen, blen;
int M, N, D;
+ int dmax;
+ alen = align->alen;
+ blen = align->blen;
path = align->path;
aseq = align->aseq;
bseq = align->bseq;
@@ -4812,7 +4846,6 @@ int Compute_Trace_ALL(Alignment *align, Work_Data *ework)
{ int64 s;
int d;
- int dmax;
int **PVF, **PHF;
if (M < N)
@@ -4851,7 +4884,11 @@ int Compute_Trace_ALL(Alignment *align, Work_Data *ework)
wave.Aabs = aseq;
wave.Babs = bseq;
- D = iter_np(aseq+path->abpos,M,bseq+path->bbpos,N,&wave,GREEDIEST);
+ if (path->aepos > alen || path->bepos > blen)
+ { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error);
+ EXIT(1);
+ }
+ D = iter_np(aseq+path->abpos,M,bseq+path->bbpos,N,&wave,GREEDIEST,dmax);
if (D < 0)
EXIT(1);
path->diffs = D;
@@ -4867,12 +4904,15 @@ int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int
Path *path;
char *aseq, *bseq;
+ int alen, blen;
uint16 *points;
int tlen;
int ab, bb;
int ae, be;
- int diffs;
+ int diffs, dmax;
+ alen = align->alen;
+ blen = align->blen;
path = align->path;
aseq = align->aseq;
bseq = align->bseq;
@@ -4882,7 +4922,7 @@ int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int
{ int64 s;
int d;
int M, N;
- int dmax, nmax;
+ int nmax;
int **PVF, **PHF;
M = path->aepos-path->abpos;
@@ -4938,7 +4978,11 @@ int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int
for (i = 1; i < tlen; i += 2)
{ ae = ae + trace_spacing;
be = bb + points[i];
- d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode);
+ if (ae > alen || be > blen)
+ { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error);
+ EXIT(1);
+ }
+ d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax);
if (d < 0)
EXIT(1);
diffs += d;
@@ -4947,7 +4991,11 @@ int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int
}
ae = path->aepos;
be = path->bepos;
- d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode);
+ if (ae > alen || be > blen)
+ { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error);
+ EXIT(1);
+ }
+ d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax);
if (d < 0)
EXIT(1);
diffs += d;
@@ -4966,12 +5014,15 @@ int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing, int
Path *path;
char *aseq, *bseq;
+ int alen, blen;
uint16 *points;
int tlen;
int ab, bb;
int ae, be;
- int diffs;
+ int diffs, dmax;
+ alen = align->alen;
+ blen = align->blen;
path = align->path;
aseq = align->aseq;
bseq = align->bseq;
@@ -4981,7 +5032,7 @@ int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing, int
{ int64 s;
int d;
int M, N;
- int dmax, nmax;
+ int nmax;
int **PVF, **PHF;
M = path->aepos-path->abpos;
@@ -5039,11 +5090,15 @@ int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing, int
for (i = 1; i < tlen; i += 2)
{ ae = ae + trace_spacing;
be = bb + points[i];
- if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode))
+ if (ae > alen || be > blen)
+ { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error);
+ EXIT(1);
+ }
+ if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax))
EXIT(1);
af = wave.mida;
bf = wave.midb;
- d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode);
+ d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode,dmax);
if (d < 0)
EXIT(1);
diffs += d;
@@ -5056,18 +5111,22 @@ int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing, int
ae = path->aepos;
be = path->bepos;
- if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode))
+ if (ae > alen || be > blen)
+ { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error);
+ EXIT(1);
+ }
+ if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax))
EXIT(1);
af = wave.mida;
bf = wave.midb;
- d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode);
+ d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode,dmax);
if (d < 0)
EXIT(1);
diffs += d;
as = af;
bs = bf;
- d += iter_np(aseq+af,ae-as,bseq+bf,be-bs,&wave,mode);
+ d += iter_np(aseq+af,ae-as,bseq+bf,be-bs,&wave,mode,dmax);
if (d < 0)
EXIT(1);
diffs += d;
@@ -5086,12 +5145,15 @@ int Compute_Trace_IRR(Alignment *align, Work_Data *ework, int mode)
Path *path;
char *aseq, *bseq;
+ int alen, blen;
uint16 *points;
int tlen;
int ab, bb;
int ae, be;
- int diffs;
+ int diffs, dmax;
+ alen = align->alen;
+ blen = align->blen;
path = align->path;
aseq = align->aseq;
bseq = align->bseq;
@@ -5101,7 +5163,7 @@ int Compute_Trace_IRR(Alignment *align, Work_Data *ework, int mode)
{ int64 s;
int d;
int M, N;
- int mmax, nmax, dmax;
+ int mmax, nmax;
int **PVF, **PHF;
M = path->aepos-path->abpos;
@@ -5160,7 +5222,11 @@ int Compute_Trace_IRR(Alignment *align, Work_Data *ework, int mode)
for (i = 0; i < tlen; i += 2)
{ ae = ab + points[i];
be = bb + points[i+1];
- d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode);
+ if (ae > alen || be > blen)
+ { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error);
+ EXIT(1);
+ }
+ d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax);
if (d < 0)
EXIT(1);
diffs += d;
diff --git a/align.h b/align.h
index daa9151..d5f49a9 100644
--- a/align.h
+++ b/align.h
@@ -138,6 +138,10 @@ typedef struct
#define CHAIN_NEXT(x) ((x) & NEXT_FLAG)
#define BEST_CHAIN(x) ((x) & BEST_FLAG)
+#define ELIM_FLAG 0x20 // This LA should be ignored
+
+#define ELIM(x) ((x) & ELIM_FLAG)
+
typedef struct
{ Path *path;
uint32 flags; /* Pipeline status and complementation flags */
@@ -173,7 +177,9 @@ void Complement_Seq(char *a, int n);
description of 'trace' for Paths above)
freq[4]: a 4-element vector where afreq[0] = frequency of A, f(A), freq[1] = f(C),
freq[2] = f(G), and freq[3] = f(T). This vector is part of the header
- of every HITS database (see db.h).
+ of every DAZZ database (see db.h).
+ reach: a boolean, if set alignment extend to the boundary when reasonable, otherwise
+ the terminate only at suffix-positive points.
If an alignment cannot reach the boundary of the d.p. matrix with this condition (i.e.
overlap), then the last/first 30 columns of the alignment are guaranteed to be
@@ -187,13 +193,14 @@ void Complement_Seq(char *a, int n);
typedef void Align_Spec;
- Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq);
+ Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq, int reach);
void Free_Align_Spec(Align_Spec *spec);
int Trace_Spacing (Align_Spec *spec);
double Average_Correlation(Align_Spec *spec);
float *Base_Frequencies (Align_Spec *spec);
+ int Overlap_If_Possible(Align_Spec *spec);
/* Local_Alignment finds the longest significant local alignment between the sequences in
'align' subject to:
@@ -303,7 +310,7 @@ void Complement_Seq(char *a, int n);
/*** OVERLAP ABSTRACTION:
Externally, between modules an Alignment is modeled by an "Overlap" record, which
- (a) replaces the pointers to the two sequences with their ID's in the HITS data bases,
+ (a) replaces the pointers to the two sequences with their ID's in the DAZZ data bases,
(b) does not contain the length of the 2 sequences (must fetch from DB), and
(c) contains its path as a subrecord rather than as a pointer (indeed, typically the
corresponding Alignment record points at the Overlap's path sub-record). The trace pointer
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/dascrubber.git
More information about the debian-med-commit
mailing list