[med-svn] [dascrubber] 01/08: New upstream version 0~20171015
Afif Elghraoui
afif at moszumanska.debian.org
Sun Oct 22 10:35:20 UTC 2017
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to annotated tag debian/0_20171015-1
in repository dascrubber.
commit 12e74df14958f2d8419955513c8e65fe8f81e803
Author: Afif Elghraoui <afif at debian.org>
Date: Thu Oct 19 23:51:04 2017 -0400
New upstream version 0~20171015
---
DASedit.c | 856 ++++++++++++++++++++++++++++++++++++++++++
DASmap.c | 293 +++++++++++++++
DASpatch.c | 893 ++++++++++++++++++++++++++++++++++++++++++++
DASqv.c | 47 ++-
DASrealign.c | 1166 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
DAStrim.c | 954 ++++++++++++++++++++++++++++++++++++++---------
DB.c | 105 +++++-
DB.h | 25 +-
Makefile | 22 +-
QV.c | 94 +++++
QV.h | 3 +
README | 47 ---
README.md | 215 +++++++++++
REPqv.c | 188 ++++++++++
REPtrim.c | 303 +++++++++++++++
align.c | 177 +++++----
align.h | 27 +-
17 files changed, 5093 insertions(+), 322 deletions(-)
diff --git a/DASedit.c b/DASedit.c
new file mode 100644
index 0000000..63e7a6f
--- /dev/null
+++ b/DASedit.c
@@ -0,0 +1,856 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2015, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
+ * Given the "trim" track patching directions, produce a new database <X>.E.db from
+ * <X>.db containing all the patched and cut images of the original reads.
+ *
+ * Author: Gene Myers
+ * Date : August 2016
+ *
+ *******************************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <math.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "DB.h"
+#include "align.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#else
+#define PATHSEP "/"
+#endif
+
+#undef DEBUG_PATCHING
+#undef SHOW_PIECES
+#undef DEBUG_MAPPING
+#undef DEBUG_INDEX
+
+static char *Usage = "[-v] [-x<int>] <source:db> <target:db>";
+
+// Gap states
+
+#define LOWQ 0 // Gap is spanned by many LAs and patchable
+#define SPAN 1 // Gap has many paired LAs and patchable
+#define SPLIT 2 // Gap is a chimer or an unpatchable gap
+#define ADPRE 3 // Gap is an adatper break and prefix should be removed
+#define ADSUF 4 // Gap is an adatper break and suffix should be removed
+
+// Global Variables (must exist across the processing of each pile)
+
+ // Read-only
+
+static HITS_DB _DB, *DB = &_DB; // Data base
+
+static int64 *TRIM_IDX;
+static int *TRIM;
+
+static int64 *PATCH_IDX;
+static int *PATCH;
+
+
+void Print_Seq(char *target, int tlen)
+{ static int letter[4] = { 'a', 'c', 'g', 't' };
+ int i;
+
+ for (i = 0; i < tlen; i++)
+ { if (i%100 == 0 && i != 0)
+ printf("\n");
+ printf("%c",letter[(int) target[i]]);
+ }
+ printf("\n");
+}
+
+#define STACK_SIZE 50
+
+static char *BSTACK[STACK_SIZE];
+
+static int Load_Model(int *patch, char *target, int depth)
+{ int bread, bbeg, bend;
+ int lend, rend;
+ int gb, ge, pb;
+ int tlen, plen;
+ char *bseq;
+
+ if (BSTACK[depth] == NULL)
+ BSTACK[depth] = New_Read_Buffer(DB);
+
+ bread = patch[0];
+ if (bread < 0)
+ bread = - (bread+1);
+ else
+ bread -= 1;
+ bbeg = patch[1];
+ bend = patch[2];
+
+#ifdef DEBUG_PATCHING
+ printf("%*sPatching %d%c[%d,%d]->[%d,%d]\n",
+ 2*depth,"",bread,(patch[0]<0?'c':'n'),patch[1],patch[2],bbeg,bend);
+#endif
+
+ bseq = Load_Subread(DB,bread,bbeg,bend,BSTACK[depth],0) - bbeg;
+
+ gb = TRIM_IDX[bread];
+ ge = TRIM_IDX[bread+1];
+ pb = PATCH_IDX[bread];
+
+ tlen = 0;
+ rend = -1;
+ for (; gb < ge; gb += 3)
+ { lend = TRIM[gb];
+ if (lend > bbeg)
+ { if (rend < bbeg || lend > bend || PATCH[pb] == 0)
+ return (-1);
+ plen = Load_Model(PATCH+pb,target+tlen,depth+1);
+ if (plen < 0)
+ return (-1);
+ tlen += plen;
+
+ rend = TRIM[gb+1];
+ if (rend > bend)
+ rend = bend;
+ memmove(target+tlen,bseq+lend,rend-lend);
+ tlen += rend-lend;
+#ifdef DEBUG_PATCHING
+ printf("%*s Piece %d,%d\n",2*depth,"",lend,rend);
+#endif
+#ifdef SHOW_PIECES
+ Print_Seq(bseq+lend,rend-lend);
+#endif
+ pb += 3;
+ }
+ else // lend <= bbeg
+ { rend = TRIM[gb+1];
+ if (rend > bbeg)
+ { if (rend > bend)
+ rend = bend;
+ memmove(target+tlen,bseq+bbeg,rend-bbeg);
+ tlen += rend-bbeg;
+#ifdef DEBUG_PATCHING
+ printf("%*s Piece %d,%d\n",2*depth,"",bbeg,rend);
+#endif
+#ifdef SHOW_PIECES
+ Print_Seq(bseq+bbeg,rend-bbeg);
+#endif
+ }
+ else
+ pb += 3;
+ }
+ if (rend >= bend)
+ break;
+ }
+ if (gb >= ge)
+ fprintf(stderr,"Fatal: Should not happen\n");
+
+ if (patch[0] < 0)
+ Complement_Seq(target,tlen);
+
+ return (tlen);
+}
+
+int main(int argc, char *argv[])
+{ HITS_READ *reads;
+ int nreads;
+ int64 boff;
+
+ int nnew, nmax;
+ int64 ntot;
+
+ int *segfate;
+ int nsegs;
+
+ char *pwd1, *root1; // inputs
+ char *pwd2, *root2;
+ int VERBOSE;
+ int CUTOFF;
+ int GOOD_QV;
+ int BAD_QV;
+ int COVERAGE;
+ int HGAP_MIN;
+
+ int nfiles; // contents of source .db file
+ int nblocks;
+ int64 bsize;
+ char **flist = NULL;
+ char **plist = NULL;
+ int *findx = NULL;
+ int *bindx = NULL;
+
+ FILE *NB_FILE; // files for writing target
+ FILE *IDX_FILE;
+ FILE *BPS_FILE;
+ FILE *MP_AFILE;
+ FILE *MP_DFILE;
+
+ // Process arguments
+
+ { int i, j, k;
+ int flags[128];
+ char *eptr;
+
+ ARG_INIT("DASedit")
+
+ CUTOFF = 0;
+
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ switch (argv[i][1])
+ { default:
+ ARG_FLAGS("v")
+ break;
+ case 'x':
+ ARG_NON_NEGATIVE(CUTOFF,"Min read length cutoff")
+ break;
+ }
+ else
+ argv[j++] = argv[i];
+ argc = j;
+
+ VERBOSE = flags['v'];
+
+ if (argc != 3)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+ }
+
+ // Open source and make target has a different pwd/name
+
+ { int status;
+
+ status = Open_DB(argv[1],DB);
+ if (status < 0)
+ exit (1);
+ if (status == 1)
+ { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ if (DB->part)
+ { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+
+ pwd1 = PathTo(argv[1]);
+ root1 = Root(argv[1],".db");
+
+ pwd2 = PathTo(argv[2]);
+ root2 = Root(argv[2],".db");
+
+ if (strcmp(root1,root2) == 0 && strcmp(pwd1,pwd2) == 0)
+ { fprintf(stderr,"%s: source and target are the same !\n",Prog_Name);
+ exit (1);
+ }
+ }
+
+ // Load the file and block structure of the source database
+
+ { int i;
+ int nid, oid;
+ int cutoff, allflag, ufirst;
+ FILE *dstub;
+
+ dstub = Fopen(Catenate(pwd1,"/",root1,".db"),"r");
+ if (dstub == NULL)
+ exit (1);
+
+ if (fscanf(dstub,DB_NFILE,&nfiles) != 1)
+ { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root1);
+ exit (1);
+ }
+
+ flist = (char **) Malloc(sizeof(char *)*nfiles,"Allocating file list");
+ plist = (char **) Malloc(sizeof(char *)*nfiles,"Allocating prolog list");
+ findx = (int *) Malloc(sizeof(int *)*nfiles,"Allocating file index");
+ if (flist == NULL || plist == NULL || findx == NULL)
+ exit (1);
+
+ for (i = 0; i < nfiles; i++)
+ { char prolog[MAX_NAME], fname[MAX_NAME];
+
+ if (fscanf(dstub,DB_FDATA,findx+i,fname,prolog) != 3)
+ { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root1);
+ exit (1);
+ }
+ if ((flist[i] = Strdup(fname,"Adding to file list")) == NULL)
+ exit (1);
+ if ((plist[i] = Strdup(prolog,"Adding to prolog list")) == NULL)
+ exit (1);
+ }
+
+ if (fscanf(dstub,DB_NBLOCK,&nblocks) != 1)
+ { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root1);
+ exit (1);
+ }
+ if (fscanf(dstub,DB_PARAMS,&bsize,&cutoff,&allflag) != 3)
+ { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root1);
+ exit (1);
+ }
+
+ bindx = (int *) Malloc(sizeof(int *)*(nblocks+1),"Allocating block indices");
+ if (bindx == NULL)
+ exit (1);
+
+ for (i = 0; i <= nblocks; i++)
+ if (fscanf(dstub,DB_BDATA,&ufirst,bindx+i) != 2)
+ { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root1);
+ exit (1);
+ }
+
+ fclose(dstub);
+
+ // map read counts for each file into trimmed read counts for each file
+
+ if (allflag)
+ allflag = 0;
+ else
+ allflag = DB_BEST;
+ reads = DB->reads;
+
+ nid = 0;
+ oid = 0;
+ for (i = 0; i < nfiles; i++)
+ { while (oid < findx[i])
+ { if ((reads[oid].flags & DB_BEST) >= allflag && reads[oid].rlen >= cutoff)
+ nid++;
+ oid += 1;
+ }
+ findx[i] = nid;
+ }
+ }
+
+ // Can now trim source DB. Also load .trim and .patch tracks
+
+ Trim_DB(DB);
+
+ reads = DB->reads;
+ nreads = DB->nreads;
+
+ { HITS_TRACK *track;
+ int i;
+
+ track = Load_Track(DB,"trim");
+ if (track != NULL)
+ { FILE *afile;
+ int size, tracklen, extra;
+
+ TRIM_IDX = (int64 *) track->anno;
+ TRIM = (int *) track->data;
+ for (i = 0; i <= nreads; i++)
+ TRIM_IDX[i] /= sizeof(int);
+
+ // if newer .trim tracks with -g, -b, -c, -H meta data, grab it
+
+ afile = fopen(Catenate(DB->path,".","trim",".anno"),"r");
+ fread(&tracklen,sizeof(int),1,afile);
+ fread(&size,sizeof(int),1,afile);
+ fseeko(afile,0,SEEK_END);
+ extra = ftell(afile) - (size*(tracklen+1) + 2*sizeof(int));
+ fseeko(afile,-extra,SEEK_END);
+ if (extra == 4*sizeof(int))
+ { fread(&GOOD_QV,sizeof(int),1,afile);
+ fread(&BAD_QV,sizeof(int),1,afile);
+ fread(&COVERAGE,sizeof(int),1,afile);
+ fread(&HGAP_MIN,sizeof(int),1,afile);
+ }
+ else if (extra == 3*sizeof(int))
+ { fread(&GOOD_QV,sizeof(int),1,afile);
+ fread(&BAD_QV,sizeof(int),1,afile);
+ fread(&COVERAGE,sizeof(int),1,afile);
+ HGAP_MIN = 0;
+ }
+ else if (extra == 2*sizeof(int))
+ { fread(&GOOD_QV,sizeof(int),1,afile);
+ fread(&BAD_QV,sizeof(int),1,afile);
+ COVERAGE = -1;
+ HGAP_MIN = 0;
+ }
+ else
+ { fprintf(stderr,"%s: trim annotation is out of date, rerun DAStrim\n",Prog_Name);
+ exit (1);
+ }
+ fclose(afile);
+
+ { int a, t, x;
+ int tb, te;
+
+ x = 0;
+ for (a = 0; a < DB->nreads; a++)
+ { tb = TRIM_IDX[a];
+ te = TRIM_IDX[a+1];
+ if (tb+2 < te)
+ { if (TRIM[tb+2] == ADPRE)
+ tb += 3;
+ if (TRIM[te-3] == ADSUF)
+ te -= 3;
+ }
+ TRIM_IDX[a] = x;
+ for (t = tb; t < te; t++)
+ TRIM[x++] = TRIM[t];
+ }
+ TRIM_IDX[DB->nreads] = x;
+ }
+ }
+ else
+ { fprintf(stderr,"%s: Must have a 'trim' track, run DAStrim\n",Prog_Name);
+ exit (1);
+ }
+
+ track = Load_Track(DB,"patch");
+ if (track != NULL)
+ { PATCH_IDX = (int64 *) track->anno;
+ PATCH = (int *) track->data;
+ for (i = 0; i <= nreads; i++)
+ PATCH_IDX[i] /= sizeof(int);
+ }
+ else
+ { fprintf(stderr,"%s: Must have a 'patch' track, run DASfix\n",Prog_Name);
+ exit (1);
+ }
+ }
+
+ // Allocate segment status: will have value of defined constants below or if > 0
+ // the the segment is followed by a patch of given size.
+ // Also open new db files here, to be certain they can be so manipulated before
+ // doing anything that would need to be reversed.
+
+#define SHORT_LAST -4
+#define GOOD_LAST -3
+#define TRIMMED -2
+#define SHORT -1
+#define GOOD 0
+
+ nsegs = nreads + PATCH_IDX[nreads]/3;
+
+ segfate = Malloc(sizeof(int)*nsegs,"Allocating block status vector");
+
+ NB_FILE = Fopen(Catenate(pwd2,"/",root2,".db"),"w");
+ IDX_FILE = Fopen(Catenate(pwd2,PATHSEP,root2,".idx"),"w");
+ BPS_FILE = Fopen(Catenate(pwd2,PATHSEP,root2,".bps"),"w");
+ MP_AFILE = Fopen(Catenate(pwd2,PATHSEP,root2,".map.anno"),"w");
+ MP_DFILE = Fopen(Catenate(pwd2,PATHSEP,root2,".map.data"),"w");
+ if (NB_FILE == NULL || IDX_FILE == NULL || BPS_FILE == NULL ||
+ MP_AFILE == NULL || MP_DFILE == NULL)
+ exit (1);
+
+ // Patch all the reads creating a new compressed .bps file of said for the new
+ // DB. Further tally the total bytes and number of cuts, and also produce
+ // the .map track.
+
+ { int i, ni, bi;
+ int gb, ge, pb;
+ char *target, *aseq;
+ int64 MP_INDEX;
+
+ int64 ntrim, nshort;
+ int64 nttot, nstot;
+ int64 htrim, httot;
+
+ ni = 0;
+ fwrite(&ni,sizeof(int),1,MP_AFILE);
+ ni = 8;
+ fwrite(&ni,sizeof(int),1,MP_AFILE);
+ MP_INDEX = 0;
+ fwrite(&MP_INDEX,sizeof(int64),1,MP_AFILE);
+
+ for (i = 0; i < STACK_SIZE; i++)
+ BSTACK[i] = NULL;
+
+ target = New_Read_Buffer(DB);
+ aseq = BSTACK[0] = New_Read_Buffer(DB);
+ segfate[0] = GOOD_LAST;
+
+ ntrim = nshort = 0;
+ nstot = nttot = 0;
+ htrim = httot = 0;
+ ntot = 0;
+ nnew = nmax = 0;
+
+ boff = 0;
+ ni = 0;
+ bi = 0;
+ pb = 0;
+ ge = 0;
+ for (i = 0; i < nreads; i++)
+ { int tlen, clen;
+ int lend, rend, blen;
+ int gb1, bi1;
+
+ gb = ge;
+ ge = TRIM_IDX[i+1];
+
+#ifdef DEBUG_PATCHING
+ printf("Doing %d\n",i);
+#endif
+
+ if (reads[i].rlen < HGAP_MIN)
+ { segfate[bi++] = TRIMMED;
+ htrim += 1;
+ httot += reads[i].rlen;
+ continue;
+ }
+
+ if (ge <= gb)
+ { segfate[bi++] = TRIMMED;
+ ntrim += 1;
+ nttot += reads[i].rlen;
+ continue;
+ }
+
+ Load_Read(DB,i,aseq,0);
+
+ nttot += TRIM[gb];
+
+ gb1 = gb;
+ bi1 = bi;
+ tlen = 0;
+ for ( ; gb < ge; gb += 3)
+ { lend = TRIM[gb];
+ rend = TRIM[gb+1];
+ blen = rend - lend;
+
+ memmove(target+tlen,aseq+lend,blen);
+ tlen += blen;
+#ifdef DEBUG_PATCHING
+ printf(" Piece %d,%d (%d)\n",lend,rend,bi);
+#endif
+#ifdef SHOW_PIECES
+ Print_Seq(aseq+lend,blen);
+#endif
+
+ if (gb+3 < ge)
+ { if (PATCH[pb] != 0)
+ clen = Load_Model(PATCH+pb,target+tlen,1);
+ else
+ clen = -1;
+ pb += 3;
+
+ if (clen >= 0)
+ { tlen += clen;
+ segfate[bi++] = clen;
+ continue;
+ }
+ }
+
+ if (tlen >= CUTOFF)
+ { Compress_Read(tlen,target);
+ clen = COMPRESSED_LEN(tlen);
+ fwrite(target,1,clen,BPS_FILE);
+ boff += clen;
+
+#ifdef DEBUG_PATCHING
+ printf(" Output %d(%d)\n",ni,tlen);
+#endif
+ ni += 1;
+
+ fwrite(&i,sizeof(int),1,MP_DFILE);
+ fwrite(&(reads[i].rlen),sizeof(int),1,MP_DFILE);
+ MP_INDEX += 2*sizeof(int);
+ while (gb1 < gb)
+ { fwrite(TRIM+gb1,sizeof(int),1,MP_DFILE);
+ fwrite(TRIM+(gb1+1),sizeof(int),1,MP_DFILE);
+ fwrite(segfate+bi1,sizeof(int),1,MP_DFILE);
+ MP_INDEX += 3*sizeof(int);
+ gb1 += 3;
+ bi1 += 1;
+ }
+ fwrite(TRIM+gb1,sizeof(int),1,MP_DFILE);
+ fwrite(TRIM+(gb1+1),sizeof(int),1,MP_DFILE);
+ MP_INDEX += 2*sizeof(int);
+ fwrite(&MP_INDEX,sizeof(int64),1,MP_AFILE);
+
+ gb1 = gb+3;
+ if (gb1 <= ge)
+ segfate[bi++] = GOOD;
+ else
+ segfate[bi++] = GOOD_LAST;
+ bi1 = bi;
+ nnew += 1;
+ ntot += tlen;
+ if (tlen > nmax)
+ nmax = tlen;
+ }
+ else
+ { gb1 = gb+3;
+ if (gb1 <= ge)
+ segfate[bi++] = SHORT;
+ else
+ segfate[bi++] = SHORT_LAST;
+ bi1 = bi;
+ nshort += 1;
+ nstot += tlen;
+#ifdef DEBUG_PATCHING
+ printf(" Remove %d(%d)\n",ni,tlen);
+#endif
+ }
+ tlen = 0;
+ if (gb1 <= ge)
+ { nttot += TRIM[gb1]-rend;
+#ifdef DEBUG_PATCHING
+ printf(" Cutting %d,%d\n",rend,TRIM[gb1]);
+#endif
+ }
+ else
+ nttot += reads[i].rlen-rend;
+ }
+ }
+
+ nsegs = bi;
+ for (i = 0; i < STACK_SIZE; i++)
+ if (BSTACK[i] == NULL)
+ break;
+ else
+ free(BSTACK[i]-1);
+ free(target-1);
+
+ rewind(MP_AFILE);
+ fwrite(&ni,sizeof(int),1,MP_AFILE);
+
+ if (VERBOSE)
+ { printf("\n ");
+
+ if (htrim > 0)
+ { Print_Number(htrim,0,stdout);
+ printf(" reads and ");
+ Print_Number(httot,0,stdout);
+ printf(" bases in reads < H-length (%d)\n\n ",HGAP_MIN);
+ }
+
+ Print_Number(DB->nreads-htrim,0,stdout);
+ printf(" reads and ");
+ Print_Number(DB->totlen-httot,0,stdout);
+ if (htrim > 0)
+ printf(" bases in reads >= H-length in source DB \n\n ");
+ else
+ printf(" bases in source DB\n\n ");
+
+ Print_Number(ntrim,0,stdout);
+ printf(" reads and ");
+ Print_Number(nttot,0,stdout);
+ printf(" bases were trimmmed by scrubbing (DAStrim)\n\n ");
+
+ if (CUTOFF > 0)
+ { Print_Number(nshort,0,stdout);
+ printf(" edited reads < %d bases, totaling ",CUTOFF);
+ Print_Number(nstot,0,stdout);
+ printf(" bases were cut (-x option)\n\n ");
+ }
+
+ printf("The edited DB has ");
+ Print_Number((int64) nnew,0,stdout);
+ printf(" edited reads and ");
+ Print_Number(ntot,0,stdout);
+ printf(" bases\n");
+ }
+
+ fclose(BPS_FILE);
+ fclose(MP_AFILE);
+ fclose(MP_DFILE);
+ }
+
+
+ // Output the file structure for the new database, adjusting the number
+ // of reads in each file according to how reads are split, and also adjust
+ // block indices similarly. Further compress all read records that are trimmed
+ // or produce no edited reads.
+
+ { int i, s;
+ int oi, ni;
+ int nf, nb;
+
+#ifdef DEBUG_MAPPING
+ printf("\nMAPPING\n");
+#endif
+ nf = 0;
+ nb = 1;
+ oi = 0;
+ ni = 0;
+ for (i = 0; i < nsegs; i++)
+ { s = segfate[i];
+ if (s <= 0)
+ { if (s == GOOD || s == GOOD_LAST)
+ ni += 1;
+ if (s <= TRIMMED)
+ { oi += 1;
+ if (oi == findx[nf])
+ { findx[nf++] = ni;
+#ifdef DEBUG_MAPPING
+ printf(" %2d: %d->%d\n",nf-1,oi,ni);
+#endif
+ }
+ if (oi == bindx[nb])
+ { bindx[nb++] = ni;
+#ifdef DEBUG_MAPPING
+ printf(" %2d: %d->%d\n",nb-1,oi,ni);
+#endif
+ }
+ }
+ }
+ }
+
+#ifdef DEBUG_MAPPING
+ printf(" Total reads = %d Trimmed Reads = %d\n",ni,oi);
+ if (nf != nfiles)
+ printf(" File count not correct %d %d\n",nf,nfiles);
+ if (nb != nblocks+1)
+ printf(" Block count not correct %d %d\n",nb,nblocks+1);
+ fflush(stdout);
+#endif
+
+ fprintf(NB_FILE,DB_NFILE,nfiles);
+
+ for (i = 0; i < nfiles; i++)
+ fprintf(NB_FILE,DB_FDATA,findx[i],flist[i],plist[i]);
+
+ fprintf(NB_FILE,DB_NBLOCK,nblocks);
+ fprintf(NB_FILE,DB_PARAMS,bsize,CUTOFF,1);
+
+ for (i = 0; i <= nblocks; i++)
+ fprintf(NB_FILE,DB_BDATA,bindx[i],bindx[i]);
+
+ fclose(NB_FILE);
+ }
+
+ { int i, s;
+ int bi, gb;
+ int tlen, first;
+ HITS_DB NB;
+ HITS_READ newrec;
+#ifdef DEBUG_INDEX
+ int ni;
+#endif
+
+ // Write an index of the patched reads
+
+ NB = *DB;
+ NB.ureads = nnew;
+ NB.treads = nnew;
+ NB.cutoff = CUTOFF;
+ NB.allarr = DB->allarr | DB_ALL;
+ NB.totlen = ntot;
+ NB.maxlen = nmax;
+
+ fwrite(&NB,sizeof(HITS_DB),1,IDX_FILE);
+
+#ifdef DEBUG_INDEX
+ printf("\nINDEXING\n");
+ ni = 0;
+#endif
+
+ boff = 0;
+ i = 0;
+ gb = 0;
+ bi = 0;
+ while (bi < nsegs)
+ { tlen = 0;
+ first = TRIM[gb];
+ while ((s = segfate[bi++]) > 0)
+ { tlen += (TRIM[gb+1] - TRIM[gb]) + s;
+#ifdef DEBUG_INDEX
+ printf(" [%d,%d] %d",TRIM[gb],TRIM[gb+1],s);
+#endif
+ gb += 3;
+ }
+ if (s == GOOD || s == GOOD_LAST)
+ { tlen += (TRIM[gb+1] - TRIM[gb]);
+#ifdef DEBUG_INDEX
+ printf(" [%d,%d]",TRIM[gb],TRIM[gb+1]);
+ printf(" GOOD %d: (%d,%d)\n",i,ni++,bi);
+#endif
+ newrec.origin = reads[i].origin;
+ newrec.fpulse = reads[i].fpulse + first;
+ newrec.rlen = tlen;
+ newrec.boff = boff;
+ newrec.coff = i;
+ newrec.flags = (reads[i].flags & DB_QV) | DB_BEST;
+ if (segfate[bi] > TRIMMED)
+ newrec.flags |= DB_CSS;
+ fwrite(&newrec,sizeof(HITS_READ),1,IDX_FILE);
+ boff += COMPRESSED_LEN(tlen);
+
+ if (s == GOOD)
+ gb += 3;
+ else
+ { gb += 2;
+ i += 1;
+ }
+ }
+ else if (s == TRIMMED)
+ {
+#ifdef DEBUG_INDEX
+ printf(" TRIM %d: (%d)\n",i,bi);
+#endif
+ i += 1;
+ }
+ else // s == SHORT || s == SHORT_LAST
+ {
+#ifdef DEBUG_INDEX
+ printf(" [%d,%d]",TRIM[gb],TRIM[gb+1]);
+ printf(" SHRT %d: (%d)\n",i,bi);
+#endif
+ if (s == SHORT)
+ gb += 3;
+ else
+ { gb += 2;
+ i += 1;
+ }
+ }
+ }
+
+#ifdef DEBUG_INDEX
+ printf("Finish %d %d %lld\n",ni,i,boff);
+#endif
+
+ fclose(IDX_FILE);
+ }
+
+ free(pwd2);
+ free(root2);
+ free(pwd1);
+ free(root1);
+ free(Prog_Name);
+
+ exit (0);
+}
diff --git a/DASmap.c b/DASmap.c
new file mode 100644
index 0000000..83adfa9
--- /dev/null
+++ b/DASmap.c
@@ -0,0 +1,293 @@
+/*******************************************************************************************
+ *
+ * Display a specified set of reads of a database in fasta format.
+ *
+ * Author: Gene Myers
+ * Date : September 2013
+ * Mod : With DB overhaul, made this a routine strictly for printing a selected subset
+ * and created DB2fasta for recreating all the fasta files of a DB
+ * Date : April 2014
+ * Mod : Added options to display QV streams
+ * Date : July 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "DB.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#else
+#define PATHSEP "/"
+#endif
+
+static char Usage[] = "[-p] <path:db|dam> [ <reads:FILE> | <reads:range> ... ]";
+
+#define LAST_READ_SYMBOL '$'
+#define MAX_BUFFER 10001
+
+typedef struct
+ { FILE *input;
+ int lineno;
+ int read;
+ int beg;
+ int end;
+ } File_Iterator;
+
+File_Iterator *init_file_iterator(FILE *input)
+{ File_Iterator *it;
+
+ it = Malloc(sizeof(File_Iterator),"Allocating file iterator");
+ it->input = input;
+ it->lineno = 1;
+ rewind(input);
+ return (it);
+}
+
+int next_read(File_Iterator *it)
+{ static char nbuffer[MAX_BUFFER];
+
+ char *eol;
+ int x;
+
+ if (fgets(nbuffer,MAX_BUFFER,it->input) == NULL)
+ { if (feof(it->input))
+ return (1);
+ SYSTEM_ERROR;
+ }
+ if ((eol = index(nbuffer,'\n')) == NULL)
+ { fprintf(stderr,"%s: Line %d in read list is longer than %d chars!\n",
+ Prog_Name,it->lineno,MAX_BUFFER-1);
+ return (1);
+ }
+ *eol = '\0';
+ x = sscanf(nbuffer," %d %d %d",&(it->read),&(it->beg),&(it->end));
+ if (x == 1)
+ it->beg = -1;
+ else if (x != 3)
+ { fprintf(stderr,"%s: Line %d of read list is improperly formatted\n",Prog_Name,it->lineno);
+ return (1);
+ }
+ it->lineno += 1;
+ return (0);
+}
+
+int main(int argc, char *argv[])
+{ HITS_DB _db, *db = &_db;
+ HITS_TRACK *map;
+
+ int reps, *pts;
+ int input_pts;
+ File_Iterator *iter = NULL;
+ FILE *input;
+
+ int PRETTY;
+
+ // Process arguments
+
+ { int i, j, k;
+ int flags[128];
+
+ ARG_INIT("DASmap")
+
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ ARG_FLAGS("p")
+ else
+ argv[j++] = argv[i];
+ argc = j;
+
+ PRETTY = flags['p'];
+
+ if (argc <= 1)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+ }
+
+ // Open DB or DAM, and if a DAM open also .hdr file
+
+ { int status, kind;
+
+ status = Open_DB(argv[1],db);
+ if (status < 0)
+ exit (1);
+ if (status == 1)
+ { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ if (db->part)
+ { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+
+ status = Check_Track(db,"map",&kind);
+ if (status == -2)
+ { fprintf(stderr,"%s: 'map' track not found.\n",Prog_Name);
+ exit (1);
+ }
+ else if (status == -1)
+ { fprintf(stderr,"%s: Warning: 'map' track not sync'd with db.\n",Prog_Name);
+ exit (1);
+ }
+ map = Load_Track(db,"map");
+ }
+
+ // Process read index arguments into a list of read ranges
+
+ input_pts = 0;
+ if (argc == 3)
+ { if (argv[2][0] != LAST_READ_SYMBOL || argv[2][1] != '\0')
+ { char *eptr, *fptr;
+ int b, e;
+
+ b = strtol(argv[2],&eptr,10);
+ if (eptr > argv[2] && b > 0)
+ { if (*eptr == '-')
+ { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0')
+ { e = strtol(eptr+1,&fptr,10);
+ input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0);
+ }
+ }
+ else
+ input_pts = (*eptr != '\0');
+ }
+ else
+ input_pts = 1;
+ }
+ }
+
+ if (input_pts)
+ { input = Fopen(argv[2],"r");
+ if (input == NULL)
+ exit (1);
+
+ iter = init_file_iterator(input);
+ }
+ else
+ { pts = (int *) Malloc(sizeof(int)*2*(argc-1),"Allocating read parameters");
+ if (pts == NULL)
+ exit (1);
+
+ reps = 0;
+ if (argc > 2)
+ { int c, b, e;
+ char *eptr, *fptr;
+
+ for (c = 2; c < argc; c++)
+ { if (argv[c][0] == LAST_READ_SYMBOL)
+ { b = db->nreads;
+ eptr = argv[c]+1;
+ }
+ else
+ b = strtol(argv[c],&eptr,10);
+ if (eptr > argv[c])
+ { if (b <= 0)
+ { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b);
+ exit (1);
+ }
+ if (*eptr == 0)
+ { pts[reps++] = b;
+ pts[reps++] = b;
+ continue;
+ }
+ else if (*eptr == '-')
+ { if (eptr[1] == LAST_READ_SYMBOL)
+ { e = db->nreads;
+ fptr = eptr+2;
+ }
+ else
+ e = strtol(eptr+1,&fptr,10);
+ if (fptr > eptr+1 && *fptr == 0 && e > 0)
+ { pts[reps++] = b;
+ pts[reps++] = e;
+ if (b > e)
+ { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]);
+ exit (1);
+ }
+ continue;
+ }
+ }
+ }
+ fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]);
+ exit (1);
+ }
+ }
+ else
+ { pts[reps++] = 1;
+ pts[reps++] = db->nreads;
+ }
+ }
+
+ // Display each read (and/or QV streams) in the active DB according to the
+ // range pairs in pts[0..reps) and according to the display options.
+
+ { int c, b, e, i;
+ int64 *anno;
+ int *data;
+ int64 s, f, j;
+
+ anno = (int64 *) map->anno;
+ data = (int *) map->data;
+
+ c = 0;
+ while (1)
+ { if (input_pts)
+ { if (next_read(iter))
+ break;
+ e = iter->read;
+ b = e-1;
+ }
+ else
+ { if (c >= reps)
+ break;
+ b = pts[c]-1;
+ e = pts[c+1];
+ if (e > db->nreads)
+ e = db->nreads;
+ c += 2;
+ }
+
+ if (PRETTY)
+ for (i = b; i < e; i++)
+ { s = (anno[i] >> 2);
+ f = (anno[i+1] >> 2);
+ printf(" %d -> %d(%d)",i+1,data[s]+1,data[s+1]);
+ for (j = s+2; j < f; j += 3)
+ { printf(" [%d,%d]",data[j],data[j+1]);
+ if (j+2 < f)
+ printf(" %d",data[j+2]);
+ }
+ printf("\n");
+ }
+ else
+ for (i = b; i < e; i++)
+ { s = (anno[i] >> 2);
+ f = (anno[i+1] >> 2);
+ printf(" %d %d %d %lld",i+1,data[s]+1,data[s+1],(f-s)-2);
+ for (j = s+2; j < f; j += 3)
+ { printf(" %d %d",data[j],data[j+1]);
+ if (j+2 < f)
+ printf(" %d",data[j+2]);
+ }
+ printf("\n");
+ }
+ }
+ }
+
+ if (input_pts)
+ { fclose(input);
+ free(iter);
+ }
+ else
+ free(pts);
+
+ Close_DB(db);
+
+ exit (0);
+}
diff --git a/DASpatch.c b/DASpatch.c
new file mode 100644
index 0000000..53294b5
--- /dev/null
+++ b/DASpatch.c
@@ -0,0 +1,893 @@
+/*******************************************************************************************
+ *
+ * Using overlap pile for each read,intrinisic quality values, and trimmed hq-intervals
+ * for each read, determine the B-read and segment thereof to use to patch each low
+ * quality segment between hq-intervals.
+ *
+ * Author: Gene Myers
+ * Date : June 2016
+ *
+ *******************************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+
+#include "DB.h"
+#include "align.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#else
+#define PATHSEP "/"
+#endif
+
+#undef DEBUG_GAP_FILL
+#undef SHOW_PAIRS
+#undef DEBUG_SUMMARY
+
+// Command format and global parameter variables
+
+static char *Usage = " [-v] <source:db> <overlaps:las> ...";
+
+static int BAD_QV; // qv >= and you are "bad"
+static int GOOD_QV; // qv <= and you are "good"
+static int HGAP_MIN; // less than this length do not process for HGAP
+static int VERBOSE;
+
+// Gap states
+
+#define LOWQ 0 // Gap is spanned by many LAs and patchable
+#define SPAN 1 // Gap has many paired LAs and patchable
+#define SPLIT 2 // Gap is a chimer or an unpatchable gap
+#define ADPRE 3 // Gap is an adatper break and prefix should be removed
+#define ADSUF 4 // Gap is an adatper break and suffix should be removed
+#define NOPAT 3 // Gap could not be patched (internal only)
+
+#define COVER_LEN 400 // An overlap covers a point if it extends COVER_LEN to either side.
+#define ANCHOR_MATCH .25 // Delta in trace interval at both ends of patch must be < this %.
+
+static int ANCHOR_THRESH;
+
+
+// Global Variables (must exist across the processing of each pile)
+
+ // Read-only
+
+static int TRACE_SPACING; // Trace spacing (from .las file)
+
+static HITS_DB _DB, *DB = &_DB; // Data base
+static int DB_FIRST; // First read of DB to process
+static int DB_LAST; // Last read of DB to process (+1)
+static int DB_PART; // 0 if all, otherwise block #
+
+static int64 *QV_IDX; // qual track index
+static uint8 *QV; // qual track values
+
+static int64 *TRIM_IDX; // trim track index
+static int *TRIM; // trim track values
+
+ // Write-only
+
+static FILE *PR_AFILE; // .trim.anno
+static FILE *PR_DFILE; // .trim.data
+static int64 PR_INDEX; // Current index into .trim.data file as it is being written
+
+ // Statistics
+
+static int fpatch, npatch;
+
+// Data Structures
+
+typedef struct // General read interval [beg..end]
+ { int beg;
+ int end;
+ } Interval;
+
+
+/*******************************************************************************************
+ *
+ * FIND ANY UNREMOVED ADAPTER (OR POLYMERASE SWITCHES) AND TRIM SMALLER PARTS
+ *
+ ********************************************************************************************/
+
+typedef struct
+ { int bread; // bread^comp[beg..end] is the patch sequence
+ int comp;
+ int beg;
+ int end;
+
+ int anc; // maximum anchor interval match
+ int bad; // number of segments that are bad
+ int avg; // average QV of the patch
+ } Patch;
+
+// Evaluate the quality of lov->bread = rov->bread spaning [lcv,rcv] as a patch
+
+static Patch *compute_patch(int lft, int rgt, Overlap *lov, Overlap *rov)
+{ static Patch ans;
+
+ uint16 *tr;
+ int bread, bcomp, blen;
+ int bb, be;
+ int t, te;
+ int bl, br;
+ uint8 *qb;
+ int avg, anc, bad;
+
+ bread = lov->bread;
+ bcomp = COMP(lov->flags);
+ blen = DB->reads[bread].rlen;
+ if (blen < HGAP_MIN)
+ return (NULL);
+
+ if (lft > lov->path.aepos || rgt < rov->path.abpos) // Cannot anchor
+ return (NULL);
+ if (lov->path.abpos > lft-TRACE_SPACING || rgt+TRACE_SPACING > rov->path.aepos)
+ return (NULL);
+
+ // Get max of left and right anchors as anchor score
+
+ tr = (uint16 *) lov->path.trace;
+ te = 2 * (((lft + (TRACE_SPACING-1)) - lov->path.abpos)/TRACE_SPACING);
+ if (te == 0)
+ return (NULL);
+ anc = tr[te-2];
+
+ bb = lov->path.bbpos;
+ for (t = 1; t < te; t += 2)
+ bb += tr[t];
+
+ tr = (uint16 *) rov->path.trace;
+ te = 2 * (((rgt + (TRACE_SPACING-1)) - rov->path.abpos)/TRACE_SPACING);
+ if (te >= rov->path.tlen)
+ return (NULL);
+ if (tr[te] > anc)
+ anc = tr[te];
+
+ be = rov->path.bepos;
+ for (t = rov->path.tlen-1; t > te; t -= 2)
+ be -= tr[t];
+
+ if (bb >= be)
+ return (NULL);
+
+ // Compute metrics for b-read patch
+
+ if (bcomp)
+ { t = blen - be;
+ be = blen - bb;
+ bb = t;
+ }
+
+ bl = bb/TRACE_SPACING;
+ br = (be+(TRACE_SPACING-1))/TRACE_SPACING;
+ qb = QV + QV_IDX[bread];
+ if (bl >= br)
+ { avg = qb[bl];
+ if (avg >= BAD_QV)
+ bad = 1;
+ else
+ bad = 0;
+ }
+ else
+ { avg = 0;
+ bad = 0;
+ for (t = bl; t < br; t++)
+ { avg += qb[t];
+ if (qb[t] >= BAD_QV)
+ bad += 1;
+ }
+ avg /= (br-bl);
+ }
+
+ ans.bread = bread;
+ ans.comp = bcomp;
+ ans.beg = bb;
+ ans.end = be;
+ ans.anc = anc;
+ ans.bad = bad;
+ ans.avg = avg;
+
+ return (&ans);
+}
+
+static int unsuitable(int bread, int lft, int rgt)
+{ int tb, te;
+
+ tb = TRIM_IDX[bread];
+ te = TRIM_IDX[bread+1];
+ for ( ; tb < te; tb += 3)
+ if (TRIM[tb+1] >= lft)
+ break;
+ if (tb >= te || TRIM[tb] > lft)
+ return (1);
+ for ( ; tb < te ; tb += 3)
+ { if (TRIM[tb+1] >= rgt)
+ break;
+ if (TRIM[tb+2] == SPLIT)
+ return (1);
+ }
+ if (tb >= te || TRIM[tb] > rgt)
+ return (1);
+ return (0);
+}
+
+// Categorize each gap and if appropriate return the best patch for each
+
+static Patch *lowq_patch(Overlap *ovls, int novl, Interval *lblock, Interval *rblock)
+{ static Patch patch;
+
+ int j;
+ int lft, rgt;
+ int lcv, rcv;
+
+ lft = lblock->end;
+ rgt = rblock->beg;
+ lcv = lft - COVER_LEN;
+ rcv = rgt + COVER_LEN;
+ if (lcv < lblock->beg)
+ lcv = lblock->beg;
+ if (rcv > rblock->end)
+ rcv = rblock->end;
+
+ patch.bread = -1;
+ patch.anc = TRACE_SPACING;
+ patch.avg = 100;
+ for (j = 0; j < novl; j++)
+ if (ovls[j].path.abpos <= lcv && ovls[j].path.aepos >= rcv)
+ { Patch *can;
+
+ can = compute_patch(lft,rgt,ovls+j,ovls+j);
+
+ if (can == NULL) continue;
+
+ if (unsuitable(can->bread,can->beg,can->end))
+ continue;
+
+ if (can->anc <= ANCHOR_THRESH && can->avg <= GOOD_QV && can->bad == 0 &&
+ can->avg + can->anc < patch.anc + patch.avg)
+ patch = *can;
+ }
+
+#ifdef DEBUG_GAP_FILL
+ if (patch.bread >= 0)
+ printf(" LOWQ PATCH = %d%c[%d..%d] %d (%d)\n",
+ patch.bread,patch.comp?'c':'n',patch.beg,patch.end,patch.anc,patch.avg);
+ else
+ printf(" LOWQ PATCH FAIL\n");
+#endif
+
+ return (&patch);
+}
+
+static Patch *span_patch(Overlap *ovls, int novl, Interval *lblock, Interval *rblock)
+{ static Patch patch;
+
+ int j, k;
+ int lft, rgt;
+ int lcv, rcv;
+ int bread, bcomp, blen;
+ int ab, ae;
+ int lidx, ridx, sidx, cidx;
+ Patch *can;
+
+ lft = lblock->end;
+ rgt = rblock->beg;
+ lcv = lft - COVER_LEN;
+ rcv = rgt + COVER_LEN;
+ if (lcv < lblock->beg)
+ lcv = lblock->beg;
+ if (rcv > rblock->end)
+ rcv = rblock->end;
+
+ // Find LA pairs or LAs spanning the gap flank [lcv,rcv]
+
+ patch.bread = -1;
+ patch.bad = DB->maxlen;
+ patch.avg = 100;
+ for (j = 0; j < novl; j = k)
+ { bread = ovls[j].bread;
+ blen = DB->reads[bread].rlen;
+ bcomp = COMP(ovls[j].flags);
+ if (bcomp)
+ cidx = j;
+
+ lidx = ridx = sidx = -1; // For all LA's with same b-read
+ for (k = j; k < novl; k++)
+ { if (ovls[k].bread != bread)
+ break;
+ if (COMP(ovls[k].flags) != (uint32) bcomp) // Note when b switches orientation
+ { cidx = k;
+ bcomp = COMP(ovls[k].flags);
+ }
+ ab = ovls[k].path.abpos;
+ ae = ovls[k].path.aepos;
+
+#ifdef SHOW_PAIRS
+ printf("\n %5d [%5d,%5d] %c",bread,ab,ae,COMP(ovls[k].flags)?'c':'n');
+ if (ab <= lcv && ae >= rcv)
+ printf("s");
+ else
+ printf(" ");
+#endif
+
+ // Is LA a spanner, left-partner, or right partner
+
+ if (ab <= lcv && ae >= rcv)
+ { sidx = k;
+ lidx = ridx = -1;
+ continue;
+ }
+
+#ifdef SHOW_PAIRS
+ if (ae >= rcv && ab <= rcv && ab - ovls[k].path.bbpos <= lft - COVER_LEN)
+ printf("r");
+ else
+ printf(" ");
+ if (ab <= lcv && ae >= lcv && ae + (blen-ovls[j].path.bepos) >= rgt + COVER_LEN)
+ printf("l");
+ else
+ printf(" ");
+#endif
+
+ if (ae >= rcv && ab <= rcv && ab - ovls[k].path.bbpos <= lft - COVER_LEN)
+ ridx = k;
+ if (ab <= lcv && ae >= lcv && ae + (blen-ovls[j].path.bepos) >= rgt + COVER_LEN)
+ lidx = k;
+ }
+ if (! bcomp)
+ cidx = k;
+
+#ifdef SHOW_PAIRS
+ printf(" =");
+ if (sidx >= 0)
+ printf(" S");
+ if (lidx >= 0)
+ printf(" L");
+ if (ridx >= 0)
+ printf(" R");
+ if (0 <= lidx && lidx < ridx && (ridx < cidx || lidx >= cidx))
+ printf(" G");
+ if ((0<=ridx && ridx<cidx && cidx<=lidx) || (0<=lidx && lidx<cidx && cidx<=ridx))
+ printf(" A");
+#endif
+
+ // Check for spanning pair
+
+ if (sidx >= 0)
+ lidx = ridx = sidx;
+ else if (0 > lidx || lidx >= ridx || (ridx >= cidx && cidx > lidx))
+ continue;
+
+ // Otherwise consider the gap linkable and try to patch it, declaring a split
+ // iff all patch attemtps fail
+
+#ifdef DEBUG_GAP_FILL
+ if (lidx != ridx)
+ printf(" %5d [%5d,%5d] [%5d,%5d]",
+ ovls[lidx].bread,ovls[lidx].path.abpos,ovls[lidx].path.aepos,
+ ovls[ridx].path.abpos,ovls[ridx].path.aepos);
+ else
+ printf(" %5d [%5d,%5d] SSS",
+ ovls[lidx].bread,ovls[lidx].path.abpos,ovls[lidx].path.aepos);
+#endif
+
+ can = compute_patch(lft,rgt,ovls+lidx,ovls+ridx);
+
+ if (can != NULL)
+ {
+#ifdef DEBUG_GAP_FILL
+ printf(" %d",can->end - can->beg);
+#endif
+ if ( ! unsuitable(can->bread,can->beg,can->end) && can->anc <= ANCHOR_THRESH)
+ { if (can->bad < patch.bad)
+ patch = *can;
+ else if (can->bad == patch.bad)
+ { if (can->avg < patch.avg)
+ patch = *can;
+ }
+#ifdef DEBUG_GAP_FILL
+ printf(" AA %d %d(%d)",can->anc,can->bad,can->avg);
+#endif
+ }
+ }
+#ifdef DEBUG_GAP_FILL
+ printf("\n");
+#endif
+ }
+
+#ifdef DEBUG_GAP_FILL
+ if (patch.bread >= 0)
+ printf(" SPAN %5d: PATCH = %d%c[%d..%d] %d %d(%d)\n",rgt-lft,
+ patch.bread,patch.comp?'c':'n',patch.beg,
+ patch.end,patch.anc,patch.bad,patch.avg);
+ else
+ printf(" SPAN PATCH FAIL\n");
+#endif
+
+ return (&patch);
+}
+
+/*******************************************************************************************
+ *
+ * SCRUB EACH PILE:
+ * Trim low-quality tips of reads and patch low quality intervals within a sequence
+ * Trim adapter (and associated redundant prefix or suffix)
+ * Break chimers or all unscaffoldable no-coverage gaps of reads
+ *
+ ********************************************************************************************/
+
+// Analyze all the gaps between the good patches found in the first pass.
+// Consider a hole between two good intervals [lb,le] and [rb,re]. An overlap
+ // is anchored to the left of the whole if abpos <= le-COVER_LEN and aepos >= rb+COVER_LEN
+
+static void PATCH_GAPS(int aread, Overlap *ovls, int novl)
+{ static Patch dummy = { 0, 0, 0, 0, 0, 0, 0 };
+
+#ifdef DEBUG_SUMMARY
+ static char *status_string[4] = { "LOWQ", "SPAN", "SPLIT", "NOPAT" };
+#endif
+
+ int alen;
+ Interval lblock, rblock;
+ Patch *patch = NULL;
+ int status;
+ int tb, te;
+ int val;
+
+ alen = DB->reads[aread].rlen;
+ if (alen < HGAP_MIN)
+ { fwrite(&PR_INDEX,sizeof(int64),1,PR_AFILE);
+ return;
+ }
+
+#if defined(DEBUG_GAP_FILL) || defined(DEBUG_SUMMARY)
+ printf("\n");
+ printf("AREAD %d\n",aread);
+#endif
+
+ // Determine patch for every LOWQ and SPAN gap and output dummy 0-patch
+ // for all SPLIT decisions
+
+ tb = TRIM_IDX[aread];
+ te = TRIM_IDX[aread+1];
+ if (tb+2 < te)
+ { lblock.beg = TRIM[tb];
+ lblock.end = TRIM[tb+1];
+ for (tb += 3; tb < te; tb += 3)
+ { status = TRIM[tb-1];
+ rblock.beg = TRIM[tb];
+ rblock.end = TRIM[tb+1];
+
+ if (status == LOWQ)
+ { patch = lowq_patch(ovls,novl,&lblock,&rblock);
+ if (patch->bread < 0)
+ status = SPAN;
+ }
+ if (status == SPAN)
+ patch = span_patch(ovls,novl,&lblock,&rblock);
+
+ if (status == SPLIT)
+ { val = 0;
+ patch = &dummy;
+ }
+ else
+ { if (patch->bread < 0)
+ { val = 0;
+ fpatch += 1;
+#ifdef DEBUG_SUMMARY
+ TRIM[tb-1] = NOPAT;
+#endif
+ }
+ else if (patch->comp)
+ val = -(patch->bread+1);
+ else
+ val = patch->bread+1;
+ npatch += 1;
+ }
+ fwrite(&val,sizeof(int),1,PR_DFILE);
+ fwrite(&(patch->beg),sizeof(int),1,PR_DFILE);
+ fwrite(&(patch->end),sizeof(int),1,PR_DFILE);
+ PR_INDEX += 3*sizeof(int);
+
+ lblock = rblock;
+ }
+ }
+ fwrite(&PR_INDEX,sizeof(int64),1,PR_AFILE);
+
+#ifdef DEBUG_SUMMARY
+ tb = TRIM_IDX[aread];
+ te = TRIM_IDX[aread+1];
+#ifdef DEBUG_GAP_FILL
+ if (tb+2 < te)
+ printf(" FINAL:\n");
+#endif
+ if (tb < te)
+ { printf(" [%d,%d]",TRIM[tb],TRIM[tb+1]);
+ for (tb += 3; tb < te; tb += 3)
+ printf(" %s [%d,%d]",status_string[TRIM[tb-1]],TRIM[tb],TRIM[tb+1]);
+ printf("\n");
+ }
+#endif
+}
+
+
+ // Read in each successive pile and call ACTION on it. Read in the traces only if
+ // "trace" is nonzero
+
+static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int trace)
+{ static Overlap *ovls = NULL;
+ static int omax = 500;
+ static uint16 *paths = NULL;
+ static int pmax = 100000;
+
+ int64 i, j, novl;
+ int n, a;
+ int pcur;
+ int max;
+ int tbytes;
+
+ if (ovls == NULL)
+ { ovls = (Overlap *) Malloc(sizeof(Overlap)*omax,"Allocating overlap buffer");
+ if (ovls == NULL)
+ exit (1);
+ }
+ if (trace && paths == NULL)
+ { paths = (uint16 *) Malloc(sizeof(uint16)*pmax,"Allocating path buffer");
+ if (paths == NULL)
+ exit (1);
+ }
+
+ rewind(input);
+ fread(&novl,sizeof(int64),1,input);
+ fread(&TRACE_SPACING,sizeof(int),1,input);
+ if (TRACE_SPACING <= TRACE_XOVR)
+ tbytes = sizeof(uint8);
+ else
+ tbytes = sizeof(uint16);
+
+ if (novl <= 0)
+ return (0);
+
+ Read_Overlap(input,ovls);
+ if (trace)
+ { if (ovls[0].path.tlen > pmax)
+ { pmax = 1.2*(ovls[0].path.tlen)+10000;
+ paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer");
+ if (paths == NULL) exit (1);
+ }
+ fread(paths,tbytes,ovls[0].path.tlen,input);
+ if (tbytes == 1)
+ { ovls[0].path.trace = paths;
+ Decompress_TraceTo16(ovls);
+ }
+ }
+ else
+ fseek(input,tbytes*ovls[0].path.tlen,SEEK_CUR);
+
+ if (ovls[0].aread < DB_FIRST)
+ { fprintf(stderr,"%s: .las file overlaps don't correspond to reads in block %d of DB\n",
+ Prog_Name,DB_PART);
+ exit (1);
+ }
+
+ pcur = 0;
+ n = max = 0;
+ for (j = DB_FIRST; j < DB_LAST; j++)
+ { ovls[0] = ovls[n];
+ a = ovls[0].aread;
+ if (a != j)
+ n = 0;
+ else
+ { if (trace)
+ memmove(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen);
+ n = 1;
+ pcur = ovls[0].path.tlen;
+ while (1)
+ { if (Read_Overlap(input,ovls+n) != 0)
+ { ovls[n].aread = INT32_MAX;
+ break;
+ }
+ if (trace)
+ { if (pcur + ovls[n].path.tlen > pmax)
+ { pmax = 1.2*(pcur+ovls[n].path.tlen)+10000;
+ paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer");
+ if (paths == NULL) exit (1);
+ }
+ fread(paths+pcur,tbytes,ovls[n].path.tlen,input);
+ if (tbytes == 1)
+ { ovls[n].path.trace = paths+pcur;
+ Decompress_TraceTo16(ovls+n);
+ }
+ }
+ else
+ fseek(input,tbytes*ovls[n].path.tlen,SEEK_CUR);
+ if (ovls[n].aread != a)
+ break;
+ pcur += ovls[n].path.tlen;
+ n += 1;
+ if (n >= omax)
+ { omax = 1.2*n + 100;
+ ovls = (Overlap *) Realloc(ovls,sizeof(Overlap)*omax,"Expanding overlap buffer");
+ if (ovls == NULL) exit (1);
+ }
+ }
+
+ if (n >= max)
+ max = n;
+ pcur = 0;
+ for (i = 0; i < n; i++)
+ { ovls[i].path.trace = paths+pcur;
+ pcur += ovls[i].path.tlen;
+ }
+ }
+ ACTION(j,ovls,n);
+ }
+
+ return (max);
+}
+
+int main(int argc, char *argv[])
+{ FILE *input;
+ char *root, *dpwd;
+ char *las, *lpwd;
+ int64 novl;
+ HITS_TRACK *track;
+ int c;
+ int COVERAGE;
+
+ // Process arguments
+
+ { int i, j, k;
+ int flags[128];
+
+ ARG_INIT("DASpatch")
+
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ switch (argv[i][1])
+ { default:
+ ARG_FLAGS("v")
+ break;
+ }
+ else
+ argv[j++] = argv[i];
+ argc = j;
+
+ VERBOSE = flags['v'];
+
+ if (argc < 3)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+ }
+
+ // Open trimmed DB and .qual and .trim tracks
+
+ { int i, status;
+
+ status = Open_DB(argv[1],DB);
+ if (status < 0)
+ exit (1);
+ if (status == 1)
+ { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ if (DB->part)
+ { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ Trim_DB(DB);
+
+ track = Load_Track(DB,"qual");
+ if (track != NULL)
+ { QV_IDX = (int64 *) track->anno;
+ QV = (uint8 *) track->data;
+ }
+ else
+ { fprintf(stderr,"%s: Must have a 'qual' track, run DASqv\n",Prog_Name);
+ exit (1);
+ }
+
+ track = Load_Track(DB,"trim");
+ if (track != NULL)
+ { FILE *afile;
+ int size, tracklen, extra;
+
+ TRIM_IDX = (int64 *) track->anno;
+ TRIM = (int *) track->data;
+ for (i = 0; i <= DB->nreads; i++)
+ TRIM_IDX[i] /= sizeof(int);
+
+ // if newer .trim tracks with -g, -b, -c, -H meta data, grab it
+
+ afile = fopen(Catenate(DB->path,".","trim",".anno"),"r");
+ fread(&tracklen,sizeof(int),1,afile);
+ fread(&size,sizeof(int),1,afile);
+ fseeko(afile,0,SEEK_END);
+ extra = ftell(afile) - (size*(tracklen+1) + 2*sizeof(int));
+ fseeko(afile,-extra,SEEK_END);
+ if (extra == 4*sizeof(int))
+ { fread(&GOOD_QV,sizeof(int),1,afile);
+ fread(&BAD_QV,sizeof(int),1,afile);
+ fread(&COVERAGE,sizeof(int),1,afile);
+ fread(&HGAP_MIN,sizeof(int),1,afile);
+ }
+ else if (extra == 3*sizeof(int))
+ { fread(&GOOD_QV,sizeof(int),1,afile);
+ fread(&BAD_QV,sizeof(int),1,afile);
+ fread(&COVERAGE,sizeof(int),1,afile);
+ HGAP_MIN = 0;
+ }
+ else if (extra == 2*sizeof(int))
+ { fread(&GOOD_QV,sizeof(int),1,afile);
+ fread(&BAD_QV,sizeof(int),1,afile);
+ COVERAGE = -1;
+ HGAP_MIN = 0;
+ }
+ else
+ { fprintf(stderr,"%s: trim annotation is out of date, rerun DAStrim\n",Prog_Name);
+ exit (1);
+ }
+ fclose(afile);
+
+ { int a, t, x;
+ int tb, te;
+
+ x = 0;
+ for (a = 0; a < DB->nreads; a++)
+ { tb = TRIM_IDX[a];
+ te = TRIM_IDX[a+1];
+ if (tb+2 < te)
+ { if (TRIM[tb+2] == ADPRE)
+ tb += 3;
+ if (TRIM[te-3] == ADSUF)
+ te -= 3;
+ }
+ TRIM_IDX[a] = x;
+ for (t = tb; t < te; t++)
+ TRIM[x++] = TRIM[t];
+ }
+ TRIM_IDX[DB->nreads] = x;
+ }
+ }
+ else
+ { fprintf(stderr,"%s: Must have a 'trim' track, run DAStrim\n",Prog_Name);
+ exit (1);
+ }
+ }
+
+ // Initialize statistics gathering
+
+ if (VERBOSE)
+ { npatch = 0;
+ fpatch = 0;
+
+ printf("\nDASpatch -g%d -b%d %s",GOOD_QV,BAD_QV,argv[1]);
+ for (c = 2; c < argc; c++)
+ printf(" %s",argv[c]);
+ printf("\n");
+ }
+
+ // For each .las block/file
+
+ dpwd = PathTo(argv[1]);
+ root = Root(argv[1],".db");
+
+ for (c = 2; c < argc; c++)
+ { las = Root(argv[c],".las");
+
+ // Determine if a .las block is being processed and if so get first and last read
+ // from .db file
+
+ { FILE *dbfile;
+ char buffer[2*MAX_NAME+100];
+ char *p, *eptr;
+ int i, part, nfiles, nblocks, cutoff, all, oindx;
+ int64 size;
+
+ DB_PART = 0;
+ DB_FIRST = 0;
+ DB_LAST = DB->nreads;
+
+ p = rindex(las,'.');
+ if (p != NULL)
+ { part = strtol(p+1,&eptr,10);
+ if (*eptr == '\0' && eptr != p+1)
+ { dbfile = Fopen(Catenate(dpwd,"/",root,".db"),"r");
+ if (dbfile == NULL)
+ exit (1);
+ if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
+ SYSTEM_ERROR
+ for (i = 0; i < nfiles; i++)
+ if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL)
+ SYSTEM_ERROR
+ if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1)
+ SYSTEM_ERROR
+ if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3)
+ SYSTEM_ERROR
+ for (i = 1; i <= part; i++)
+ if (fscanf(dbfile,DB_BDATA,&oindx,&DB_FIRST) != 2)
+ SYSTEM_ERROR
+ if (fscanf(dbfile,DB_BDATA,&oindx,&DB_LAST) != 2)
+ SYSTEM_ERROR
+ fclose(dbfile);
+ DB_PART = part;
+ *p = '\0';
+ }
+ }
+ }
+
+ // Set up QV trimming track
+
+ { int len, size;
+
+ if (DB_PART > 0)
+ { PR_AFILE = Fopen(Catenate(dpwd,PATHSEP,root,
+ Numbered_Suffix(".",DB_PART,".patch.anno")),"w");
+ PR_DFILE = Fopen(Catenate(dpwd,PATHSEP,root,
+ Numbered_Suffix(".",DB_PART,".patch.data")),"w");
+ }
+ else
+ { PR_AFILE = Fopen(Catenate(dpwd,PATHSEP,root,".patch.anno"),"w");
+ PR_DFILE = Fopen(Catenate(dpwd,PATHSEP,root,".patch.data"),"w");
+ }
+ if (PR_AFILE == NULL || PR_DFILE == NULL)
+ exit (1);
+
+ len = DB_LAST - DB_FIRST;
+ size = 8;
+ fwrite(&len,sizeof(int),1,PR_AFILE);
+ fwrite(&size,sizeof(int),1,PR_AFILE);
+ PR_INDEX = 0;
+ fwrite(&PR_INDEX,sizeof(int64),1,PR_AFILE);
+ }
+
+ // Open overlap file
+
+ lpwd = PathTo(argv[2]);
+ if (DB_PART)
+ input = Fopen(Catenate(lpwd,"/",las,Numbered_Suffix(".",DB_PART,".las")),"r");
+ else
+ input = Fopen(Catenate(lpwd,"/",las,".las"),"r");
+ if (input == NULL)
+ exit (1);
+
+ free(lpwd);
+ free(las);
+
+ // Get trace point spacing information
+
+ fread(&novl,sizeof(int64),1,input);
+ fread(&TRACE_SPACING,sizeof(int),1,input);
+ ANCHOR_THRESH = ANCHOR_MATCH * TRACE_SPACING;
+
+ make_a_pass(input,PATCH_GAPS,1);
+
+ // Clean up
+
+ fclose(PR_AFILE);
+ fclose(PR_DFILE);
+ }
+
+ // If verbose output statistics summary to stdout
+
+ if (VERBOSE)
+ { if (fpatch == 0)
+ printf("%s: All %d patches were successful\n",Prog_Name,npatch);
+ else
+ printf("%s: %d out of %d total patches failed\n",Prog_Name,fpatch,npatch);
+ }
+
+ free(dpwd);
+ free(root);
+
+ Close_DB(DB);
+ free(Prog_Name);
+
+ exit (0);
+}
diff --git a/DASqv.c b/DASqv.c
index 131e3f1..96b3d6f 100644
--- a/DASqv.c
+++ b/DASqv.c
@@ -23,7 +23,7 @@
#undef QV_DEBUG
-static char *Usage = "[-v] -c<int> <source:db> <overlaps:las> ...";
+static char *Usage = "[-v] [-H<int>] -c<int> <source:db> <overlaps:las> ...";
#define MAXQV 50 // Max QV score is 50
#define MAXQV1 51
@@ -31,8 +31,9 @@ static char *Usage = "[-v] -c<int> <source:db> <overlaps:las> ...";
#define PARTIAL .20 // Partial terminal segments covering this percentage are scored
-static int QV_DEEP; // # of best diffs to average for QV score
static int VERBOSE;
+static int QV_DEEP; // # of best diffs to average for QV score
+static int HGAP_MIN; // Under this length do not process for HGAP
static int TRACE_SPACING; // Trace spacing (from .las file)
static int TBYTES; // Bytes per trace segment (from .las file)
@@ -51,7 +52,6 @@ static int64 QV_INDEX; // Current index into .qual.data file
static int64 nreads, totlen;
static int64 qgram[MAXQV1], sgram[MAXQV1];
-
// For each pile, calculate QV scores of the aread at tick spacing TRACE_SPACING
static void CALCULATE_QVS(int aread, Overlap *ovls, int novl)
@@ -68,6 +68,11 @@ static void CALCULATE_QVS(int aread, Overlap *ovls, int novl)
alen = DB->reads[aread].rlen;
atick = (alen + (TRACE_SPACING-1))/TRACE_SPACING;
+ if (alen < HGAP_MIN)
+ { fwrite(&QV_INDEX,sizeof(int64),1,QV_AFILE);
+ return;
+ }
+
#if defined(QV_DEBUG)
printf("AREAD %d",aread);
if (novl == 0)
@@ -250,7 +255,7 @@ static void CALCULATE_QVS(int aread, Overlap *ovls, int novl)
cick += MAXQV1;
#ifdef QV_DEBUG
- printf(" >> %2d %2d = %2d <<\n",qvn,qvc,qvec[i]);
+ printf(" >> %2d(%d) %2d(%d) = %2d <<\n",qvn,cntn,qvc,cntc,qvec[i]);
#endif
}
@@ -301,6 +306,9 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
else
TBYTES = sizeof(uint16);
+ if (novl <= 0)
+ return (0);
+
Read_Overlap(input,ovls);
if (trace)
{ if (ovls[0].path.tlen > pmax)
@@ -332,7 +340,7 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
n = 0;
else
{ if (trace)
- memcpy(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen);
+ memmove(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen);
n = 1;
pcur = ovls[0].path.tlen;
while (1)
@@ -395,6 +403,7 @@ int main(int argc, char *argv[])
ARG_INIT("DASqv")
COVERAGE = -1;
+ HGAP_MIN = 0;
j = 1;
for (i = 1; i < argc; i++)
@@ -406,6 +415,9 @@ int main(int argc, char *argv[])
case 'c':
ARG_POSITIVE(COVERAGE,"Voting depth")
break;
+ case 'H':
+ ARG_POSITIVE(HGAP_MIN,"HGAP threshold (in bp.s)")
+ break;
}
else
argv[j++] = argv[i];
@@ -532,11 +544,11 @@ int main(int argc, char *argv[])
if (QV_AFILE == NULL || QV_DFILE == NULL)
exit (1);
- { int size, nreads;
+ { int size, length;
- nreads = DB_LAST - DB_FIRST;
+ length = DB_LAST - DB_FIRST;
size = sizeof(int64);
- fwrite(&nreads,sizeof(int),1,QV_AFILE);
+ fwrite(&length,sizeof(int),1,QV_AFILE);
fwrite(&size,sizeof(int),1,QV_AFILE);
QV_INDEX = 0;
fwrite(&QV_INDEX,sizeof(int64),1,QV_AFILE);
@@ -564,6 +576,9 @@ int main(int argc, char *argv[])
make_a_pass(input,CALCULATE_QVS,1);
+ fwrite(&COVERAGE,sizeof(int),1,QV_AFILE);
+ fwrite(&HGAP_MIN,sizeof(int),1,QV_AFILE);
+
fclose(QV_AFILE);
fclose(QV_DFILE);
}
@@ -574,12 +589,19 @@ int main(int argc, char *argv[])
{ int i;
int64 ssum, qsum;
int64 stotal, qtotal;
+ int gval, bval;
printf("\nInput: ");
Print_Number(nreads,7,stdout);
printf("reads, ");
Print_Number(totlen,12,stdout);
- printf(" bases\n");
+ printf(" bases");
+ if (HGAP_MIN > 0)
+ { printf(" (another ");
+ Print_Number((DB_LAST-DB_FIRST) - nreads,0,stdout);
+ printf(" were < H-length)");
+ }
+ printf("\n");
stotal = qtotal = 0;
for (i = 0; i <= MAXQV; i++)
@@ -594,6 +616,7 @@ int main(int argc, char *argv[])
printf("\n %2d: %9lld %5.1f%% %9lld %5.1f%%\n\n",
MAXQV,sgram[MAXQV],(100.*ssum)/stotal,qgram[MAXQV],(100.*qsum)/qtotal);
+ bval = gval = -1;
qtotal -= qsum;
stotal -= ssum;
ssum = qsum = 0;
@@ -604,7 +627,13 @@ int main(int argc, char *argv[])
printf(" %2d: %9lld %5.1f%% %9lld %5.1f%%\n",
i,sgram[i],(100.*ssum)/stotal,
qgram[i],(100.*qsum)/qtotal);
+ if ((100.*qsum)/qtotal > 7. && bval < 0)
+ bval = i;
+ if ((100.*qsum)/qtotal > 20. && gval < 0)
+ gval = i;
}
+
+ printf("\n Recommend \'DAStrim -g%d -b%d'\n\n",gval,bval);
}
// Clean up
diff --git a/DASrealign.c b/DASrealign.c
new file mode 100644
index 0000000..69b7a20
--- /dev/null
+++ b/DASrealign.c
@@ -0,0 +1,1166 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2015, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
+ * Map and extend every overlap to the patched read framework.
+ *
+ * Author: Gene Myers
+ * Date : March 2015
+ *
+ *******************************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <math.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "DB.h"
+#include "align.h"
+
+#undef OUTLINE
+#undef SHOW_MAP
+#undef TRACE
+#undef SHOW_FINAL
+#undef SHOW_ALIGNMENTS
+
+static char *Usage = " [-v] [-l<int(800)>] <block1:db> <block2:db> <source:las> <target:las>";
+
+static int TRACE_SPACING; // Trace spacing (from .las file)
+static int TBYTES; // Bytes per trace segment (from .las file)
+static int SMALL; // Trace points can fit in a byte
+static int MIN_LEN; // Minimum piece length
+
+static HITS_DB _ADB, *ADB = &_ADB; // A-read database
+static HITS_DB _BDB, *BDB = &_BDB; // B-read database
+
+static int ADB_ofirst, ADB_olast;
+static int BDB_ofirst, BDB_olast;
+
+static int AFIRST, BFIRST;
+
+static int64 *AMAP_IDX; // Map to originals for A-reads
+static int *AMAP;
+static int64 *BMAP_IDX; // Map to orignals for B-reads
+static int *BMAP;
+
+static int *IAMAP, *IBMAP; // Inverse map, old x -> new IMAP[x]..IMAP[x+1]-1
+
+static FILE *OUTPUT; // The new set of overlaps
+static int64 WOVLS;
+
+static int VERBOSE;
+
+int lowTK(int a)
+{ return (a/TRACE_SPACING); }
+
+int hghTK(int a)
+{ return ((a+(TRACE_SPACING-1))/TRACE_SPACING); }
+
+int lowTP(int a)
+{ return ((a/TRACE_SPACING)*TRACE_SPACING); }
+
+
+/*******************************************************************************************
+ *
+ * Finger iterator: allows one to map the next trace point of a read to its
+ * patched read as the trace points are examined in order.
+ *
+ *******************************************************************************************/
+
+typedef struct
+ { int cidx;
+ int lidx;
+ int dist;
+ int last;
+ int blen;
+ int *map;
+ } Finger;
+
+ // A finger is initialized with init_finger where cur is suppled by the user, and the
+ // patch sequence is in GRIM[gb..ge].
+
+static inline void init_finger(Finger *f, int *map, int mb, int me, int blen)
+{ if (blen == 0)
+ { f->cidx = mb+3;
+ f->lidx = me;
+ f->last = map[mb];
+ }
+ else
+ { f->cidx = me-4;
+ f->lidx = mb-1;
+ f->last = blen - map[me-1];
+ }
+ f->dist = 0;
+ f->blen = blen;
+ f->map = map;
+}
+
+ // Advance finger to position pos and return position in patched read, if known, -1 otherwise
+
+static inline int good(Finger *cur, int pos)
+{ int blen, *map;
+
+ map = cur->map;
+ blen = cur->blen;
+ if (blen == 0)
+ { while (cur->cidx < cur->lidx && pos >= map[cur->cidx])
+ { cur->dist += (map[cur->cidx-2] - cur->last) + map[cur->cidx-1];
+ cur->last = map[cur->cidx];
+ cur->cidx += 3;
+ }
+ if (pos <= map[cur->cidx-2])
+ { if (pos < cur->last)
+ return (-1);
+ else
+ return (cur->dist + (pos-cur->last));
+ }
+ }
+ else
+ { while (cur->cidx > cur->lidx && pos >= blen - map[cur->cidx])
+ { cur->dist += ((blen - map[cur->cidx+2]) - cur->last) + map[cur->cidx+1];
+ cur->last = blen - map[cur->cidx];
+ cur->cidx -= 3;
+ }
+ if (pos <= blen - map[cur->cidx+2])
+ { if (pos < cur->last)
+ return (-1);
+ else
+ return (cur->dist + (pos-cur->last));
+ }
+ }
+ return (-1);
+}
+
+ // Advance finger to position pos, and return best estimate of position in patched read,
+ // or -1 if outside the bounds of the patched read. acc points at the distance the
+ // estimate is from a non-patched segment (0 if mapped).
+
+static inline int where(Finger *cur, int pos, int *acc)
+{ int blen, *map;
+
+ map = cur->map;
+ blen = cur->blen;
+ if (blen == 0)
+ { while (cur->cidx < cur->lidx && pos >= map[cur->cidx])
+ { cur->dist += (map[cur->cidx-2] - cur->last) + map[cur->cidx-1];
+ cur->last = map[cur->cidx];
+ cur->cidx += 3;
+ }
+ if (pos <= map[cur->cidx-2])
+ { if (pos < cur->last)
+ return (-1);
+ else
+ { *acc = 0;
+ return (cur->dist + (pos-cur->last));
+ }
+ }
+ if (cur->cidx >= cur->lidx)
+ return (-1);
+ else
+ { int ab, ae;
+
+ ab = map[cur->cidx-2];
+ ae = map[cur->cidx];
+ if (pos-ab < ae-pos)
+ *acc = pos-ab;
+ else
+ *acc = ae-pos;
+ return (cur->dist + (ab-cur->last) + ((1.*(pos-ab))/(ae-ab)) * map[cur->cidx-1]);
+ }
+ }
+ else
+ { while (cur->cidx > cur->lidx && pos >= blen - map[cur->cidx])
+ { cur->dist += ((blen - map[cur->cidx+2]) - cur->last) + map[cur->cidx+1];
+ cur->last = blen - map[cur->cidx];
+ cur->cidx -= 3;
+ }
+ if (pos <= blen - map[cur->cidx+2])
+ { if (pos < cur->last)
+ return (-1);
+ else
+ { *acc = 0;
+ return (cur->dist + (pos-cur->last));
+ }
+ }
+ if (cur->cidx <= cur->lidx)
+ return (-1);
+ else
+ { int ab, ae;
+
+ ab = blen - map[cur->cidx+2];
+ ae = blen - map[cur->cidx];
+ if (pos-ab < ae-pos)
+ *acc = pos-ab;
+ else
+ *acc = ae-pos;
+ return (cur->dist + (ab-cur->last) + ((1.*(pos-ab))/(ae-ab)) * map[cur->cidx+1]);
+ }
+ }
+}
+
+
+/*******************************************************************************************
+ *
+ * Trace point mapping:
+ * recon makes a trace with all the mapable pairs
+ * when there are no mapable pairs, estimate finds the estimated point position closest
+ * to a mapable region.
+ *
+ *******************************************************************************************/
+
+static int recon(Path *image, Path *path, Finger *afinger, Finger *bfinger)
+{ static int tmax = -1;
+ static uint16 *itrace = NULL;
+
+ int ae, be;
+ int al, bl;
+ int an, bn;
+ int t, tl, df;
+ uint16 *strace = ((uint16 *) (path->trace));
+
+ if (path->tlen > tmax)
+ { tmax = 1.2*path->tlen + 100;
+ itrace = (uint16 *) Realloc(itrace,sizeof(uint16)*tmax,"Reallocating image trace");
+ }
+ image->trace = itrace;
+
+#ifdef TRACE
+ printf(" Backbone:\n");
+ fflush(stdout);
+#endif
+ df = 0;
+ tl = -1;
+ al = bl = 0;
+ ae = lowTP(path->abpos);
+ be = path->bbpos;
+ if ((an = good(afinger,path->abpos)) >= 0 && (bn = good(bfinger,be)) >= 0)
+ { image->abpos = al = an;
+ image->bbpos = bl = bn;
+ tl = 0;
+#ifdef TRACE
+ printf(" %5d,%5d -> %5d,%5d\n",path->abpos,be,an,bn);
+ fflush(stdout);
+#endif
+ }
+ for (t = 1; t < path->tlen; t += 2)
+ { ae += TRACE_SPACING;
+ be += strace[t];
+ if (ae > path->aepos)
+ ae = path->aepos;
+ if (tl >= 0)
+ df += strace[t-1];
+ if ((an = good(afinger,ae)) >= 0 && (bn = good(bfinger,be)) >= 0)
+ { if (tl < 0)
+ { image->abpos = an;
+ image->bbpos = bn;
+ tl = 0;
+ }
+ else
+ { itrace[tl] = an-al;
+ itrace[tl+1] = bn-bl;
+ tl += 2;
+ }
+ image->aepos = al = an;
+ image->bepos = bl = bn;
+ image->diffs = df;
+#ifdef TRACE
+ printf(" %5d,%5d -> %5d,%5d\n",ae,be,an,bn);
+ fflush(stdout);
+#endif
+ }
+ }
+
+ image->tlen = tl;
+ if (tl <= 0)
+ return (0);
+ else
+ return (1);
+}
+
+static int estimate(Path *path, Finger *afinger, Finger *bfinger, int *bsta, int *bstb, int *acc)
+{ int ae, be;
+ int an, bn;
+ int best, adst, bdst;
+ int t;
+ uint16 *strace = ((uint16 *) (path->trace));
+
+ *bsta = *bstb = -1;
+ best = INT32_MAX;
+
+#ifdef TRACE
+ printf(" Point Estimate:\n");
+ fflush(stdout);
+#endif
+ ae = lowTP(path->abpos);
+ be = path->bbpos;
+ if ((an = where(afinger,path->abpos,&adst)) >= 0 && (bn = where(bfinger,be,&bdst)) >= 0)
+ { best = adst + bdst;
+ *bsta = an;
+ *bstb = bn;
+#ifdef TRACE
+ printf(" %5d,%5d -> %5d(%d),%5d(%d)\n",path->abpos,be,an,adst,bn,bdst);
+ fflush(stdout);
+#endif
+ }
+ for (t = 1; t < path->tlen; t += 2)
+ { ae += TRACE_SPACING;
+ be += strace[t];
+ if (ae > path->aepos)
+ ae = path->aepos;
+ if ((an = where(afinger,ae,&adst)) >= 0 && (bn = where(bfinger,be,&bdst)) >= 0)
+ { if (adst + bdst < best)
+ { best = adst + bdst;
+ *bsta = an;
+ *bstb = bn;
+ }
+#ifdef TRACE
+ printf(" %5d,%5d -> %5d(%d),%5d(%d)\n",ae,be,an,adst,bn,bdst);
+ fflush(stdout);
+#endif
+ }
+ }
+ *acc = best;
+
+ return (*bsta >= 0);
+}
+
+#ifdef SHOW_MAP
+
+static void print_map(int *map, int mb, int me, int clen)
+{ int b, dist;
+
+ if (clen == 0)
+ { printf(" n");
+ for (b = mb; b < me; b += 3)
+ { printf(" [%5d,%5d]",map[b],map[b+1]);
+ if (b+2 < me)
+ printf(" %5d",map[b+2]);
+ }
+ printf("\n");
+
+ printf(" ");
+ dist = 0;
+ for (b = mb; b < me; b += 3)
+ { printf(" [%5d,%5d]",dist,dist+(map[b+1]-map[b]));
+ dist += map[b+1]-map[b];
+ if (b+2 < me)
+ { printf(" %5d",map[b+2]);
+ dist += map[b+2];
+ }
+ }
+ printf("\n");
+ }
+ else
+ { printf(" c");
+ for (b = me; b >= mb; b -= 3)
+ { printf(" [%5d,%5d]",clen-map[b-1],clen-map[b-2]);
+ if (b-3 > mb)
+ printf(" %5d",map[b-3]);
+ }
+ printf("\n");
+
+ printf(" ");
+ dist = 0;
+ for (b = me; b >= mb; b -= 3)
+ { printf(" [%5d,%5d]",dist,dist+(map[b-1]-map[b-2]));
+ dist += map[b-1]-map[b-2];
+ if (b-3 > mb)
+ { printf(" %5d",map[b-3]);
+ dist += map[b-3];
+ }
+ }
+ printf("\n");
+ }
+}
+
+#endif
+
+#ifdef SHOW_FINAL
+
+static void show_overlap(Overlap *ovl)
+{ int i, a, b;
+ uint16 *t;
+
+ t = (uint16 *) (ovl->path.trace);
+ a = ovl->path.abpos;
+ b = ovl->path.bbpos;
+ for (i = 0; i < ovl->path.tlen; i += 2)
+ { a += t[i];
+ b += t[i+1];
+ printf(" %5d %5d :: %5d %5d\n",t[i],t[i+1],a,b);
+ fflush(stdout);
+ }
+}
+
+#endif
+
+static void convert_trace(Path *path)
+{ int ab, ae;
+ int t;
+ uint16 *trace = ((uint16 *) (path->trace));
+
+ ae = lowTP(path->abpos);
+ ab = path->abpos;
+ for (t = 0; t < path->tlen; t += 2)
+ { ae += TRACE_SPACING;
+ if (ae > path->aepos)
+ ae = path->aepos;
+#ifdef TRACE
+ printf(" %5d,%5d -> %5d,%5d\n",trace[t],trace[t+1],ae-ab,trace[t+1]);
+ fflush(stdout);
+#endif
+ trace[t] = ae-ab;
+ ab = ae;
+ }
+}
+
+
+// Produce the concatentation of path1 and path2 where they are known to meet at
+// the trace point with coordinate ap. Place this result in a big growing buffer,
+// that gets reset when fusion is called with path1 = NULL
+
+static void fusion(Path *path1, Path *path2, int wch)
+{ static uint16 *paths = NULL;
+ static int pmax = 0;
+ static int ptop = 0;
+
+ int k;
+ int len;
+ uint16 *trace;
+
+ if (path1 == NULL)
+ { ptop = 0;
+ return;
+ }
+
+ len = path1->tlen + path2->tlen;
+
+ if (ptop + len >= pmax)
+ { pmax = 1.2*(ptop+len) + 1000;
+ paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Allocating paths");
+ if (paths == NULL)
+ exit (1);
+ }
+ trace = paths+ptop;
+ ptop += len;
+
+ len = 0;
+ if (path1->tlen > 0)
+ { uint16 *t = (uint16 *) (path1->trace);
+ for (k = 0; k < path1->tlen; k += 2)
+ { trace[len++] = t[k];
+ trace[len++] = t[k+1];
+ }
+ }
+ if (path2->tlen > 0)
+ { uint16 *t = (uint16 *) (path2->trace);
+ for (k = 0; k < path2->tlen; k += 2)
+ { trace[len++] = t[k];
+ trace[len++] = t[k+1];
+ }
+ }
+
+ if (wch == 1)
+ { path1->aepos = path2->aepos;
+ path1->bepos = path2->bepos;
+ path1->diffs += path2->diffs;
+ path1->trace = trace;
+ path1->tlen = len;
+ }
+ else
+ { path2->abpos = path1->abpos;
+ path2->bbpos = path1->bbpos;
+ path2->diffs += path1->diffs;
+ path2->trace = trace;
+ path2->tlen = len;
+ }
+}
+
+static void EXTENDER(int aread, Overlap *ovls, int novl)
+{ Finger _afinger, *afinger = &_afinger;
+ Finger _bfinger, *bfinger = &_bfinger;
+
+ static Overlap _ovla, *ovla = &_ovla;
+ static Path *ipath = &_ovla.path;
+
+ static Path rpath, fpath;
+ static Alignment _ralign, *ralign = &_ralign;
+ static Alignment _falign, *falign = &_falign;
+
+ static Work_Data *work = NULL;
+ static Align_Spec *spec;
+
+ int ap, alast;
+
+ if (aread < ADB_ofirst || aread >= ADB_olast)
+ return;
+
+ if (work == NULL)
+ { spec = New_Align_Spec(.70,100,ADB->freq);
+ work = New_Work_Data();
+ ralign->path = &rpath;
+ falign->path = &fpath;
+ }
+
+ alast = IAMAP[aread+1];
+ for (ap = IAMAP[aread]; ap < alast; ap++)
+ { int mb, me;
+ int aend, abeg, alen;
+
+ mb = AMAP_IDX[ap]+2;
+ me = AMAP_IDX[ap+1];
+
+ abeg = AMAP[mb];
+ aend = AMAP[me-1];
+ if (aend - abeg < MIN_LEN)
+ continue;
+
+ ralign->aseq = falign->aseq = ((char *) ADB->bases) + ADB->reads[ap].boff;
+ ralign->alen = falign->alen = alen = ADB->reads[ap].rlen;
+
+#ifdef OUTLINE
+ printf("AREAD %d -> %d [%d,%d]\n",aread,ap,abeg,aend);
+ fflush(stdout);
+#endif
+#ifdef SHOW_MAP
+ print_map(AMAP,mb,me,0);
+#endif
+
+ { int o, ob, oe;
+ Path *path;
+ int bread;
+ int bp, blast;
+
+ for (ob = 0; ob < novl; ob = oe)
+ { bread = ovls[ob].bread;
+ for (oe = ob+1; oe < novl && ovls[oe].bread == bread; oe += 1)
+ ;
+ if (bread < BDB_ofirst || bread >= BDB_olast)
+ continue;
+
+ blast = IBMAP[bread+1];
+ for (bp = IBMAP[bread]; bp < blast; bp++)
+ { int hb, he;
+ int bend, bbeg, blen;
+ int alpos, clen;
+
+ hb = BMAP_IDX[bp]+2;
+ he = BMAP_IDX[bp+1];
+
+ bbeg = BMAP[hb];
+ bend = BMAP[he-1];
+ if (bend - bbeg < MIN_LEN)
+ continue;
+
+#ifdef OUTLINE
+ printf(" BREAD %d->%d [%d,%d]\n",bread,bp,bbeg,bend);
+ fflush(stdout);
+#endif
+#ifdef SHOW_MAP
+ print_map(BMAP,hb,he,0);
+#endif
+
+ alpos = -1;
+ for (o = ob; o < oe; o++)
+ { int bbreal, bereal;
+
+ path = &(ovls[o].path);
+ if (COMP(ovls[o].flags))
+ { clen = BMAP[hb-1];
+ bbreal = clen-path->bepos;
+ bereal = clen-path->bbpos;
+ }
+ else
+ { clen = 0;
+ bbreal = path->bbpos;
+ bereal = path->bepos;
+ }
+
+#ifdef OUTLINE
+ printf(" OVL %d: [%d,%d] %c [%d,%d]\n",o,
+ path->abpos,path->aepos,(clen==0)?'n':'c',bbreal,bereal);
+ fflush(stdout);
+#endif
+
+ if (path->abpos <= aend-MIN_LEN && path->aepos >= abeg+MIN_LEN &&
+ bbreal <= bend-MIN_LEN && bereal >= bbeg+MIN_LEN)
+
+ { ralign->bseq = falign->bseq = ((char *) BDB->bases) + BDB->reads[bp].boff;
+ ralign->blen = falign->blen = blen = BDB->reads[bp].rlen;
+ ralign->flags = falign->flags = ovls[o].flags;
+
+ if (COMP(ralign->flags))
+ Complement_Seq(ralign->bseq,blen);
+
+#ifdef SHOW_MAP
+ print_map(BMAP,hb,he,clen);
+#endif
+
+ init_finger(afinger,AMAP,mb,me,0);
+ init_finger(bfinger,BMAP,hb,he,clen);
+ if ( ! recon(ipath,path,afinger,bfinger))
+ { int apos, bpos, acc, len, diag;
+
+ init_finger(afinger,AMAP,mb,me,0);
+ init_finger(bfinger,BMAP,hb,he,clen);
+ if (estimate(path,afinger,bfinger,&apos,&bpos,&acc))
+ if (apos > alpos)
+ { diag = apos-bpos;
+
+ acc /= 2;
+ if (apos + acc > alen)
+ acc = alen-apos;
+ if (bpos + acc > blen)
+ acc = blen-bpos;
+ if (apos < acc)
+ acc = apos;
+ if (bpos < acc)
+ acc = bpos;
+ if (acc > 500)
+ acc = 500;
+ acc *= 2;
+
+#ifdef OUTLINE
+ printf(" Trying: %d,%d + %d\n",apos,bpos,acc);
+ fflush(stdout);
+#endif
+ Local_Alignment(ralign,work,spec,
+ diag-acc,diag+acc,apos+bpos,-1,-1);
+#ifdef OUTLINE
+ printf(" Local: <%d,%d> -> <%d,%d>\n",
+ ralign->path->abpos,ralign->path->bbpos,
+ ralign->path->aepos,ralign->path->bepos);
+ fflush(stdout);
+#endif
+ len = ralign->path->aepos - ralign->path->abpos;
+ if (len >= MIN_LEN && ralign->path->diffs <= .35*len)
+ { ovla->aread = ap;
+ ovla->bread = bp;
+ ovla->flags = ralign->flags;
+ _ovla.path = *(ralign->path);
+ convert_trace(ralign->path);
+#ifdef OUTLINE
+ printf(" Final: %d[%d..%d] vs %d[%d..%d] %c d=%d\n",
+ ovla->aread,ovla->path.abpos,ovla->path.aepos,
+ ovla->bread,ovla->path.bbpos,ovla->path.bepos,
+ (COMP(ovla->flags) ? 'c' : 'n'), ovla->path.diffs);
+ fflush(stdout);
+#endif
+#ifdef SHOW_FINAL
+ show_overlap(ovla);
+#endif
+#ifdef SHOW_ALIGNMENTS
+ Compute_Trace_IRR(ralign,work,GREEDIEST);
+ Print_Alignment(stdout,ralign,work,4,80,10,0,6);
+#else
+ Write_Overlap(OUTPUT,ovla,sizeof(uint16));
+ WOVLS += 1;
+#endif
+
+ alpos = ralign->path->aepos;
+#ifdef OUTLINE
+ printf(" ACCEPT\n");
+#endif
+ }
+#ifdef OUTLINE
+ else
+ printf(" REJECT\n");
+#endif
+ }
+#ifdef OUTLINE
+ else
+ printf(" SKIP\n");
+ else
+ printf(" NO OVERLAP\n");
+ fflush(stdout);
+#endif
+ }
+
+ else if (ipath->aepos > alpos)
+ { int ab, bb;
+ int ae, be;
+ int ar, br;
+ int af, bf;
+
+ ab = ipath->abpos;
+ bb = ipath->bbpos;
+ ae = ipath->aepos;
+ be = ipath->bepos;
+
+ ar = ab;
+ br = bb;
+ if (ab > 0 && bb > 0)
+ { Find_Extension(ralign,work,spec,ab-bb,ab+bb,-1,-1,1);
+ ar = ralign->path->abpos;
+ br = ralign->path->bbpos;
+#ifdef OUTLINE
+ printf(" Rev: (%d,%d)",ab,bb);
+ printf(" -> (%d,%d)",ar,br);
+ printf(" %d",ralign->path->diffs);
+ fflush(stdout);
+#endif
+ if (ar == 0 || br == 0 || ralign->path->diffs <= .35*(ab-ar))
+ {
+#ifdef OUTLINE
+ printf(" OK\n");
+ fflush(stdout);
+#endif
+ if (ab - 10 < ar)
+ { uint16 *trace = (uint16 *) ipath->trace;
+ int tlen = ipath->tlen;
+
+ if (tlen > 0)
+ { trace[0] += ab - ar;
+ trace[1] += bb - br;
+ }
+ ipath->abpos = ar;
+ ipath->bbpos = br;
+ }
+ else
+ { convert_trace(ralign->path);
+ fusion(ralign->path,ipath,2);
+ }
+ }
+ else
+ { ar = ab;
+ br = bb;
+#ifdef OUTLINE
+ printf(" NOTOK\n");
+ fflush(stdout);
+#endif
+ }
+ }
+
+ af = ae;
+ bf = be;
+ if (ae < alen && be < blen)
+ { Find_Extension(falign,work,spec,ae-be,ae+be,-1,-1,0);
+ af = falign->path->aepos;
+ bf = falign->path->bepos;
+#ifdef OUTLINE
+ printf(" Fow: (%d,%d)",ae,be);
+ printf(" -> (%d,%d)",af,bf);
+ printf(" %d",falign->path->diffs);
+ fflush(stdout);
+#endif
+ if (af == alen || bf == blen || falign->path->diffs <= .35*(af-ae))
+ {
+#ifdef OUTLINE
+ printf(" OK\n");
+ fflush(stdout);
+#endif
+ if (ae + 10 > af)
+ { uint16 *trace = (uint16 *) ipath->trace;
+ int tlen = ipath->tlen;
+
+ if (tlen > 0)
+ { trace[tlen-2] += af-ae;
+ trace[tlen-1] += bf-be;
+ }
+ ipath->aepos = af;
+ ipath->bepos = bf;
+ }
+ else
+ { convert_trace(falign->path);
+ fusion(ipath,falign->path,1);
+ }
+ }
+ else
+ { af = ae;
+ bf = be;
+#ifdef OUTLINE
+ printf(" NOTOK\n");
+ fflush(stdout);
+#endif
+ }
+ }
+
+ alpos = af;
+ if (af-ar >= MIN_LEN)
+ { ovla->aread = AFIRST + ap;
+ ovla->bread = BFIRST + bp;
+ ovla->flags = ralign->flags;
+#ifdef OUTLINE
+ printf(" Final: %d[%d..%d] vs %d[%d..%d] %c d=%d\n",
+ ovla->aread,ovla->path.abpos,ovla->path.aepos,
+ ovla->bread,ovla->path.bbpos,ovla->path.bepos,
+ (COMP(ovla->flags) ? 'c' : 'n'), ovla->path.diffs);
+ fflush(stdout);
+#endif
+#ifdef SHOW_FINAL
+ show_overlap(ovla);
+#endif
+#ifdef SHOW_ALIGNMENTS
+ fpath = *ipath;
+ Compute_Trace_IRR(falign,work,GREEDIEST);
+ Print_Alignment(stdout,falign,work,4,80,10,0,6);
+#else
+ Write_Overlap(OUTPUT,ovla,sizeof(uint16));
+ WOVLS += 1;
+#endif
+ }
+#ifdef OUTLINE
+ else
+ printf(" NO OVERLAP\n");
+#endif
+ fusion(NULL,NULL,0);
+ }
+
+#ifdef OUTLINE
+ else
+ printf(" SKIP\n");
+ fflush(stdout);
+#endif
+
+ if (COMP(ralign->flags))
+ Complement_Seq(ralign->bseq,blen);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+
+static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int trace)
+{ static Overlap *ovls = NULL;
+ static int omax = 500;
+ static uint16 *paths = NULL;
+ static int pmax = 100000;
+
+ int64 i, j, novl;
+ int n, a;
+ int pcur;
+ int max;
+
+ if (ovls == NULL)
+ { ovls = (Overlap *) Malloc(sizeof(Overlap)*omax,"Allocating overlap buffer");
+ if (ovls == NULL)
+ exit (1);
+ }
+ if (trace && paths == NULL)
+ { paths = (uint16 *) Malloc(sizeof(uint16)*pmax,"Allocating path buffer");
+ if (paths == NULL)
+ exit (1);
+ }
+
+ rewind(input);
+ fread(&novl,sizeof(int64),1,input);
+ fread(&TRACE_SPACING,sizeof(int),1,input);
+ if (TRACE_SPACING <= TRACE_XOVR)
+ { TBYTES = sizeof(uint8);
+ SMALL = 1;
+ }
+ else
+ { TBYTES = sizeof(uint16);
+ SMALL = 0;
+ }
+
+ if (novl <= 0)
+ return (0);
+
+ Read_Overlap(input,ovls);
+ if (trace)
+ { if (ovls[0].path.tlen > pmax)
+ { pmax = 1.2*(ovls[0].path.tlen)+10000;
+ paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer");
+ if (paths == NULL) exit (1);
+ }
+ fread(paths,TBYTES,ovls[0].path.tlen,input);
+ if (TBYTES == 1)
+ { ovls[0].path.trace = paths;
+ Decompress_TraceTo16(ovls);
+ }
+ }
+ else
+ fseek(input,TBYTES*ovls[0].path.tlen,SEEK_CUR);
+
+ pcur = 0;
+ n = max = 0;
+ for (j = ovls[0].aread; j < ADB_olast; j++)
+ { ovls[0] = ovls[n];
+ a = ovls[0].aread;
+ if (a != j)
+ n = 0;
+ else
+ { if (trace)
+ memmove(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen);
+ n = 1;
+ pcur = ovls[0].path.tlen;
+ while (1)
+ { if (Read_Overlap(input,ovls+n) != 0)
+ { ovls[n].aread = INT32_MAX;
+ break;
+ }
+ if (trace)
+ { if (pcur + ovls[n].path.tlen > pmax)
+ { pmax = 1.2*(pcur+ovls[n].path.tlen)+10000;
+ paths = (uint16 *) Realloc(paths,sizeof(uint16)*pmax,"Expanding path buffer");
+ if (paths == NULL) exit (1);
+ }
+ fread(paths+pcur,TBYTES,ovls[n].path.tlen,input);
+ if (TBYTES == 1)
+ { ovls[n].path.trace = paths+pcur;
+ Decompress_TraceTo16(ovls+n);
+ }
+ }
+ else
+ fseek(input,TBYTES*ovls[n].path.tlen,SEEK_CUR);
+ if (ovls[n].aread != a)
+ break;
+ pcur += ovls[n].path.tlen;
+ n += 1;
+ if (n >= omax)
+ { omax = 1.2*n + 100;
+ ovls = (Overlap *) Realloc(ovls,sizeof(Overlap)*omax,"Expanding overlap buffer");
+ if (ovls == NULL) exit (1);
+ }
+ }
+
+ if (n >= max)
+ max = n;
+ pcur = 0;
+ for (i = 0; i < n; i++)
+ { ovls[i].path.trace = paths+pcur;
+ pcur += ovls[i].path.tlen;
+ }
+ }
+
+ if (j >= ADB_ofirst)
+ ACTION(j,ovls,n);
+ }
+
+ return (max);
+}
+
+
+int main(int argc, char *argv[])
+{ HITS_TRACK *map1, *map2;
+
+ // Process arguments
+
+ { int i, j, k;
+ int flags[128];
+ char *eptr;
+
+ ARG_INIT("DASrealign")
+
+ MIN_LEN = 800;
+
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ switch (argv[i][1])
+ { default:
+ ARG_FLAGS("v")
+ break;
+ case 'l':
+ ARG_POSITIVE(MIN_LEN,"Minimum piece length to recompute")
+ break;
+ }
+ else
+ argv[j++] = argv[i];
+ argc = j;
+
+ VERBOSE = flags['v'];
+
+ if (argc != 5)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+ }
+
+ { int status;
+
+ status = Open_DB(argv[1],ADB);
+ if (status < 0)
+ exit (1);
+ if (status == 1)
+ { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ if ( ! ADB->part)
+ { fprintf(stderr,"%s: Must be called on a block: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ Trim_DB(ADB);
+
+ Read_All_Sequences(ADB,0);
+ }
+
+ if (strcmp(argv[1],argv[2]) == 0)
+ BDB = ADB;
+ else
+ { int status;
+
+ status = Open_DB(argv[2],BDB);
+ if (status < 0)
+ exit (1);
+ if (status == 1)
+ { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ if ( ! BDB->part)
+ { fprintf(stderr,"%s: Must be called on a block: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ Trim_DB(BDB);
+
+
+ Read_All_Sequences(BDB,0);
+ }
+ AFIRST = ADB->tfirst;
+ BFIRST = BDB->tfirst;
+
+ map1 = Load_Track(ADB,"map");
+ if (map1 != NULL)
+ { int i, o, q, n;
+
+ AMAP_IDX = (int64 *) map1->anno;
+ AMAP = (int *) map1->data;
+ for (i = 0; i <= ADB->nreads; i++)
+ AMAP_IDX[i] /= sizeof(int);
+
+ ADB_ofirst = AMAP[AMAP_IDX[0]];
+ ADB_olast = AMAP[AMAP_IDX[ADB->nreads-1]]+1;
+ IAMAP = (int *) Malloc(sizeof(int)*((ADB_olast-ADB_ofirst)+1),"Inverse map") - ADB_ofirst;
+
+ IAMAP[q = ADB_olast] = n = ADB->nreads;
+ for (i = ADB->nreads-1; i >= 0; i--)
+ { o = AMAP[AMAP_IDX[i]];
+ if (q > o)
+ while (--q > o)
+ IAMAP[q] = n;
+ IAMAP[o] = n = i;
+ }
+ }
+ else
+ { fprintf(stderr,"%s: Must have a 'map' track, run DASedit\n",Prog_Name);
+ exit (1);
+ }
+
+ if (BDB == ADB)
+ { map2 = map1;
+ BMAP_IDX = AMAP_IDX;
+ BMAP = AMAP;
+ BDB_ofirst = ADB_ofirst;
+ BDB_olast = ADB_olast ;
+ IBMAP = IAMAP;
+ }
+ else
+ { map2 = Load_Track(BDB,"map");
+ if (map2 != NULL)
+ { int i, o, q, n;
+
+ BMAP_IDX = (int64 *) map2->anno;
+ BMAP = (int *) map2->data;
+ for (i = 0; i <= BDB->nreads; i++)
+ BMAP_IDX[i] /= sizeof(int);
+
+ BDB_ofirst = BMAP[BMAP_IDX[0]];
+ BDB_olast = BMAP[BMAP_IDX[BDB->nreads-1]]+1;
+ IBMAP = (int *) Malloc(sizeof(int)*((BDB_olast-BDB_ofirst)+1),"Inverse map") - BDB_ofirst;
+
+ IBMAP[q = BDB_olast] = n = BDB->nreads;
+ for (i = BDB->nreads-1; i >= 0; i--)
+ { o = BMAP[BMAP_IDX[i]];
+ if (q > o)
+ while (--q > o)
+ IBMAP[q] = n;
+ IBMAP[o] = n = i;
+ }
+ }
+ else
+ { fprintf(stderr,"%s: Must have a 'map' track, run DASedit\n",Prog_Name);
+ exit (1);
+ }
+ }
+
+ ADB_ofirst = AMAP[AMAP_IDX[0]];
+ BDB_ofirst = BMAP[BMAP_IDX[0]];
+ ADB_olast = AMAP[AMAP_IDX[ADB->nreads-1]]+1;
+ BDB_olast = BMAP[BMAP_IDX[BDB->nreads-1]]+1;
+
+ // Open .las and process piles therein output new piles to F.las
+
+ { FILE *input;
+ char *las, *pwd;
+ char *lasT, *pwdT;
+ int64 novl;
+
+ las = Root(argv[3],".las");
+ pwd = PathTo(argv[3]);
+
+ lasT = Root(argv[4],".las");
+ pwdT = PathTo(argv[4]);
+
+ if (strcmp(las,lasT) == 0 && strcmp(pwd,pwdT) == 0)
+ { fprintf(stderr,"%s: source and target are the same !\n",Prog_Name);
+ exit (1);
+ }
+
+ input = Fopen(Catenate(pwd,"/",las,".las"),"r");
+ OUTPUT = Fopen(Catenate(pwdT,"/",lasT,".las"),"w");
+ if (input == NULL || OUTPUT == NULL)
+ exit (1);
+ free(pwd);
+ free(las);
+
+ WOVLS = 0;
+ TRACE_SPACING = 0;
+ fwrite(&WOVLS,sizeof(int64),1,OUTPUT);
+ fwrite(&TRACE_SPACING,sizeof(int),1,OUTPUT);
+
+ fread(&novl,sizeof(int64),1,input);
+ fread(&TRACE_SPACING,sizeof(int),1,input);
+
+ make_a_pass(input,EXTENDER,1);
+
+ rewind(OUTPUT);
+ fwrite(&WOVLS,sizeof(int64),1,OUTPUT);
+ fclose(OUTPUT);
+ }
+
+ exit (0);
+}
diff --git a/DAStrim.c b/DAStrim.c
index 7932981..ef01807 100644
--- a/DAStrim.c
+++ b/DAStrim.c
@@ -3,9 +3,10 @@
* Using overlap pile for each read and intrinisic quality values, determine the
* high quality segments with interspersed gaps. Any unremoved
* adaptemer sequences are dectected and the shorter side trimmed.
+ * Every gap is analyzed and either patched or splits the read.
*
* Author: Gene Myers
- * Date : March 2015
+ * Date : June 2016
*
*******************************************************************************************/
@@ -25,31 +26,45 @@
#endif
#undef DEBUG_HQ_BLOCKS // Various DEBUG flags (normally all off)
-#undef SHOW_EVENTS
+#undef SHOW_EVENTS
#undef DEBUG_HOLE_FINDER
#undef DEBUG_GAP_STATUS
-#undef SHOW_PAIRS
-#define ANNOTATE
+#undef SHOW_PAIRS
+#undef DEBUG_SUMMARY
+
+#undef ANNOTATE // Output annotation tracks for DaViewer
// Command format and global parameter variables
-static char *Usage = " [-v] [-l<int(1000)>] -g<int> -b<int> <source:db> <overlaps:las> ...";
+static char *Usage = " [-v] -g<int> -b<int> <source:db> <overlaps:las> ...";
static int BAD_QV; // qv >= and you are "bad"
static int GOOD_QV; // qv <= and you are "good"
-static int MIN_LEN; // Minimum segment length to keep
+static int HGAP_MIN; // less than this length do not process for HGAP
static int VERBOSE;
+// Gap states
+
+#define LOWQ 0 // Gap is spanned by many LAs and patchable
+#define SPAN 1 // Gap has many paired LAs and patchable
+#define SPLIT 2 // Gap is a chimer or an unpatchable gap
+#define ADPRE 3 // Gap is due to adaptemer, trim prefix interval to left
+#define ADSUF 4 // Gap is due to adaptemer, trim suffix interval to right
+#define ADAPT 3 // Gap is due to adaptemer (internal only)
+
+static int AdPre = ADPRE;
+static int AdSuf = ADSUF;
-// Good patch constant
+// Good patch constants
#define MIN_BLOCK 500 // Minimum length of a good patch
// Gap constants
-#define MIN_COVER 3 // A coverage gap occurs at or below this level
-#define COVER_LEN 400 // An overlap covers a point if it extends COVER_LEN to either side.
+#define MIN_COVER 3 // A coverage gap occurs at or below this level
+#define COVER_LEN 400 // An overlap covers a point if it extends COVER_LEN to either side.
+#define ANCHOR_MATCH .25 // Delta in trace interval at both ends of patch must be < this %.
// Wall Constants
@@ -60,7 +75,7 @@ static int VERBOSE;
// Global Variables (must exist across the processing of each pile)
- // Read-only
+ // Input
static int TRACE_SPACING; // Trace spacing (from .las file)
@@ -72,7 +87,11 @@ static int DB_PART; // 0 if all, otherwise block #
static int64 *QV_IDX; // qual track index
static uint8 *QV; // qual track values
- // Read & Write
+ // Output
+
+static FILE *TR_AFILE; // .trim.anno
+static FILE *TR_DFILE; // .trim.data
+static int64 TR_INDEX; // Current index into .trim.data file as it is being written
#ifdef ANNOTATE
@@ -102,6 +121,8 @@ static int64 KP_INDEX; // Current index into .keep.data file as it is being
#endif
+ // Statistics
+
static int64 nreads, totlen;
static int64 nelim, nelimbp;
static int64 n5trm, n5trmbp;
@@ -112,6 +133,7 @@ static int64 ngaps, ngapsbp;
static int64 nspan, nspanbp;
static int64 nchim, nchimbp;
+
// Data Structures
typedef struct // General read interval [beg..end]
@@ -191,7 +213,7 @@ static Interval *HQ_BLOCKS(int aread, int *p_nblk)
// Find all blocks < BAD_QV with either len >= MIN_BLOCK or all <= GOOD_QV in block[0..nblk)
// Mark those satisfying 1. or 2. as "alive" (.alv)
- { int lmost = 0, rmost, thr;
+ { int lmost = 0, rmost = 0, thr;
int i, in;
thr = (MIN_BLOCK-1)/TRACE_SPACING;
@@ -453,7 +475,7 @@ static Wall *find_walls(int novl, Event *queue, int *anum, int *dnum)
// Holes are usually found between HQ-blocks. However occasionally they intersect one or
// more blocks and this requires the HQ-blocks be refined as follows:
// a. Hole spans an HQ-block:
- // The block needs to be removed as HQ *if* it is not based 5 or more LA's
+ // The block needs to be removed as HQ *if* it is not based on 5 or more LA's
// (this usually never happens, 10^-5 or less)
// b. Hole is contained in an HQ-block:
// The block needs to be split around the hole because one needs to verify that
@@ -461,8 +483,8 @@ static Wall *find_walls(int novl, Event *queue, int *anum, int *dnum)
// (this happens occasionaly, ~ 10^-3)
// c. Hole overlaps an HQ-block:
// If this happens, then the overlap is very small and the block is left unperturbed.
- // (this worries me a bit, but in all testing it remains so)
- // Given the above affects, the list of HQ-blocks can be modified by FIND_HOLES.
+ // (this worries me a bit, but in all testing it (very small overlap) remains so)
+ // Given the above possibilities, the list of HQ-blocks can be modified by FIND_HOLES.
static int ESORT(const void *l, const void *r)
{ Event *x = (Event *) l;
@@ -473,8 +495,7 @@ static int ESORT(const void *l, const void *r)
return (x->pos - y->pos);
}
-static Interval *FIND_HOLES(int aread, Overlap *ovls, int novl,
- int *p_nhole, Interval **p_block, int *p_nblk)
+static int FIND_HOLES(int aread, Overlap *ovls, int novl, Interval *block, int nblk)
{ static int nmax = 0;
int nev;
@@ -487,9 +508,6 @@ static Interval *FIND_HOLES(int aread, Overlap *ovls, int novl,
static Interval *cover = NULL; // Coverage at block ends [0..nblk)
static Interval *nwblk; // Modified block list [0..nblk')
- int nblk = *p_nblk; // Initial HQ block list [0..nblk)
- Interval *block = *p_block;
-
int anum = 0, dnum = 0; // LFT and RGT walls, awall[0..anum) & dwall[0..dnum)
Wall *awall, *dwall;
@@ -960,14 +978,37 @@ static Interval *FIND_HOLES(int aread, Overlap *ovls, int novl,
while (p < nblk)
nwblk[q++] = block[p++];
nblk = q;
+
+ // Transfer new blocks to original block vector
+
+ for (i = 0; i < nblk; i++)
+ block[i] = nwblk[i];
+ }
+
+#ifdef ANNOTATE
+ { int i;
+
+ for (i = 0; i < nhole; i++)
+ if (holes[i].end - holes[i].beg < 75)
+ { holes[i].end += 50;
+ holes[i].beg -= 50;
+ fwrite(&(holes[i].beg),sizeof(int),1,HL_DFILE);
+ fwrite(&(holes[i].end),sizeof(int),1,HL_DFILE);
+ holes[i].end -= 50;
+ holes[i].beg += 50;
+ }
+ else
+ { fwrite(&(holes[i].beg),sizeof(int),1,HL_DFILE);
+ fwrite(&(holes[i].end),sizeof(int),1,HL_DFILE);
+ }
+ HL_INDEX += 2*nhole*sizeof(int);
+ fwrite(&HL_INDEX,sizeof(int64),1,HL_AFILE);
}
+#endif
// Return the list of holes holes[0..nhole) and the new list of blocks, nwblk[0..nblk)
- *p_nblk = nblk;
- *p_block = nwblk;
- *p_nhole = nhole;
- return (holes);
+ return (nblk);
}
@@ -977,37 +1018,194 @@ static Interval *FIND_HOLES(int aread, Overlap *ovls, int novl,
*
********************************************************************************************/
+typedef struct
+ { int lidx; // left LA index
+ int ridx; // right LA index
+ int delta; // Difference between A-gap and B-gap
+ } Spanner;
+
+typedef struct
+ { int bread; // bread^comp[beg..end] is the patch sequence
+ int comp;
+ int beg;
+ int end;
+ int anc; // maximum anchor interval match
+ int bad; // number of segments that are bad
+ int avg; // average QV of the patch
+ } Patch;
+
static int GSORT(const void *l, const void *r)
-{ int x = *((int *) l);
- int y = *((int *) r);
+{ Spanner *x = (Spanner *) l;
+ Spanner *y = (Spanner *) r;
+
+ return (x->delta - y->delta);
+}
+
+#ifdef DEBUG_GAP_STATUS
+
+static int ASORT(const void *l, const void *r)
+{ int *x = (int *) l;
+ int *y = (int *) r;
+
+ return (*x - *y);
+}
+
+#endif
+
+// Return match score of lov->bread with "anchor" lov->aread[lft-TRACE_SPACING,lft]
+
+static int eval_lft_anchor(int lft, Overlap *lov)
+{ uint16 *tr;
+ int te;
+
+ if (lft > lov->path.aepos)
+ return (50);
+ tr = (uint16 *) lov->path.trace;
+ te = 2 * (((lft + (TRACE_SPACING-1)) - lov->path.abpos)/TRACE_SPACING);
+ if (te <= 0)
+ return (50);
+ return (tr[te-2]);
+}
- return (x - y);
+// Return match score of lov->bread with "anchor" lov->aread[rgt,rgt+TRACE_SPACING]
+
+static int eval_rgt_anchor(int rgt, Overlap *rov)
+{ uint16 *tr;
+ int te;
+
+ if (rgt < rov->path.abpos)
+ return (50);
+ tr = (uint16 *) rov->path.trace;
+ te = 2 * (((rgt + (TRACE_SPACING-1)) - rov->path.abpos)/TRACE_SPACING);
+ if (te >= rov->path.tlen)
+ return (50);
+ return (tr[te]);
}
-#define LOWQ 0
-#define SPAN 1
-#define SPLIT 2
-#define ADAPT 3
+// Evaluate the quality of lov->bread = rov->bread spaning [lcv,rcv] as a patch
+
+static Patch *compute_patch(int lft, int rgt, Overlap *lov, Overlap *rov)
+{ static Patch ans;
+
+ uint16 *tr;
+ int bread, bcomp, blen;
+ int bb, be;
+ int t, te;
+ int bl, br;
+ uint8 *qb;
+ int avg, anc, bad;
+
+ bread = lov->bread;
+ bcomp = COMP(lov->flags);
+ blen = DB->reads[bread].rlen;
+
+ if (blen < HGAP_MIN)
+ return (NULL);
+
+ if (lft > lov->path.aepos || rgt < rov->path.abpos) // Cannot anchor
+ return (NULL);
+ if (lov->path.abpos > lft-TRACE_SPACING || rgt+TRACE_SPACING > rov->path.aepos)
+ return (NULL);
+
+ // Get max of left and right anchors as anchor score
+
+ tr = (uint16 *) lov->path.trace;
+ te = 2 * (((lft + (TRACE_SPACING-1)) - lov->path.abpos)/TRACE_SPACING);
+ if (te == 0)
+ return (NULL);
+ anc = tr[te-2];
+
+ bb = lov->path.bbpos;
+ for (t = 1; t < te; t += 2)
+ bb += tr[t];
+
+ tr = (uint16 *) rov->path.trace;
+ te = 2 * (((rgt + (TRACE_SPACING-1)) - rov->path.abpos)/TRACE_SPACING);
+ if (te >= rov->path.tlen)
+ return (NULL);
+ if (tr[te] > anc)
+ anc = tr[te];
+
+ be = rov->path.bepos;
+ for (t = rov->path.tlen-1; t > te; t -= 2)
+ be -= tr[t];
+
+ if (bb >= be)
+ return (NULL);
+
+ ans.bread = bread;
+ ans.comp = bcomp;
+ ans.beg = bb;
+ ans.end = be;
+ ans.anc = anc;
+
+ // Compute metrics for b-read patch
+
+ if (bcomp)
+ { t = blen - be;
+ be = blen - bb;
+ bb = t;
+ }
+
+ bl = bb/TRACE_SPACING;
+ br = (be+(TRACE_SPACING-1))/TRACE_SPACING;
+ qb = QV + QV_IDX[bread];
+ if (bl >= br)
+ { avg = qb[bl];
+ if (avg >= BAD_QV)
+ bad = 1;
+ else
+ bad = 0;
+ }
+ else
+ { avg = 0;
+ bad = 0;
+ for (t = bl; t < br; t++)
+ { avg += qb[t];
+ if (qb[t] >= BAD_QV)
+ bad += 1;
+ }
+ avg /= (br-bl);
+ }
+
+ ans.bad = bad;
+ ans.avg = avg;
-static int gap_status(Overlap *ovls, int novl, Interval *rblock)
+ return (&ans);
+}
+
+// Categorize each gap and if appropriate return the best patch for each
+
+static int gap_status(Overlap *ovls, int novl, Interval *lblock, Interval *rblock,
+ int *p_lft, int *p_rgt)
{ static int nmax = 0;
- static int *gsort = NULL; // A-B delta for all B-reads spanning a gap
- static int *asort = NULL; // A-B delta for all B-reads spanning a gap
+ static Spanner *gsort = NULL; // A-B delta and idx-pair for all B-reads spanning a gap
+ static int *asort = NULL; // A-B delta for all B-reads spanning a gap
- Interval *lblock = rblock-1;
+ static int ANCHOR_THRESH;
+
+ static Interval *FirstB;
+ static Interval *LastB;
int j;
int lft, rgt;
int lcv, rcv;
int cnt;
- if (novl > nmax)
- { nmax = 1.2*novl + 500;
- gsort = (int *) Realloc(gsort,nmax*sizeof(int),"Allocating gap vector");
- asort = (int *) Realloc(asort,nmax*sizeof(int),"Allocating adaptemer position vector");
- if (gsort == NULL || asort == NULL)
- exit (1);
+ if (p_lft == NULL)
+ { if (novl > nmax)
+ { nmax = 1.2*novl + 500;
+ gsort = (Spanner *) Realloc(gsort,nmax*sizeof(Spanner),"Allocating gap vector");
+ asort = (int *) Realloc(asort,nmax*sizeof(int),"Allocating adaptemer position vector");
+ if (gsort == NULL || asort == NULL)
+ exit (1);
+ ANCHOR_THRESH = ANCHOR_MATCH * TRACE_SPACING;
+ }
+
+ FirstB = lblock;
+ LastB = rblock-1;
+ return (0);
}
lft = lblock->end;
@@ -1020,9 +1218,11 @@ static int gap_status(Overlap *ovls, int novl, Interval *rblock)
rcv = rblock->end;
#ifdef DEBUG_GAP_STATUS
- printf(" GAP [%5d,%5d]\n",lcv,rcv);
+ printf(" GAP [%5d,%5d] <%5d,%5d>\n",lft,rgt,lcv,rcv);
#endif
+ // If the gap flank [lcv,rcv] is covered by 10 or more LAs, then a LOWQ gap
+
cnt = 0;
for (j = 0; j < novl; j++)
if (ovls[j].path.abpos <= lcv && ovls[j].path.aepos >= rcv)
@@ -1030,12 +1230,31 @@ static int gap_status(Overlap *ovls, int novl, Interval *rblock)
if (cnt >= 10)
break;
}
+
+ // If so and it is patchable then report LOWQ
+
if (cnt >= 10)
- {
+ { for (j = 0; j < novl; j++)
+ if (ovls[j].path.abpos <= lcv && ovls[j].path.aepos >= rcv)
+ { Patch *can;
+
+ can = compute_patch(lft,rgt,ovls+j,ovls+j);
+
+ if (can == NULL) continue;
+
+ if (can->anc <= ANCHOR_THRESH && can->avg <= GOOD_QV && can->bad == 0)
+ {
+#ifdef DEBUG_GAP_STATUS
+ printf(" LOWQ PATCHABLE = %d%c[%d..%d] %d (%d)\n",
+ can->bread,can->comp?'c':'n',can->beg,
+ can->end,can->anc,can->avg);
+#endif
+ return (LOWQ);
+ }
+ }
#ifdef DEBUG_GAP_STATUS
- printf(" LOWQ\n");
+ printf(" FAILING TO PATCH_LOWQ\n");
#endif
- return (LOWQ);
}
{ int bread, bcomp, blen;
@@ -1044,6 +1263,8 @@ static int gap_status(Overlap *ovls, int novl, Interval *rblock)
int lidx, ridx, sidx, cidx;
int k;
+ // Find LA pairs or LAs spanning the gap flank [lcv,rcv]
+
lcnt = rcnt = scnt = gcnt = acnt = 0;
for (j = 0; j < novl; j = k)
{ bread = ovls[j].bread;
@@ -1052,11 +1273,11 @@ static int gap_status(Overlap *ovls, int novl, Interval *rblock)
if (bcomp)
cidx = j;
- lidx = ridx = sidx = -1;
+ lidx = ridx = sidx = -1; // For all LA's with same b-read
for (k = j; k < novl; k++)
{ if (ovls[k].bread != bread)
break;
- if (COMP(ovls[k].flags) != (uint32) bcomp)
+ if (COMP(ovls[k].flags) != (uint32) bcomp) // Note when b switches orientation
{ cidx = k;
bcomp = COMP(ovls[k].flags);
}
@@ -1071,30 +1292,28 @@ static int gap_status(Overlap *ovls, int novl, Interval *rblock)
printf(" ");
#endif
+ // Is LA a spanner, left-partner, or right partner
+
if (ab <= lcv && ae >= rcv)
{ sidx = k;
+ lidx = ridx = -1;
continue;
}
- // Duplicate left or right hits were due mainly to low complexity sequence
- // Just ignore, you want to lose the left or right have that is bad
- // Duplicate pairs are due entirely to unremoved adapters (palindromes)
- // Let the complement pair go through, both are equivalent
-
#ifdef SHOW_PAIRS
- if (ae >= rcv && ab <= rcv && ab - ovls[k].path.bbpos <= lft - MIN_LEN)
+ if (ae >= rcv && ab <= rcv && ab - ovls[k].path.bbpos <= lft - COVER_LEN)
printf("r");
else
printf(" ");
- if (ab <= lcv && ae >= lcv && ae + (blen-ovls[j].path.bepos) >= rgt + MIN_LEN)
+ if (ab <= lcv && ae >= lcv && ae + (blen-ovls[j].path.bepos) >= rgt + COVER_LEN)
printf("l");
else
printf(" ");
#endif
- if (ae >= rcv && ab <= rcv && ab - ovls[k].path.bbpos <= lft - MIN_LEN)
+ if (ae >= rcv && ab <= rcv && ab - ovls[k].path.bbpos <= lft - COVER_LEN)
ridx = k;
- if (ab <= lcv && ae >= lcv && ae + (blen-ovls[j].path.bepos) >= rgt + MIN_LEN)
+ if (ab <= lcv && ae >= lcv && ae + (blen-ovls[j].path.bepos) >= rgt + COVER_LEN)
lidx = k;
}
if (! bcomp)
@@ -1114,18 +1333,31 @@ static int gap_status(Overlap *ovls, int novl, Interval *rblock)
printf(" A");
#endif
+ // Add spanning LA or spanning pair to gsort list. Add contra pairs to asort list.
+
if (sidx >= 0)
- scnt += 1;
- if (lidx >= 0)
- lcnt += 1;
- if (ridx >= 0)
- rcnt += 1;
- if (0 <= lidx && lidx < ridx && (ridx < cidx || cidx <= lidx))
- gsort[gcnt++] = (ovls[ridx].path.abpos - ovls[lidx].path.aepos)
- - (ovls[ridx].path.bbpos - ovls[lidx].path.bepos);
- if ((0<=ridx && ridx<cidx && cidx<=lidx) || (0<=lidx && lidx<cidx && cidx<=ridx))
- asort[acnt++] = (((blen-ovls[ridx].path.bbpos) - ovls[lidx].path.bepos)
- + (ovls[lidx].path.aepos + ovls[ridx].path.abpos))/2;
+ { gsort[gcnt].delta = DB->maxlen;
+ gsort[gcnt].lidx = sidx;
+ gsort[gcnt].ridx = sidx;
+ gcnt += 1;
+ scnt += 1;
+ }
+ else
+ { if (lidx >= 0)
+ lcnt += 1;
+ if (ridx >= 0)
+ rcnt += 1;
+ if (0 <= lidx && lidx < ridx && (ridx < cidx || cidx <= lidx))
+ { gsort[gcnt].delta = (ovls[ridx].path.abpos - ovls[lidx].path.aepos)
+ - (ovls[ridx].path.bbpos - ovls[lidx].path.bepos);
+ gsort[gcnt].lidx = lidx;
+ gsort[gcnt].ridx = ridx;
+ gcnt += 1;
+ }
+ else if ((0<=ridx && ridx<cidx && cidx<=lidx) || (0<=lidx && lidx<cidx && cidx<=ridx))
+ asort[acnt++] = (((blen-ovls[ridx].path.bbpos) - ovls[lidx].path.bepos)
+ + (ovls[lidx].path.aepos + ovls[ridx].path.abpos))/2;
+ }
}
#ifdef SHOW_PAIRS
@@ -1133,69 +1365,336 @@ static int gap_status(Overlap *ovls, int novl, Interval *rblock)
#endif
#ifdef DEBUG_GAP_STATUS
- printf(" lcnt = %d scnt = %d(%d) rcnt = %d acnt = %d\n",lcnt,gcnt,scnt,rcnt,acnt);
+ printf(" lcnt = %d scnt = %d(%d) rcnt = %d acnt = %d\n",lcnt,gcnt-scnt,scnt,rcnt,acnt);
#endif
- { int64 med, dev = 0;
- int std, low, hgh;
+ { int64 med, dev;
+ int std, avg;
+ int low, hgh;
if (lcnt < rcnt)
rcnt = lcnt;
+
+ // Lots of contra pairs and less spanning support, call it an adaptamer gap.
- if (acnt >= .4*rcnt && scnt+gcnt < .3*acnt)
- { qsort(asort,acnt,sizeof(int),GSORT);
+ if (acnt >= .4*rcnt && gcnt < .3*acnt)
+ {
+#ifdef DEBUG_GAP_STATUS
+ qsort(asort,acnt,sizeof(int),ASORT);
med = asort[acnt/2];
low = acnt*.25;
hgh = acnt*.75;
+ dev = 0;
for (j = low; j <= hgh; j++)
- dev = (asort[j]-med)*(asort[j]-med);
+ dev += (asort[j]-med)*(asort[j]-med);
std = sqrt((1.*dev)/acnt);
if (std > 200)
- printf(" Warning: Read %d adaptemer test may be wrong\n",ovls[0].aread);
-#ifdef DEBUG_GAP_STATUS
+ printf(" UNCERTAIN, std. dev. large\n");
printf(" ADAPT %3d\n",std);
#endif
return (ADAPT);
}
+
+ // Examine the spanning pairs for the gap and compute average and deviation
+ // of gap deltas for second and third quartile
- qsort(gsort,gcnt,sizeof(int),GSORT);
- med = gsort[gcnt/2];
- low = gcnt*.25;
- hgh = gcnt*.75;
- for (j = low; j <= hgh; j++)
- dev = (gsort[j]-med)*(gsort[j]-med);
- std = sqrt((1.*dev)/gcnt);
-
- if (std < 150 && scnt+gcnt >= 10 && (scnt+gcnt >= .4*rcnt || scnt+gcnt >= 20))
- {
-#ifdef DEBUG_GAP_STATUS
- printf(" SPAN %3d %5d\n",std,rgt-lft);
-#endif
- return (SPAN);
+ qsort(gsort,gcnt,sizeof(Spanner),GSORT);
+ gcnt -= scnt;
+
+ if (gcnt >= 4)
+ { med = gsort[gcnt/2].delta;
+ low = gcnt*.25;
+ hgh = gcnt*.75;
+ dev = 0;
+ avg = 0;
+ for (j = low; j <= hgh; j++)
+ { dev += (gsort[j].delta-med)*(gsort[j].delta-med);
+ avg += gsort[j].delta;
+ }
+ std = sqrt((1.*dev)/gcnt);
+ avg = avg/((hgh-low)+1);
+ low = avg-4*std;
+ hgh = avg+4*std;
}
else
+ std = 0;
+
+ // If the pairing gap deviation is too large or there are too few pairs or
+ // most potential partners are not paired, then be safe and split it.
+
+ gcnt += scnt;
+ if (std >= 150 || gcnt < 10 || (gcnt < .4*rcnt && gcnt < 20))
{
#ifdef DEBUG_GAP_STATUS
if (rcnt >= 20)
printf(" STRONG SPLIT\n");
else
printf(" WEAK SPLIT\n");
- if (scnt + gcnt >= 10)
- printf(" UNCERTAIN %5.1f %5d %3d\n",(scnt+gcnt)/(.01*rcnt),rgt-lft,scnt+gcnt);
+ if (gcnt >= 10)
+ printf(" UNCERTAIN %5.1f %5d %3d\n",(scnt+gcnt)/(1.*rcnt),rgt-lft,gcnt);
#endif
return (SPLIT);
}
+
+ // Otherwise consider the gap linkable and try to find a viable patch, declaring a split
+ // iff all patch attemtps fail
+
+ else
+ { Patch *can;
+ int ncand;
+ uint8 *qa;
+ Interval *clb, *crb;
+
+ qa = QV + QV_IDX[ovls[0].aread];
+ clb = lblock;
+ crb = rblock;
+
+ // First make sure enough partners provide anchors, and if not
+ // shift them back to the next good segment of A-read
+
+ { int nshort;
+
+ nshort = 0;
+ for (j = 0; j < gcnt; j++)
+ { if (lft > ovls[gsort[j].lidx].path.aepos)
+ nshort += 1;
+ }
+ if (nshort > .2*gcnt)
+ do
+ { lft -= TRACE_SPACING;
+ if (lft <= clb->beg)
+ { if (clb <= FirstB)
+ break;
+ clb -= 1;
+ lft = clb->end;
+ }
+ }
+ while (qa[lft/TRACE_SPACING-1] > GOOD_QV);
+
+ nshort = 0;
+ for (j = 0; j < gcnt; j++)
+ { if (rgt < ovls[gsort[j].ridx].path.abpos)
+ nshort += 1;
+ }
+ if (nshort > .2*gcnt)
+ do
+ { rgt += TRACE_SPACING;
+ if (rgt >= crb->end)
+ { if (crb >= LastB)
+ break;
+ crb += 1;
+ rgt = crb->beg;
+ }
+ }
+ while (qa[rgt/TRACE_SPACING] > GOOD_QV);
+
+ // Could not find primary anchor pair, then declare a SPLIT
+
+ if (clb < FirstB || crb > LastB)
+ {
+#ifdef DEBUG_GAP_STATUS
+ printf(" ANCHOR FAIL (BOUNDS)\n");
+#endif
+ return (SPLIT);
+ }
+ }
+
+ // Count all patch candidates that have a good anchor pair
+
+ ncand = 0;
+ for (j = 0; j < gcnt; j++)
+ { lidx = gsort[j].lidx;
+ ridx = gsort[j].ridx;
+
+#ifdef DEBUG_GAP_STATUS
+ if (lidx != ridx)
+ printf(" %5d [%5d,%5d] [%5d,%5d] %4d",
+ ovls[lidx].bread,ovls[lidx].path.abpos,ovls[lidx].path.aepos,
+ ovls[ridx].path.abpos,ovls[ridx].path.aepos,gsort[j].delta);
+ else
+ printf(" %5d [%5d,%5d] SSS",
+ ovls[lidx].bread,ovls[lidx].path.abpos,ovls[lidx].path.aepos);
+#endif
+
+ can = compute_patch(lft,rgt,ovls+lidx,ovls+ridx);
+
+ if (can != NULL)
+ {
+#ifdef DEBUG_GAP_STATUS
+ printf(" %d",can->end-can->beg);
+#endif
+ if (can->anc <= ANCHOR_THRESH)
+ { ncand += 1;
+#ifdef DEBUG_GAP_STATUS
+ printf(" AA %d %d(%d)",can->anc,can->bad,can->avg);
+#endif
+ }
+ }
+#ifdef DEBUG_GAP_STATUS
+ printf("\n");
+#endif
+ }
+
+ // If there are less than 5 of them, then seek better anchor points a bit
+ // further back
+
+ if (ncand < 5)
+ { int x, best, nlft, nrgt;
+ int nanchor, ntry;
+
+#ifdef DEBUG_GAP_STATUS
+ printf(" NOT ENOUGH\n");
+#endif
+
+ // Try 4 additional anchor spots located at good intervals of A (if available)
+ // One can cross other gaps in the search. Try the one with the most
+ // partners having match scores below the anchor threshold. Do this to the
+ // left and right. (A better search could be arranged (i.e. find smallest
+ // spanning pair of adjusted anchors, but this situation happens 1 in 5000
+ // times, so felt it was not worth it).
+
+ ntry = 0;
+ nlft = lft;
+ best = -1;
+ for (x = lft; ntry < 5; x -= TRACE_SPACING)
+ if (x <= clb->beg)
+ { if (clb <= FirstB)
+ break;
+ clb -= 1;
+ x = clb->end + TRACE_SPACING;
+ }
+ else if (qa[x/TRACE_SPACING-1] <= GOOD_QV)
+ { ntry += 1;
+ nanchor = 0;
+ for (j = 0; j < gcnt; j++)
+ if (eval_lft_anchor(x,ovls+gsort[j].lidx) <= ANCHOR_THRESH)
+ nanchor += 1;
+#ifdef DEBUG_GAP_STATUS
+ printf(" %5d: %3d\n",x,nanchor);
+#endif
+ if (nanchor > best)
+ { best = nanchor;
+ nlft = x;
+ }
+ }
+#ifdef DEBUG_GAP_STATUS
+ printf(" %5d->%5d\n",lft,nlft);
+#endif
+
+ ntry = 0;
+ nrgt = rgt;
+ best = -1;
+ for (x = rgt; ntry < 5; x += TRACE_SPACING)
+ if (x >= crb->end)
+ { if (crb >= LastB)
+ break;
+ crb += 1;
+ x = crb->beg - TRACE_SPACING;
+ }
+ else if (qa[x/TRACE_SPACING] <= GOOD_QV)
+ { ntry += 1;
+ nanchor = 0;
+ for (j = 0; j < gcnt; j++)
+ if (eval_rgt_anchor(x,ovls+gsort[j].ridx) <= ANCHOR_THRESH)
+ nanchor += 1;
+#ifdef DEBUG_GAP_STATUS
+ printf(" %5d: %3d\n",x,nanchor);
+#endif
+ if (nanchor > best)
+ { best = nanchor;
+ nrgt = x;
+ }
+ }
+#ifdef DEBUG_GAP_STATUS
+ printf(" %5d->%5d\n",rgt,nrgt);
+#endif
+
+ // If a better candidate pair of anchor points does not exist, then split.
+
+ if (lft == nlft && rgt == nrgt)
+ {
+#ifdef DEBUG_GAP_STATUS
+ printf(" ANCHOR FAIL (ONCE) %d\n",ncand);
+#endif
+ return (SPLIT);
+ }
+ lft = nlft;
+ rgt = nrgt;
+
+ // Check out if the new anchor pair has 5 or more candidate patches
+
+ ncand = 0;
+ for (j = 0; j < gcnt; j++)
+ { lidx = gsort[j].lidx;
+ ridx = gsort[j].ridx;
+
+#ifdef DEBUG_GAP_STATUS
+ if (lidx != ridx)
+ printf(" %5d [%5d,%5d] [%5d,%5d] %4d",
+ ovls[lidx].bread,ovls[lidx].path.abpos,ovls[lidx].path.aepos,
+ ovls[ridx].path.abpos,ovls[ridx].path.aepos,gsort[j].delta);
+ else
+ printf(" %5d [%5d,%5d] SSS",
+ ovls[lidx].bread,ovls[lidx].path.abpos,ovls[lidx].path.aepos);
+#endif
+
+ if (lft <= ovls[lidx].path.aepos && rgt >= ovls[ridx].path.abpos)
+ { can = compute_patch(lft,rgt,ovls+lidx,ovls+ridx);
+
+ if (can != NULL)
+ {
+#ifdef DEBUG_GAP_STATUS
+ printf(" %d",can->end-can->beg);
+#endif
+ if (can->anc <= ANCHOR_THRESH)
+ { ncand += 1;
+#ifdef DEBUG_GAP_STATUS
+ printf(" AA %d %d(%d)",can->anc,can->bad,can->avg);
+#endif
+ }
+ }
+ }
+#ifdef DEBUG_GAP_STATUS
+ printf("\n");
+#endif
+ }
+
+ // Could not arrange 5 patch candidates, give up and split.
+
+ if (ncand < 5)
+ {
+#ifdef DEBUG_GAP_STATUS
+ printf(" ANCHOR FAIL (TWICE) %d\n",ncand);
+#endif
+ return (SPLIT);
+ }
+ }
+
+ *p_lft = lft;
+ *p_rgt = rgt;
+
+#ifdef DEBUG_GAP_STATUS
+ printf(" SPAN %3d %5d: PATCHABLE\n",std,rgt-lft);
+#endif
+ return (SPAN);
+ }
}
}
}
-static int *GAP_ANALYSIS(Overlap *ovls, int novl, Interval *block, int nblk)
+static int *GAP_ANALYSIS(Overlap *ovls, int novl, Interval *block, int *p_nblk)
{ static int bmax = 0;
-
static int *status = NULL; // Status of gaps between HQ_blocks
+
+#ifdef DEBUG_SUMMARY
+ static char *status_string[4] = { "LOWQ", "SPAN", "SPLIT", "ADAPT" };
+#endif
- int i;
+ int nblk;
+ int i, j;
+ int slft = 0, srgt = 0;
+ nblk = *p_nblk;
if (nblk > bmax)
{ bmax = 1.2*nblk + 100;
status = (int *) Realloc(status,bmax*sizeof(int),"Allocating status vector");
@@ -1203,12 +1702,39 @@ static int *GAP_ANALYSIS(Overlap *ovls, int novl, Interval *block, int nblk)
exit (1);
}
+ gap_status(ovls,novl,block,block+nblk,NULL,NULL); // Initialization call
+ j = 0;
+ for (i = 1; i < nblk; i++)
+ { status[i] = gap_status(ovls,novl,block+j,block+i,&slft,&srgt);
+ if (status[i] == SPAN)
+ { while (slft < block[j].beg)
+ j -= 1;
+ block[j].end = slft;
+ j += 1;
+ status[j] = status[i];
+ while (srgt > block[i].end)
+ i += 1;
+ block[j] = block[i];
+ block[j].beg = srgt;
+ }
+ else
+ { j += 1;
+ status[j] = status[i];
+ block[j] = block[i];
+ }
+ }
+ nblk = j+1;
+
+#ifdef DEBUG_SUMMARY
#ifdef DEBUG_GAP_STATUS
- printf(" GAPS\n");
+ printf(" FINAL:\n");
#endif
+ printf(" [%d,%d]",block[0].beg,block[0].end);
for (i = 1; i < nblk; i++)
- status[i] = gap_status(ovls,novl,block+i);
-
+ printf(" %s [%d,%d]",status_string[status[i]],block[i].beg,block[i].end);
+#endif
+
+ *p_nblk = nblk;
return (status);
}
@@ -1233,15 +1759,32 @@ static void GAPS(int aread, Overlap *ovls, int novl)
Interval *block;
int *status;
- int nhole;
- Interval *holes;
-
-#if defined(DEBUG_HQ_BLOCKS) || defined(SHOW_EVENTS) || defined(DEBUG_HOLE_FINDER) || defined(DEBUG_ADAPTER)
+#if defined(DEBUG_HQ_BLOCKS) || defined(DEBUG_HOLE_FINDER)
+ printf("\n");
+ printf("AREAD %d\n",aread);
+#endif
+#if defined(DEBUG_GAP_STATUS) || defined(DEBUG_SUMMARY)
printf("\n");
printf("AREAD %d\n",aread);
#endif
alen = DB->reads[aread].rlen;
+ if (alen < HGAP_MIN)
+ {
+#ifdef ANNOTATE
+ fwrite(&HQ_INDEX,sizeof(int64),1,HQ_AFILE);
+ fwrite(&SN_INDEX,sizeof(int64),1,SN_AFILE);
+ fwrite(&SP_INDEX,sizeof(int64),1,SP_AFILE);
+ fwrite(&AD_INDEX,sizeof(int64),1,AD_AFILE);
+
+ fwrite(&HL_INDEX,sizeof(int64),1,HL_AFILE);
+
+ fwrite(&KP_INDEX,sizeof(int64),1,KP_AFILE);
+#endif
+
+ fwrite(&TR_INDEX,sizeof(int64),1,TR_AFILE);
+ return;
+ }
if (VERBOSE)
{ nreads += 1;
@@ -1252,6 +1795,16 @@ static void GAPS(int aread, Overlap *ovls, int novl)
block = HQ_BLOCKS(aread,&nblk);
+ // Find holes and modify HQ-blocks if necessary
+
+ if (nblk > 0)
+ nblk = FIND_HOLES(aread,ovls,novl,block,nblk);
+
+ // Determine the status of each gap between a pair of blocks
+
+ if (nblk > 0)
+ status = GAP_ANALYSIS(ovls,novl,block,&nblk);
+
// No blocks? ==> nothing to do
if (nblk <= 0)
@@ -1269,28 +1822,11 @@ static void GAPS(int aread, Overlap *ovls, int novl)
fwrite(&KP_INDEX,sizeof(int64),1,KP_AFILE);
#endif
- return;
- }
-
- // Find holes and modify HQ-blocks if necessary
-
- holes = FIND_HOLES(aread,ovls,novl,&nhole,&block,&nblk);
- if (VERBOSE)
- { if (block[0].beg > 0)
- { n5trm += 1;
- n5trmbp += block[0].beg;
- }
- if (block[nblk-1].end < alen)
- { n3trm += 1;
- n3trmbp += alen - block[nblk-1].end;
- }
+ fwrite(&TR_INDEX,sizeof(int64),1,TR_AFILE);
+ return;
}
- // Determine the status of each gap between a pair of blocks
-
- status = GAP_ANALYSIS(ovls,novl,block,nblk);
-
#ifdef ANNOTATE
{ int i;
@@ -1320,25 +1856,11 @@ static void GAPS(int aread, Overlap *ovls, int novl)
fwrite(&SN_INDEX,sizeof(int64),1,SN_AFILE);
fwrite(&SP_INDEX,sizeof(int64),1,SP_AFILE);
fwrite(&AD_INDEX,sizeof(int64),1,AD_AFILE);
-
- for (i = 0; i < nhole; i++)
- if (holes[i].end - holes[i].beg < 75)
- { holes[i].end += 50;
- holes[i].beg -= 50;
- fwrite(&(holes[i].beg),sizeof(int),1,HL_DFILE);
- fwrite(&(holes[i].end),sizeof(int),1,HL_DFILE);
- holes[i].end -= 50;
- holes[i].beg += 50;
- }
- else
- { fwrite(&(holes[i].beg),sizeof(int),1,HL_DFILE);
- fwrite(&(holes[i].end),sizeof(int),1,HL_DFILE);
- }
- HL_INDEX += 2*nhole*sizeof(int);
- fwrite(&HL_INDEX,sizeof(int64),1,HL_AFILE);
}
#endif
+ // Find largest non-adaptemer/subread range: block[abeg..aend)
+
{ int cmax, amax, abeg = 0, aend = 0;
int p, i;
@@ -1364,13 +1886,30 @@ static void GAPS(int aread, Overlap *ovls, int novl)
abeg = p;
aend = nblk;
}
+
+ if (block[aend-1].end - block[aend-1].beg < TRACE_SPACING)
+ { aend -= 1;
+ nblk -= 1; // assert: aend == nblk && status[aend-1] = SPLIT
+ }
+ if (block[aend-1].end == alen)
+ block[aend-1].end = (alen/TRACE_SPACING)*TRACE_SPACING;
-#ifdef DEBUG_ADAPTER
- printf(" Keeping %d-%d [%d,%d]\n",abeg,aend-1,block[abeg].beg,block[aend-1].end);
+#ifdef DEBUG_SUMMARY
+ printf(" ::: Keeping [%d,%d]\n",block[abeg].beg,block[aend-1].end);
#endif
+ // Accummulate statistics
+
if (VERBOSE)
- { if (abeg > 0)
+ { if (block[0].beg > 0)
+ { n5trm += 1;
+ n5trmbp += block[0].beg;
+ }
+ if (block[nblk-1].end < alen)
+ { n3trm += 1;
+ n3trmbp += alen - block[nblk-1].end;
+ }
+ if (abeg > 0)
{ natrm += 1;
natrmbp += block[abeg].beg - block[0].beg;
}
@@ -1396,27 +1935,44 @@ static void GAPS(int aread, Overlap *ovls, int novl)
}
}
- nblk = aend-abeg;
- block += abeg;
- status += abeg;
- holes += abeg;
- }
-
#ifdef ANNOTATE
- { int i;
-
- fwrite(&(block[0].beg),sizeof(int),1,KP_DFILE);
- for (i = 1; i < nblk; i++)
+ fwrite(&(block[abeg].beg),sizeof(int),1,KP_DFILE);
+ for (i = abeg+1; i < aend; i++)
if (status[i] == SPLIT)
{ fwrite(&(block[i-1].end),sizeof(int),1,KP_DFILE);
fwrite(&(block[i].beg),sizeof(int),1,KP_DFILE);
KP_INDEX += 2*sizeof(int);
}
- fwrite(&(block[nblk-1].end),sizeof(int),1,KP_DFILE);
+ fwrite(&(block[aend-1].end),sizeof(int),1,KP_DFILE);
KP_INDEX += 2*sizeof(int);
fwrite(&KP_INDEX,sizeof(int64),1,KP_AFILE);
- }
#endif
+
+ // Output .trim track for this read
+
+ if (abeg > 0)
+ { fwrite(&(block[0].beg),sizeof(int),1,TR_DFILE);
+ fwrite(&(block[abeg-1].end),sizeof(int),1,TR_DFILE);
+ fwrite(&AdPre,sizeof(int),1,TR_DFILE);
+ TR_INDEX += 3*sizeof(int);
+ }
+ fwrite(&(block[abeg].beg),sizeof(int),1,TR_DFILE);
+ fwrite(&(block[abeg].end),sizeof(int),1,TR_DFILE);
+ TR_INDEX += 2*sizeof(int);
+ for (i = abeg+1; i < aend; i++)
+ { fwrite(status+i,sizeof(int),1,TR_DFILE);
+ fwrite(&(block[i].beg),sizeof(int),1,TR_DFILE);
+ fwrite(&(block[i].end),sizeof(int),1,TR_DFILE);
+ TR_INDEX += 3*sizeof(int);
+ }
+ if (aend < nblk)
+ { fwrite(&AdSuf,sizeof(int),1,TR_DFILE);
+ fwrite(&(block[aend].beg),sizeof(int),1,TR_DFILE);
+ fwrite(&(block[nblk-1].end),sizeof(int),1,TR_DFILE);
+ TR_INDEX += 3*sizeof(int);
+ }
+ fwrite(&TR_INDEX,sizeof(int64),1,TR_AFILE);
+ }
}
@@ -1454,6 +2010,9 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
else
tbytes = sizeof(uint16);
+ if (novl <= 0)
+ return (0);
+
Read_Overlap(input,ovls);
if (trace)
{ if (ovls[0].path.tlen > pmax)
@@ -1485,7 +2044,7 @@ static int make_a_pass(FILE *input, void (*ACTION)(int, Overlap *, int), int tra
n = 0;
else
{ if (trace)
- memcpy(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen);
+ memmove(paths,paths+pcur,sizeof(uint16)*ovls[0].path.tlen);
n = 1;
pcur = ovls[0].path.tlen;
while (1)
@@ -1539,6 +2098,7 @@ int main(int argc, char *argv[])
int64 novl;
HITS_TRACK *track;
int c;
+ int COVERAGE;
// Process arguments
@@ -1550,7 +2110,6 @@ int main(int argc, char *argv[])
BAD_QV = -1;
GOOD_QV = -1;
- MIN_LEN = 1000;
j = 1;
for (i = 1; i < argc; i++)
@@ -1565,9 +2124,6 @@ int main(int argc, char *argv[])
case 'g':
ARG_NON_NEGATIVE(GOOD_QV,"Maximum QV score for being considered good")
break;
- case 'l':
- ARG_POSITIVE(MIN_LEN,"Minimum retained segment length")
- break;
}
else
argv[j++] = argv[i];
@@ -1595,7 +2151,7 @@ int main(int argc, char *argv[])
}
}
- // Open trimmed DB and the prim-track
+ // Open trimmed DB and the qual-track
{ int status;
@@ -1611,6 +2167,41 @@ int main(int argc, char *argv[])
exit (1);
}
Trim_DB(DB);
+
+ track = Load_Track(DB,"qual");
+ if (track != NULL)
+ { FILE *afile;
+ int size, tracklen, extra;
+
+ QV_IDX = (int64 *) track->anno;
+ QV = (uint8 *) track->data;
+
+ // if newer .qual tracks with -c meta data, grab it
+
+ afile = fopen(Catenate(DB->path,".","qual",".anno"),"r");
+ fread(&tracklen,sizeof(int),1,afile);
+ fread(&size,sizeof(int),1,afile);
+ fseeko(afile,0,SEEK_END);
+ extra = ftell(afile) - (size*(tracklen+1) + 2*sizeof(int));
+ fseeko(afile,-extra,SEEK_END);
+ if (extra == 2*sizeof(int))
+ { fread(&COVERAGE,sizeof(int),1,afile);
+ fread(&HGAP_MIN,sizeof(int),1,afile);
+ }
+ else if (extra == sizeof(int))
+ { fread(&COVERAGE,sizeof(int),1,afile);
+ HGAP_MIN = 0;
+ }
+ else
+ { COVERAGE = -1;
+ HGAP_MIN = 0;
+ }
+ fclose(afile);
+ }
+ else
+ { fprintf(stderr,"%s: Must have a 'qual' track, run DASqv\n",Prog_Name);
+ exit (1);
+ }
}
// Initialize statistics gathering
@@ -1636,7 +2227,7 @@ int main(int argc, char *argv[])
nspanbp = 0;
nchimbp = 0;
- printf("\nDAStrim -g%d -b%d -l%d %s", GOOD_QV,BAD_QV,MIN_LEN,argv[1]);
+ printf("\nDAStrim -g%d -b%d %s", GOOD_QV,BAD_QV,argv[1]);
for (c = 2; c < argc; c++)
printf(" %s",argv[c]);
printf("\n");
@@ -1689,21 +2280,9 @@ int main(int argc, char *argv[])
}
}
- track = Load_Track(DB,"qual");
- if (track != NULL)
- { QV_IDX = (int64 *) track->anno;
- QV = (uint8 *) track->data;
- }
- else
- { fprintf(stderr,"%s: Must have a 'qual' track, run DASqv\n",Prog_Name);
- exit (1);
- }
-
-#ifdef ANNOTATE
-
// Set up QV trimming track
-#define SETUP(AFILE,DFILE,INDEX,anno,data) \
+#define SETUP(AFILE,DFILE,INDEX,anno,data,S) \
{ int len, size; \
\
if (DB_PART > 0) \
@@ -1720,21 +2299,24 @@ int main(int argc, char *argv[])
exit (1); \
\
len = DB_LAST - DB_FIRST; \
- size = 0; \
+ size = S; \
fwrite(&len,sizeof(int),1,AFILE); \
fwrite(&size,sizeof(int),1,AFILE); \
INDEX = 0; \
fwrite(&INDEX,sizeof(int64),1,AFILE); \
}
- SETUP(HQ_AFILE,HQ_DFILE,HQ_INDEX,".hq.anno",".hq.data")
- SETUP(SN_AFILE,SN_DFILE,SN_INDEX,".span.anno",".span.data")
- SETUP(SP_AFILE,SP_DFILE,SP_INDEX,".split.anno",".split.data")
- SETUP(AD_AFILE,AD_DFILE,AD_INDEX,".adapt.anno",".adapt.data")
+ SETUP(TR_AFILE,TR_DFILE,TR_INDEX,".trim.anno",".trim.data",8)
- SETUP(HL_AFILE,HL_DFILE,HL_INDEX,".hole.anno",".hole.data")
+#ifdef ANNOTATE
+ SETUP(HQ_AFILE,HQ_DFILE,HQ_INDEX,".hq.anno",".hq.data",0)
+ SETUP(SN_AFILE,SN_DFILE,SN_INDEX,".span.anno",".span.data",0)
+ SETUP(SP_AFILE,SP_DFILE,SP_INDEX,".split.anno",".split.data",0)
+ SETUP(AD_AFILE,AD_DFILE,AD_INDEX,".adapt.anno",".adapt.data",0)
- SETUP(KP_AFILE,KP_DFILE,KP_INDEX,".keep.anno",".keep.data")
+ SETUP(HL_AFILE,HL_DFILE,HL_INDEX,".hole.anno",".hole.data",0)
+
+ SETUP(KP_AFILE,KP_DFILE,KP_INDEX,".keep.anno",".keep.data",0)
#endif
// Open overlap file
@@ -1754,12 +2336,18 @@ int main(int argc, char *argv[])
fread(&novl,sizeof(int64),1,input);
fread(&TRACE_SPACING,sizeof(int),1,input);
- MIN_LEN = (MIN_LEN +(TRACE_SPACING-1))/TRACE_SPACING;
make_a_pass(input,GAPS,1);
// Clean up
+ fwrite(&GOOD_QV,sizeof(int),1,TR_AFILE);
+ fwrite(&BAD_QV,sizeof(int),1,TR_AFILE);
+ fwrite(&COVERAGE,sizeof(int),1,TR_AFILE);
+ fwrite(&HGAP_MIN,sizeof(int),1,TR_AFILE);
+
+ fclose(TR_AFILE);
+ fclose(TR_DFILE);
#ifdef ANNOTATE
fclose(HQ_AFILE);
@@ -1789,7 +2377,13 @@ int main(int argc, char *argv[])
Print_Number((int64) nreads,7,stdout);
printf(" (100.0%%) reads ");
Print_Number(totlen,12,stdout);
- printf(" (100.0%%) bases\n");
+ printf(" (100.0%%) bases");
+ if (HGAP_MIN > 0)
+ { printf(" (another ");
+ Print_Number((int64) ((DB_LAST-DB_FIRST)-nreads),0,stdout);
+ printf(" were < H-length)");
+ }
+ printf("\n");
printf("Trimmed: ");
Print_Number(nelim,7,stdout);
diff --git a/DB.c b/DB.c
index b536536..69060d0 100644
--- a/DB.c
+++ b/DB.c
@@ -314,6 +314,14 @@ void Upper_Read(char *s)
*s = '\0';
}
+void Letter_Arrow(char *s)
+{ static char letter[4] = { '1', '2', '3', '4' };
+
+ for ( ; *s != 4; s++)
+ *s = letter[(int) *s];
+ *s = '\0';
+}
+
// Convert read in ascii representation to [0-3] representation (end with 4)
void Number_Read(char *s)
@@ -341,6 +349,31 @@ void Number_Read(char *s)
*s = 4;
}
+void Number_Arrow(char *s)
+{ static char arrow[128] =
+ { 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 0, 1, 2, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ };
+
+ for ( ; *s != '\0'; s++)
+ *s = arrow[(int) *s];
+ *s = 4;
+}
+
/*******************************************************************************************
*
@@ -426,7 +459,7 @@ int Open_DB(char* path, HITS_DB *db)
if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1)
if (part == 0)
{ cutoff = 0;
- all = 1;
+ all = DB_ALL;
}
else
{ EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n",
@@ -466,7 +499,7 @@ int Open_DB(char* path, HITS_DB *db)
db->tracks = NULL;
db->part = part;
db->cutoff = cutoff;
- db->all = all;
+ db->allarr |= all;
db->ufirst = ufirst;
db->tfirst = tfirst;
@@ -478,7 +511,7 @@ int Open_DB(char* path, HITS_DB *db)
db->reads += 1;
if (fread(db->reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads)
{ EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
- free(db->reads);
+ free(db->reads-1);
goto error2;
}
}
@@ -495,7 +528,7 @@ int Open_DB(char* path, HITS_DB *db)
fseeko(index,sizeof(HITS_READ)*ufirst,SEEK_CUR);
if (fread(reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads)
{ EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
- free(reads);
+ free(reads-1);
goto error2;
}
@@ -557,10 +590,10 @@ void Trim_DB(HITS_DB *db)
if (db->trimmed) return;
- if (db->cutoff <= 0 && db->all) return;
+ if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return;
cutoff = db->cutoff;
- if (db->all)
+ if ((db->allarr & DB_ALL) != 0)
allflag = 0;
else
allflag = DB_BEST;
@@ -660,10 +693,10 @@ static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart)
if (!db->trimmed) return (0);
- if (db->cutoff <= 0 && db->all) return (0);
+ if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return (0);
cutoff = db->cutoff;
- if (db->all)
+ if ((db->allarr & DB_ALL) != 0)
allflag = 0;
else
allflag = DB_BEST;
@@ -1245,7 +1278,10 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track)
if ( ! ispart && db->part > 0)
fseeko(afile,size*db->ufirst,SEEK_CUR);
}
- nreads = tracklen;
+ if (tracklen == treads)
+ nreads = ((int *) (db->reads))[-2];
+ else
+ nreads = ((int *) (db->reads))[-1];
anno = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector");
if (anno == NULL)
@@ -1432,6 +1468,57 @@ int Load_Read(HITS_DB *db, int i, char *read, int ascii)
return (0);
}
+// Load into 'read' the i'th arrow in 'db'. As an ASCII string if ascii is 1,
+// and as a numeric string otherwise.
+//
+
+HITS_DB *Arrow_DB = NULL; // Last db/arw used by "Load_Arrow"
+FILE *Arrow_File = NULL; // Becomes invalid after closing
+
+int Load_Arrow(HITS_DB *db, int i, char *read, int ascii)
+{ FILE *arrow;
+ int64 off;
+ int len, clen;
+ HITS_READ *r = db->reads;
+
+ if (i >= db->nreads)
+ { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Arrow)\n",Prog_Name);
+ EXIT(1);
+ }
+ if (Arrow_DB != db)
+ { if (Arrow_File != NULL)
+ fclose(Arrow_File);
+ arrow = Fopen(Catenate(db->path,"","",".arw"),"r");
+ if (arrow == NULL)
+ EXIT(1);
+ Arrow_File = arrow;
+ Arrow_DB = db;
+ }
+ else
+ arrow = Arrow_File;
+
+ off = r[i].boff;
+ len = r[i].rlen;
+
+ if (ftello(arrow) != off)
+ fseeko(arrow,off,SEEK_SET);
+ clen = COMPRESSED_LEN(len);
+ if (clen > 0)
+ { if (fread(read,clen,1,arrow) != 1)
+ { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Arrow)\n",Prog_Name);
+ EXIT(1);
+ }
+ }
+ Uncompress_Read(len,read);
+ if (ascii == 1)
+ { Letter_Arrow(read);
+ read[-1] = '\0';
+ }
+ else
+ read[-1] = 4;
+ return (0);
+}
+
char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii)
{ FILE *bases = (FILE *) db->bases;
int64 off;
diff --git a/DB.h b/DB.h
index a7b8636..dc281de 100644
--- a/DB.h
+++ b/DB.h
@@ -164,6 +164,9 @@ void Lower_Read(char *s); // Convert read from numbers to lowercase letters
void Upper_Read(char *s); // Convert read from numbers to uppercase letters (0-3 to ACGT)
void Number_Read(char *s); // Convert read from letters to numbers
+void Letter_Arrow(char *s); // Convert arrow pw's from numbers to uppercase letters (0-3 to 1234)
+void Number_Arrow(char *s); // Convert arrow pw string from letters to numbers
+
/*******************************************************************************************
*
@@ -175,14 +178,21 @@ void Number_Read(char *s); // Convert read from letters to numbers
#define DB_CSS 0x0400 // This is the second or later of a group of reads from a given insert
#define DB_BEST 0x0800 // This is the longest read of a given insert (may be the only 1)
+#define DB_ARROW 0x2 // DB is an arrow DB
+#define DB_ALL 0x1 // all wells are in the trimmed DB
+
+// Fields have different interpretations if a .db versus a .dam
+
typedef struct
- { int origin; // Well #
+ { int origin; // Well # (DB), Contig # (DAM)
int rlen; // Length of the sequence (Last pulse = fpulse + rlen)
- int fpulse; // First pulse
+ int fpulse; // First pulse (DB), left index of contig in scaffold (DAM)
int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of
// uncompressed bases in memory block
- int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file
- int flags; // QV of read + flags above
+ int64 coff; // Offset (in bytes) of compressed quiva streams in '.qvs' file (DB),
+ // Offset (in bytes) of scaffold header string in '.hdr' file (DAM)
+ // 4 compressed shorts containing snr info if an arrow DB.
+ int flags; // QV of read + flags above (DB only)
} HITS_READ;
// A track can be of 3 types:
@@ -223,7 +233,7 @@ typedef struct
{ int ureads; // Total number of reads in untrimmed DB
int treads; // Total number of reads in trimmed DB
int cutoff; // Minimum read length in block (-1 if not yet set)
- int all; // Consider multiple reads from a given well
+ int allarr; // DB_ALL | DB_ARROW
float freq[4]; // frequency of A, C, G, T, respectively
// Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed)
@@ -365,6 +375,11 @@ char *New_Read_Buffer(HITS_DB *db);
int Load_Read(HITS_DB *db, int i, char *read, int ascii);
+ // Exactly the same as Load_Read, save the arrow information is loaded, not the DNA sequence,
+ // and there is only a choice between numeric (0) or ascii (1);
+
+int Load_Arrow(HITS_DB *db, int i, char *read, int ascii);
+
// Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the
// the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii
// string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string
diff --git a/Makefile b/Makefile
index 851972d..01d4a52 100644
--- a/Makefile
+++ b/Makefile
@@ -2,16 +2,34 @@ DEST_DIR = ~/bin
CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing
-ALL = DASqv DAStrim
+ALL = DASqv DAStrim DASpatch DASedit DASmap DASrealign REPqv REPtrim
all: $(ALL)
DASqv: DASqv.c align.c align.h DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o DASqv DASqv.c align.c DB.c QV.c -lm
+REPqv: REPqv.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o REPqv REPqv.c DB.c QV.c -lm
+
DAStrim: DAStrim.c align.c align.h DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o DAStrim DAStrim.c align.c DB.c QV.c -lm
+REPtrim: REPtrim.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o REPtrim REPtrim.c DB.c QV.c -lm
+
+DASpatch: DASpatch.c align.h align.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DASpatch DASpatch.c align.c DB.c QV.c -lm
+
+DASedit: DASedit.c align.c align.h DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DASedit DASedit.c align.c DB.c QV.c -lm
+
+DASmap: DASmap.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DASmap DASmap.c DB.c QV.c -lm
+
+DASrealign: DASrealign.c align.c align.h DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DASrealign DASrealign.c align.c DB.c QV.c -lm
+
clean:
rm -f $(ALL)
rm -fr *.dSYM
@@ -22,4 +40,4 @@ install:
package:
make clean
- tar -zcf scrubber.tar.gz README Makefile *.h *.c
+ tar -zcf scrubber.tar.gz README.md Makefile *.h *.c
diff --git a/QV.c b/QV.c
index cdb6a63..d7d7263 100644
--- a/QV.c
+++ b/QV.c
@@ -863,6 +863,62 @@ static int delChar, subChar;
// Referred by: QVcoding_Scan, Create_QVcoding
+void QVcoding_Scan1(int rlen, char *delQV, char *delTag, char *insQV, char *mergeQV, char *subQV)
+{
+ if (rlen == 0) // Initialization call
+ { int i;
+
+ // Zero histograms
+
+ bzero(delHist,sizeof(uint64)*256);
+ bzero(mrgHist,sizeof(uint64)*256);
+ bzero(insHist,sizeof(uint64)*256);
+ bzero(subHist,sizeof(uint64)*256);
+
+ for (i = 0; i < 256; i++)
+ delRun[i] = subRun[i] = 1;
+
+ totChar = 0;
+ delChar = -1;
+ subChar = -1;
+ return;
+ }
+
+ // Add streams to accumulating histograms and figure out the run chars
+ // for the deletion and substition streams
+
+ Histogram_Seqs(delHist,(uint8 *) delQV,rlen);
+ Histogram_Seqs(insHist,(uint8 *) insQV,rlen);
+ Histogram_Seqs(mrgHist,(uint8 *) mergeQV,rlen);
+ Histogram_Seqs(subHist,(uint8 *) subQV,rlen);
+
+ if (delChar < 0)
+ { int k;
+
+ for (k = 0; k < rlen; k++)
+ if (delTag[k] == 'n' || delTag[k] == 'N')
+ { delChar = delQV[k];
+ break;
+ }
+ }
+ if (delChar >= 0)
+ Histogram_Runs( delRun,(uint8 *) delQV,rlen,delChar);
+ totChar += rlen;
+ if (subChar < 0)
+ { if (totChar >= 100000)
+ { int k;
+
+ subChar = 0;
+ for (k = 1; k < 256; k++)
+ if (subHist[k] > subHist[subChar])
+ subChar = k;
+ }
+ }
+ if (subChar >= 0)
+ Histogram_Runs( subRun,(uint8 *) subQV,rlen,subChar);
+ return;
+}
+
int QVcoding_Scan(FILE *input, int num, FILE *temp)
{ char *slash;
int rlen;
@@ -1284,6 +1340,44 @@ void Free_QVcoding(QVcoding *coding)
*
********************************************************************************************/
+void Compress_Next_QVentry1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub,
+ FILE *output, QVcoding *coding, int lossy)
+{ int clen;
+
+ if (coding->delChar < 0)
+ { Encode(coding->delScheme, output, (uint8 *) del, rlen);
+ clen = rlen;
+ }
+ else
+ { Encode_Run(coding->delScheme, coding->dRunScheme, output,
+ (uint8 *) del, rlen, coding->delChar);
+ clen = Pack_Tag(tag,del,rlen,coding->delChar);
+ }
+ Number_Read(tag);
+ Compress_Read(clen,tag);
+ fwrite(tag,1,COMPRESSED_LEN(clen),output);
+
+ if (lossy)
+ { uint8 *insert = (uint8 *) ins;
+ uint8 *merge = (uint8 *) mrg;
+ int k;
+
+ for (k = 0; k < rlen; k++)
+ { insert[k] = (uint8) ((insert[k] >> 1) << 1);
+ merge[k] = (uint8) (( merge[k] >> 2) << 2);
+ }
+ }
+
+ Encode(coding->insScheme, output, (uint8 *) ins, rlen);
+ Encode(coding->mrgScheme, output, (uint8 *) mrg, rlen);
+ if (coding->subChar < 0)
+ Encode(coding->subScheme, output, (uint8 *) sub, rlen);
+ else
+ Encode_Run(coding->subScheme, coding->sRunScheme, output,
+ (uint8 *) sub, rlen, coding->subChar);
+ return;
+}
+
int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy)
{ int rlen, clen;
diff --git a/QV.h b/QV.h
index 532b2f4..e5c9485 100644
--- a/QV.h
+++ b/QV.h
@@ -58,6 +58,7 @@ int Get_QV_Line();
// If there is an error then -1 is returned, otherwise the number of entries read.
int QVcoding_Scan(FILE *input, int num, FILE *temp);
+void QVcoding_Scan1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub);
// Given QVcoding_Scan has been called at least once, create an encoding scheme based on
// the accumulated statistics and return a pointer to it. The returned encoding object
@@ -83,6 +84,8 @@ void Free_QVcoding(QVcoding *coding);
// occurred, and the sequence length otherwise.
int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy);
+void Compress_Next_QVentry1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub,
+ FILE *output, QVcoding *coding, int lossy);
// Assuming the input is position just beyond the compressed encoding of an entry header,
// read the set of compressed encodings for the ensuing 5 QV vectors, decompress them,
diff --git a/README b/README
deleted file mode 100644
index b68a35f..0000000
--- a/README
+++ /dev/null
@@ -1,47 +0,0 @@
-
-
-
-*** PLEASE GO TO THE DAZZLER BLOG (https://dazzlerblog.wordpress.com) FOR TYPESET ***
- DOCUMENTATION, EXAMPLES OF USE, AND DESIGN PHILOSOPHY.
-
-
-
- The Dazzler Scrubbing Suite: DAS SCRUBBER
-
- Author: Gene Myers
- First: October 12, 2014
- Recent: March 27, 2016
-
- The goal of scrubbing is to produce a set of edited reads that are guaranteed to
-(a) be continuous stretches of the underlying genome (i.e. no unremoved adapters
-and not chimers), and (b) have no very low quality stretches (i.e. the error rate
-never exceeds some reasonable maximum, 20% or so in the case of Pacbio data). The
-secondary goal of scrubbing is to do so with the minimum removal of data and splitting
-of reads.
-
- The "DAS" suite will consist of a pipeline of several programs that will accomplish
-the task of scrubbing. At this time, we are releasing the first program which assigns
-intrinsic quality values to every trace point interval of a read.
-
-1. DASqv [-v] -c<int> <source:db> <overlaps:las>
-
- DASqv takes as input a database <source> and the local alignments, <overlaps>, for
-said database or a block thereof. Note carefully that <source> must always refer to
-the entire DB, only <overlaps> can involve a block number.
-
- Using the local alignment-pile for each A-read, DASqv produces a QV value for each
-complete segment of TRACE_SPACING bases (e.g. 100bp, the -s parameter to daligner).
-The quality value of the average percentile of the best 25-5-% alignment matches
-covering it depending on the coverage estimate -c. One must supply the -c parameter
-to the expected coverage of the genome in question. All quality values over 50 are
-clipped to 50.
-
- The quality values are written to a .qual track, that can be viewed by calling
-DBdump with the -i option set ("i" for "intrinsic QV").
-
- The -v option prints out a histogram of the segment align matches, and the quality
-values produced. This histgram is usefull in assessing, for a given data set, what
-constitutes the threshold -g and -b, to be used by down stream commands, for what is
-definitely a good segment and what is definitely a bad segment.
-
-2. DAStrim -- soon !
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5a07609
--- /dev/null
+++ b/README.md
@@ -0,0 +1,215 @@
+
+# Dascrubber: The Dazzler Read Scrubbing Suite
+
+## _Author: Gene Myers_
+## _First: March 27, 2016_
+
+For typeset documentation, examples of use, and design philosophy please go to
+my [blog](https://dazzlerblog.wordpress.com/command-guides/dascrubber-command-guide).
+
+This is still an incomplete release.
+The current set of commands provide a pipeline that one can use to scrub reads and if desired
+to scrub the alignment piles (with DASrealign). Ultimately DASpatch/DASedit and DASrealign will
+be replaced with more powerful programs that correct reads and not only scrub alignment piles,
+but also remove haplotype and repeat induced overlaps, prior to assembly via a string graph
+method.
+
+The goal of scrubbing is to produce a set of edited reads that are guaranteed to
+(a) be continuous stretches of the underlying genome (i\.e\. no unremoved adapters
+and not chimers), and (b) have no very low quality stretches (i\.e\. the error rate
+never exceeds some reasonable maximum, 20% or so in the case of Pacbio data). The
+secondary goal of scrubbing is to do so with the minimum removal of data and splitting
+of reads. The \"DAS\" suite consists of a pipeline of several programs that in sequence accomplish
+the task of scrubbing.
+
+```
+1. DASqv [-v] [-H<int>] -c<int> <subject:db> <overlaps:las> ...
+```
+
+This command takes as input a database \<source\> and a sequence of sorted local alignments
+blocks, \<overlaps\>, produced by an overlap/daligner run for said database. It is recommended
+that one employ repeat-masking in the overlap run as per the new DAMASKER module described in
+this post. Note carefully that \<source\> must always refer to the entire DB, only \<overlaps\>
+can involve a block number. If \<overlaps\> involves a block number, e\.g\. Ecoli.2.las, then
+the .las file is expected to contain all the overlaps where the A-read is in block 2 of the
+underlying database. The HPC.daligner scripts in the DALIGNER module produce such .las files
+as their final result.
+
+Using the local alignment-pile for each A-read, DASqv produces a QV value for each complete
+segment of TRACE_SPACING bases (e\.g\. 100bp, the -s parameter to daligner). The quality value
+of the average percentile of the best 25-50% alignment matches covering it depending on the
+coverage estimate -c. One must supply the -c parameter to the expected coverage of the genome
+in question. All quality values over 50 are clipped to 50.
+
+The -v option prints out a histogram of the segment align matches, and the quality values
+produced. This histogram is useful in assessing, for a given data set, what constitutes the
+threshold -g and -b, to be used by down stream commands, for what is definitely a good segment
+and what is definitely a bad segment. The -H option is for HGAP-based assembly (see the -H
+option of daligner) wherein only reads longer than the -H parameter are considered for overlap,
+scrubbing, and assembly. With this option set, DASqv and all subsequent commands in the
+scrubbing pipeline, only perform their functions on reads of length -H or more. All other
+reads are used in the overlap piles for H-reads to help assess and scrub the H-reads, but
+are themselves not scrubbed.
+
+The quality values are written to a .qual track, that can be viewed by calling DBdump with
+the -i option set (\"i\" for \"intrinsic QV\"). If the overlap file contains a block number
+then the track files also contain a block number, e\.g\. \"DASqv -c50 DB OVL.2\" will result in
+the track files DB.2.qual.[anno,data]. Furthermore, if DASqv is run on .las blocks, then
+once it has been run on all the blocks of the DB, the block tracks must be concatenated
+into a single track for the entire database with Catrack in preparation for the next phase
+of scrubbing by DAStrim.
+
+```
+2. DAStrim [-v] -g<int> -b<int> <subject:db> <overlaps:las> ...
+```
+
+This command takes as input a database \<source\> and a sequence of sorted local alignments
+blocks, \<overlaps\>, produced by an overlap/daligner run for said database. A .qual track
+obtained by running DASqv must be present for the entire data base. Note carefully that
+\<source\> must always refer to the entire DB, only \<overlaps\> can involve a block number.
+
+Using the local alignment-pile for each A-read and the QV\'s for all the reads in the pile,
+DAStrim (1) finds and breaks all chimeric reads, (2) finds all missed adaptamers and retains
+only the longest subread between missed adaptaers, and (3) identifies all low-quality regions
+that should be improved/replaced by better sequence. It makes these inherently heuristic
+decisions conservatively so that what remains is very highly likely to be free of chimers,
+adaptamers, and undetected low-quality sequence segments. Some of these artifacts may still
+get through, but at very low odds, less than 1 in 10,000 in our experience. The decision
+process is guided by the parameters -g and -b which indicate the thresholds for considering
+intrinsic QV values good, bad, or unknown. In our experience, -g should be set to the value
+for which 80% or more of the QV\'s produced by DASqv are better than this value, and -b should
+be set to the value for which 5-7% or more of the QV\'s are worse than this value. Consult
+the histogram produced by DASqv to set these parameter. We further recommend doing this
+for each project on a case-by-case basis.
+
+The -v option prints out a report of how many chimer and adaptamer breaks were detected, how
+much sequence was trimmed, how many low-quality segments were spanned by alignments, and how
+many were rescued by many pairs of local alignments spanning the gap indicued by the
+low-quality region, and so on.
+
+The retained high-quality intervals for each read are written to a .trim track, in
+left-to-right order with an indicator of whether the gap between two such intervals is spanned
+by local alignments or by span-consistent pairs of local alignments. Like DASqv and all other
+scrubber modules, block tracks are produced in response to block .las files and these must be
+concatenated with Catrack into a single .trim track for the entire DB in preparation for the
+next phase of scrubbing by DASpatch.
+
+```
+3. DASpatch [-v] <subject:db> <overlaps:las> ...
+```
+
+This command takes as input a database \<source\> and a sequence of sorted local alignments
+blocks, \<overlaps\>, produced by an overlap/daligner run for said database. A .qual track
+and a .trim track obtained by running DASqv and DAStrim must be present for the entire data
+base. Note carefully that \<source\> must always refer to the entire DB, only \<overlaps\> can
+involve a block number.
+
+Using the local alignment-pile for each A-read, the QV\'s for all the reads in the pile, and
+the hiqh-quality segments annotated by DAStrim, DASpatch selects a high-quality B-read segment
+with which to patch every intervening low-quality segment of an A-read. Given that these gaps
+are annotated before each read is trimmed by DAStrim, it may be the case in this second
+examination, that the gap is no longer spanned by the now trimmed B-reads in which case the
+span/patch can fail. This is very rare but does occur and is the number of such events is
+reported by DASpatch when the -v option is set.
+
+The B-read segments for each patch (or a special \"failure patch\") are written to a .patch
+track, in left-to-right order. Like DASqv and all other scrubber modules, block tracks are
+produced in response to block .las files and these must be concatenated with Catrack into a
+single .patch track for the entire DB in preparation for the next phase of scrubbing by DASedit.
+
+```
+4. DASedit [-v] [-x<int>] <source:db> <target:db>
+```
+
+This command takes as input a database \<source\> for which a .trim, and .patch tracks have
+produced by DAStrim and DASpatch in sequence (and perforce DASqv initially). Using the
+information in the two tracks, DASedit produces a new database \<target\> whose reads are the
+patched, high-quality sub-reads. That is, every low quality segment is patched with the relevant
+sequence of another read, and some reads give rise to two or more reads if deemed chimers, or
+no reads if the entire read was deemed junk. This command can take considerable time as the
+access pattern to read sequences (for the patching) is not sequential or localized, implying
+poor cache performance.
+
+The new database does not have a .qvs or .arr component, that is it is a a sequence or
+S-database (see the original Dazzler DB post). Very importantly, the new database has exactly
+the same block divisions as the original. That is, all patched subreads in a block of the new
+database have been derived from reads of the same block in the original database, and only
+from those reads. The new database does have a .map track that for each read encodes the
+original read in the source DB that was patched and the segments of that read that were
+high-quality (i\.e\. not patched). The program DASmap below can be used to output this
+information in either an easy-to-read or an easy-to-parse format.
+
+```
+5. DASmap [-p] <path:db> [ <reads:FILE> |<reads:range> ... ]
+```
+
+This command takes as input a database of patched reads \<source\> produced by DASedit, and for
+the specified reads outputs a line for each showing the source read index and length in the
+originating DB, as well as annotating which segments were original and which were patched.
+The convention on interpreting the read arguments is as for DBshow and DBdump. As an example,
+with the -p option (pretty print) set one might see:
+
+```
+ 55 -> 57(2946) [400,2946]
+ 56 -> 58(11256) [700,1900]
+ 57 -> 58(11256) [6600,9900] 83 [10000,11256]
+ 58 -> 59(12282) [400,4100] 88 [4200,9400] 97 [9500,12282]
+```
+
+The first line indicates that read 55 in the patched database was derived from read 57 in the
+original database and is the segment from [400,2946] of that read. Reads 56 and 57 were both
+derived from read 58 in the original DB, and read 57 consists of segments [6600,9900] and
+[10000,11256] of read 58 with a patch between them of 83bp (but the source of the patch data
+is not given). The read length of each original read is given for convenience. With the -p
+option off, the output consists of space separated integers encoding the same information where
+the 4th field is always the number of integers in the segment description (always 3n+2 for
+some n):
+
+```
+ 55 57 2946 2 400 2946
+ 56 58 11256 2 700 1900
+ 57 58 11256 5 6600 9900 83 10000 11256
+ 58 59 12282 8 400 4100 88 4200 9400 97 9500 12282
+```
+
+```
+6. DASrealign [-v] [-l<int:(800)>] <block1> <block2> <source:las> <target:las>
+```
+
+This command takes as input two blocks a patched database \<source\> created by DASedit, and the
+original .las file \<overlap\> for the block pair. That is the .las file that was produced for
+the blocks when daligner was run on the original database blocks. DASrealign then produces
+the set of alignments (inferrable from those in the original) between the new patched reads,
+placing them in the file \<target\>. These new .las files can then be merged
+with LAmerge to form block .las files for the new database.
+
+The idea of this program is to avoid having to run daligner again on the patched reads, but
+rather to simply refine the alignments already computed with respect to the new read set. It
+has the draw back that there is some small chance that there are previously undetected overlaps
+between reads now that they are patched, and the patched trace point encoding, while stillable
+to deliver alignments are no longer usable for quality estimation as the tracepoint spacing in
+the A-read becomes irregular. This is contrasted with the speedup of the new process which
+is roughly 40X faster than the original daligner run and the collection of overlaps in the
+output can be feed directly into a string graph construction process.
+
+```
+7. REPqv <subject:db> ...
+```
+
+This command takes as input a sequence of databases or blocks \<source\> and for each outputs
+a histogram of the intrinsic quality values of the reads in the source along with a
+recommendation of the -g and -b values with which to run DAStrim. The .qual track produced
+by DASqv must be present for all sources referred to. The command is a quick way to get
+the -v output of DASqv at any time after producing the intrinsic quality values and to get
+the histogram for the entire data base (as opposed to a block of the database).
+
+```
+8. REPtrim <subject:db> ...
+```
+
+This command takes as input a sequence of databases or blocks \<source\> and for each outputs
+the scrubbing statistics for the source, i.e. the same report produced by DAStrim with
+the -v option set. The .trim track produced by DAStrim must be present for all sources
+referred to. The command is a quick way to get the -v output of DAStrim at any time after
+the fact and to get the statistics for the entire data base (as opposed to a block of the
+database).
diff --git a/REPqv.c b/REPqv.c
new file mode 100644
index 0000000..2b51f4a
--- /dev/null
+++ b/REPqv.c
@@ -0,0 +1,188 @@
+/*******************************************************************************************
+ *
+ * Read the .qual track of each db or db block on the command line and output a histogram
+ * of the intrinsic QV's therein.
+ *
+ * Author: Gene Myers
+ * Date : August 2017
+ *
+ *******************************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+
+#include "DB.h"
+
+// Command format and global parameter variables
+
+static char *Usage = "<source:db> ...";
+
+// Global Variables
+
+#define MAXQV 50 // Max QV score is 50
+#define MAXQV1 51
+
+static HITS_DB _DB, *DB = &_DB; // Data base
+
+static int64 *QV_IDX; // qual track index
+static uint8 *QV; // qual track values
+
+int main(int argc, char *argv[])
+{ int c;
+
+ // Process arguments
+
+ Prog_Name = Strdup("REPqv","");
+
+ if (argc < 2)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+
+ // Open trimmed DB and the qual-track
+
+ for (c = 1; c < argc; c++)
+ { int status;
+ char *root;
+ int i, bval, gval, cover, hgap_min;
+ HITS_TRACK *track;
+ int64 nreads, totlen;
+ int64 qgram[MAXQV1];
+ int64 qsum, qtotal;
+
+ status = Open_DB(argv[c],DB);
+ if (status < 0)
+ exit (1);
+ if (status == 1)
+ { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ Trim_DB(DB);
+
+ track = Load_Track(DB,"qual");
+ if (track != NULL)
+ { FILE *afile;
+ int size, tracklen, extra;
+
+ QV_IDX = (int64 *) track->anno;
+ QV = (uint8 *) track->data;
+
+ // if newer .qual tracks with -c meta data, grab it
+
+ if (DB->part)
+ { afile = fopen(Catenate(DB->path,
+ Numbered_Suffix(".",DB->part,"."),"qual",".anno"),"r");
+ if (afile == NULL)
+ afile = fopen(Catenate(DB->path,".","qual",".anno"),"r");
+ }
+ else
+ afile = fopen(Catenate(DB->path,".","qual",".anno"),"r");
+ fread(&tracklen,sizeof(int),1,afile);
+ fread(&size,sizeof(int),1,afile);
+ fseeko(afile,0,SEEK_END);
+ extra = ftell(afile) - (size*(tracklen+1) + 2*sizeof(int));
+ fseeko(afile,-extra,SEEK_END);
+ if (extra == 2*sizeof(int))
+ { fread(&cover,sizeof(int),1,afile);
+ fread(&hgap_min,sizeof(int),1,afile);
+ }
+ else if (extra == sizeof(int))
+ { fread(&cover,sizeof(int),1,afile);
+ hgap_min = 0;
+ }
+ else
+ { cover = -1;
+ hgap_min = 0;
+ }
+ fclose(afile);
+ }
+ else
+ { fprintf(stderr,"%s: Must have a 'qual' track, run DASqv\n",Prog_Name);
+ exit (1);
+ }
+
+ root = Root(argv[c],".db");
+ nreads = DB->nreads;
+ totlen = DB->totlen;
+
+ for (i = 0; i <= MAXQV; i++)
+ qgram[i] = 0;
+
+ for (i = 0; i < QV_IDX[nreads]; i++)
+ qgram[QV[i]] += 1;
+
+ printf("\nDASqv");
+ if (cover >= 0)
+ printf(" -c%d",cover);
+ else
+ printf(" -c??");
+ if (hgap_min > 0)
+ printf(" -H%d",hgap_min);
+ printf(" %s\n\n",root);
+
+ if (hgap_min > 0)
+ { for (i = 0; i < DB->nreads; i++)
+ if (DB->reads[i].rlen < hgap_min)
+ { nreads -= 1;
+ totlen -= DB->reads[i].rlen;
+ }
+ }
+ printf(" Input: ");
+ Print_Number(nreads,7,stdout);
+ printf("reads, ");
+ Print_Number(totlen,12,stdout);
+ printf(" bases");
+ if (hgap_min > 0)
+ { printf(" (another ");
+ Print_Number(DB->nreads-nreads,0,stdout);
+ printf(" were < H-length)");
+ }
+ printf("\n");
+
+ if (cover >= 0)
+ { int qv_deep;
+
+ if (cover >= 40)
+ qv_deep = cover/8;
+ else if (cover >= 20)
+ qv_deep = 5;
+ else
+ qv_deep = cover/4;
+
+ printf("\n Histogram of q-values (average %d best)\n",2*qv_deep);
+ }
+ else
+ printf("\n Histogram of q-values\n");
+
+ qtotal = 0;
+ for (i = 0; i <= MAXQV; i++)
+ qtotal += qgram[i];
+
+ qsum = qgram[MAXQV];
+ printf("\n %2d: %9lld %5.1f%%\n\n",MAXQV,qgram[MAXQV],(100.*qsum)/qtotal);
+
+ bval = gval = -1;
+ qtotal -= qsum;
+ qsum = 0;
+ for (i = MAXQV-1; i >= 0; i--)
+ if (qgram[i] > 0)
+ { qsum += qgram[i];
+ printf(" %2d: %9lld %5.1f%%\n",i,qgram[i],(100.*qsum)/qtotal);
+ if ((100.*qsum)/qtotal > 7. && bval < 0)
+ bval = i;
+ if ((100.*qsum)/qtotal > 20. && gval < 0)
+ gval = i;
+ }
+
+ printf("\n Recommend \'DAStrim -g%d -b%d'\n\n",gval,bval);
+
+ free(root);
+ Close_DB(DB);
+ }
+
+ free(Prog_Name);
+ exit (0);
+}
diff --git a/REPtrim.c b/REPtrim.c
new file mode 100644
index 0000000..5e74674
--- /dev/null
+++ b/REPtrim.c
@@ -0,0 +1,303 @@
+/*******************************************************************************************
+ *
+ * Read the .trim track of each db or db block on the command line and output
+ * a summary of the scrubbing that took place on that db or block.
+ *
+ * Author: Gene Myers
+ * Date : August 2017
+ *
+ *******************************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+
+#include "DB.h"
+#include "align.h"
+
+// Command format and global parameter variables
+
+static char *Usage = "<source:db> ...";
+
+// Gap states
+
+#define LOWQ 0 // Gap is spanned by many LAs and patchable
+#define SPAN 1 // Gap has many paired LAs and patchable
+#define SPLIT 2 // Gap is a chimer or an unpatchable gap
+#define ADPRE 3 // Gap is due to adaptemer, trim prefix interval to left
+#define ADSUF 4 // Gap is due to adaptemer, trim suffix interval to right
+
+
+
+// Global Variables (must exist across the processing of each pile)
+
+static HITS_DB _DB, *DB = &_DB; // Data base
+
+static int64 *TRIM_IDX; // trim track index
+static int *TRIM; // trim track values
+
+int main(int argc, char *argv[])
+{ int c;
+
+ // Process arguments
+
+ Prog_Name = Strdup("REPtrim","");
+
+ if (argc < 2)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+
+ // Open trimmed DB and .qual and .trim tracks
+
+ for (c = 1; c < argc; c++)
+ { int status;
+ char *root;
+ int i, a, tb, te;
+ int alen;
+ HITS_TRACK *track;
+ int64 nreads, totlen;
+ int64 nelim, nelimbp;
+ int64 n5trm, n5trmbp;
+ int64 n3trm, n3trmbp;
+ int64 natrm, natrmbp;
+ int64 ngaps, ngapsbp;
+ int64 nlowq, nlowqbp;
+ int64 nspan, nspanbp;
+ int64 nchim, nchimbp;
+ int BAD_QV, GOOD_QV, COVERAGE, HGAP_MIN;
+
+ status = Open_DB(argv[c],DB);
+ if (status < 0)
+ exit (1);
+ if (status == 1)
+ { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ Trim_DB(DB);
+
+ track = Load_Track(DB,"trim");
+ if (track != NULL)
+ { FILE *afile;
+ int size, tracklen, extra;
+
+ TRIM_IDX = (int64 *) track->anno;
+ TRIM = (int *) track->data;
+ for (i = 0; i <= DB->nreads; i++)
+ TRIM_IDX[i] /= sizeof(int);
+
+ if (DB->part)
+ { afile = fopen(Catenate(DB->path,
+ Numbered_Suffix(".",DB->part,"."),"trim",".anno"),"r");
+ if (afile == NULL)
+ afile = fopen(Catenate(DB->path,".","trim",".anno"),"r");
+ }
+ else
+ afile = fopen(Catenate(DB->path,".","trim",".anno"),"r");
+ fread(&tracklen,sizeof(int),1,afile);
+ fread(&size,sizeof(int),1,afile);
+ fseeko(afile,0,SEEK_END);
+ extra = ftell(afile) - (size*(tracklen+1) + 2*sizeof(int));
+ fseeko(afile,-extra,SEEK_END);
+ if (extra == 4*sizeof(int))
+ { fread(&GOOD_QV,sizeof(int),1,afile);
+ fread(&BAD_QV,sizeof(int),1,afile);
+ fread(&COVERAGE,sizeof(int),1,afile);
+ fread(&HGAP_MIN,sizeof(int),1,afile);
+ }
+ else if (extra == 3*sizeof(int))
+ { fread(&GOOD_QV,sizeof(int),1,afile);
+ fread(&BAD_QV,sizeof(int),1,afile);
+ fread(&COVERAGE,sizeof(int),1,afile);
+ HGAP_MIN = 0;
+ }
+ else if (extra == 2*sizeof(int))
+ { fread(&GOOD_QV,sizeof(int),1,afile);
+ fread(&BAD_QV,sizeof(int),1,afile);
+ COVERAGE = -1;
+ HGAP_MIN = 0;
+ }
+ else
+ { GOOD_QV = -1;
+ BAD_QV = -1;
+ COVERAGE = -1;
+ HGAP_MIN = 0;
+ }
+ fclose(afile);
+ }
+ else
+ { fprintf(stderr,"%s: Must have a 'trim' track, run DAStrim\n",Prog_Name);
+ exit (1);
+ }
+
+ root = Root(argv[c],".db");
+ nreads = DB->nreads;
+ totlen = DB->totlen;
+
+ nelim = 0;
+ n5trm = 0;
+ n3trm = 0;
+ natrm = 0;
+ nelimbp = 0;
+ n5trmbp = 0;
+ n3trmbp = 0;
+ natrmbp = 0;
+
+ ngaps = 0;
+ nlowq = 0;
+ nspan = 0;
+ nchim = 0;
+ ngapsbp = 0;
+ nlowqbp = 0;
+ nspanbp = 0;
+ nchimbp = 0;
+
+ for (a = 0; a < DB->nreads; a++)
+ { tb = TRIM_IDX[a];
+ te = TRIM_IDX[a+1];
+ alen = DB->reads[a].rlen;
+ if (alen < HGAP_MIN)
+ { nreads -= 1;
+ totlen -= alen;
+ }
+ else if (tb >= te)
+ { nelim += 1;
+ nelimbp += alen;
+ }
+ else
+ { if (TRIM[tb] > 0)
+ { n5trm += 1;
+ n5trmbp += TRIM[tb];
+ }
+ if (TRIM[te-1] < alen)
+ { n3trm += 1;
+ n3trmbp += alen - TRIM[te-1];
+ }
+ while (tb + 3 < te)
+ { ngaps += 1;
+ ngapsbp += TRIM[tb+3] - TRIM[tb+1];
+ if (TRIM[tb+2] == LOWQ)
+ { nlowq += 1;
+ nlowqbp += TRIM[tb+3] - TRIM[tb+1];
+ }
+ else if (TRIM[tb+2] == SPAN)
+ { nspan += 1;
+ nspanbp += TRIM[tb+3] - TRIM[tb+1];
+ }
+ else if (TRIM[tb+2] == ADPRE)
+ { natrm += 1;
+ natrmbp += TRIM[tb+3] - TRIM[tb];
+ }
+ else if (TRIM[tb+2] == ADSUF)
+ { natrm += 1;
+ natrmbp += TRIM[tb+4] - TRIM[tb+1];
+ }
+ else
+ { nchim += 1;
+ nchimbp += TRIM[tb+3] - TRIM[tb+1];
+ }
+ tb += 3;
+ }
+ }
+ }
+
+ printf("\nStatistics for DAStrim");
+ if (GOOD_QV >= 0)
+ printf(" -g%d",GOOD_QV);
+ else
+ printf(" -g??");
+ if (BAD_QV >= 0)
+ printf(" -b%d",BAD_QV);
+ else
+ printf(" -b??");
+ if (HGAP_MIN > 0)
+ printf(" [-H%d]",HGAP_MIN);
+ printf(" %s\n\n",root);
+
+ printf(" Input: ");
+ Print_Number((int64) nreads,7,stdout);
+ printf(" (100.0%%) reads ");
+ Print_Number(totlen,12,stdout);
+ printf(" (100.0%%) bases");
+ if (HGAP_MIN > 0)
+ { printf(" (another ");
+ Print_Number((int64) (DB->nreads-nreads),0,stdout);
+ printf(" were < H-length)");
+ }
+ printf("\n");
+
+ printf(" Trimmed: ");
+ Print_Number(nelim,7,stdout);
+ printf(" (%5.1f%%) reads ",(100.*nelim)/nreads);
+ Print_Number(nelimbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*nelimbp)/totlen);
+
+ printf(" 5' trim: ");
+ Print_Number(n5trm,7,stdout);
+ printf(" (%5.1f%%) reads ",(100.*n5trm)/nreads);
+ Print_Number(n5trmbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*n5trmbp)/totlen);
+
+ printf(" 3' trim: ");
+ Print_Number(n3trm,7,stdout);
+ printf(" (%5.1f%%) reads ",(100.*n3trm)/nreads);
+ Print_Number(n3trmbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*n3trmbp)/totlen);
+
+ if (natrm > 0)
+ { printf(" Adapter: ");
+ Print_Number(natrm,7,stdout);
+ printf(" (%5.1f%%) reads ",(100.*natrm)/nreads);
+ Print_Number(natrmbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*natrmbp)/totlen);
+ }
+
+ printf("\n");
+
+ printf(" Gaps: ");
+ Print_Number(ngaps,7,stdout);
+ printf(" (%5.1f%%) gaps ",(100.*(ngaps))/nreads);
+ Print_Number(ngapsbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*(ngapsbp))/totlen);
+
+ printf(" Low QV: ");
+ Print_Number(nlowq,7,stdout);
+ printf(" (%5.1f%%) gaps ",(100.*(nlowq))/nreads);
+ Print_Number(nlowqbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*(nlowqbp))/totlen);
+
+ printf(" Span'd: ");
+ Print_Number(nspan,7,stdout);
+ printf(" (%5.1f%%) gaps ",(100.*(nspan))/nreads);
+ Print_Number(nspanbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*(nspanbp))/totlen);
+
+ printf(" Break: ");
+ Print_Number(nchim,7,stdout);
+ printf(" (%5.1f%%) gaps ",(100.*(nchim))/nreads);
+ Print_Number(nchimbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*(nchimbp))/totlen);
+
+ printf("\n");
+
+ printf(" Clipped: ");
+ Print_Number(n5trm+n3trm+nelim+nchim,7,stdout);
+ printf(" clips ");
+ Print_Number(n5trmbp+n3trmbp+nelimbp+nchimbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*(n5trmbp+n3trmbp+nelimbp+nchimbp))/totlen);
+
+ printf(" Patched: ");
+ Print_Number(nlowq+nspan,7,stdout);
+ printf(" patches ");
+ Print_Number(nlowqbp+nspanbp,12,stdout);
+ printf(" (%5.1f%%) bases\n",(100.*(nlowqbp+nspanbp))/totlen);
+
+ free(root);
+ Close_DB(DB);
+ }
+
+ free(Prog_Name);
+ exit (0);
+}
diff --git a/align.c b/align.c
index 82de72a..d4fdfe8 100644
--- a/align.c
+++ b/align.c
@@ -322,7 +322,7 @@ typedef struct
static int VectorEl = 6*sizeof(int) + sizeof(BVEC);
static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath,
- int *mind, int maxd, int mida, int minp, int maxp)
+ int *mind, int maxd, int mida, int minp, int maxp, int aoff, int boff)
{ char *aseq = align->aseq;
char *bseq = align->bseq;
Path *apath = align->path;
@@ -339,7 +339,7 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
int *NA, *NB;
int *_NA, *_NB;
Pebble *cells;
- int avail, cmax, boff;
+ int avail, cmax;
int TRACE_SPACE = spec->trace_space;
int PATH_AVE = spec->ave_path;
@@ -385,11 +385,6 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
cells = (Pebble *) (work->cells);
cmax = work->celmax;
avail = 0;
-
- if (COMP(align->flags))
- boff = align->blen % TRACE_SPACE;
- else
- boff = 0;
}
/* Compute 0-wave starting from mid-line */
@@ -426,7 +421,7 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
work->cells = (void *) cells;
}
- na = ((y+k)/TRACE_SPACE)*TRACE_SPACE;
+ na = (((y+k)+(TRACE_SPACE-aoff))/TRACE_SPACE-1)*TRACE_SPACE+aoff;
#ifdef SHOW_TPS
printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout);
#endif
@@ -978,15 +973,6 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
apath->bepos = trimy;
apath->diffs = trimd;
apath->tlen = atlen;
- if (COMP(align->flags))
- { bpath->abpos = align->blen - apath->bepos;
- bpath->bbpos = align->alen - apath->aepos;
- }
- else
- { bpath->aepos = apath->bepos;
- bpath->bepos = apath->aepos;
- }
- bpath->diffs = trimd;
bpath->tlen = btlen;
}
@@ -997,7 +983,7 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
/*** Reverse Wave ***/
static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath,
- int mind, int maxd, int mida, int minp, int maxp)
+ int mind, int maxd, int mida, int minp, int maxp, int aoff, int boff)
{ char *aseq = align->aseq - 1;
char *bseq = align->bseq - 1;
Path *apath = align->path;
@@ -1014,7 +1000,7 @@ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
int *NA, *NB;
int *_NA, *_NB;
Pebble *cells;
- int avail, cmax, boff;
+ int avail, cmax;
int TRACE_SPACE = spec->trace_space;
int PATH_AVE = spec->ave_path;
@@ -1060,11 +1046,6 @@ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
cells = (Pebble *) (work->cells);
cmax = work->celmax;
avail = 0;
-
- if (COMP(align->flags))
- boff = align->blen % TRACE_SPACE;
- else
- boff = 0;
}
more = 1;
@@ -1099,7 +1080,7 @@ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
work->cells = (void *) cells;
}
- na = ((y+k+TRACE_SPACE-1)/TRACE_SPACE-1)*TRACE_SPACE;
+ na = (((y+k)+(TRACE_SPACE-aoff)-1)/TRACE_SPACE-1)*TRACE_SPACE+aoff;
#ifdef SHOW_TPS
printf(" A %d: -1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout);
#endif
@@ -1572,7 +1553,7 @@ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
#ifdef SHOW_TRAIL
printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout);
#endif
- if ((b+k)%TRACE_SPACE != 0)
+ if ((b+k)%TRACE_SPACE != aoff)
{ h = cells[h].ptr;
if (h < 0)
{ a = trimy;
@@ -1700,15 +1681,6 @@ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
apath->diffs = apath->diffs + trimd;
apath->tlen = apath->tlen - atlen;
apath->trace = atrace + atlen;
- if (COMP(align->flags))
- { bpath->aepos = align->blen - apath->bbpos;
- bpath->bepos = align->alen - apath->abpos;
- }
- else
- { bpath->abpos = apath->bbpos;
- bpath->bbpos = apath->abpos;
- }
- bpath->diffs = bpath->diffs + trimd;
bpath->tlen = bpath->tlen - btlen;
bpath->trace = btrace + btlen;
}
@@ -1727,6 +1699,7 @@ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec,
_Align_Spec *spec = (_Align_Spec *) espec;
Path *apath, *bpath;
+ int aoff, boff;
int minp, maxp;
int selfie;
@@ -1783,7 +1756,20 @@ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec,
else
maxp = hgh+hbord;
- if (forward_wave(work,spec,align,bpath,&low,hgh,anti,minp,maxp))
+ if (ACOMP(align->flags))
+ { aoff = align->alen % spec->trace_space;
+ boff = 0;
+ }
+ else if (COMP(align->flags))
+ { aoff = 0;
+ boff = align->blen % spec->trace_space;
+ }
+ else
+ { aoff = 0;
+ boff = 0;
+ }
+
+ if (forward_wave(work,spec,align,bpath,&low,hgh,anti,minp,maxp,aoff,boff))
EXIT(NULL);
#ifdef DEBUG_PASSES
@@ -1792,7 +1778,7 @@ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec,
apath->aepos,apath->bepos,apath->diffs);
#endif
- if (reverse_wave(work,spec,align,bpath,low,low,anti,minp,maxp))
+ if (reverse_wave(work,spec,align,bpath,low,low,anti,minp,maxp,aoff,boff))
EXIT(NULL);
#ifdef DEBUG_PASSES
@@ -1800,11 +1786,43 @@ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec,
(anti+low)/2,(anti-low)/2,apath->abpos,apath->bbpos,apath->diffs);
#endif
- if (COMP(align->flags))
+ bpath->diffs = apath->diffs;
+ if (ACOMP(align->flags))
+ { uint16 *trace = (uint16 *) apath->trace;
+ uint16 p;
+ int i, j;
+
+ bpath->aepos = apath->bepos;
+ bpath->bepos = apath->aepos;
+ bpath->abpos = apath->bbpos;
+ bpath->bbpos = apath->abpos;
+
+ apath->abpos = align->alen - bpath->bepos;
+ apath->bbpos = align->blen - bpath->aepos;
+ apath->aepos = align->alen - bpath->bbpos;
+ apath->bepos = align->blen - bpath->abpos;
+ i = apath->tlen-2;
+ j = 0;
+ while (j < i)
+ { p = trace[i];
+ trace[i] = trace[j];
+ trace[j] = p;
+ p = trace[i+1];
+ trace[i+1] = trace[j+1];
+ trace[j+1] = p;
+ i -= 2;
+ j += 2;
+ }
+ }
+ else if (COMP(align->flags))
{ uint16 *trace = (uint16 *) bpath->trace;
uint16 p;
int i, j;
+ bpath->abpos = align->blen - apath->bepos;
+ bpath->bbpos = align->alen - apath->aepos;
+ bpath->aepos = align->blen - apath->bbpos;
+ bpath->bepos = align->alen - apath->abpos;
i = bpath->tlen-2;
j = 0;
while (j < i)
@@ -1818,13 +1836,19 @@ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec,
j += 2;
}
}
+ else
+ { bpath->aepos = apath->bepos;
+ bpath->bepos = apath->aepos;
+ bpath->abpos = apath->bbpos;
+ bpath->bbpos = apath->abpos;
+ }
#ifdef DEBUG_POINTS
{ uint16 *trace = (uint16 *) apath->trace;
int a, h;
printf("\nA-path (%d,%d)->(%d,%d)",apath->abpos,apath->bbpos,apath->aepos,apath->bepos);
- printf(" %c\n",(COMP(align->flags) ? 'c' : 'n'));
+ printf(" %c\n",((COMP(align->flags) || ACOMP(align->flags)) ? 'c' : 'n'));
a = apath->bbpos;
for (h = 1; h < apath->tlen; h += 2)
{ int dif = trace[h-1];
@@ -1838,7 +1862,8 @@ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec,
int a, h;
printf("\nB-path (%d,%d)->(%d,%d)",bpath->abpos,bpath->bbpos,bpath->aepos,bpath->bepos);
- printf(" %c [%d,%d]\n",(COMP(align->flags) ? 'c' : 'n'),align->blen,align->alen);
+ printf(" %c [%d,%d]\n",((COMP(align->flags) || ACOMP(align->flags)) ? 'c' : 'n'),
+ align->blen,align->alen);
a = bpath->bbpos;
for (h = 1; h < bpath->tlen; h += 2)
{ int dif = trace[h-1];
@@ -2958,8 +2983,8 @@ int Find_Extension(Alignment *align, Work_Data *ework, Align_Spec *espec,
if (prefix)
{ if (reverse_extend(work,spec,align,diag,anti,minp,maxp))
EXIT(1);
- apath->aepos = (anti-diag)/2;
- apath->bepos = (anti+diag)/2;
+ apath->aepos = (anti+diag)/2;
+ apath->bepos = (anti-diag)/2;
#ifdef DEBUG_PASSES
printf("E1 (%d,%d) => (%d,%d) %d\n",
(anti+diag)/2,(anti-diag)/2,apath->abpos,apath->bbpos,apath->diffs);
@@ -2968,8 +2993,8 @@ int Find_Extension(Alignment *align, Work_Data *ework, Align_Spec *espec,
else
{ if (forward_extend(work,spec,align,diag,anti,minp,maxp))
EXIT(1);
- apath->abpos = (anti-diag)/2;
- apath->bbpos = (anti+diag)/2;
+ apath->abpos = (anti+diag)/2;
+ apath->bbpos = (anti-diag)/2;
#ifdef DEBUG_PASSES
printf("F1 (%d,%d) => (%d,%d) %d\n",
(anti+diag)/2,(anti-diag)/2,apath->aepos,apath->bepos,apath->diffs);
@@ -3019,10 +3044,13 @@ int Read_Trace(FILE *input, Overlap *ovl, int tbytes)
return (0);
}
-void Write_Overlap(FILE *output, Overlap *ovl, int tbytes)
-{ fwrite( ((char *) ovl) + PtrSize, OvlIOSize, 1, output);
+int Write_Overlap(FILE *output, Overlap *ovl, int tbytes)
+{ if (fwrite( ((char *) ovl) + PtrSize, OvlIOSize, 1, output) != 1)
+ return (1);
if (ovl->path.trace != NULL)
- fwrite(ovl->path.trace,tbytes,ovl->path.tlen,output);
+ if (fwrite(ovl->path.trace,tbytes,ovl->path.tlen,output) != (size_t) ovl->path.tlen)
+ return (1);
+ return (0);
}
void Compress_TraceTo8(Overlap *ovl)
@@ -3085,29 +3113,46 @@ void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent)
}
int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname)
-{ int i, p;
-
- if (((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace)*2 != ovl->path.tlen-2)
- { if (verbose)
- EPRINTF(EPLACE," %s: Wrong number of trace points\n",fname);
- return (1);
- }
- p = ovl->path.bbpos;
- if (tspace <= TRACE_XOVR)
- { uint8 *trace8 = (uint8 *) ovl->path.trace;
- for (i = 1; i < ovl->path.tlen; i += 2)
- p += trace8[i];
+{ int i, p, q;
+
+ if (tspace != 0)
+ { if (((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace)*2 != ovl->path.tlen-2)
+ { if (verbose)
+ EPRINTF(EPLACE," %s: Wrong number of trace points\n",fname);
+ return (1);
+ }
+ p = ovl->path.bbpos;
+ if (tspace <= TRACE_XOVR)
+ { uint8 *trace8 = (uint8 *) ovl->path.trace;
+ for (i = 1; i < ovl->path.tlen; i += 2)
+ p += trace8[i];
+ }
+ else
+ { uint16 *trace16 = (uint16 *) ovl->path.trace;
+ for (i = 1; i < ovl->path.tlen; i += 2)
+ p += trace16[i];
+ }
+ if (p != ovl->path.bepos)
+ { if (verbose)
+ EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname);
+ return (1);
+ }
}
- else
+ else
{ uint16 *trace16 = (uint16 *) ovl->path.trace;
+
+ p = ovl->path.bbpos;
+ q = ovl->path.abpos;
for (i = 1; i < ovl->path.tlen; i += 2)
- p += trace16[i];
+ { p += trace16[i];
+ q += trace16[i-1];
+ }
+ if (p != ovl->path.bepos || q != ovl->path.aepos)
+ { if (verbose)
+ EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname);
+ return (1);
+ }
}
- if (p != ovl->path.bepos)
- { if (verbose)
- EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname);
- return (1);
- }
return (0);
}
diff --git a/align.h b/align.h
index e937b68..daa9151 100644
--- a/align.h
+++ b/align.h
@@ -113,11 +113,30 @@ typedef struct
the sequence before calling Compute_Trace or Print_Alignment. Complement_Seq complements
the sequence a of length n. The operation does the complementation/reversal in place.
Calling it a second time on a given fragment restores it to its original state.
+
+ With the introduction of the DAMAPPER, we need to code chains of alignments between a
+ pair of sequences. The alignments of a chain are expected to be found in order either on
+ a file or in memory, where the START_FLAG marks the first alignment and the NEXT_FLAG all
+ subsequent alignmenst in a chain. A chain of a single LA is marked with the START_FLAG.
+ The BEST_FLAG marks one of the best chains for a pair of sequences. The convention is
+ that either every record has either a START- or NEXT-flag, or none of them do (e.g. as
+ produced by daligner), so one can always check the flags of the first alignment to see
+ whether or not the chain concept applies to a given collection or not.
***/
-#define COMP(x) ((x) & 0x1)
+#define COMP_FLAG 0x1
+#define ACOMP_FLAG 0x2 // A-sequence is complemented, not B ! Only Local_Alignment notices
+
+#define COMP(x) ((x) & COMP_FLAG)
+#define ACOMP(x) ((x) & ACOMP_FLAG)
+
+#define START_FLAG 0x4 // LA is the first of a chain of 1 or more la's
+#define NEXT_FLAG 0x8 // LA is the next segment of a chain.
+#define BEST_FLAG 0x10 // This is the start of the best chain
-#define COMP_FLAG 0x1
+#define CHAIN_START(x) ((x) & START_FLAG)
+#define CHAIN_NEXT(x) ((x) & NEXT_FLAG)
+#define BEST_CHAIN(x) ((x) & BEST_FLAG)
typedef struct
{ Path *path;
@@ -306,7 +325,7 @@ typedef struct {
accommodate the trace where each value take 'tbytes' bytes (1 if uint8 or 2 if uint16).
Write_Overlap write 'ovl' to stream 'output' followed by its trace vector (if any) that
- occupies 'tbytes' bytes per value.
+ occupies 'tbytes' bytes per value. It returns non-zero if there was an error writing.
Print_Overlap prints an ASCII version of the contents of 'ovl' to stream 'output'
where the trace occupes 'tbytes' per value and the print out is indented from the left
@@ -324,7 +343,7 @@ typedef struct {
int Read_Overlap(FILE *input, Overlap *ovl);
int Read_Trace(FILE *innput, Overlap *ovl, int tbytes);
- void Write_Overlap(FILE *output, Overlap *ovl, int tbytes);
+ int Write_Overlap(FILE *output, Overlap *ovl, int tbytes);
void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent);
void Compress_TraceTo8(Overlap *ovl);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/dascrubber.git
More information about the debian-med-commit
mailing list