[med-svn] [daligner] 01/02: Imported Upstream version 1.0+20160927
Afif Elghraoui
afif at moszumanska.debian.org
Mon Oct 10 15:48:53 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository daligner.
commit 3e9ce5578958b23ee143ce3ca4a9b3695fc44a8f
Author: Afif Elghraoui <afif at debian.org>
Date: Mon Oct 10 06:49:46 2016 -0700
Imported Upstream version 1.0+20160927
---
DB.c | 86 ++-
DB.h | 28 +-
HPC.daligner.c | 1399 +++++++++++++++++++++++++++++++++++++++++++++++
HPCdaligner.c | 543 ------------------
HPCmapper.c | 591 --------------------
LAcat.c | 53 +-
LAcheck.c | 95 +++-
LAdump.c | 13 +-
LAmerge.c | 58 +-
LAshow.c | 78 ++-
LAsort.c | 83 ++-
LAsplit.c | 51 +-
LAupgrade.Dec.31.2014.c | 153 ------
Makefile | 15 +-
QV.c | 56 +-
QV.h | 18 +-
README | 395 -------------
README.md | 506 +++++++++++++++++
align.c | 165 ++++--
align.h | 23 +-
daligner.c | 102 +++-
filter.c | 348 +++++++-----
filter.h | 5 +-
23 files changed, 2801 insertions(+), 2063 deletions(-)
diff --git a/DB.c b/DB.c
index c3f8129..46046b7 100644
--- a/DB.c
+++ b/DB.c
@@ -54,7 +54,9 @@ void *Malloc(int64 size, char *mesg)
}
void *Realloc(void *p, int64 size, char *mesg)
-{ if ((p = realloc(p,size)) == NULL)
+{ if (size <= 0)
+ size = 1;
+ if ((p = realloc(p,size)) == NULL)
{ if (mesg == NULL)
EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name);
else
@@ -776,6 +778,43 @@ void Close_DB(HITS_DB *db)
}
+// Return the size in bytes of the memory occupied by a given DB
+
+int64 sizeof_DB(HITS_DB *db)
+{ int64 s;
+ HITS_TRACK *t;
+
+ s = sizeof(HITS_DB)
+ + sizeof(HITS_READ)*(db->nreads+2)
+ + strlen(db->path)+1
+ + (db->totlen+db->nreads+4);
+
+ t = db->tracks;
+ if (t != NULL && strcmp(t->name,". at qvs") == 0)
+ { HITS_QV *q = (HITS_QV *) t;
+ s += sizeof(HITS_QV)
+ + sizeof(uint16) * db->nreads
+ + q->ncodes * sizeof(QVcoding)
+ + 6;
+ t = t->next;
+ }
+
+ for (; t != NULL; t = t->next)
+ { s += sizeof(HITS_TRACK)
+ + strlen(t->name)+1
+ + t->size * (db->nreads+1);
+ if (t->data != NULL)
+ { if (t->size == 8)
+ s += sizeof(int)*((int64 *) t->anno)[db->nreads];
+ else // t->size == 4
+ s += sizeof(int)*((int *) t->anno)[db->nreads];
+ }
+ }
+
+ return (s);
+}
+
+
/*******************************************************************************************
*
* QV LOAD & CLOSE ROUTINES
@@ -791,7 +830,7 @@ int Load_QVs(HITS_DB *db)
uint16 *table;
HITS_QV *qvtrk;
QVcoding *coding, *nx;
- int ncodes;
+ int ncodes = 0;
if (db->tracks != NULL && strcmp(db->tracks->name,". at qvs") == 0)
return (0);
@@ -802,8 +841,14 @@ int Load_QVs(HITS_DB *db)
}
if (db->reads[db->nreads-1].coff < 0)
- { EPRINTF(EPLACE,"%s: The requested QVs have not been added to the DB!\n",Prog_Name);
- EXIT(1);
+ { if (db->part > 0)
+ { EPRINTF(EPLACE,"%s: All QVs for this block have not been added to the DB!\n",Prog_Name);
+ EXIT(1);
+ }
+ else
+ { EPRINTF(EPLACE,"%s: All QVs for this DB have not been added!\n",Prog_Name);
+ EXIT(1);
+ }
}
// Open .qvs, .idx, and .db files
@@ -818,8 +863,14 @@ int Load_QVs(HITS_DB *db)
coding = NULL;
qvtrk = NULL;
- root = rindex(db->path,'/') + 2;
- istub = Fopen(Catenate(db->path,"/",root,".db"),"r");
+ root = rindex(db->path,'/');
+ if (root[1] == '.')
+ { *root = '\0';
+ istub = Fopen(Catenate(db->path,"/",root+2,".db"),"r");
+ *root = '/';
+ }
+ else
+ istub = Fopen(Catenate(db->path,"","",".db"),"r");
if (istub == NULL)
goto error;
@@ -877,16 +928,16 @@ int Load_QVs(HITS_DB *db)
// assign the tables # for each read in the block in "tables".
rewind(istub);
- fscanf(istub,DB_NFILE,&nfiles);
+ (void) fscanf(istub,DB_NFILE,&nfiles);
first = 0;
for (n = 0; n < fbeg; n++)
- { fscanf(istub,DB_FDATA,&last,fname,prolog);
+ { (void) fscanf(istub,DB_FDATA,&last,fname,prolog);
first = last;
}
for (n = fbeg; n < fend; n++)
- { fscanf(istub,DB_FDATA,&last,fname,prolog);
+ { (void) fscanf(istub,DB_FDATA,&last,fname,prolog);
i = n-fbeg;
if (first < pfirst)
@@ -1059,16 +1110,22 @@ int Check_Track(HITS_DB *db, char *track, int *kind)
return (-2);
if (fread(&tracklen,sizeof(int),1,afile) != 1)
- return (-1);
+ { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track);
+ exit (1);
+ }
if (fread(&size,sizeof(int),1,afile) != 1)
- return (-1);
+ { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track);
+ exit (1);
+ }
if (size == 0)
*kind = MASK_TRACK;
else if (size > 0)
*kind = CUSTOM_TRACK;
else
- return (-1);
+ { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track);
+ exit (1);
+ }
fclose(afile);
@@ -1188,7 +1245,10 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track)
if ( ! ispart && db->part > 0)
fseeko(afile,size*db->ufirst,SEEK_CUR);
}
- nreads = db->nreads;
+ if (tracklen == treads)
+ nreads = ((int *) (db->reads))[-2];
+ else
+ nreads = ((int *) (db->reads))[-1];
anno = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector");
if (anno == NULL)
diff --git a/DB.h b/DB.h
index 48b319e..983bee7 100644
--- a/DB.h
+++ b/DB.h
@@ -34,8 +34,6 @@
// value. For such routines that were previously void, they are now int, and
// return 1 if an error occured, 0 otherwise.
-#undef INTERACTIVE
-
#ifdef INTERACTIVE
#define EPRINTF sprintf
@@ -98,7 +96,8 @@ extern char Ebuffer[];
#define ARG_POSITIVE(var,name) \
var = strtol(argv[i]+2,&eptr,10); \
if (*eptr != '\0' || argv[i][2] == '\0') \
- { fprintf(stderr,"%s: -%c argument is not an integer\n",Prog_Name,argv[i][1]); \
+ { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n", \
+ Prog_Name,argv[i][1],argv[i]+2); \
exit (1); \
} \
if (var <= 0) \
@@ -109,7 +108,8 @@ extern char Ebuffer[];
#define ARG_NON_NEGATIVE(var,name) \
var = strtol(argv[i]+2,&eptr,10); \
if (*eptr != '\0' || argv[i][2] == '\0') \
- { fprintf(stderr,"%s: -%c argument is not an integer\n",Prog_Name,argv[i][1]); \
+ { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n", \
+ Prog_Name,argv[i][1],argv[i]+2); \
exit (1); \
} \
if (var < 0) \
@@ -120,7 +120,8 @@ extern char Ebuffer[];
#define ARG_REAL(var) \
var = strtod(argv[i]+2,&eptr); \
if (*eptr != '\0' || argv[i][2] == '\0') \
- { fprintf(stderr,"%s: -%c argument is not a real number\n",Prog_Name,argv[i][1]); \
+ { fprintf(stderr,"%s: -%c '%s' argument is not a real number\n", \
+ Prog_Name,argv[i][1],argv[i]+2); \
exit (1); \
}
@@ -174,14 +175,17 @@ void Number_Read(char *s); // Convert read from letters to numbers
#define DB_CSS 0x0400 // This is the second or later of a group of reads from a given insert
#define DB_BEST 0x0800 // This is the longest read of a given insert (may be the only 1)
+// Fields have different interpretations if a .db versus a .dam
+
typedef struct
- { int origin; // Well #
+ { int origin; // Well # (DB), Contig # (DAM)
int rlen; // Length of the sequence (Last pulse = fpulse + rlen)
- int fpulse; // First pulse
+ int fpulse; // First pulse (DB), left index of contig in scaffold (DAM)
int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of
// uncompressed bases in memory block
- int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file
- int flags; // QV of read + flags above
+ int64 coff; // Offset (in bytes) of compressed quiva streams in '.qvs' file (DB),
+ // Offset (in bytes) of scaffold header string in '.hdr' file (DAM)
+ int flags; // QV of read + flags above (DB only)
} HITS_READ;
// A track can be of 3 types:
@@ -262,7 +266,7 @@ typedef struct
#define DB_NFILE "files = %9d\n" // number of files
#define DB_FDATA " %9d %s %s\n" // last read index + 1, fasta prolog, file name
#define DB_NBLOCK "blocks = %9d\n" // number of blocks
-#define DB_PARAMS "size = %9lld cutoff = %9d all = %1d\n" // block size, len cutoff, all in well
+#define DB_PARAMS "size = %10lld cutoff = %9d all = %1d\n" // block size, len cutoff, all in well
#define DB_BDATA " %9d %9d\n" // First read index (untrimmed), first read index (trimmed)
@@ -305,6 +309,10 @@ void Trim_DB(HITS_DB *db);
void Close_DB(HITS_DB *db);
+ // Return the size in bytes of the given DB
+
+int64 sizeof_DB(HITS_DB *db);
+
// If QV pseudo track is not already in db's track list, then load it and set it up.
// The database must not have been trimmed yet. -1 is returned if a .qvs file is not
// present, and 1 is returned if an error (reported to EPLACE) occured and INTERACTIVE
diff --git a/HPC.daligner.c b/HPC.daligner.c
new file mode 100644
index 0000000..649a629
--- /dev/null
+++ b/HPC.daligner.c
@@ -0,0 +1,1399 @@
+/*********************************************************************************************\
+ *
+ * Produce a script to compute overlaps for all block pairs of a DB, and then sort and merge
+ * them into as many .las files as their are blocks.
+ *
+ * Author: Gene Myers
+ * Date : June 1, 2014
+ *
+ *********************************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <ctype.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "DB.h"
+#include "filter.h"
+
+#undef LSF // define if want a directly executable LSF script
+
+static char *Usage[] =
+ { "[-vbad] [-t<int>] [-w<int(6)>] [-l<int(1000)>] [-s<int(100)]",
+ " [-M<int>] [-B<int(4)>] [-D<int( 250)>] [-T<int(4)>] [-f<name>]",
+ " ( [-k<int(14)>] [-h<int(35)>] [-e<double(.70)>] [-H<int>]",
+ " [-k<int(20)>] [-h<int(50)>] [-e<double(.85)>] <ref:db|dam> )",
+ " [-m<track>]+ <reads:db|dam> [<first:int>[-<last:int>]]"
+ };
+
+ // Command Options
+
+static int DUNIT, BUNIT;
+static int VON, BON, CON, DON;
+static int WINT, TINT, HGAP, HINT, KINT, SINT, LINT, MINT;
+static int NTHREADS;
+static double EREL;
+static int MMAX, MTOP;
+static char **MASK;
+static char *ONAME;
+
+#define LSF_ALIGN "bsub -q medium -n 4 -o DALIGNER.out -e DALIGNER.err -R span[hosts=1] -J align#%d"
+#define LSF_MERGE \
+ "bsub -q short -n 12 -o MERGE%d.DAL.out -e MERGE%d.DAL.err -R span[hosts=1] -J merge#%d"
+#define LSF_CHECK \
+ "bsub -q short -n 12 -o CHECK%d.DAL.out -e CHECK%d.DAL.err -R span[hosts=1] -J check#%d"
+
+void daligner_script(int argc, char *argv[])
+{ int nblocks;
+ int usepath;
+ int useblock;
+ int fblock, lblock;
+#ifdef LSF
+ int jobid;
+#endif
+
+ FILE *out;
+ char name[100];
+ char *pwd, *root;
+
+ // Make sure DB exists and is partitioned, get number of blocks in partition
+
+ pwd = PathTo(argv[1]);
+ if (strcmp(argv[1]+(strlen(argv[1])-4),".dam") == 0)
+ root = Root(argv[1],".dam");
+ else
+ root = Root(argv[1],".db");
+
+ { int i, nfiles;
+ FILE *dbvis;
+
+ dbvis = fopen(Catenate(pwd,"/",root,".dam"),"r");
+ if (dbvis == NULL)
+ { dbvis = Fopen(Catenate(pwd,"/",root,".db"),"r");
+ if (dbvis == NULL)
+ exit (1);
+ }
+
+ if (fscanf(dbvis,"files = %d\n",&nfiles) != 1)
+ SYSTEM_ERROR
+ for (i = 0; i < nfiles; i++)
+ { char buffer[30001];
+
+ if (fgets(buffer,30000,dbvis) == NULL)
+ SYSTEM_ERROR
+ }
+
+ useblock = 1;
+ if (fscanf(dbvis,"blocks = %d\n",&nblocks) != 1 || nblocks == 1)
+ { useblock = 0;
+ nblocks = 1;
+ }
+
+ usepath = (strcmp(pwd,".") != 0);
+ }
+
+ // Set range fblock-lblock checking that DB.<fblock-1>.las exists & DB.<fblock>.las does not
+
+ { char *eptr, *fptr;
+ FILE *file;
+
+ if (argc == 3)
+ { fblock = strtol(argv[2],&eptr,10);
+ if (*eptr != '\0' && *eptr != '-')
+ { fprintf(stderr,"%s: final argument '%s' does not start with an integer\n",
+ Prog_Name,argv[2]);
+ exit (1);
+ }
+ useblock = 1;
+ if (*eptr == '-')
+ { lblock = strtol(eptr+1,&fptr,10);
+ if (*fptr != '\0')
+ { fprintf(stderr,"%s: second part of range '%s' is not an integer\n",
+ Prog_Name,eptr+1);
+ exit (1);
+ }
+ }
+ else
+ lblock = fblock;
+ if (fblock < 1 || lblock > nblocks || fblock > lblock)
+ { fprintf(stderr,"%s: range %d-%d is empty or out of bounds\n",Prog_Name,fblock,lblock);
+ exit (1);
+ }
+ }
+ else
+ { fblock = 1;
+ lblock = nblocks;
+ }
+
+ if (fblock > 1)
+ { file = fopen(Catenate(pwd,"/",root,Numbered_Suffix(".",fblock-1,".las")),"r");
+ if (file == NULL)
+ { if (usepath)
+ fprintf(stderr,"%s: File %s/%s.%d.las should already be present!\n",
+ Prog_Name,pwd,root,fblock-1);
+ else
+ fprintf(stderr,"%s: File %s.%d.las should already be present!\n",
+ Prog_Name,root,fblock-1);
+ exit (1);
+ }
+ else
+ fclose(file);
+ }
+ if (useblock)
+ file = fopen(Catenate(pwd,"/",root,Numbered_Suffix(".",fblock,".las")),"r");
+ else
+ file = fopen(Catenate(pwd,"/",root,".las"),"r");
+ if (file != NULL)
+ { if (usepath)
+ if (useblock)
+ fprintf(stderr,"%s: File %s/%s.%d.las should not yet exist!\n",
+ Prog_Name,pwd,root,fblock);
+ else
+ fprintf(stderr,"%s: File %s/%s.las should not yet exist!\n",Prog_Name,pwd,root);
+ else
+ if (useblock)
+ fprintf(stderr,"%s: File %s.%d.las should not yet exist!\n",Prog_Name,root,fblock);
+ else
+ fprintf(stderr,"%s: File %s.las should not yet exist!\n",Prog_Name,root);
+ exit (1);
+ }
+
+ DON = (DON && (lblock > 1));
+ out = stdout;
+ }
+
+ { int level, njobs;
+ int i, j, k;
+
+ // Create all work subdirectories if DON
+
+ if (DON)
+ { if (ONAME != NULL)
+ { sprintf(name,"%s.00.MKDIR",ONAME);
+ out = fopen(name,"w");
+ }
+
+ fprintf(out,"# Create work subdirectories\n");
+ for (i = fblock; i <= lblock; i++)
+ fprintf(out,"mkdir work%d\n",i);
+
+ if (ONAME != NULL)
+ fclose(out);
+ }
+
+ // Produce all necessary daligner jobs
+
+ if (ONAME != NULL)
+ { sprintf(name,"%s.01.OVL",ONAME);
+ out = fopen(name,"w");
+ }
+
+ njobs = 0;
+ for (i = fblock; i <= lblock; i++)
+ njobs += (i-1)/BUNIT+1;
+
+ fprintf(out,"# Daligner jobs (%d)\n",njobs);
+
+#ifdef LSF
+ jobid = 1;
+#endif
+ for (i = fblock; i <= lblock; i++)
+ { int bits;
+ int low, hgh;
+
+ bits = (i-1)/BUNIT+1;
+ low = 1;
+ for (j = 1; j <= bits; j++)
+ {
+#ifdef LSF
+ fprintf(out,LSF_ALIGN,jobid++);
+ fprintf(out," \"");
+#endif
+ fprintf(out,"daligner");
+ if (VON)
+ fprintf(out," -v");
+ if (BON)
+ fprintf(out," -b");
+ if (KINT != 14)
+ fprintf(out," -k%d",KINT);
+ if (WINT != 6)
+ fprintf(out," -w%d",WINT);
+ if (HINT != 35)
+ fprintf(out," -h%d",HINT);
+ if (TINT > 0)
+ fprintf(out," -t%d",TINT);
+ if (HGAP > 0)
+ fprintf(out," -H%d",HGAP);
+ if (EREL > 0.)
+ fprintf(out," -e%g",EREL);
+ if (LINT != 1000)
+ fprintf(out," -l%d",LINT);
+ if (SINT != 100)
+ fprintf(out," -s%d",SINT);
+ if (MINT >= 0)
+ fprintf(out," -M%d",MINT);
+ if (NTHREADS != 4)
+ fprintf(out," -T%d",NTHREADS);
+ for (k = 0; k < MTOP; k++)
+ fprintf(out," -m%s",MASK[k]);
+ if (useblock)
+ if (usepath)
+ fprintf(out," %s/%s.%d",pwd,root,i);
+ else
+ fprintf(out," %s.%d",root,i);
+ else
+ if (usepath)
+ fprintf(out," %s/%s",pwd,root);
+ else
+ fprintf(out," %s",root);
+ hgh = (i*j)/bits + 1;
+ for (k = low; k < hgh; k++)
+ if (useblock)
+ if (usepath)
+ fprintf(out," %s/%s.%d",pwd,root,k);
+ else
+ fprintf(out," %s.%d",root,k);
+ else
+ if (usepath)
+ fprintf(out," %s/%s",pwd,root);
+ else
+ fprintf(out," %s",root);
+
+ if (lblock == 1) // ==> i = 1, [low,hgh) = [1,2)
+ { fprintf(out," && mv");
+ if (useblock)
+ fprintf(out," %s.1.%s.1.las",root,root);
+ else
+ fprintf(out," %s.%s.las",root,root);
+ if (usepath)
+ fprintf(out," %s/",pwd);
+ else
+ fprintf(out," ");
+ if (useblock)
+ fprintf(out,"%s.1.las",root);
+ else
+ fprintf(out,"%s.las",root);
+ }
+ else if (DON)
+ { fprintf(out," && mv");
+ for (k = low; k < hgh; k++)
+ fprintf(out," %s.%d.%s.%d.las",root,i,root,k);
+ fprintf(out," work%d",i);
+ for (k = low; k < hgh; k++)
+ if (k != i)
+ fprintf(out," && mv %s.%d.%s.%d.las work%d",root,k,root,i,k);
+ }
+
+#ifdef LSF
+ fprintf(out,"\"");
+#endif
+ fprintf(out,"\n");
+ low = hgh;
+ }
+ }
+
+ // Check .las files (optional)
+
+ if (ONAME != NULL)
+ { fclose(out);
+ sprintf(name,"%s.02.CHECK.OPT",ONAME);
+ out = fopen(name,"w");
+ }
+
+ fprintf(out,"# Check initial .las files jobs (%d) (optional but recommended)\n",
+ (fblock-1) * ((lblock-fblock)/(BUNIT+1) + 1) +
+ (lblock-fblock+1) * ((lblock-1)/(BUNIT+1) + 1) );
+
+#ifdef LSF
+ jobid = 1;
+#endif
+ for (i = 1; i <= lblock; i++)
+ for (j = (i < fblock ? fblock : 1); j <= lblock; )
+ { k = j+BUNIT;
+ if (k > lblock)
+ k = lblock;
+#ifdef LSF
+ fprintf(out,LSF_CHECK,0,0,jobid++);
+ fprintf(out," \"");
+#endif
+ fprintf(out,"LAcheck -vS");
+ if (usepath)
+ fprintf(out," %s/%s",pwd,root);
+ else
+ fprintf(out," %s",root);
+ while (j <= k)
+ { if (lblock == 1)
+ { if (usepath)
+ if (useblock)
+ fprintf(out," %s/%s.1",pwd,root);
+ else
+ fprintf(out," %s/%s",pwd,root);
+ else
+ if (useblock)
+ fprintf(out," %s.1",root);
+ else
+ fprintf(out," %s",root);
+ }
+ else
+ { if (DON)
+ fprintf(out," work%d/%s.%d.%s.%d",i,root,i,root,j);
+ else
+ fprintf(out," %s.%d.%s.%d",root,i,root,j);
+ }
+ j += 1;
+ }
+#ifdef LSF
+ fprintf(out,"\"");
+#endif
+ fprintf(out,"\n");
+ }
+
+ if (ONAME != NULL)
+ fclose(out);
+
+ // Higher level merges (if lblock > 1)
+
+ if (lblock > 1)
+ { int pow, stage;
+
+ // Determine the number of merging levels
+
+ stage = 3;
+
+ pow = 1;
+ for (level = 0; pow < lblock; level++)
+ pow *= DUNIT;
+
+ // Issue the commands for each merge level
+
+ { int p, cnt, dnt;
+
+ cnt = lblock;
+ dnt = (lblock-fblock)+1;
+ for (i = 1; i <= level; i++)
+ { int bits, dits;
+ int low, hgh;
+
+ if (ONAME != NULL)
+ { sprintf(name,"%s.%02d.MERGE",ONAME,stage++);
+ out = fopen(name,"w");
+ }
+
+ bits = (cnt-1)/DUNIT+1;
+ dits = (dnt-1)/DUNIT+1;
+
+ // Incremental update merges
+
+#ifdef LSF
+ jobid = 1;
+#endif
+ if (dnt >= 1)
+ { int last;
+
+ last = (dnt == 1 || i == level);
+ fprintf(out,"# Level %d merge jobs (%d)\n",
+ i,bits*((lblock-fblock)+1) + dits*(fblock-1));
+ for (j = 1; j < fblock; j++)
+ {
+#ifdef LSF
+ fprintf(out,LSF_MERGE,i,i,jobid++);
+ fprintf(out," \"");
+#endif
+ if (last)
+ { if (DON)
+ { if (usepath)
+ fprintf(out,"mv %s/%s.%d.las work%d/L%d.%d.0.las && ",
+ pwd,root,j,j,i,j);
+ else
+ fprintf(out,"mv %s.%d.las work%d/L%d.%d.0.las && ",root,j,j,i,j);
+ }
+ else
+ { if (usepath)
+ fprintf(out,"mv %s/%s.%d.las L%d.%d.0.las && ",pwd,root,j,i,j);
+ else
+ fprintf(out,"mv %s.%d.las L%d.%d.0.las && ",root,j,i,j);
+ }
+ }
+ low = 1;
+ for (p = 1; p <= dits; p++)
+ { hgh = (dnt*p)/dits;
+#ifdef LSF
+ if (p > 1)
+ { fprintf(out,LSF_MERGE,i,i,jobid++);
+ fprintf(out," \"");
+ }
+#endif
+ fprintf(out,"LAmerge");
+ if (VON)
+ fprintf(out," -v");
+ if (CON)
+ fprintf(out," -a");
+ if (last)
+ if (DON)
+ if (usepath)
+ fprintf(out," %s/%s.%d work%d/L%d.%d.0",pwd,root,j,j,i,j);
+ else
+ fprintf(out," %s.%d work%d/L%d.%d.0",root,j,j,i,j);
+ else
+ if (usepath)
+ fprintf(out," %s/%s.%d L%d.%d.0",pwd,root,j,i,j);
+ else
+ fprintf(out," %s.%d L%d.%d.0",root,j,i,j);
+ else
+ if (DON)
+ fprintf(out," work%d/L%d.%d.%d",j,i+1,j,p);
+ else
+ fprintf(out," L%d.%d.%d",i+1,j,p);
+ for (k = low; k <= hgh; k++)
+ if (i == 1)
+ if (DON)
+ fprintf(out," work%d/%s.%d.%s.%d",j,root,j,root,k+(fblock-1));
+ else
+ fprintf(out," %s.%d.%s.%d",root,j,root,k+(fblock-1));
+ else
+ if (DON)
+ fprintf(out," work%d/L%d.%d.%d",j,i,j,k);
+ else
+ fprintf(out," L%d.%d.%d",i,j,k);
+#ifdef LSF
+ fprintf(out,"\"");
+#endif
+ fprintf(out,"\n");
+ low = hgh+1;
+ }
+ }
+ }
+ else
+ fprintf(out,"# Level %d merge jobs (%d)\n",i,bits*((lblock-fblock)+1));
+
+ // New block merges
+
+ for (j = fblock; j <= lblock; j++)
+ { low = 1;
+ for (p = 1; p <= bits; p++)
+ { hgh = (cnt*p)/bits;
+#ifdef LSF
+ fprintf(out,LSF_MERGE,i,i,jobid++);
+ fprintf(out," \"");
+#endif
+ fprintf(out,"LAmerge");
+ if (VON)
+ fprintf(out," -v");
+ if (CON)
+ fprintf(out," -a");
+ if (i == level)
+ if (usepath)
+ fprintf(out," %s/%s.%d",pwd,root,j);
+ else
+ fprintf(out," %s.%d",root,j);
+ else
+ if (DON)
+ fprintf(out," work%d/L%d.%d.%d",j,i+1,j,p);
+ else
+ fprintf(out," L%d.%d.%d",i+1,j,p);
+ for (k = low; k <= hgh; k++)
+ if (i == 1)
+ if (DON)
+ fprintf(out," work%d/%s.%d.%s.%d",j,root,j,root,k);
+ else
+ fprintf(out," %s.%d.%s.%d",root,j,root,k);
+ else
+ if (DON)
+ fprintf(out," work%d/L%d.%d.%d",j,i,j,k);
+ else
+ fprintf(out," L%d.%d.%d",i,j,k);
+#ifdef LSF
+ fprintf(out,"\"");
+#endif
+ fprintf(out,"\n");
+ low = hgh+1;
+ }
+ }
+
+ // Check new .las (optional)
+
+ if (ONAME != NULL)
+ { fclose(out);
+ sprintf(name,"%s.%02d.CHECK.OPT",ONAME,stage++);
+ out = fopen(name,"w");
+ }
+
+ fprintf(out,"# Check level %d .las files jobs (%d) (optional but recommended)\n",
+ i+1,(fblock-1)*((dits-1)/(BUNIT+1)+1) +
+ (lblock-fblock+1)*((bits-1)/(BUNIT+1)+1) );
+
+#ifdef LSF
+ jobid = 1;
+#endif
+ if (dnt >= 1)
+ { int last;
+
+ last = (dnt == 1 || i == level);
+ for (j = 1; j < fblock; j++)
+ for (p = 1; p <= dits;)
+ { k = p+BUNIT;
+ if (k > dits)
+ k = dits;
+#ifdef LSF
+ fprintf(out,LSF_CHECK,i,i,jobid++);
+ fprintf(out," \"");
+#endif
+ fprintf(out,"LAcheck -vS");
+ if (usepath)
+ fprintf(out," %s/%s",pwd,root);
+ else
+ fprintf(out," %s",root);
+ while (p <= k)
+ { if (last)
+ if (usepath)
+ fprintf(out," %s/%s.%d",pwd,root,j);
+ else
+ fprintf(out," %s.%d",root,j);
+ else
+ if (DON)
+ fprintf(out," work%d/L%d.%d.%d",j,i+1,j,p);
+ else
+ fprintf(out," L%d.%d.%d",i+1,j,p);
+ p += 1;
+ }
+#ifdef LSF
+ fprintf(out,"\"");
+#endif
+ fprintf(out,"\n");
+ }
+ }
+
+ for (j = fblock; j <= lblock; j++)
+ for (p = 1; p <= bits;)
+ { k = p+BUNIT;
+ if (k > bits)
+ k = bits;
+#ifdef LSF
+ fprintf(out,LSF_CHECK,i,i,jobid++);
+ fprintf(out," \"");
+#endif
+ fprintf(out,"LAcheck -vS");
+ if (usepath)
+ fprintf(out," %s/%s",pwd,root);
+ else
+ fprintf(out," %s",root);
+ while (p <= k)
+ { if (i == level)
+ if (usepath)
+ fprintf(out," %s/%s.%d",pwd,root,j);
+ else
+ fprintf(out," %s.%d",root,j);
+ else
+ if (DON)
+ fprintf(out," work%d/L%d.%d.%d",j,i+1,j,p);
+ else
+ fprintf(out," L%d.%d.%d",i+1,j,p);
+ p += 1;
+ }
+#ifdef LSF
+ fprintf(out,"\"");
+#endif
+ fprintf(out,"\n");
+ }
+
+ // Cleanup (optional)
+
+ if (ONAME != NULL)
+ { fclose(out);
+ if (i == 1)
+ sprintf(name,"%s.%02d.RM.OPT",ONAME,stage++);
+ else
+ sprintf(name,"%s.%02d.RM",ONAME,stage++);
+ out = fopen(name,"w");
+ }
+ if (i == 1)
+ fprintf(out,"# Remove level %d .las files (optional)\n",i);
+ else
+ fprintf(out,"# Remove level %d .las files\n",i);
+
+ if (dnt >= 1)
+ { int last;
+
+ last = (dnt == 1 || i == level);
+ for (j = 1; j < fblock; j++)
+ { low = 1;
+ if (DON)
+ fprintf(out,"cd work%d\n",j);
+ for (p = 1; p <= dits; p++)
+ { hgh = (dnt*p)/dits;
+ fprintf(out,"rm");
+ if (last)
+ fprintf(out," L%d.%d.0.las",i,j);
+ for (k = low; k <= hgh; k++)
+ if (i == 1)
+ fprintf(out," %s.%d.%s.%d.las",root,j,root,k+(fblock-1));
+ else
+ fprintf(out," L%d.%d.%d.las",i,j,k);
+ fprintf(out,"\n");
+ low = hgh+1;
+ }
+ if (DON)
+ fprintf(out,"cd ..\n");
+ }
+ }
+
+ for (j = fblock; j <= lblock; j++)
+ { low = 1;
+ if (DON)
+ fprintf(out,"cd work%d\n",j);
+ for (p = 1; p <= bits; p++)
+ { hgh = (cnt*p)/bits;
+ fprintf(out,"rm");
+ for (k = low; k <= hgh; k++)
+ if (i == 1)
+ fprintf(out," %s.%d.%s.%d.las",root,j,root,k);
+ else
+ fprintf(out," L%d.%d.%d.las",i,j,k);
+ fprintf(out,"\n");
+ low = hgh+1;
+ }
+ if (DON)
+ fprintf(out,"cd ..\n");
+ }
+
+ if (ONAME != NULL)
+ fclose(out);
+
+ if (dnt >= 1)
+ { if (dnt > 1)
+ dnt = dits;
+ else
+ dnt = 0;
+ }
+ cnt = bits;
+ }
+ }
+ }
+ }
+
+ free(root);
+ free(pwd);
+}
+
+/*********************************************************************************************\
+ *
+ * Produce a script to compute overlaps for all block pairs between two DBs, and then sort
+ * and merge them into as many .las files as their are blocks of the 1st DB.
+ *
+ * Author: Gene Myers
+ * Date : December 31, 2014
+ *
+ *********************************************************************************************/
+
+#define LSF_MALIGN "bsub -q medium -n 4 -o MAPALL.out -e MAPALL.err -R span[hosts=1] -J align#%d"
+#define LSF_MSORT "bsub -q short -n 12 -o SORT.ALL.out -e SORT.ALL.err -R span[hosts=1] -J sort#%d"
+#define LSF_MMERGE \
+ "bsub -q short -n 12 -o MERGE%d.ALL.out -e MERGE%d.ALL.err -R span[hosts=1] -J merge#%d"
+
+void mapper_script(int argc, char *argv[])
+{ int nblocks1, nblocks2;
+ int useblock1, useblock2;
+ int usepath1, usepath2;
+ int fblock, lblock;
+#ifdef LSF
+ int jobid;
+#endif
+
+ FILE *out;
+ char name[100];
+ char *pwd1, *root1;
+ char *pwd2, *root2;
+
+ // Make sure DAM and DB exist and the DB is partitioned, get number of blocks in partition
+
+ pwd1 = PathTo(argv[1]);
+ if (strcmp(argv[1]+(strlen(argv[1])-4),".dam") == 0)
+ root1 = Root(argv[1],".dam");
+ else
+ root1 = Root(argv[1],".db");
+
+ { int i, nfiles;
+ FILE *dbvis;
+
+ dbvis = fopen(Catenate(pwd1,"/",root1,".dam"),"r");
+ if (dbvis == NULL)
+ { dbvis = Fopen(Catenate(pwd1,"/",root1,".db"),"r");
+ if (dbvis == NULL)
+ exit (1);
+ }
+
+ if (fscanf(dbvis,"files = %d\n",&nfiles) != 1)
+ SYSTEM_ERROR
+ for (i = 0; i < nfiles; i++)
+ { char buffer[30001];
+
+ if (fgets(buffer,30000,dbvis) == NULL)
+ SYSTEM_ERROR
+ }
+
+ useblock1 = 1;
+ if (fscanf(dbvis,"blocks = %d\n",&nblocks1) != 1 || nblocks1 == 1)
+ { useblock1 = 0;
+ nblocks1 = 1;
+ }
+
+ usepath1 = (strcmp(pwd1,".") != 0);
+
+ fclose(dbvis);
+ }
+
+ pwd2 = PathTo(argv[2]);
+ if (strcmp(argv[2]+(strlen(argv[2])-4),".dam") == 0)
+ root2 = Root(argv[2],".dam");
+ else
+ root2 = Root(argv[2],".db");
+
+ if (strcmp(root2,root1) == 0 && strcmp(pwd1,pwd2) == 0)
+ { fprintf(stderr,"%s: Comparing the same data base %s/%s against itself, use HPCdaligner\n",
+ Prog_Name,pwd1,root1);
+ exit (1);
+ }
+
+ { int i, nfiles;
+ FILE *dbvis;
+
+ dbvis = fopen(Catenate(pwd2,"/",root2,".dam"),"r");
+ if (dbvis == NULL)
+ { dbvis = Fopen(Catenate(pwd2,"/",root2,".db"),"r");
+ if (dbvis == NULL)
+ exit (1);
+ }
+
+ if (fscanf(dbvis,"files = %d\n",&nfiles) != 1)
+ SYSTEM_ERROR
+ for (i = 0; i < nfiles; i++)
+ { char buffer[30001];
+
+ if (fgets(buffer,30000,dbvis) == NULL)
+ SYSTEM_ERROR
+ }
+
+ useblock2 = 1;
+ if (fscanf(dbvis,"blocks = %d\n",&nblocks2) != 1 || nblocks2 == 1)
+ { useblock2 = 0;
+ nblocks2 = 1;
+ }
+
+ usepath2 = (strcmp(pwd2,".") != 0);
+
+ fclose(dbvis);
+ }
+
+ // Set range fblock-lblock checking that DB.<fblock-1>.las exists & DB.<fblock>.las does not
+
+ { char *eptr, *fptr, *src2;
+ FILE *file;
+
+ if (argc == 4)
+ { fblock = strtol(argv[3],&eptr,10);
+ if ((*eptr != '\0' && *eptr != '-') || eptr <= argv[3])
+ { fprintf(stderr,"%s: final argument '%s' does not start with an integer\n",
+ Prog_Name,argv[3]);
+ exit (1);
+ }
+ useblock2 = 1;
+ if (*eptr == '-')
+ { lblock = strtol(eptr+1,&fptr,10);
+ if (*fptr != '\0' || fptr <= eptr+1)
+ { fprintf(stderr,"%s: second part of range '%s' is not an integer\n",
+ Prog_Name,eptr+1);
+ exit (1);
+ }
+ }
+ else
+ lblock = fblock;
+ if (fblock < 1 || lblock > nblocks2 || fblock > lblock)
+ { fprintf(stderr,"%s: range %d-%d is empty or out of bounds\n",Prog_Name,fblock,lblock);
+ exit (1);
+ }
+ }
+ else
+ { fblock = 1;
+ lblock = nblocks2;
+ }
+
+ if (usepath2)
+ src2 = Strdup(Catenate(pwd2,"/",root2,""),"Allocating small string!");
+ else
+ src2 = Strdup(root2,"Allocating small string!");
+ if (src2 == NULL)
+ exit (1);
+
+ if (fblock > 1)
+ { file = fopen(Catenate(src2,".",root1,Numbered_Suffix(".",fblock-1,".las")),"r");
+ if (file == NULL)
+ { fprintf(stderr,"%s: File %s.%d.%s.las should already be present!\n",
+ Prog_Name,src2,fblock-1,root1);
+ exit (1);
+ }
+ else
+ fclose(file);
+ }
+ if (useblock2)
+ { file = fopen(Catenate(src2,".",root1,Numbered_Suffix(".",fblock,".las")),"r");
+ if (file != NULL)
+ { fprintf(stderr,"%s: File %s.%d.%s.las should not yet exist!\n",
+ Prog_Name,src2,fblock,root1);
+ exit (1);
+ }
+ }
+ else
+ { file = fopen(Catenate(src2,".",root1,".las"),"r");
+ if (file != NULL)
+ { fprintf(stderr,"%s: File %s.%s.las should not yet exist!\n",
+ Prog_Name,src2,root1);
+ exit (1);
+ }
+ }
+
+ free(src2);
+
+ DON = (DON && (nblocks1 > 1));
+ out = stdout;
+ }
+
+ { int level, njobs;
+ int i, j, k;
+
+ // Create all work subdirectories if DON
+
+ if (DON)
+ { if (ONAME != NULL)
+ { sprintf(name,"%s.00.MKDIR",ONAME);
+ out = fopen(name,"w");
+ }
+
+ fprintf(out,"# Create work subdirectories\n");
+ for (i = fblock; i <= lblock; i++)
+ fprintf(out,"mkdir work%d\n",i);
+
+ if (ONAME != NULL)
+ fclose(out);
+ }
+
+ // Produce all necessary daligner jobs ...
+
+ if (ONAME != NULL)
+ { sprintf(name,"%s.01.CMP",ONAME);
+ out = fopen(name,"w");
+ }
+
+ njobs = nblocks1 * ( (lblock-fblock)/BUNIT + 1);
+
+ fprintf(out,"# Daligner jobs (%d)\n",njobs);
+
+#ifdef LSF
+ jobid = 1;
+#endif
+ for (i = fblock; i <= lblock; i++)
+ { int bits;
+ int low, hgh;
+
+ bits = (nblocks1-1)/BUNIT+1;
+ low = 1;
+ for (j = 1; j <= bits; j++)
+ {
+#ifdef LSF
+ fprintf(out,LSF_MALIGN,jobid++);
+ fprintf(out," \"");
+#endif
+ fprintf(out,"daligner -A");
+ if (VON)
+ fprintf(out," -v");
+ if (BON)
+ fprintf(out," -b");
+ fprintf(out," -k%d",KINT);
+ if (WINT != 6)
+ fprintf(out," -w%d",WINT);
+ fprintf(out," -h%d",HINT);
+ if (TINT > 0)
+ fprintf(out," -t%d",TINT);
+ if (EREL > 0.)
+ fprintf(out," -e%g",EREL);
+ else
+ fprintf(out," -e.85");
+ if (LINT != 1000)
+ fprintf(out," -l%d",LINT);
+ if (SINT != 100)
+ fprintf(out," -s%d",SINT);
+ if (NTHREADS != 4)
+ fprintf(out," -T%d",NTHREADS);
+ if (MINT >= 0)
+ fprintf(out," -M%d",MINT);
+ for (k = 0; k < MTOP; k++)
+ fprintf(out," -m%s",MASK[k]);
+
+ fprintf(out," ");
+ if (usepath2)
+ fprintf(out,"%s/",pwd2);
+ fprintf(out,"%s",root2);
+ if (useblock2)
+ fprintf(out,".%d",i);
+
+ hgh = 1 + (nblocks1*j)/bits;
+ for (k = low; k < hgh; k++)
+ { fprintf(out," ");
+ if (usepath1)
+ fprintf(out,"%s/",pwd1);
+ fprintf(out,"%s",root1);
+ if (useblock1)
+ fprintf(out,".%d",k);
+ }
+
+ if (nblocks1 == 1)
+ { if (useblock1 || usepath2)
+ { fprintf(out," && mv %s",root2);
+ if (useblock2)
+ fprintf(out,".%d",i);
+ if (useblock1)
+ fprintf(out,".%s.1 ",root1);
+ else
+ fprintf(out,".%s ",root1);
+ if (useblock1)
+ { if (usepath2)
+ fprintf(out,"%s/",pwd2);
+ fprintf(out,"%s",root2);
+ if (useblock2)
+ fprintf(out,".%d",i);
+ fprintf(out,".%s",root1);
+ }
+ else
+ fprintf(out,"%s",pwd2);
+ }
+ }
+ else if (DON)
+ { fprintf(out," && mv");
+ for (k = low; k < hgh; k++)
+ { fprintf(out," %s",root2);
+ if (useblock2)
+ fprintf(out,".%d",i);
+ fprintf(out,".%s.%d.las",root1,k);
+ }
+ fprintf(out," work%d",i);
+ }
+#ifdef LSF
+ fprintf(out,"\"");
+#endif
+ fprintf(out,"\n");
+ low = hgh;
+ }
+ }
+
+ // Check .las files (optional)
+
+ if (ONAME != NULL)
+ { fclose(out);
+ sprintf(name,"%s.02.CHECK.OPT",ONAME);
+ out = fopen(name,"w");
+ }
+
+ fprintf(out,"# Check initial .las files jobs (%d) (optional but recommended)\n",
+ (lblock-fblock+1) * ((nblocks1-1)/(BUNIT+1) + 1) );
+
+#ifdef LSF
+ jobid = 1;
+#endif
+ for (j = fblock; j <= lblock; j++)
+ for (i = 1; i <= nblocks1; )
+ { k = i+BUNIT;
+ if (k > nblocks1)
+ k = nblocks1;
+#ifdef LSF
+ fprintf(out,LSF_CHECK,0,0,jobid++);
+ fprintf(out," \"");
+#endif
+ fprintf(out,"LAcheck -vS");
+ if (usepath2)
+ fprintf(out," %s/%s",pwd2,root2);
+ else
+ fprintf(out," %s",root2);
+ if (usepath1)
+ fprintf(out," %s/%s",pwd1,root1);
+ else
+ fprintf(out," %s",root1);
+ while (i <= k)
+ { fprintf(out," ");
+ if (nblocks1 == 1)
+ { if (usepath2)
+ fprintf(out,"%s/",pwd2);
+ fprintf(out,"%s",root2);
+ if (useblock2)
+ fprintf(out,".%d",j);
+ fprintf(out,".%s",root1);
+ }
+ else
+ { if (DON)
+ fprintf(out,"work%d/",j);
+ fprintf(out,"%s",root2);
+ if (useblock2)
+ fprintf(out,".%d",j);
+ fprintf(out,".%s.%d",root1,i);
+ }
+ i += 1;
+ }
+#ifdef LSF
+ fprintf(out,"\"");
+#endif
+ fprintf(out,"\n");
+ }
+
+ if (ONAME != NULL)
+ fclose(out);
+
+ // Higher level merges (if lblock > 1)
+
+ if (nblocks1 > 1)
+ { int pow, stage;
+
+ // Determine the number of merging levels
+
+ stage = 3;
+
+ pow = 1;
+ for (level = 0; pow < nblocks1; level++)
+ pow *= DUNIT;
+
+ // Issue the commands for each merge level
+
+ { int p, cnt;
+
+ cnt = nblocks1;
+ for (i = 1; i <= level; i++)
+ { int bits;
+ int low, hgh;
+
+ if (ONAME != NULL)
+ { sprintf(name,"%s.%02d.MERGE",ONAME,stage++);
+ out = fopen(name,"w");
+ }
+
+ bits = (cnt-1)/DUNIT+1;
+ fprintf(out,"# Level %d jobs (%d)\n",i,bits*((lblock-fblock)+1));
+
+ // Block merges
+
+#ifdef LSF
+ jobid = 1;
+#endif
+ for (j = fblock; j <= lblock; j++)
+ { low = 1;
+ for (p = 1; p <= bits; p++)
+ { hgh = (cnt*p)/bits;
+#ifdef LSF
+ fprintf(out,LSF_MMERGE,i,i,jobid++);
+ fprintf(out," \"");
+#endif
+ fprintf(out,"LAmerge ");
+ if (VON)
+ fprintf(out,"-v ");
+ if (CON)
+ fprintf(out,"-a ");
+ if (i == level)
+ { if (usepath2)
+ fprintf(out,"%s/",pwd2);
+ fprintf(out,"%s",root2);
+ if (useblock2)
+ fprintf(out,".%d",j);
+ fprintf(out,".%s",root1);
+ }
+ else
+ { if (DON)
+ fprintf(out,"work%d/",j);
+ fprintf(out,"L%d.%d.%d",i+1,j,p);
+ }
+ for (k = low; k <= hgh; k++)
+ if (i == 1)
+ { if (DON)
+ fprintf(out," work%d/",j);
+ else
+ fprintf(out," ");
+ fprintf(out,"%s",root2);
+ if (useblock2)
+ fprintf(out,".%d",j);
+ fprintf(out,".%s.%d",root1,k);
+ }
+ else
+ if (DON)
+ fprintf(out," work%d/L%d.%d.%d",j,i,j,k);
+ else
+ fprintf(out," L%d.%d.%d",i,j,k);
+
+#ifdef LSF
+ fprintf(out,"\"");
+#endif
+ fprintf(out,"\n");
+ low = hgh+1;
+ }
+ }
+
+ // Check new .las (optional)
+
+ if (ONAME != NULL)
+ { fclose(out);
+ sprintf(name,"%s.%02d.CHECK.OPT",ONAME,stage++);
+ out = fopen(name,"w");
+ }
+
+ fprintf(out,"# Check level %d .las files jobs (%d) (optional but recommended)\n",
+ i+1,(lblock-fblock+1)*((bits-1)/(BUNIT+1)+1) );
+
+#ifdef LSF
+ jobid = 1;
+#endif
+ for (j = fblock; j <= lblock; j++)
+ for (p = 1; p <= bits; )
+ { k = p+BUNIT;
+ if (k > bits)
+ k = bits;
+#ifdef LSF
+ fprintf(out,LSF_CHECK,0,0,jobid++);
+ fprintf(out," \"");
+#endif
+ fprintf(out,"LAcheck -vS");
+ if (usepath2)
+ fprintf(out," %s/%s",pwd2,root2);
+ else
+ fprintf(out," %s",root2);
+ if (usepath1)
+ fprintf(out," %s/%s",pwd1,root1);
+ else
+ fprintf(out," %s",root1);
+ while (p <= k)
+ { fprintf(out," ");
+ if (i == level)
+ { if (usepath2)
+ fprintf(out,"%s/",pwd2);
+ fprintf(out,"%s",root2);
+ if (useblock2)
+ fprintf(out,".%d",j);
+ fprintf(out,".%s",root1);
+ }
+ else
+ { if (DON)
+ fprintf(out,"work%d/",j);
+ fprintf(out,"L%d.%d.%d",i+1,j,p);
+ }
+ p += 1;
+ }
+#ifdef LSF
+ fprintf(out,"\"");
+#endif
+ fprintf(out,"\n");
+ }
+
+ // Cleanup (optional)
+
+ if (ONAME != NULL)
+ { fclose(out);
+ sprintf(name,"%s.%02d.RM",ONAME,stage++);
+ out = fopen(name,"w");
+ }
+
+ fprintf(out,"# Remove level %d .las files\n",i);
+
+ for (j = fblock; j <= lblock; j++)
+ { if (DON)
+ fprintf(out,"cd work%d\n",j);
+ low = 1;
+ for (p = 1; p <= bits; p++)
+ { hgh = (cnt*p)/bits;
+ fprintf(out,"rm");
+ for (k = low; k <= hgh; k++)
+ if (i == 1)
+ { fprintf(out," %s",root2);
+ if (useblock2)
+ fprintf(out,".%d",j);
+ fprintf(out,".%s.%d",root1,k);
+ }
+ else
+ fprintf(out," L%d.%d.%d.las",i,j,k);
+ fprintf(out,"\n");
+ low = hgh+1;
+ }
+ if (DON)
+ fprintf(out,"cd ..\n");
+ }
+
+ if (ONAME != NULL)
+ fclose(out);
+
+ cnt = bits;
+ }
+ }
+ }
+ }
+
+ free(root2);
+ free(pwd2);
+ free(root1);
+ free(pwd1);
+
+ exit (0);
+}
+
+int main(int argc, char *argv[])
+{ int i, j, k;
+ int flags[128];
+ char *eptr;
+ int mapper;
+
+ // Process options and decide if its a overlap or mapper script
+
+ ARG_INIT("HPC.daligner")
+
+ KINT = 0;
+ HINT = 0;
+ HGAP = 0;
+ EREL = 0.;
+
+ BUNIT = 4;
+ DUNIT = 250;
+ TINT = 0;
+ WINT = 6;
+ LINT = 1000;
+ SINT = 100;
+ MINT = -1;
+
+ MTOP = 0;
+ MMAX = 10;
+ MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array");
+ if (MASK == NULL)
+ exit (1);
+ ONAME = NULL;
+
+ NTHREADS = 4;
+
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ switch (argv[i][1])
+ { default:
+ ARG_FLAGS("vbadAI");
+ break;
+ case 'e':
+ ARG_REAL(EREL)
+ if (EREL < .7 || EREL >= 1.)
+ { fprintf(stderr,"%s: Average correlation must be in [.7,1.) (%g)\n",Prog_Name,EREL);
+ exit (1);
+ }
+ break;
+ case 'f':
+ ONAME = argv[i]+2;
+ break;
+ case 'h':
+ ARG_POSITIVE(HINT,"Hit threshold (in bp.s)")
+ break;
+ case 'k':
+ ARG_POSITIVE(KINT,"K-mer length")
+ break;
+ case 'l':
+ ARG_POSITIVE(LINT,"Minimum ovlerap length")
+ break;
+ case 'm':
+ if (MTOP >= MMAX)
+ { MMAX = 1.2*MTOP + 10;
+ MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array");
+ if (MASK == NULL)
+ exit (1);
+ }
+ MASK[MTOP++] = argv[i]+2;
+ break;
+ case 's':
+ ARG_POSITIVE(SINT,"Trace spacing")
+ break;
+ case 't':
+ ARG_POSITIVE(TINT,"Tuple suppression frequency")
+ break;
+ case 'w':
+ ARG_POSITIVE(WINT,"Log of bin width")
+ break;
+ case 'B':
+ ARG_NON_NEGATIVE(BUNIT,"Blocks per command")
+ break;
+ case 'D':
+ ARG_NON_NEGATIVE(DUNIT,"File per merge")
+ if (DUNIT < 3)
+ { fprintf(stderr,"%s: Files per merge must be at least 3 (%d)\n",
+ Prog_Name,DUNIT);
+ exit (1);
+ }
+ break;
+ case 'H':
+ ARG_POSITIVE(HGAP,"HGAP threshold (in bp.s)")
+ break;
+ case 'M':
+ ARG_NON_NEGATIVE(MINT,"Memory allocation (in Gb)")
+ break;
+ case 'T':
+ ARG_POSITIVE(NTHREADS,"Number of threads")
+ break;
+ }
+ else
+ argv[j++] = argv[i];
+ argc = j;
+
+ VON = flags['v'];
+ BON = flags['b'];
+ CON = flags['a'];
+ DON = flags['d'];
+
+ if (argc < 2 || argc > 4)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
+ fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]);
+ fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]);
+ fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[3]);
+ fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[4]);
+ exit (1);
+ }
+
+ if (argc == 2)
+ mapper = 0;
+ else if (argc == 4)
+ mapper = 1;
+ else
+ { (void) strtol(argv[2],&eptr,10);
+ if ((*eptr == '\0' || *eptr == '-') && eptr > argv[2])
+ mapper = 0;
+ else
+ mapper = 1;
+ }
+
+ if (mapper)
+ { if (HGAP > 0)
+ { fprintf(stderr,"%s: Cannot use -H option in a comparison script\n",Prog_Name);
+ exit (1);
+ }
+ if (KINT <= 0)
+ KINT = 20;
+ if (HINT <= 0)
+ HINT = 50;
+ if (EREL <= 0.)
+ EREL = .85;
+ }
+ else
+ { if (KINT <= 0)
+ KINT = 14;
+ if (HINT <= 0)
+ HINT = 35;
+ }
+
+ for (j = 1; 2*j <= NTHREADS; j *= 2)
+ ;
+ NTHREADS = j;
+
+ if (mapper)
+ mapper_script(argc,argv);
+ else
+ daligner_script(argc,argv);
+
+ exit (0);
+}
diff --git a/HPCdaligner.c b/HPCdaligner.c
deleted file mode 100644
index cae006d..0000000
--- a/HPCdaligner.c
+++ /dev/null
@@ -1,543 +0,0 @@
-/*********************************************************************************************\
- *
- * Produce a script to compute overlaps for all block pairs of a DB, and then sort and merge
- * them into as many .las files as their are blocks.
- *
- * Author: Gene Myers
- * Date : June 1, 2014
- *
- *********************************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <ctype.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include "DB.h"
-#include "filter.h"
-
-#undef LSF // define if want a directly executable LSF script
-
-static char *Usage[] =
- { "[-vbAI] [-k<int(14)>] [-w<int(6)>] [-h<int(35)>] [-t<int>] [-M<int>]",
- " [-e<double(.70)] [-l<int(1000)>] [-s<int(100)] [-H<int>]",
- " [-m<track>]+ [-dal<int(4)>] [-deg<int(25)>]",
- " <path:db|dam> [<block:int>[-<range:int>]"
- };
-
-static int power(int base, int exp)
-{ int i, pow;
-
- pow = 1;
- for (i = 0; i < exp; i++)
- pow *= base;
- return (pow);
-}
-
-#define LSF_ALIGN "bsub -q medium -n 4 -o ALIGN.out -e ALIGN.err -R span[hosts=1] -J align#%d"
-#define LSF_MERGE "bsub -q short -n 12 -o MERGE.out -e MERGE.err -R span[hosts=1] -J merge#%d"
-
-int main(int argc, char *argv[])
-{ int nblocks;
- int useblock;
- int fblock, lblock;
-#ifdef LSF
- int jobid;
-#endif
-
- char *pwd, *root;
-
- int MUNIT, DUNIT;
- int VON, BON, AON, ION;
- int WINT, TINT, HGAP, HINT, KINT, SINT, LINT, MINT;
- double EREL;
- int MMAX, MTOP;
- char **MASK;
-
- { int i, j, k; // Process options
- int flags[128];
- char *eptr;
-
- ARG_INIT("HPCdaligner")
-
- DUNIT = 4;
- MUNIT = 25;
- KINT = 14;
- WINT = 6;
- HINT = 35;
- TINT = 0;
- HGAP = 0;
- EREL = 0.;
- LINT = 1000;
- SINT = 100;
- MINT = -1;
-
- MTOP = 0;
- MMAX = 10;
- MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array");
- if (MASK == NULL)
- exit (1);
-
- j = 1;
- for (i = 1; i < argc; i++)
- if (argv[i][0] == '-')
- switch (argv[i][1])
- { default:
- ARG_FLAGS("vbAI");
- break;
- case 'k':
- ARG_POSITIVE(KINT,"K-mer length")
- break;
- case 'w':
- ARG_POSITIVE(WINT,"Log of bin width")
- break;
- case 'h':
- ARG_POSITIVE(HINT,"Hit threshold (in bp.s)")
- break;
- case 't':
- ARG_POSITIVE(TINT,"Tuple suppression frequency")
- break;
- case 'H':
- ARG_POSITIVE(HGAP,"HGAP threshold (in bp.s)")
- break;
- case 'e':
- ARG_REAL(EREL)
- if (EREL < .7 || EREL >= 1.)
- { fprintf(stderr,"%s: Average correlation must be in [.7,1.) (%g)\n",Prog_Name,EREL);
- exit (1);
- }
- break;
- case 'l':
- ARG_POSITIVE(LINT,"Minimum ovlerap length")
- break;
- case 's':
- ARG_POSITIVE(SINT,"Trace spacing")
- break;
- case 'M':
- ARG_NON_NEGATIVE(MINT,"Memory allocation (in Gb)")
- break;
- case 'm':
- if (MTOP >= MMAX)
- { MMAX = 1.2*MTOP + 10;
- MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array");
- if (MASK == NULL)
- exit (1);
- }
- MASK[MTOP++] = argv[i]+2;
- break;
- case 'd':
- if (argv[i][2] == 'e' && argv[i][3] == 'g')
- { MUNIT = strtol(argv[i]+4,&eptr,10);
- if (*eptr != '\0' || argv[i][4] == '\0')
- { fprintf(stderr,"%s: -mrg argument is not an integer\n",Prog_Name);
- exit (1);
- }
- if (MUNIT <= 0)
- { fprintf(stderr,"%s: Files per merge must be positive (%d)\n",
- Prog_Name,MUNIT);
- exit (1);
- }
- if (MUNIT < 3)
- { fprintf(stderr,"%s: Files per merge must be at least 3 (%d)\n",
- Prog_Name,MUNIT);
- exit (1);
- }
- }
- else if (argv[i][2] == 'a' && argv[i][3] == 'l')
- { DUNIT = strtol(argv[i]+4,&eptr,10);
- if (*eptr != '\0' || argv[i][4] == '\0')
- { fprintf(stderr,"%s: -dal argument is not an integer\n",Prog_Name);
- exit (1);
- }
- if (DUNIT <= 0)
- { fprintf(stderr,"%s: Blocks per daligner call must be positive (%d)\n",
- Prog_Name,DUNIT);
- exit (1);
- }
- }
- else
- { fprintf(stderr,"%s: -%.3s is an illegal option\n",Prog_Name,argv[i]+1);
- exit (1);
- }
- break;
- }
- else
- argv[j++] = argv[i];
- argc = j;
-
- VON = flags['v'];
- BON = flags['b'];
- AON = flags['A'];
- ION = flags['I'];
-
- if (argc < 2 || argc > 3)
- { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
- fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]);
- fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]);
- fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[3]);
- exit (1);
- }
- }
-
- // Make sure DB exists and is partitioned, get number of blocks in partition
-
- pwd = PathTo(argv[1]);
- if (strcmp(argv[1]+(strlen(argv[1])-4),".dam") == 0)
- root = Root(argv[1],".dam");
- else
- root = Root(argv[1],".db");
-
- { int i, nfiles;
- FILE *dbvis;
-
- dbvis = fopen(Catenate(pwd,"/",root,".dam"),"r");
- if (dbvis == NULL)
- { dbvis = Fopen(Catenate(pwd,"/",root,".db"),"r");
- if (dbvis == NULL)
- exit (1);
- }
-
- if (fscanf(dbvis,"files = %d\n",&nfiles) != 1)
- SYSTEM_ERROR
- for (i = 0; i < nfiles; i++)
- { char buffer[30001];
-
- if (fgets(buffer,30000,dbvis) == NULL)
- SYSTEM_ERROR
- }
-
- useblock = 1;
- if (fscanf(dbvis,"blocks = %d\n",&nblocks) != 1)
- { useblock = 0;
- nblocks = 1;
- }
- }
-
- // Set range fblock-lblock checking that DB.<fblock-1>.las exists & DB.<fblock>.las does not
-
- { char *eptr, *fptr;
- FILE *file;
-
- if (argc == 3)
- { fblock = strtol(argv[2],&eptr,10);
- if (*eptr != '\0' && *eptr != '-')
- { fprintf(stderr,"%s: final argument '%s' does not start with an integer\n",
- Prog_Name,argv[2]);
- exit (1);
- }
- if (*eptr == '-')
- { lblock = strtol(eptr+1,&fptr,10);
- if (*fptr != '\0')
- { fprintf(stderr,"%s: second part of range '%s' is not an integer\n",
- Prog_Name,eptr+1);
- exit (1);
- }
- }
- else
- lblock = fblock;
- if (fblock < 1 || lblock > nblocks || fblock > lblock)
- { fprintf(stderr,"%s: range %d-%d is empty or out of bounds\n",Prog_Name,fblock,lblock);
- exit (1);
- }
- }
- else
- { fblock = 1;
- lblock = nblocks;
- }
-
- if (fblock > 1)
- { file = fopen(Catenate(root,Numbered_Suffix(".",fblock-1,".las"),"",""),"r");
- if (file == NULL)
- { fprintf(stderr,"%s: File %s.%d.las should already be present!\n",
- Prog_Name,root,fblock-1);
- exit (1);
- }
- else
- fclose(file);
- }
- file = fopen(Catenate(root,Numbered_Suffix(".",fblock,".las"),"",""),"r");
- if (file != NULL)
- { fprintf(stderr,"%s: File %s.%d.las should not yet exist!\n",
- Prog_Name,root,fblock);
- exit (1);
- }
- }
-
- { int level, njobs;
- int i, j, k;
- int usepath;
-
- // Produce all necessary daligner jobs ...
-
- usepath = (strcmp(pwd,".") != 0);
-
- njobs = 0;
- for (i = fblock; i <= lblock; i++)
- njobs += (i-1)/DUNIT+1;
-
- printf("# Daligner jobs (%d)\n",njobs);
-
-#ifdef LSF
- jobid = 1;
-#endif
- for (i = fblock; i <= lblock; i++)
- { int bits;
- int low, hgh;
-
- bits = (i-1)/DUNIT+1;
- low = 1;
- for (j = 1; j <= bits; j++)
- {
-#ifdef LSF
- printf(LSF_ALIGN,jobid++);
- printf(" \"");
-#endif
- printf("daligner");
- if (VON)
- printf(" -v");
- if (BON)
- printf(" -b");
- if (AON)
- printf(" -A");
- if (ION)
- printf(" -I");
- if (KINT != 14)
- printf(" -k%d",KINT);
- if (WINT != 6)
- printf(" -w%d",WINT);
- if (HINT != 35)
- printf(" -h%d",HINT);
- if (TINT > 0)
- printf(" -t%d",TINT);
- if (HGAP > 0)
- printf(" -H%d",HGAP);
- if (EREL > .1)
- printf(" -e%g",EREL);
- if (LINT != 1000)
- printf(" -l%d",LINT);
- if (SINT != 100)
- printf(" -s%d",SINT);
- if (MINT >= 0)
- printf(" -M%d",MINT);
- for (k = 0; k < MTOP; k++)
- printf(" -m%s",MASK[k]);
- if (useblock)
- if (usepath)
- printf(" %s/%s.%d",pwd,root,i);
- else
- printf(" %s.%d",root,i);
- else
- if (usepath)
- printf(" %s/%s",pwd,root);
- else
- printf(" %s",root);
- hgh = (i*j)/bits + 1;
- for (k = low; k < hgh; k++)
- if (useblock)
- if (usepath)
- printf(" %s/%s.%d",pwd,root,k);
- else
- printf(" %s.%d",root,k);
- else
- if (usepath)
- printf(" %s/%s",pwd,root);
- else
- printf(" %s",root);
-#ifdef LSF
- printf("\"");
-#endif
- printf("\n");
- low = hgh;
- }
- }
-
- // ... and then all the initial sort & merge jobs for each block pair
-
- printf("# Initial sort jobs (%d)\n", lblock*lblock - (fblock-1)*(fblock-1) );
-
-#ifdef LSF
- jobid = 1;
-#endif
- for (i = 1; i <= lblock; i++)
- for (j = (i < fblock ? fblock : 1); j <= lblock; j++)
- {
-#ifdef LSF
- printf(LSF_MERGE,jobid++);
- printf(" \"");
-#endif
- printf("LAsort");
- if (VON)
- printf(" -v");
- for (k = 0; k < NTHREADS; k++)
- if (useblock)
- { printf(" %s.%d.%s.%d.C%d",root,i,root,j,k);
- printf(" %s.%d.%s.%d.N%d",root,i,root,j,k);
- }
- else
- { printf(" %s.%s.C%d",root,root,k);
- printf(" %s.%s.N%d",root,root,k);
- }
- printf(" && LAmerge");
- if (VON)
- printf(" -v");
- if (lblock == 1)
- printf(" %s.%d",root,i);
- else if (i < fblock)
- printf(" L1.%d.%d",i,(j-fblock)+1);
- else
- printf(" L1.%d.%d",i,j);
- for (k = 0; k < NTHREADS; k++)
- if (useblock)
- { printf(" %s.%d.%s.%d.C%d.S",root,i,root,j,k);
- printf(" %s.%d.%s.%d.N%d.S",root,i,root,j,k);
- }
- else
- { printf(" %s.%s.C%d.S",root,root,k);
- printf(" %s.%s.N%d.S",root,root,k);
- }
- printf(" && rm");
- for (k = 0; k < NTHREADS; k++)
- if (useblock)
- { printf(" %s.%d.%s.%d.C%d.S.las",root,i,root,j,k);
- printf(" %s.%d.%s.%d.N%d.S.las",root,i,root,j,k);
- }
- else
- { printf(" %s.%s.C%d.S.las",root,root,k);
- printf(" %s.%s.N%d.S.las",root,root,k);
- }
-#ifdef LSF
- printf("\"");
-#endif
- printf("\n");
- }
-
- // Higher level merges (if lblock > 1)
-
- if (lblock > 1)
- { int pow, mway;
-
- // Determine most balance mway for merging in ceil(log_mrg lblock) levels
-
- pow = 1;
- for (level = 0; pow < lblock; level++)
- pow *= MUNIT;
-
- for (mway = MUNIT; mway >= 3; mway--)
- if (power(mway,level) < lblock)
- break;
- mway += 1;
-
- // Issue the commands for each merge level
-
- { int p, cnt, dnt;
-
- cnt = lblock;
- dnt = (lblock-fblock)+1;
- for (i = 1; i <= level; i++)
- { int bits, dits;
- int low, hgh;
-
- bits = (cnt-1)/mway+1;
- dits = (dnt-1)/mway+1;
-
- // Incremental update merges
-
-#ifdef LSF
- jobid = 1;
-#endif
- if (dnt >= 1)
- { int last;
-
- last = (dnt == 1 || i == level);
- printf("# Level %d jobs (%d)\n",i,bits*((lblock-fblock)+1) + dits*(fblock-1));
- for (j = 1; j < fblock; j++)
- {
-#ifdef LSF
- printf(LSF_MERGE,jobid++);
- printf(" \"");
-#endif
- if (last)
- printf("mv %s.%d.las L%d.%d.0.las && ",root,j,i,j);
- low = 1;
- for (p = 1; p <= dits; p++)
- { hgh = (dnt*p)/dits;
-#ifdef LSF
- if (p > 1)
- { printf(LSF_MERGE,jobid++);
- printf(" \"");
- }
-#endif
- printf("LAmerge");
- if (VON)
- printf(" -v");
- if (last)
- printf(" %s.%d L%d.%d.0",root,j,i,j);
- else
- printf(" L%d.%d.%d",i+1,j,p);
- for (k = low; k <= hgh; k++)
- printf(" L%d.%d.%d",i,j,k);
- printf(" && rm");
- if (last)
- printf(" L%d.%d.0.las",i,j);
- for (k = low; k <= hgh; k++)
- printf(" L%d.%d.%d.las",i,j,k);
-#ifdef LSF
- printf("\"");
-#endif
- printf("\n");
- low = hgh+1;
- }
- }
- if (dnt > 1)
- dnt = dits;
- else
- dnt = 0;
- }
- else
- printf("# Level %d jobs (%d)\n",i,bits*((lblock-fblock)+1));
-
- // New block merges
-
- for (j = fblock; j <= lblock; j++)
- { low = 1;
- for (p = 1; p <= bits; p++)
- { hgh = (cnt*p)/bits;
-#ifdef LSF
- printf(LSF_MERGE,jobid++);
- printf(" \"");
-#endif
- printf("LAmerge");
- if (VON)
- printf(" -v");
- if (i == level)
- printf(" %s.%d",root,j);
- else
- printf(" L%d.%d.%d",i+1,j,p);
- for (k = low; k <= hgh; k++)
- printf(" L%d.%d.%d",i,j,k);
- printf(" && rm");
- for (k = low; k <= hgh; k++)
- printf(" L%d.%d.%d.las",i,j,k);
-#ifdef LSF
- printf("\"");
-#endif
- printf("\n");
- low = hgh+1;
- }
- }
-
- cnt = bits;
- }
- }
- }
- }
-
- free(root);
- free(pwd);
-
- exit (0);
-}
diff --git a/HPCmapper.c b/HPCmapper.c
deleted file mode 100644
index f46c649..0000000
--- a/HPCmapper.c
+++ /dev/null
@@ -1,591 +0,0 @@
-/*********************************************************************************************\
- *
- * Produce a script to compute overlaps for all block pairs between two DBs, and then sort
- * and merge * them into as many .las files as their are blocks of the 1st DB.
- *
- * Author: Gene Myers
- * Date : December 31, 2014
- *
- *********************************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <ctype.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include "DB.h"
-#include "filter.h"
-
-#undef LSF // define if want a directly executable LSF script
-
-static char *Usage[] =
- { "[-vb] [-k<int(20)>] [-w<int(6)>] [-h<int(50)>] [-t<int>] [-M<int>]",
- " [-e<double(.85)] [-l<int(1000)>] [-s<int(100)] [-H<int>]",
- " [-m<track>]+ [-dal<int(4)>] [-deg<int(25)>]",
- " <ref:db|dam> <reads:db|dam> [<first:int>[-<last:int>]]"
- };
-
-static int power(int base, int exp)
-{ int i, pow;
-
- pow = 1;
- for (i = 0; i < exp; i++)
- pow *= base;
- return (pow);
-}
-
-#define LSF_ALIGN "bsub -q medium -n 4 -o ALIGN.out -e ALIGN.err -R span[hosts=1] -J align#%d"
-#define LSF_MERGE "bsub -q short -n 12 -o MERGE.out -e MERGE.err -R span[hosts=1] -J merge#%d"
-
-int main(int argc, char *argv[])
-{ int nblocks1, nblocks2;
- int useblock1, useblock2;
- int fblock, lblock;
-#ifdef LSF
- int jobid;
-#endif
-
- char *pwd1, *root1;
- char *pwd2, *root2;
-
- int MUNIT, DUNIT;
- int VON, BON, CON;
- int WINT, TINT, HGAP, HINT, KINT, SINT, LINT, MINT;
- double EREL;
- int MMAX, MTOP;
- char **MASK;
-
- { int i, j, k; // Process options
- int flags[128];
- char *eptr;
-
- ARG_INIT("HPCmapper")
-
- DUNIT = 4;
- MUNIT = 25;
- KINT = 20;
- WINT = 6;
- HINT = 50;
- TINT = 0;
- HGAP = 0;
- EREL = 0.;
- LINT = 1000;
- SINT = 100;
- MINT = -1;
-
- MTOP = 0;
- MMAX = 10;
- MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array");
- if (MASK == NULL)
- exit (1);
-
- j = 1;
- for (i = 1; i < argc; i++)
- if (argv[i][0] == '-')
- switch (argv[i][1])
- { default:
- ARG_FLAGS("vbc");
- break;
- case 'k':
- ARG_POSITIVE(KINT,"K-mer length")
- break;
- case 'w':
- ARG_POSITIVE(WINT,"Log of bin width")
- break;
- case 'h':
- ARG_POSITIVE(HINT,"Hit threshold (in bp.s)")
- break;
- case 't':
- ARG_POSITIVE(TINT,"Tuple suppression frequency")
- break;
- case 'H':
- ARG_POSITIVE(HGAP,"HGAP threshold (in bp.s)")
- break;
- case 'e':
- ARG_REAL(EREL)
- if (EREL < .7 || EREL >= 1.)
- { fprintf(stderr,"%s: Average correlation must be in [.7,1.) (%g)\n",Prog_Name,EREL);
- exit (1);
- }
- break;
- case 'l':
- ARG_POSITIVE(LINT,"Minimum ovlerap length")
- break;
- case 's':
- ARG_POSITIVE(SINT,"Trace spacing")
- break;
- case 'M':
- ARG_NON_NEGATIVE(MINT,"Memory allocation (in Gb)")
- break;
- case 'm':
- if (MTOP >= MMAX)
- { MMAX = 1.2*MTOP + 10;
- MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array");
- if (MASK == NULL)
- exit (1);
- }
- MASK[MTOP++] = argv[i]+2;
- break;
- case 'd':
- if (argv[i][2] == 'e' && argv[i][3] == 'g')
- { MUNIT = strtol(argv[i]+4,&eptr,10);
- if (*eptr != '\0' || argv[i][4] == '\0')
- { fprintf(stderr,"%s: -mrg argument is not an integer\n",Prog_Name);
- exit (1);
- }
- if (MUNIT <= 0)
- { fprintf(stderr,"%s: Files per merge must be positive (%d)\n",
- Prog_Name,MUNIT);
- exit (1);
- }
- if (MUNIT < 3)
- { fprintf(stderr,"%s: Files per merge must be at least 3 (%d)\n",
- Prog_Name,MUNIT);
- exit (1);
- }
- }
- else if (argv[i][2] == 'a' && argv[i][3] == 'l')
- { DUNIT = strtol(argv[i]+4,&eptr,10);
- if (*eptr != '\0' || argv[i][4] == '\0')
- { fprintf(stderr,"%s: -dal argument is not an integer\n",Prog_Name);
- exit (1);
- }
- if (DUNIT <= 0)
- { fprintf(stderr,"%s: Blocks per daligner call must be positive (%d)\n",
- Prog_Name,DUNIT);
- exit (1);
- }
- }
- else
- { fprintf(stderr,"%s: -%.3s is an illegal option\n",Prog_Name,argv[i]+1);
- exit (1);
- }
- break;
- }
- else
- argv[j++] = argv[i];
- argc = j;
-
- VON = flags['v'];
- BON = flags['b'];
- CON = flags['c'];
-
- if (argc < 3 || argc > 4)
- { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
- fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]);
- fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]);
- fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[3]);
- exit (1);
- }
- }
-
- // Make sure DAM and DB exist and the DB is partitioned, get number of blocks in partition
-
- pwd1 = PathTo(argv[1]);
- if (strcmp(argv[1]+(strlen(argv[1])-4),".dam") == 0)
- root1 = Root(argv[1],".dam");
- else
- root1 = Root(argv[1],".db");
-
- { int i, nfiles;
- FILE *dbvis;
-
- dbvis = fopen(Catenate(pwd1,"/",root1,".dam"),"r");
- if (dbvis == NULL)
- { dbvis = Fopen(Catenate(pwd1,"/",root1,".db"),"r");
- if (dbvis == NULL)
- exit (1);
- }
-
- if (fscanf(dbvis,"files = %d\n",&nfiles) != 1)
- SYSTEM_ERROR
- for (i = 0; i < nfiles; i++)
- { char buffer[30001];
-
- if (fgets(buffer,30000,dbvis) == NULL)
- SYSTEM_ERROR
- }
-
- useblock1 = 1;
- if (fscanf(dbvis,"blocks = %d\n",&nblocks1) != 1)
- { useblock1 = 0;
- nblocks1 = 1;
- }
-
- fclose(dbvis);
- }
-
- pwd2 = PathTo(argv[2]);
- if (strcmp(argv[2]+(strlen(argv[2])-4),".dam") == 0)
- root2 = Root(argv[2],".dam");
- else
- root2 = Root(argv[2],".db");
-
- if (strcmp(root2,root1) == 0 && strcmp(pwd1,pwd2) == 0)
- { fprintf(stderr,"%s: Comparing the same data base %s/%s against itself, use HPCdaligner\n",
- Prog_Name,pwd1,root1);
- exit (1);
- }
-
- { int i, nfiles;
- FILE *dbvis;
-
- dbvis = fopen(Catenate(pwd2,"/",root2,".dam"),"r");
- if (dbvis == NULL)
- { dbvis = Fopen(Catenate(pwd2,"/",root2,".db"),"r");
- if (dbvis == NULL)
- exit (1);
- }
-
- if (fscanf(dbvis,"files = %d\n",&nfiles) != 1)
- SYSTEM_ERROR
- for (i = 0; i < nfiles; i++)
- { char buffer[30001];
-
- if (fgets(buffer,30000,dbvis) == NULL)
- SYSTEM_ERROR
- }
-
- useblock2 = 1;
- if (fscanf(dbvis,"blocks = %d\n",&nblocks2) != 1)
- { useblock2 = 0;
- nblocks2 = 1;
- }
-
- fclose(dbvis);
- }
-
- // Set range fblock-lblock checking that DB.<fblock-1>.las exists & DB.<fblock>.las does not
-
- { char *eptr, *fptr;
- FILE *file;
-
- if (argc == 4)
- { fblock = strtol(argv[3],&eptr,10);
- if (*eptr != '\0' && *eptr != '-')
- { fprintf(stderr,"%s: final argument '%s' does not start with an integer\n",
- Prog_Name,argv[3]);
- exit (1);
- }
- if (*eptr == '-')
- { lblock = strtol(eptr+1,&fptr,10);
- if (*fptr != '\0')
- { fprintf(stderr,"%s: second part of range '%s' is not an integer\n",
- Prog_Name,eptr+1);
- exit (1);
- }
- }
- else
- lblock = fblock;
- if (fblock < 1 || lblock > nblocks2 || fblock > lblock)
- { fprintf(stderr,"%s: range %d-%d is empty or out of bounds\n",Prog_Name,fblock,lblock);
- exit (1);
- }
- }
- else
- { fblock = 1;
- lblock = nblocks2;
- }
-
- if (fblock > 1)
- { file = fopen(Catenate(root1,".",root2,Numbered_Suffix(".",fblock-1,".las")),"r");
- if (file == NULL)
- { fprintf(stderr,"%s: File %s.%s.%d.las should already be present!\n",
- Prog_Name,root1,root2,fblock-1);
- exit (1);
- }
- else
- fclose(file);
- }
- if (useblock2)
- { file = fopen(Catenate(root1,".",root2,Numbered_Suffix(".",fblock,".las")),"r");
- if (file != NULL)
- { fprintf(stderr,"%s: File %s.%s.%d.las should not yet exist!\n",
- Prog_Name,root1,root2,fblock);
- exit (1);
- }
- }
- else
- { file = fopen(Catenate(root1,".",root2,".las"),"r");
- if (file != NULL)
- { fprintf(stderr,"%s: File %s.%s.las should not yet exist!\n",
- Prog_Name,root1,root2);
- exit (1);
- }
- }
- }
-
- { int level, njobs;
- int i, j, k;
- int usepath1, usepath2;
-
- // Produce all necessary daligner jobs ...
-
- usepath1 = (strcmp(pwd1,".") != 0);
- usepath2 = (strcmp(pwd2,".") != 0);
-
- njobs = nblocks1 * ( (lblock-fblock)/DUNIT + 1);
-
- printf("# Daligner jobs (%d)\n",njobs);
-
-#ifdef LSF
- jobid = 1;
-#endif
- for (i = 1; i <= nblocks1; i++)
- { int bits;
- int low, hgh;
-
- bits = (lblock-fblock)/DUNIT+1;
- low = fblock;
- for (j = 1; j <= bits; j++)
- {
-#ifdef LSF
- printf(LSF_ALIGN,jobid++);
- printf(" \"");
-#endif
- printf("daligner -A");
- if (VON)
- printf(" -v");
- if (BON)
- printf(" -b");
- printf(" -k%d",KINT);
- if (WINT != 6)
- printf(" -w%d",WINT);
- printf(" -h%d",HINT);
- if (TINT > 0)
- printf(" -t%d",TINT);
- if (HGAP > 0)
- printf(" -H%d",HGAP);
- if (EREL > .1)
- printf(" -e%g",EREL);
- else
- printf(" -e.85");
- if (LINT != 1000)
- printf(" -l%d",LINT);
- if (SINT != 100)
- printf(" -s%d",SINT);
- if (MINT >= 0)
- printf(" -M%d",MINT);
- for (k = 0; k < MTOP; k++)
- printf(" -m%s",MASK[k]);
- if (useblock1)
- if (usepath1)
- printf(" %s/%s.%d",pwd1,root1,i);
- else
- printf(" %s.%d",root1,i);
- else
- if (usepath1)
- printf(" %s/%s",pwd1,root1);
- else
- printf(" %s",root1);
- hgh = fblock + (((lblock-fblock)+1)*j)/bits;
- for (k = low; k < hgh; k++)
- if (useblock2)
- if (usepath2)
- printf(" %s/%s.%d",pwd2,root2,k);
- else
- printf(" %s.%d",root2,k);
- else
- if (usepath2)
- printf(" %s/%s",pwd2,root2);
- else
- printf(" %s",root2);
-#ifdef LSF
- printf("\"");
-#endif
- printf("\n");
- low = hgh;
- }
- }
-
- // ... and then all the initial sort & merge jobs for each block pair
-
- printf("# Initial sort jobs (%d)\n", nblocks1*((lblock-fblock)+1));
-
-#ifdef LSF
- jobid = 1;
-#endif
- for (i = 1; i <= nblocks1; i++)
- for (j = fblock; j <= lblock; j++)
- {
-#ifdef LSF
- printf(LSF_MERGE,jobid++);
- printf(" \"");
-#endif
- printf("LAsort");
- if (VON)
- printf(" -v");
- if (CON)
- printf(" -c");
- for (k = 0; k < NTHREADS; k++)
- { if (useblock1)
- printf(" %s.%d",root1,i);
- else
- printf(" %s",root1);
- if (useblock2)
- printf(".%s.%d.C%d",root2,j,k);
- else
- printf(".%s.C%d",root2,k);
- if (useblock1)
- printf(" %s.%d",root1,i);
- else
- printf(" %s",root1);
- if (useblock2)
- printf(".%s.%d.N%d",root2,j,k);
- else
- printf(".%s.N%d",root2,k);
- }
- printf(" && LAmerge");
- if (VON)
- printf(" -v");
- if (CON)
- printf(" -c");
- if (nblocks1 == 1)
- if (useblock2)
- printf(" %s.%s.%d",root1,root2,j);
- else
- printf(" %s.%s",root1,root2);
- else
- printf(" L1.%d.%d",i,j);
- for (k = 0; k < NTHREADS; k++)
- { if (useblock1)
- printf(" %s.%d",root1,i);
- else
- printf(" %s",root1);
- if (useblock2)
- printf(".%s.%d.C%d.S",root2,j,k);
- else
- printf(".%s.C%d.S",root2,k);
- if (useblock1)
- printf(" %s.%d",root1,i);
- else
- printf(" %s",root1);
- if (useblock2)
- printf(".%s.%d.N%d.S",root2,j,k);
- else
- printf(".%s.N%d.S",root2,k);
- }
- printf(" && rm");
- for (k = 0; k < NTHREADS; k++)
- { if (useblock1)
- printf(" %s.%d",root1,i);
- else
- printf(" %s",root1);
- if (useblock2)
- printf(".%s.%d.C%d.S.las",root2,j,k);
- else
- printf(".%s.C%d.S.las",root2,k);
- if (useblock1)
- printf(" %s.%d",root1,i);
- else
- printf(" %s",root1);
- if (useblock2)
- printf(".%s.%d.N%d.S.las",root2,j,k);
- else
- printf(".%s.N%d.S.las",root2,k);
- if (useblock1)
- printf(" %s.%d",root1,i);
- else
- printf(" %s",root1);
- if (useblock2)
- printf(".%s.%d.C%d.las",root2,j,k);
- else
- printf(".%s.C%d.las",root2,k);
- if (useblock1)
- printf(" %s.%d",root1,i);
- else
- printf(" %s",root1);
- if (useblock2)
- printf(".%s.%d.N%d.las",root2,j,k);
- else
- printf(".%s.N%d.las",root2,k);
- }
-#ifdef LSF
- printf("\"");
-#endif
- printf("\n");
- }
-
- // Higher level merges (if lblock > 1)
-
- if (nblocks1 > 1)
- { int pow, mway;
-
- // Determine most balance mway for merging in ceil(log_mrg nblock1) levels
-
- pow = 1;
- for (level = 0; pow < nblocks1; level++)
- pow *= MUNIT;
-
- for (mway = MUNIT; mway >= 3; mway--)
- if (power(mway,level) < nblocks1)
- break;
- mway += 1;
-
- // Issue the commands for each merge level
-
- { int p, cnt;
-
- cnt = nblocks1;
- for (i = 1; i <= level; i++)
- { int bits;
- int low, hgh;
-
- bits = (cnt-1)/mway+1;
- printf("# Level %d jobs (%d)\n",i,bits*((lblock-fblock)+1));
-
- // Block merges
-
-#ifdef LSF
- jobid = 1;
-#endif
- for (j = fblock; j <= lblock; j++)
- { low = 1;
- for (p = 1; p <= bits; p++)
- { hgh = (cnt*p)/bits;
-#ifdef LSF
- printf(LSF_MERGE,jobid++);
- printf(" \"");
-#endif
- printf("LAmerge");
- if (VON)
- printf(" -v");
- if (CON)
- printf(" -c");
- if (i == level)
- if (useblock2)
- printf(" %s.%s.%d",root1,root2,j);
- else
- printf(" %s.%s",root1,root2);
- else
- printf(" L%d.%d.%d",i+1,j,p);
- for (k = low; k <= hgh; k++)
- printf(" L%d.%d.%d",i,k,j);
- printf(" && rm");
- for (k = low; k <= hgh; k++)
- printf(" L%d.%d.%d.las",i,k,j);
-#ifdef LSF
- printf("\"");
-#endif
- printf("\n");
- low = hgh+1;
- }
- }
-
- cnt = bits;
- }
- }
- }
- }
-
- free(root2);
- free(pwd2);
- free(root1);
- free(pwd1);
-
- exit (0);
-}
diff --git a/LAcat.c b/LAcat.c
index b8b8b0e..5b39340 100644
--- a/LAcat.c
+++ b/LAcat.c
@@ -19,7 +19,7 @@
#include "DB.h"
#include "align.h"
-static char *Usage = "<source:las> > <target>.las";
+static char *Usage = "[-v] <source:las> > <target>.las";
#define MEMORY 1000 // How many megabytes for output buffer
@@ -28,14 +28,32 @@ int main(int argc, char *argv[])
FILE *input;
int64 novl, bsize, ovlsize, ptrsize;
int tspace, tbytes;
- char *pwd, *root;
+ char *pwd, *root, *root2;
- Prog_Name = Strdup("LAcat","");
+ int VERBOSE;
- if (argc <= 1)
- { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
- exit (1);
- }
+ // Process options
+
+ { int i, j, k;
+ int flags[128];
+
+ ARG_INIT("LAcat")
+
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ { ARG_FLAGS("v") }
+ else
+ argv[j++] = argv[i];
+ argc = j;
+
+ VERBOSE = flags['v'];
+
+ if (argc <= 1)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+ }
ptrsize = sizeof(void *);
ovlsize = sizeof(Overlap) - ptrsize;
@@ -49,6 +67,17 @@ int main(int argc, char *argv[])
pwd = PathTo(argv[1]);
root = Root(argv[1],".las");
+ root2 = index(root,'#');
+ if (root2 == NULL)
+ { fprintf(stderr,"%s: No #-sign in source name '%s'\n",Prog_Name,root);
+ exit (1);
+ }
+ if (index(root2+1,'#') != NULL)
+ { fprintf(stderr,"%s: Two or more occurences of #-sign in source name '%s'\n",Prog_Name,root);
+ exit (1);
+ }
+ *root2++ = '\0';
+
{ int64 povl;
int i, mspace;
@@ -57,7 +86,7 @@ int main(int argc, char *argv[])
mspace = 0;
tbytes = 0;
for (i = 0; 1; i++)
- { char *name = Catenate(pwd,"/",root,Numbered_Suffix(".",i+1,".las"));
+ { char *name = Catenate(pwd,"/",Numbered_Suffix(root,i+1,root2),".las");
if ((input = fopen(name,"r")) == NULL) break;
if (fread(&povl,sizeof(int64),1,input) != 1)
@@ -94,7 +123,7 @@ int main(int argc, char *argv[])
otop = oblock + bsize;
for (i = 0; 1; i++)
- { char *name = Catenate(pwd,"/",root,Numbered_Suffix(".",i+1,".las"));
+ { char *name = Catenate(pwd,"/",Numbered_Suffix(root,i+1,root2),".las");
if ((input = fopen(name,"r")) == NULL) break;
if (fread(&povl,sizeof(int64),1,input) != 1)
@@ -102,6 +131,9 @@ int main(int argc, char *argv[])
if (fread(&mspace,sizeof(int),1,input) != 1)
SYSTEM_ERROR
+ if (VERBOSE)
+ fprintf(stderr," Concatenating %s: %lld la\'s\n",Numbered_Suffix(root,i+1,root2),povl);
+
iptr = iblock;
itop = iblock + fread(iblock,1,bsize,input);
@@ -148,6 +180,9 @@ int main(int argc, char *argv[])
fwrite(oblock,1,optr-oblock,stdout);
}
+ if (VERBOSE)
+ fprintf(stderr," Totalling %lld la\'s\n",novl);
+
free(pwd);
free(root);
free(oblock);
diff --git a/LAcheck.c b/LAcheck.c
index 326d140..5de3efe 100644
--- a/LAcheck.c
+++ b/LAcheck.c
@@ -28,6 +28,7 @@ int main(int argc, char *argv[])
int VERBOSE;
int SORTED;
int ISTWO;
+ int status;
// Process options
@@ -119,13 +120,15 @@ int main(int argc, char *argv[])
// For each file do
+ status = 0;
for (i = 2+ISTWO; i < argc; i++)
{ char *pwd, *root;
FILE *input;
char *iptr, *itop;
- Overlap last;
+ Overlap last, prev;
int64 novl;
int tspace, tbytes;
+ int has_chains;
// Establish IO and (novl,tspace) header
@@ -159,11 +162,13 @@ int main(int argc, char *argv[])
// For each record in file do
+ has_chains = 0;
last.aread = -1;
last.bread = -1;
last.flags = 0;
last.path.bbpos = last.path.abpos = 0;
last.path.bepos = last.path.aepos = 0;
+ prev = last;
for (j = 0; j < novl; j++)
{ Overlap ovl;
int tsize;
@@ -236,34 +241,74 @@ int main(int argc, char *argv[])
if (Check_Trace_Points(&ovl,tspace,VERBOSE,root))
goto error;
+ if (j == 0)
+ has_chains = ((ovl.flags & (START_FLAG | NEXT_FLAG | BEST_FLAG)) != 0);
+ if (has_chains)
+ { if ((ovl.flags & (START_FLAG | NEXT_FLAG)) == 0)
+ { if (VERBOSE)
+ fprintf(stderr," %s: LA has both start & next flag set\n",root);
+ goto error;
+ }
+ if (BEST_CHAIN(ovl.flags) && CHAIN_NEXT(ovl.flags))
+ { if (VERBOSE)
+ fprintf(stderr," %s: LA has both best & next flag set\n",root);
+ goto error;
+ }
+ }
+ else
+ { if ((ovl.flags & (START_FLAG | NEXT_FLAG | BEST_FLAG)) != 0)
+ { if (VERBOSE)
+ fprintf(stderr," %s: LAs should not have chain flags\n",root);
+ goto error;
+ }
+ }
+
// Duplicate check and sort check if -S set
equal = 0;
if (SORTED)
- { if (ovl.aread > last.aread) goto inorder;
- if (ovl.aread == last.aread)
- { if (ovl.bread > last.bread) goto inorder;
- if (ovl.bread == last.bread)
- { if (COMP(ovl.flags) > COMP(last.flags)) goto inorder;
- if (COMP(ovl.flags) == COMP(last.flags))
- { if (ovl.path.abpos > last.path.abpos) goto inorder;
- if (ovl.path.abpos == last.path.abpos)
- { equal = 1;
- goto inorder;
+ { if (CHAIN_NEXT(ovl.flags) || !has_chains)
+ { if (ovl.aread > last.aread) goto inorder;
+ if (ovl.aread == last.aread)
+ { if (ovl.bread > last.bread) goto inorder;
+ if (ovl.bread == last.bread)
+ { if (COMP(ovl.flags) > COMP(last.flags)) goto inorder;
+ if (COMP(ovl.flags) == COMP(last.flags))
+ { if (ovl.path.abpos > last.path.abpos) goto inorder;
+ if (ovl.path.abpos == last.path.abpos)
+ { equal = 1;
+ goto inorder;
+ }
}
}
}
+ if (VERBOSE)
+ { if (CHAIN_NEXT(ovl.flags))
+ fprintf(stderr," %s: Chain is not valid (%d vs %d)\n",
+ root,ovl.aread+1,ovl.bread+1);
+ else
+ fprintf(stderr," %s: Reads are not sorted (%d vs %d)\n",
+ root,ovl.aread+1,ovl.bread+1);
+ }
+ goto error;
+ }
+ else
+ { if (ovl.aread > prev.aread) goto inorder;
+ if (ovl.aread == prev.aread)
+ { if (ovl.path.abpos > prev.path.abpos) goto inorder;
+ if (ovl.path.abpos == prev.path.abpos)
+ goto dupcheck;
+ }
+ if (VERBOSE)
+ fprintf(stderr," %s: Chains are not sorted (%d vs %d)\n",
+ root,ovl.aread+1,ovl.bread+1);
+ goto error;
}
- if (VERBOSE)
- fprintf(stderr," %s: Reads are not sorted (%d vs %d)\n",
- root,ovl.aread+1,ovl.bread+1);
- goto error;
- }
- else
- { if (ovl.aread == last.aread && ovl.bread == last.bread &&
- COMP(ovl.flags) == COMP(last.flags) && ovl.path.abpos == last.path.abpos)
- equal = 1;
}
+ dupcheck:
+ if (ovl.aread == last.aread && ovl.bread == last.bread &&
+ COMP(ovl.flags) == COMP(last.flags) && ovl.path.abpos == last.path.abpos)
+ equal = 1;
inorder:
if (equal)
{ if (ovl.path.aepos == last.path.aepos &&
@@ -277,6 +322,8 @@ int main(int argc, char *argv[])
}
last = ovl;
+ if (CHAIN_START(ovl.flags))
+ prev = ovl;
}
// File processing epilog: Check all data read and print OK if -v
@@ -292,9 +339,13 @@ int main(int argc, char *argv[])
Print_Number(novl,0,stderr);
fprintf(stderr," all OK\n");
}
+ goto cleanup;
error:
- fclose(input);
+ status = 1;
+ cleanup:
+ if (input != NULL)
+ fclose(input);
free(pwd);
free(root);
}
@@ -306,5 +357,5 @@ int main(int argc, char *argv[])
if (ISTWO)
Close_DB(db2);
- exit (0);
+ exit (status);
}
diff --git a/LAdump.c b/LAdump.c
index 6d48694..070c3c2 100644
--- a/LAdump.c
+++ b/LAdump.c
@@ -445,9 +445,18 @@ int main(int argc, char *argv[])
printf("P %d %d",ovl->aread+1,ovl->bread+1);
if (COMP(ovl->flags))
- printf(" c\n");
+ printf(" c");
else
- printf(" n\n");
+ printf(" n");
+ if (CHAIN_NEXT(ovl->flags))
+ printf(" -");
+ else if (BEST_CHAIN(ovl->flags))
+ printf(" >");
+ else if (CHAIN_START(ovl->flags))
+ printf(" +");
+ else
+ printf(" .");
+ printf("\n");
if (DOCOORDS)
printf("C %d %d %d %d\n",ovl->path.abpos,ovl->path.aepos,ovl->path.bbpos,ovl->path.bepos);
diff --git a/LAmerge.c b/LAmerge.c
index 00954e0..985a5f6 100644
--- a/LAmerge.c
+++ b/LAmerge.c
@@ -18,7 +18,7 @@
#include "DB.h"
#include "align.h"
-static char *Usage = "[-v] <merge:las> <parts:las> ...";
+static char *Usage = "[-va] <merge:las> <parts:las> ...";
#define MEMORY 4000 // in Mb
@@ -41,6 +41,10 @@ static char *Usage = "[-v] <merge:las> <parts:las> ...";
bigger = 0; \
else if (lp->path.abpos > rp->path.abpos) \
bigger = 1; \
+ else if (lp->path.abpos < rp->path.abpos) \
+ bigger = 0; \
+ else if (lp > rp) \
+ bigger = 1; \
else \
bigger = 0;
@@ -92,6 +96,10 @@ static void reheap(int s, Overlap **heap, int hsize)
bigger = 0; \
else if (lp->path.abpos > rp->path.abpos) \
bigger = 1; \
+ else if (lp->path.abpos < rp->path.abpos) \
+ bigger = 0; \
+ else if (lp > rp) \
+ bigger = 1; \
else \
bigger = 0;
@@ -194,13 +202,13 @@ int main(int argc, char *argv[])
j = 1;
for (i = 1; i < argc; i++)
if (argv[i][0] == '-')
- { ARG_FLAGS("vc") }
+ { ARG_FLAGS("va") }
else
argv[j++] = argv[i];
argc = j;
VERBOSE = flags['v'];
- MAP_SORT = flags['c'];
+ MAP_SORT = flags['a'];
if (argc < 3)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
@@ -334,31 +342,33 @@ int main(int argc, char *argv[])
ov = heap[1];
src = in + (ov - ovls);
- src->count += 1;
+ do
+ { src->count += 1;
- tsize = ov->path.tlen*tbytes;
- span = osize + tsize;
- if (src->ptr + span > src->top)
- ovl_reload(src,bsize);
- if (optr + span > otop)
- { fwrite(oblock,1,optr-oblock,output);
- optr = oblock;
- }
+ tsize = ov->path.tlen*tbytes;
+ span = osize + tsize;
+ if (src->ptr + span > src->top)
+ ovl_reload(src,bsize);
+ if (optr + span > otop)
+ { fwrite(oblock,1,optr-oblock,output);
+ optr = oblock;
+ }
- memcpy(optr,((char *) ov) + psize,osize);
- optr += osize;
- memcpy(optr,src->ptr,tsize);
- optr += tsize;
+ memcpy(optr,((char *) ov) + psize,osize);
+ optr += osize;
+ memcpy(optr,src->ptr,tsize);
+ optr += tsize;
- src->ptr += tsize;
- if (src->ptr < src->top)
- { *ov = *((Overlap *) (src->ptr - psize));
+ src->ptr += tsize;
+ if (src->ptr >= src->top)
+ { heap[1] = heap[hsize];
+ hsize -= 1;
+ break;
+ }
+ *ov = *((Overlap *) (src->ptr - psize));
src->ptr += osize;
}
- else
- { heap[1] = heap[hsize];
- hsize -= 1;
- }
+ while (CHAIN_NEXT(ov->flags));
}
// Flush output buffer and wind up
@@ -373,7 +383,7 @@ int main(int argc, char *argv[])
for (i = 0; i < fway; i++)
totl -= in[i].count;
if (totl != 0)
- { fprintf(stderr,"%s: Did not write all records (%lld)\n",argv[0],totl);
+ { fprintf(stderr,"%s: Did not write all records to %s (%lld)\n",argv[0],argv[1],totl);
exit (1);
}
diff --git a/LAshow.c b/LAshow.c
index 66926e9..4d6cf45 100644
--- a/LAshow.c
+++ b/LAshow.c
@@ -287,6 +287,10 @@ int main(int argc, char *argv[])
SYSTEM_ERROR
if (fread(&tspace,sizeof(int),1,input) != 1)
SYSTEM_ERROR
+ if (tspace <= 0)
+ { fprintf(stderr,"%s: Garbage .las file, trace spacing <= 0 !\n",Prog_Name);
+ exit (1);
+ }
if (tspace <= TRACE_XOVR)
{ small = 1;
@@ -457,6 +461,14 @@ int main(int argc, char *argv[])
if (ALIGN || CARTOON || REFERENCE)
printf("\n");
+
+ if (BEST_CHAIN(ovl->flags))
+ printf("> ");
+ else if (CHAIN_START(ovl->flags))
+ printf("+ ");
+ else if (CHAIN_NEXT(ovl->flags))
+ printf(" -");
+
if (FLIP)
{ Flip_Alignment(aln,0);
Print_Number((int64) ovl->bread+1,ar_wide+1,stdout);
@@ -472,15 +484,49 @@ int main(int argc, char *argv[])
printf(" c");
else
printf(" n");
- printf(" [");
+ if (ovl->path.abpos == 0)
+ printf(" <");
+ else
+ printf(" [");
Print_Number((int64) ovl->path.abpos,ai_wide,stdout);
printf("..");
Print_Number((int64) ovl->path.aepos,ai_wide,stdout);
- printf("] x [");
- Print_Number((int64) ovl->path.bbpos,bi_wide,stdout);
- printf("..");
- Print_Number((int64) ovl->path.bepos,bi_wide,stdout);
- printf("]");
+ if (ovl->path.aepos == aln->alen)
+ printf("> x ");
+ else
+ printf("] x ");
+ if (ovl->path.bbpos == 0)
+ printf("<");
+ else
+ printf("[");
+ if (COMP(ovl->flags))
+ { Print_Number((int64) (aln->blen - ovl->path.bbpos),bi_wide,stdout);
+ printf("..");
+ Print_Number((int64) (aln->blen - ovl->path.bepos),bi_wide,stdout);
+ }
+ else
+ { Print_Number((int64) ovl->path.bbpos,bi_wide,stdout);
+ printf("..");
+ Print_Number((int64) ovl->path.bepos,bi_wide,stdout);
+ }
+ if (ovl->path.bepos == aln->blen)
+ printf(">");
+ else
+ printf("]");
+
+ if (CARTOON)
+ { printf(" (");
+ Print_Number(tps,tp_wide,stdout);
+ printf(" trace pts)\n\n");
+ }
+ else
+ { printf(" ~ %4.1f%% (",(200.*ovl->path.diffs) /
+ ((ovl->path.aepos - ovl->path.abpos) + (ovl->path.bepos - ovl->path.bbpos)) );
+ Print_Number((int64) ovl->path.diffs,mn_wide,stdout);
+ printf(" diffs, ");
+ Print_Number(tps,tp_wide,stdout);
+ printf(" trace pts)\n");
+ }
if (ALIGN || CARTOON || REFERENCE)
{ if (ALIGN || REFERENCE)
@@ -548,30 +594,12 @@ int main(int argc, char *argv[])
}
}
if (CARTOON)
- { printf(" (");
- Print_Number(tps,tp_wide,stdout);
- printf(" trace pts)\n\n");
- Alignment_Cartoon(stdout,aln,INDENT,mx_wide);
- }
- else
- { printf(" : = ");
- Print_Number((int64) ovl->path.diffs,mn_wide,stdout);
- printf(" diffs (");
- Print_Number(tps,tp_wide,stdout);
- printf(" trace pts)\n");
- }
+ Alignment_Cartoon(stdout,aln,INDENT,mx_wide);
if (REFERENCE)
Print_Reference(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide);
if (ALIGN)
Print_Alignment(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide);
}
- else
- { printf(" : < ");
- Print_Number((int64) ovl->path.diffs,mn_wide,stdout);
- printf(" diffs (");
- Print_Number(tps,tp_wide,stdout);
- printf(" trace pts)\n");
- }
}
free(trace);
diff --git a/LAsort.c b/LAsort.c
index df54b0a..ce0d8d1 100644
--- a/LAsort.c
+++ b/LAsort.c
@@ -19,7 +19,7 @@
#include "DB.h"
#include "align.h"
-static char *Usage = "[-v] <align:las> ...";
+static char *Usage = "[-va] <align:las> ...";
#define MEMORY 1000 // How many megabytes for output buffer
@@ -55,7 +55,15 @@ static int SORT_OVL(const void *x, const void *y)
pl = ol->path.abpos;
pr = or->path.abpos;
- return (pl-pr);
+ if (pl != pr)
+ return (pl-pr);
+
+ if (ol < or)
+ return (-1);
+ else if (ol > or)
+ return (1);
+ else
+ return (0);
}
static int SORT_MAP(const void *x, const void *y)
@@ -76,11 +84,19 @@ static int SORT_MAP(const void *x, const void *y)
pl = ol->path.abpos;
pr = or->path.abpos;
- return (pl-pr);
+ if (pl != pr)
+ return (pl-pr);
+
+ if (ol < or)
+ return (-1);
+ else if (ol > or)
+ return (1);
+ else
+ return (0);
}
int main(int argc, char *argv[])
-{ char *iblock, *fblock;
+{ char *iblock, *fblock, *iend;
int64 isize, osize;
int64 ovlsize, ptrsize;
int tspace, tbytes;
@@ -99,13 +115,13 @@ int main(int argc, char *argv[])
j = 1;
for (i = 1; i < argc; i++)
if (argv[i][0] == '-')
- { ARG_FLAGS("vc") }
+ { ARG_FLAGS("va") }
else
argv[j++] = argv[i];
argc = j;
VERBOSE = flags['v'];
- MAP_ORDER = flags['c'];
+ MAP_ORDER = flags['a'];
if (argc <= 1)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
@@ -125,7 +141,7 @@ int main(int argc, char *argv[])
for (i = 1; i < argc; i++)
{ int64 *perm;
FILE *input, *foutput;
- int64 novl;
+ int64 novl, sov;
// Read in the entire file and output header
@@ -188,6 +204,7 @@ int main(int argc, char *argv[])
SYSTEM_ERROR
}
fclose(input);
+ iend = iblock + (size - ptrsize);
}
// Set up unsorted permutation array
@@ -199,10 +216,22 @@ int main(int argc, char *argv[])
{ int64 off;
int j;
- off = -ptrsize;
- for (j = 0; j < novl; j++)
- { perm[j] = off;
- off += ovlsize + ((Overlap *) (iblock+off))->path.tlen*tbytes;
+ if (CHAIN_START(((Overlap *) (iblock-ptrsize))->flags))
+ { sov = 0;
+ off = -ptrsize;
+ for (j = 0; j < novl; j++)
+ { if (CHAIN_START(((Overlap *) (iblock+off))->flags))
+ perm[sov++] = off;
+ off += ovlsize + ((Overlap *) (iblock+off))->path.tlen*tbytes;
+ }
+ }
+ else
+ { off = -ptrsize;
+ for (j = 0; j < novl; j++)
+ { perm[j] = off;
+ off += ovlsize + ((Overlap *) (iblock+off))->path.tlen*tbytes;
+ }
+ sov = novl;
}
}
@@ -210,31 +239,35 @@ int main(int argc, char *argv[])
IBLOCK = iblock;
if (MAP_ORDER)
- qsort(perm,novl,sizeof(int64),SORT_MAP);
+ qsort(perm,sov,sizeof(int64),SORT_MAP);
else
- qsort(perm,novl,sizeof(int64),SORT_OVL);
+ qsort(perm,sov,sizeof(int64),SORT_OVL);
// Output the records in sorted order
{ int j;
Overlap *w;
int64 tsize, span;
- char *fptr, *ftop;
+ char *fptr, *ftop, *wo;
fptr = fblock;
ftop = fblock + osize;
- for (j = 0; j < novl; j++)
- { w = (Overlap *) (iblock+perm[j]);
- tsize = w->path.tlen*tbytes;
- span = ovlsize + tsize;
- if (fptr + span > ftop)
- { fwrite(fblock,1,fptr-fblock,foutput);
- fptr = fblock;
+ for (j = 0; j < sov; j++)
+ { w = (Overlap *) (wo = iblock+perm[j]);
+ do
+ { tsize = w->path.tlen*tbytes;
+ span = ovlsize + tsize;
+ if (fptr + span > ftop)
+ { fwrite(fblock,1,fptr-fblock,foutput);
+ fptr = fblock;
+ }
+ memcpy(fptr,((char *) w)+ptrsize,ovlsize);
+ fptr += ovlsize;
+ memcpy(fptr,(char *) (w+1),tsize);
+ fptr += tsize;
+ w = (Overlap *) (wo += span);
}
- memcpy(fptr,((char *) w)+ptrsize,ovlsize);
- fptr += ovlsize;
- memcpy(fptr,(char *) (w+1),tsize);
- fptr += tsize;
+ while (wo < iend && CHAIN_NEXT(w->flags));
}
if (fptr > fblock)
fwrite(fblock,1,fptr-fblock,foutput);
diff --git a/LAsplit.c b/LAsplit.c
index 49d5ed0..adf6cf5 100644
--- a/LAsplit.c
+++ b/LAsplit.c
@@ -19,7 +19,7 @@
#include "DB.h"
#include "align.h"
-static char *Usage = "<align:las> (<parts:int> | <path:db|dam>) < <source>.las";
+static char *Usage = "-v <align:las> (<parts:int> | <path:db|dam>) < <source>.las";
#define MEMORY 1000 // How many megabytes for output buffer
@@ -29,14 +29,32 @@ int main(int argc, char *argv[])
int64 novl, bsize, ovlsize, ptrsize;
int parts, tspace, tbytes;
int olast, blast;
- char *root, *pwd;
+ char *pwd, *root, *root2;
- Prog_Name = Strdup("LAsplit","");
+ int VERBOSE;
- if (argc != 3)
- { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
- exit (1);
- }
+ // Process options
+
+ { int i, j, k;
+ int flags[128];
+
+ ARG_INIT("LAsplit")
+
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ { ARG_FLAGS("v") }
+ else
+ argv[j++] = argv[i];
+ argc = j;
+
+ VERBOSE = flags['v'];
+
+ if (argc != 3)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+ }
{ char *eptr;
int nfiles, cutoff, all;
@@ -98,6 +116,17 @@ int main(int argc, char *argv[])
pwd = PathTo(argv[1]);
root = Root(argv[1],".las");
+ root2 = index(root,'#');
+ if (root2 == NULL)
+ { fprintf(stderr,"%s: No #-sign in source name '%s'\n",Prog_Name,root);
+ exit (1);
+ }
+ if (index(root2+1,'#') != NULL)
+ { fprintf(stderr,"%s: Two or more occurences of #-sign in source name '%s'\n",Prog_Name,root);
+ exit (1);
+ }
+ *root2++ = '\0';
+
if (fread(&novl,sizeof(int64),1,stdin) != 1)
SYSTEM_ERROR
if (fread(&tspace,sizeof(int),1,stdin) != 1)
@@ -107,6 +136,9 @@ int main(int argc, char *argv[])
else
tbytes = sizeof(uint16);
+ if (VERBOSE)
+ fprintf(stderr," Distributing %lld la\'s\n",novl);
+
{ int i, j;
Overlap *w;
int low, hgh, last;
@@ -119,7 +151,7 @@ int main(int argc, char *argv[])
hgh = 0;
for (i = 0; i < parts; i++)
- { output = Fopen(Catenate(pwd,"/",root,Numbered_Suffix(".",i+1,".las")),"w");
+ { output = Fopen(Catenate(pwd,"/",Numbered_Suffix(root,i+1,root2),".las"),"w");
if (output == NULL)
exit (1);
@@ -194,6 +226,9 @@ int main(int argc, char *argv[])
povl = hgh-low;
fwrite(&povl,sizeof(int64),1,output);
+ if (VERBOSE)
+ fprintf(stderr," Split off %s: %lld la\'s\n",Numbered_Suffix(root,i+1,root2),povl);
+
fclose(output);
}
}
diff --git a/LAupgrade.Dec.31.2014.c b/LAupgrade.Dec.31.2014.c
deleted file mode 100644
index f569caa..0000000
--- a/LAupgrade.Dec.31.2014.c
+++ /dev/null
@@ -1,153 +0,0 @@
-/*******************************************************************************************
- *
- * Convert older .las files so that the alen and blen fields are removed.
- *
- * Author: Gene Myers
- * Date : Dec 2014
- *
- *******************************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include "DB.h"
-#include "align.h"
-
-typedef struct
- { void *trace;
- uint16 tlen;
- uint16 diffs;
- uint16 abpos, bbpos;
- uint16 aepos, bepos;
- } PathOld;
-
-typedef struct {
- PathOld path;
- int aread;
- int bread;
- uint16 alen;
- uint16 blen;
- int flags;
-} OverlapOld;
-
-static char *Usage = "<source:las> > <target>.las";
-
-#define MEMORY 1000 // How many megabytes for output buffer
-
-int main(int argc, char *argv[])
-{ char *iblock, *oblock;
- FILE *input;
- int64 novl, bsize, ovlsize, newsize, ptrsize;
- int tspace, tbytes;
- char *pwd, *root;
-
- Prog_Name = Strdup("Upgrade","");
-
- if (argc <= 1)
- { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
- exit (1);
- }
-
- ptrsize = sizeof(void *);
- ovlsize = sizeof(OverlapOld) - ptrsize;
- newsize = sizeof(Overlap ) - ptrsize;
- bsize = MEMORY * 1000000ll;
- oblock = (char *) Malloc(bsize,"Allocating output block");
- iblock = (char *) Malloc(bsize + ptrsize,"Allocating input block");
- if (oblock == NULL || iblock == NULL)
- exit (1);
- iblock += ptrsize;
-
- pwd = PathTo(argv[1]);
- root = Root(argv[1],".las");
- input = Fopen(Catenate(pwd,"/",root,".las"),"r");
- if (input == NULL)
- exit (1);
- free(pwd);
- free(root);
-
- if (fread(&novl,sizeof(int64),1,input) != 1)
- SYSTEM_ERROR
- if (fread(&tspace,sizeof(int),1,input) != 1)
- SYSTEM_ERROR
- if (tspace <= TRACE_XOVR)
- tbytes = sizeof(uint8);
- else
- tbytes = sizeof(uint16);
-
- fwrite(&novl,sizeof(int64),1,stdout);
- fwrite(&tspace,sizeof(int),1,stdout);
-
- { int j;
- OverlapOld *w;
- Overlap *v;
- int64 tsize;
- char *iptr, *itop;
- char *optr, *otop;
-
- optr = oblock;
- otop = oblock + bsize;
-
- iptr = iblock;
- itop = iblock + fread(iblock,1,bsize,input);
-
- for (j = 0; j < novl; j++)
- { if (iptr + ovlsize > itop)
- { int64 remains = itop-iptr;
- if (remains > 0)
- memcpy(iblock,iptr,remains);
- iptr = iblock;
- itop = iblock + remains;
- itop += fread(itop,1,bsize-remains,input);
- }
-
- w = (OverlapOld *) (iptr - ptrsize);
- tsize = w->path.tlen*tbytes;
-
- if (optr + newsize + tsize > otop)
- { fwrite(oblock,1,optr-oblock,stdout);
- optr = oblock;
- }
-
- v = (Overlap *) (optr - ptrsize);
- v->path.abpos = w->path.abpos;
- v->path.bbpos = w->path.bbpos;
- v->path.aepos = w->path.aepos;
- v->path.bepos = w->path.bepos;
- v->path.diffs = w->path.diffs;
- v->path.tlen = w->path.tlen;
- v->aread = w->aread;
- v->bread = w->bread;
- v->flags = w->flags;
-
- optr += newsize;
- iptr += ovlsize;
-
- if (iptr + tsize > itop)
- { int64 remains = itop-iptr;
- if (remains > 0)
- memcpy(iblock,iptr,remains);
- iptr = iblock;
- itop = iblock + remains;
- itop += fread(itop,1,bsize-remains,input);
- }
-
- memcpy(optr,iptr,tsize);
- optr += tsize;
- iptr += tsize;
- }
- if (optr > oblock)
- fwrite(oblock,1,optr-oblock,stdout);
- }
-
- fclose(input);
- free(oblock);
- free(iblock-ptrsize);
-
- exit (0);
-}
diff --git a/Makefile b/Makefile
index c9b027a..5df518c 100644
--- a/Makefile
+++ b/Makefile
@@ -1,17 +1,16 @@
+DEST_DIR = ~/bin
+
CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing
-ALL = daligner HPCdaligner HPCmapper LAsort LAmerge LAsplit LAcat LAshow LAdump LAcheck LAindex
+ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LAdump LAcheck LAindex
all: $(ALL)
daligner: daligner.c filter.c filter.h align.c align.h DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o daligner daligner.c filter.c align.c DB.c QV.c -lpthread -lm
-HPCdaligner: HPCdaligner.c DB.c DB.h QV.c QV.h
- gcc $(CFLAGS) -o HPCdaligner HPCdaligner.c DB.c QV.c -lm
-
-HPCmapper: HPCmapper.c DB.c DB.h QV.c QV.h
- gcc $(CFLAGS) -o HPCmapper HPCmapper.c DB.c QV.c -lm
+HPC.daligner: HPC.daligner.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm
LAsort: LAsort.c align.h DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm
@@ -47,8 +46,8 @@ clean:
rm -f daligner.tar.gz
install:
- cp $(ALL) ~/bin
+ cp $(ALL) $(DEST_DIR)
package:
make clean
- tar -zcf daligner.tar.gz README *.h *.c Makefile
+ tar -zcf daligner.tar.gz README.md Makefile *.h *.c
diff --git a/QV.c b/QV.c
index 2c4f3bd..cdb6a63 100644
--- a/QV.c
+++ b/QV.c
@@ -738,6 +738,12 @@ static int Nline; // Referred by: QVcoding_Scan
char *QVentry()
{ return (Read); }
+void Set_QV_Line(int line)
+{ Nline = line; }
+
+int Get_QV_Line()
+{ return (Nline); }
+
// If nlines == 1 trying to read a single header, nlines = 5 trying to read 5 QV/fasta lines
// for a sequence. Place line j at Read+j*Rmax and the length of every line is returned
// unless eof occurs in which case return -1. If any error occurs return -2.
@@ -847,8 +853,9 @@ static void Unpack_Tag(char *tags, int clen, char *qvs, int rlen, int rchar)
*
********************************************************************************************/
- // Read .quiva file from input, recording stats in the histograms. If zero is set then
- // start the stats anew with this file.
+ // Read up to the next num entries or until eof from the .quiva file on input and record
+ // frequency statistics. Copy these entries to the temporary file temp if != NULL.
+ // If there is an error then -1 is returned, otherwise the number of entries read.
static uint64 delHist[256], insHist[256], mrgHist[256], subHist[256], delRun[256], subRun[256];
static uint64 totChar;
@@ -856,9 +863,10 @@ static int delChar, subChar;
// Referred by: QVcoding_Scan, Create_QVcoding
-int QVcoding_Scan(FILE *input)
+int QVcoding_Scan(FILE *input, int num, FILE *temp)
{ char *slash;
int rlen;
+ int i, r;
// Zero histograms
@@ -867,11 +875,8 @@ int QVcoding_Scan(FILE *input)
bzero(insHist,sizeof(uint64)*256);
bzero(subHist,sizeof(uint64)*256);
- { int i;
-
- for (i = 0; i < 256; i++)
- delRun[i] = subRun[i] = 1;
- }
+ for (i = 0; i < 256; i++)
+ delRun[i] = subRun[i] = 1;
totChar = 0;
delChar = -1;
@@ -880,37 +885,48 @@ int QVcoding_Scan(FILE *input)
// Make a sweep through the .quiva entries, histogramming the relevant things
// and figuring out the run chars for the deletion and substition streams
- Nline = 0;
- while (1)
+ r = 0;
+ for (i = 0; i < num; i++)
{ int well, beg, end, qv;
rlen = Read_Lines(input,1);
if (rlen == -2)
- EXIT(1);
+ EXIT(-1);
if (rlen < 0)
break;
if (rlen == 0 || Read[0] != '@')
- { EPRINTF(EPLACE,"Line %d: Header in quiv file is missing\n",Nline);
- EXIT(1);
+ { EPRINTF(EPLACE,"Line %d: Header in quiva file is missing\n",Nline);
+ EXIT(-1);
}
slash = index(Read+1,'/');
if (slash == NULL)
{ EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n",
Prog_Name,Nline);
- EXIT(1);
+ EXIT(-1);
}
if (sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) != 4)
{ EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n",
Prog_Name,Nline);
- EXIT(1);
+ EXIT(-1);
}
+ if (temp != NULL)
+ fputs(Read,temp);
+
rlen = Read_Lines(input,5);
if (rlen < 0)
{ if (rlen == -1)
EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline);
- EXIT(1);
+ EXIT(-1);
+ }
+
+ if (temp != NULL)
+ { fputs(Read,temp);
+ fputs(Read+Rmax,temp);
+ fputs(Read+2*Rmax,temp);
+ fputs(Read+3*Rmax,temp);
+ fputs(Read+4*Rmax,temp);
}
Histogram_Seqs(delHist,(uint8 *) (Read),rlen);
@@ -943,9 +959,11 @@ int QVcoding_Scan(FILE *input)
}
if (subChar >= 0)
Histogram_Runs( subRun,(uint8 *) (Read+4*Rmax),rlen,subChar);
+
+ r += 1;
}
- return (0);
+ return (r);
}
// Using the statistics in the global stat tables, create the Huffman schemes and write
@@ -1275,7 +1293,7 @@ int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy
if (rlen < 0)
{ if (rlen == -1)
EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline);
- EXIT (1);
+ EXIT (-1);
}
if (coding->delChar < 0)
@@ -1310,7 +1328,7 @@ int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy
Encode_Run(coding->subScheme, coding->sRunScheme, output,
(uint8 *) (Read+4*Rmax), rlen, coding->subChar);
- return (0);
+ return (rlen);
}
int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen)
diff --git a/QV.h b/QV.h
index 1ea7dc8..532b2f4 100644
--- a/QV.h
+++ b/QV.h
@@ -13,6 +13,8 @@
#ifndef _QV_COMPRESSOR
+#include <stdio.h>
+
#define _QV_COMPRESSOR
// The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or
@@ -46,10 +48,16 @@ typedef struct
int Read_Lines(FILE *input, int nlines);
char *QVentry();
- // Read the .quiva file on input and record frequency statistics. If there is an error
- // then 1 is returned, otherwise 0.
+ // Get and set the line counter for error reporting
+
+void Set_QV_Line(int line);
+int Get_QV_Line();
-int QVcoding_Scan(FILE *input);
+ // Read up to the next num entries or until eof from the .quiva file on input and record
+ // frequency statistics. Copy these entries to the temporary file temp if != NULL.
+ // If there is an error then -1 is returned, otherwise the number of entries read.
+
+int QVcoding_Scan(FILE *input, int num, FILE *temp);
// Given QVcoding_Scan has been called at least once, create an encoding scheme based on
// the accumulated statistics and return a pointer to it. The returned encoding object
@@ -71,8 +79,8 @@ void Free_QVcoding(QVcoding *coding);
// Assuming the file pointer is positioned just beyond an entry header line, read the
// next set of 5 QV lines, compress them according to 'coding', and output. If lossy
- // is set then the scheme is a lossy one. A non-zero value is return only if an
- // error occured.
+ // is set then the scheme is a lossy one. A negative value is returned if an error
+ // occurred, and the sequence length otherwise.
int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy);
diff --git a/README b/README
deleted file mode 100644
index 439af9e..0000000
--- a/README
+++ /dev/null
@@ -1,395 +0,0 @@
-
-
-
-*** PLEASE GO TO THE DAZZLER BLOG (https://dazzlerblog.wordpress.com) FOR TYPESET ***
- DOCUMENTATION, EXAMPLES OF USE, AND DESIGN PHILOSOPHY.
-
-
-/************************************************************************************\
-
-UPGRADE & DEVELOPER NOTES ! ! !
-
- If you have already performed a big comparison and don't want to recompute all your
-local alignments in .las files, but do want to use a more recent version of the
-software that entails a change to the data structures (currently the update on
-December 31, 2014), please note the routine LAupgrade.Dec.31.2014. This take a .las file,
-say X.las, as an argument, and writes to standard output the .las file in the new
-format.
-
- The program can be made with "make" but is not by default created when make is
-called without an argument.
-
- For those interested in the details, on December 30, the "alen" and "blen" fields
-were dropped to save space as they can always be gotten from the underlying DB.
-
-\************************************************************************************/
-
-
- The Daligner Overlap Library
-
- Author: Gene Myers
- First: July 17, 2013
- Current: December 31, 2014
-
- The commands below permit one to find all significant local alignments between reads
-encoded in Dazzler database. The assumption is that the reads are from a PACBIO RS II
-long read sequencer. That is the reads are long and noisy, up to 15% on average.
-
- Recall that a database has a current partition that divides it into blocks of a size
-that can conveniently be handled by calling the "dalign" overlapper on all the pairs of
-blocks producing a collection of .las local alignment files that can then be sorted and
-merged into an ordered sequence of sorted files containing all alignments between reads
-in the data set. The alignment records are parsimonious in that they do not record an
-alignment but simply a set of trace points, typically every 100bp or so, that allow the
-efficient reconstruction of alignments on demand.
-
-1. daligner [-vbAI] [-k<int(14)>] [-w<int(6)>] [-h<int(35)>] [-t<int>] [-M<int>]
- [-e<double(.70)] [-l<int(1000)] [-s<int(100)>] [-H<int>]
- [-m<track>]+ <subject:db|dam> <target:db|dam> ...
-
-Compare sequences in the trimmed <subject> block against those in the list of <target>
-blocks searching for local alignments involving at least -l base pairs (default 1000)
-or more, that have an average correlation rate of -e (default 70%). The local
-alignments found will be output in a sparse encoding where a trace point on the
-alignment is recorded every -s base pairs of the a-read (default 100bp). Reads are
-compared in both orientations and local alignments meeting the criteria are output to
-one of several created files described below. The -v option turns on a verbose
-reporting mode that gives statistics on each major step of the computation.
-
-The options -k, -h, and -w control the initial filtration search for possible matches
-between reads. Specifically, our search code looks for a pair of diagonal bands of
-width 2^w (default 2^6 = 64) that contain a collection of exact matching k-mers
-(default 14) between the two reads, such that the total number of bases covered by the
-k-mer hits is h (default 35). k cannot be larger than 32 in the current implementation.
-If the -b option is set, then the daligner assumes the data has a strong compositional
-bias (e.g. >65% AT rich), and at the cost of a bit more time, dynamically adjusts k-mer
-sizes depending on compositional bias, so that the mers used have an effective
-specificity of 4^k.
-
-If there are one or more interval tracks specified with the -m option, then the reads
-of the DB or DB's to which the mask applies are soft masked with the union of the
-intervals of all the interval tracks that apply, that is any k-mers that contain any
-bases in any of the masked intervals are ignored for the purposes of seeding a match.
-An interval track is a track, such as the "dust" track created by DBdust, that encodes
-a set of intervals over either the untrimmed or trimmed DB.
-
-Invariably, some k-mers are significantly over-represented (e.g. homopolymer runs).
-These k-mers create an excessive number of matching k-mer pairs and left unaddressed
-would cause daligner to overflow the available physical memory. One way to deal with
-this is to explicitly set the -t parameter which suppresses the use of any k-mer that
-occurs more than t times in either the subject or target block. However, a better way
-to handle the situation is to let the program automatically select a value of t that
-meets a given memory usage limit specified (in Gb) by the -M parameter. By default
-daligner will use the amount of physical memory as the choice for -M. If you want to
-use less, say only 8Gb on a 24Gb HPC cluster node because you want to run 3 daligner
-jobs on the node, then specify -M8. Specifying -M0 basically indicates that you do not
-want daligner to self adjust k-mer suppression to fit within a given amount of memory.
-
-For each subject, target pair of blocks, say X and Y, the program reports alignments
-where the a-read is in X and the b-read is in Y, and vice versa. However, if the -A
-option is set ("A" for "asymmetric") then just overlaps where the a-read is in X and
-the b-read is in Y are reported, and if X = Y, then it further reports only those
-overlaps where the a-read index is less than the b-read index. In either case, if the
--I option is set ("I" for "identity") then when X = Y, overlaps between different
-portions of the same read will also be found and reported.
-
-Each found alignment is recorded as -- a[ab,ae] x bo[bb,be] -- where a and b are the
-indices (in the trimmed DB) of the reads that overlap, o indicates whether the b-read
-is from the same or opposite strand, and [ab,ae] and [bb,be] are the intervals of a
-and bo, respectively, that align. The program places these alignment records in files
-whose name is of the form X.Y.[C|N]#.las where C indicates that the b-reads are
-complemented and N indicates they are not (both comparisons are performed) and # is
-the thread that detected and wrote out the collection of alignments contained in the
-file. That is the file X.Y.O#.las contains the alignments produced by thread # for
-which the a-read is from X and the b-read is from Y and in orientation O. The command
-"daligner -A X Y" produces 2*NTHREAD thread files X.Y.?.las and "daligner X Y"
-produces 4*NTHREAD files X.Y.?.las and Y.X.?.las (unless X=Y in which case only
-NTHREAD files, X.X.?.las, are produced).
-
-By default daligner compares all overlaps between reads in the database that are
-greater than the minimum cutoff set when the DB or DBs were split, typically 1 or
-2 Kbp. However, the HGAP assembly pipeline only wants to correct large reads, say
-8Kbp or over, and so needs only the overlaps where the a-read is one of the large
-reads. By setting the -H parameter to say N, one alters daligner so that it only
-reports overlaps where the a-read is over N base-pairs long.
-
-While the default parameter settings are good for raw Pacbio data, daligner can be used
-for efficiently finding alignments in corrected reads or other less noisy reads. For
-example, for mapping applications against .dams we run "daligner -k20 -h60 -e.85" and
-on corrected reads, we typically run "daligner -k25 -w5 -h60 -e.95 -s500" and at
-these settings it is very fast.
-
-
-2. LAsort [-v] <align:las> ...
-
-Sort each .las alignment file specified on the command line. For each file it reads in
-all the overlaps in the file and sorts them in lexicographical order of (a,b,o,ab)
-assuming each alignment is recorded as a[ab,ae] x b^o[bb,be]. It then writes them all
-to a file named <align>.S.las (assuming that the input file was <align>.las). With the
--v option set then the program reports the number of records read and written.
-
-
-3. LAmerge [-v] <merge:las> <parts:las> ...
-
-Merge the .las files <parts> into a singled sorted file <merge>, where it is assumed
-that the input <parts> files are sorted. Due to operating system limits, the number of
-<parts> files must be <= 252. With the -v option set the program reports the # of
-records read and written.
-
-Used correctly, LAmerge and LAsort together allow one to perform an "external" sort
-that produces a collection of sorted files containing in aggregate all the local
-alignments found by the daligner, such that their concatenation is sorted in order of
-(a,b,o,ab). In particular, this means that all the alignments for a given a-read will
-be found consecutively in one of the files. So computations that need to look at all
-the alignments for a given read can operate in simple sequential scans of these
-sorted files.
-
-
-4. LAshow [-caroUF] [-i<int(4)>] [-w<int(100)>] [-b<int(10)>]
- <src1:db|dam> [ <src2:db|dam> ] <align:las> [ <reads:FILE> | <reads:range> ... ]
-
-LAshow produces a printed listing of the local alignments contained in the specified
-.las file, where the a- and b-reads come from src1 or from src1 and scr2, respectively.
-If a file or list of read ranges is given then only the overlaps for which the a-read
-is in the set specified by the file or list are displayed. See DBshow for an explanation
-of how the file and list of read ranges are interpreted. If the -F option is set then
-the roles of the a- and b- reads are reversed in the display.
-
-If the -c option is given then a cartoon rendering is displayed, and if -a or -r option
-is set then an alignment of the local alignment is displayed. The -a option puts
-exactly -w columns per segment of the display, whereas the -r option puts exactly -w
-a-read symbols in each segment of the display. The -r display mode is useful when one
-wants to visually compare two alignments involving the same a-read. If a combination of
-the -c, -a, and -r flags is set, then the cartoon comes first, then the -a alignment,
-and lastly the -r alignment. The -i option sets the indent for the cartoon and/or
-alignment displays, if they are requested. The -b option sets the number of symbols on
-either side of the aligned segments in an alignment display, and -U specifies that
-uppercase should be used for DNA sequence instead of the default lowercase. If the
--o option is set then only alignments that are proper overlaps (a sequence end occurs
-at the each end of the alignment) are displayed.
-
-5. LAdump [-cdt] [-o] <src1:db|dam> [ <src2:db|dam> ]
- <align:las> [ <reads:FILE> | <reads:range> ... ]
-
-Like LAshow, LAdump allows one to display the local alignments (LAs) of a subset of the
-piles in an .las file and select which information to show about them. The difference
-is that the information is written in a very simple "1-code" ASCII format that makes it
-easy for one to read and parse the information for further use. For each LA the pair of
-reads is output on a line. -c requests that one further output the coordinates of the
-LA segments be output. The -d option requests that the number of difference in the LA
-be output, and -t requests that the tracepoint information be output. Finally, -o
-requests that only LAs that are proper overlaps be output.
-
-The format is very simple. Each requested piece of information occurs on a line. The
-first character of every line is a "1-code" character that tells you what information
-to expect on the line. The rest of the line contains information where each item is
-separated by a single blank space. The trace point line gives the number of trace
-point intervals in the LA and is immediately followed by that many lines containing
-a pair of integers giving the # of differences and b-displacement in each successive
-trace point interval.
-
- P #a #b - (#a,#b) have an LA between them
- C #ab #ae #bb #be - [#ab,#ae] aligns with [#bb,#be]
- D # - there are # differences in the LA
- T #n - there are #n trace point intervals for the LA
- (#d #y )^#n - there are #d difference aligning the #y bp's of B with the
- next fixed-size interval of A
- + X # - Total amount of X (X = P or T)
- % X # - Maximum amount of X in any pile (X = P or T)
- @ T # - Maximum number of trace points in any trace
-
-1-code lines that begin with +, %, or @ are always the first lines in the output.
-They give size information about what is contained in the output. Specifically,
-'+ X #' gives the total number of LAs (X=P), or the total number of trace point
-intervals (X=T) in the file . '% X #' gives the maximum number of LAs (X=P) or
-the maximum number of trace point intervals (X=T) in a given *pile* (collection of
-LAs all with the same a-read (applies only to sorted .las files). Finally @ T #
-gives the maximum # of trace point intervals in any trace within the file.
-
-6. LAindex -v <source:las> ...
-
-LAindex takes a series of one or more sorted .las files and produces a "pile
-index" for each one. If the input file has name "X.las", then the name of its
-index file is ".X.las.idx". For each A-read pile encoded in the .las file,
-the index contains the offset to the first local alignment with A in the file.
-The index starts with four 64-bit integers that encode the numbers % P, + T, % T,
-and @ T described for LAdump above, and then an offset for each pile beginning
-with the first A-read in the file (which may not be read 0). The index is meant
-to allow programs that process piles to more efficiently read just the piles
-they need at any momment int time, as opposed to having to sequentially scan
-through the .las file.
-
-7. LAcat <source:las> > <target>.las
-
-Given argument <source>, find all files <source>.1.las, <source>.2.las, ...
-<source>.n.<las> where <source>.i.las exists for every i in [1,n]. Then
-concatenate these files in order into a single .las file and pipe the result
-to the standard output.
-
-
-8. LAsplit <target:las> (<parts:int> | <path:db|dam>) < <source>.las
-
-If the second argument is an integer n, then divide the alignment file <source>, piped
-in through the standard input, as evenly as possible into n alignment files with the
-name <target>.i.las for i in [1,n], subject to the restriction that all alignment
-records for a given a-read are in the same file.
-
-If the second argument refers to a database <path>.db that has been partitioned, then
-divide the input alignment file into block .las files where all records whose a-read is
-in <path>.i.db are in <align>.i.las.
-
-
-9. LAcheck [-vS] <src1:db|dam> [ <src2:db|dam> ] <align:las> ...
-
-LAcheck checks each .las file for structural integrity, where the a- and b-sequences
-come from src1 or from src1 and scr2, respectively. That is, it makes sure each file
-makes sense as a plausible .las file, e.g. values are not out of bound, the number of
-records is correct, the number of trace points for a record is correct, and so on. If
-the -S option is set then it further checks that the alignments are in sorted order.
-If the -v option is set then a line is output for each .las file saying either the
-file is OK or reporting the first error. If the -v option is not set then the program
-runs silently. The exit status is 0 if every file is deemed good, and 1 if at least
-one of the files looks corrupted.
-
-
-10. HPCdaligner [-vbAI] [-k<int(14)>] [-w<int(6)>] [-h<int(35)>] [-t<int>] [-M<int>]
- [-e<double(.70)] [-l<int(1000)] [-s<int(100)>] [-H<int>]
- [-m<track>]+ [-dal<int(4)>] [-deg<int(25)>]
- <path:db|dam> [<first:int>[-<last:int>]]
-
-HPCdaligner writes a UNIX shell script to the standard output that consists of a
-sequence of commands that effectively run daligner on all pairs of blocks of a split
-database and then externally sorts and merges them using LAsort and LAmerge into a
-collection of alignment files with names <path>.#.las where # ranges from 1 to the
-number of blocks the data base is split into. These sorted files if concatenated by say
-LAcat would contain all the alignments in sorted order (of a-read, then b-read, ...).
-Moreover, all overlaps for a given a-read are guaranteed to not be split across files,
-so one can run artifact analyzers or error correction on each sorted file in parallel.
-
-The data base must have been previously split by DBsplit and all the parameters, except
--v, -dal, and -deg, are passed through to the calls to daligner. The defaults for these
-parameters are as for daligner. The -v flag, for verbose-mode, is also passed to all
-calls to LAsort and LAmerge. -dal and -deg options are described later. For a database
-divided into N sub-blocks, the calls to daligner will produce in total 2TN^2 .las files
-assuming daligner runs with T threads. These will then be sorted and merged into N^2
-sorted .las files, one for each block pair. These are then merged in ceil(log_deg N)
-phases where the number of files decreases geometrically in -deg until there is 1 file
-per row of the N x N block matrix. So at the end one has N sorted .las files that when
-concatenated would give a single large sorted overlap file.
-
-The -dal option (default 4) gives the desired number of block comparisons per call to
-daligner. Some must contain dal-1 comparisons, and the first dal-2 block comparisons
-even less, but the HPCdaligner "planner" does the best it can to give an average load
-of dal block comparisons per command. The -deg option (default 25) gives the maximum
-number of files that will be merged in a single LAmerge command. The planner makes the
-most even k-ary tree of merges, where the number of levels is ceil(log_deg N).
-
-If the integers <first> and <last> are missing then the script produced is for every
-block in the database. If <first> is present then HPCdaligner produces an incremental
-script that compares blocks <first> through <last> (<last> = <first> if not present)
-against each other and all previous blocks 1 through <first>-1, and then incrementally
-updates the .las files for blocks 1 through <first>-1, and creates the .las files for
-blocks <first> through <last>.
-
-Each UNIX command line output by the HPCdaligner can be a batch job (we use the &&
-operator to combine several commands into one line to make this so). Dependencies
-between jobs can be maintained simply by first running all the daligner jobs, then all
-the initial sort jobs, and then all the jobs in each phase of the external merge sort.
-Each of these phases is separated by an informative comment line for your scripting
-convenience.
-
-
-9. HPCmapper [-vb] [-k<int(20)>] [-w<int(6)>] [-h<int(50)>] [-t<int>] [-M<int>]
- [-e<double(.85)] [-l<int(1000)] [-s<int(100)>] [-H<int>]
- [-m<track>]+ [-dal<int(4)>] [-deg<int(25)>]
- <ref:db|dam> <reads:db|dam> [<first:int>[-<last:int>]]
-
-HPCmapper writes a UNIX shell script to the standard output that consists of a
-sequence of commands that effectively "maps" every read in the DB <reads> against a
-reference set of sequences in the DB <ref>, recording all the found local alignments
-in the sequence of files <ref>.<reads>.1.las, <ref>.<reads>.2.las, ... where
-<ref>.<reads>.k.las contains the alignments between all of <ref> and the k'th
-block of <reads>. The parameters are exactly the same as for HPCdaligner save that
-the -k, -h, and -e defaults are set appropriately for mapping, and the -A and -I
-options make no sense as <ref> and <reads> are expected to be distinct data sets.
-
-If the integers <first> and <last> are missing then the script produced is for every
-block in the database <reads>. If <first> is present then HPCmapper produces an
-script that compares blocks <first> through <last> (<last> = <first> if not present)
-against DAM <ref>.
-
-
-Example:
-
-// Recall G.db from the example in DAZZ_DB/README
-
-> cat G.db
-files = 1
- 1862 G Sim
-blocks = 2
-size = 11 cutoff = 0 all = 0
- 0 0
- 1024 1024
- 1862 1862
-> HPCdaligner -mdust -t5 G | csh -v // Run the HPCdaligner script
-
-# Dazzler jobs (2)
-dazzler -d -t5 -mdust G.1 G.1
-dazzler -d -t5 -mdust G.2 G.1 G.2
-# Initial sort jobs (4)
-LAsort G.1.G.1.*.las && LAmerge G.L1.1.1 G.1.G.1.*.S.las && rm G.1.G.1.*.S.las
-LAsort G.1.G.2.*.las && LAmerge G.L1.1.2 G.1.G.2.*.S.las && rm G.1.G.2.*.S.las
-LAsort G.2.G.1.*.las && LAmerge G.L1.2.1 G.2.G.1.*.S.las && rm G.2.G.1.*.S.las
-LAsort G.2.G.2.*.las && LAmerge G.L1.2.2 G.2.G.2.*.S.las && rm G.2.G.2.*.S.las
-# Level 1 jobs (2)
-LAmerge G.1 G.L1.1.1 G.L1.1.2 && rm G.L1.1.1.las G.L1.1.2.las
-LAmerge G.2 G.L1.2.1 G.L1.2.2 && rm G.L1.2.1.las G.L1.2.2.las
-
-> LAshow -c -a:G -w50 G.1 | more // Take a look at the result !
-
-G.1: 34,510 records
-
- 1 9 c [ 0.. 1,876] x [ 9,017..10,825] ( 18 trace pts)
-
- 12645
- A ---------+====> dif/(len1+len2) = 398/(1876+1808) = 21.61%
- B <====+---------
- 9017
-
- 1 ..........gtg-cggt--caggggtgcctgc-t-t-atcgcaatgtta
- |||*||||**||||||||*||||*|*|*||**|*|*||||
- 9008 gagaggccaagtggcggtggcaggggtg-ctgcgtcttatatccaggtta 27.5%
-
- 35 ta-ctgggtggttaaacttagccaggaaacctgttgaaataa-acggtgg
- ||*|||||||||||||*|**|*||*|*||||||*|**|||||*|*|||||
- 9057 tagctgggtggttaaa-tctg-ca-g-aacctg-t--aataacatggtgg 24.0%
-
- 83 -ctagtggcttgccgtttacccaacagaagcataatgaaa-tttgaaagt
- *||||||||*||||||||*||**||||*|||**|||||||*||||*||||
- 9100 gctagtggc-tgccgttt-ccgcacag-agc--aatgaaaatttg-aagt 20.0%
-
- 131 ggtaggttcctgctgtct-acatacagaacgacggagcgaaaaggtaccg
- ||*|||||||||||||*|*||||*|*|*||||||||||*||||||||||*
- 9144 gg-aggttcctgctgt-tcacat-c-ggacgacggagc-aaaaggtacc- 16.0%
-
-...
-
-> LAcat G >G.las // Combine G.1.las & G.2.las into a single .las file
-> LAshow G G | more // Take another look, now at G.las
-
-G: 62,654 records
- 1 9 c [ 0.. 1,876] x [ 9,017..10,825] : < 398 diffs ( 18 trace pts)
- 1 38 c [ 0.. 7,107] x [ 5,381..12,330] : < 1,614 diffs ( 71 trace pts)
- 1 49 n [ 5,493..14,521] x [ 0.. 9,065] : < 2,028 diffs ( 91 trace pts)
- 1 68 n [12,809..14,521] x [ 0.. 1,758] : < 373 diffs ( 17 trace pts)
- 1 147 c [ 0..13,352] x [ 854..14,069] : < 2,993 diffs (133 trace pts)
- 1 231 n [10,892..14,521] x [ 0.. 3,735] : < 816 diffs ( 37 trace pts)
- 1 292 c [ 3,835..14,521] x [ 0..10,702] : < 2,353 diffs (107 trace pts)
- 1 335 n [ 7,569..14,521] x [ 0.. 7,033] : < 1,544 diffs ( 70 trace pts)
- 1 377 c [ 9,602..14,521] x [ 0.. 5,009] : < 1,104 diffs ( 49 trace pts)
- 1 414 c [ 6,804..14,521] x [ 0.. 7,812] : < 1,745 diffs ( 77 trace pts)
- 1 415 c [ 0.. 3,613] x [ 7,685..11,224] : < 840 diffs ( 36 trace pts)
- 1 445 c [ 9,828..14,521] x [ 0.. 4,789] : < 1,036 diffs ( 47 trace pts)
- 1 464 n [ 0.. 1,942] x [12,416..14,281] : < 411 diffs ( 19 trace pts)
-
-...
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3af129a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,506 @@
+# Daligner: The Dazzler "Overlap" Module
+
+## _Author: Gene Myers_
+## _First: April 10, 2016_
+
+For typeset documentation, examples of use, and design philosophy please go to
+my [blog](https://dazzlerblog.wordpress.com/command-guides/damasker-commands).
+
+The commands below permit one to find all significant local alignments between reads
+encoded in Dazzler database. The assumption is that the reads are from a PACBIO RS II
+long read sequencer. That is the reads are long and noisy, up to 15% on average.
+
+Recall that a database has a current partition that divides it into blocks of a size
+that can conveniently be handled by calling the "dalign" overlapper on all the pairs of
+blocks producing a collection of .las local alignment files that can then be sorted and
+merged into an ordered sequence of sorted files containing all alignments between reads
+in the data set. The alignment records are parsimonious in that they do not record an
+alignment but simply a set of trace points, typically every 100bp or so, that allow the
+efficient reconstruction of alignments on demand.
+
+```
+1. daligner [-vbAI]
+ [-k<int(14)>] [-w<int(6)>] [-h<int(35)>] [-t<int>] [-M<int>]
+ [-e<double(.70)] [-l<int(1000)] [-s<int(100)>] [-H<int>] [-T<int(4)>]
+ [-m<track>]+ <subject:db|dam> <target:db|dam> ...
+```
+
+Compare sequences in the trimmed \<subject\> block against those in the list of \<target\>
+blocks searching for local alignments involving at least -l base pairs (default 1000)
+or more, that have an average correlation rate of -e (default 70%). The local
+alignments found will be output in a sparse encoding where a trace point on the
+alignment is recorded every -s base pairs of the a-read (default 100bp). Reads are
+compared in both orientations and local alignments meeting the criteria are output to
+one of several created files described below. The -v option turns on a verbose
+reporting mode that gives statistics on each major step of the computation. The
+program runs with 4 threads by default, but this may be set to any power of 2 with
+the -T option.
+
+The options -k, -h, and -w control the initial filtration search for possible matches
+between reads. Specifically, our search code looks for a pair of diagonal bands of
+width 2<sup>w</sup> (default 2<sup>6</sup> = 64) that contain a collection of exact matching k-mers
+(default 14) between the two reads, such that the total number of bases covered by the
+k-mer hits is h (default 35). k cannot be larger than 32 in the current implementation.
+If the -b option is set, then the daligner assumes the data has a strong compositional
+bias (e.g. >65% AT rich), and at the cost of a bit more time, dynamically adjusts k-mer
+sizes depending on compositional bias, so that the mers used have an effective
+specificity of 4<sup>k</sup>.
+
+If there are one or more interval tracks specified with the -m option, then the reads
+of the DB or DB's to which the mask applies are soft masked with the union of the
+intervals of all the interval tracks that apply, that is any k-mers that contain any
+bases in any of the masked intervals are ignored for the purposes of seeding a match.
+An interval track is a track, such as the "dust" track created by DBdust, that encodes
+a set of intervals over either the untrimmed or trimmed DB.
+
+Invariably, some k-mers are significantly over-represented (e.g. homopolymer runs).
+These k-mers create an excessive number of matching k-mer pairs and left unaddressed
+would cause daligner to overflow the available physical memory. One way to deal with
+this is to explicitly set the -t parameter which suppresses the use of any k-mer that
+occurs more than t times in either the subject or target block. However, a better way
+to handle the situation is to let the program automatically select a value of t that
+meets a given memory usage limit specified (in Gb) by the -M parameter. By default
+daligner will use the amount of physical memory as the choice for -M. If you want to
+use less, say only 8Gb on a 24Gb HPC cluster node because you want to run 3 daligner
+jobs on the node, then specify -M8. Specifying -M0 basically indicates that you do not
+want daligner to self adjust k-mer suppression to fit within a given amount of memory.
+
+Each found alignment is recorded as -- a[ab,ae] x b<sup>o</sup>[bb,be] -- where a and b are the
+indices (in the trimmed DB) of the reads that overlap, o indicates whether the b-read
+is from the same or opposite strand, and [ab,ae] and [bb,be] are the intervals of a
+and bo, respectively, that align. For each subject, target pair of blocks, say X and Y,
+the program reports alignments where the a-read is in X and the b-read is in Y, or
+vice versa. However, if the -A option is set ("A" for "asymmetric") then just overlaps
+where the a-read is in X and the b-read is in Y are reported, and if X = Y then it
+further reports only those overlaps where the a-read index is less than the b-read index.
+In either case, if the -I option is set ("I" for "identity") then when X = Y, overlaps
+between different portions of the same read will also be found and reported. In summary,
+the command "daligner -A X Y" produces a single file X.Y..las and "daligner X Y" produces
+2 files X.Y..las and Y.X.las (unless X=Y in which case only a single file, X.X.las, is
+produced). The overlap records in one of these files are sorted as described for LAsort.
+
+By default daligner compares all overlaps between reads in the database that are
+greater than the minimum cutoff set when the DB or DBs were split, typically 1 or
+2 Kbp. However, the HGAP assembly pipeline only wants to correct large reads, say
+8Kbp or over, and so needs only the overlaps where the a-read is one of the large
+reads. By setting the -H parameter to say N, one alters daligner so that it only
+reports overlaps where the a-read is over N base-pairs long.
+
+While the default parameter settings are good for raw Pacbio data, daligner can be used
+for efficiently finding alignments in corrected reads or other less noisy reads. For
+example, for mapping applications against .dams we run "daligner -k20 -h60 -e.85" and
+on corrected reads, we typically run "daligner -k25 -w5 -h60 -e.95 -s500" and at
+these settings it is very fast.
+
+```
+2. LAsort [-va] <align:las> ...
+```
+
+Sort each .las alignment file specified on the command line. For each file it reads in
+all the overlaps in the file and sorts them in lexicographical order of (a,b,o,ab)
+assuming each alignment is recorded as a[ab,ae] x b<sup>o</sup>[bb,be]. It then writes them all
+to a file named \<align\>.S.las (assuming that the input file was \<align\>.las). With the
+-v option set then the program reports the number of records read and written. If the
+-a option is set then it sorts LAs in lexicographical order of (a,ab) alone, which is
+desired when sorting a mapping of reads to a reference.
+
+If the .las file was produced by damapper the local alignments are organized into
+chains where the LA segments of a chain are consecutive and ordered in the file.
+LAsort can detects that it has been passed such a file and if so treats the chains as
+a unit and sorts them on the basis of the first LA in the chain.
+
+```
+3. LAmerge [-va] <merge:las> <parts:las> ...
+```
+
+Merge the .las files \<parts\> into a singled sorted file \<merge\>, where it is assumed
+that the input \<parts\> files are sorted. Due to operating system limits, the number of
+\<parts\> files must be ≤ 252. With the -v option set the program reports the # of
+records read and written. The -a option indicates the sort is as describe for LAsort
+above.
+
+If the .las file was produced by damapper the local alignments are organized into
+chains where the LA segments of a chain are consecutive and ordered in the file. When
+merging such files, LAmerge treats the chains as a unit and orders them on the basis
+of the first LA in the chain.
+
+Used correctly, LAmerge and LAsort together allow one to perform an "external" sort
+that produces a collection of sorted files containing in aggregate all the local
+alignments found by the daligner, such that their concatenation is sorted in order of
+(a,b,o,ab) (or (a,ab) if the -a option is set). In particular, this means that all the
+alignments for a given a-read will be found consecutively in one of the files. So
+computations that need to look at all the alignments for a given read can operate in
+simple sequential scans of these sorted files.
+
+```
+4. LAshow [-caroUF] [-i<int(4)>] [-w<int(100)>] [-b<int(10)>]
+ <src1:db|dam> [ <src2:db|dam> ]
+ <align:las> [ <reads:FILE> | <reads:range> ... ]
+```
+
+LAshow produces a printed listing of the local alignments contained in the specified
+.las file, where the a- and b-reads come from src1 or from src1 and scr2, respectively.
+If a file or list of read ranges is given then only the overlaps for which the a-read
+is in the set specified by the file or list are displayed. See DBshow for an explanation
+of how the file and list of read ranges are interpreted. If the -F option is set then
+the roles of the a- and b- reads are reversed in the display.
+
+If the -c option is given then a cartoon rendering is displayed, and if -a or -r option
+is set then an alignment of the local alignment is displayed. The -a option puts
+exactly -w columns per segment of the display, whereas the -r option puts exactly -w
+a-read symbols in each segment of the display. The -r display mode is useful when one
+wants to visually compare two alignments involving the same a-read. If a combination of
+the -c, -a, and -r flags is set, then the cartoon comes first, then the -a alignment,
+and lastly the -r alignment. The -i option sets the indent for the cartoon and/or
+alignment displays, if they are requested. The -b option sets the number of symbols on
+either side of the aligned segments in an alignment display, and -U specifies that
+uppercase should be used for DNA sequence instead of the default lowercase. If the
+-o option is set then only alignments that are proper overlaps (a sequence end occurs
+at the each end of the alignment) are displayed. If the -F option is given then the
+roles of the A- and B-reads are flipped.
+
+When examining LAshow output it is important to keep in mind that the coordinates
+describing an interval of a read are referring conceptually to positions between bases
+starting at 0 for the position to the left of the first base. That is, a coordinate c
+refers to the position between the c-1'st and c'th base, and the interval [b,e] captures
+the e-b bases from the b'th to the e-1'st, inclusive. We give an example with a cartoon
+and (part of an) alignment for which we will explain several additional
+important points:
+
+```
+ 1 1,865 c [18,479..20,216] x [ 1,707.. 0> ( 19 trace pts)
+
+ 18479 4235
+ A ========+----------+======> dif/(len1+len2) = 478/(1737+1707) = 27.76%
+ B <======+-----------
+ 5576
+
+ 18469 agccgcctag[tgcctcgcaaacgc-t-cggggcggcgt-gaaagcgg--
+ ::::::::::[||||||||||||||*|*|||*|||*|||*||||||||**
+ 1717 ctcttcttta[tgcctcgcaaacgccttcggcgcg-cgttgaaagcggtt 17.9%
+
+ 18513 -ccggtgggtc--agtggcgagttctggcagtgcgctggg-ctgcgaaat
+ *||||||*|||**|||||*||||*|*|*|||**|||||||*||*||||||
+ 1669 gccggtgcgtcgcagtgg-gagt-c-gtcag--cgctggggcttcgaaat 24.0%
+
+ . . .
+```
+
+The display of an LA always begins with a line giving the A-read, then the B-read, then
+an indication of orientation (i.e. are A and B from the same strand (n) or the opposite
+strand (c)?) followed by the A-interval and B-interval that are aligned. In particular,
+note carefully that when the B-read is in the complement orientation (c), then the
+B-interval gives the higher coordinate first, the idea being that one will align from
+the highest base down to the lowest base in the descending direction on B, complement
+the characters as you go. Further note that in the alignment display the coordinates at
+the start of each line follow this orientation convention and give the coordinate of the
+"tick mark" just left of the first character in each line. It is useful to know if an
+interval reaches the end of read, and to signal this we use an angle-bracket \<\> instead
+of a square bracket [], e.g. in the example the B-segment starts at the beginning of the
+read. Finally, observe that in the cartoon the numbers are not coordinates but rather
+indicate the lengths of the unaligned bits left and right of the two aligned intervals.
+Finally, observe that in the cartoon the numbers are not coordinates but rather indicate
+the lengths of the unaligned bits left and right of the two aligned intervals.
+
+With the introduction of damapper, .las files can now contain chains. If LAshow detects
+that it has been passed a file with chain information then it displays marks at the left
+that reveal the chain structure, e.g.:
+
+```
+ > 117 37,630 c [ 253.. 7,980] x [ 331,430.. 324,027] ~ 10.5%
+ + 117 37,628 n [ 253.. 7,983] x [21,493,673..21,501,079] ~ 10.6%
+ + 117 57 c [ 253.. 1,086] x [ 2,008,164.. 2,007,369] ~ 9.8%
+ - 117 57 c [ 1,300.. 7,982] x [ 2,007,351.. 2,000,945] ~ 10.7%
+ > 117 15 c [ 7,992.. 8,716] x [ 242,529.. 241,822] ~ 7.8%
+ - 117 15 c [ 8,752..14,299] x [ 241,824.. 236,425] ~ 10.7%
+ - 117 15 c [14,133..14,832] x [ 236,630.. 235,953] ~ 12.1%
+ + 117 37,628 n [ 7,992.. 8,716] x [19,202,357..19,203,064] ~ 7.7%
+ - 117 37,628 n [ 8,752..14,832] x [19,203,062..19,208,974] ~ 10.9%
+```
+
+A chain begins with either a > or + character, where > indicates this is the highest
+scoring chain and + indicates an alternate near optimal chain (controlled by the
+-n parameter to damapper). Each additional LA of a chain is marked with a - character.
+
+```
+5. LAdump [-cdt] [-o] <src1:db|dam> [ <src2:db|dam> ]
+ <align:las> [ <reads:FILE> | <reads:range> ... ]
+```
+
+Like LAshow, LAdump allows one to display the local alignments (LAs) of a subset of the
+piles in an .las file and select which information to show about them. The difference
+is that the information is written in a very simple "1-code" ASCII format that makes it
+easy for one to read and parse the information for further use. For each LA the pair of
+reads is output on a line. -c requests that one further output the coordinates of the
+LA segments be output. The -d option requests that the number of difference in the LA
+be output, and -t requests that the tracepoint information be output. Finally, -o
+requests that only LAs that are proper overlaps be output.
+
+The format is very simple. Each requested piece of information occurs on a line. The
+first character of every line is a "1-code" character that tells you what information
+to expect on the line. The rest of the line contains information where each item is
+separated by a single blank space. The trace point line gives the number of trace
+point intervals in the LA and is immediately followed by that many lines containing
+a pair of integers giving the # of differences and b-displacement in each successive
+trace point interval.
+
+```
+ P #a #b #o #c - (#a,#b^#o) have an LA between them where #o is 'n' or 'c' and
+ #c is '>' (start of best chain), '+' (start of alternate chain),
+ '-' (continuation of chain), or '.' (no chains in file)
+ C #ab #ae #bb #be - #a[#ab,#ae] aligns with #b^#o[#bb,#be]
+ D # - there are # differences in the LA
+ T #n - there are #n trace point intervals for the LA
+ (#d #y )^#n - there are #d difference aligning the #y bp's of B with the
+ next fixed-size interval of A
+ + X # - Total amount of X (X = P or T)
+ % X # - Maximum amount of X in any pile (X = P or T)
+ @ T # - Maximum number of trace points in any trace
+```
+
+1-code lines that begin with +, %, or @ are always the first lines in the output.
+They give size information about what is contained in the output. Specifically,
+'+ X #' gives the total number of LAs (X=P), or the total number of trace point
+intervals (X=T) in the file . '% X #' gives the maximum number of LAs (X=P) or
+the maximum number of trace point intervals (X=T) in a given *pile* (collection of
+LAs all with the same a-read (applies only to sorted .las files). Finally @ T #
+gives the maximum # of trace point intervals in any trace within the file.
+
+```
+6. LAindex -v <source:las> ...
+```
+
+LAindex takes a series of one or more sorted .las files and produces a "pile
+index" for each one. If the input file has name "X.las", then the name of its
+index file is ".X.las.idx". For each A-read pile encoded in the .las file,
+the index contains the offset to the first local alignment with A in the file.
+The index starts with four 64-bit integers that encode the numbers % P, + T, % T,
+and @ T described for LAdump above, and then an offset for each pile beginning
+with the first A-read in the file (which may not be read 0). The index is meant
+to allow programs that process piles to more efficiently read just the piles
+they need at any momment int time, as opposed to having to sequentially scan
+through the .las file.
+
+```
+7. LAcat [-v] <source:las> > <target>.las
+```
+
+Given template name \<source\> that contains a single #-sign somewhere within it,
+find all files that match it when the # is replace by i for i in 1,2,3,... and
+a .las extension is added if not present. Then concatenate these files in order
+into a single .las file and pipe the result to the standard output. The -v
+option reports the files concatenated and the number of la's within them to
+standard error (as the standard output receives the concatenated file).
+
+```
+8. LAsplit [-v] <target:las> (<parts:int> | <path:db|dam>) < <source>.las
+```
+
+If the second argument is an integer n, then divide the alignment file \<source\>, piped
+in through the standard input, as evenly as possible into n alignment files with the
+names specified by template \<target\>, subject to the restriction that all alignment
+records for a given a-read are in the same file. The name of the n files is the
+string \<target\> where the single #-sign that occurs somewhere in it is replaced
+by i for i in [1,n] and a .las extension is added if necessary.
+
+If the second argument refers to a database \<path\>.db that has been partitioned, then
+divide the input alignment file into block .las files where all records whose a-read is
+in \<path\>.i.db are in the i'th file generated from the template \<target\>. The -v
+option reports the files produced and the number of la's within them to standard error.
+
+```
+9. LAcheck [-vS] <src1:db|dam> [ <src2:db|dam> ] <align:las> ...
+```
+
+LAcheck checks each .las file for structural integrity, where the a- and b-sequences
+come from src1 or from src1 and scr2, respectively. That is, it makes sure each file
+makes sense as a plausible .las file, e.g. values are not out of bound, the number of
+records is correct, the number of trace points for a record is correct, and so on. If
+the -S option is set then it further checks that the alignments are in sorted order.
+If the -v option is set then a line is output for each .las file saying either the
+file is OK or reporting the first error. If the -v option is not set then the program
+runs silently. The exit status is 0 if every file is deemed good, and 1 if at least
+one of the files looks corrupted.
+
+With the introduction of damapper, LAcheck checks to see if a file has chain
+information, and if it does, then it checks the validity of chains and assumes that
+the chains were sorted with the -a option to LAsort and LAmerge.
+
+```
+10. HPC.daligner [-vbad] [-t<int>] [-w<int(6)>] [-l<int(1000)] [-s<int(100)]
+ [-M<int>] [-B<int(4)>] [-D<int( 250)>] [-T<int(4)>] [-f<name>]
+ ( [-k<int(14)>] [-h<int(35)>] [-e<double(.70)] [-H<int>]
+ [-k<int(20)>] [-h<int(50)>] [-e<double(.85)] <ref:db|dam> )
+ [-m<track>]+ <reads:db|dam> [<first:int>[-<last:int>]]
+```
+
+HPC.daligner writes a UNIX shell script to the standard output or to a series of files
+beginning with the prefix \<name\> if the -f option is set, that either performs an
+"overlap" computation on all the blocks in a single database, or a "comparison"
+computation on all pairs of blocks between two databases, depending on whether it is
+given one or two DB's as arguments (\<ref\> and \<reads\>). We describe the overlap
+script first and its effect first and then later the comparison script.
+
+An Overlap Script: consists of a sequence of commands that effectively run daligner on
+all pairs of blocks of a split database and then externally sorts and merges them using
+LAsort and LAmerge into a collection of alignment files with names \<path\>.#.las where #
+ranges from 1 to the number of blocks the data base is split into. These sorted files
+if concatenated by say LAcat would contain all the alignments in sorted order (of
+a-read, then b-read, ...). Moreover, all overlaps for a given a-read are guaranteed
+to not be split across files, so one can run artifact analyzers or error correction on
+each sorted file in parallel.
+
+The data base must have been previously split by DBsplit and all the parameters, except
+-a, -d, -f, -B, and -D, are passed through to the calls to daligner. The defaults for
+these parameters are as for daligner. The -v and -a flags are passed to all calls to
+LAsort and LAmerge. All other options are described later. For a database divided into
+N sub-blocks, the calls to daligner will produce in total N<sup>2</sup> .las files,
+on per block pair.
+These are then merged in ceil(log<sub>D</sub> N) phases where
+the number of files decreases geometrically in -D until there is 1 file per row of
+the N x N block matrix. So at the end one has N sorted .las files that when
+concatenated would give a single large sorted overlap file.
+
+The -B option (default 4) gives the desired number of block comparisons per call to
+daligner. Some must contain B-1 comparisons, and the first B-2 block comparisons
+even less, but the HPCdaligner "planner" does the best it can to give an average load
+of dal block comparisons per command. The -D option (default 250) gives the maximum
+number of files that will be merged in a single LAmerge command. The planner performs
+D-way merges at all of the ceil(log<sub>D</sub> N) levels save the last, so as to minimize the
+number of intermediate files.
+
+If the integers \<first\> and \<last\> are missing then the script produced is for every
+block in the database. If \<first\> is present then HPCdaligner produces an incremental
+script that compares blocks \<first\> through \<last\> (\<last\> = \<first\> if not present)
+against each other and all previous blocks 1 through \<first\>-1, and then incrementally
+updates the .las files for blocks 1 through \<first\>-1, and creates the .las files for
+blocks \<first\> through \<last\>.
+
+A Comparison Script: consists of a sequence of commands that effectively maps every
+read in the DB \<reads\> against a reference set of sequences in the DB \<ref\>, recording
+all the found local alignments in the sequence of files \<reads\>.1.\<ref\>.las,
+\<reads\>.2.\<ref\>.las, ... where \<reads\>.\<ref\>.k.las contains the alignments between all
+of \<ref\> and the k'th block of \<reads\>. The parameters are exactly the same as for the
+overlap script save that the -k, -h, and -e defaults are set more stringently for
+mapping, and the -A, -I , and -H options make no sense as \<ref\> and \<reads\> are
+expected to be distinct data sets. If the integers \<first\> and \<last\> are missing then
+the script produced is for every block in the database \<reads\>. If \<first\> is present
+then HPC.daligner produces a script that compares blocks \<first\> through \<last\> (\<last\>
+= \<first\> if not present) of \<reads\> against DAM \<ref\>.
+
+The command scripts output by HPC.daligner and other HPC.\<x\> programs consists of
+command blocks each of which begins with a comment line (begins with #) followed by a
+potentially long list of lines each containing a shell command. Command blocks whose
+comment mentions "jobs" and gives the number of said in parenthesis, we call parallel
+blocks because each command line in the block can be sent to a node in a cluster for
+independent execution, i.e. none of the commands in a block depend on another in the
+block. The remaining command blocks we call house-keeping blocks because they can be
+executed by the shell on the launch/server node and the commands are either checking
+the integrity of .las files with LAcheck, or removing intermediate files with rm. Each
+block should be performed in the order given and should complete before the next block
+is performed.
+
+If the -f option is set, then each command block is written to a file with a name of
+the form \<name\>.#.\<description\> where \<name\> is specified by the user in the -f option
+argument, # gives the order in which the command block in the given file is to be
+performed in relation to other command block files, and \<description\> is a (very)
+short symbolic reminder of what the block is doing. For example, "HPC.daligner -fJOBS
+DB" would produce the files:
+
+```
+ JOBS.01.OVL
+ JOBS.02.CHECK.OPT
+ JOBS.03.MERGE
+ JOBS.04.CHECK.OPT
+ JOBS.05.RM.OPT
+```
+
+The number of command blocks varies as it depends on the number of merging rounds
+required in the external sort of the .las files. The files with the suffix .OPT are
+optional and need not be executed albeit we highly recommend that one run all the
+CHECK blocks.
+
+A new -d option requests scripts that organize files into a collection of
+sub-directories so as not to overwhelm the underlying OS for large genomes. Recall
+that for a DB divided into N blocks, the daligner will produce N<sup>2</sup> .las-files.
+With the -d option set, N sub-directories (with respect to the directory HPC.daligner is
+called in) of the form "work\<i\>" for i from 1 to N are created in an initial command
+block, and then all work files are placed in those sub-directories, with a maximum
+of 2N files appearing in any sub-directory at any given point in the process.
+
+Example:
+
+```
+// Recall G.db from the example in DAZZ_DB/README
+
+> cat G.db
+files = 1
+ 1862 G Sim
+blocks = 2
+size = 11 cutoff = 0 all = 0
+ 0 0
+ 1024 1024
+ 1862 1862
+> HPCdaligner -mdust -t5 G | csh -v // Run the HPCdaligner script
+
+# Dazzler jobs (2)
+dazzler -d -t5 -mdust G.1 G.1
+dazzler -d -t5 -mdust G.2 G.1 G.2
+# Initial sort jobs (4)
+LAsort G.1.G.1.*.las && LAmerge G.L1.1.1 G.1.G.1.*.S.las && rm G.1.G.1.*.S.las
+LAsort G.1.G.2.*.las && LAmerge G.L1.1.2 G.1.G.2.*.S.las && rm G.1.G.2.*.S.las
+LAsort G.2.G.1.*.las && LAmerge G.L1.2.1 G.2.G.1.*.S.las && rm G.2.G.1.*.S.las
+LAsort G.2.G.2.*.las && LAmerge G.L1.2.2 G.2.G.2.*.S.las && rm G.2.G.2.*.S.las
+# Level 1 jobs (2)
+LAmerge G.1 G.L1.1.1 G.L1.1.2 && rm G.L1.1.1.las G.L1.1.2.las
+LAmerge G.2 G.L1.2.1 G.L1.2.2 && rm G.L1.2.1.las G.L1.2.2.las
+
+> LAshow -c -a:G -w50 G.1 | more // Take a look at the result !
+
+G.1: 34,510 records
+
+ 1 9 c [ 0.. 1,876] x [ 9,017..10,825] ( 18 trace pts)
+
+ 12645
+ A ---------+====> dif/(len1+len2) = 398/(1876+1808) = 21.61%
+ B <====+---------
+ 9017
+
+ 1 ..........gtg-cggt--caggggtgcctgc-t-t-atcgcaatgtta
+ |||*||||**||||||||*||||*|*|*||**|*|*||||
+ 9008 gagaggccaagtggcggtggcaggggtg-ctgcgtcttatatccaggtta 27.5%
+
+ 35 ta-ctgggtggttaaacttagccaggaaacctgttgaaataa-acggtgg
+ ||*|||||||||||||*|**|*||*|*||||||*|**|||||*|*|||||
+ 9057 tagctgggtggttaaa-tctg-ca-g-aacctg-t--aataacatggtgg 24.0%
+
+ 83 -ctagtggcttgccgtttacccaacagaagcataatgaaa-tttgaaagt
+ *||||||||*||||||||*||**||||*|||**|||||||*||||*||||
+ 9100 gctagtggc-tgccgttt-ccgcacag-agc--aatgaaaatttg-aagt 20.0%
+
+ 131 ggtaggttcctgctgtct-acatacagaacgacggagcgaaaaggtaccg
+ ||*|||||||||||||*|*||||*|*|*||||||||||*||||||||||*
+ 9144 gg-aggttcctgctgt-tcacat-c-ggacgacggagc-aaaaggtacc- 16.0%
+
+...
+
+> LAcat G >G.las // Combine G.1.las & G.2.las into a single .las file
+> LAshow G G | more // Take another look, now at G.las
+
+G: 62,654 records
+ 1 9 c [ 0.. 1,876] x [ 9,017..10,825] : < 398 diffs ( 18 trace pts)
+ 1 38 c [ 0.. 7,107] x [ 5,381..12,330] : < 1,614 diffs ( 71 trace pts)
+ 1 49 n [ 5,493..14,521] x [ 0.. 9,065] : < 2,028 diffs ( 91 trace pts)
+ 1 68 n [12,809..14,521] x [ 0.. 1,758] : < 373 diffs ( 17 trace pts)
+ 1 147 c [ 0..13,352] x [ 854..14,069] : < 2,993 diffs (133 trace pts)
+ 1 231 n [10,892..14,521] x [ 0.. 3,735] : < 816 diffs ( 37 trace pts)
+ 1 292 c [ 3,835..14,521] x [ 0..10,702] : < 2,353 diffs (107 trace pts)
+ 1 335 n [ 7,569..14,521] x [ 0.. 7,033] : < 1,544 diffs ( 70 trace pts)
+ 1 377 c [ 9,602..14,521] x [ 0.. 5,009] : < 1,104 diffs ( 49 trace pts)
+ 1 414 c [ 6,804..14,521] x [ 0.. 7,812] : < 1,745 diffs ( 77 trace pts)
+ 1 415 c [ 0.. 3,613] x [ 7,685..11,224] : < 840 diffs ( 36 trace pts)
+ 1 445 c [ 9,828..14,521] x [ 0.. 4,789] : < 1,036 diffs ( 47 trace pts)
+ 1 464 n [ 0.. 1,942] x [12,416..14,281] : < 411 diffs ( 19 trace pts)
+
+...
+```
diff --git a/align.c b/align.c
index de55eed..eb877d4 100644
--- a/align.c
+++ b/align.c
@@ -322,7 +322,7 @@ typedef struct
static int VectorEl = 6*sizeof(int) + sizeof(BVEC);
static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath,
- int *mind, int maxd, int mida, int minp, int maxp)
+ int *mind, int maxd, int mida, int minp, int maxp, int aoff, int boff)
{ char *aseq = align->aseq;
char *bseq = align->bseq;
Path *apath = align->path;
@@ -339,7 +339,7 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
int *NA, *NB;
int *_NA, *_NB;
Pebble *cells;
- int avail, cmax, boff;
+ int avail, cmax;
int TRACE_SPACE = spec->trace_space;
int PATH_AVE = spec->ave_path;
@@ -385,11 +385,6 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
cells = (Pebble *) (work->cells);
cmax = work->celmax;
avail = 0;
-
- if (COMP(align->flags))
- boff = align->blen % TRACE_SPACE;
- else
- boff = 0;
}
/* Compute 0-wave starting from mid-line */
@@ -426,7 +421,7 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
work->cells = (void *) cells;
}
- na = ((y+k)/TRACE_SPACE)*TRACE_SPACE;
+ na = (((y+k)+(TRACE_SPACE-aoff))/TRACE_SPACE-1)*TRACE_SPACE+aoff;
#ifdef SHOW_TPS
printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout);
#endif
@@ -978,15 +973,6 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
apath->bepos = trimy;
apath->diffs = trimd;
apath->tlen = atlen;
- if (COMP(align->flags))
- { bpath->abpos = align->blen - apath->bepos;
- bpath->bbpos = align->alen - apath->aepos;
- }
- else
- { bpath->aepos = apath->bepos;
- bpath->bepos = apath->aepos;
- }
- bpath->diffs = trimd;
bpath->tlen = btlen;
}
@@ -997,7 +983,7 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
/*** Reverse Wave ***/
static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath,
- int mind, int maxd, int mida, int minp, int maxp)
+ int mind, int maxd, int mida, int minp, int maxp, int aoff, int boff)
{ char *aseq = align->aseq - 1;
char *bseq = align->bseq - 1;
Path *apath = align->path;
@@ -1014,7 +1000,7 @@ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
int *NA, *NB;
int *_NA, *_NB;
Pebble *cells;
- int avail, cmax, boff;
+ int avail, cmax;
int TRACE_SPACE = spec->trace_space;
int PATH_AVE = spec->ave_path;
@@ -1060,11 +1046,6 @@ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
cells = (Pebble *) (work->cells);
cmax = work->celmax;
avail = 0;
-
- if (COMP(align->flags))
- boff = align->blen % TRACE_SPACE;
- else
- boff = 0;
}
more = 1;
@@ -1099,7 +1080,7 @@ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
work->cells = (void *) cells;
}
- na = ((y+k+TRACE_SPACE-1)/TRACE_SPACE-1)*TRACE_SPACE;
+ na = (((y+k)+(TRACE_SPACE-aoff)-1)/TRACE_SPACE-1)*TRACE_SPACE+aoff;
#ifdef SHOW_TPS
printf(" A %d: -1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout);
#endif
@@ -1572,7 +1553,7 @@ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
#ifdef SHOW_TRAIL
printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout);
#endif
- if ((b+k)%TRACE_SPACE != 0)
+ if ((b+k)%TRACE_SPACE != aoff)
{ h = cells[h].ptr;
if (h < 0)
{ a = trimy;
@@ -1700,15 +1681,6 @@ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, P
apath->diffs = apath->diffs + trimd;
apath->tlen = apath->tlen - atlen;
apath->trace = atrace + atlen;
- if (COMP(align->flags))
- { bpath->aepos = align->blen - apath->bbpos;
- bpath->bepos = align->alen - apath->abpos;
- }
- else
- { bpath->abpos = apath->bbpos;
- bpath->bbpos = apath->abpos;
- }
- bpath->diffs = bpath->diffs + trimd;
bpath->tlen = bpath->tlen - btlen;
bpath->trace = btrace + btlen;
}
@@ -1727,6 +1699,7 @@ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec,
_Align_Spec *spec = (_Align_Spec *) espec;
Path *apath, *bpath;
+ int aoff, boff;
int minp, maxp;
int selfie;
@@ -1783,7 +1756,20 @@ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec,
else
maxp = hgh+hbord;
- if (forward_wave(work,spec,align,bpath,&low,hgh,anti,minp,maxp))
+ if (ACOMP(align->flags))
+ { aoff = align->alen % spec->trace_space;
+ boff = 0;
+ }
+ else if (COMP(align->flags))
+ { aoff = 0;
+ boff = align->blen % spec->trace_space;
+ }
+ else
+ { aoff = 0;
+ boff = 0;
+ }
+
+ if (forward_wave(work,spec,align,bpath,&low,hgh,anti,minp,maxp,aoff,boff))
EXIT(NULL);
#ifdef DEBUG_PASSES
@@ -1792,7 +1778,7 @@ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec,
apath->aepos,apath->bepos,apath->diffs);
#endif
- if (reverse_wave(work,spec,align,bpath,low,low,anti,minp,maxp))
+ if (reverse_wave(work,spec,align,bpath,low,low,anti,minp,maxp,aoff,boff))
EXIT(NULL);
#ifdef DEBUG_PASSES
@@ -1800,11 +1786,43 @@ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec,
(anti+low)/2,(anti-low)/2,apath->abpos,apath->bbpos,apath->diffs);
#endif
- if (COMP(align->flags))
+ bpath->diffs = apath->diffs;
+ if (ACOMP(align->flags))
+ { uint16 *trace = (uint16 *) apath->trace;
+ uint16 p;
+ int i, j;
+
+ bpath->aepos = apath->bepos;
+ bpath->bepos = apath->aepos;
+ bpath->abpos = apath->bbpos;
+ bpath->bbpos = apath->abpos;
+
+ apath->abpos = align->alen - bpath->bepos;
+ apath->bbpos = align->blen - bpath->aepos;
+ apath->aepos = align->alen - bpath->bbpos;
+ apath->bepos = align->blen - bpath->abpos;
+ i = apath->tlen-2;
+ j = 0;
+ while (j < i)
+ { p = trace[i];
+ trace[i] = trace[j];
+ trace[j] = p;
+ p = trace[i+1];
+ trace[i+1] = trace[j+1];
+ trace[j+1] = p;
+ i -= 2;
+ j += 2;
+ }
+ }
+ else if (COMP(align->flags))
{ uint16 *trace = (uint16 *) bpath->trace;
uint16 p;
int i, j;
+ bpath->abpos = align->blen - apath->bepos;
+ bpath->bbpos = align->alen - apath->aepos;
+ bpath->aepos = align->blen - apath->bbpos;
+ bpath->bepos = align->alen - apath->abpos;
i = bpath->tlen-2;
j = 0;
while (j < i)
@@ -1818,13 +1836,19 @@ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec,
j += 2;
}
}
+ else
+ { bpath->aepos = apath->bepos;
+ bpath->bepos = apath->aepos;
+ bpath->abpos = apath->bbpos;
+ bpath->bbpos = apath->abpos;
+ }
#ifdef DEBUG_POINTS
{ uint16 *trace = (uint16 *) apath->trace;
int a, h;
printf("\nA-path (%d,%d)->(%d,%d)",apath->abpos,apath->bbpos,apath->aepos,apath->bepos);
- printf(" %c\n",(COMP(align->flags) ? 'c' : 'n'));
+ printf(" %c\n",((COMP(align->flags) || ACOMP(align->flags)) ? 'c' : 'n'));
a = apath->bbpos;
for (h = 1; h < apath->tlen; h += 2)
{ int dif = trace[h-1];
@@ -1838,7 +1862,8 @@ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec,
int a, h;
printf("\nB-path (%d,%d)->(%d,%d)",bpath->abpos,bpath->bbpos,bpath->aepos,bpath->bepos);
- printf(" %c [%d,%d]\n",(COMP(align->flags) ? 'c' : 'n'),align->blen,align->alen);
+ printf(" %c [%d,%d]\n",((COMP(align->flags) || ACOMP(align->flags)) ? 'c' : 'n'),
+ align->blen,align->alen);
a = bpath->bbpos;
for (h = 1; h < bpath->tlen; h += 2)
{ int dif = trace[h-1];
@@ -3221,6 +3246,7 @@ int Print_Alignment(FILE *file, Alignment *align, Work_Data *ework,
char mtag, dtag;
int prefa, prefb;
int aend, bend;
+ int comp, blen;
int sa, sb;
int match, diff;
char *N2A;
@@ -3250,6 +3276,9 @@ int Print_Alignment(FILE *file, Alignment *align, Work_Data *ework,
aend = align->path->aepos;
bend = align->path->bepos;
+ comp = COMP(align->flags);
+ blen = align->blen;
+
Abuf[width] = Bbuf[width] = Dbuf[width] = '\0';
/* buffer/output next column */
#define COLUMN(x,y) \
@@ -3258,15 +3287,18 @@ int Print_Alignment(FILE *file, Alignment *align, Work_Data *ework,
{ fprintf(file,"\n"); \
fprintf(file,"%*s",indent,""); \
if (coord > 0) \
- { if (sa <= aend) \
+ { if (sa < aend) \
fprintf(file," %*d",coord,sa); \
else \
fprintf(file," %*s",coord,""); \
fprintf(file," %s\n",Abuf); \
fprintf(file,"%*s %*s %s\n",indent,"",coord,"",Dbuf); \
fprintf(file,"%*s",indent,""); \
- if (sb <= bend) \
- fprintf(file," %*d",coord,sb); \
+ if (sb < bend) \
+ if (comp) \
+ fprintf(file," %*d",coord,blen-sb); \
+ else \
+ fprintf(file," %*d",coord,sb); \
else \
fprintf(file," %*s",coord,""); \
fprintf(file," %s",Bbuf); \
@@ -3278,8 +3310,8 @@ int Print_Alignment(FILE *file, Alignment *align, Work_Data *ework,
} \
fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); \
o = 0; \
- sa = i; \
- sb = j; \
+ sa = i-1; \
+ sb = j-1; \
match = diff = 0; \
} \
u = (x); \
@@ -3313,8 +3345,8 @@ int Print_Alignment(FILE *file, Alignment *align, Work_Data *ework,
prefb = border;
}
- sa = i;
- sb = j;
+ sa = i-1;
+ sb = j-1;
mtag = ':';
dtag = ':';
@@ -3422,15 +3454,18 @@ int Print_Alignment(FILE *file, Alignment *align, Work_Data *ework,
fprintf(file,"\n");
fprintf(file,"%*s",indent,"");
if (coord > 0)
- { if (sa <= aend)
+ { if (sa < aend)
fprintf(file," %*d",coord,sa);
else
fprintf(file," %*s",coord,"");
fprintf(file," %.*s\n",o,Abuf);
fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf);
fprintf(file,"%*s",indent,"");
- if (sb <= bend)
- fprintf(file," %*d",coord,sb);
+ if (sb < bend)
+ if (comp)
+ fprintf(file," %*d",coord,blen-sb);
+ else
+ fprintf(file," %*d",coord,sb);
else
fprintf(file," %*s",coord,"");
fprintf(file," %.*s",o,Bbuf);
@@ -3461,6 +3496,7 @@ int Print_Reference(FILE *file, Alignment *align, Work_Data *ework,
char mtag, dtag;
int prefa, prefb;
int aend, bend;
+ int comp, blen;
int sa, sb, s0;
int match, diff;
char *N2A;
@@ -3494,21 +3530,27 @@ int Print_Reference(FILE *file, Alignment *align, Work_Data *ework,
aend = align->path->aepos;
bend = align->path->bepos;
+ comp = COMP(align->flags);
+ blen = align->blen;
+
#define BLOCK(x,y) \
{ int u, v; \
if (i%block == 1 && i != s0 && x < 4 && o > 0) \
{ fprintf(file,"\n"); \
fprintf(file,"%*s",indent,""); \
if (coord > 0) \
- { if (sa <= aend) \
+ { if (sa < aend) \
fprintf(file," %*d",coord,sa); \
else \
fprintf(file," %*s",coord,""); \
fprintf(file," %.*s\n",o,Abuf); \
fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); \
fprintf(file,"%*s",indent,""); \
- if (sb <= bend) \
- fprintf(file," %*d",coord,sb); \
+ if (sb < bend) \
+ if (comp) \
+ fprintf(file," %*d",coord,blen-sb); \
+ else \
+ fprintf(file," %*d",coord,sb); \
else \
fprintf(file," %*s",coord,""); \
fprintf(file," %.*s",o,Bbuf); \
@@ -3520,8 +3562,8 @@ int Print_Reference(FILE *file, Alignment *align, Work_Data *ework,
} \
fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); \
o = 0; \
- sa = i; \
- sb = j; \
+ sa = i-1; \
+ sb = j-1; \
match = diff = 0; \
} \
u = (x); \
@@ -3567,8 +3609,8 @@ int Print_Reference(FILE *file, Alignment *align, Work_Data *ework,
}
s0 = i;
- sa = i;
- sb = j;
+ sa = i-1;
+ sb = j-1;
mtag = ':';
dtag = ':';
@@ -3676,15 +3718,18 @@ int Print_Reference(FILE *file, Alignment *align, Work_Data *ework,
fprintf(file,"\n");
fprintf(file,"%*s",indent,"");
if (coord > 0)
- { if (sa <= aend)
+ { if (sa < aend)
fprintf(file," %*d",coord,sa);
else
fprintf(file," %*s",coord,"");
fprintf(file," %.*s\n",o,Abuf);
fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf);
fprintf(file,"%*s",indent,"");
- if (sb <= bend)
- fprintf(file," %*d",coord,sb);
+ if (sb < bend)
+ if (comp)
+ fprintf(file," %*d",coord,blen-sb);
+ else
+ fprintf(file," %*d",coord,sb);
else
fprintf(file," %*s",coord,"");
fprintf(file," %.*s",o,Bbuf);
diff --git a/align.h b/align.h
index e937b68..c3b31ae 100644
--- a/align.h
+++ b/align.h
@@ -113,11 +113,30 @@ typedef struct
the sequence before calling Compute_Trace or Print_Alignment. Complement_Seq complements
the sequence a of length n. The operation does the complementation/reversal in place.
Calling it a second time on a given fragment restores it to its original state.
+
+ With the introduction of the DAMAPPER, we need to code chains of alignments between a
+ pair of sequences. The alignments of a chain are expected to be found in order either on
+ a file or in memory, where the START_FLAG marks the first alignment and the NEXT_FLAG all
+ subsequent alignmenst in a chain. A chain of a single LA is marked with the START_FLAG.
+ The BEST_FLAG marks one of the best chains for a pair of sequences. The convention is
+ that either every record has either a START- or NEXT-flag, or none of them do (e.g. as
+ produced by daligner), so one can always check the flags of the first alignment to see
+ whether or not the chain concept applies to a given collection or not.
***/
-#define COMP(x) ((x) & 0x1)
+#define COMP_FLAG 0x1
+#define ACOMP_FLAG 0x2 // A-sequence is complemented, not B ! Only Local_Alignment notices
+
+#define COMP(x) ((x) & COMP_FLAG)
+#define ACOMP(x) ((x) & ACOMP_FLAG)
+
+#define START_FLAG 0x4 // LA is the first of a chain of 1 or more la's
+#define NEXT_FLAG 0x8 // LA is the next segment of a chain.
+#define BEST_FLAG 0x10 // This is the start of the best chain
-#define COMP_FLAG 0x1
+#define CHAIN_START(x) ((x) & START_FLAG)
+#define CHAIN_NEXT(x) ((x) & NEXT_FLAG)
+#define BEST_CHAIN(x) ((x) & BEST_FLAG)
typedef struct
{ Path *path;
diff --git a/daligner.c b/daligner.c
index 2914e05..a25de9d 100644
--- a/daligner.c
+++ b/daligner.c
@@ -8,18 +8,17 @@
* each encoding a given found local alignment between two of the sequences. The -v
* option turns on a verbose reporting mode that gives statistics on each major stage.
*
- * There cannot be more than 65,535 reads in a given db, and each read must be less than
- * 66,535 characters long.
- *
* The filter operates by looking for a pair of diagonal bands of width 2^'s' that contain
* a collection of exact matching 'k'-mers between the two sequences, such that the total
- * number of bases covered by 'k'-mer hits is 'h'. k cannot be larger than 15 in the
+ * number of bases covered by 'k'-mer hits is 'h'. k cannot be larger than 32 in the
* current implementation.
*
* Some k-mers are significantly over-represented (e.g. homopolymer runs). These are
- * suppressed as seed hits, with the parameter 'm' -- any k-mer that occurs more than
- * 'm' times in either the subject or target is not counted as a seed hit. If the -m
- * option is absent then no k-mer is suppressed.
+ * suppressed as seed hits, with the parameter 't' -- any k-mer that occurs more than
+ * 't' times in either the subject or target is not counted as a seed hit. If the -t
+ * option is absent then no k-mer is suppressed. Alternatively, the option -M specifies
+ * that 't' is dynamically set to the largest value such that less than -M memory is
+ * used.
*
* For each subject, target pair, say XXX and YYY, the program outputs a file containing
* overlaps of the form XXX.YYY.[C|N]#.las where C implies that the reads in XXX were
@@ -51,7 +50,7 @@
static char *Usage[] =
{ "[-vbAI] [-k<int(14)>] [-w<int(6)>] [-h<int(35)>] [-t<int>] [-M<int>]",
- " [-e<double(.70)] [-l<int(1000)>] [-s<int(100)>] [-H<int>]",
+ " [-e<double(.70)] [-l<int(1000)>] [-s<int(100)>] [-H<int>] [-T<int(4)>]",
" [-m<track>]+ <subject:db|dam> <target:db|dam> ...",
};
@@ -178,7 +177,7 @@ static void reheap(int s, Event **heap, int hsize)
heap[c] = hs;
}
-int64 Merge_Size(HITS_DB *block, int mtop)
+static int64 merge_size(HITS_DB *block, int mtop)
{ Event ev[mtop+1];
Event *heap[mtop+2];
int r, mhalf;
@@ -250,7 +249,7 @@ int64 Merge_Size(HITS_DB *block, int mtop)
return (nsize);
}
-HITS_TRACK *Merge_Tracks(HITS_DB *block, int mtop, int64 nsize)
+static HITS_TRACK *merge_tracks(HITS_DB *block, int mtop, int64 nsize)
{ HITS_TRACK *ntrack;
Event ev[mtop+1];
Event *heap[mtop+2];
@@ -351,9 +350,13 @@ static int read_DB(HITS_DB *block, char *name, char **mask, int *mstat, int mtop
if (kind == MASK_TRACK)
mstat[i] = 0;
else
- mstat[i] = -3;
+ { if (mstat[i] != 0)
+ mstat[i] = -3;
+ }
else
- mstat[i] = status;
+ { if (mstat[i] == -2)
+ mstat[i] = status;
+ }
if (status == 0 && kind == MASK_TRACK)
Load_Track(block,mask[i]);
}
@@ -381,8 +384,8 @@ static int read_DB(HITS_DB *block, char *name, char **mask, int *mstat, int mtop
{ int64 nsize;
HITS_TRACK *track;
- nsize = Merge_Size(block,stop);
- track = Merge_Tracks(block,stop,nsize);
+ nsize = merge_size(block,stop);
+ track = merge_tracks(block,stop,nsize);
while (block->tracks != NULL)
Close_Track(block,block->tracks->name);
@@ -514,6 +517,22 @@ static HITS_DB *complement_DB(HITS_DB *block, int inplace)
return (cblock);
}
+static char *CommandBuffer(char *aname, char *bname)
+{ static char *cat = NULL;
+ static int max = -1;
+ int len;
+
+ len = 2*(strlen(aname) + strlen(bname)) + 200;
+ if (len > max)
+ { max = ((int) (1.2*len)) + 100;
+ if ((cat = (char *) realloc(cat,max+1)) == NULL)
+ { fprintf(stderr,"%s: Out of memory (Making path name)\n",Prog_Name);
+ exit (1);
+ }
+ }
+ return (cat);
+}
+
int main(int argc, char *argv[])
{ HITS_DB _ablock, _bblock;
HITS_DB *ablock = &_ablock, *bblock = &_bblock;
@@ -532,6 +551,7 @@ int main(int argc, char *argv[])
int HIT_MIN;
double AVE_ERROR;
int SPACING;
+ int NTHREADS;
{ int i, j, k;
int flags[128];
@@ -547,6 +567,7 @@ int main(int argc, char *argv[])
AVE_ERROR = .70;
SPACING = 100;
MINOVER = 1000; // Globally visible to filter.c
+ NTHREADS = 4;
MEM_PHYSICAL = getMemorySize();
MEM_LIMIT = MEM_PHYSICAL;
@@ -571,6 +592,10 @@ int main(int argc, char *argv[])
break;
case 'k':
ARG_POSITIVE(KMER_LEN,"K-mer length")
+ if (KMER_LEN > 32)
+ { fprintf(stderr,"%s: K-mer length must be 32 or less\n",Prog_Name);
+ exit (1);
+ }
break;
case 'w':
ARG_POSITIVE(BIN_SHIFT,"Log of bin width")
@@ -615,6 +640,9 @@ int main(int argc, char *argv[])
}
MASK[MTOP++] = argv[i]+2;
break;
+ case 'T':
+ ARG_POSITIVE(NTHREADS,"Number of threads")
+ break;
}
else
argv[j++] = argv[i];
@@ -631,10 +659,13 @@ int main(int argc, char *argv[])
fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]);
exit (1);
}
+
+ for (j = 0; j < MTOP; j++)
+ MSTAT[j] = -2;
}
MINOVER *= 2;
- if (Set_Filter_Params(KMER_LEN,BIN_SHIFT,MAX_REPS,HIT_MIN))
+ if (Set_Filter_Params(KMER_LEN,BIN_SHIFT,MAX_REPS,HIT_MIN,NTHREADS))
{ fprintf(stderr,"Illegal combination of filter parameters\n");
exit (1);
}
@@ -652,7 +683,8 @@ int main(int argc, char *argv[])
/* Compare against reads in B in both orientations */
- { int i, j;
+ { int i, j;
+ char *command;
aindex = NULL;
broot = NULL;
@@ -665,6 +697,8 @@ int main(int argc, char *argv[])
else
broot = Root(bfile,".db");
}
+ else
+ broot = aroot;
if (i == 2)
{ for (j = 0; j < MTOP; j++)
@@ -681,7 +715,7 @@ int main(int argc, char *argv[])
aindex = Sort_Kmers(ablock,&alen);
}
- if (strcmp(afile,bfile) != 0)
+ if (aroot != broot)
{ if (VERBOSE)
printf("\nBuilding index for %s\n",broot);
bindex = Sort_Kmers(bblock,&blen);
@@ -692,8 +726,6 @@ int main(int argc, char *argv[])
printf("\nBuilding index for c(%s)\n",broot);
bindex = Sort_Kmers(bblock,&blen);
Match_Filter(aroot,ablock,broot,bblock,aindex,alen,bindex,blen,1,asettings);
-
- free(broot);
}
else
{ Match_Filter(aroot,ablock,aroot,ablock,aindex,alen,aindex,alen,0,asettings);
@@ -709,6 +741,38 @@ int main(int argc, char *argv[])
}
Close_DB(bblock);
+
+ command = CommandBuffer(aroot,broot);
+
+ sprintf(command,"LAsort /tmp/%s.%s.[CN]*.las",aroot,broot);
+ if (VERBOSE)
+ printf("\n%s\n",command);
+ system(command);
+ sprintf(command,"LAmerge %s.%s.las /tmp/%s.%s.[CN]*.S.las",aroot,broot,aroot,broot);
+ if (VERBOSE)
+ printf("%s\n",command);
+ system(command);
+ sprintf(command,"rm /tmp/%s.%s.[CN]*.las",aroot,broot);
+ if (VERBOSE)
+ printf("%s\n",command);
+ system(command);
+ if (aroot != broot && SYMMETRIC)
+ { sprintf(command,"LAsort /tmp/%s.%s.[CN]*.las",broot,aroot);
+ if (VERBOSE)
+ printf("%s\n",command);
+ system(command);
+ sprintf(command,"LAmerge %s.%s.las /tmp/%s.%s.[CN]*.S.las",broot,aroot,broot,aroot);
+ if (VERBOSE)
+ printf("%s\n",command);
+ system(command);
+ sprintf(command,"rm /tmp/%s.%s.[CN]*.las",broot,aroot);
+ if (VERBOSE)
+ printf("%s\n",command);
+ system(command);
+ }
+
+ if (aroot != broot)
+ free(broot);
}
}
diff --git a/filter.c b/filter.c
index f54780d..a9c5083 100644
--- a/filter.c
+++ b/filter.c
@@ -115,10 +115,12 @@ static int Suppress;
static int Kshift; // 2*Kmer
static uint64 Kmask; // 4^Kmer-1
-static uint64 Kpowr; // 4^Kmer
static int TooFrequent; // (Suppress != 0) ? Suppress : INT32_MAX
-int Set_Filter_Params(int kmer, int binshift, int suppress, int hitmin)
+static int NTHREADS; // Adjusted downward to nearest power of 2
+static int NSHIFT; // NTHREADS = 1 << NSHIFT
+
+int Set_Filter_Params(int kmer, int binshift, int suppress, int hitmin, int nthread)
{ if (kmer <= 1)
return (1);
@@ -128,14 +130,23 @@ int Set_Filter_Params(int kmer, int binshift, int suppress, int hitmin)
Hitmin = hitmin;
Kshift = 2*Kmer;
- Kpowr = (0x1llu << Kshift);
- Kmask = Kpowr-1;
+ if (Kmer == 32)
+ Kmask = 0xffffffffffffffffllu;
+ else
+ Kmask = (0x1llu << Kshift) - 1;
if (Suppress == 0)
TooFrequent = INT32_MAX;
else
TooFrequent = Suppress;
+ NTHREADS = 1;
+ NSHIFT = 0;
+ while (2*NTHREADS <= nthread)
+ { NTHREADS *= 2;
+ NSHIFT += 1;
+ }
+
return (0);
}
@@ -163,7 +174,7 @@ typedef struct
{ int64 beg;
int64 end;
int64 tptr[BPOWR];
- int64 sptr[NTHREADS*BPOWR];
+ int64 *sptr;
} Lex_Arg;
static void *lex_thread(void *arg)
@@ -400,13 +411,12 @@ static void *tuple_thread(void *arg)
int64 *anno1 = ((int64 *) (TA_track->anno)) + 1;
int *point = (int *) (TA_track->data);
int64 a, b, f;
- int q;
+ int q = 0;
f = anno1[i-1];
for (m = (c * (tnum+1)) >> NSHIFT; i < m; i++)
{ b = f;
f = anno1[i];
- q = reads[i].rlen;
for (a = b; a <= f; a += 2)
{ if (a == b)
p = 0;
@@ -484,14 +494,13 @@ static void *biased_tuple_thread(void *arg)
int64 *anno1 = ((int64 *) (TA_track->anno)) + 1;
int *point = (int *) (TA_track->data);
int64 j, b, f;
- int q;
+ int q = 0;
f = anno1[i-1];
for (m = (c * (tnum+1)) >> NSHIFT; i < m; i++)
{ b = f;
f = anno1[i];
t = s+1;
- q = reads[i].rlen;
for (j = b; j <= f; j += 2)
{ if (j == b)
p = 0;
@@ -658,6 +667,9 @@ void *Sort_Kmers(HITS_DB *block, int *len)
int i, j, x, z;
uint64 h;
+ for (i = 0; i < NTHREADS; i++)
+ parmx[i].sptr = (int64 *) alloca(NTHREADS*BPOWR*sizeof(int64));
+
for (i = 0; i < 16; i++)
mersort[i] = 0;
for (i = 0; i < Kshift; i += 8)
@@ -686,12 +698,12 @@ void *Sort_Kmers(HITS_DB *block, int *len)
goto no_mers;
if (( (Kshift-1)/BSHIFT + (TooFrequent < INT32_MAX) ) & 0x1)
- { trg = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+1),"Allocating Sort_Kmers vectors");
- src = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+1),"Allocating Sort_Kmers vectors");
+ { trg = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+2),"Allocating Sort_Kmers vectors");
+ src = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+2),"Allocating Sort_Kmers vectors");
}
else
- { src = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+1),"Allocating Sort_Kmers vectors");
- trg = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+1),"Allocating Sort_Kmers vectors");
+ { src = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+2),"Allocating Sort_Kmers vectors");
+ trg = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+2),"Allocating Sort_Kmers vectors");
}
if (src == NULL || trg == NULL)
exit (1);
@@ -735,7 +747,6 @@ void *Sort_Kmers(HITS_DB *block, int *len)
if (BIASED || TA_track != NULL)
for (i = 0; i < NTHREADS; i++)
kmers -= parmt[i].fill;
- rez[kmers].code = Kpowr;
if (TooFrequent < INT32_MAX && kmers > 0)
{ parmf[0].beg = 0;
@@ -748,6 +759,11 @@ void *Sort_Kmers(HITS_DB *block, int *len)
}
parmf[NTHREADS-1].end = kmers;
+ if (rez[kmers-1].code == 0xffffffffffffffffllu)
+ rez[kmers].code = 0;
+ else
+ rez[kmers].code = 0xffffffffffffffffllu;
+
if (src == rez)
{ FR_src = src;
FR_trg = rez = trg;
@@ -776,9 +792,10 @@ void *Sort_Kmers(HITS_DB *block, int *len)
for (i = 0; i < NTHREADS; i++)
pthread_join(threads[i],NULL);
-
- rez[kmers].code = Kpowr;
}
+
+ rez[kmers].code = 0xffffffffffffffffllu;
+ rez[kmers+1].code = 0;
if (src != rez)
free(src);
@@ -798,7 +815,7 @@ void *Sort_Kmers(HITS_DB *block, int *len)
#endif
if (VERBOSE)
- { if (TooFrequent < INT32_MAX)
+ { if (TooFrequent < INT32_MAX || BIASED || TA_track != NULL)
{ printf(" Revised kmer count = ");
Print_Number((int64) kmers,0,stdout);
printf("\n");
@@ -812,7 +829,7 @@ void *Sort_Kmers(HITS_DB *block, int *len)
goto no_mers;
}
- if (MEM_LIMIT > 0 && kmers > (int64) (MEM_LIMIT/(4*sizeof(KmerPos))))
+ if (kmers > (int64) (MEM_LIMIT/(4*sizeof(KmerPos))))
{ fprintf(stderr,"Warning: Block size too big, index occupies more than 1/4 of");
if (MEM_LIMIT == MEM_PHYSICAL)
fprintf(stderr," physical memory (%.1fGb)\n",(1.*MEM_LIMIT)/0x40000000ll);
@@ -883,7 +900,8 @@ static void *count_thread(void *arg)
int jb, ja;
uint64 ca, cb;
uint64 da, db;
- int ar;
+ int ar, ap;
+ int a, b;
ia = data->abeg;
ca = asort[ia].code;
@@ -896,36 +914,45 @@ static void *count_thread(void *arg)
while (cb > ca)
ca = asort[++ia].code;
if (cb == ca)
- { if (ia >= aend) break;
+ { ja = ia++;
+ while ((da = asort[ia].code) == ca)
+ ia += 1;
+ jb = ib++;
+ while ((db = bsort[ib].code) == cb)
+ ib += 1;
+
+ if (ia > aend)
+ { if (ja >= aend)
+ break;
+ da = asort[ia = aend].code;
+ db = bsort[ib = data->bend].code;
+ }
ct = 0;
- jb = ib;
- db = cb;
+ b = jb;
if (IDENTITY)
- do
- { ar = asort[ia].read;
+ for (a = ja; a < ia; a++)
+ { ar = asort[a].read;
if (MG_comp)
- while (db == cb && bsort[ib].read <= ar)
- db = bsort[++ib].code;
+ { while (b < ib && bsort[b].read <= ar)
+ b += 1;
+ }
else
- { while (db == cb && bsort[ib].read < ar)
- db = bsort[++ib].code;
- while (db == cb && bsort[ib].read == ar && bsort[ib].rpos < asort[ia].rpos)
- db = bsort[++ib].code;
+ { ap = asort[a].rpos;
+ while (b < ib && bsort[b].read < ar)
+ b += 1;
+ while (b < ib && bsort[b].read == ar && bsort[b].rpos < ap)
+ b += 1;
}
- ct += (ib-jb);
+ ct += (b-jb);
}
- while ((da = asort[++ia].code) == ca);
else
- do
- { ar = asort[ia].read;
- while (db == cb && bsort[ib].read < ar)
- db = bsort[++ib].code;
- ct += (ib-jb);
+ for (a = ja; a < ia; a++)
+ { ar = asort[a].read;
+ while (b < ib && bsort[b].read < ar)
+ b += 1;
+ ct += (b-jb);
}
- while ((da = asort[++ia].code) == ca);
- while (db == cb)
- db = bsort[++ib].code;
nhits += ct;
ca = da;
@@ -943,15 +970,20 @@ static void *count_thread(void *arg)
while (cb > ca)
ca = asort[++ia].code;
if (cb == ca)
- { if (ia >= aend) break;
-
- ja = ia++;
+ { ja = ia++;
while ((da = asort[ia].code) == ca)
ia += 1;
jb = ib++;
while ((db = bsort[ib].code) == cb)
ib += 1;
+ if (ia > aend)
+ { if (ja >= aend)
+ break;
+ da = asort[ia = aend].code;
+ db = bsort[ib = data->bend].code;
+ }
+
ct = (ia-ja);
ct *= (ib-jb);
@@ -989,7 +1021,7 @@ static void *merge_thread(void *arg)
uint64 ca, cb;
uint64 da, db;
int ar, ap;
- int a, b;
+ int a, b, c;
ia = data->abeg;
ca = asort[ia].code;
@@ -1002,62 +1034,69 @@ static void *merge_thread(void *arg)
while (cb > ca)
ca = asort[++ia].code;
if (cb == ca)
- { if (ia >= aend) break;
+ { ja = ia++;
+ while ((da = asort[ia].code) == ca)
+ ia += 1;
+ jb = ib++;
+ while ((db = bsort[ib].code) == cb)
+ ib += 1;
+
+ if (ia > aend)
+ { if (ja >= aend)
+ break;
+ da = asort[ia = aend].code;
+ db = bsort[ib = data->bend].code;
+ }
ct = 0;
- ja = ia;
- jb = ib;
- db = cb;
+ b = jb;
if (IDENTITY)
- do
- { ar = asort[ia].read;
- ap = asort[ia].rpos;
+ for (a = ja; a < ia; a++)
+ { ar = asort[a].read;
if (MG_comp)
- while (db == cb && bsort[ib].read <= ar)
- db = bsort[++ib].code;
+ { while (b < ib && bsort[b].read <= ar)
+ b += 1;
+ }
else
- { while (db == cb && bsort[ib].read < ar)
- db = bsort[++ib].code;
- while (db == cb && bsort[ib].read == ar && bsort[ib].rpos < ap)
- db = bsort[++ib].code;
+ { ap = asort[a].rpos;
+ while (b < ib && bsort[b].read < ar)
+ b += 1;
+ while (b < ib && bsort[b].read == ar && bsort[b].rpos < ap)
+ b += 1;
}
- ct += (ib-jb);
+ ct += (b-jb);
}
- while ((da = asort[++ia].code) == ca);
else
- do
- { ar = asort[ia].read;
- while (db == cb && bsort[ib].read < ar)
- db = bsort[++ib].code;
- ct += (ib-jb);
+ for (a = ja; a < ia; a++)
+ { ar = asort[a].read;
+ while (b < ib && bsort[b].read < ar)
+ b += 1;
+ ct += (b-jb);
}
- while ((da = asort[++ia].code) == ca);
- while (db == cb)
- db = bsort[++ib].code;
if (ct < limit)
- { ib = jb;
- db = cb;
+ { b = jb;
if (IDENTITY)
for (a = ja; a < ia; a++)
{ ap = asort[a].rpos;
ar = asort[a].read;
if (MG_comp)
- while (db == cb && bsort[ib].read <= ar)
- db = bsort[++ib].code;
+ { while (b < ib && bsort[b].read <= ar)
+ b += 1;
+ }
else
- { while (db == cb && bsort[ib].read < ar)
- db = bsort[++ib].code;
- while (db == cb && bsort[ib].read == ar && bsort[ib].rpos < ap)
- db = bsort[++ib].code;
+ { while (b < ib && bsort[b].read < ar)
+ b += 1;
+ while (b < ib && bsort[b].read == ar && bsort[b].rpos < ap)
+ b += 1;
}
- if ((ct = ib-jb) > 0)
+ if ((ct = b-jb) > 0)
{ kptr[ap & BMASK] += ct;
- for (b = jb; b < ib; b++)
- { hits[nhits].bread = bsort[b].read;
+ for (c = jb; c < b; c++)
+ { hits[nhits].bread = bsort[c].read;
hits[nhits].aread = ar;
hits[nhits].apos = ap;
- hits[nhits].diag = ap - bsort[b].rpos;
+ hits[nhits].diag = ap - bsort[c].rpos;
nhits += 1;
}
}
@@ -1066,21 +1105,19 @@ static void *merge_thread(void *arg)
for (a = ja; a < ia; a++)
{ ap = asort[a].rpos;
ar = asort[a].read;
- while (db == cb && bsort[ib].read < ar)
- db = bsort[++ib].code;
- if ((ct = ib-jb) > 0)
+ while (b < ib && bsort[b].read < ar)
+ b += 1;
+ if ((ct = b-jb) > 0)
{ kptr[ap & BMASK] += ct;
- for (b = jb; b < ib; b++)
- { hits[nhits].bread = bsort[b].read;
+ for (c = jb; c < b; c++)
+ { hits[nhits].bread = bsort[c].read;
hits[nhits].aread = ar;
hits[nhits].apos = ap;
- hits[nhits].diag = ap - bsort[b].rpos;
+ hits[nhits].diag = ap - bsort[c].rpos;
nhits += 1;
}
}
}
- while (db == cb)
- db = bsort[++ib].code;
}
ca = da;
cb = db;
@@ -1101,6 +1138,14 @@ static void *merge_thread(void *arg)
jb = ib++;
while ((db = bsort[ib].code) == cb)
ib += 1;
+
+ if (ia > aend)
+ { if (ja >= aend)
+ break;
+ da = asort[ia = aend].code;
+ db = bsort[ib = data->bend].code;
+ }
+
ct = ib-jb;
if ((ia-ja)*ct < limit)
{ for (a = ja; a < ia; a++)
@@ -1165,6 +1210,12 @@ static int Entwine(Path *jpath, Path *kpath, Trace_Buffer *tbuf, int *where)
b2 = kpath->bbpos;
k = kpath->abpos/MR_tspace;
+ if (jpath->abpos == kpath->abpos)
+ { min = abs(y2-b2);
+ if (min == 0)
+ *where = kpath->abpos;
+ }
+
if (j < k)
{ ac = k*MR_tspace;
@@ -1218,6 +1269,15 @@ static int Entwine(Path *jpath, Path *kpath, Trace_Buffer *tbuf, int *where)
#endif
}
+ if (jpath->aepos == kpath->aepos)
+ { i = abs(jpath->bepos-kpath->bepos);
+ if (i <= min)
+ { min = i;
+ if (i == 0)
+ *where = kpath->aepos;
+ }
+ }
+
#ifdef SEE_ENTWINE
if (den == 0)
printf("Nothing\n");
@@ -1286,7 +1346,8 @@ static void Fusion(Path *path1, int ap, Path *path2, Trace_Buffer *tbuf)
static int Handle_Redundancies(Path *amatch, int novls, Path *bmatch, Trace_Buffer *tbuf)
{ Path *jpath, *kpath;
int j, k, no;
- int dist, awhen, bwhen;
+ int dist;
+ int awhen = 0, bwhen = 0;
int hasB;
#ifdef TEST_CONTAIN
@@ -1297,12 +1358,14 @@ static int Handle_Redundancies(Path *amatch, int novls, Path *bmatch, Trace_Buff
hasB = (bmatch != NULL);
- no = 0;
for (j = 1; j < novls; j++)
{ jpath = amatch+j;
- for (k = no; k >= 0; k--)
+ for (k = j-1; k >= 0; k--)
{ kpath = amatch+k;
+ if (kpath->abpos < 0)
+ continue;
+
if (jpath->abpos < kpath->abpos)
{ if (kpath->abpos <= jpath->aepos && kpath->bbpos <= jpath->bepos)
@@ -1315,8 +1378,8 @@ static int Handle_Redundancies(Path *amatch, int novls, Path *bmatch, Trace_Buff
if (dist != 0)
continue;
Fusion(jpath,awhen,kpath,tbuf);
- amatch[k] = *jpath;
Fusion(bmatch+k,bwhen,bmatch+j,tbuf);
+ bmatch[j] = bmatch[k];
#ifdef TEST_CONTAIN
printf(" Really 1");
#endif
@@ -1326,9 +1389,7 @@ static int Handle_Redundancies(Path *amatch, int novls, Path *bmatch, Trace_Buff
if (dist != 0)
continue;
Fusion(jpath,awhen,kpath,tbuf);
- amatch[k] = *jpath;
Fusion(bmatch+j,bwhen,bmatch+k,tbuf);
- bmatch[k] = bmatch[j];
#ifdef TEST_CONTAIN
printf(" Really 2");
#endif
@@ -1336,21 +1397,16 @@ static int Handle_Redundancies(Path *amatch, int novls, Path *bmatch, Trace_Buff
}
else
{ Fusion(jpath,awhen,kpath,tbuf);
- amatch[k] = *jpath;
#ifdef TEST_CONTAIN
printf(" Really 3");
#endif
}
+ k = j;
}
- else
- { amatch[k] = *jpath;
- if (hasB)
- bmatch[k] = bmatch[j];
- }
+ kpath->abpos = -1;
#ifdef TEST_CONTAIN
printf(" Fuse! A %d %d\n",j,k);
#endif
- break;
}
}
}
@@ -1361,10 +1417,10 @@ static int Handle_Redundancies(Path *amatch, int novls, Path *bmatch, Trace_Buff
{ dist = Entwine(kpath,jpath,tbuf,&awhen);
if (dist == 0)
{ if (kpath->abpos == jpath->abpos)
- { if (kpath->aepos < jpath->aepos)
- { amatch[k] = *jpath;
+ { if (kpath->aepos > jpath->aepos)
+ { *jpath = *kpath;
if (hasB)
- bmatch[k] = bmatch[j];
+ bmatch[j] = bmatch[k];
}
}
else if (jpath->aepos > kpath->aepos)
@@ -1374,8 +1430,8 @@ static int Handle_Redundancies(Path *amatch, int novls, Path *bmatch, Trace_Buff
if (dist != 0)
continue;
Fusion(kpath,awhen,jpath,tbuf);
+ *jpath = *kpath;
Fusion(bmatch+j,bwhen,bmatch+k,tbuf);
- bmatch[k] = bmatch[j];
#ifdef TEST_CONTAIN
printf(" Really 4");
#endif
@@ -1385,7 +1441,9 @@ static int Handle_Redundancies(Path *amatch, int novls, Path *bmatch, Trace_Buff
if (dist != 0)
continue;
Fusion(kpath,awhen,jpath,tbuf);
+ *jpath = *kpath;
Fusion(bmatch+k,bwhen,bmatch+j,tbuf);
+ bmatch[j] = bmatch[k];
#ifdef TEST_CONTAIN
printf(" Really 5");
#endif
@@ -1393,28 +1451,36 @@ static int Handle_Redundancies(Path *amatch, int novls, Path *bmatch, Trace_Buff
}
else
{ Fusion(kpath,awhen,jpath,tbuf);
+ *jpath = *kpath;
#ifdef TEST_CONTAIN
printf(" Really 6");
#endif
}
+ k = j;
}
+ else
+ { *jpath = *kpath;
+ if (hasB)
+ bmatch[j] = bmatch[k];
+ }
+ kpath->abpos = -1;
#ifdef TEST_CONTAIN
printf(" Fuse! B %d %d\n",j,k);
#endif
- break;
}
}
}
}
- if (k < 0)
- { no += 1;
- amatch[no] = *jpath;
- if (hasB)
- bmatch[no] = bmatch[j];
- }
}
- novls = no+1;
+ no = 0;
+ for (j = 0; j < novls; j++)
+ if (amatch[j].abpos >= 0)
+ { if (hasB)
+ bmatch[no] = bmatch[j];
+ amatch[no++] = amatch[j];
+ }
+ novls = no;
#ifdef TEST_CONTAIN
for (j = 0; j < novls; j++)
@@ -1507,7 +1573,8 @@ static void *report_thread(void *arg)
Double *hitc;
int minhit;
- uint64 cpair, npair;
+ uint64 cpair;
+ uint64 npair = 0;
int64 nidx, eidx;
// In ovl and align roles of A and B are reversed, as the B sequence must be the
@@ -1818,6 +1885,22 @@ static void *report_thread(void *arg)
*
********************************************************************************************/
+static char *NameBuffer(char *aname, char *bname)
+{ static char *cat = NULL;
+ static int max = -1;
+ int len;
+
+ len = strlen(aname) + strlen(bname) + 100;
+ if (len > max)
+ { max = ((int) (1.2*len)) + 100;
+ if ((cat = (char *) realloc(cat,max+1)) == NULL)
+ { fprintf(stderr,"%s: Out of memory (Making path name)\n",Prog_Name);
+ exit (1);
+ }
+ }
+ return (cat);
+}
+
void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock,
void *vasort, int alen, void *vbsort, int blen,
int comp, Align_Spec *aspec)
@@ -1826,6 +1909,7 @@ void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock,
Lex_Arg parmx[NTHREADS];
Report_Arg parmr[NTHREADS];
int pairsort[16];
+ char *fname;
SeedPair *khit, *hhit;
SeedPair *work1, *work2;
@@ -1841,9 +1925,14 @@ void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock,
atot = ablock->totlen;
btot = bblock->totlen;
+ MR_tspace = Trace_Spacing(aspec);
+
{ int64 powr;
int i, nbyte;
+ for (i = 0; i < NTHREADS; i++)
+ parmx[i].sptr = (int64 *) alloca(NTHREADS*BPOWR*sizeof(int64));
+
for (i = 0; i < 16; i++)
pairsort[i] = 0;
@@ -1923,10 +2012,11 @@ void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock,
for (j = 0; j < MAXGRAM; j++)
histo[j] += parmm[i].hitgram[j];
- if (asort == bsort || (int64) (MEM_LIMIT/sizeof(Double)) > alen + 2*blen)
- avail = (MEM_LIMIT/sizeof(Double) - alen) / 2;
+ avail = (int64) (MEM_LIMIT - (sizeof_DB(ablock) + sizeof_DB(bblock))) / sizeof(Double);
+ if (asort == bsort || avail > alen + 2*blen)
+ avail = (avail - alen) / 2;
else
- avail = MEM_LIMIT/sizeof(Double) - (alen + blen);
+ avail = avail - (alen + blen);
avail *= .98;
tom = 0;
@@ -1998,14 +2088,15 @@ void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock,
if (asort == bsort)
hhit = work1 = (SeedPair *) Malloc(sizeof(SeedPair)*(nhits+1),
- "Allocating dazzler hit vectors");
+ "Allocating daligner hit vectors");
else
{ if (nhits >= blen)
bsort = (KmerPos *) Realloc(bsort,sizeof(SeedPair)*(nhits+1),
- "Reallocating dazzler sort vectors");
+ "Reallocating daligner sort vectors");
hhit = work1 = (SeedPair *) bsort;
}
- khit = work2 = (SeedPair *) Malloc(sizeof(SeedPair)*(nhits+1),"Allocating dazzler hit vectors");
+ khit = work2 = (SeedPair *) Malloc(sizeof(SeedPair)*(nhits+1),
+ "Allocating daligner hit vectors");
if (hhit == NULL || khit == NULL || bsort == NULL)
exit (1);
@@ -2074,7 +2165,6 @@ void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock,
MR_hits = khit;
MR_two = ! MG_self && SYMMETRIC;
MR_spec = aspec;
- MR_tspace = Trace_Spacing(aspec);
parmr[0].beg = 0;
for (i = 1; i < NTHREADS; i++)
@@ -2093,6 +2183,8 @@ void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock,
if (counters == NULL)
exit (1);
+ fname = NameBuffer(aname,bname);
+
for (i = 0; i < 3*w*NTHREADS; i++)
counters[i] = 0;
for (i = 0; i < NTHREADS; i++)
@@ -2104,15 +2196,15 @@ void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock,
parmr[i].lasta = parmr[i].lastp + w;
parmr[i].work = New_Work_Data();
- parmr[i].ofile1 =
- Fopen(Catenate(aname,".",bname,Numbered_Suffix((comp?".C":".N"),i,".las")),"w");
+ sprintf(fname,"/tmp/%s.%s.%c%d.las",aname,bname,(comp?'C':'N'),i+1);
+ parmr[i].ofile1 = Fopen(fname,"w");
if (parmr[i].ofile1 == NULL)
exit (1);
if (MG_self)
parmr[i].ofile2 = parmr[i].ofile1;
else if (SYMMETRIC)
- { parmr[i].ofile2 =
- Fopen(Catenate(bname,".",aname,Numbered_Suffix((comp?".C":".N"),i,".las")),"w");
+ { sprintf(fname,"/tmp/%s.%s.%c%d.las",bname,aname,(comp?'C':'N'),i+1);
+ parmr[i].ofile2 = Fopen(fname,"w");
if (parmr[i].ofile2 == NULL)
exit (1);
}
@@ -2152,14 +2244,18 @@ zerowork:
{ FILE *ofile;
int i;
+ fname = NameBuffer(aname,bname);
+
nhits = 0;
for (i = 0; i < NTHREADS; i++)
- { ofile = Fopen(Catenate(aname,".",bname,Numbered_Suffix((comp?".C":".N"),i,".las")),"w");
+ { sprintf(fname,"/tmp/%s.%s.%c%d.las",aname,bname,(comp?'C':'N'),i+1);
+ ofile = Fopen(fname,"w");
fwrite(&nhits,sizeof(int64),1,ofile);
fwrite(&MR_tspace,sizeof(int),1,ofile);
fclose(ofile);
if (! MG_self && SYMMETRIC)
- { ofile = Fopen(Catenate(bname,".",aname,Numbered_Suffix((comp?".C":".N"),i,".las")),"w");
+ { sprintf(fname,"/tmp/%s.%s.%c%d.las",bname,aname,(comp?'C':'N'),i+1);
+ ofile = Fopen(fname,"w");
fwrite(&nhits,sizeof(int64),1,ofile);
fwrite(&MR_tspace,sizeof(int),1,ofile);
fclose(ofile);
diff --git a/filter.h b/filter.h
index ba7a34d..db565f1 100644
--- a/filter.h
+++ b/filter.h
@@ -24,10 +24,7 @@ extern int IDENTITY;
extern uint64 MEM_LIMIT;
extern uint64 MEM_PHYSICAL;
-#define NTHREADS 4 // Must be a power of 2
-#define NSHIFT 2 // log_2 NTHREADS
-
-int Set_Filter_Params(int kmer, int binshift, int suppress, int hitmin);
+int Set_Filter_Params(int kmer, int binshift, int suppress, int hitmin, int nthreads);
void *Sort_Kmers(HITS_DB *block, int *len);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/daligner.git
More information about the debian-med-commit
mailing list