[med-svn] [dazzdb] 01/02: Imported Upstream version 1.0
Afif Elghraoui
afif-guest at moszumanska.debian.org
Sun Sep 13 23:46:52 UTC 2015
This is an automated email from the git hooks/post-receive script.
afif-guest pushed a commit to branch master
in repository dazzdb.
commit 64be7f20a6c96d9517e4e6d63985369a6f3512c0
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Sat Aug 15 21:29:25 2015 -0700
Imported Upstream version 1.0
---
Catrack.c | 296 +++++++++
DAM2fasta.c | 237 +++++++
DB.c | 1580 ++++++++++++++++++++++++++++++++++++++++++++++
DB.h | 442 +++++++++++++
DB2fasta.c | 188 ++++++
DB2quiva.c | 192 ++++++
DBdust.c | 508 +++++++++++++++
DBrm.c | 77 +++
DBshow.c | 612 ++++++++++++++++++
DBsplit.c | 246 ++++++++
DBstats.c | 358 +++++++++++
DBupgrade.Dec.31.2014.c | 115 ++++
DBupgrade.Sep.25.2014.c | 125 ++++
DUSTupgrade.Jan.1.2015.c | 117 ++++
Makefile | 67 ++
QV.c | 1406 +++++++++++++++++++++++++++++++++++++++++
QV.h | 125 ++++
README | 442 +++++++++++++
fasta2DAM.c | 450 +++++++++++++
fasta2DB.c | 668 ++++++++++++++++++++
quiva2DB.c | 384 +++++++++++
simulator.c | 458 ++++++++++++++
22 files changed, 9093 insertions(+)
diff --git a/Catrack.c b/Catrack.c
new file mode 100644
index 0000000..8df7e3e
--- /dev/null
+++ b/Catrack.c
@@ -0,0 +1,296 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/********************************************************************************************
+ *
+ * Concatenate in block order all "block tracks" <DB>.<track>.# into a single track
+ * <DB>.<track>
+ *
+ * Author: Gene Myers
+ * Date : June 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "DB.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#else
+#define PATHSEP "/"
+#endif
+
+static char *Usage = "[-v] <path:db|dam> <track:name>";
+// Merge block tracks <root>.1.<track>, <root>.2.<track>, ... (.anno plus optional .data) into <root>.<track>; exit 0 on success, 1 on error.
+int main(int argc, char *argv[])
+{ char *prefix;
+ FILE *aout, *dout;
+ int VERBOSE;
+
+ // Process arguments
+
+ { int i, j, k;
+ int flags[128];
+
+ ARG_INIT("Catrack")
+
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ { ARG_FLAGS("v") }
+ else
+ argv[j++] = argv[i];
+ argc = j;
+
+ VERBOSE = flags['v'];
+
+ if (argc != 3)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+ }
+
+ { char *pwd, *root;
+ int plen;
+ // Build prefix = <dir>PATHSEP<root>. and refuse to overwrite an existing combined track
+ plen = strlen(argv[1]);
+ if (plen >= 4 && strcmp(argv[1]+(plen-4),".dam") == 0) // ".dam" is 4 characters: compare the last 4, and only if the name is that long
+ root = Root(argv[1],".dam");
+ else
+ root = Root(argv[1],".db");
+ pwd = PathTo(argv[1]);
+ prefix = Strdup(Catenate(pwd,PATHSEP,root,"."),"Allocating track name");
+ free(pwd);
+ free(root);
+ if (prefix == NULL) exit (1); // every later path is built from prefix
+ aout = fopen(Catenate(prefix,argv[2],".","anno"),"r");
+ if (aout != NULL)
+ { fprintf(stderr,"%s: Track file %s%s.anno already exists!\n",Prog_Name,prefix,argv[2]);
+ fclose(aout);
+ exit (1);
+ }
+
+ dout = fopen(Catenate(prefix,argv[2],".","data"),"r");
+ if (dout != NULL)
+ { fprintf(stderr,"%s: Track file %s%s.data already exists!\n",Prog_Name,prefix,argv[2]);
+ fclose(dout);
+ exit (1);
+ }
+
+ aout = Fopen(Catenate(prefix,argv[2],".","anno"),"w");
+ if (aout == NULL)
+ exit (1);
+ dout = NULL; // opened lazily, only if the first block has a .data file
+ }
+
+ { int tracktot, tracksiz;
+ int64 trackoff;
+ int nfiles;
+ char data[1024];
+ void *anno;
+
+ anno = NULL;
+ trackoff = 0;
+ tracktot = tracksiz = 0;
+ fwrite(&tracktot,sizeof(int),1,aout); // placeholder header, rewritten with the real totals at the end
+ fwrite(&tracksiz,sizeof(int),1,aout);
+
+ nfiles = 0;
+ while (1) // one iteration per block, until the next numbered .anno file is missing
+ { FILE *afile, *dfile;
+ int i, size, tracklen;
+
+ afile = fopen(Numbered_Suffix(prefix,nfiles+1,Catenate(".",argv[2],".","anno")),"r");
+ if (afile == NULL)
+ break;
+ dfile = fopen(Numbered_Suffix(prefix,nfiles+1,Catenate(".",argv[2],".","data")),"r");
+
+ if (VERBOSE)
+ { fprintf(stderr,"Concatenating %s%d.%s ...\n",prefix,nfiles+1,argv[2]);
+ fflush(stderr);
+ }
+
+ if (fread(&tracklen,sizeof(int),1,afile) != 1)
+ SYSTEM_ERROR
+ if (fread(&size,sizeof(int),1,afile) != 1)
+ SYSTEM_ERROR
+ if (nfiles == 0) // the first block fixes the record size and whether the track has data
+ { tracksiz = size;
+ if (dfile != NULL)
+ { dout = Fopen(Catenate(prefix,argv[2],".","data"),"w");
+ if (dout == NULL)
+ { fclose(afile);
+ fclose(dfile);
+ goto error;
+ }
+ }
+ else
+ { anno = Malloc(size,"Allocating annotation record");
+ if (anno == NULL)
+ { fclose(afile);
+ goto error;
+ }
+ }
+ }
+ else // later blocks must agree with the first one
+ { int escape = 1;
+ if (tracksiz != size)
+ { fprintf(stderr,"%s: Track block %d does not have the same annotation size (%d)",
+ Prog_Name,nfiles+1,size);
+ fprintf(stderr," as previous blocks (%d)\n",tracksiz);
+ }
+ else if (dfile == NULL && dout != NULL)
+ fprintf(stderr,"%s: Track block %d does not have data but previous blocks do\n",
+ Prog_Name,nfiles+1);
+ else if (dfile != NULL && dout == NULL)
+ fprintf(stderr,"%s: Track block %d has data but previous blocks do not\n",
+ Prog_Name,nfiles+1);
+ else
+ escape = 0;
+ if (escape)
+ { fclose(afile);
+ if (dfile != NULL) fclose(dfile);
+ if (anno != NULL) free(anno);
+ goto error;
+ }
+ }
+
+ if (dfile != NULL) // annotations are offsets into .data: shift each by the data bytes written so far
+ { int64 dlen;
+
+ if (size == 4)
+ { int anno4;
+
+ for (i = 0; i < tracklen; i++)
+ { if (fread(&anno4,sizeof(int),1,afile) != 1)
+ SYSTEM_ERROR
+ anno4 += trackoff;
+ fwrite(&anno4,sizeof(int),1,aout);
+ }
+ if (fread(&anno4,sizeof(int),1,afile) != 1) // the extra final record is this block's data length
+ SYSTEM_ERROR
+ dlen = anno4;
+ }
+ else
+ { int64 anno8;
+
+ for (i = 0; i < tracklen; i++)
+ { if (fread(&anno8,sizeof(int64),1,afile) != 1)
+ SYSTEM_ERROR
+ anno8 += trackoff;
+ fwrite(&anno8,sizeof(int64),1,aout);
+ }
+ if (fread(&anno8,sizeof(int64),1,afile) != 1)
+ SYSTEM_ERROR
+ dlen = anno8;
+ }
+ trackoff += dlen;
+
+ for (i = 1024; i < dlen; i += 1024) // copy dlen data bytes in 1024-byte chunks ...
+ { if (fread(data,1024,1,dfile) != 1)
+ SYSTEM_ERROR
+ fwrite(data,1024,1,dout);
+ }
+ i -= 1024;
+ if (i < dlen) // ... then the final 1 to 1024 bytes
+ { if (fread(data,dlen-i,1,dfile) != 1)
+ SYSTEM_ERROR
+ fwrite(data,dlen-i,1,dout);
+ }
+ }
+ else // no data file: fixed-size records are copied through unchanged
+ { for (i = 0; i < tracklen; i++)
+ { if (fread(anno,size,1,afile) != 1)
+ SYSTEM_ERROR
+ fwrite(anno,size,1,aout);
+ }
+ }
+
+ tracktot += tracklen;
+ nfiles += 1;
+ if (dfile != NULL) fclose(dfile);
+ fclose(afile);
+ }
+
+ if (nfiles == 0)
+ { fprintf(stderr,"%s: Couldn't find first track block %s1.%s.anno\n",
+ Prog_Name,prefix,argv[2]);
+ goto error;
+ }
+ else
+ { if (dout != NULL) // closing offset record = total data length
+ { if (tracksiz == 4)
+ { int anno4 = trackoff;
+ fwrite(&anno4,sizeof(int),1,aout);
+ }
+ else
+ { int64 anno8 = trackoff;
+ fwrite(&anno8,sizeof(int64),1,aout);
+ }
+ }
+ else
+ { fwrite(anno,tracksiz,1,aout);
+ free(anno);
+ }
+ rewind(aout); // go back and fill in the real header
+ fwrite(&tracktot,sizeof(int),1,aout);
+ fwrite(&tracksiz,sizeof(int),1,aout);
+ }
+ }
+
+ fclose(aout);
+ if (dout != NULL)
+ fclose(dout);
+ free(prefix);
+
+ exit (0);
+
+error: // remove the partially written output files
+ fclose(aout);
+ unlink(Catenate(prefix,argv[2],".","anno"));
+ if (dout != NULL)
+ { fclose(dout);
+ unlink(Catenate(prefix,argv[2],".","data"));
+ }
+ free(prefix);
+
+ exit (1);
+}
diff --git a/DAM2fasta.c b/DAM2fasta.c
new file mode 100644
index 0000000..57ca2f7
--- /dev/null
+++ b/DAM2fasta.c
@@ -0,0 +1,237 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/********************************************************************************************
+ *
+ * Recreate all the .fasta files that are in a specified DAM.
+ *
+ * Author: Gene Myers
+ * Date : May 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "DB.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#else
+#define PATHSEP "/"
+#endif
+
+static char *Usage = "[-vU] [-w<int(80)>] <path:dam>";
+// Recreate in the current directory every .fasta file recorded in the given .dam; exit 0 on success, 1 on error.
+int main(int argc, char *argv[])
+{ HITS_DB _db, *db = &_db;
+ FILE *dbfile, *hdrs;
+ int nfiles;
+ int VERBOSE, UPPER, WIDTH;
+
+ // Process arguments
+
+ { int i, j, k;
+ int flags[128];
+ char *eptr;
+
+ ARG_INIT("DAM2fasta")
+
+ WIDTH = 80;
+
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ switch (argv[i][1])
+ { default:
+ ARG_FLAGS("vU")
+ break;
+ case 'w':
+ ARG_NON_NEGATIVE(WIDTH,"Line width")
+ break;
+ }
+ else
+ argv[j++] = argv[i];
+ argc = j;
+
+ UPPER = 1 + flags['U'];
+ VERBOSE = flags['v'];
+ if (WIDTH == 0) { fprintf(stderr,"%s: Line width must be positive\n",Prog_Name); exit (1); } // a zero width never advances the wrap loops below
+ if (argc != 2)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+ }
+
+ // Open db
+
+ { int status;
+
+ status = Open_DB(argv[1],db);
+ if (status < 0)
+ exit (1);
+ if (status == 0)
+ { fprintf(stderr,"%s: Cannot be called on a .db: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ if (db->part > 0)
+ { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ }
+
+ { char *pwd, *root;
+
+ pwd = PathTo(argv[1]);
+ root = Root(argv[1],".dam");
+ dbfile = Fopen(Catenate(pwd,"/",root,".dam"),"r");
+ hdrs = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"r");
+ free(pwd);
+ free(root);
+ if (dbfile == NULL || hdrs == NULL)
+ exit (1);
+ }
+
+ // nfiles = # of files in data base
+
+ if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
+ SYSTEM_ERROR
+
+ // For each file do:
+
+ { HITS_READ *reads;
+ char *read;
+ int f, first;
+ char nstring[WIDTH+1]; // one full output line of N's (or n's) used to print gaps
+
+ if (UPPER == 2)
+ for (f = 0; f < WIDTH; f++)
+ nstring[f] = 'N';
+ else
+ for (f = 0; f < WIDTH; f++)
+ nstring[f] = 'n';
+ nstring[WIDTH] = '\0';
+
+ reads = db->reads;
+ read = New_Read_Buffer(db);
+ first = 0;
+ for (f = 0; f < nfiles; f++)
+ { int i, last, wpos;
+ FILE *ofile;
+ char prolog[MAX_NAME], fname[MAX_NAME], header[MAX_NAME];
+
+ // Scan db image file line, create .fasta file for writing
+
+ if (fscanf(dbfile,DB_FDATA,&last,fname,prolog) != 3)
+ SYSTEM_ERROR
+
+ if ((ofile = Fopen(Catenate(".","/",fname,".fasta"),"w")) == NULL)
+ exit (1);
+
+ if (VERBOSE)
+ { fprintf(stderr,"Creating %s.fasta ...\n",fname);
+ fflush(stderr); // flush the stream the message was written to
+ }
+
+ // For the relevant range of reads, write each to the file
+ // recreating the original headers with the index meta-data about each read
+
+ wpos = 0; // column reached on the current output line
+ for (i = first; i < last; i++)
+ { int j, len, nlen, w;
+ HITS_READ *r;
+
+ r = reads + i;
+ len = r->rlen;
+
+ if (r->origin == 0) // origin 0 starts a new fasta entry: end the open line and copy its header
+ { if (i != first && wpos != 0)
+ { fprintf(ofile,"\n");
+ wpos = 0;
+ }
+ fseeko(hdrs,r->coff,SEEK_SET);
+ fgets(header,MAX_NAME,hdrs);
+ fputs(header,ofile);
+ }
+
+ if (r->fpulse != 0) // emit the run of N's that precedes this read
+ { if (r->origin != 0)
+ nlen = r->fpulse - (reads[i-1].fpulse + reads[i-1].rlen);
+ else
+ nlen = r->fpulse;
+
+ for (j = 0; j+(w = WIDTH-wpos) <= nlen; j += w)
+ { fprintf(ofile,"%.*s\n",w,nstring);
+ wpos = 0;
+ }
+ if (j < nlen)
+ { fprintf(ofile,"%.*s",nlen-j,nstring);
+ if (j == 0)
+ wpos += nlen;
+ else
+ wpos = nlen-j;
+ }
+ }
+
+ Load_Read(db,i,read,UPPER);
+
+ for (j = 0; j+(w = WIDTH-wpos) <= len; j += w) // full lines first, then the partial tail
+ { fprintf(ofile,"%.*s\n",w,read+j);
+ wpos = 0;
+ }
+ if (j < len)
+ { fprintf(ofile,"%s",read+j);
+ if (j == 0)
+ wpos += len;
+ else
+ wpos = len-j;
+ }
+ }
+ if (wpos > 0)
+ fprintf(ofile,"\n");
+ fclose(ofile); // one handle per source file: close it or large DAMs exhaust descriptors
+ first = last;
+ }
+ }
+
+ fclose(hdrs);
+ fclose(dbfile);
+ Close_DB(db);
+
+ exit (0);
+}
diff --git a/DB.c b/DB.c
new file mode 100644
index 0000000..27b202e
--- /dev/null
+++ b/DB.c
@@ -0,0 +1,1580 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
+ * Compressed data base module. Auxiliary routines to open and manipulate a data base for
+ * which the sequence and read information are separated into two separate files, and the
+ * sequence is compressed into 2-bits for each base. Support for tracks of additional
+ * information, and trimming according to the current partition. Eventually will also
+ * support compressed quality information.
+ *
+ * Author : Gene Myers
+ * Date : July 2013
+ * Revised: April 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <unistd.h>
+#include <dirent.h>
+
+#include "DB.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#else
+#define PATHSEP "/"
+#endif
+
+
+/*******************************************************************************************
+ *
+ * GENERAL UTILITIES
+ *
+ ********************************************************************************************/
+
+char *Prog_Name;
+
+#ifdef INTERACTIVE
+
+char Ebuffer[1000];
+
+#endif
+
+// Allocate size bytes; on failure report "Out of memory" (tagged with mesg when it is non-NULL) and return NULL.
+void *Malloc(int64 size, char *mesg)
+{ void *block = malloc(size);
+  if (block != NULL)
+    return (block);
+  if (mesg != NULL)
+    EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg);
+  else
+    EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name);
+  return (NULL);
+}
+
+// Resize p to size bytes; on failure report "Out of memory" (tagged with mesg when it is non-NULL) and return NULL.
+void *Realloc(void *p, int64 size, char *mesg)
+{ void *grown = realloc(p,size);
+  if (grown == NULL && mesg != NULL)
+    EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg);
+  else if (grown == NULL)
+    EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name);
+  return (grown);
+}
+
+// Duplicate name with strdup.  A NULL name yields NULL silently; an allocation failure is reported (tagged with mesg if given).
+char *Strdup(char *name, char *mesg)
+{ char *copy;
+
+  if (name == NULL)
+    return (NULL);
+  copy = strdup(name);
+  if (copy == NULL && mesg != NULL)
+    EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg);
+  else if (copy == NULL)
+    EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name);
+  return (copy);
+}
+
+// Open name in the given mode.  NULL arguments yield NULL silently; a failed fopen is reported and yields NULL.
+FILE *Fopen(char *name, char *mode)
+{ FILE *stream;
+  if (name == NULL || mode == NULL) return (NULL);
+  stream = fopen(name,mode);
+  if (stream == NULL)
+    EPRINTF(EPLACE,"%s: Cannot open %s for '%s'\n",Prog_Name,name,mode);
+  return (stream);
+}
+
+// Return a malloc'd copy of the directory part of name (the text before its last '/'), or "." when name has no '/'.
+//   NULL name or allocation failure yields NULL.  name is only read, never written, so read-only strings are safe.
+char *PathTo(char *name)
+{ char *path, *find;
+  if (name == NULL) return (NULL);
+  if ((find = strrchr(name,'/')) == NULL)      // strrchr is the ISO C spelling of the legacy rindex
+    return (Strdup(".","Allocating default path"));
+  path = (char *) Malloc((find-name)+1,"Extracting path from");
+  if (path != NULL)
+    { memcpy(path,name,find-name);
+      path[find-name] = '\0';
+    }
+  return (path);
+}
+
+// Return a malloc'd copy of the base name of name (the text after its last '/') with its extension removed:
+//   suffix == NULL: the base name is cut at its first '.';
+//   otherwise: suffix is cut off when the base name ends with it (case-insensitive) and something remains in front.
+// NULL name or allocation failure yields NULL.  name is only read, never written.
+char *Root(char *name, char *suffix)
+{ char *path, *find, *dot, *mesg;
+  int   keep, epos;      // keep = number of base-name characters to copy
+
+  if (name == NULL) return (NULL);
+  find = strrchr(name,'/');
+  find = (find == NULL) ? name : find+1;
+  keep = strlen(find);
+  mesg = "Extracting root from";
+  if (suffix == NULL)
+    { dot = strchr(find,'.');
+      if (dot != NULL)
+        keep = dot-find;
+    }
+  else
+    { epos = keep - (int) strlen(suffix);
+      if (epos > 0 && strcasecmp(find+epos,suffix) == 0)
+        keep = epos;
+      else
+        mesg = "Allocating root";
+    }
+  path = (char *) Malloc(keep+1,mesg);
+  if (path != NULL)
+    { memcpy(path,find,keep);
+      path[keep] = '\0';
+    }
+  return (path);
+}
+
+// Return path+sep+root+suffix in a static buffer that the next call reuses; NULL if an argument is NULL or memory runs out.
+char *Catenate(char *path, char *sep, char *root, char *suffix)
+{ static char *cat = NULL;
+  static int   max = -1;
+  int          len;
+
+  if (path == NULL || root == NULL || sep == NULL || suffix == NULL)
+    return (NULL);
+  len = strlen(path) + strlen(sep) + strlen(root) + strlen(suffix);
+  if (len > max)
+    { int   nmax = ((int) (1.2*len)) + 100;
+      char *ncat = (char *) realloc(cat,nmax+1);
+      if (ncat == NULL)     // keep the old buffer and its recorded size so later, shorter requests still work
+        { EPRINTF(EPLACE,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root);
+          return (NULL);
+        }
+      cat = ncat; max = nmax;
+    }
+  sprintf(cat,"%s%s%s%s",path,sep,root,suffix);
+  return (cat);
+}
+
+// Return left+<num in decimal>+right in a static buffer that the next call reuses; NULL on NULL input or out of memory.
+char *Numbered_Suffix(char *left, int num, char *right)
+{ static char *suffix = NULL;
+  static int   max = -1;
+  int          len;
+  if (left == NULL || right == NULL) return (NULL);
+  len = strlen(left) + strlen(right) + 40;       // 40 bytes is ample room for the digits of an int
+  if (len > max)
+    { int   nmax = ((int) (1.2*len)) + 100;
+      char *nsuf = (char *) realloc(suffix,nmax+1);
+      if (nsuf == NULL)     // keep the old buffer and its recorded size so later, shorter requests still work
+        { EPRINTF(EPLACE,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num);
+          return (NULL);
+        }
+      suffix = nsuf; max = nmax;
+    }
+  sprintf(suffix,"%s%d%s",left,num,right);
+  return (suffix);
+}
+
+
+#define COMMA ','
+
+// Print num on out with a comma in front of each of its (at most three) lowest 3-digit groups; whatever lies
+//   above 10^9 is printed as one leading group.  A non-zero width is handed to printf as the field width of the
+//   leading group after subtracting 4 columns (comma + 3 digits) for every group that follows it.
+
+void Print_Number(int64 num, int width, FILE *out)
+{ long long lead;       // the digits printed in front of the first comma
+  int       groups;     // how many 3-digit groups follow lead (0 to 3)
+
+  // Decide how many groups to split off
+
+  if (num < 1000ll)
+    { lead   = num;
+      groups = 0;
+    }
+  else if (num < 1000000ll)
+    { lead   = num/1000ll;
+      groups = 1;
+    }
+  else if (num < 1000000000ll)
+    { lead   = num/1000000ll;
+      groups = 2;
+    }
+  else
+    { lead   = num/1000000000ll;
+      groups = 3;
+    }
+
+  // The leading group is padded only when width leaves room beyond the trailing groups
+  //   (a number without commas always gets the full width, and a width of 0 pads nothing)
+
+  if (groups == 0 || width > 4*groups)
+    fprintf(out,"%*lld",width-4*groups,lead);
+  else
+    fprintf(out,"%lld",lead);
+
+  // Trailing groups, most significant first, each zero-filled to 3 digits
+
+  if (groups >= 3)
+    fprintf(out,"%c%03lld",COMMA,(num%1000000000ll)/1000000ll);
+  if (groups >= 2)
+    fprintf(out,"%c%03lld",COMMA,(num%1000000ll)/1000ll);
+  if (groups >= 1)
+    fprintf(out,"%c%03lld",COMMA,num%1000ll);
+}
+
+
+// Count the base-10 digits of num.  Any num below 1 (zero or negative) yields 0.
+
+int Number_Digits(int64 num)
+{ int   count;
+
+  // Drop one decimal digit per pass until nothing is left
+
+  for (count = 0; num >= 1; count++)
+    num /= 10;
+
+  return (count);
+}
+
+
+/*******************************************************************************************
+ *
+ * READ COMPRESSION/DECOMPRESSION UTILITIES
+ *
+ ********************************************************************************************/
+
+// Pack a read held one base per byte (values 0-3) into 2 bits per base, in place: output byte k receives
+//   bases 4k..4k+3 with the first base in the highest bits.  The buffer must extend through s[len+2];
+//   s[len] is left as 0, while s[len+1] and s[len+2] are put back as they were.
+
+void Compress_Read(int len, char *s)
+{ char  keep1, keep2;
+  char *out;
+  int   b;
+
+  keep1 = s[len+1];      // these bytes are only borrowed as zero padding for the last group of four
+  keep2 = s[len+2];
+  s[len] = s[len+1] = s[len+2] = 0;
+
+  out = s;
+  for (b = 0; b < len; b += 4)
+    { char quad = (char ) ((s[b] << 6) | (s[b+1] << 4) | (s[b+2] << 2) | s[b+3]);
+      *out++ = quad;
+    }
+
+  s[len+1] = keep1;
+  s[len+2] = keep2;
+}
+
+// Expand a read packed 2 bits per base back into one base per byte (values 0-3), in place, and mark its
+//   end by storing the value 4 in s[len].  Whole groups of four are written, so the buffer must be large
+//   enough for len rounded up to a multiple of 4.
+
+void Uncompress_Read(int len, char *s)
+{ int   k, packed;
+  char *quad;
+
+  // Work from the last packed byte down to the first so that no packed byte is overwritten before it is read
+
+  for (k = (len-1)/4; k >= 0; k--)
+    { packed = s[k];
+      quad   = s + 4*k;
+
+      quad[0] = (char) ((packed >> 6) & 0x3);
+      quad[1] = (char) ((packed >> 4) & 0x3);
+      quad[2] = (char) ((packed >> 2) & 0x3);
+      quad[3] = (char) (packed & 0x3);
+    }
+
+  // Terminator of the numeric representation
+
+  s[len] = 4;
+}
+
+// Convert a read from numeric form (bytes 0-3, terminated by the value 4) to lower-case ascii, in place, ending with '\0'
+
+void Lower_Read(char *s)
+{ static char base[4] = { 'a', 'c', 'g', 't' };
+  int   i;
+  for (i = 0; s[i] != 4; i++)
+    s[i] = base[(int) s[i]];
+  s[i] = '\0';
+}
+
+void Upper_Read(char *s)   // numeric read (bytes 0-3, terminated by 4) -> upper-case ascii, in place, ending with '\0'
+{ static char base[4] = { 'A', 'C', 'G', 'T' };
+  int   i;
+  for (i = 0; s[i] != 4; i++)
+    s[i] = base[(int) s[i]];
+  s[i] = '\0';
+}
+
+// Convert read in ascii representation to [0-3] representation (end with 4), in place:
+//   a/A -> 0, c/C -> 1, g/G -> 2, t/T -> 3, and every other character (for example 'n') -> 0.
+//   Bytes outside the 7-bit ascii range are also mapped to 0.
+
+void Number_Read(char *s)
+{ static const char number[128] =
+    { ['a'] = 0, ['A'] = 0,
+      ['c'] = 1, ['C'] = 1,
+      ['g'] = 2, ['G'] = 2,
+      ['t'] = 3, ['T'] = 3,     // all entries not listed are 0
+    };
+
+  // Look each character up through unsigned char: where plain char is signed, a byte >= 0x80
+  //   would otherwise become a negative subscript into the table.
+
+  for ( ; *s != '\0'; s++)
+    { unsigned char c = (unsigned char) *s;
+
+      *s = (c < 128) ? number[c] : 0;
+    }
+
+  // Terminator of the numeric representation
+
+  *s = 4;
+}
+
+
+
+/*******************************************************************************************
+ *
+ * DB OPEN, TRIM & CLOSE ROUTINES
+ *
+ ********************************************************************************************/
+
+
+// Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has
+// a part # in it then just the part is opened. The index array is allocated (for all or
+// just the part) and read in. The array is stored one record past its allocation base (db->reads-1).
+// Return status of routine:
+// -1: The DB could not be opened for a reason reported by the routine to EPLACE (*db is left unchanged)
+// 0: Open of DB proceeded without mishap
+// 1: Open of DAM proceeded without mishap
+
+int Open_DB(char* path, HITS_DB *db)
+{ HITS_DB dbcopy;
+ char *root, *pwd, *bptr, *fptr, *cat;
+ int nreads;
+ FILE *index, *dbvis;
+ int status, plen, isdam;
+ int part, cutoff, all;
+ int ufirst, tfirst, ulast, tlast;
+
+ status = -1;
+ dbcopy = *db; // saved so a failed open can restore the caller's record
+
+ plen = strlen(path);
+ if (plen >= 4 && strcmp(path+(plen-4),".dam") == 0) // never look in front of a path shorter than the suffix
+ root = Root(path,".dam");
+ else
+ root = Root(path,".db");
+ pwd = PathTo(path);
+ if (root == NULL || pwd == NULL) { free(root); free(pwd); return (-1); } // cannot proceed without both names
+ bptr = strrchr(root,'.');
+ if (bptr != NULL && bptr[1] != '\0' && bptr[1] != '-') // a trailing ".<number>" selects a block of the DB
+ { part = strtol(bptr+1,&fptr,10);
+ if (*fptr != '\0' || part == 0)
+ part = 0;
+ else
+ *bptr = '\0';
+ }
+ else
+ part = 0;
+
+ isdam = 0;
+ cat = Catenate(pwd,"/",root,".db");
+ if (cat == NULL)
+ goto error; // release pwd and root instead of returning directly
+ if ((dbvis = fopen(cat,"r")) == NULL)
+ { cat = Catenate(pwd,"/",root,".dam");
+ if (cat == NULL)
+ goto error;
+ if ((dbvis = fopen(cat,"r")) == NULL)
+ { EPRINTF(EPLACE,"%s: Could not open database %s\n",Prog_Name,path);
+ goto error;
+ }
+ isdam = 1;
+ }
+
+ if ((index = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r")) == NULL)
+ goto error1;
+ if (fread(db,sizeof(HITS_DB),1,index) != 1)
+ { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
+ goto error2;
+ }
+
+ { int p, nblocks, nfiles;
+ int64 size;
+ char fname[MAX_NAME], prolog[MAX_NAME];
+
+ nblocks = 0;
+ if (fscanf(dbvis,DB_NFILE,&nfiles) != 1)
+ { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+ goto error2;
+ }
+ for (p = 0; p < nfiles; p++)
+ if (fscanf(dbvis,DB_FDATA,&tlast,fname,prolog) != 3)
+ { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+ goto error2;
+ }
+ if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1) // no block line: the DB has not been partitioned
+ if (part == 0)
+ { cutoff = 0;
+ all = 1;
+ }
+ else
+ { EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n",
+ Prog_Name,root);
+ goto error2;
+ }
+ else
+ { if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3)
+ { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+ goto error2;
+ }
+ if (part > nblocks)
+ { EPRINTF(EPLACE,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks);
+ goto error2;
+ }
+ }
+
+ if (part > 0) // read block boundaries up to the requested part, then the one after it
+ { for (p = 1; p <= part; p++)
+ if (fscanf(dbvis,DB_BDATA,&ufirst,&tfirst) != 2)
+ { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+ goto error2;
+ }
+ if (fscanf(dbvis,DB_BDATA,&ulast,&tlast) != 2)
+ { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+ goto error2;
+ }
+ }
+ else
+ { ufirst = tfirst = 0;
+ ulast = db->ureads;
+ tlast = db->treads;
+ }
+ }
+
+ db->trimmed = 0;
+ db->tracks = NULL;
+ db->part = part;
+ db->cutoff = cutoff;
+ db->all = all;
+ db->ufirst = ufirst;
+ db->tfirst = tfirst;
+
+ nreads = ulast-ufirst;
+ if (part <= 0)
+ { db->reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index");
+ if (db->reads == NULL) goto error2; db->reads += 1; // check before stepping past the hidden leading record
+ if (fread(db->reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads)
+ { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
+ free(db->reads-1); // free the allocation base, not the advanced pointer
+ goto error2;
+ }
+ }
+ else
+ { HITS_READ *reads;
+ int i, r, maxlen;
+ int64 totlen;
+
+ reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index");
+ if (reads == NULL) goto error2; reads += 1; // check before stepping past the hidden leading record
+
+ fseeko(index,sizeof(HITS_READ)*ufirst,SEEK_CUR);
+ if (fread(reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads)
+ { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
+ free(reads-1); // free the allocation base, not the advanced pointer
+ goto error2;
+ }
+
+ totlen = 0;
+ maxlen = 0;
+ for (i = 0; i < nreads; i++)
+ { r = reads[i].rlen;
+ totlen += r;
+ if (r > maxlen)
+ maxlen = r;
+ }
+
+ db->maxlen = maxlen;
+ db->totlen = totlen;
+ db->reads = reads;
+ }
+
+ ((int *) (db->reads))[-1] = ulast - ufirst; // Kludge, need these for DB part
+ ((int *) (db->reads))[-2] = tlast - tfirst;
+
+ db->nreads = nreads;
+ db->path = Strdup(Catenate(pwd,PATHSEP,root,""),"Allocating Open_DB path");
+ if (db->path == NULL)
+ { free(db->reads-1); goto error2; } // do not leak the index when the path cannot be stored
+ db->bases = NULL;
+ db->loaded = 0;
+
+ status = isdam;
+
+error2:
+ fclose(index);
+error1:
+ fclose(dbvis);
+error:
+ if (bptr != NULL)
+ *bptr = '.';
+
+ free(pwd);
+ free(root);
+
+ if (status < 0)
+ *db = dbcopy;
+
+ return (status);
+}
+
+
+// Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings
+// of the current DB partition. Reallocate smaller memory blocks for the information kept
+// for the retained reads. Does nothing if the DB is already trimmed or the settings cannot remove any read.
+
+void Trim_DB(HITS_DB *db)
+{ int i, j, r;
+ int allflag, cutoff;
+ int64 totlen;
+ int maxlen, nreads;
+ HITS_TRACK *record;
+ HITS_READ *reads;
+
+ if (db->trimmed) return;
+
+ if (db->cutoff <= 0 && db->all) return; // no length cutoff and all reads wanted: nothing to drop
+
+ cutoff = db->cutoff;
+ if (db->all)
+ allflag = 0;
+ else
+ allflag = DB_BEST; // when 'all' is off, a read is kept only if its DB_BEST flag bit is set
+
+ reads = db->reads;
+ nreads = db->nreads;
+ // Pass 1: compact every loaded track, using the same keep test that pass 2 applies to the reads
+ for (record = db->tracks; record != NULL; record = record->next)
+ if (strcmp(record->name,". at qvs") == 0) // NOTE(review): ". at qvs" looks like mail-archive obfuscation of ".@qvs" -- confirm against upstream
+ { uint16 *table = ((HITS_QV *) record)->table;
+
+ j = 0;
+ for (i = 0; i < db->nreads; i++)
+ if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff)
+ table[j++] = table[i];
+ }
+ else
+ { int *anno4, size;
+ int64 *anno8;
+ char *anno, *data;
+
+ size = record->size;
+ data = (char *) record->data;
+ if (data == NULL) // track without data: anno holds one size-byte record per read
+ { anno = (char *) record->anno;
+ j = 0;
+ for (i = r = 0; i < db->nreads; i++, r += size)
+ if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff)
+ { memmove(anno+j,anno+r,size);
+ j += size;
+ }
+ memmove(anno+j,anno+r,size); // also carry over the one extra record stored after the last read
+ }
+ else if (size == 4) // anno holds 4-byte offsets into data; entry i+1 marks the end of read i's data
+ { int ai;
+
+ anno4 = (int *) (record->anno);
+ j = anno4[0] = 0;
+ for (i = 0; i < db->nreads; i++)
+ if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff)
+ { ai = anno4[i]; // read the old start before slot j+1 is overwritten
+ anno4[j+1] = anno4[j] + (anno4[i+1]-ai);
+ memmove(data+anno4[j],data+ai,anno4[i+1]-ai);
+ j += 1;
+ }
+ record->data = Realloc(record->data,anno4[j],NULL); // shrink data to the bytes retained
+ }
+ else // size == 8
+ { int64 ai;
+
+ anno8 = (int64 *) (record->anno);
+ j = anno8[0] = 0;
+ for (i = 0; i < db->nreads; i++)
+ if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff)
+ { ai = anno8[i];
+ anno8[j+1] = anno8[j] + (anno8[i+1]-ai);
+ memmove(data+anno8[j],data+ai,anno8[i+1]-ai);
+ j += 1;
+ }
+ record->data = Realloc(record->data,anno8[j],NULL);
+ }
+ record->anno = Realloc(record->anno,record->size*(j+1),NULL);
+ }
+ // Pass 2: compact the read records themselves and recompute the totals
+ totlen = maxlen = 0;
+ for (j = i = 0; i < nreads; i++)
+ { r = reads[i].rlen;
+ if ((reads[i].flags & DB_BEST) >= allflag && r >= cutoff)
+ { totlen += r;
+ if (r > maxlen)
+ maxlen = r;
+ reads[j++] = reads[i];
+ }
+ }
+
+ db->totlen = totlen;
+ db->maxlen = maxlen;
+ db->nreads = j;
+ db->trimmed = 1;
+
+ if (j < nreads)
+ { db->reads = Realloc(reads-1,sizeof(HITS_READ)*(j+2),NULL); // the allocation starts one record before reads
+ db->reads += 1;
+ }
+}
+
+// Release everything an open 'db' owns: the bases (a loaded buffer, or else the file handle kept in db->bases),
+// the read index, the path string, the QV state (via Close_QVs) and every record on the db->tracks list.
+// The HITS_DB record that db points at is not freed here; it stays with the caller.
+
+void Close_DB(HITS_DB *db)
+{ HITS_TRACK *track, *next;
+
+  if (db->loaded)
+    free(((char *) (db->bases)) - 1);
+  else if (db->bases != NULL)
+    fclose((FILE *) db->bases);
+  free(db->reads-1);
+  free(db->path);
+  Close_QVs(db);
+  track = db->tracks;
+  while (track != NULL)
+    { next = track->next;      // fetch the link before the record is freed
+      free(track->anno);
+      free(track->data);
+      free(track);
+      track = next;
+    }
+}
+
+
+/*******************************************************************************************
+ *
+ * QV LOAD & CLOSE ROUTINES
+ *
+ ********************************************************************************************/
+
+// Cache used by Load_QVentry: when called again with the same db it skips re-checking that
+// the QV pseudo-track is loaded. Close_QVs resets Active_DB to NULL.
+HITS_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry"
+HITS_QV *Active_QV; // Becomes invalid after closing
+
+// Load the QV coding schemes for 'db' and push a HITS_QV pseudo-track named ". at qvs"
+//   onto the front of db's track list.  For every read in the active portion of the DB
+//   the pseudo-track records which coding scheme (one per source file in the stub) applies.
+//
+//   Returns 0 on success or if the pseudo-track is already present, -1 if the .qvs file
+//   cannot be opened, and EXITs with 1 on any other error (after releasing everything
+//   acquired so far).  QVs cannot be loaded once the DB has been trimmed.
+
+int Load_QVs(HITS_DB *db)
+{ FILE     *quiva, *istub, *indx;
+  char     *root;
+  uint16   *table;
+  HITS_QV  *qvtrk;
+  QVcoding *coding, *nx;
+  int       ncodes;
+
+  if (db->tracks != NULL && strcmp(db->tracks->name,". at qvs") == 0)
+    return (0);
+
+  if (db->trimmed)
+    { EPRINTF(EPLACE,"%s: Cannot load QVs after trimming the DB\n",Prog_Name);
+      EXIT(1);
+    }
+
+  if (db->reads[db->nreads-1].coff < 0)
+    { EPRINTF(EPLACE,"%s: The requested QVs have not been added to the DB!\n",Prog_Name);
+      EXIT(1);
+    }
+
+  //  Open .qvs, .idx, and .db files
+
+  quiva = Fopen(Catenate(db->path,"","",".qvs"),"r");
+  if (quiva == NULL)
+    return (-1);
+
+  //  Everything the error handler releases starts out empty; ncodes is the number of
+  //    entries of 'coding' that have actually been filled in and so need Free_QVcoding.
+
+  istub  = NULL;
+  indx   = NULL;
+  table  = NULL;
+  coding = NULL;
+  qvtrk  = NULL;
+  ncodes = 0;
+
+  //  NOTE(review): assumes db->path always contains a '/' followed by one more character
+  //    before the root name (presumably the '.' of a hidden file) -- confirm against Open_DB.
+
+  root  = rindex(db->path,'/') + 2;
+  istub = Fopen(Catenate(db->path,"/",root,".db"),"r");
+  if (istub == NULL)
+    goto error;
+
+  { int   first, last, nfiles;
+    char  prolog[MAX_NAME], fname[MAX_NAME];
+    int   i, j;
+
+    if (fscanf(istub,DB_NFILE,&nfiles) != 1)
+      { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+        goto error;
+      }
+
+    if (db->part > 0)
+      { int pfirst, plast;
+        int fbeg, fend;
+        int n, k;
+
+        //  NB: the .idx handle is kept in the function-level 'indx' (not a block-local
+        //    variable) so that the error handler below can see and close it.
+
+        //  Determine first how many and which files span the block (fbeg to fend)
+
+        pfirst = db->ufirst;
+        plast  = pfirst + db->nreads;
+
+        first = 0;
+        for (fbeg = 0; fbeg < nfiles; fbeg++)
+          { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
+              { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+                goto error;
+              }
+            if (last > pfirst)
+              break;
+            first = last;
+          }
+        for (fend = fbeg+1; fend <= nfiles; fend++)
+          { if (last >= plast)
+              break;
+            if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
+              { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+                goto error;
+              }
+            first = last;
+          }
+
+        indx   = Fopen(Catenate(db->path,"","",".idx"),"r");
+        ncodes = fend-fbeg;
+        coding = (QVcoding *) Malloc(sizeof(QVcoding)*ncodes,"Allocating coding schemes");
+        table  = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices");
+        if (indx == NULL || coding == NULL || table == NULL)
+          { ncodes = 0;
+            goto error;
+          }
+
+        //  Carefully get the first coding scheme (its offset is most likely in a HITS_RECORD
+        //    in .idx that is *not* in memory).  Get all the other coding schemes normally and
+        //    assign the tables # for each read in the block in "tables".
+
+        //  Second pass over the stub: these lines were already parsed successfully above
+
+        rewind(istub);
+        fscanf(istub,DB_NFILE,&nfiles);
+
+        first = 0;
+        for (n = 0; n < fbeg; n++)
+          { fscanf(istub,DB_FDATA,&last,fname,prolog);
+            first = last;
+          }
+
+        for (n = fbeg; n < fend; n++)
+          { fscanf(istub,DB_FDATA,&last,fname,prolog);
+
+            i = n-fbeg;
+            if (first < pfirst)
+              { HITS_READ read;
+
+                //  First read of this file lies before the block: fetch its record from .idx
+
+                fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*first,SEEK_SET);
+                if (fread(&read,sizeof(HITS_READ),1,indx) != 1)
+                  { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
+                    ncodes = i;
+                    goto error;
+                  }
+                fseeko(quiva,read.coff,SEEK_SET);
+                nx = Read_QVcoding(quiva);
+                if (nx == NULL)
+                  { ncodes = i;
+                    goto error;
+                  }
+                coding[i] = *nx;
+              }
+            else
+              { fseeko(quiva,db->reads[first-pfirst].coff,SEEK_SET);
+                nx = Read_QVcoding(quiva);
+                if (nx == NULL)
+                  { ncodes = i;
+                    goto error;
+                  }
+                coding[i] = *nx;
+                db->reads[first-pfirst].coff = ftello(quiva);   //  Skip past the coding table
+              }
+
+            //  Reads [first,last) clipped to the block all use coding scheme i
+
+            j = first-pfirst;
+            if (j < 0)
+              j = 0;
+            k = last-pfirst;
+            if (k > db->nreads)
+              k = db->nreads;
+            while (j < k)
+              table[j++] = (uint16) i;
+
+            first = last;
+          }
+
+        fclose(indx);
+        indx = NULL;
+      }
+
+    else
+      { //  Load in coding scheme for each file, adjust .coff of first read in the file, and
+        //    record which table each read uses
+
+        ncodes = nfiles;
+        coding = (QVcoding *) Malloc(sizeof(QVcoding)*nfiles,"Allocating coding schemes");
+        table  = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices");
+        if (coding == NULL || table == NULL)
+          { ncodes = 0;                  //  No coding entry has been filled in yet
+            goto error;
+          }
+
+        first = 0;
+        for (i = 0; i < nfiles; i++)
+          { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
+              { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+                ncodes = i;              //  Only coding[0..i-1] are valid
+                goto error;
+              }
+
+            fseeko(quiva,db->reads[first].coff,SEEK_SET);
+            nx = Read_QVcoding(quiva);
+            if (nx == NULL)
+              { ncodes = i;
+                goto error;
+              }
+            coding[i] = *nx;
+            db->reads[first].coff = ftello(quiva);
+
+            for (j = first; j < last; j++)
+              table[j] = (uint16) i;
+
+            first = last;
+          }
+      }
+
+    //  Allocate and fill in the HITS_QV record and add it to the front of the
+    //    track list
+
+    qvtrk = (HITS_QV *) Malloc(sizeof(HITS_QV),"Allocating QV pseudo-track");
+    if (qvtrk == NULL)
+      goto error;
+    qvtrk->name = Strdup(". at qvs","Allocating QV pseudo-track name");
+    if (qvtrk->name == NULL)
+      goto error;
+    qvtrk->next   = db->tracks;
+    db->tracks    = (HITS_TRACK *) qvtrk;
+    qvtrk->ncodes = ncodes;
+    qvtrk->table  = table;
+    qvtrk->coding = coding;
+    qvtrk->quiva  = quiva;               //  Ownership of the open .qvs stream moves to the track
+  }
+
+  fclose(istub);
+  return (0);
+
+error:
+  if (qvtrk != NULL)
+    free(qvtrk);
+  if (table != NULL)
+    free(table);
+  if (coding != NULL)
+    { int i;
+      for (i = 0; i < ncodes; i++)
+        Free_QVcoding(coding+i);
+      free(coding);
+    }
+  if (indx != NULL)
+    fclose(indx);
+  if (istub != NULL)
+    fclose(istub);
+  fclose(quiva);
+  EXIT(1);
+}
+
+// Close the QV stream, free the QV pseudo track and all associated memory.  Does nothing
+//   (other than resetting the Load_QVentry cache) if the first track is not the QV
+//   pseudo-track.
+
+void Close_QVs(HITS_DB *db)
+{ HITS_TRACK *track;
+  HITS_QV    *qvtrk;
+  int         i;
+
+  Active_DB = NULL;      //  Invalidate the cache used by Load_QVentry
+
+  track = db->tracks;
+  if (track != NULL && strcmp(track->name,". at qvs") == 0)
+    { qvtrk = (HITS_QV *) track;
+      for (i = 0; i < qvtrk->ncodes; i++)
+        Free_QVcoding(qvtrk->coding+i);
+      free(qvtrk->coding);
+      free(qvtrk->table);
+      fclose(qvtrk->quiva);
+      db->tracks = track->next;
+      free(qvtrk->name);   //  Name was Strdup'ed by Load_QVs
+      free(track);
+    }
+  return;
+}
+
+
+/*******************************************************************************************
+ *
+ * TRACK LOAD & CLOSE ROUTINES
+ *
+ ********************************************************************************************/
+
+// Return status of track:
+//     1: Track is for trimmed DB
+//     0: Track is for untrimmed DB
+//    -1: Track is not the right size of DB either trimmed or untrimmed
+//        (also returned if the length header of the .anno file cannot be read)
+//    -2: Could not find the track
+
+int Check_Track(HITS_DB *db, char *track)
+{ FILE *afile;
+  int   tracklen, ispart;
+  int   ureads, treads;
+
+  //  If db is a block, prefer a block-specific .anno file, else fall back to the one
+  //    for the whole DB
+
+  afile = NULL;
+  if (db->part > 0)
+    { afile  = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r");
+      ispart = 1;
+    }
+  if (afile == NULL)
+    { afile  = fopen(Catenate(db->path,".",track,".anno"),"r");
+      ispart = 0;
+    }
+  if (afile == NULL)
+    return (-2);
+
+  if (fread(&tracklen,sizeof(int),1,afile) != 1)
+    { fclose(afile);       //  Do not leak the stream on a short/corrupt file
+      return (-1);
+    }
+
+  fclose(afile);
+
+  //  For a block-specific track compare against the block sizes stashed in the ints
+  //    immediately preceding reads[0], otherwise against the sizes of the whole DB
+
+  if (ispart)
+    { ureads = ((int *) (db->reads))[-1];
+      treads = ((int *) (db->reads))[-2];
+    }
+  else
+    { ureads = db->ureads;
+      treads = db->treads;
+    }
+
+  if (tracklen == treads)
+    return (1);
+  else if (tracklen == ureads)
+    return (0);
+  else
+    return (-1);
+}
+
+// If track is not already in the db's track list, then allocate all the storage for it,
+//   read it in from the appropriate file, add it to the track list, and return a pointer
+//   to the newly created HITS_TRACK record.  If the track does not exist then NULL is
+//   returned; any other failure (junk files, size mismatch with the DB, out of memory)
+//   goes through EXIT(NULL) after all partially acquired resources have been released.
+//
+//   The new record is inserted after the QV pseudo-track if one is present, otherwise
+//   at the front of the list.
+
+HITS_TRACK *Load_Track(HITS_DB *db, char *track)
+{ FILE       *afile, *dfile;
+  int         tracklen, size;
+  int         nreads, ispart;
+  int         treads, ureads;
+  void       *anno;
+  void       *data;
+  char       *name;
+  HITS_TRACK *record;
+
+  if (track[0] == '.')
+    { EPRINTF(EPLACE,"%s: Track name, '%s', cannot begin with a .\n",Prog_Name,track);
+      EXIT(NULL);
+    }
+
+  for (record = db->tracks; record != NULL; record = record->next)
+    if (strcmp(record->name,track) == 0)
+      return (record);
+
+  //  If db is a block, prefer a block-specific .anno file, else use the whole-DB one
+
+  afile = NULL;
+  if (db->part)
+    { afile  = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r");
+      ispart = 1;
+    }
+  if (afile == NULL)
+    { afile  = fopen(Catenate(db->path,".",track,".anno"),"r");
+      ispart = 0;
+    }
+  if (afile == NULL)
+    { EPRINTF(EPLACE,"%s: Track '%s' does not exist\n",Prog_Name,track);
+      return (NULL);
+    }
+
+  dfile  = NULL;
+  anno   = NULL;
+  data   = NULL;
+  record = NULL;
+
+  //  The .data file is optional: its absence means fixed-size annotation records only
+
+  if (ispart)
+    name = Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".data");
+  else
+    name = Catenate(db->path,".",track,".data");
+  if (name == NULL)
+    goto error;
+  dfile = fopen(name,"r");
+
+  if (fread(&tracklen,sizeof(int),1,afile) != 1)
+    { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
+      goto error;
+    }
+  if (fread(&size,sizeof(int),1,afile) != 1)
+    { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
+      goto error;
+    }
+  if (size <= 0)
+    { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
+      goto error;
+    }
+
+  if (ispart)
+    { ureads = ((int *) (db->reads))[-1];
+      treads = ((int *) (db->reads))[-2];
+    }
+  else
+    { ureads = db->ureads;
+      treads = db->treads;
+    }
+
+  //  The track must match the current (trimmed or untrimmed) state of the DB.  When a
+  //    whole-DB track is used for a block, skip to the block's first record; the offset
+  //    is computed in 64 bits so that size * index cannot overflow an int.
+
+  if (db->trimmed)
+    { if (tracklen != treads)
+        { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track);
+          goto error;
+        }
+      if ( ! ispart && db->part > 0)
+        fseeko(afile,((int64) size)*db->tfirst,SEEK_CUR);
+    }
+  else
+    { if (tracklen != ureads)
+        { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track);
+          goto error;
+        }
+      if ( ! ispart && db->part > 0)
+        fseeko(afile,((int64) size)*db->ufirst,SEEK_CUR);
+    }
+  nreads = db->nreads;
+
+  anno = (void *) Malloc(((int64) size)*(nreads+1),"Allocating Track Anno Vector");
+  if (anno == NULL)
+    goto error;
+
+  if (dfile != NULL)
+    { int64 *anno8, off8, dlen;
+      int   *anno4, off4;
+      int    i;
+
+      //  anno holds nreads+1 offsets into the data file; rebase them so that the first
+      //    is 0 and read just the span of data they cover
+
+      if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1))
+        { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
+          goto error;
+        }
+
+      if (size == 4)
+        { anno4 = (int *) anno;
+          off4  = anno4[0];
+          if (off4 != 0)
+            { for (i = 0; i <= nreads; i++)
+                anno4[i] -= off4;
+              fseeko(dfile,off4,SEEK_SET);
+            }
+          dlen = anno4[nreads];
+          data = (void *) Malloc(dlen,"Allocating Track Data Vector");
+        }
+      else
+        { anno8 = (int64 *) anno;
+          off8  = anno8[0];
+          if (off8 != 0)
+            { for (i = 0; i <= nreads; i++)
+                anno8[i] -= off8;
+              fseeko(dfile,off8,SEEK_SET);
+            }
+          dlen = anno8[nreads];
+          data = (void *) Malloc(dlen,"Allocating Track Data Vector");
+        }
+      if (data == NULL)
+        goto error;
+      if (dlen > 0)
+        { if (fread(data,dlen,1,dfile) != 1)
+            { EPRINTF(EPLACE,"%s: Track '%s' data file is junk\n",Prog_Name,track);
+              goto error;
+            }
+        }
+      fclose(dfile);
+      dfile = NULL;
+    }
+  else
+    { if (fread(anno,size,nreads,afile) != (size_t) nreads)
+        { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
+          goto error;
+        }
+      data = NULL;
+    }
+
+  fclose(afile);
+  afile = NULL;          //  So the error handler below does not close it a second time
+
+  record = (HITS_TRACK *) Malloc(sizeof(HITS_TRACK),"Allocating Track Record");
+  if (record == NULL)
+    goto error;
+  record->name = Strdup(track,"Allocating Track Name");
+  if (record->name == NULL)
+    goto error;
+  record->data = data;
+  record->anno = anno;
+  record->size = size;
+
+  if (db->tracks != NULL && strcmp(db->tracks->name,". at qvs") == 0)
+    { record->next     = db->tracks->next;
+      db->tracks->next = record;
+    }
+  else
+    { record->next = db->tracks;
+      db->tracks   = record;
+    }
+
+  return (record);
+
+error:
+  free(record);          //  free(NULL) is a no-op
+  free(data);
+  free(anno);
+  if (dfile != NULL)
+    fclose(dfile);
+  if (afile != NULL)
+    fclose(afile);
+  EXIT (NULL);
+}
+
+// Remove the first track named 'track' from db's track list and free its annotation
+//   vector, data block, name, and record.  Does nothing if no such track is loaded.
+
+void Close_Track(HITS_DB *db, char *track)
+{ HITS_TRACK **link;
+  HITS_TRACK  *hit;
+
+  //  'link' always addresses the pointer that leads to the record being examined, so
+  //    unlinking is the same operation at the head and in the middle of the list
+
+  for (link = &(db->tracks); *link != NULL; link = &((*link)->next))
+    { hit = *link;
+      if (strcmp(hit->name,track) != 0)
+        continue;
+
+      *link = hit->next;
+      free(hit->anno);
+      free(hit->data);
+      free(hit->name);
+      free(hit);
+      return;
+    }
+}
+
+
+/*******************************************************************************************
+ *
+ * READ BUFFER ALLOCATION AND READ ACCESS
+ *
+ ********************************************************************************************/
+
+// Allocate a buffer of db->maxlen+4 bytes and return a pointer to its second byte, so
+//   that the byte before the returned pointer is available for a leading delimiter.
+//   EXITs with NULL if the allocation fails.
+
+char *New_Read_Buffer(HITS_DB *db)
+{ char *block = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer");
+
+  if (block != NULL)
+    return (block+1);
+  EXIT(NULL);
+}
+
+// Load into 'read' the i'th read in 'db'.  As an upper case ASCII string if ascii is 2, as a
+//   lower-case ASCII string if ascii is 1, and as a numeric string over 0(A), 1(C), 2(G), and
+//   3(T) otherwise.  Returns 0 on success and EXITs with 1 if i is out of range or the
+//   .bps file cannot be opened or read.  The .bps file is opened on first use and the
+//   stream is kept in db->bases for later calls.
+//
+//   **NB**, the byte before read will be set to a delimiter character!
+
+int Load_Read(HITS_DB *db, int i, char *read, int ascii)
+{ FILE      *bases  = (FILE *) db->bases;
+  int64      off;
+  int        len, clen;
+  HITS_READ *r = db->reads;
+
+  if (i < 0 || i >= db->nreads)
+    { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name);
+      EXIT(1);
+    }
+  if (bases == NULL)
+    { bases = Fopen(Catenate(db->path,"","",".bps"),"r");
+      if (bases == NULL)
+        EXIT(1);
+      db->bases = (void *) bases;
+    }
+
+  off = r[i].boff;
+  len = r[i].rlen;
+
+  if (ftello(bases) != off)         //  Avoid a seek when reads are fetched sequentially
+    fseeko(bases,off,SEEK_SET);
+  clen = COMPRESSED_LEN(len);
+  if (clen > 0)
+    { if (fread(read,clen,1,bases) != 1)
+        { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name);
+          EXIT(1);
+        }
+    }
+  Uncompress_Read(len,read);
+  if (ascii == 1)
+    { Lower_Read(read);
+      read[-1] = '\0';
+    }
+  else if (ascii == 2)
+    { Upper_Read(read);
+      read[-1] = '\0';
+    }
+  else
+    read[-1] = 4;
+  return (0);
+}
+
+// Load into 'read' the bases [beg,end) of the i'th read in 'db' and return a pointer to
+//   the first requested base.  Because bases are packed 4 per byte, whole bytes covering
+//   the interval are fetched and uncompressed, so the returned pointer is read + beg%4
+//   and not necessarily 'read' itself.  The ascii parameter is interpreted as for
+//   Load_Read, and the byte before the returned pointer is set to a delimiter.
+//   EXITs with NULL if i is out of range or the .bps file cannot be opened or read.
+//
+//   NOTE(review): beg and end are not validated against the read's length here.
+
+char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii)
+{ FILE      *bases  = (FILE *) db->bases;
+  int64      off;
+  int        len, clen;
+  int        bbeg, bend;
+  HITS_READ *r = db->reads;
+
+  if (i < 0 || i >= db->nreads)
+    { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Subread)\n",Prog_Name);
+      EXIT(NULL);
+    }
+  if (bases == NULL)
+    { bases = Fopen(Catenate(db->path,"","",".bps"),"r");
+      if (bases == NULL)
+        EXIT(NULL);
+      db->bases = (void *) bases;
+    }
+
+  bbeg = beg/4;              //  First compressed byte holding a requested base
+  bend = (end-1)/4+1;        //  One past the last such byte
+
+  off = r[i].boff + bbeg;
+  len = end - beg;
+
+  if (ftello(bases) != off)
+    fseeko(bases,off,SEEK_SET);
+  clen = bend-bbeg;
+  if (clen > 0)
+    { if (fread(read,clen,1,bases) != 1)
+        { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Subread)\n",Prog_Name);
+          EXIT(NULL);
+        }
+    }
+  Uncompress_Read(4*clen,read);
+  read += beg%4;             //  Skip the bases of the first byte that precede beg
+  read[len] = 4;             //  Terminate the numeric string right after base end-1
+  if (ascii == 1)
+    { Lower_Read(read);
+      read[-1] = '\0';
+    }
+  else if (ascii == 2)
+    { Upper_Read(read);
+      read[-1] = '\0';
+    }
+  else
+    read[-1] = 4;
+
+  return (read);
+}
+
+
+/*******************************************************************************************
+ *
+ * QV BUFFER ALLOCATION QV READ ACCESS
+ *
+ ********************************************************************************************/
+
+// Allocate and return a buffer of 5 vectors big enough for the largest read in 'db'.
+//   The 5 vectors are consecutive slices, each db->maxlen bytes long, of one block whose
+//   start is entry[0].  EXITs with NULL if either allocation fails, releasing whichever
+//   of the two did succeed.
+
+char **New_QV_Buffer(HITS_DB *db)
+{ char **entry;
+  char  *qvs;
+  int    i;
+
+  qvs   = (char *) Malloc(db->maxlen*5,"Allocating New QV Buffer");
+  entry = (char **) Malloc(sizeof(char *)*5,"Allocating New QV Buffer");
+  if (qvs == NULL || entry == NULL)
+    { free(qvs);             //  free(NULL) is a no-op, so this covers both failure cases
+      free(entry);
+      EXIT(NULL);
+    }
+  for (i = 0; i < 5; i++)
+    entry[i] = qvs + i*db->maxlen;
+  return (entry);
+}
+
+// Load into entry the QV streams for the i'th read from db.  The parameter ascii applies to
+//   the DELTAG stream (entry[1]): with ascii == 1 it is left as decoded, with ascii == 2
+//   'A'-'a' is added to each of its rlen characters, and otherwise it is passed through
+//   Number_Read.  Returns 0 on success and EXITs with 1 if the QVs are not loaded, i is
+//   out of range, or decompression fails.
+
+int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii)
+{ HITS_READ *reads;
+  FILE      *quiva;
+  int        rlen;
+
+  //  Only re-validate the QV pseudo-track when called for a different db than last time
+
+  if (db != Active_DB)
+    { if (db->tracks == NULL || strcmp(db->tracks->name,". at qvs") != 0)
+        { EPRINTF(EPLACE,"%s: QV's are not loaded (Load_QVentry)\n",Prog_Name);
+          EXIT(1);
+        }
+      Active_QV = (HITS_QV *) db->tracks;
+      Active_DB = db;
+    }
+  if (i < 0 || i >= db->nreads)
+    { EPRINTF(EPLACE,"%s: Index out of bounds (Load_QVentry)\n",Prog_Name);
+      EXIT(1);
+    }
+
+  reads = db->reads;
+  quiva = Active_QV->quiva;
+  rlen  = reads[i].rlen;
+
+  fseeko(quiva,reads[i].coff,SEEK_SET);
+  if (Uncompress_Next_QVentry(quiva,entry,Active_QV->coding+Active_QV->table[i],rlen))
+    EXIT(1);
+
+  if (ascii != 1)
+    { char *deltag = entry[1];
+
+      if (ascii != 2)
+        { char x = deltag[rlen];
+
+          //  Temporarily terminate the stream for Number_Read, then restore the byte
+          //    that was there (it may belong to the next vector of the buffer)
+
+          deltag[rlen] = '\0';
+          Number_Read(deltag);
+          deltag[rlen] = x;
+        }
+      else
+        { int j;
+          int u = 'A'-'a';
+
+          for (j = 0; j < rlen; j++)
+            deltag[j] = (char) (deltag[j]+u);
+        }
+    }
+
+  return (0);
+}
+
+
+/*******************************************************************************************
+ *
+ * BLOCK LOAD OF ALL READS (PRIMARILY FOR DALIGNER)
+ *
+ ********************************************************************************************/
+
+// Allocate a block big enough for all the uncompressed sequences, read them into it,
+//   reset the 'boff' in each read record to be its in-memory offset, and set the
+//   bases pointer to point at the block after closing the bases file.  If ascii is 1
+//   the reads are converted to lower-case ascii, for any other non-zero value to
+//   upper-case ascii, and if ascii is zero the reads are left as numeric strings over
+//   0(A), 1(C), 2(G), and 3(T).  Returns 0 on success, EXITs with 1 on failure (in which
+//   case db->bases and db->loaded are left untouched).
+//
+//   The block starts with one delimiter byte, so db->bases points at the second byte of
+//   the allocation (Close_DB frees db->bases - 1).
+//
+//   NOTE(review): calling this on a db that is already loaded is not guarded against;
+//   the boff fields would then be memory offsets rather than file offsets.
+
+int Read_All_Sequences(HITS_DB *db, int ascii)
+{ FILE      *bases;
+  int        nreads = db->nreads;
+  HITS_READ *reads = db->reads;
+  void     (*translate)(char *s);
+
+  char  *seq;
+  int64  o, off;
+  int    i, len, clen;
+
+  bases = Fopen(Catenate(db->path,"","",".bps"),"r");
+  if (bases == NULL)
+    EXIT(1);
+
+  seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Sequence Reads");
+  if (seq == NULL)
+    { fclose(bases);
+      EXIT(1);
+    }
+
+  *seq++ = 4;                //  Leading delimiter; seq is now one past the allocation start
+
+  if (ascii == 1)
+    translate = Lower_Read;
+  else
+    translate = Upper_Read;
+
+  o = 0;
+  for (i = 0; i < nreads; i++)
+    { len = reads[i].rlen;
+      off = reads[i].boff;
+      if (ftello(bases) != off)
+        fseeko(bases,off,SEEK_SET);
+      clen = COMPRESSED_LEN(len);
+      if (clen > 0)
+        { if (fread(seq+o,clen,1,bases) != 1)
+            { EPRINTF(EPLACE,"%s: Read of .bps file failed (Read_All_Sequences)\n",Prog_Name);
+              free(seq-1);   //  Must hand back the pointer Malloc returned, not seq itself
+              fclose(bases);
+              EXIT(1);
+            }
+        }
+      Uncompress_Read(len,seq+o);
+      if (ascii)
+        translate(seq+o);
+      reads[i].boff = o;
+      o += (len+1);          //  One extra byte per read for its terminator
+    }
+  reads[nreads].boff = o;
+
+  fclose(bases);
+
+  //  A previous Load_Read/Load_Subread may have left its own .bps stream in db->bases;
+  //    close it before the field is reused as a memory pointer
+
+  if ( ! db->loaded && db->bases != NULL)
+    fclose((FILE *) db->bases);
+
+  db->bases  = (void *) seq;
+  db->loaded = 1;
+
+  return (0);
+}
+
+// For the database or dam named by 'path', call 'actor' once with the path of the stub
+//   file (.db or .dam) and its extension, and then once for every file in the same
+//   directory whose name is the DB's root name followed by a '.' (with a leading '.'
+//   required and skipped when HIDE_FILES is defined), passing the file's path and the
+//   part of its name after "<root>.".  If the stub is only found under a name differing
+//   in case, the root is adjusted to the name on disk.
+//
+//   Returns 0 on success and -1 if the directory cannot be opened or the stub file is
+//   not found; EXITs with 1 if the path or root name cannot be allocated.
+
+int List_DB_Files(char *path, void actor(char *path, char *extension))
+{ int            status, plen, rlen, dlen;
+  char          *root, *pwd, *name;
+  int            isdam;
+  DIR           *dirp;
+  struct dirent *dp;
+
+  status = 0;
+  pwd    = PathTo(path);
+  plen   = strlen(path);
+  if (plen >= 4 && strcmp(path+(plen-4),".dam") == 0)   //  Guard paths shorter than ".dam"
+    root = Root(path,".dam");
+  else
+    root = Root(path,".db");
+
+  if (root == NULL || pwd == NULL)
+    { free(pwd);
+      free(root);
+      EXIT(1);
+    }
+
+  rlen = strlen(root);       //  Only safe once root is known to be non-NULL
+
+  if ((dirp = opendir(pwd)) == NULL)
+    { EPRINTF(EPLACE,"%s: Cannot open directory %s (List_DB_Files)\n",Prog_Name,pwd);
+      status = -1;
+      goto error;
+    }
+
+  isdam = 0;
+  while ((dp = readdir(dirp)) != NULL)     //   Get case dependent root name (if necessary)
+    { name = dp->d_name;
+      if (strcmp(name,Catenate("","",root,".db")) == 0)
+        break;
+      if (strcmp(name,Catenate("","",root,".dam")) == 0)
+        { isdam = 1;
+          break;
+        }
+      if (strcasecmp(name,Catenate("","",root,".db")) == 0)
+        { strncpy(root,name,rlen);         //  Same length, so root's terminator is kept
+          break;
+        }
+      if (strcasecmp(name,Catenate("","",root,".dam")) == 0)
+        { strncpy(root,name,rlen);
+          isdam = 1;
+          break;
+        }
+    }
+  if (dp == NULL)
+    { EPRINTF(EPLACE,"%s: Cannot find %s (List_DB_Files)\n",Prog_Name,pwd);
+      status = -1;
+      closedir(dirp);
+      goto error;
+    }
+
+  if (isdam)
+    actor(Catenate(pwd,"/",root,".dam"),"dam");
+  else
+    actor(Catenate(pwd,"/",root,".db"),"db");
+
+  rewinddir(dirp);                         //   Report each auxiliary file
+  while ((dp = readdir(dirp)) != NULL)
+    { name = dp->d_name;
+      dlen = strlen(name);
+#ifdef HIDE_FILES
+      if (name[0] != '.')
+        continue;
+      dlen -= 1;
+      name += 1;
+#endif
+      if (dlen < rlen+1)
+        continue;
+      if (name[rlen] != '.')
+        continue;
+      if (strncmp(name,root,rlen) != 0)
+        continue;
+      actor(Catenate(pwd,PATHSEP,name,""),name+(rlen+1));
+    }
+  closedir(dirp);
+
+error:
+  free(pwd);
+  free(root);
+  return (status);
+}
+
+// Print read 's' to stdout, starting a new line after every 'width' symbols and ending
+//   with a newline.  If the first byte of s is less than 4 the read is taken to be a
+//   numeric string terminated by a 4 and each symbol is printed as a decimal number,
+//   otherwise it is printed as a '\0'-terminated character string.
+
+void Print_Read(char *s, int width)
+{ int  pos;
+  char stop;
+
+  //  The terminator tells the two representations apart for the rest of the routine
+
+  if (s[0] < 4)
+    stop = 4;
+  else
+    stop = '\0';
+
+  for (pos = 0; s[pos] != stop; pos++)
+    { if (pos%width == 0 && pos != 0)
+        putchar('\n');
+      if (stop == 4)
+        printf("%d",s[pos]);
+      else
+        putchar(s[pos]);
+    }
+  putchar('\n');
+}
diff --git a/DB.h b/DB.h
new file mode 100644
index 0000000..567a91a
--- /dev/null
+++ b/DB.h
@@ -0,0 +1,442 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
+ * Compressed data base module. Auxiliary routines to open and manipulate a data base for
+ * which the sequence and read information are separated into two separate files, and the
+ * sequence is compressed into 2-bits for each base. Support for tracks of additional
+ * information, and trimming according to the current partition. Eventually will also
+ * support compressed quality information.
+ *
+ * Author : Gene Myers
+ * Date : July 2013
+ * Revised: April 2014
+ *
+ ********************************************************************************************/
+
+#ifndef _HITS_DB
+
+#define _HITS_DB
+
+#include <stdio.h>
+
+#include "QV.h"
+
+#define HIDE_FILES // Auxiliary DB files start with a . so they are "hidden"
+ // Undefine if you don't want this
+
+// For interactive applications where it is inappropriate to simply exit with an error
+// message to standard error, define the constant INTERACTIVE. If set, then error
+// messages are put in the global variable Ebuffer and the caller of a DB routine
+// can decide how to deal with the error.
+//
+// DB, QV, or alignment routines that can encounter errors function as before in
+// non-INTERACTIVE mode by exiting after printing an error message to stderr. In
+// INTERACTIVE mode the routines place a message at EPLACE and return an error
+// value. For such routines that were previously void, they are now int, and
+// return 1 if an error occurred, 0 otherwise.
+
+// INTERACTIVE is explicitly undefined here, so the BATCH definitions below are in effect
+// unless this line is changed.
+
+#undef INTERACTIVE
+
+#ifdef INTERACTIVE
+
+// Interactive mode: the message is formatted into Ebuffer and EXIT(x) returns x to the caller.
+
+#define EPRINTF sprintf
+#define EPLACE Ebuffer
+#define EXIT(x) return (x)
+
+#else // BATCH
+
+// Batch mode: the message is printed to stderr and EXIT terminates the program. Note that
+// the argument x is ignored; the exit status is always 1.
+
+#define EPRINTF fprintf
+#define EPLACE stderr
+#define EXIT(x) exit (1)
+
+#endif
+
+// Short aliases for the basic integer and floating point types. The names suggest bit
+// widths, but the actual sizes are whatever the platform gives the underlying C types.
+
+typedef unsigned char uint8;
+typedef unsigned short uint16;
+typedef unsigned int uint32;
+typedef unsigned long long uint64;
+typedef signed char int8;
+typedef signed short int16;
+typedef signed int int32;
+typedef signed long long int64;
+typedef float float32;
+typedef double float64;
+
+
+/*******************************************************************************************
+ *
+ * COMMAND LINE INTERPRETATION MACROS
+ *
+ ********************************************************************************************/
+
+extern char *Prog_Name; // Name of program
+
+#ifdef INTERACTIVE
+
+// Buffer that EPRINTF writes error messages into in INTERACTIVE mode (it is EPLACE)
+
+extern char Ebuffer[];
+
+#endif
+
+// Report a failed read and terminate with exit status 2 (in both batch and interactive mode)
+
+#define SYSTEM_ERROR \
+ { EPRINTF(EPLACE,"%s: System error, read failed!\n",Prog_Name); \
+ exit (2); \
+ }
+
+// The ARG_ macros below are not self-contained: they expand to code that uses variables
+// that must already be declared at the point of use, namely the loop indices i and k, the
+// argument vector argv, an int array flags with at least 128 entries, and a char *eptr.
+
+// Set Prog_Name to a copy of name and clear flags[0..127]
+
+#define ARG_INIT(name) \
+ Prog_Name = Strdup(name,""); \
+ for (i = 0; i < 128; i++) \
+ flags[i] = 0;
+
+// For each character after the first of argv[i]: exit with an error if it is not in the
+// string set, otherwise set its entry in flags to 1
+
+#define ARG_FLAGS(set) \
+ for (k = 1; argv[i][k] != '\0'; k++) \
+ { if (index(set,argv[i][k]) == NULL) \
+ { fprintf(stderr,"%s: -%c is an illegal option\n",Prog_Name,argv[i][k]); \
+ exit (1); \
+ } \
+ flags[(int) argv[i][k]] = 1; \
+ }
+
+// Parse the text following the first two characters of argv[i] as a decimal integer into
+// var; exit with an error if it is empty, has trailing characters, or is not > 0. The
+// error message prints var with %d, so var is expected to be an int.
+
+#define ARG_POSITIVE(var,name) \
+ var = strtol(argv[i]+2,&eptr,10); \
+ if (*eptr != '\0' || argv[i][2] == '\0') \
+ { fprintf(stderr,"%s: -%c argument is not an integer\n",Prog_Name,argv[i][1]); \
+ exit (1); \
+ } \
+ if (var <= 0) \
+ { fprintf(stderr,"%s: %s must be positive (%d)\n",Prog_Name,name,var); \
+ exit (1); \
+ }
+
+// As ARG_POSITIVE, but zero is accepted
+
+#define ARG_NON_NEGATIVE(var,name) \
+ var = strtol(argv[i]+2,&eptr,10); \
+ if (*eptr != '\0' || argv[i][2] == '\0') \
+ { fprintf(stderr,"%s: -%c argument is not an integer\n",Prog_Name,argv[i][1]); \
+ exit (1); \
+ } \
+ if (var < 0) \
+ { fprintf(stderr,"%s: %s must be non-negative (%d)\n",Prog_Name,name,var); \
+ exit (1); \
+ }
+
+// Parse the text following the first two characters of argv[i] as a real number into var;
+// exit with an error if it is empty or has trailing characters
+
+#define ARG_REAL(var) \
+ var = strtod(argv[i]+2,&eptr); \
+ if (*eptr != '\0' || argv[i][2] == '\0') \
+ { fprintf(stderr,"%s: -%c argument is not a real number\n",Prog_Name,argv[i][1]); \
+ exit (1); \
+ }
+
+/*******************************************************************************************
+ *
+ * UTILITIES
+ *
+ ********************************************************************************************/
+
+// The following general utilities return NULL if any of their input pointers are NULL, or if they
+// could not perform their function (in which case they also print an error to stderr).
+
+void *Malloc(int64 size, char *mesg); // Guarded versions of malloc, realloc
+void *Realloc(void *object, int64 size, char *mesg); // and strdup, that output "mesg" to
+char *Strdup(char *string, char *mesg); // stderr if out of memory
+
+FILE *Fopen(char *path, char *mode); // Open file path for "mode"
+char *PathTo(char *path); // Return path portion of file name "path"
+char *Root(char *path, char *suffix); // Return the root name, excluding suffix, of "path"
+
+// Catenate returns concatenation of path.sep.root.suffix in a *temporary* buffer
+// Numbered_Suffix returns concatenation of left.<num>.right in a *temporary* buffer
+// NOTE(review): "temporary" presumably means the buffer is reused by later calls, so the
+// result must be consumed or copied before the next call -- confirm in DB.c.
+
+char *Catenate(char *path, char *sep, char *root, char *suffix);
+char *Numbered_Suffix(char *left, int num, char *right);
+
+
+// DB-related utilities
+
+void Print_Number(int64 num, int width, FILE *out); // Print readable big integer
+int Number_Digits(int64 num); // Return # of digits in printed number
+
+// Number of bytes needed to hold len bases at 4 bases per byte (rounded up)
+
+#define COMPRESSED_LEN(len) (((len)+3) >> 2)
+
+void Compress_Read(int len, char *s); // Compress read in-place into 2-bit form
+void Uncompress_Read(int len, char *s); // Uncompress read in-place into numeric form
+void Print_Read(char *s, int width); // Print read to stdout, width symbols per line
+
+void Lower_Read(char *s); // Convert read from numbers to lowercase letters (0-3 to acgt)
+void Upper_Read(char *s); // Convert read from numbers to uppercase letters (0-3 to ACGT)
+void Number_Read(char *s); // Convert read from letters to numbers
+
+
+/*******************************************************************************************
+ *
+ * DB IN-CORE DATA STRUCTURES
+ *
+ ********************************************************************************************/
+
+// Bit layout of the 'flags' field of a HITS_READ
+
+#define DB_QV 0x03ff // Mask for 3-digit quality value
+#define DB_CSS 0x0400 // This is the second or later of a group of reads from a given insert
+#define DB_BEST 0x0800 // This is the longest read of a given insert (may be the only 1)
+
+// Per-read record; db->reads is an array of these, one for each read
+
+typedef struct
+ { int origin; // Well #
+ int rlen; // Length of the sequence (Last pulse = fpulse + rlen)
+ int fpulse; // First pulse
+ int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of
+ // uncompressed bases in memory block
+ int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file
+ int flags; // QV of read + flags above
+ } HITS_READ;
+
+// A track can be of 3 types:
+// data == NULL: there are nreads 'anno' records of size 'size'.
+// data != NULL && size == 4: anno is an array of nreads+1 int's and data[anno[i]..anno[i+1])
+// contains the variable length data
+// data != NULL && size == 8: anno is an array of nreads+1 int64's and data[anno[i]..anno[i+1])
+// contains the variable length data
+//
+// Tracks form a singly linked list hanging off db->tracks. The name, anno, and data of a
+// track created by Load_Track are heap allocated and are released by Close_Track.
+
+typedef struct _track
+ { struct _track *next; // Link to next track
+ char *name; // Symbolic name of track
+ int size; // Size in bytes of anno records
+ void *anno; // over [0,nreads]: read i annotation: int, int64, or 'size' records
+ void *data; // data[anno[i] .. anno[i+1]-1] is data if data != NULL
+ } HITS_TRACK;
+
+// The information for accessing QV streams is in a HITS_QV record that is a "pseudo-track"
+// named ". at qvs" and is always the first track record in the list (if present). Since normal
+// track names cannot begin with a . (this is enforced), this pseudo-track is never confused
+// with a normal track.
+//
+// The first two fields (next, name) mirror those of HITS_TRACK: the code casts between
+// HITS_QV * and HITS_TRACK * and reads them through either type, so they must stay first
+// and in this order.
+
+typedef struct
+ { struct _track *next;
+ char *name;
+ int ncodes; // # of coding tables
+ QVcoding *coding; // array [0..ncodes-1] of coding schemes (see QV.h)
+ uint16 *table; // for i in [0,db->nreads-1]: read i should be decompressed with
+ // scheme coding[table[i]]
+ FILE *quiva; // the open file pointer to the .qvs file
+ } HITS_QV;
+
+// The DB record holds all information about the current state of an active DB including an
+// array of HITS_READS, one per read, and a linked list of HITS_TRACKs the first of which
+// is always a HITS_QV pseudo-track (if the QVs have been loaded).
+
+typedef struct
+ { int ureads; // Total number of reads in untrimmed DB
+ int treads; // Total number of reads in trimmed DB
+ int cutoff; // Minimum read length in block (-1 if not yet set)
+ int all; // Consider multiple reads from a given well
+ float freq[4]; // frequency of A, C, G, T, respectively
+
+ // Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed)
+
+ int maxlen; // length of maximum read (initially over all DB)
+ int64 totlen; // total # of bases (initially over all DB)
+
+ int nreads; // # of reads in actively loaded portion of DB
+ int trimmed; // DB has been trimmed by cutoff/all
+ int part; // DB block (if > 0), total DB (if == 0)
+ int ufirst; // Index of first read in block (without trimming)
+ int tfirst; // Index of first read in block (with trimming)
+
+ // In order to avoid forcing users to have to rebuild all their DBs to accommodate
+ // the addition of fields for the size of the actively loaded trimmed and untrimmed
+ // blocks, an additional read record is allocated in "reads" when a DB is loaded into
+ // memory (reads[-1]) and the two desired fields are crammed into the first two
+ // integer spaces of the record.
+ // NOTE(review): the code reads these as ((int *) reads)[-1] (untrimmed) and
+ // ((int *) reads)[-2] (trimmed), i.e. the two ints immediately before reads[0] --
+ // confirm against where they are written.
+
+ char *path; // Root name of DB for .bps, .qvs, and tracks
+ int loaded; // Are reads loaded in memory?
+ void *bases; // file pointer for bases file (to fetch reads from),
+ // or memory pointer to uncompressed block of all sequences.
+ HITS_READ *reads; // Array [-1..nreads] of HITS_READ
+ HITS_TRACK *tracks; // Linked list of loaded tracks
+ } HITS_DB;
+
+
+/*******************************************************************************************
+ *
+ * DB STUB FILE FORMAT = NFILE FDATA^nfile NBLOCK PARAMS BDATA^nblock
+ *
+ ********************************************************************************************/
+
+#define MAX_NAME 10000 // Longest file name or fasta header line
+
+#define DB_NFILE "files = %9d\n" // number of files
+#define DB_FDATA " %9d %s %s\n" // last read index + 1, file name, fasta prolog
+#define DB_NBLOCK "blocks = %9d\n" // number of blocks
+#define DB_PARAMS "size = %9lld cutoff = %9d all = %1d\n" // block size, len cutoff, all in well
+#define DB_BDATA " %9d %9d\n" // First read index (untrimmed), first read index (trimmed)
+
+
+/*******************************************************************************************
+ *
+ * DB ROUTINES
+ *
+ ********************************************************************************************/
+
+ // Suppose DB is the name of an original database. Then there will be files .DB.idx, .DB.bps,
+ // .DB.qvs, and files .DB.<track>.anno and .DB.<track>.data where <track> is a track name
+ // (not containing a . !).
+
+ // A DAM is basically a DB except that:
+ // 1. there are no QV's, instead .coff points to the '\0' terminated fasta header of the read
+ // in the .<dam>.hdr file
+ // 2. .origin contains the contig # of the read within a fasta entry (assembly sequences
+ // contain N-separated contigs), and .fpulse the first base of the contig in the
+ // fasta entry
+
+ // Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has
+ // a part # in it then just the part is opened. The index array is allocated (for all or
+ // just the part) and read in.
+ // Return status of routine:
+ // -1: The DB could not be opened for a reason reported by the routine to EPLACE
+ // 0: Open of DB proceeded without mishap
+ // 1: Open of DAM proceeded without mishap
+
+int Open_DB(char *path, HITS_DB *db);
+
+ // Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings
+ // of the current DB partition. Reallocate smaller memory blocks for the information kept
+ // for the retained reads.
+
+void Trim_DB(HITS_DB *db);
+
+ // Shut down an open 'db' by freeing all associated space, including tracks and QV structures,
+ // and any open file pointers. The record pointed at by db however remains (the user
+ // supplied it and so should free it).
+
+void Close_DB(HITS_DB *db);
+
+ // If QV pseudo track is not already in db's track list, then load it and set it up.
+ // The database must not have been trimmed yet. -1 is returned if a .qvs file is not
+ // present, and 1 is returned if an error (reported to EPLACE) occurred and INTERACTIVE
+ // is defined. Otherwise a 0 is returned.
+
+int Load_QVs(HITS_DB *db);
+
+ // Remove the QV pseudo track, all space associated with it, and close the .qvs file.
+
+void Close_QVs(HITS_DB *db);
+
+ // Look up the file and header in the file of the indicated track. Return:
+ // 1: Track is for trimmed DB
+ // 0: Track is for untrimmed DB
+ // -1: Track is not the right size of DB either trimmed or untrimmed
+ // -2: Could not find the track
+
+int Check_Track(HITS_DB *db, char *track);
+
+ // If track is not already in the db's track list, then allocate all the storage for it,
+ // read it in from the appropriate file, add it to the track list, and return a pointer
+ // to the newly created HITS_TRACK record. If the track does not exist or cannot be
+ // opened for some reason, then NULL is returned if INTERACTIVE is defined. Otherwise
+ // the routine prints an error message to stderr and exits if an error occurs, and returns
+ // with NULL only if the track does not exist.
+
+HITS_TRACK *Load_Track(HITS_DB *db, char *track);
+
+ // If track is on the db's track list, then it is removed and all storage associated with it
+ // is freed.
+
+void Close_Track(HITS_DB *db, char *track);
+
+ // Allocate and return a buffer big enough for the largest read in 'db'.
+ // **NB** free(x-1) if x is the value returned as *prefix* and suffix '\0'(4)-byte
+ // are needed by the alignment algorithms. If cannot allocate memory then return NULL
+ // if INTERACTIVE is defined, or print error to stderr and exit otherwise.
+
+char *New_Read_Buffer(HITS_DB *db);
+
+ // Load into 'read' the i'th read in 'db'. As a lower case ascii string if ascii is 1, an
+ // upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T)
+ // otherwise. A '\0' (or 4) is prepended and appended to the string so it has a delimiter
+ // for traversals in either direction. A non-zero value is returned if an error occurred
+ // and INTERACTIVE is defined.
+
+int Load_Read(HITS_DB *db, int i, char *read, int ascii);
+
+ // Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to
+ // the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii
+ // string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string
+ // over 0(A), 1(C), 2(G), and 3(T) otherwise. A '\0' (or 4) is prepended and appended to
+ // the string holding the substring so it has a delimiter for traversals in either direction.
+ // A NULL pointer is returned if an error occurred and INTERACTIVE is defined.
+
+char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii);
+
+ // Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur
+ // in the database. If cannot allocate memory then return NULL if INTERACTIVE is defined,
+ // or print error to stderr and exit otherwise.
+
+#define DEL_QV 0 // The deletion QVs are x[DEL_QV] if x is the buffer returned by New_QV_Buffer
+#define DEL_TAG 1 // The deleted characters
+#define INS_QV 2 // The insertion QVs
+#define SUB_QV 3 // The substitution QVs
+#define MRG_QV 4 // The merge QVs
+
+char **New_QV_Buffer(HITS_DB *db);
+
+ // Load into 'entry' the 5 QV vectors for i'th read in 'db'. The deletion tag or characters
+ // are converted to a numeric or upper/lower case ascii string as per ascii. Return with
+ // a zero, except when an error occurs and INTERACTIVE is defined in which case return with 1.
+
+int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii);
+
+ // Allocate a block big enough for all the uncompressed sequences, read them into it,
+ // reset the 'off' in each read record to be its in-memory offset, and set the
+ // bases pointer to point at the block after closing the bases file. If ascii is
+ // 1 then the reads are converted to lowercase ascii, if 2 then uppercase ascii, and
+ // otherwise the reads are left as numeric strings over 0(A), 1(C), 2(G), and 3(T).
+ // Return with a zero, except when an error occurs and INTERACTIVE is defined in which
+ // case return with 1.
+
+int Read_All_Sequences(HITS_DB *db, int ascii);
+
+ // For the DB or DAM "path" = "prefix/root[.db|.dam]", find all the files for that DB, i.e. all
+ // those of the form "prefix/[.]root.part" and call actor with the complete path to each file
+ // pointed at by path, and the suffix of the path by extension. The . precedes the root
+ // name if the defined constant HIDE_FILES is set. Always the first call is with the
+ // path "prefix/root.db" and extension "db". There will always be calls for
+ // "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and
+ // so this routine gives one a way to know all the tracks associated with a given DB.
+ // -1 is returned if the path could not be found, and 1 is returned if an error (reported
+ // to EPLACE) occurred and INTERACTIVE is defined. Otherwise a 0 is returned.
+
+int List_DB_Files(char *path, void actor(char *path, char *extension));
+
+#endif // _HITS_DB
diff --git a/DB2fasta.c b/DB2fasta.c
new file mode 100644
index 0000000..5080f88
--- /dev/null
+++ b/DB2fasta.c
@@ -0,0 +1,188 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/********************************************************************************************
+ *
+ * Recreate all the .fasta files that have been loaded into a specified database.
+ *
+ * Author: Gene Myers
+ * Date : May 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "DB.h"
+
+static char *Usage = "[-vU] [-w<int(80)>] <path:db>";
+
+int main(int argc, char *argv[]) // Recreate ./<fname>.fasta for every file recorded in the stub of the DB argv[1]
+{ HITS_DB _db, *db = &_db;
+ FILE *dbfile; // the DB stub file, scanned for the per-file records
+ int nfiles; // # of fasta files recorded in the stub
+ int VERBOSE, UPPER, WIDTH; // -v flag, ascii mode handed to Load_Read, bases per output line
+
+ // Process arguments
+
+ { int i, j, k; // k and eptr are not referenced here; presumably the ARG_ macros use them
+ int flags[128];
+ char *eptr;
+
+ ARG_INIT("DB2fasta")
+
+ WIDTH = 80;
+
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ switch (argv[i][1])
+ { default:
+ ARG_FLAGS("vU")
+ break;
+ case 'w':
+ ARG_POSITIVE(WIDTH,"Line width") // must be > 0: with a width of 0 the line loop below never advances
+ break;
+ }
+ else
+ argv[j++] = argv[i]; // compact the non-option arguments to the front of argv
+ argc = j;
+
+ UPPER = 1 + flags['U']; // Load_Read ascii mode: 1 = lower case, 2 = upper case
+ VERBOSE = flags['v'];
+
+ if (argc != 2)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+ }
+
+ // Open db
+
+ { int status;
+
+ status = Open_DB(argv[1],db);
+ if (status < 0) // Open_DB has already reported the reason
+ exit (1);
+ if (status == 1) // 1 = a .dam was opened
+ { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ if (db->part > 0) // part > 0 = a block rather than the total DB
+ { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ }
+
+ { char *pwd, *root;
+
+ pwd = PathTo(argv[1]);
+ root = Root(argv[1],".db");
+ dbfile = Fopen(Catenate(pwd,"/",root,".db"),"r");
+ free(pwd);
+ free(root);
+ if (dbfile == NULL)
+ exit (1);
+ }
+
+ // nfiles = # of files in data base
+
+ if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
+ SYSTEM_ERROR
+
+ // For each file do:
+
+ { HITS_READ *reads;
+ char *read;
+ int f, first;
+
+ reads = db->reads;
+ read = New_Read_Buffer(db);
+ first = 0; // index of the first read belonging to the current file
+ for (f = 0; f < nfiles; f++)
+ { int i, last;
+ FILE *ofile;
+ char prolog[MAX_NAME], fname[MAX_NAME];
+
+ // Scan db image file line, create .fasta file for writing
+
+ if (fscanf(dbfile,DB_FDATA,&last,fname,prolog) != 3)
+ SYSTEM_ERROR
+
+ if ((ofile = Fopen(Catenate(".","/",fname,".fasta"),"w")) == NULL)
+ exit (1);
+
+ if (VERBOSE)
+ { fprintf(stderr,"Creating %s.fasta ...\n",fname);
+ fflush(stderr); // flush the stream the message was actually written to
+ }
+
+ // For the relevant range of reads, write each to the file
+ // recreating the original headers with the index meta-data about each read
+
+ for (i = first; i < last; i++)
+ { int j, len;
+ int flags, qv;
+ HITS_READ *r;
+
+ r = reads + i;
+ len = r->rlen;
+ flags = r->flags;
+ qv = (flags & DB_QV);
+ fprintf(ofile,">%s/%d/%d_%d",prolog,r->origin,r->fpulse,r->fpulse+len);
+ if (qv > 0)
+ fprintf(ofile," RQ=0.%3d",qv);
+ fprintf(ofile,"\n");
+
+ Load_Read(db,i,read,UPPER);
+
+ for (j = 0; j+WIDTH < len; j += WIDTH) // all full-width lines except the last line of the read
+ fprintf(ofile,"%.*s\n",WIDTH,read+j);
+ if (j < len) // remaining 1..WIDTH bases; read is '\0' terminated
+ fprintf(ofile,"%s\n",read+j);
+ }
+ fclose(ofile); // release the stream before the next file so handles do not pile up
+ first = last;
+ }
+ }
+
+ fclose(dbfile);
+ Close_DB(db);
+
+ exit (0);
+}
diff --git a/DB2quiva.c b/DB2quiva.c
new file mode 100644
index 0000000..63c0b91
--- /dev/null
+++ b/DB2quiva.c
@@ -0,0 +1,192 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/********************************************************************************************
+ *
+ * Recreate all the .quiva files that have been loaded into a specified database.
+ *
+ * Author: Gene Myers
+ * Date : May 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "DB.h"
+#include "QV.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#else
+#define PATHSEP "/"
+#endif
+
+static char *Usage = "[-vU] <path:db>";
+
+int main(int argc, char *argv[]) // Recreate ./<fname>.quiva for every file recorded in the stub of the DB argv[1]
+{ HITS_DB _db, *db = &_db;
+ FILE *dbfile, *quiva; // the DB stub file and the compressed .qvs file
+ int VERBOSE, UPPER;
+
+ // Process arguments
+
+ { int i, j, k; // k is not referenced here; presumably the ARG_ macros use it
+ int flags[128];
+
+ ARG_INIT("DB2quiva")
+
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ { ARG_FLAGS("vU") }
+ else
+ argv[j++] = argv[i]; // compact the non-option arguments to the front of argv
+ argc = j;
+
+ VERBOSE = flags['v'];
+ UPPER = flags['U'];
+
+ if (argc != 2)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+ }
+
+ // Open db, db stub file, and .qvs file
+
+ { char *pwd, *root;
+ int status;
+
+ status = Open_DB(argv[1],db);
+ if (status < 0) // Open_DB has already reported the reason
+ exit (1);
+ if (status == 1) // 1 = a .dam was opened
+ { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ if (db->part > 0) // part > 0 = a block rather than the total DB
+ { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+
+ pwd = PathTo(argv[1]);
+ root = Root(argv[1],".db");
+ dbfile = Fopen(Catenate(pwd,"/",root,".db"),"r");
+ quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"r");
+ free(pwd);
+ free(root);
+ if (dbfile == NULL || quiva == NULL)
+ exit (1);
+ }
+
+ // For each file do:
+
+ { HITS_READ *reads;
+ int f, first, nfiles;
+ QVcoding *coding;
+ char **entry;
+
+ if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
+ SYSTEM_ERROR
+
+ entry = New_QV_Buffer(db);
+ reads = db->reads;
+ first = 0; // index of the first read belonging to the current file
+ for (f = 0; f < nfiles; f++)
+ { int i, last;
+ FILE *ofile;
+ char prolog[MAX_NAME], fname[MAX_NAME];
+
+ // Scan db image file line, create .quiva file for writing
+
+ if (reads[first].coff < 0) break; // presumably coff < 0 marks reads without QV data - confirm
+
+ if (fscanf(dbfile,DB_FDATA,&last,fname,prolog) != 3)
+ SYSTEM_ERROR
+
+ if ((ofile = Fopen(Catenate(".","/",fname,".quiva"),"w")) == NULL)
+ exit (1);
+
+ if (VERBOSE)
+ { fprintf(stderr,"Creating %s.quiva ...\n",fname);
+ fflush(stderr);
+ }
+
+ coding = Read_QVcoding(quiva); // NOTE(review): never released here; confirm whether QV.h has a matching free
+
+ // For the relevant range of reads, write the header for each to the file
+ // and then uncompress and write the quiva entry for each
+
+ for (i = first; i < last; i++)
+ { int e, flags, qv, rlen;
+ HITS_READ *r;
+
+ r = reads + i;
+ flags = r->flags;
+ rlen = r->rlen;
+ qv = (flags & DB_QV);
+ fprintf(ofile,"@%s/%d/%d_%d",prolog,r->origin,r->fpulse,r->fpulse+rlen);
+ if (qv > 0)
+ fprintf(ofile," RQ=0.%3d",qv);
+ fprintf(ofile,"\n");
+
+ Uncompress_Next_QVentry(quiva,entry,coding,rlen);
+
+ if (UPPER)
+ { char *deltag = entry[DEL_TAG]; // the deleted-character vector is the only one holding letters
+ int j;
+
+ for (j = 0; j < rlen; j++)
+ deltag[j] -= 32; // 32 = 'a'-'A'; presumably the tags are lower case ascii here - confirm
+ }
+
+ for (e = 0; e < 5; e++) // one line per QV vector, in DEL_QV..MRG_QV order
+ fprintf(ofile,"%.*s\n",rlen,entry[e]);
+ }
+ fclose(ofile); // release the stream before the next file so handles do not pile up
+ first = last;
+ }
+ }
+
+ fclose(quiva);
+ fclose(dbfile);
+ Close_DB(db);
+
+ exit (0);
+}
diff --git a/DBdust.c b/DBdust.c
new file mode 100644
index 0000000..a63ddd4
--- /dev/null
+++ b/DBdust.c
@@ -0,0 +1,508 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
+ * My implementation of the SDUST algorithm (Morgulis et al., JCB 13, 5 (2006), 1028-1040)
+ *
+ * Author: Gene Myers
+ * Date : September 2013
+ * Mod : Is now incremental
+ * Date : April 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <strings.h>
+#include <math.h>
+
+#include "DB.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#else
+#define PATHSEP "/"
+#endif
+
+#undef DEBUG
+
+#ifdef DEBUG
+
+static int Caps[4] = { 'A', 'C', 'G', 'T' };
+static int Lowr[4] = { 'a', 'c', 'g', 't' };
+
+#endif
+
+static char *Usage = "[-b] [-w<int(64)>] [-t<double(2.)>] [-m<int(10)>] <path:db|dam>";
+
+typedef struct _cand
+ { struct _cand *next; // successor on the circular active list, or next free record on the free list
+ struct _cand *prev; // predecessor on the circular active list (not maintained on the free list)
+ int beg; // first triple index of the candidate interval (-2 in the list header)
+ int end; // last triple index of the candidate interval
+ double score; // score recorded for the interval when it was created or extended
+ } Candidate;
+
+int main(int argc, char *argv[]) // Compute the "dust" track of the DB/DAM argv[1], appending to an existing track if present
+{ HITS_DB _db, *db = &_db;
+ FILE *afile, *dfile; // the .dust.anno and .dust.data track files
+ int64 indx; // running byte length of the .dust.data file
+ int nreads; // # of reads already covered by an existing track (0 when starting afresh)
+ int *mask; // interval end-points found for the current read; mask[0] is a sentinel
+ Candidate *cptr; // pool of WINDOW+1 records; cptr[0] is the header of the active list
+
+ int WINDOW;
+ double THRESH;
+ int MINLEN;
+ int BIASED;
+
+ { int i, j, k;
+ int flags[128];
+ char *eptr;
+
+ ARG_INIT("DBdust")
+
+ WINDOW = 64;
+ THRESH = 2.;
+ MINLEN = 9; // the documented default of 10, kept as length-1 (see case 'm')
+
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ switch (argv[i][1])
+ { default:
+ ARG_FLAGS("b")
+ break;
+ case 'w':
+ ARG_POSITIVE(WINDOW,"Window size")
+ break;
+ case 't':
+ ARG_REAL(THRESH)
+ if (THRESH <= 0.)
+ { fprintf(stderr,"%s: Threshold must be positive (%g)\n",Prog_Name,THRESH);
+ exit (1);
+ }
+ break;
+ case 'm':
+ ARG_NON_NEGATIVE(MINLEN,"Minimum hit")
+ MINLEN -= 1; // stored as length-1 for the jtop[1]-jtop[0] >= MINLEN filter below
+ break;
+ }
+ else
+ argv[j++] = argv[i];
+ argc = j;
+
+ BIASED = flags['b'];
+
+ if (argc != 2)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+ }
+
+ // Open .db or .dam
+
+ { int status;
+
+ status = Open_DB(argv[1],db);
+ if (status < 0)
+ exit (1);
+ }
+
+ mask = (int *) Malloc((db->maxlen+1)*sizeof(int),"Allocating mask vector");
+ cptr = (Candidate *) Malloc((WINDOW+1)*sizeof(Candidate),"Allocating candidate vector");
+ if (mask == NULL || cptr == NULL)
+ exit (1);
+
+ { char *pwd, *root, *fname;
+ int size;
+
+ pwd = PathTo(argv[1]);
+ root = Root(argv[1],".db");
+ size = 8; // second header word of the .anno file; presumably bytes per offset record - confirm
+
+ fname = Catenate(pwd,PATHSEP,root,".dust.anno");
+ if ((afile = fopen(fname,"r+")) == NULL || db->part > 0) // no track yet, or dusting a block: start afresh
+ { if (afile != NULL)
+ fclose(afile);
+ afile = Fopen(fname,"w");
+ dfile = Fopen(Catenate(pwd,PATHSEP,root,".dust.data"),"w");
+ if (dfile == NULL || afile == NULL)
+ exit (1);
+ fwrite(&(db->nreads),sizeof(int),1,afile);
+ fwrite(&size,sizeof(int),1,afile);
+ nreads = 0;
+ indx = 0;
+ fwrite(&indx,sizeof(int64),1,afile); // data offset at which the first read's intervals start
+ }
+ else
+ { dfile = Fopen(Catenate(pwd,PATHSEP,root,".dust.data"),"r+");
+ if (dfile == NULL)
+ exit (1);
+ if (fread(&nreads,sizeof(int),1,afile) != 1)
+ SYSTEM_ERROR
+ if (nreads >= db->nreads) // the existing track already covers every read: nothing to do
+ { fclose(afile);
+ fclose(dfile);
+ exit(0);
+ }
+ fseeko(afile,0,SEEK_SET); // rewrite the header with the new read count ...
+ fwrite(&(db->nreads),sizeof(int),1,afile);
+ fwrite(&size,sizeof(int),1,afile);
+ fseeko(afile,0,SEEK_END); // ... then position both files for appending
+ fseeko(dfile,0,SEEK_END);
+ indx = ftello(dfile);
+ }
+
+ free(pwd);
+ free(root);
+ }
+
+ { int *mask1;
+ char *read, *lag2;
+ int wcount[64], lcount[64];
+ Candidate *aptr;
+ double skew[64], thresh2r;
+ int thresh2i;
+ int i;
+
+ read = New_Read_Buffer(db);
+ lag2 = read-2; // lag2[j] is read[j-2]: the triple ending at base j is stored at index j-2
+
+ mask1 = mask+1;
+ *mask = -2; // sentinel: makes the *mtop+1 >= beg merge test fail for the first interval
+
+ aptr = cptr+1; // head of the free list cptr[1] -> cptr[2] -> ... -> cptr[WINDOW] -> NULL
+ for (i = 1; i < WINDOW; i++)
+ cptr[i].next = aptr+i;
+ cptr[WINDOW].next = NULL;
+
+ cptr->next = cptr->prev = cptr; // empty circular active list
+ cptr->beg = -2; // sentinel: stops the jptr->beg >= c scans at the header
+
+ thresh2r = 2.*THRESH;
+ thresh2i = (int) ceil(thresh2r);
+
+ if (BIASED)
+ { int a, b, c, p;
+
+ p = 0;
+ for (a = 0; a < 4; a++)
+ for (b = 0; b < 4; b++)
+ for (c = 0; c < 4; c++)
+ skew[p++] = .015625 / (db->freq[a]*db->freq[b]*db->freq[c]); // .015625 = 1/64 = (1/4)^3
+ }
+
+ for (i = nreads; i < db->nreads; i++) // only the reads not yet covered by the track
+ { Candidate *lptr, *jptr;
+ int *mtop;
+ double mscore;
+ int len;
+ int wb, lb;
+ int j, c, d;
+
+ len = db->reads[i].rlen; // Fetch read
+ Load_Read(db,i,read,0);
+
+ c = (read[0] << 2) | read[1]; // Convert to triple codes
+ for (j = 2; j < len; j++)
+ { c = ((c << 2) & 0x3f) | read[j];
+ lag2[j] = (char) c;
+ }
+ len -= 2; // len is now the # of triples
+
+ for (j = 0; j < 64; j++) // Setup counter arrays
+ wcount[j] = lcount[j] = 0;
+
+ mtop = mask; // The dust algorithm
+ lb = wb = -1;
+
+ if (BIASED)
+
+ { double lsqr, wsqr, trun; // Modification for high-compositional bias
+
+ wsqr = lsqr = 0.;
+ for (j = 0; j < len; j++)
+ { c = read[j];
+
+#define ADDR(e,cnt,sqr) sqr += (cnt[e]++) * skew[e];
+
+#define DELR(e,cnt,sqr) sqr -= (--cnt[e]) * skew[e];
+
+#define WADDR(e) ADDR(e,wcount,wsqr)
+#define WDELR(e) DELR(e,wcount,wsqr)
+#define LADDR(e) ADDR(e,lcount,lsqr)
+#define LDELR(e) DELR(e,lcount,lsqr)
+
+ if (j > WINDOW-3)
+ { d = read[++wb];
+ WDELR(d)
+ }
+ WADDR(c)
+
+ if (lb < wb)
+ { d = read[++lb];
+ LDELR(d)
+ }
+ trun = (lcount[c]++) * skew[c];
+ lsqr += trun;
+ if (trun >= thresh2r)
+ { while (lb < j)
+ { d = read[++lb];
+ LDELR(d)
+ if (d == c) break;
+ }
+ }
+
+ jptr = cptr->prev;
+ if (jptr != cptr && jptr->beg <= wb) // last active candidate no longer begins after wb: emit it
+ { c = jptr->end + 2;
+ if (*mtop+1 >= jptr->beg) // touches or overlaps the last emitted interval: extend it
+ { if (*mtop < c)
+ *mtop = c;
+ }
+ else
+ { *++mtop = jptr->beg;
+ *++mtop = c;
+ }
+ lptr = jptr->prev; // unlink jptr and push it onto the free list
+ cptr->prev = lptr;
+ lptr->next = cptr;
+ jptr->next = aptr;
+ aptr = jptr;
+ }
+
+ if (wsqr <= lsqr*THRESH) continue;
+
+ jptr = cptr->next;
+ lptr = cptr;
+ mscore = 0.;
+ for (c = lb; c > wb; c--)
+ { d = read[c];
+ LADDR(d)
+ if (lsqr >= THRESH * (j-c))
+ { for ( ; jptr->beg >= c; jptr = (lptr = jptr)->next)
+ if (jptr->score > mscore)
+ mscore = jptr->score;
+ if (lsqr >= mscore * (j-c))
+ { mscore = lsqr / (j-c);
+ if (lptr->beg == c)
+ { lptr->end = j;
+ lptr->score = mscore;
+ }
+ else
+ { aptr->beg = c; // take a record off the free list and link it in after lptr
+ aptr->end = j;
+ aptr->score = mscore;
+ aptr->prev = lptr;
+ lptr = lptr->next = aptr;
+ aptr = aptr->next;
+ jptr->prev = lptr;
+ lptr->next = jptr;
+ }
+ }
+ }
+ }
+
+ for (c++; c <= lb; c++) // undo the LADDR updates made by the scan above
+ { d = read[c];
+ LDELR(d)
+ }
+ }
+ }
+
+ else
+
+ { int lsqr, wsqr, trun; // Algorithm for GC-balanced sequences
+
+ wsqr = lsqr = 0;
+ for (j = 0; j < len; j++)
+ { c = read[j];
+
+#define ADDI(e,cnt,sqr) sqr += (cnt[e]++);
+
+#define DELI(e,cnt,sqr) sqr -= (--cnt[e]);
+
+#define WADDI(e) ADDI(e,wcount,wsqr)
+#define WDELI(e) DELI(e,wcount,wsqr)
+#define LADDI(e) ADDI(e,lcount,lsqr)
+#define LDELI(e) DELI(e,lcount,lsqr)
+
+ if (j > WINDOW-3)
+ { d = read[++wb];
+ WDELI(d)
+ }
+ WADDI(c)
+
+ if (lb < wb)
+ { d = read[++lb];
+ LDELI(d)
+ }
+ trun = lcount[c]++;
+ lsqr += trun;
+ if (trun >= thresh2i)
+ { while (lb < j)
+ { d = read[++lb];
+ LDELI(d)
+ if (d == c) break;
+ }
+ }
+
+ jptr = cptr->prev;
+ if (jptr != cptr && jptr->beg <= wb) // last active candidate no longer begins after wb: emit it
+ { c = jptr->end + 2;
+ if (*mtop+1 >= jptr->beg) // touches or overlaps the last emitted interval: extend it
+ { if (*mtop < c)
+ *mtop = c;
+ }
+ else
+ { *++mtop = jptr->beg;
+ *++mtop = c;
+ }
+ lptr = jptr->prev; // unlink jptr and push it onto the free list
+ cptr->prev = lptr;
+ lptr->next = cptr;
+ jptr->next = aptr;
+ aptr = jptr;
+ }
+
+ if (wsqr <= lsqr*THRESH) continue;
+
+ jptr = cptr->next;
+ lptr = cptr;
+ mscore = 0.;
+ for (c = lb; c > wb; c--)
+ { d = read[c];
+ LADDI(d)
+ if (lsqr >= THRESH * (j-c))
+ { for ( ; jptr->beg >= c; jptr = (lptr = jptr)->next)
+ if (jptr->score > mscore)
+ mscore = jptr->score;
+ if (lsqr >= mscore * (j-c))
+ { mscore = (1. * lsqr) / (j-c);
+ if (lptr->beg == c)
+ { lptr->end = j;
+ lptr->score = mscore;
+ }
+ else
+ { aptr->beg = c; // take a record off the free list and link it in after lptr
+ aptr->end = j;
+ aptr->score = mscore;
+ aptr->prev = lptr;
+ lptr = lptr->next = aptr;
+ aptr = aptr->next;
+ jptr->prev = lptr;
+ lptr->next = jptr;
+ }
+ }
+ }
+ }
+
+ for (c++; c <= lb; c++) // undo the LADDI updates made by the scan above
+ { d = read[c];
+ LDELI(d)
+ }
+ }
+ }
+
+ while ((jptr = cptr->prev) != cptr) // end of read: emit every candidate still on the active list
+ { c = jptr->end + 2;
+ if (*mtop+1 >= jptr->beg)
+ { if (*mtop < c)
+ *mtop = c;
+ }
+ else
+ { *++mtop = jptr->beg;
+ *++mtop = c;
+ }
+ cptr->prev = jptr->prev;
+ jptr->prev->next = cptr;
+ jptr->next = aptr;
+ aptr = jptr;
+ }
+
+ { int *jtop, ntop;
+
+ ntop = 0;
+ for (jtop = mask1; jtop < mtop; jtop += 2) // keep only intervals passing the -m length filter
+ if (jtop[1] - jtop[0] >= MINLEN)
+ { mask[++ntop] = jtop[0];
+ mask[++ntop] = jtop[1]+1;
+ }
+ mtop = mask + ntop;
+ indx += ntop*sizeof(int);
+ fwrite(&indx,sizeof(int64),1,afile); // data offset just past this read's intervals
+ fwrite(mask1,sizeof(int),ntop,dfile);
+ }
+
+#ifdef DEBUG
+
+ { int *jtop;
+
+ printf("\nREAD %d\n",i);
+ for (jtop = mask1; jtop < mtop; jtop += 2)
+ printf(" [%5d,%5d]\n",jtop[0],jtop[1]);
+
+ Load_Read(db,i,read,0);
+
+ jtop = mask1;
+ for (c = 0; c < len; c++)
+ { while (jtop < mtop && c > jtop[1])
+ jtop += 2;
+ if (jtop < mtop && c >= *jtop)
+ printf("%c",Caps[(int) read[c]]);
+ else
+ printf("%c",Lowr[(int) read[c]]);
+ if ((c%80) == 79)
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+#endif
+ }
+ }
+
+ fclose(afile);
+ fclose(dfile);
+
+ Close_DB(db);
+
+ exit (0);
+}
diff --git a/DBrm.c b/DBrm.c
new file mode 100644
index 0000000..390d912
--- /dev/null
+++ b/DBrm.c
@@ -0,0 +1,77 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/********************************************************************************************
+ *
+ * Remove a list of .db databases
+ * Delete all the files for the given data bases <name>.db ... (there are a couple
+ * of hidden . files for each DB, and these are removed too.) Do not use "rm" to
+ * remove a database.
+ *
+ * Author: Gene Myers
+ * Date : July 2013
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "DB.h"
+
+static char *Usage = "<path:db|dam> ... ";
+
+static void HANDLER(char *path, char *name) // callback handed to List_DB_Files: delete the file at path
+{ unlink(path); // the result of unlink is not examined
+ (void) name; // the extension argument is unused; the cast marks that as intentional
+}
+
+int main(int argc, char *argv[]) // Remove every file of each database named on the command line
+{ int i;
+ Prog_Name = Strdup("DBrm","");
+ if (argc <= 1) // no database given: report usage and fail like the other tools do
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+
+ for (i = 1; i < argc; i++) // a database that cannot be listed is reported and skipped
+ if (List_DB_Files(argv[i],HANDLER) < 0)
+ fprintf(stderr,"%s: Could not list database %s\n",Prog_Name,argv[i]);
+
+ exit (0);
+}
diff --git a/DBshow.c b/DBshow.c
new file mode 100644
index 0000000..703fb14
--- /dev/null
+++ b/DBshow.c
@@ -0,0 +1,612 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
+ * Display a specified set of reads of a database in fasta format.
+ *
+ * Author: Gene Myers
+ * Date : September 2013
+ * Mod : With DB overhaul, made this a routine strictly for printing a selected subset
+ * and created DB2fasta for recreating all the fasta files of a DB
+ * Date : April 2014
+ * Mod : Added options to display QV streams
+ * Date : July 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "DB.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#else
+#define PATHSEP "/"
+#endif
+
+static char *Usage[] =
+ { "[-unqUQ] [-w<int(80)>] [-m<track>]+",
+ " <path:db|dam> [ <reads:FILE> | <reads:range> ... ]"
+ };
+
+#define LAST_READ_SYMBOL '$'
+#define MAX_BUFFER 10001
+
+typedef struct // Cursor over a read-list file; next_read fills it in one line at a time
+ { FILE *input; // Stream the read list is read from
+ int lineno; // 1-based number of the line the next call to next_read will parse
+ int read; // Read index parsed from the current line (main treats it as 1-based)
+ int beg; // Start of the requested substring, or -1 if the line held only a read index
+ int end; // End of the requested substring (only set when three values were parsed)
+ } File_Iterator;
+
+File_Iterator *init_file_iterator(FILE *input)
+{ File_Iterator *it;
+  //  Returns a heap-allocated iterator at line 1 of 'input' (which is rewound); caller frees it.
+  it = (File_Iterator *) Malloc(sizeof(File_Iterator),"Allocating file iterator");
+  if (it == NULL) exit (1);      // out of memory: stop here rather than dereference NULL below
+  it->input = input; it->lineno = 1;
+  rewind(input);
+  return (it);
+}
+
+int next_read(File_Iterator *it)
+{ static char nbuffer[MAX_BUFFER];
+  //  Parse the next line of the read list, "<read>" or "<read> <beg> <end>", into *it.
+  char *eol;
+  int   x;
+  //  Returns 0 if a line was parsed, 1 at end of file or after reporting a bad line on stderr.
+  if (fgets(nbuffer,MAX_BUFFER,it->input) == NULL)
+    { if (feof(it->input))
+        return (1);
+      SYSTEM_ERROR;
+    }
+  if ((eol = strchr(nbuffer,'\n')) == NULL && !feof(it->input))  // a final line may lack its '\n'
+    { fprintf(stderr,"%s: Line %d in read list is longer than %d chars!\n",
+                     Prog_Name,it->lineno,MAX_BUFFER-1);
+      return (1);
+    }
+  if (eol != NULL) *eol = '\0';
+  x = sscanf(nbuffer," %d %d %d",&(it->read),&(it->beg),&(it->end));
+  if (x == 1)
+    it->beg = -1;                  // only a read index was given: flag "no substring requested"
+  else if (x != 3)
+    { fprintf(stderr,"%s: Line %d of read list is improperly formatted\n",Prog_Name,it->lineno);
+      return (1);
+    }
+  it->lineno += 1;
+  return (0);
+}
+
+int main(int argc, char *argv[])
+{ HITS_DB    _db, *db = &_db;
+  FILE      *hdrs = NULL;        // .hdr file, opened only when the argument is a .dam
+  //  DBshow: print the selected reads of a .db/.dam on stdout; every diagnostic goes to stderr.
+  int        nfiles;
+  char     **flist = NULL;       // prolog name of each source file listed in the .db stub
+  int       *findx = NULL;       // findx[i] = index one past the last read of file i
+
+  int            reps, *pts;     // pts[0..reps) holds (first,last) pairs of 1-based read indices
+  int            input_pts;      // non-zero if the reads are listed in a file (argv[2])
+  File_Iterator *iter;
+  FILE          *input;
+
+  int         TRIM, UPPER;
+  int         DOSEQ, DOQVS, QUIVA, DAM;
+  int         WIDTH;
+
+  int         MMAX, MTOP;
+  char      **MASK;
+
+  //  Process arguments
+
+  { int   i, j, k;
+    int   flags[128];
+    char *eptr;
+
+    ARG_INIT("DBshow")
+
+    WIDTH = 80;
+    MTOP  = 0;
+    MMAX  = 10;
+    MASK  = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array");
+    if (MASK == NULL)
+      exit (1);
+
+    j = 1;
+    for (i = 1; i < argc; i++)
+      if (argv[i][0] == '-')
+        switch (argv[i][1])
+        { default:
+            ARG_FLAGS("unqUQ")
+            break;
+          case 'w':
+            ARG_NON_NEGATIVE(WIDTH,"Line width")
+            break;
+          case 'm':
+            if (MTOP >= MMAX)
+              { MMAX = 1.2*MTOP + 10;
+                MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array");
+                if (MASK == NULL)
+                  exit (1);
+              }
+            MASK[MTOP++] = argv[i]+2;
+            break;
+        }
+      else
+        argv[j++] = argv[i];
+    argc = j;
+
+    DAM   = 0;
+    TRIM  = 1-flags['u'];
+    UPPER = 1+flags['U'];        // 1 or 2; handed to Load_Read/Load_QVentry, selects the mask case below
+    DOQVS = flags['q'];
+    DOSEQ = 1-flags['n'];
+    QUIVA = flags['Q'];
+    if (QUIVA && (!DOSEQ || MTOP > 0))
+      { fprintf(stderr,"%s: -Q (quiva) format request inconsistent with -n and -m options\n",
+                       Prog_Name);
+        exit (1);
+      }
+    if (QUIVA)
+      DOQVS = 1;
+
+    if (argc <= 1)
+      { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
+        fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]);
+        exit (1);
+      }
+  }
+
+  //  Open DB or DAM, and if a DAM open also .hdr file
+
+  { char *pwd, *root;
+    int   status;
+
+    status = Open_DB(argv[1],db);
+    if (status < 0)
+      exit (1);
+    if (status == 1)
+      { root   = Root(argv[1],".dam");
+        pwd    = PathTo(argv[1]);
+
+        hdrs = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"r");
+        if (hdrs == NULL)
+          exit (1);
+        DAM = 1;
+        if (QUIVA || DOQVS)
+          { fprintf(stderr,"%s: -Q and -q options not compatible with a .dam DB\n",Prog_Name);
+            exit (1);
+          }
+
+        free(root);
+        free(pwd);
+      }
+  }
+
+  //  Load QVs if requested
+
+  if (DOQVS)
+    { if (Load_QVs(db) < 0)
+        { fprintf(stderr,"%s: QVs requested, but no .qvs for data base\n",Prog_Name);
+          exit (1);
+        }
+    }
+
+  //  Check tracks and load tracks for untrimmed DB (warnings go to stderr, not into the output)
+
+  { int i, status;
+
+    for (i = 0; i < MTOP; i++)
+      { status = Check_Track(db,MASK[i]);
+        if (status == -2)
+          fprintf(stderr,"%s: Warning: -m%s option given but no track found.\n",Prog_Name,MASK[i]);
+        else if (status == -1)
+          fprintf(stderr,"%s: Warning: %s track not sync'd with db.\n",Prog_Name,MASK[i]);
+        else if (status == 0)
+          Load_Track(db,MASK[i]);
+        else if (status == 1 && !TRIM)
+          fprintf(stderr,"%s: Warning: %s track is for a trimmed db but -u is set.\n",Prog_Name,MASK[i]);
+      }
+  }
+
+  //  If not a DAM then get prolog names and index ranges from the .db file
+
+  if (!DAM)
+    { char *pwd, *root;
+      FILE *dstub;
+      int   i;
+
+      root   = Root(argv[1],".db");
+      pwd    = PathTo(argv[1]);
+      if (db->part > 0)
+        *strrchr(root,'.') = '\0';       // drop the ".<block>" suffix to get the stub name
+      dstub = Fopen(Catenate(pwd,"/",root,".db"),"r");
+      if (dstub == NULL)
+        exit (1);
+      free(pwd);
+      free(root);
+
+      if (fscanf(dstub,DB_NFILE,&nfiles) != 1)
+        SYSTEM_ERROR
+
+      flist = (char **) Malloc(sizeof(char *)*nfiles,"Allocating file list");
+      findx = (int *) Malloc(sizeof(int)*(nfiles+1),"Allocating file index");
+      if (flist == NULL || findx == NULL)
+        exit (1);
+
+      findx += 1;                        // findx[-1] = 0 is a sentinel for the map search below
+      findx[-1] = 0;
+
+      for (i = 0; i < nfiles; i++)
+        { char prolog[MAX_NAME], fname[MAX_NAME];
+
+          if (fscanf(dstub,DB_FDATA,findx+i,fname,prolog) != 3)
+            SYSTEM_ERROR
+          if ((flist[i] = Strdup(prolog,"Adding to file list")) == NULL)
+            exit (1);
+        }
+
+      fclose(dstub);
+
+      //  If TRIM (the default) then "trim" prolog ranges and the DB
+
+      if (TRIM)
+        { int        nid, oid, lid;
+          int        cutoff, allflag;
+          HITS_READ *reads;
+
+          reads  = db->reads - db->ufirst;   // lets reads[] be indexed by the oid values used below
+          cutoff = db->cutoff;
+          if (db->all)
+            allflag = 0;
+          else
+            allflag = DB_BEST;
+
+          nid = 0;
+          oid = db->ufirst;
+          lid = oid + db->nreads;
+          for (i = 0; i < nfiles; i++)
+            { while (oid < findx[i] && oid < lid)
+                { if ((reads[oid].flags & DB_BEST) >= allflag && reads[oid].rlen >= cutoff)
+                    nid++;
+                  oid += 1;
+                }
+              findx[i] = nid;
+            }
+        }
+
+      else if (db->part > 0)
+        { for (i = 0; i < nfiles; i++)
+            findx[i] -= db->ufirst;
+        }
+    }
+
+  if (TRIM)
+    { int i, status;
+
+      Trim_DB(db);
+
+      //  Load tracks for trimmed DB
+
+      for (i = 0; i < MTOP; i++)
+        { status = Check_Track(db,MASK[i]);
+          if (status < 0)
+            continue;
+          else if (status == 1)
+            Load_Track(db,MASK[i]);
+        }
+    }
+
+  //  Process read index arguments into a list of read ranges
+
+  input_pts = 0;                   // a lone argument that is not "$", <int> or <int>-<int|$> names a file
+  if (argc == 3)
+    { if (argv[2][0] != LAST_READ_SYMBOL || argv[2][1] != '\0')
+        { char *eptr, *fptr;
+          int   b, e;
+
+          b = strtol(argv[2],&eptr,10);
+          if (eptr > argv[2] && b > 0)
+            { if (*eptr == '-')
+                { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0')
+                    { e = strtol(eptr+1,&fptr,10);
+                      input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0);
+                    }
+                }
+              else
+                input_pts = (*eptr != '\0');
+            }
+          else
+            input_pts = 1;
+        }
+    }
+
+  if (input_pts)
+    { input = Fopen(argv[2],"r");
+      if (input == NULL)
+        exit (1);
+
+      iter = init_file_iterator(input);
+    }
+  else
+    { pts = (int *) Malloc(sizeof(int)*2*(argc-1),"Allocating read parameters");
+      if (pts == NULL)
+        exit (1);
+
+      reps = 0;
+      if (argc > 2)
+        { int   c, b, e;
+          char *eptr, *fptr;
+
+          for (c = 2; c < argc; c++)
+            { if (argv[c][0] == LAST_READ_SYMBOL)
+                { b = db->nreads;
+                  eptr = argv[c]+1;
+                }
+              else
+                b = strtol(argv[c],&eptr,10);
+              if (eptr > argv[c])
+                { if (b <= 0)
+                    { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b);
+                      exit (1);
+                    }
+                  if (*eptr == 0)
+                    { pts[reps++] = b;
+                      pts[reps++] = b;
+                      continue;
+                    }
+                  else if (*eptr == '-')
+                    { if (eptr[1] == LAST_READ_SYMBOL)
+                        { e = db->nreads;
+                          fptr = eptr+2;
+                        }
+                      else
+                        e = strtol(eptr+1,&fptr,10);
+                      if (fptr > eptr+1 && *fptr == 0 && e > 0)
+                        { pts[reps++] = b;
+                          pts[reps++] = e;
+                          if (b > e)
+                            { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]);
+                              exit (1);
+                            }
+                          continue;
+                        }
+                    }
+                }
+              fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]);
+              exit (1);
+            }
+        }
+      else
+        { pts[reps++] = 1;
+          pts[reps++] = db->nreads;
+        }
+    }
+
+  //  Display each read (and/or QV streams) in the active DB according to the
+  //    range pairs in pts[0..reps) and according to the display options.
+
+  { HITS_READ  *reads;
+    HITS_TRACK *first;
+    char       *read, **entry;
+    int         c, b, e, i;
+    int         hilight, substr;
+    int         map;
+    int       (*iscase)(int);
+
+    read  = New_Read_Buffer(db);
+    if (DOQVS)
+      { entry = New_QV_Buffer(db);
+        first = db->tracks->next;  // presumably skips the QV track Load_QVs put first -- confirm in DB.c
+      }
+    else
+      { entry = NULL;
+        first = db->tracks;
+      }
+
+    if (UPPER == 1)                // masked intervals are shown in the opposite case of the sequence
+      { hilight = 'A'-'a';
+        iscase  = islower;
+      }
+    else
+      { hilight = 'a'-'A';
+        iscase  = isupper;
+      }
+
+    map    = 0;
+    reads  = db->reads;
+    substr = 0;
+
+    c = 0;
+    while (1)
+      { if (input_pts)
+          { if (next_read(iter))
+              break;
+            e = iter->read;  b = e-1;
+            substr = (iter->beg >= 0);
+            if (b < 0 || e > db->nreads)      // an index from the read list must lie in [1,nreads]
+              { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,e); exit (1); }
+          }
+        else
+          { if (c >= reps)
+              break;
+            b = pts[c]-1;
+            e = pts[c+1];
+            if (e > db->nreads)
+              e = db->nreads;
+            c += 2;
+          }
+        for (i = b; i < e; i++)
+          { int         len;
+            int         fst, lst, wid;
+            int         flags, qv;
+            HITS_READ  *r;
+            HITS_TRACK *track;
+
+            r   = reads + i;
+            len = r->rlen;
+
+            flags = r->flags;
+            qv    = (flags & DB_QV);
+            if (DAM)
+              { char header[MAX_NAME];
+                fseeko(hdrs,r->coff,SEEK_SET);
+                if (fgets(header,MAX_NAME,hdrs) == NULL)
+                  SYSTEM_ERROR
+                header[strcspn(header,"\n")] = '\0';   // strip the newline only if one was read
+                printf("%s :: Contig %d[%d,%d]",header,r->origin,r->fpulse,r->fpulse+len);
+              }
+            else
+              { while (i < findx[map-1])     // move map to the source file whose range contains i
+                  map -= 1;
+                while (i >= findx[map])
+                  map += 1;
+                if (QUIVA)
+                  printf("@%s/%d/%d_%d",flist[map],r->origin,r->fpulse,r->fpulse+len);
+                else
+                  printf(">%s/%d/%d_%d",flist[map],r->origin,r->fpulse,r->fpulse+len);
+                if (qv > 0)
+                  printf(" RQ=0.%3d",qv);    // NOTE(review): %3d space-pads values < 100; confirm %03d was not meant
+              }
+            printf("\n");
+
+            if (DOQVS)
+              Load_QVentry(db,i,entry,UPPER);
+            if (DOSEQ)
+              Load_Read(db,i,read,UPPER);
+
+            for (track = first; track != NULL; track = track->next)
+              { int64 *anno;
+                int   *data;
+                int64  s, f, j;
+                int    bd, ed, m;
+
+                anno = (int64 *) track->anno;
+                data = (int *) track->data;
+
+                s = (anno[i] >> 2);          // anno[] values shifted by 2 are used as indices into data[]
+                f = (anno[i+1] >> 2);
+                if (s < f)
+                  { for (j = s; j < f; j += 2)
+                      { bd = data[j];
+                        ed = data[j+1];
+                        if (DOSEQ)
+                          for (m = bd; m < ed; m++)
+                            if (iscase(read[m]))
+                              read[m] = (char) (read[m] + hilight);
+                        if (j == s)
+                          printf("> %s:",track->name);
+                        printf(" [%d,%d]",bd,ed);
+                      }
+                    printf("\n");
+                  }
+              }
+
+            fst = 0;
+            lst = len;
+            if (substr)        // clamp a read-list interval to [0,len] so printing stays inside the buffers
+              { lst = (iter->end < len ? iter->end : len);
+                if (lst < 0) lst = 0;
+                fst = (iter->beg < lst ? iter->beg : lst);
+              }
+            wid = (WIDTH > 0 ? WIDTH : lst-fst+1);   // treat -w0 as "no line breaks" (a zero step never ends)
+
+            if (QUIVA)
+              { int k;
+
+                for (k = 0; k < 5; k++)
+                  printf("%.*s\n",lst-fst,entry[k]+fst);
+              }
+            else
+              { if (DOQVS)
+                  { int j, k;
+
+                    printf("\n");
+                    for (j = fst; j+wid < lst; j += wid)
+                      { if (DOSEQ)
+                          printf("%.*s\n",wid,read+j);
+                        for (k = 0; k < 5; k++)
+                          printf("%.*s\n",wid,entry[k]+j);
+                        printf("\n");
+                      }
+                    if (j < lst)
+                      { if (DOSEQ)
+                          printf("%.*s\n",lst-j,read+j);
+                        for (k = 0; k < 5; k++)
+                          printf("%.*s\n",lst-j,entry[k]+j);
+                        printf("\n");
+                      }
+                  }
+                else if (DOSEQ)
+                  { int j;
+
+                    for (j = fst; j+wid < lst; j += wid)
+                      printf("%.*s\n",wid,read+j);
+                    if (j < lst)
+                      printf("%.*s\n",lst-j,read+j);
+                  }
+              }
+          }
+      }
+  }
+
+  if (input_pts)
+    { fclose(input);
+      free(iter);
+    }
+  else
+    free(pts);
+
+  if (DAM)
+    fclose(hdrs);
+  else
+    { int i;
+
+      for (i = 0; i < nfiles; i++)
+        free(flist[i]);
+      free(flist);
+      free(findx-1);               // undo the +1 applied right after allocation
+    }
+  Close_DB(db);
+
+  exit (0);
+}
diff --git a/DBsplit.c b/DBsplit.c
new file mode 100644
index 0000000..6961228
--- /dev/null
+++ b/DBsplit.c
@@ -0,0 +1,246 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
+ * Split a .db into a set of sub-database blocks for use by the Dazzler:
+ * Divide the database <path>.db conceptually into a series of blocks referable to on the
+ * command line as <path>.1.db, <path>.2.db, ... If the -x option is set then all reads
+ * less than the given length are ignored, and if the -a option is not set then secondary
+ * reads from a given well are also ignored. The remaining reads are split amongst the
+ * blocks so that each block is of size -s * 1Mbp except for the last which necessarily
+ * contains a smaller residual. The default value for -s is 200Mbp because blocks of this
+ * size can be compared by our "overlapper" dalign in roughly 16Gb of memory. The blocks
+ * are very space efficient in that their sub-index of the master .idx is computed on the
+ * fly when loaded, and the .bps file of base pairs is shared with the master DB. Any
+ * tracks associated with the DB are also computed on the fly when loading a database block.
+ *
+ * Author: Gene Myers
+ * Date : September 2013
+ * Mod : New splitting definition to support incrementality, and new stub file format
+ * Date : April 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "DB.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#else
+#define PATHSEP "/"
+#endif
+
+static char *Usage = "[-a] [-x<int>] [-s<int(200)>] <path:db|dam>";
+
+int main(int argc, char *argv[])
+{ HITS_DB    db, dbs;
+  int64      dbpos;
+  FILE      *dbfile, *ixfile;
+  int        status;
+  //  DBsplit: rewrite the block table of a .db/.dam stub and the trim settings in its .idx header.
+  int        ALL;
+  int        CUTOFF;
+  int        SIZE;
+
+  { int   i, j, k;
+    int   flags[128];
+    char *eptr;
+
+    ARG_INIT("DBsplit")
+
+    CUTOFF = 0;
+    SIZE   = 200;
+
+    j = 1;
+    for (i = 1; i < argc; i++)
+      if (argv[i][0] == '-')
+        switch (argv[i][1])
+        { default:
+            ARG_FLAGS("a")
+            break;
+          case 'x':
+            ARG_NON_NEGATIVE(CUTOFF,"Min read length cutoff")
+            break;
+          case 's':
+            ARG_POSITIVE(SIZE,"Block size")
+            break;
+        }
+      else
+        argv[j++] = argv[i];
+    argc = j;
+
+    ALL = flags['a'];
+
+    if (argc != 2)
+      { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+        exit (1);
+      }
+  }
+
+  //  Open db
+
+  status = Open_DB(argv[1],&db);
+  if (status < 0)
+    exit (1);
+  if (db.part > 0)
+    { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
+      exit (1);
+    }
+
+  { char *pwd, *root;
+    char  buffer[2*MAX_NAME+100];
+    int   nfiles;
+    int   i;
+
+    pwd    = PathTo(argv[1]);
+    if (status)
+      { root   = Root(argv[1],".dam");
+        dbfile = Fopen(Catenate(pwd,"/",root,".dam"),"r+");
+      }
+    else
+      { root   = Root(argv[1],".db");
+        dbfile = Fopen(Catenate(pwd,"/",root,".db"),"r+");
+      }
+    ixfile = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+");
+    if (dbfile == NULL || ixfile == NULL)
+      exit (1);
+    free(pwd);
+    free(root);
+
+    if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
+      SYSTEM_ERROR
+    for (i = 0; i < nfiles; i++)         // skip the per-file lines; the block table follows them
+      if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL)
+        SYSTEM_ERROR
+
+    if (fread(&dbs,sizeof(HITS_DB),1,ixfile) != 1)
+      SYSTEM_ERROR
+
+    if (dbs.cutoff >= 0)
+      { printf("You are about to overwrite the current partition settings. This\n");
+        printf("will invalidate any tracks, overlaps, and other derivative files.\n");
+        printf("Are you sure you want to proceed? [Y/N] ");
+        fflush(stdout);
+        if (fgets(buffer,100,stdin) == NULL)
+          SYSTEM_ERROR
+        if (strchr(buffer,'n') != NULL || strchr(buffer,'N') != NULL)
+          { printf("Aborted\n");
+            fflush(stdout);
+            fclose(dbfile);
+            exit (1);
+          }
+      }
+
+    dbpos = ftello(dbfile);
+    fseeko(dbfile,dbpos,SEEK_SET);       // reposition before switching the "r+" stream to writing
+    fprintf(dbfile,DB_NBLOCK,0);         // placeholder count, overwritten once nblock is known
+    fprintf(dbfile,DB_PARAMS,(int64) SIZE,CUTOFF,ALL);
+  }
+
+  { HITS_READ *reads  = db.reads;
+    int        nreads = db.ureads;
+    int64      size, totlen;
+    int        nblock, ireads, treads, rlen, fno;
+    int        i;
+
+    size = SIZE*1000000ll;
+
+    nblock = 0;
+    totlen = 0;
+    ireads = 0;
+    treads = 0;
+    fprintf(dbfile,DB_BDATA,0,0);
+
+    //  Scan all db.ureads reads in order.  A read is kept if its length is at least CUTOFF
+    //    and, unless -a was given, it carries the DB_BEST flag.  Whenever the bases of the
+    //    reads kept since the last boundary reach 'size', write a boundary record holding
+    //    the index of the next read and the number of reads kept so far.
+
+    for (i = 0; i < nreads; i++)
+      { rlen = reads[i].rlen;
+        if (rlen >= CUTOFF && (ALL || (reads[i].flags & DB_BEST) != 0))
+          { ireads += 1;
+            treads += 1;
+            totlen += rlen;
+            if (totlen >= size)
+              { fprintf(dbfile,DB_BDATA,i+1,treads);
+                totlen = 0;
+                ireads = 0;
+                nblock += 1;
+              }
+          }
+      }
+
+    //  Reads kept after the last boundary form a final, smaller block that ends
+    //    at the last read of the DB.
+
+    if (ireads > 0)
+      { fprintf(dbfile,DB_BDATA,nreads,treads);
+        nblock += 1;
+      }
+
+    //  Cut the stub off at the current position so nothing of an earlier split remains
+    fno = fileno(dbfile);
+    if (ftruncate(fno,ftello(dbfile)) < 0)
+      SYSTEM_ERROR
+
+    //  Go back and replace the placeholder block count with the real one
+
+    fseeko(dbfile,dbpos,SEEK_SET);
+    fprintf(dbfile,DB_NBLOCK,nblock);
+
+    //  Record the new trim settings in the header of the .idx file
+
+    dbs.cutoff = CUTOFF;
+    dbs.all    = ALL;
+    dbs.treads = treads;
+    rewind(ixfile);
+    if (fwrite(&dbs,sizeof(HITS_DB),1,ixfile) != 1)
+      SYSTEM_ERROR
+  }
+
+  fclose(ixfile);
+  fclose(dbfile);
+  Close_DB(&db);
+
+  exit (0);
+}
diff --git a/DBstats.c b/DBstats.c
new file mode 100644
index 0000000..542b77b
--- /dev/null
+++ b/DBstats.c
@@ -0,0 +1,358 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
+ * Display statistics about the contents of a .db and a histogram of its read lengths.
+ *
+ * Author: Gene Myers
+ * Date : July 2013
+ * Mod : April 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#include "DB.h"
+
+static char *Usage = " [-nu] [-b<int(1000)>] [-m<track>]+ <name:db|dam>";
+
+int main(int argc, char *argv[])
+{ HITS_DB _db, *db = &_db;
+  int     dam;
+  //  DBstats: print summary statistics and length histograms for a .db/.dam and its -m tracks.
+  int64   ototal;                  // total bases before trimming
+  int     oreads;                  // number of reads before trimming
+  int     nbin, *hist;             // hist[b] = number of lengths falling in bin b (b = length/BIN)
+  int64  *bsum;                    // bsum[b] = sum of the lengths counted in hist[b]
+
+  int     NONE;
+  int     TRIM;
+  int     BIN;
+
+  int     MMAX, MTOP;
+  char  **MASK;
+
+  { int   i, j, k;
+    int   flags[128];
+    char *eptr;
+
+    ARG_INIT("DBstats")
+
+    BIN  = 1000;
+    MTOP = 0;
+    MMAX = 10;
+    MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array");
+    if (MASK == NULL)
+      exit (1);
+
+    j = 1;
+    for (i = 1; i < argc; i++)
+      if (argv[i][0] == '-')
+        switch (argv[i][1])
+        { default:
+            ARG_FLAGS("nu")
+            break;
+          case 'b':
+            ARG_POSITIVE(BIN,"Bin size")
+            break;
+          case 'm':
+            if (MTOP >= MMAX)
+              { MMAX = 1.2*MTOP + 10;
+                MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array");
+                if (MASK == NULL)
+                  exit (1);
+              }
+            MASK[MTOP++] = argv[i]+2;
+            break;
+        }
+      else
+        argv[j++] = argv[i];
+    argc = j;
+
+    NONE = flags['n'];
+    TRIM = 1-flags['u'];
+
+    if (argc != 2)
+      { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+        exit (1);
+      }
+  }
+
+  { int i, status;
+
+    //  Open .db or .dam
+
+    status = Open_DB(argv[1],db);
+    if (status < 0)
+      exit (1);
+    dam = status;
+
+    //  Check tracks and load tracks for untrimmed DB
+
+    for (i = 0; i < MTOP; i++)
+      { status = Check_Track(db,MASK[i]);
+        if (status == -2)
+          fprintf(stderr,"%s: Warning: -m%s option given but no track found.\n",Prog_Name,MASK[i]);
+        else if (status == -1)
+          fprintf(stderr,"%s: Warning: %s track not sync'd with db.\n",Prog_Name,MASK[i]);
+        else if (status == 0)
+          Load_Track(db,MASK[i]);
+        else if (status == 1 && !TRIM)
+          fprintf(stderr,"%s: Warning: %s track is for a trimmed db but -u is set.\n",
+                         Prog_Name,MASK[i]);
+      }
+
+    oreads = db->nreads;
+    ototal = db->totlen;
+
+    if (TRIM)
+      { Trim_DB(db);
+
+        //  Load tracks for trimmed DB
+
+        for (i = 0; i < MTOP; i++)
+          { status = Check_Track(db,MASK[i]);
+            if (status < 0)
+              continue;
+            else if (status == 1)
+              Load_Track(db,MASK[i]);
+          }
+      }
+  }
+
+  { int        i;
+    int64      totlen;
+    int        nreads, maxlen;
+    int64      ave, dev;
+    HITS_READ *reads;
+
+    nreads = db->nreads;
+    totlen = db->totlen;
+    maxlen = db->maxlen;
+    reads  = db->reads;
+
+    nbin  = maxlen/BIN + 1;        // bins 0..maxlen/BIN: a length equal to maxlen must have a slot
+    hist  = (int *) Malloc(sizeof(int)*nbin,"Allocating histograms");
+    bsum  = (int64 *) Malloc(sizeof(int64)*nbin,"Allocating histograms");
+    if (hist == NULL || bsum == NULL)
+      exit (1);
+
+    for (i = 0; i < nbin; i++)
+      { hist[i] = 0;
+        bsum[i] = 0;
+      }
+
+    for (i = 0; i < nreads; i++)
+      { int rlen = reads[i].rlen;
+        hist[rlen/BIN] += 1;
+        bsum[rlen/BIN] += rlen;
+      }
+
+    //  Guard the mean and deviation: with no reads left the divisions below would trap
+    ave = (nreads > 0 ? totlen/nreads : 0);
+    dev = 0;
+    for (i = 0; i < nreads; i++)
+      { int rlen = reads[i].rlen;
+        dev += (rlen-ave)*(rlen-ave);
+      }
+    dev = (nreads > 0 ? (int64) sqrt((1.*dev)/nreads) : 0);
+
+    if (dam)
+      printf("\nStatistics for all contigs");
+    else if (db->all || !TRIM)
+      printf("\nStatistics for all wells");
+    else
+      printf("\nStatistics for all reads");
+    if (TRIM && db->cutoff > 0)
+      { printf(" of length ");
+        Print_Number(db->cutoff,0,stdout);
+        printf(" bases or more\n\n");
+      }
+    else if (dam)
+      printf(" in the map index\n\n");
+    else
+      printf(" in the data set\n\n");
+
+    Print_Number((int64) nreads,15,stdout);
+    if (dam)
+      printf(" contigs");
+    else
+      printf(" reads ");
+    if (TRIM)
+      { printf(" out of ");
+        Print_Number((int64 ) oreads,15,stdout);
+        printf(" (%5.1f%%)",(100.*nreads)/oreads);
+      }
+    printf("\n");
+
+    Print_Number(totlen,15,stdout);
+    printf(" base pairs");
+    if (TRIM)
+      { printf(" out of ");
+        Print_Number(ototal,15,stdout);
+        printf(" (%5.1f%%)",(100.*totlen)/ototal);
+      }
+    printf("\n\n");
+
+    Print_Number(ave,15,stdout);
+    if (dam)
+      printf(" average contig length\n");
+    else
+      { printf(" average read length\n");
+        Print_Number(dev,15,stdout);
+        printf(" standard deviation\n");
+      }
+
+    printf("\n Base composition: %.3f(A) %.3f(C) %.3f(G) %.3f(T)\n",
+           db->freq[0],db->freq[1],db->freq[2],db->freq[3]);
+
+    if (!NONE)
+      { int64 btot;
+        int   cum, skip;
+
+        printf("\n Distribution of Read Lengths (Bin size = ");
+        Print_Number((int64) BIN,0,stdout);
+        printf(")\n\n Bin: Count %% Reads %% Bases Average\n");
+        if (dam)                   // for a .dam empty bins are not printed ...
+          skip = 0;
+        else                       // ... for a .db no count equals -1, so every bin is printed
+          skip = -1;
+        cum  = 0;
+        btot = 0;
+        for (i = nbin-1; i >= 0; i--)    // longest bin first; cum/btot accumulate from the top down
+          { cum  += hist[i];
+            btot += bsum[i];
+            if (hist[i] != skip)
+              { Print_Number((int64) (i*BIN),11,stdout);
+                printf(":");
+                Print_Number((int64) hist[i],11,stdout);
+                printf(" %5.1f %5.1f %9lld\n",(100.*cum)/nreads,
+                       (100.*btot)/totlen,(cum > 0 ? btot/cum : 0));
+              }
+            if (cum == nreads) break;
+          }
+      }
+  }
+
+  { int64       totlen;
+    int         numint, maxlen;
+    int64       ave, dev;
+    HITS_TRACK *track;
+
+    for (track = db->tracks; track != NULL; track = track->next)
+      { char  *data = track->data;
+        int64 *anno = (int64 *) track->anno;
+        int    k, rlen;
+        int   *idata, *edata;
+
+        totlen = 0;
+        numint = 0;
+        maxlen = 0;
+        for (k = 0; k < db->nreads; k++)
+          { edata = (int *) (data + anno[k+1]);
+            for (idata = (int *) (data + anno[k]); idata < edata; idata += 2)
+              { rlen = idata[1] - *idata;
+                numint += 1;
+                totlen += rlen;
+                if (rlen > maxlen)
+                  maxlen = rlen;
+              }
+          }
+
+        nbin = maxlen/BIN + 1;     // same sizing rule as for the reads; reuses hist[] and bsum[]
+
+        for (k = 0; k < nbin; k++)
+          { hist[k] = 0;
+            bsum[k] = 0;
+          }
+
+        ave = (numint > 0 ? totlen/numint : 0);   // a track may hold no intervals at all
+        dev = 0;
+        for (k = 0; k < db->nreads; k++)
+          { edata = (int *) (data + anno[k+1]);
+            for (idata = (int *) (data + anno[k]); idata < edata; idata += 2)
+              { rlen = idata[1] - *idata;
+                dev += (rlen-ave)*(rlen-ave);
+                hist[rlen/BIN] += 1;
+                bsum[rlen/BIN] += rlen;
+              }
+          }
+        dev = (numint > 0 ? (int64) sqrt((1.*dev)/numint) : 0);
+
+        printf("\n\nStatistics for %s-track\n",track->name);
+
+        printf("\n There are ");
+        Print_Number(numint,0,stdout);
+        printf(" intervals totaling ");
+        Print_Number(totlen,0,stdout);
+        printf(" bases (%.1f%% of all data)\n",(100.*totlen)/db->totlen);
+
+        { int64 btot;
+          int   cum;
+
+          printf("\n Distribution of %s intervals (Bin size = ",track->name);
+          Print_Number((int64) BIN,0,stdout);
+          printf(")\n\n Bin: Count %% Intervals %% Bases Average\n");
+          cum  = 0;
+          btot = 0;
+          for (k = nbin-1; k >= 0; k--)
+            { cum  += hist[k];
+              btot += bsum[k];
+              if (hist[k] > 0)
+                { Print_Number((int64) (k*BIN),11,stdout);
+                  printf(":");
+                  Print_Number((int64) hist[k],11,stdout);
+                  printf(" %5.1f %5.1f %9lld\n",(100.*cum)/numint,
+                         (100.*btot)/totlen,btot/cum);
+                  if (cum == numint) break;
+                }
+            }
+          printf("\n");
+        }
+      }
+  }
+
+  free(hist);
+  free(bsum);
+  Close_DB(db);
+
+  exit (0);
+}
diff --git a/DBupgrade.Dec.31.2014.c b/DBupgrade.Dec.31.2014.c
new file mode 100644
index 0000000..05513a4
--- /dev/null
+++ b/DBupgrade.Dec.31.2014.c
@@ -0,0 +1,115 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
+ * Interim code: upgrade previous db to have fpulse,rlen fields
+ *
+ * Author: Gene Myers
+ * Date : December 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "DB.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#else
+#define PATHSEP "/"
+#endif
+
+typedef struct // Read record as stored in the .idx being upgraded; main converts each to a HITS_READ
+ { int origin; // Well #
+ int beg; // First pulse (becomes fpulse in the new record)
+ int end; // Last pulse (the new record stores end-beg as rlen instead)
+ int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of
+ // uncompressed bases in memory block
+ int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file
+ int flags; // QV of read + flag bits (copied unchanged into the new record)
+ } HITS_OLD;
+
+int main(int argc, char *argv[])
+{ HITS_DB    db;
+  FILE      *nxfile, *ixfile;
+  char      *pwd, *root;
+  int        i;
+  //  Write an upgraded copy of <root>.idx to <root>.ndx; the .idx itself is only read.
+  if (argc != 2)
+    { fprintf(stderr,"Usage: %s <path:db>\n",argv[0]);
+      exit (1);
+    }
+
+  pwd    = PathTo(argv[1]);
+  root   = Root(argv[1],".db");
+  ixfile = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r");
+  nxfile = Fopen(Catenate(pwd,PATHSEP,root,".ndx"),"w");
+  if (ixfile == NULL || nxfile == NULL)
+    exit (1);
+  free(pwd);
+  free(root);
+
+  if (fread(&db,sizeof(HITS_DB),1,ixfile) != 1)
+    SYSTEM_ERROR
+  if (fwrite(&db,sizeof(HITS_DB),1,nxfile) != 1)     // the header is copied unchanged
+    SYSTEM_ERROR
+  for (i = 0; i < db.oreads; i++)
+    { HITS_OLD  orec;
+      HITS_READ nrec;
+
+      if (fread(&orec,sizeof(HITS_OLD),1,ixfile) != 1)
+        SYSTEM_ERROR
+
+      nrec.origin = orec.origin;
+      nrec.fpulse = orec.beg;
+      nrec.rlen   = orec.end-orec.beg;               // old (beg,end) pair becomes (fpulse,rlen)
+      nrec.boff   = orec.boff;
+      nrec.coff   = orec.coff;
+      nrec.flags  = orec.flags;
+      if (fwrite(&nrec,sizeof(HITS_READ),1,nxfile) != 1)
+        SYSTEM_ERROR
+    }
+
+  fclose(ixfile);
+  if (fclose(nxfile) != 0)         // buffered records reach the file here: a failure means a short .ndx
+    SYSTEM_ERROR
+  exit (0);
+}
diff --git a/DBupgrade.Sep.25.2014.c b/DBupgrade.Sep.25.2014.c
new file mode 100644
index 0000000..70bbe16
--- /dev/null
+++ b/DBupgrade.Sep.25.2014.c
@@ -0,0 +1,125 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
+ * Interim code: upgrade previous db to have int's for pulse positions.
+ *
+ * Author: Gene Myers
+ * Date : September 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "DB.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#else
+#define PATHSEP "/"
+#endif
+
+// Read record layout of the .idx file before this upgrade: pulse positions are 16-bit.
+
+typedef struct
+ { int origin; // Well #
+ uint16 beg; // First pulse
+ uint16 end; // Last pulse
+ int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of
+ // uncompressed bases in memory block
+ int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file
+ int flags; // QV of read + flags above
+ } HITS_OLD;
+
+// Read record layout written to the .ndx file: same fields as HITS_OLD, but with the pulse
+// positions widened to int.
+
+typedef struct
+ { int origin; // Well #
+ int beg; // First pulse
+ int end; // Last pulse
+ int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of
+ // uncompressed bases in memory block
+ int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file
+ int flags; // QV of read + flags above
+ } HITS_NEW;
+
+// Convert the read index of <path:db>: copy the HITS_DB header of the .idx file to a new .ndx
+//   file and rewrite each of its db.oreads HITS_OLD records (16-bit pulse positions) as a
+//   HITS_NEW record (int pulse positions).  Exits with 1 on a usage, open, or write failure;
+//   read failures are handled by SYSTEM_ERROR.
+
+int main(int argc, char *argv[])
+{ HITS_DB  db;
+  FILE    *nxfile, *ixfile;
+  char    *pwd, *root;
+  int      i;
+
+  if (argc != 2)
+    { fprintf(stderr,"Usage: %s <path:db>\n",argv[0]);
+      exit (1);
+    }
+
+  pwd    = PathTo(argv[1]);
+  root   = Root(argv[1],".db");
+  ixfile = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r");
+  nxfile = Fopen(Catenate(pwd,PATHSEP,root,".ndx"),"w");
+  if (ixfile == NULL || nxfile == NULL)
+    exit (1);
+  free(pwd);
+  free(root);
+
+  if (fread(&db,sizeof(HITS_DB),1,ixfile) != 1)
+    SYSTEM_ERROR
+  if (fwrite(&db,sizeof(HITS_DB),1,nxfile) != 1)
+    goto write_error;
+
+  for (i = 0; i < db.oreads; i++)
+    { HITS_OLD orec;
+      HITS_NEW nrec;
+
+      if (fread(&orec,sizeof(HITS_OLD),1,ixfile) != 1)
+        SYSTEM_ERROR
+
+      memset(&nrec,0,sizeof(HITS_NEW));   // so struct padding written below is deterministic
+      nrec.origin = orec.origin;
+      nrec.beg    = orec.beg;
+      nrec.end    = orec.end;
+      nrec.boff   = orec.boff;
+      nrec.coff   = orec.coff;
+      nrec.flags  = orec.flags;
+
+      if (fwrite(&nrec,sizeof(HITS_NEW),1,nxfile) != 1)
+        goto write_error;
+    }
+
+  fclose(ixfile);
+  if (fclose(nxfile) != 0)     // buffered records are flushed here, so a late failure shows up
+    goto write_error;
+
+  exit (0);
+
+write_error:
+  fprintf(stderr,"%s: Could not write new index file\n",argv[0]);
+  exit (1);
+}
diff --git a/DUSTupgrade.Jan.1.2015.c b/DUSTupgrade.Jan.1.2015.c
new file mode 100644
index 0000000..c53a66f
--- /dev/null
+++ b/DUSTupgrade.Jan.1.2015.c
@@ -0,0 +1,117 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
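+ * Interim code: upgrade a previous .dust track so that its .anno offsets are 8-byte (int64)
+ *   values, and add one to the second value of every pair in its .data file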
+ *
+ * Author: Gene Myers
+ * Date : December 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "DB.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#else
+#define PATHSEP "/"
+#endif
+
+// Upgrade the dust track of <path:db>, reading <root>.dust.anno / <root>.dust.data and writing
+//   <root>.next.anno / <root>.next.data:
+//     - the .anno header keeps its track length but its record size is set to 8, and each of
+//       the tracklen+1 int offsets is rewritten as an int64;
+//     - the .data ints are copied in pairs with the second value of every pair incremented.
+//   Exits with 1 on a usage, open, or write failure; read failures are handled by SYSTEM_ERROR.
+
+int main(int argc, char *argv[])
+{ FILE  *afile, *dfile;
+  FILE  *nafile, *ndfile;
+  char  *pwd, *root;
+  int    size, tracklen;
+  int    i, vint, dint;
+  int64  vlong;
+
+  if (argc != 2)
+    { fprintf(stderr,"Usage: %s <path:db>\n",argv[0]);
+      exit (1);
+    }
+
+  pwd    = PathTo(argv[1]);
+  root   = Root(argv[1],".db");
+  afile  = Fopen(Catenate(pwd,PATHSEP,root,".dust.anno"),"r");
+  dfile  = Fopen(Catenate(pwd,PATHSEP,root,".dust.data"),"r");
+  nafile = Fopen(Catenate(pwd,PATHSEP,root,".next.anno"),"w");
+  ndfile = Fopen(Catenate(pwd,PATHSEP,root,".next.data"),"w");
+  if (afile == NULL || dfile == NULL || nafile == NULL || ndfile == NULL)
+    exit (1);
+  free(pwd);
+  free(root);
+
+  if (fread(&tracklen,sizeof(int),1,afile) != 1)
+    SYSTEM_ERROR
+  if (fwrite(&tracklen,sizeof(int),1,nafile) != 1)
+    goto write_error;
+
+  if (fread(&size,sizeof(int),1,afile) != 1)     // old record size is read only to skip it
+    SYSTEM_ERROR
+  size = 8;
+  if (fwrite(&size,sizeof(int),1,nafile) != 1)
+    goto write_error;
+
+  vint = 0;                     // stays 0 (no data to copy) if the offset loop never runs
+  for (i = 0; i <= tracklen; i++)
+    { if (fread(&vint,sizeof(int),1,afile) != 1)
+        SYSTEM_ERROR
+      vlong = vint;
+      if (fwrite(&vlong,sizeof(int64),1,nafile) != 1)
+        goto write_error;
+    }
+
+  vint >>= 2;                   // last offset / 4 is used as the number of ints in .data
+  for (i = 0; i < vint; i += 2)
+    { if (fread(&dint,sizeof(int),1,dfile) != 1)
+        SYSTEM_ERROR
+      if (fwrite(&dint,sizeof(int),1,ndfile) != 1)
+        goto write_error;
+      if (fread(&dint,sizeof(int),1,dfile) != 1)
+        SYSTEM_ERROR
+      dint += 1;
+      if (fwrite(&dint,sizeof(int),1,ndfile) != 1)
+        goto write_error;
+    }
+
+  if (fclose(nafile) != 0)      // buffered output is flushed here, so a late failure shows up
+    goto write_error;
+  if (fclose(ndfile) != 0)
+    goto write_error;
+  fclose(afile);
+  fclose(dfile);
+
+  exit (0);
+
+write_error:
+  fprintf(stderr,"%s: Could not write upgraded track files\n",argv[0]);
+  exit (1);
+}
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..d42eeae
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,67 @@
+# Compiler flags shared by every rule below.
+CFLAGS = -O3 -Wall -Wextra -fno-strict-aliasing
+
+# Programs built by the default target.  The interim upgrade tools (DBupgrade.*, DUSTupgrade.*)
+# have their own rules further down but are not part of this list.
+ALL = fasta2DB DB2fasta quiva2DB DB2quiva DBsplit DBdust Catrack DBshow DBstats DBrm simulator \
+ fasta2DAM DAM2fasta
+
+all: $(ALL)
+
+# Every program is a single source file compiled together with DB.c and QV.c.
+fasta2DB: fasta2DB.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o fasta2DB fasta2DB.c DB.c QV.c -lm
+
+DB2fasta: DB2fasta.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DB2fasta DB2fasta.c DB.c QV.c -lm
+
+quiva2DB: quiva2DB.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o quiva2DB quiva2DB.c DB.c QV.c -lm
+
+DB2quiva: DB2quiva.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DB2quiva DB2quiva.c DB.c QV.c -lm
+
+DBsplit: DBsplit.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DBsplit DBsplit.c DB.c QV.c -lm
+
+DBdust: DBdust.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DBdust DBdust.c DB.c QV.c -lm
+
+Catrack: Catrack.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o Catrack Catrack.c DB.c QV.c -lm
+
+DBshow: DBshow.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DBshow DBshow.c DB.c QV.c -lm
+
+DBstats: DBstats.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DBstats DBstats.c DB.c QV.c -lm
+
+DBrm: DBrm.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DBrm DBrm.c DB.c QV.c -lm
+
+simulator: simulator.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o simulator simulator.c DB.c QV.c -lm
+
+fasta2DAM: fasta2DAM.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o fasta2DAM fasta2DAM.c DB.c QV.c -lm
+
+DAM2fasta: DAM2fasta.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DAM2fasta DAM2fasta.c DB.c QV.c -lm
+
+# Interim upgrade tools: built only when named explicitly on the make command line.
+DBupgrade.Sep.25.2014: DBupgrade.Sep.25.2014.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DBupgrade.Sep.25.2014 DBupgrade.Sep.25.2014.c DB.c QV.c -lm
+
+DBupgrade.Dec.31.2014: DBupgrade.Dec.31.2014.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DBupgrade.Dec.31.2014 DBupgrade.Dec.31.2014.c DB.c QV.c -lm
+
+DUSTupgrade.Jan.1.2015: DUSTupgrade.Jan.1.2015.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DUSTupgrade.Jan.1.2015 DUSTupgrade.Jan.1.2015.c DB.c QV.c -lm
+
+# Remove the built programs, *.dSYM directories, the upgrade tools, and the package tarball.
+clean:
+ rm -f $(ALL)
+ rm -fr *.dSYM
+ rm -f DBupgrade.Sep.25.2014 DBupgrade.Dec.31.2014 DUSTupgrade.Jan.1.2015
+ rm -f dazz.db.tar.gz
+
+# Copy the programs of $(ALL) into ~/bin.
+install:
+ cp $(ALL) ~/bin
+
+# Clean, then bundle README, Makefile and all .h/.c files into dazz.db.tar.gz.
+package:
+ make clean
+ tar -zcf dazz.db.tar.gz README Makefile *.h *.c
diff --git a/QV.c b/QV.c
new file mode 100644
index 0000000..38f6db4
--- /dev/null
+++ b/QV.c
@@ -0,0 +1,1406 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
+ * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on
+ * the histogram of values occurring in a given file. The two low complexity streams
+ * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevalent
+ * character.
+ *
+ * Author: Gene Myers
+ * Date: Jan 18, 2014
+ * Modified: July 25, 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <unistd.h>
+
+#include "DB.h"
+
+#undef DEBUG
+
+#define MIN_BUFFER 1000
+
+#define HUFF_CUTOFF 16 // This cannot be larger than 16 !
+
+
+/*******************************************************************************************
+ *
+ * Endian flipping routines
+ *
+ ********************************************************************************************/
+
+static int LittleEndian; // Little-endian machine ?
+ // Referred by: Decode & Decode_Run
+static int Flip; // Flip endian of all coded shorts and ints
+ // Referred by: Decode & Decode_Run & Read_Scheme
+
+// Remember whether coded shorts/ints are to be byte-flipped, and probe the byte order of
+// this machine by looking at the first byte of a known 32-bit value.
+
+static void Set_Endian(int flip)
+{ uint32 probe = 3;
+
+  LittleEndian = (*((uint8 *) (&probe)) == 3);
+  Flip         = flip;
+}
+
+// Reverse the order of the 4 bytes at w in place.
+
+static void Flip_Long(void *w)
+{ uint8 *b = (uint8 *) w;
+  uint8  t;
+  int    lo, hi;
+
+  for (lo = 0, hi = 3; lo < hi; lo++, hi--)
+    { t     = b[lo];
+      b[lo] = b[hi];
+      b[hi] = t;
+    }
+}
+
+// Swap the 2 bytes at w in place.
+
+static void Flip_Short(void *w)
+{ uint8 *b     = (uint8 *) w;
+  uint8  first = b[0];
+
+  b[0] = b[1];
+  b[1] = first;
+}
+
+
+/*******************************************************************************************
+ *
+ * Routines for computing a Huffman Encoding Scheme
+ *
+ ********************************************************************************************/
+
+// A Huffman coding scheme over byte symbols: per-symbol code bits and lengths, plus a
+// table indexed by the next 16 input bits that Read_Scheme fills in for the decoders.
+
+typedef struct
+ { int type; // 0 => normal, 1 => normal but has long codes, 2 => truncated
+ uint32 codebits[256]; // If type = 2, then code 255 is the special code for
+ int codelens[256]; // non-Huffman exceptions
+ int lookup[0x10000]; // Lookup table (just for decoding)
+ } HScheme;
+
+// Huffman tree node.  A leaf has rgt == NULL and carries its symbol value in lft, cast to a
+// pointer (see Build_Table); an interior node points at its two subtrees.
+
+typedef struct _HTree
+ { struct _HTree *lft, *rgt;
+ uint64 count;
+ } HTree;
+
+ // Establish heap property from node s down (1 is root, children of n are 2n and 2n+1)
+ // assuming s is the only perturbation in the tree.
+
+  // Sift heap[s] down the 1-indexed min-heap heap[1..hsize] (ordered on count) until neither
+  //   child has a smaller count.  When both children have equal counts the right one is
+  //   preferred.  heap[r] is only examined when r lies inside the heap, so the slot just past
+  //   the heap is never read.
+
+static void Reheap(int s, HTree **heap, int hsize)
+{ int    c, l, r, m;
+  HTree *hs;
+
+  c  = s;
+  hs = heap[s];
+  while ((l = 2*c) <= hsize)
+    { r = l+1;
+      m = l;                         // m = child with the smaller count
+      if (r <= hsize && heap[r]->count <= heap[l]->count)
+        m = r;
+      if (hs->count <= heap[m]->count)
+        break;
+      heap[c] = heap[m];             // pull the smaller child up, keep descending
+      c = m;
+    }
+  if (c != s)
+    heap[c] = hs;
+}
+
+ // Given Huffman tree build a table of codes from it, the low-order codelens[s] bits
+ // of codebits[s] contain the code for symbol s.
+
+  // Walk the tree below node, extending code by a 0 bit for a left branch and a 1 bit for a
+  //   right branch.  At a leaf (rgt == NULL) record the len-bit code of the symbol held in lft.
+
+static void Build_Table(HTree *node, int code, int len, uint32 *codebits, int *codelens)
+{ uint64 sym;
+
+  if (node->rgt != NULL)
+    { Build_Table(node->lft,code << 1,len+1,codebits,codelens);
+      Build_Table(node->rgt,(code << 1) + 1,len+1,codebits,codelens);
+      return;
+    }
+
+  sym = (uint64) (node->lft);
+  codebits[sym] = code;
+  codelens[sym] = len;
+}
+
+ // For the non-zero symbols in hist, compute a huffman tree over them, and then
+ // build a table of the codes. If inscheme is not NULL, then place all symbols
+ // with code 255 or with more than HUFF_CUTOFF bits in the encoding by inscheme
+ // as a single united entity, whose code signals that the value of these symbols
+ // occur explicitly in 8 (values) or 16 (run lengths) bits following the code.
+ // All the symbols in this class will have the same entry in the code table and
+ // 255 is always in this class.
+
+ // Returns a newly allocated scheme, or NULL if the allocation fails. The lookup table of
+ // the result is not filled in here.
+ // NOTE(review): if no entry of hist is positive and inscheme is NULL then range stays 0 and
+ // node+(range-1) lies before the array -- presumably callers never pass an all-zero
+ // histogram; confirm.
+
+static HScheme *Huffman(uint64 *hist, HScheme *inscheme)
+{ HScheme *scheme;
+ HTree *heap[259];
+ HTree node[512];
+ int hsize;
+ HTree *lft, *rgt;
+ int value, range;
+ int i;
+
+ scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record");
+ if (scheme == NULL)
+ return (NULL);
+
+ hsize = 0; // Load heap
+ value = 0;
+ if (inscheme != NULL) // node[0] is the united exception class, labeled 255
+ { node[0].count = 0;
+ node[0].lft = (HTree *) (uint64) 255;
+ node[0].rgt = NULL;
+ heap[++hsize] = node+(value++);
+ }
+ for (i = 0; i < 256; i++)
+ if (hist[i] > 0)
+ { if (inscheme != NULL && (inscheme->codelens[i] > HUFF_CUTOFF || i == 255))
+ node[0].count += hist[i]; // symbol is folded into the exception class
+ else
+ { node[value].count = hist[i];
+ node[value].lft = (HTree *) (uint64) i;
+ node[value].rgt = NULL;
+ heap[++hsize] = node+(value++);
+ }
+ }
+
+ for (i = hsize/2; i >= 1; i--) // Establish heap property
+ Reheap(i,heap,hsize);
+
+ range = value; // Merge pairs with smallest count until have a tree
+ for (i = 1; i < value; i++)
+ { lft = heap[1];
+ heap[1] = heap[hsize--];
+ Reheap(1,heap,hsize);
+ rgt = heap[1];
+ node[range].lft = lft;
+ node[range].rgt = rgt;
+ node[range].count = lft->count + rgt->count;
+ heap[1] = node+(range++); // the merged node replaces rgt at the root
+ Reheap(1,heap,hsize);
+ }
+
+ for (i = 0; i < 256; i++) // Build the code table
+ { scheme->codebits[i] = 0;
+ scheme->codelens[i] = 0;
+ }
+
+ Build_Table(node+(range-1),0,0,scheme->codebits,scheme->codelens); // last node made is the root
+
+ if (inscheme != NULL) // Set scheme type and if truncated (2), map truncated codes
+ { scheme->type = 2; // to code and length for 255
+ for (i = 0; i < 255; i++)
+ if (inscheme->codelens[i] > HUFF_CUTOFF || scheme->codelens[i] > HUFF_CUTOFF)
+ { scheme->codelens[i] = scheme->codelens[255];
+ scheme->codebits[i] = scheme->codebits[255];
+ }
+ }
+ else
+ { scheme->type = 0; // type 1 as soon as any code exceeds HUFF_CUTOFF bits
+ for (i = 0; i < 256; i++)
+ { if (scheme->codelens[i] > HUFF_CUTOFF)
+ scheme->type = 1;
+ }
+ }
+
+ return (scheme);
+}
+
+#ifdef DEBUG
+
+ // For debug, show the coding table
+
+  // Print every symbol of scheme that has a code: its value, code length, and code bits
+  //   (most significant first).  Symbols sharing the code of 255 in a truncated (type 2)
+  //   scheme are marked with "***".  If hist is not NULL also print the total number of bytes
+  //   the histogrammed data would occupy, charging infosize extra bits per marked symbol.
+
+static void Print_Table(HScheme *scheme, uint64 *hist, int infosize)
+{ uint64  total_bits;
+  uint32  specval, code, *bits;
+  int     speclen, clen, *lens;
+  int     i, k;
+
+  total_bits = 0;
+  bits = scheme->codebits;
+  lens = scheme->codelens;
+  if (scheme->type == 2)
+    { specval = bits[255];
+      speclen = lens[255];
+    }
+  else
+    specval = speclen = 0x7fffffff;
+
+  printf("\nCode Table:\n");
+  for (i = 0; i < 256; i++)
+    if (lens[i] > 0)
+      { clen = lens[i];
+        code = bits[i];
+        printf(" %3d: %2d ",i,clen);
+        for (k = clen-1; k >= 0; k--)       // test bit k directly; avoids forming 1 << clen
+          { if ((code >> k) & 1)
+              printf("1");
+            else
+              printf("0");
+          }
+        if (code == specval && clen == speclen)
+          { printf(" ***");
+            if (hist != NULL)
+              total_bits += (clen+infosize)*hist[i];
+          }
+        else if (hist != NULL)
+          total_bits += clen*hist[i];
+        printf("\n");
+      }
+  if (hist != NULL)                         // round up to bytes; 0 bits => 0 bytes
+    printf("\nTotal Bytes = %llu\n",(total_bits+7)/8);
+}
+
+ // For debug, show the histogram
+
+  // Print the counts of hist from its highest to its lowest non-zero entry, each with its
+  //   percentage of the total count over that range.
+
+static void Print_Histogram(uint64 *hist)
+{ int    low, hgh, i;
+  uint64 total;
+
+  hgh = 255;
+  while (hgh >= 0 && hist[hgh] == 0)
+    hgh -= 1;
+  low = 0;
+  while (low < 256 && hist[low] == 0)
+    low += 1;
+
+  total = 0;
+  for (i = low; i <= hgh; i++)
+    total += hist[i];
+
+  for (i = hgh; i >= low; i--)
+    printf(" %3d: %8llu %5.1f%%\n",i,hist[i],(hist[i]*100.)/total);
+}
+
+#endif
+
+
+/*******************************************************************************************
+ *
+ * Read and Write Huffman Schemes
+ *
+ ********************************************************************************************/
+
+ // Write the code table to out.
+
+  // Serialize scheme to out: one byte for the type, then for each of the 256 symbols one byte
+  //   for its code length followed, only when that byte is non-zero, by its uint32 code bits.
+
+static void Write_Scheme(HScheme *scheme, FILE *out)
+{ uint8 byte;
+  int   sym;
+
+  byte = (uint8) (scheme->type);
+  fwrite(&byte,1,1,out);
+
+  for (sym = 0; sym < 256; sym++)
+    { byte = (uint8) (scheme->codelens[sym]);
+      fwrite(&byte,1,1,out);
+      if (byte > 0)
+        fwrite(scheme->codebits+sym,sizeof(uint32),1,out);
+    }
+}
+
+ // Allocate and read a code table from in, and return a pointer to it.
+
+  // Allocate a scheme and read it from in, in the layout produced by Write_Scheme: a type
+  //   byte, then per symbol a length byte and (if non-zero) the uint32 code bits, which are
+  //   byte-flipped when Flip is set.  The lookup table is then filled so that every 16-bit
+  //   value starting with the code of symbol i maps to i.  Returns NULL, after reporting the
+  //   problem and releasing the scheme, if allocation or any read fails.
+
+static HScheme *Read_Scheme(FILE *in)
+{ HScheme *scheme;
+  int     *look, *lens;
+  uint32  *bits, base;
+  int      i, j, powr;
+  uint8    x;
+
+  scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record");
+  if (scheme == NULL)
+    return (NULL);
+
+  lens = scheme->codelens;
+  bits = scheme->codebits;
+  look = scheme->lookup;
+
+  if (fread(&x,1,1,in) != 1)
+    { EPRINTF(EPLACE,"Could not read scheme type byte (Read_Scheme)\n");
+      free(scheme);
+      return (NULL);
+    }
+  scheme->type = x;
+  for (i = 0; i < 256; i++)
+    { if (fread(&x,1,1,in) != 1)
+        { EPRINTF(EPLACE,"Could not read length of %d'th code (Read_Scheme)\n",i);
+          free(scheme);       // this path previously leaked the scheme record
+          return (NULL);
+        }
+      lens[i] = x;
+      if (x > 0)
+        { if (fread(bits+i,sizeof(uint32),1,in) != 1)
+            { EPRINTF(EPLACE,"Could not read bit encoding of %d'th code (Read_Scheme)\n",i);
+              free(scheme);
+              return (NULL);
+            }
+        }
+      else
+        bits[i] = 0;
+    }
+
+  if (Flip)
+    { for (i = 0; i < 256; i++)
+        Flip_Long(bits+i);
+    }
+
+  // NOTE(review): a code longer than 16 bits makes the shift counts below negative;
+  //   presumably schemes with such codes are never read back -- confirm.
+
+  for (i = 0; i < 256; i++)
+    { if (lens[i] > 0)
+        { base = (bits[i] << (16-lens[i]));
+          powr = (1 << (16-lens[i]));
+          for (j = 0; j < powr; j++)
+            look[base+j] = i;
+        }
+    }
+
+  return (scheme);
+}
+
+
+/*******************************************************************************************
+ *
+ * Encoders and Decoders
+ *
+ ********************************************************************************************/
+
+ // Encode read[0..rlen-1] according to scheme and write to out
+
+ // Output is accumulated in the 32-bit word ocode, filled from its high-order end; olen is
+ // the number of bits currently held and each full word is written with fwrite. For a
+ // type 2 scheme, a symbol whose code equals that of 255 is followed by its 8-bit value.
+
+static void Encode(HScheme *scheme, FILE *out, uint8 *read, int rlen)
+{ uint32 x, c, ocode;
+ int n, k, olen, llen;
+ int *nlens;
+ uint32 *nbits;
+ uint32 nspec;
+ int nslen;
+
+ nlens = scheme->codelens;
+ nbits = scheme->codebits;
+
+ if (scheme->type == 2)
+ { nspec = nbits[255];
+ nslen = nlens[255];
+ }
+ else
+ nspec = nslen = 0x7fffffff; // a value no (code,length) pair of the table is expected to match
+
+ // OCODE(L,C): append the low L bits of C to the output word, writing the word out whenever
+ // it fills. llen is left holding the value of olen before the append. Also used by
+ // Encode_Run. NOTE(review): with L == 0 and olen == 0 the else-branch shifts by 32 --
+ // presumably zero-length codes never reach here; confirm.
+
+#define OCODE(L,C) \
+{ int len = olen + (L); \
+ uint32 code = (C); \
+ \
+ llen = olen; \
+ if (len >= 32) \
+ { olen = len-32; \
+ ocode |= (code >> olen); \
+ fwrite(&ocode,sizeof(uint32),1,out); \
+ if (olen > 0) \
+ ocode = (code << (32-olen)); \
+ else \
+ ocode = 0; \
+ } \
+ else \
+ { olen = len; \
+ ocode |= (code << (32-olen));; \
+ } \
+}
+
+ llen = 0;
+ olen = 0;
+ ocode = 0;
+ for (k = 0; k < rlen; k++)
+ { x = read[k];
+ n = nlens[x];
+ c = nbits[x];
+ OCODE(n,c);
+ if (c == nspec && n == nslen)
+ OCODE(8,x);
+ }
+
+ if (olen > 0) // Tricky: must pad so decoder does not read past
+ { fwrite(&ocode,sizeof(uint32),1,out); // last integer in the coded output.
+ if (llen > 16 && olen > llen)
+ fwrite(&ocode,sizeof(uint32),1,out);
+ }
+ else if (llen > 16)
+ fwrite(&ocode,sizeof(uint32),1,out);
+}
+
+ // Encode read[0..rlen-1] according to non-rchar table neme, and run-length table reme for
+ // runs of rchar characters. Write to out.
+
+ // The output alternates: a code for the length of a (possibly empty) run of rchar, then,
+ // unless the read is exhausted, a code for the non-rchar symbol that ended the run. Relies
+ // on the OCODE macro defined inside Encode above.
+
+static void Encode_Run(HScheme *neme, HScheme *reme, FILE *out, uint8 *read, int rlen, int rchar)
+{ uint32 x, c, ocode;
+ int n, h, k, olen, llen;
+ int *nlens, *rlens;
+ uint32 *nbits, *rbits;
+ uint32 nspec, rspec;
+ int nslen, rslen;
+
+ nlens = neme->codelens;
+ nbits = neme->codebits;
+ rlens = reme->codelens;
+ rbits = reme->codebits;
+
+ if (neme->type == 2)
+ { nspec = nbits[255];
+ nslen = nlens[255];
+ }
+ else
+ nspec = nslen = 0x7fffffff;
+
+ rspec = rbits[255];
+ rslen = rlens[255];
+
+ llen = 0;
+ olen = 0;
+ ocode = 0;
+ k = 0;
+ while (k < rlen)
+ { h = k; // h = start of the run, k advances to its end
+ while (k < rlen && read[k] == rchar)
+ k += 1;
+ if (k-h >= 255) // lengths of 255 or more all use the code of 255 ...
+ x = 255;
+ else
+ x = k-h;
+ n = rlens[x];
+ c = rbits[x];
+ OCODE(n,c);
+ if (c == rspec && n == rslen) // ... followed by the length itself in 16 bits
+ OCODE(16,k-h);
+ if (k < rlen)
+ { x = read[k];
+ n = nlens[x];
+ c = nbits[x];
+ OCODE(n,c);
+ if (c == nspec && n == nslen)
+ OCODE(8,x);
+ k += 1;
+ }
+ }
+
+ if (olen > 0) // same end-of-stream padding as in Encode
+ { fwrite(&ocode,sizeof(uint32),1,out);
+ if (llen > 16 && olen > llen)
+ fwrite(&ocode,sizeof(uint32),1,out);
+ }
+ else if (llen > 16)
+ fwrite(&ocode,sizeof(uint32),1,out);
+}
+
+ // Read and decode from in, the next rlen symbols into read according to scheme
+
+ // Returns 0 on success and 1 if the input runs out of words. icode is used as a 64-bit
+ // shift register: ipart addresses its low 32 bits, where each new word is read in, xpart
+ // the 16 bits directly above them, and cpart the upper 8 of those 16 bits. The offsets
+ // differ by byte order, hence the LittleEndian test.
+
+static int Decode(HScheme *scheme, FILE *in, char *read, int rlen)
+{ int *look, *lens;
+ int signal, ilen;
+ uint64 icode;
+ uint32 *ipart;
+ uint16 *xpart;
+ uint8 *cpart;
+ int j, n, c;
+
+ if (LittleEndian)
+ { ipart = ((uint32 *) (&icode));
+ xpart = ((uint16 *) (&icode)) + 2;
+ cpart = ((uint8 *) (&icode)) + 5;
+ }
+ else
+ { ipart = ((uint32 *) (&icode)) + 1;
+ xpart = ((uint16 *) (&icode)) + 1;
+ cpart = ((uint8 *) (&icode)) + 2;
+ }
+
+ if (scheme->type == 2)
+ signal = 255;
+ else
+ signal = 256; // 256 is never a symbol, so no escape is taken
+ lens = scheme->codelens;
+ look = scheme->lookup;
+
+ // GET: shift icode left by n bits in total, reading the next 32-bit word into its low half
+ // when fewer than n unconsumed bits (ilen) remain there. Returns 1 from the enclosing
+ // function if the word cannot be read. Also used by Decode_Run.
+
+#define GET \
+ if (n > ilen) \
+ { icode <<= ilen; \
+ if (fread(ipart,sizeof(uint32),1,in) != 1) \
+ { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \
+ return (1); \
+ } \
+ ilen = n-ilen; \
+ icode <<= ilen; \
+ ilen = 32-ilen; \
+ } \
+ else \
+ { icode <<= n; \
+ ilen -= n; \
+ }
+
+ // GETFLIP: as GET, but each word read is byte-flipped with Flip_Long.
+
+#define GETFLIP \
+ if (n > ilen) \
+ { icode <<= ilen; \
+ if (fread(ipart,sizeof(uint32),1,in) != 1) \
+ { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \
+ return (1); \
+ } \
+ Flip_Long(ipart); \
+ ilen = n-ilen; \
+ icode <<= ilen; \
+ ilen = 32-ilen; \
+ } \
+ else \
+ { icode <<= n; \
+ ilen -= n; \
+ }
+
+ n = 16; // the first GET thereby brings 16 fresh bits into *xpart
+ ilen = 0;
+ icode = 0;
+ if (Flip)
+ for (j = 0; j < rlen; j++)
+ { GETFLIP
+ c = look[*xpart];
+ n = lens[c]; // bits to discard before the next symbol
+ if (c == signal)
+ { GETFLIP
+ c = *cpart; // escaped symbol: its value follows in 8 bits
+ n = 8;
+ }
+ read[j] = (char) c;
+ }
+ else
+ for (j = 0; j < rlen; j++)
+ { GET
+ c = look[*xpart];
+ n = lens[c];
+ if (c == signal)
+ { GET
+ c = *cpart;
+ n = 8;
+ }
+ read[j] = (char) c;
+ }
+
+ return (0);
+}
+
+ // Read and decode from in, the next rlen symbols into read according to non-rchar scheme
+ // neme, and the rchar runlength scheme reme
+
+  // Decode rlen symbols from in into read.  The input alternates a run-length code (scheme
+  //   reme; the code of 255 is followed by the length in 16 bits) for a run of rchar with a
+  //   code (scheme neme) for the symbol that ended the run.  Uses the GET/GETFLIP macros
+  //   defined inside Decode above.  Returns 0 on success and 1 if the input runs out or a
+  //   decoded run would extend past read[rlen-1], which only a corrupt stream can cause.
+
+static int Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read,
+                      int rlen, int rchar)
+{ int    *nlook, *nlens;
+  int    *rlook, *rlens;
+  int     nsignal, ilen;
+  uint64  icode;
+  uint32 *ipart;
+  uint16 *xpart;
+  uint8  *cpart;
+  int     j, n, c, k;
+
+  if (LittleEndian)
+    { ipart = ((uint32 *) (&icode));
+      xpart = ((uint16 *) (&icode)) + 2;
+      cpart = ((uint8 *) (&icode)) + 5;
+    }
+  else
+    { ipart = ((uint32 *) (&icode)) + 1;
+      xpart = ((uint16 *) (&icode)) + 1;
+      cpart = ((uint8 *) (&icode)) + 2;
+    }
+
+  if (neme->type == 2)
+    nsignal = 255;
+  else
+    nsignal = 256;
+  nlens = neme->codelens;
+  nlook = neme->lookup;
+
+  rlens = reme->codelens;
+  rlook = reme->lookup;
+
+  n     = 16;
+  ilen  = 0;
+  icode = 0;
+  if (Flip)
+    for (j = 0; j < rlen; j++)
+      { GETFLIP
+        c = rlook[*xpart];
+        n = rlens[c];
+        if (c == 255)
+          { GETFLIP
+            c = *xpart;
+            n = 16;
+          }
+        if (c > rlen-j)         // never true for a well-formed stream
+          { EPRINTF(EPLACE,"Run length exceeds read length (Decode_Run)\n");
+            return (1);
+          }
+        for (k = 0; k < c; k++)
+          read[j++] = (char) rchar;
+
+        if (j < rlen)
+          { GETFLIP
+            c = nlook[*xpart];
+            n = nlens[c];
+            if (c == nsignal)
+              { GETFLIP
+                c = *cpart;
+                n = 8;
+              }
+            read[j] = (char) c;
+          }
+      }
+  else
+    for (j = 0; j < rlen; j++)
+      { GET
+        c = rlook[*xpart];
+        n = rlens[c];
+        if (c == 255)
+          { GET
+            c = *xpart;
+            n = 16;
+          }
+        if (c > rlen-j)         // never true for a well-formed stream
+          { EPRINTF(EPLACE,"Run length exceeds read length (Decode_Run)\n");
+            return (1);
+          }
+        for (k = 0; k < c; k++)
+          read[j++] = (char) rchar;
+
+        if (j < rlen)
+          { GET
+            c = nlook[*xpart];
+            n = nlens[c];
+            if (c == nsignal)
+              { GET
+                c = *cpart;
+                n = 8;
+              }
+            read[j] = (char) c;
+          }
+      }
+
+  return (0);
+}
+
+
+/*******************************************************************************************
+ *
+ * Histogrammers
+ *
+ ********************************************************************************************/
+
+// Histogram_Seqs: histogram the symbols of stream[0..rlen-1] into hist.
+// Histogram_Runs: histogram runlengths of symbol runChar in stream[0..rlen-1] into run.
+
+// Add one to hist[s] for every symbol s of stream[0..rlen-1].
+
+static void Histogram_Seqs(uint64 *hist, uint8 *stream, int rlen)
+{ while (rlen-- > 0)
+    hist[*stream++] += 1;
+}
+
+// Count in run[] the length of every (possibly empty) run of runChar that precedes a
+// non-runChar symbol or the end of stream[0..rlen-1]; lengths of 255 or more are all
+// counted in run[255].
+
+static void Histogram_Runs(uint64 *run, uint8 *stream, int rlen, int runChar)
+{ int pos, len;
+
+  for (pos = 0; pos < rlen; pos++)      // the increment steps over the symbol ending the run
+    { len = 0;
+      while (pos < rlen && stream[pos] == runChar)
+        { pos += 1;
+          len += 1;
+        }
+      run[len < 255 ? len : 255] += 1;
+    }
+}
+
+
+/*******************************************************************************************
+ *
+ * Reader
+ *
+ ********************************************************************************************/
+
+static char *Read = NULL; // Referred by: QVentry, Read_Lines, QVcoding_Scan,
+static int Rmax = -1; // Compress_Next_QVentry
+
+static int Nline; // Referred by: QVcoding_Scan
+
+// Return the line buffer Read, which is NULL until Read_Lines has allocated it.
+
+char *QVentry()
+{ return (Read); }
+
+// If nlines == 1 trying to read a single header, nlines = 5 trying to read 5 QV/fasta lines
+// for a sequence. Place line j at Read+j*Rmax and the length of every line is returned
+// unless eof occurs in which case return -1. If any error occurs return -2.
+
+// Read nlines lines from input into the buffer Read, line j starting at Read+j*Rmax.  The
+//   buffer (5*Rmax bytes) is allocated on first use and enlarged, by a factor of about 1.4
+//   plus MIN_BUFFER, while the first line does not fit; the remaining lines must then have
+//   exactly the length of the first.  Returns the line length without its newline, -1 if
+//   input is at end of file, and leaves through EXIT(-2) on any error.
+
+int Read_Lines(FILE *input, int nlines)
+{ int   i, rlen;
+  int   tmax;
+  char *tread;
+  char *other;
+
+  if (Read == NULL)
+    { tmax  = MIN_BUFFER;
+      tread = (char *) Malloc(5*tmax,"Allocating QV entry read buffer");
+      if (tread == NULL)
+        EXIT(-2);
+      Rmax = tmax;
+      Read = tread;
+    }
+
+  Nline += 1;
+  if (fgets(Read,Rmax,input) == NULL)
+    return (-1);
+
+  rlen = strlen(Read);
+  while (rlen == 0 || Read[rlen-1] != '\n')     // rlen == 0 test keeps Read[-1] from being read
+    { tmax  = ((int) (1.4*Rmax)) + MIN_BUFFER;  // cast the product: (int) 1.4*Rmax is 1*Rmax
+      tread = (char *) Realloc(Read,5*tmax,"Reallocating QV entry read buffer");
+      if (tread == NULL)
+        EXIT(-2);
+      Rmax = tmax;
+      Read = tread;
+      if (fgets(Read+rlen,Rmax-rlen,input) == NULL)
+        { EPRINTF(EPLACE,"Line %d: Last line does not end with a newline !\n",Nline);
+          EXIT(-2);
+        }
+      rlen += strlen(Read+rlen);
+    }
+  other = Read;
+  for (i = 1; i < nlines; i++)
+    { other += Rmax;
+      Nline += 1;
+      if (fgets(other,Rmax,input) == NULL)
+        { EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline);
+          EXIT(-2);
+        }
+      if (rlen != (int) strlen(other))
+        { EPRINTF(EPLACE,"Line %d: Lines for an entry are not the same length\n",Nline);
+          EXIT(-2);
+        }
+    }
+  return (rlen-1);
+}
+
+
+/*******************************************************************************************
+ *
+ * Tag compression and decompression routines
+ *
+ ********************************************************************************************/
+
+// Keep only the symbols in tags[0..rlen-1] for which qvs[k] != rchar and
+// return the # of symbols kept.
+
+// Compact tags in place so that only the symbols at positions k with qvs[k] != rchar
+// remain, terminate the result with a 0 byte, and return how many symbols were kept.
+
+static int Pack_Tag(char *tags, char *qvs, int rlen, int rchar)
+{ char *out = tags;
+  int   i;
+
+  for (i = 0; i < rlen; i++)
+    { if (qvs[i] == rchar)
+        continue;
+      *out++ = tags[i];
+    }
+  *out = '\0';
+  return ((int) (out-tags));
+}
+
+ // Count the # of non-rchar symbols in qvs[0..rlen-1]
+
+static int Packed_Length(char *qvs, int rlen, int rchar)
+{ int kept = 0;
+
+  while (rlen-- > 0)
+    { if (*qvs++ != rchar)
+        kept += 1;
+    }
+  return (kept);
+}
+
+ // Unpack tags by moving its i'th char to position k where qvs[k] is the i'th non-rchar
+ // symbol in qvs. All other chars are set to 'n'. rlen is the length of qvs and
+ // the unpacked result, clen is the initial length of tags.
+
+static void Unpack_Tag(char *tags, int clen, char *qvs, int rlen, int rchar)
+{ int src = clen;     // one past the next packed symbol to place
+  int dst = rlen;
+
+  // Work from the back so that packed symbols are moved before they are overwritten
+
+  while (dst-- > 0)
+    { if (qvs[dst] == rchar)
+        tags[dst] = 'n';
+      else
+        tags[dst] = tags[--src];
+    }
+}
+
+
+/*******************************************************************************************
+ *
+ * Statistics Scan and Scheme creation and write
+ *
+ ********************************************************************************************/
+
+ // Read .quiva file from input, recording stats in the histograms. The histograms are
+ // reset at the start of every call, so the stats always start anew with this file.
+
+static uint64 delHist[256], insHist[256], mrgHist[256], subHist[256], delRun[256], subRun[256];
+static uint64 totChar;
+static int delChar, subChar;
+
+ // Referred by: QVcoding_Scan, Create_QVcoding
+
+ // Returns 0 once end of input is reached. On a malformed entry a message is issued with
+ // EPRINTF and the routine leaves through EXIT(1).
+
+int QVcoding_Scan(FILE *input)
+{ char *slash;
+ int rlen;
+
+ // Zero histograms
+
+ bzero(delHist,sizeof(uint64)*256);
+ bzero(mrgHist,sizeof(uint64)*256);
+ bzero(insHist,sizeof(uint64)*256);
+ bzero(subHist,sizeof(uint64)*256);
+
+ // The run-length histograms start at 1 rather than 0 -- presumably so that every run
+ // length gets a code from Huffman, which skips zero counts; confirm.
+
+ { int i;
+
+ for (i = 0; i < 256; i++)
+ delRun[i] = subRun[i] = 1;
+ }
+
+ totChar = 0;
+ delChar = -1;
+ subChar = -1;
+
+ // Make a sweep through the .quiva entries, histogramming the relevant things
+ // and figuring out the run chars for the deletion and substitution streams
+
+ Nline = 0;
+ while (1)
+ { int well, beg, end, qv;
+
+ rlen = Read_Lines(input,1); // header line of the next entry
+ if (rlen == -2)
+ EXIT(1);
+ if (rlen < 0)
+ break;
+
+ if (rlen == 0 || Read[0] != '@')
+ { EPRINTF(EPLACE,"Line %d: Header in quiv file is missing\n",Nline);
+ EXIT(1);
+ }
+ slash = index(Read+1,'/');
+ if (slash == NULL)
+ { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n",
+ Prog_Name,Nline);
+ EXIT(1);
+ }
+ if (sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) != 4)
+ { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n",
+ Prog_Name,Nline);
+ EXIT(1);
+ }
+
+ rlen = Read_Lines(input,5); // the 5 stream lines, line j at Read+j*Rmax
+ if (rlen < 0)
+ { if (rlen == -1)
+ EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline);
+ EXIT(1);
+ }
+
+ Histogram_Seqs(delHist,(uint8 *) (Read),rlen);
+ Histogram_Seqs(insHist,(uint8 *) (Read+2*Rmax),rlen);
+ Histogram_Seqs(mrgHist,(uint8 *) (Read+3*Rmax),rlen);
+ Histogram_Seqs(subHist,(uint8 *) (Read+4*Rmax),rlen);
+
+ // delChar is the symbol of line 0 at the first position where line 1 holds 'n' or 'N'
+
+ if (delChar < 0)
+ { int k;
+ char *del = Read+Rmax;
+
+ for (k = 0; k < rlen; k++)
+ if (del[k] == 'n' || del[k] == 'N')
+ { delChar = Read[k];
+ break;
+ }
+ }
+ if (delChar >= 0)
+ Histogram_Runs( delRun,(uint8 *) (Read),rlen,delChar);
+ totChar += rlen;
+
+ // subChar is the most frequent symbol of line 4, chosen once at least 100000 symbols
+ // have been seen; runs in entries read before that point are not histogrammed.
+
+ if (subChar < 0)
+ { if (totChar >= 100000)
+ { int k;
+
+ subChar = 0;
+ for (k = 1; k < 256; k++)
+ if (subHist[k] > subHist[subChar])
+ subChar = k;
+ }
+ }
+ if (subChar >= 0)
+ Histogram_Runs( subRun,(uint8 *) (Read+4*Rmax),rlen,subChar);
+ }
+
+ return (0);
+}
+
+ // Using the statistics in the global stat tables, build the Huffman schemes and return
+ //   them in a statically allocated QVcoding whose prefix is NULL and whose flip is 0
+ //   (nothing is written to any file here).  If lossy is set, the insertion and merge
+ //   histograms are first folded onto every 2nd and every 4th value, respectively.
+ //   If any scheme cannot be built, all schemes built so far are freed and EXIT(NULL)
+ //   is taken.
+
+QVcoding *Create_QVcoding(int lossy)
+{ static QVcoding coding;
+
+  HScheme *delScheme, *insScheme, *mrgScheme, *subScheme;
+  HScheme *dRunScheme, *sRunScheme;
+
+  delScheme  = NULL;
+  dRunScheme = NULL;
+  insScheme  = NULL;
+  mrgScheme  = NULL;
+  subScheme  = NULL;
+  sRunScheme = NULL;
+
+  // Check whether using a substitution run char is a win.  subChar can already be -1
+  //   here (e.g. it was cleared by an earlier call of this routine), in which case
+  //   subHist must not be indexed with it.
+
+  if (subChar >= 0 && (totChar < 200000 || subHist[subChar] < .5*totChar))
+    subChar = -1;
+
+  // If lossy encoding is enabled then fold the counts of the insertion and merge QVs
+  //   onto the values that Compress_Next_QVentry rounds them down to.
+
+  if (lossy)
+    { int k;
+
+      for (k = 0; k < 256; k += 2)
+        { insHist[k] += insHist[k+1];
+          insHist[k+1] = 0;
+        }
+
+      for (k = 0; k < 256; k += 4)
+        { mrgHist[k] += mrgHist[k+1];
+          mrgHist[k] += mrgHist[k+2];
+          mrgHist[k] += mrgHist[k+3];
+          mrgHist[k+1] = 0;
+          mrgHist[k+2] = 0;
+          mrgHist[k+3] = 0;
+        }
+    }
+
+  // Build a Huffman scheme for each stream entity from the histograms.  When the first
+  //   scheme has its type field set, a second one is derived from it and the first is
+  //   freed; a NULL result from either call is treated as an error.
+
+#define SCHEME_MACRO(meme,hist,label,bits) \
+  scheme = Huffman( (hist), NULL); \
+  if (scheme == NULL) \
+    goto error; \
+  if (scheme->type) \
+    { (meme) = Huffman( (hist), scheme); \
+      free(scheme); \
+      if ((meme) == NULL) \
+        goto error; \
+    } \
+  else \
+    (meme) = scheme;
+
+#ifdef DEBUG
+
+#define MAKE_SCHEME(meme,hist,label,bits) \
+  SCHEME_MACRO(meme,hist,label,bits) \
+  printf("\n%s\n", (label) ); \
+  Print_Histogram( (hist)); \
+  Print_Table( (meme), (hist), (bits));
+
+#else
+
+#define MAKE_SCHEME(meme,hist,label,bits) \
+  SCHEME_MACRO(meme,hist,label,bits)
+
+#endif
+
+  { HScheme *scheme;
+
+    if (delChar < 0)
+      { MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs", 8);
+        dRunScheme = NULL;
+      }
+    else
+      { delHist[delChar] = 0;     //  the run char is coded by the run scheme instead
+        MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs less run char", 8);
+        MAKE_SCHEME(dRunScheme,delRun, "Histogram of Deletion Runs QVs", 16);
+#ifdef DEBUG
+        printf("\nRun char is '%c'\n",delChar);
+#endif
+      }
+
+#ifdef DEBUG
+    { int k;
+      uint64 count;
+
+      count = 0;
+      for (k = 0; k < 256; k++)
+        count += delHist[k];
+      printf("\nDelTag will require %lld bytes\n",count/4);
+    }
+#endif
+
+    MAKE_SCHEME(insScheme,insHist, "Hisotgram of Insertion QVs", 8);
+    MAKE_SCHEME(mrgScheme,mrgHist, "Hisotgram of Merge QVs", 8);
+
+    if (subChar < 0)
+      { MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs", 8);
+        sRunScheme = NULL;
+      }
+    else
+      { subHist[subChar] = 0;     //  the run char is coded by the run scheme instead
+        MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs less run char", 8);
+        MAKE_SCHEME(sRunScheme,subRun, "Histogram of Substitution Run QVs", 16);
+#ifdef DEBUG
+        printf("\nRun char is '%c'\n",subChar);
+#endif
+      }
+  }
+
+  // Setup endian handling
+
+  Set_Endian(0);
+
+  coding.delScheme  = delScheme;
+  coding.insScheme  = insScheme;
+  coding.mrgScheme  = mrgScheme;
+  coding.subScheme  = subScheme;
+  coding.dRunScheme = dRunScheme;
+  coding.sRunScheme = sRunScheme;
+  coding.delChar    = delChar;
+  coding.subChar    = subChar;
+  coding.prefix     = NULL;
+  coding.flip       = 0;
+
+  return (&coding);
+
+  // Schemes not yet built are still NULL, and free(NULL) is a no-op
+
+error:
+  free(delScheme);
+  free(dRunScheme);
+  free(insScheme);
+  free(mrgScheme);
+  free(subScheme);
+  free(sRunScheme);
+  EXIT(NULL);
+}
+
+ // Write the encoding scheme 'coding' to 'output': the endian key, the two run chars
+ //   (256 standing for "none"), the length-prefixed header prefix, and then the scheme
+ //   tables.  A NULL prefix, as left by Create_QVcoding, is written as a zero-length
+ //   prefix.
+
+void Write_QVcoding(FILE *output, QVcoding *coding)
+{
+  // Write out the endian key, run chars, and prefix (if not NULL)
+
+  { uint16 half;
+    int    len;
+
+    half = 0x33cc;
+    fwrite(&half,sizeof(uint16),1,output);
+
+    if (coding->delChar < 0)
+      half = 256;
+    else
+      half = (uint16) (coding->delChar);
+    fwrite(&half,sizeof(uint16),1,output);
+
+    if (coding->subChar < 0)
+      half = 256;
+    else
+      half = (uint16) (coding->subChar);
+    fwrite(&half,sizeof(uint16),1,output);
+
+    // strlen must not be handed a NULL pointer, so an unset prefix has length 0
+
+    if (coding->prefix == NULL)
+      len = 0;
+    else
+      len = strlen(coding->prefix);
+    fwrite(&len,sizeof(int),1,output);
+    if (len > 0)
+      fwrite(coding->prefix,1,len,output);
+  }
+
+  // Write out the scheme tables, the run schemes only when a run char is in use
+
+  Write_Scheme(coding->delScheme,output);
+  if (coding->delChar >= 0)
+    Write_Scheme(coding->dRunScheme,output);
+  Write_Scheme(coding->insScheme,output);
+  Write_Scheme(coding->mrgScheme,output);
+  Write_Scheme(coding->subScheme,output);
+  if (coding->subChar >= 0)
+    Write_Scheme(coding->sRunScheme,output);
+}
+
+ // Read an encoding scheme from 'input' and return it in a statically allocated
+ //   QVcoding.  The prefix string and the schemes are heap allocated.  On any failure
+ //   the prefix and schemes allocated by this call are freed and EXIT(NULL) is taken.
+
+QVcoding *Read_QVcoding(FILE *input)
+{ static QVcoding coding;
+
+  // Read endian key, run chars, and short name common to all headers
+
+  { uint16 half;
+    int    len;
+
+    if (fread(&half,sizeof(uint16),1,input) != 1)
+      { EPRINTF(EPLACE,"Could not read flip byte (Read_QVcoding)\n");
+        EXIT(NULL);
+      }
+    coding.flip = (half != 0x33cc);
+
+    // A run char is stored as a uint16, any value >= 256 meaning "no run char"
+
+    if (fread(&half,sizeof(uint16),1,input) != 1)
+      { EPRINTF(EPLACE,"Could not read deletion char (Read_QVcoding)\n");
+        EXIT(NULL);
+      }
+    if (coding.flip)
+      Flip_Short(&half);
+    coding.delChar = half;
+    if (coding.delChar >= 256)
+      coding.delChar = -1;
+
+    if (fread(&half,sizeof(uint16),1,input) != 1)
+      { EPRINTF(EPLACE,"Could not read substitution char (Read_QVcoding)\n");
+        EXIT(NULL);
+      }
+    if (coding.flip)
+      Flip_Short(&half);
+    coding.subChar = half;
+    if (coding.subChar >= 256)
+      coding.subChar = -1;
+
+    // Read the short name common to all headers
+
+    if (fread(&len,sizeof(int),1,input) != 1)
+      { EPRINTF(EPLACE,"Could not read header name length (Read_QVcoding)\n");
+        EXIT(NULL);
+      }
+    if (coding.flip)
+      Flip_Long(&len);
+
+    // A negative length can only come from a damaged file; it must be rejected before
+    //   it is used as an allocation size and as an index into the prefix
+
+    if (len < 0)
+      { EPRINTF(EPLACE,"Header name length is negative (Read_QVcoding)\n");
+        EXIT(NULL);
+      }
+    coding.prefix = (char *) Malloc(len+1,"Allocating header prefix");
+    if (coding.prefix == NULL)
+      EXIT(NULL);
+    if (len > 0)
+      { if (fread(coding.prefix,len,1,input) != 1)
+          { EPRINTF(EPLACE,"Could not read header name (Read_QVcoding)\n");
+            free(coding.prefix);
+            coding.prefix = NULL;
+            EXIT(NULL);
+          }
+      }
+    coding.prefix[len] = '\0';
+  }
+
+  // Setup endian handling
+
+  Set_Endian(coding.flip);
+
+  // Read the Huffman schemes used to compress the data.  All are set to NULL first so
+  //   that the error path can free exactly those that were read.
+
+  coding.delScheme  = NULL;
+  coding.dRunScheme = NULL;
+  coding.insScheme  = NULL;
+  coding.mrgScheme  = NULL;
+  coding.subScheme  = NULL;
+  coding.sRunScheme = NULL;
+
+  coding.delScheme = Read_Scheme(input);
+  if (coding.delScheme == NULL)
+    goto error;
+  if (coding.delChar >= 0)
+    { coding.dRunScheme = Read_Scheme(input);
+      if (coding.dRunScheme == NULL)
+        goto error;
+    }
+  coding.insScheme = Read_Scheme(input);
+  if (coding.insScheme == NULL)
+    goto error;
+  coding.mrgScheme = Read_Scheme(input);
+  if (coding.mrgScheme == NULL)
+    goto error;
+  coding.subScheme = Read_Scheme(input);
+  if (coding.subScheme == NULL)
+    goto error;
+  if (coding.subChar >= 0)
+    { coding.sRunScheme = Read_Scheme(input);
+      if (coding.sRunScheme == NULL)
+        goto error;
+    }
+
+  return (&coding);
+
+  // Schemes not yet read are still NULL, and free(NULL) is a no-op
+
+error:
+  free(coding.delScheme);
+  free(coding.dRunScheme);
+  free(coding.insScheme);
+  free(coding.mrgScheme);
+  free(coding.subScheme);
+  free(coding.sRunScheme);
+  free(coding.prefix);
+  coding.prefix = NULL;
+  EXIT(NULL);
+}
+
+ // Release all the auxiliary storage hanging off coding: the Huffman schemes and the
+ //   prefix string.  A run scheme is freed only if its run char is in use (>= 0).
+
+void Free_QVcoding(QVcoding *coding)
+{ free(coding->delScheme);
+  if (coding->delChar >= 0)
+    free(coding->dRunScheme);
+
+  free(coding->insScheme);
+  free(coding->mrgScheme);
+
+  free(coding->subScheme);
+  if (coding->subChar >= 0)
+    free(coding->sRunScheme);
+
+  free(coding->prefix);
+}
+
+
+/*******************************************************************************************
+ *
+ * Encode/Decode (w.r.t. coding) next entry from input and write to output
+ *
+ ********************************************************************************************/
+
+// Read the next 5 QV lines from input, encode each with its scheme from coding, and
+//   write the result to output in the order: deletion QVs, packed deletion tags,
+//   insertion QVs, merge QVs, substitution QVs.  If lossy is set, the insertion and
+//   merge QVs are first rounded down to a multiple of 2 and 4, respectively.
+//   Returns 0 on success; a failed read goes through EXIT(1).
+
+int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy)
+{ int    nqv, ntag;
+  char  *delTag;
+  uint8 *delQV, *insQV, *mrgQV, *subQV;
+
+  nqv = Read_Lines(input,5);
+  if (nqv < 0)
+    { if (nqv == -1)
+        EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline);
+      EXIT (1);
+    }
+
+  // The 5 lines sit in consecutive Rmax-sized slices of Read
+
+  delQV  = (uint8 *) Read;
+  delTag = Read+Rmax;
+  insQV  = (uint8 *) (Read+2*Rmax);
+  mrgQV  = (uint8 *) (Read+3*Rmax);
+  subQV  = (uint8 *) (Read+4*Rmax);
+
+  // Deletion QVs, then the deletion tags.  With a run char in use only the tags at
+  //   non-run-char positions are kept.
+
+  if (coding->delChar >= 0)
+    { Encode_Run(coding->delScheme, coding->dRunScheme, output,
+                 delQV, nqv, coding->delChar);
+      ntag = Pack_Tag(delTag,Read,nqv,coding->delChar);
+    }
+  else
+    { Encode(coding->delScheme, output, delQV, nqv);
+      ntag = nqv;
+    }
+  Number_Read(delTag);
+  Compress_Read(ntag,delTag);
+  fwrite(delTag,1,COMPRESSED_LEN(ntag),output);
+
+  // Lossy mode: clear the low bit of insertion QVs and the low 2 bits of merge QVs
+
+  if (lossy)
+    { int k;
+
+      for (k = 0; k < nqv; k++)
+        { insQV[k] &= 0xfe;
+          mrgQV[k] &= 0xfc;
+        }
+    }
+
+  Encode(coding->insScheme, output, insQV, nqv);
+  Encode(coding->mrgScheme, output, mrgQV, nqv);
+  if (coding->subChar >= 0)
+    Encode_Run(coding->subScheme, coding->sRunScheme, output,
+               subQV, nqv, coding->subChar);
+  else
+    Encode(coding->subScheme, output, subQV, nqv);
+
+  return (0);
+}
+
+// Decode the 5 streams of the next compressed entry on input into entry[0..4], each of
+//   length rlen: deletion QVs, deletion tags, insertion QVs, merge QVs, substitution QVs.
+//   Returns 0 on success; any decoding or read failure goes through EXIT(1).
+
+int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen)
+{ int runs, ntag, nbyte;
+
+  runs = (coding->delChar >= 0);
+
+  // Deletion QVs.  With a run char in use, tags were stored only for the positions
+  //   whose QV is not the run char, so count those to learn the packed tag length.
+
+  if (runs)
+    { if (Decode_Run(coding->delScheme, coding->dRunScheme, input,
+                     entry[0], rlen, coding->delChar))
+        EXIT(1);
+      ntag = Packed_Length(entry[0],rlen,coding->delChar);
+    }
+  else
+    { if (Decode(coding->delScheme, input, entry[0], rlen))
+        EXIT(1);
+      ntag = rlen;
+    }
+
+  // Deletion tags: read the compressed bytes, expand them, and convert to lower case
+
+  nbyte = COMPRESSED_LEN(ntag);
+  if (nbyte > 0 && fread(entry[1],nbyte,1,input) != 1)
+    { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n");
+      EXIT(1);
+    }
+  Uncompress_Read(ntag,entry[1]);
+  Lower_Read(entry[1]);
+  if (runs)
+    Unpack_Tag(entry[1],ntag,entry[0],rlen,coding->delChar);
+
+  // Insertion, merge, and substitution QVs
+
+  if (Decode(coding->insScheme, input, entry[2], rlen))
+    EXIT(1);
+
+  if (Decode(coding->mrgScheme, input, entry[3], rlen))
+    EXIT(1);
+
+  if (coding->subChar >= 0)
+    { if (Decode_Run(coding->subScheme, coding->sRunScheme, input,
+                     entry[4], rlen, coding->subChar))
+        EXIT(1);
+    }
+  else
+    { if (Decode(coding->subScheme, input, entry[4], rlen))
+        EXIT(1);
+    }
+
+  return (0);
+}
diff --git a/QV.h b/QV.h
new file mode 100644
index 0000000..35fbadc
--- /dev/null
+++ b/QV.h
@@ -0,0 +1,125 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
+ * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on
+ * the histogram of values occurring in a given file. The two low complexity streams
+ * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevalent
+ * character.
+ *
+ * Author: Gene Myers
+ * Date: Jan 18, 2014
+ * Modified: July 25, 2014
+ *
+ ********************************************************************************************/
+
+#ifndef _QV_COMPRESSOR
+
+#define _QV_COMPRESSOR
+
+ // The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or
+ // batch version of the routines in this library are compiled. In batch mode, routines
+ // print an error message and exit. In interactive mode, the routines place the error
+ // message in EPLACE (also defined in DB.h) and return an error value, typically NULL
+ // if the routine returns a pointer, and an unusual integer value if the routine returns
+ // an integer.
+ // Below when an error return is described, one should understand that this value is returned
+ // only if the routine was compiled in INTERACTIVE mode.
+
+ // A PacBio compression scheme
+
+typedef struct
+ { void *delScheme; // Huffman scheme for deletion QVs
+ void *insScheme; // Huffman scheme for insertion QVs
+ void *mrgScheme; // Huffman scheme for merge QVs
+ void *subScheme; // Huffman scheme for substitution QVs
+ void *dRunScheme; // Huffman scheme for deletion run lengths (used only if delChar >= 0)
+ void *sRunScheme; // Huffman scheme for substitution run lengths (used only if subChar >= 0)
+ int delChar; // If >= 0, run-encoded deletion value; -1 if there is none
+ int subChar; // If >= 0, run-encoded substitution value; -1 if there is none
+ int flip; // Need to flip multi-byte integers
+ char *prefix; // Header line prefix
+ } QVcoding;
+
+ // Read the next nlines of input; QVentry returns a pointer to the first line if needed.
+ // If end-of-input is encountered before any further input, -1 is returned. If there is
+ // an error then -2 is returned. Otherwise the length of the line(s) read is returned.
+
+int Read_Lines(FILE *input, int nlines);
+char *QVentry();
+
+ // Read the .quiva file on input and record frequency statistics. If there is an error
+ // then 1 is returned, otherwise 0.
+
+int QVcoding_Scan(FILE *input);
+
+ // Given QVcoding_Scan has been called at least once, create an encoding scheme based on
+ // the accumulated statistics and return a pointer to it. The returned encoding object
+ // is *statically* allocated within the routine. If lossy is set then use a lossy scaling
+ // for the insertion and merge streams. If there is an error, then NULL is returned.
+
+QVcoding *Create_QVcoding(int lossy);
+
+ // Read/write a coding scheme to input/output. The encoding object returned by the reader
+ // is *statically* allocated within the routine. If an error occurs while reading then
+ // NULL is returned.
+
+QVcoding *Read_QVcoding(FILE *input);
+void Write_QVcoding(FILE *output, QVcoding *coding);
+
+ // Free all the auxiliary storage associated with coding (but not the object itself!)
+
+void Free_QVcoding(QVcoding *coding);
+
+ // Assuming the file pointer is positioned just beyond an entry header line, read the
+ // next set of 5 QV lines, compress them according to 'coding', and output. If lossy
+ // is set then the scheme is a lossy one. A non-zero value is returned only if an
+ // error occurred.
+
+int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy);
+
+ // Assuming the input is positioned just beyond the compressed encoding of an entry header,
+ // read the set of compressed encodings for the ensuing 5 QV vectors, decompress them,
+ // and place their decompressed values into entry which is a 5 element array of character
+ // pointers. The parameter rlen, computed from the preceding header line, critically
+ // provides the length of each of the 5 vectors. A non-zero value is returned only if an
+ // error occurred.
+
+int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen);
+
+#endif // _QV_COMPRESSOR
diff --git a/README b/README
new file mode 100644
index 0000000..e6308e8
--- /dev/null
+++ b/README
@@ -0,0 +1,442 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/************************************************************************************\
+
+UPGRADE & DEVELOPER NOTES ! ! !
+
+ If you have already built a big database and don't want to rebuild it, but do want
+to use a more recent version of the software that entails a change to the data
+structures (currently the updates on Sept 25, 2014 and December 31, 2014), please note
+the routines DBupgrade.Sep.25.2014 and DBupgrade.Dec.31.2014. These take a DB, say X,
+as an argument, and produce a file .X.ndx which you should then replace .X.idx with.
+To update a very old DB to today's version you will need to run both in sequence.
+
+ Both of the upgrade programs can be made with "make" but are not by default created
+when make is called without an argument.
+
+ For those interested in the details, on September 25, the "beg" and "end" fields went
+from shorts to ints, and on December 31, the "beg" and "end" fields became "fpulse" and
+"rlen", respectively where fpulse = beg and rlen = end-beg.
+
+ Unfortunately, the .dust track formats also changed on Dec.31.2014 and Jan.1.2015. To
+upgrade said use DUSTupgrade.Jan.1.2015. This program takes a DB, say X as an argument
+and produces .X.next.anno and .X.next.data which you should then replace .X.dust.* with.
+Of course, it may, if the DB is not too big, be easier and simpler to just rerun DBdust.
+
+ Developers should also note carefully that the calling conventions to Open_DB have
+changed and there are new utility routines Number_Digits and Check_Track.
+
+\************************************************************************************/
+
+
+ The Dazzler Database Library
+
+ Author: Gene Myers
+ First: July 17, 2013
+ Current: December 31, 2014
+
+ To facilitate the multiple phases of the dazzler assembler, we organize all the read
+data into what is effectively a "database" of the reads and their meta-information.
+The design goals for this data base are as follows:
+
+(1) The database stores the source Pacbio read information in such a way that it can
+ recreate the original input data, thus permitting a user to remove the
+ (effectively redundant) source files. This avoids duplicating the same data,
+ once in the source file and once in the database.
+
+(2) The data base can be built up incrementally, that is new sequence data can be added
+ to the data base over time.
+
+(3) The data base flexibly allows one to store any meta-data desired for reads. This
+ is accomplished with the concept of *tracks* that implementors can add as they
+ need them.
+
+(4) The data is held in a compressed form equivalent to the .dexta and .dexqv files of
+ the data extraction module. Both the .fasta and .quiva information for each
+ read is held in the data base and can be recreated from it. The .quiva
+ information can be added separately and later on if desired.
+
+(5) To facilitate job parallel, cluster operation of the phases of our assembler, the
+ data base has a concept of a *current partitioning* in which all the reads that
+ are over a given length and optionally unique to a well, are divided up into
+ *blocks* containing roughly a given number of bases, except possibly the last
+ block which may have a short count. Often programs can be run on blocks or
+ pairs of blocks and each such job is reasonably well balanced as the blocks are
+ all the same size. One must be careful about changing the partition during an
+ assembly as doing so can void the structural validity of any interim
+ block-based results.
+
+ A Dazzler DB consists of one named, *visible* file, e.g. FOO.db, and several
+*invisible* secondary files encoding various elements of the DB. The secondary files
+are "invisible" to the UNIX OS in the sense that they begin with a "." and hence are
+not listed by "ls" unless one specifies the -a flag. We chose to do this so that when
+a user lists the contents of a directory they just see a single name, e.g. FOO.db, that
+is the one used to refer to the DB in commands. The files associated with a database
+named, say FOO, are as follows:
+
+(a) "FOO.db": a text file containing
+ (i) the list of input files added to the database so far, and
+ (ii) how to partition the database into blocks (if the partition
+ parameters have been set).
+
+(b) ".FOO.idx": a binary "index" of all the meta-data about each read allowing, for
+ example, one to randomly access a read's sequence (in the store
+ ".FOO.bps"). It is 28N + 88 bytes in size where N is the number of
+ reads in the database.
+
+(c) ".FOO.bps": a binary compressed "store" of all the DNA sequences. It is M/4 bytes
+ in size where M is the total number of base pairs in the database.
+
+(d) ".FOO.qvs": a binary compressed "store" of the 5 Pacbio quality value streams for
+ the reads. Its size is roughly 5/3M bytes depending on the
+ compression achieved. This file only exists if .quiva files have
+ been added to the database.
+
+(e) ".FOO.<track>.anno": a *track* containing customized meta-data for each read. For
+ ".FOO.<track>.data" example, the DBdust command annotates low complexity intervals
+ of reads and records the intervals for each read in two files
+ .FOO.dust.anno & .FOO.dust.data. Any kind of information
+ about a read can be recorded, such as micro-sats, repeat
+ intervals, corrected sequence, etc. Specific tracks will be
+ described as modules that produce them are released.
+
+If one does not like the convention of the secondary files being invisible, then
+un-defining the constant HIDE_FILES in DB.h before compiling the library, creates
+commands that do not place a prefixing "." before secondary file names, e.g. FOO.idx
+instead of .FOO.idx. One then sees all the files realizing a DB when listing the
+contents of a directory with ls.
+
+ While a Dazzler DB holds a collection of Pacbio reads, a Dazzler map DB or DAM holds
+a collection of contigs from a reference genome assembly. This special type of DB has
+been introduced in order to facilitate the mapping of reads to an assembly and has
+been given the suffix .dam to distinguish it from an ordinary DB. It is structurally
+identical to a .db except:
+
+(a) there is no concept of quality values, and hence no .FOO.qvs file.
+
+(b) every .fasta scaffold (a sequence with runs of N's between contigs estimating the
+ length of the gap) is broken into a separate contig sequence in the DB and the
+ header for each scaffold is retained in a new .FOO.hdr file.
+
+(c) the original and first and last pulse fields in the meta-data records held in
+ .FOO.idx, hold instead the contig number and the interval of the contig within
+ its original scaffold sequence.
+
+A map DB can equally well be the argument of many of the commands below that operate
+on normal DBs. In general, a .dam can be an argument anywhere a .db can, with the
+exception of routines or optioned calls to routines that involve quality values, or
+the special routines fasta2DAM and DAM2fasta that create a DAM and reverse said,
+just like the pair fasta2DB and DB2fasta do for a normal DB. So in general when we
+refer to a database we are referring to either a DB or a DAM.
+
+ The command DBsplit sets or resets the current partition for a database which is
+determined by 3 parameters: (i) the total number of basepairs to place in each block,
+(ii) the minimum read length of reads to include within a block, and (iii) whether or
+not to only include the longest read from a given well or all reads from a well (NB:
+several reads of the same insert in a given well can be produced by the Pacbio
+instrument). Note that the length and uniqueness parameters effectively select a
+subset of the reads that contribute to the size of a block. We call this subset the
+*trimmed* data base. Some commands operate on the entire database, others on the
+trimmed database, and yet others have an option flag that permits them to operate on
+either at the user's discretion. Therefore, one should note carefully to which version
+of the database a command refers. This is especially important for any command that
+identifies reads by their index (ordinal position) in the database.
+
+Once the database has been split into blocks, the commands DBshow, DBstats, and DBdust
+below and commands yet to come, such as the local alignment finder dalign, can take a
+block or blocks as arguments. On the command line this is indicated by supplying the
+name of the DB followed by a period and then a block number, e.g. FOO.3.db or simply
+FOO.3, refers to the 3'rd block of DB FOO (assuming of course it has a current
+partition and said partition has a 3rd block). One should note carefully that a block
+is a contiguous range of reads such that once it is trimmed has a given size in base
+pairs (as set by DBsplit). Thus like an entire database, a block can be either
+untrimmed or trimmed and one needs to again be careful when giving a read index to
+a command such as DBshow.
+
+All programs add suffixes (e.g. .db) as needed. The commands of the database library
+are currently as follows:
+
+1. fasta2DB [-v] <path:db> ( -f<file> | <input:fasta> ... )
+
+Builds an initial data base, or adds to an existing database, the list of .fasta files
+following the database name argument, or if the -f option is used, the list of .fasta
+files in <file>. A given .fasta file can only be added once to the DB (this is checked
+by the command). The .fasta headers must be in the "Pacbio" format (i.e. the output
+of the Pacbio tools or our dextract program) and the well, pulse interval, and read
+quality are extracted from the header and kept with each read record. If the files
+are being added to an existing database, and the partition settings of the DB have
+already been set (see DBsplit below), then the partitioning of the database is updated
+to include the new data.
+
+2. DB2fasta [-vU] [-w<int(80)>] <path:db>
+
+The set of .fasta files for the given DB are recreated from the DB exactly as they were
+input. That is, this is a perfect inversion, including the reconstitution of the
+proper .fasta headers. Because of this property, one can, if desired, delete the
+.fasta source files once they are in the DB as they can always be recreated from it.
+By default the output sequences are in lower case and 80 chars per line. The -U option
+specifies upper case should be used, and the characters per line, or line width, can be
+set to any positive value with the -w option.
+
+3. quiva2DB [-vl] <path:db> ( -f<file> | <input:quiva> ... )
+
+Adds the given .quiva files on the command line or in the file specified by the
+-f option to an existing DB "path". The input files must be added in the same order
+as the .fasta files were and have the same root names, e.g. FOO.fasta and FOO.quiva.
+The files can be added incrementally but must be added in the same order as the .fasta
+files. This is enforced by the program. With the -l option set the compression
+scheme is a bit lossy to get more compression (see the description of dexqv in the
+DEXTRACTOR module).
+
+4. DB2quiva [-vU] <path:db>
+
+The set of .quiva files within the given DB are recreated from the DB exactly as they
+were input. That is, this is a perfect inversion, including the reconstitution of the
+proper .quiva headers. Because of this property, one can, if desired, delete the
+.quiva source files once they are in the DB as they can always be recreated from it.
+By .fastq convention each QV vector is output as a line without new-lines, and by
+default the Deletion Tag entry is in lower case letters. The -U option specifies
+upper case letters should be used instead.
+
+5. fasta2DAM [-v] <path:dam> ( -f<file> | <input:fasta> ... )
+
+Builds a map DB or DAM from the list of .fasta files following the map database name
+argument, or if the -f option is used, the list of .fasta files in <file>. Any .fasta
+entry that has a run of N's in it will be split into separate "contig" entries and
+the interval of the contig in the original entry recorded. The header for each .fasta
+entry is saved with the contigs created from it.
+
+6. DAM2fasta [-vU] [-w<int(80)>] <path:dam>
+
+The set of .fasta files for the given map DB or DAM are recreated from the DAM
+exactly as they were input. That is, this is a perfect inversion, including the
+reconstitution of the proper .fasta headers and the concatenation of contigs with
+the proper number of N's between them. By default the output sequences are in lower
+case and 80 chars per line. The -U option specifies upper case should be used, and
+the characters per line, or line width, can be set to any positive value with
+the -w option.
+
+7. DBsplit [-a] [-x<int>] [-s<int(200)>] <path:db|dam>
+
+Divide the database <path>.db or <path>.dam conceptually into a series of blocks
+referable to on the command line as <path>.1, <path>.2, ... If the -x option is set
+then all reads less than the given length are ignored, and if the -a option is not
+set then secondary reads from a given well are also ignored. The remaining reads,
+constituting what we call the trimmed DB, are split amongst the blocks so that each
+block is of size -s * 1Mbp except for the last which necessarily contains a smaller
+residual. The default value for -s is 200Mbp because blocks of this size can be
+compared by our "overlapper" dalign in roughly 16Gb of memory. The blocks are very
+space efficient in that their sub-index of the master .idx is computed on the fly
+when loaded, and the .bps and .qvs files (if a .db) of base pairs and quality values,
+respectively, are shared with the master DB. Any relevant portions of tracks
+associated with the DB are also computed on the fly when loading a database block.
+
+8. DBdust [-b] [-w<int(64)>] [-t<double(2.)>] [-m<int(10)>] <path:db|dam>
+
+Runs the symmetric DUST algorithm over the reads in the untrimmed DB <path>.db or
+<path>.dam producing a track .<path>.dust[.anno,.data] that marks all intervals of low
+complexity sequence, where the scan window is of size -w, the threshold for being a
+low-complexity interval is -t, and only perfect intervals of size greater than -m are
+recorded. If the -b option is set then the definition of low complexity takes into
+account the frequency of a given base. The command is incremental: if given a DB to
+which new data has been added since it was last run on the DB, it will extend
+the track to include the new reads. It is important to set the -b flag for genomes with
+a strong AT/GC bias, albeit the code is a tad slower. The dust track, if present,
+is understood and used by DBshow, DBstats, and dalign.
+
+DBdust can also be run over an untrimmed DB block in which case it outputs a track
+encoding where the track file names contain the block number, e.g. .FOO.3.dust.anno
+and .FOO.3.dust.data, given FOO.3 on the command line. We call this a *block track*.
+This permits job parallelism in block-sized chunks, and the resulting sequence of
+block tracks can then be merged into a track for the entire untrimmed DB with Catrack.
+
+9. Catrack [-v] <path:db|dam> <track:name>
+
+Find all block tracks of the form .<path>.#.<track>... and merge them into a single
+track, .<path>.<track>..., for the given DB or DAM. The block track files must all
+encode the same kind of track data (this is checked), and the files must exist for
+block 1, 2, 3, ... up to the last block number.
+
+10. DBshow [-unqUQ] [-w<int(80)>] [-m<track>]+
+ <path:db|dam> [ <reads:FILE> | <reads:range> ... ]
+
+Displays the requested reads in the database <path>.db or <path>.dam. By default the
+command applies to the trimmed database, but if -u is set then the entire DB is used.
+If no read arguments are given then every read in the database or database block is
+displayed. Otherwise the input file or the list of supplied integer ranges give the
+ordinal positions in the actively loaded portion of the db. In the case of a file, it
+should simply contain a read index, one per line. In the other case, a read range is
+either a lone integer or the symbol $, in which case the read range consists of just
+that read (the last read in the database if $). One may also give two positive
+integers separated by a dash to indicate a range of integers, where again a $
+represents the index of the last read in the actively loaded db. For example,
+1 3-5 $ displays reads 1, 3, 4, 5, and the last read in the active db. As another
+example, 1-$ displays every read in the active db (the default).
+
+By default a .fasta file of the read sequences is displayed. If the -q option is
+set, then the QV streams are also displayed in a non-standard modification of the
+fasta format. If the -n option is set then the DNA sequence is *not* displayed.
+If the -Q option is set then a .quiva file is displayed and in this case the -n
+and -m options may not be set (and the -q and -w options have no effect).
+
+If one or more masks are set with the -m option then the track intervals are also
+displayed in an additional header line and the bases within an interval are displayed
+in the case opposite that used for all the other bases. By default the output
+sequences are in lower case and 80 chars per line. The -U option specifies upper
+case should be used, and the characters per line, or line width, can be set to any
+positive value with the -w option.
+
+The .fasta or .quiva files that are output can be converted into a DB by fasta2DB
+and quiva2DB (if the -q and -n options are not set and no -m options are set),
+giving one a simple way to make a DB of a subset of the reads for testing purposes.
+
+11. DBstats [-nu] [-b<int(1000)>] [-m<track>]+ <path:db|dam>
+
+Show overview statistics for all the reads in the trimmed data base <path>.db or
+<path>.dam, including a histogram of read lengths where the bucket size is set
+with the -b option (default 1000). If the -u option is given then the untrimmed
+database is summarized. If the -n option is given then the histogram of read lengths
+is not displayed. Any track such as a "dust" track that gives a series of
+intervals along the read can be specified with the -m option in which case a summary
+and a histogram of the interval lengths is displayed.
+
+12. DBrm <path:db|dam> ...
+
+Delete all the files for the given data bases. Do not use rm to remove a database, as
+there are at least two and often several secondary files for each DB including track
+files, and all of these are removed by DBrm.
+
+13. simulator <genlen:double> [-c<double(20.)>] [-b<double(.5)>] [-r<int>]
+ [-m<int(10000)>] [-s<int(2000)>]
+ [-x<int(4000)>] [-e<double(.15)>]
+ [-M<file>]
+
+In addition to the DB commands we include here, somewhat tangentially, a simple
+simulator that generates synthetic reads for a random genome. simulator first
+generates a fake genome of size genlen*1Mb long, that has an AT-bias of -b. It then
+generates sample reads of mean length -m from a log-normal length distribution with
+standard deviation -s, but ignores reads of length less than -x. It collects enough
+reads to cover the genome -c times and introduces -e fraction errors into each read
+where the ratio of insertions, deletions, and substitutions are set by defined
+constants INS_RATE (default 73%) and DEL_RATE (default 20%) within generate.c. One
+can also control the rate at which reads are picked from the forward and reverse
+strands by setting the defined constant FLIP_RATE (default 50/50). The -r option seeds
+the random number generator for the generation of the genome so that one can
+reproducibly generate the same underlying genome to sample from. If this parameter is
+missing, then the job id of the invocation seeds the random number generator. The
+output is sent to the standard output (i.e. it is a UNIX pipe). The output is in
+Pacbio .fasta format suitable as input to fasta2DB. Finally, the -M option requests
+that the coordinates from which each read has been sampled are written to the indicated
+file, one line per read, ASCII encoded. This "map" file essentially tells one where
+every read belongs in an assembly and is very useful for debugging and testing
+purposes. If a read pair is say b,e then if b < e the read was sampled from [b,e] in
+the forward direction, and if b > e from [e,b] in the reverse direction.
+
+Example:
+
+ A small complete example of most of the commands above.
+
+> simulator 1.0 -c20. >G.fasta // Generate a 20x data set of a 1Mb genome
+> fasta2DB G G.fasta // Create a compressed data base of the reads, G.db
+> rm G.fasta // Redundant, recreate any time with "DB2fasta G"
+> DBsplit -s11 G // Split G into 2 parts of size ~ 11MB each
+> DBdust G.1 // Produce a "dust" track on each part
+> DBdust G.2
+> Catrack G dust // Create one track for all of the DB
+> rm .G.*.dust.* // Clean up the sub-tracks
+> DBstats -mdust G // Take a look at the statistics for the database
+
+Statistics for all reads in the data set
+
+ 1,836 reads out of 1,836 (100.0%)
+ 20,007,090 base pairs out of 20,007,090 (100.0%)
+
+ 10,897 average read length
+ 2,192 standard deviation
+
+ Base composition: 0.250(A) 0.250(C) 0.250(G) 0.250(T)
+
+ Distribution of Read Lengths (Bin size = 1,000)
+
+ Bin: Count % Reads % Bases Average
+ 22,000: 1 0.1 0.1 22654
+ 21,000: 0 0.1 0.1 22654
+ 20,000: 1 0.1 0.2 21355
+ 19,000: 0 0.1 0.2 21355
+ 18,000: 4 0.3 0.6 19489
+ 17,000: 8 0.8 1.3 18374
+ 16,000: 19 1.8 2.8 17231
+ 15,000: 43 4.1 6.2 16253
+ 14,000: 81 8.6 12.0 15341
+ 13,000: 146 16.5 21.9 14428
+ 12,000: 200 27.4 34.4 13664
+ 11,000: 315 44.6 52.4 12824
+ 10,000: 357 64.0 71.2 12126
+ 9,000: 306 80.7 85.8 11586
+ 8,000: 211 92.2 94.8 11208
+ 7,000: 95 97.3 98.4 11017
+ 6,000: 43 99.7 99.8 10914
+ 5,000: 6 100.0 100.0 10897
+
+
+Statistics for dust-track
+
+ There are 158 intervals totaling 1,820 bases (0.0% of all data)
+
+ Distribution of dust intervals (Bin size = 1,000)
+
+ Bin: Count % Intervals % Bases Average
+ 0: 158 100.0 100.0 11
+
+> ls -al
+total 66518744
+drwxr-xr-x+ 177 myersg staff 6018 Mar 2 13:28 .
+drwxr-xr-x+ 20 myersg staff 680 Feb 26 19:52 ..
+-rw-r--r--+ 1 myersg staff 5002464 Mar 2 13:28 .G.bps
+-rw-r--r--+ 1 myersg staff 14704 Mar 2 13:28 .G.dust.anno
+-rw-r--r--+ 1 myersg staff 1264 Mar 2 13:28 .G.dust.data
+-rw-r--r--+ 1 myersg staff 73552 Mar 2 13:28 .G.idx
+-rw-r--r--+ 1 myersg staff 162 Mar 2 13:28 G.db
+> cat G.db
+files = 1
+ 1836 G Sim
+blocks = 2
+size = 11 cutoff = 0 all = 0
+ 0 0
+ 1011 1011
+ 1836 1836
diff --git a/fasta2DAM.c b/fasta2DAM.c
new file mode 100644
index 0000000..d7f75ec
--- /dev/null
+++ b/fasta2DAM.c
@@ -0,0 +1,450 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
+ * Add .fasta files to a DB:
+ * Adds the given fasta files in the given order to <path>.db. If the db does not exist
+ * then it is created. All .fasta files added to a given data base must have the same
+ * header format and follow Pacbio's convention. A file cannot be added twice and this
+ * is enforced. The command either builds or appends to the .<path>.idx and .<path>.bps
+ * files, where the index file (.idx) contains information about each read and their offsets
+ * in the base-pair file (.bps) that holds the sequences where each base is compressed
+ * into 2-bits. The two files are hidden by virtue of their names beginning with a '.'.
+ * <path>.db is effectively a stub file with given name that contains an ASCII listing
+ * of the files added to the DB and possibly the block partitioning for the DB if DBsplit
+ * has been called upon it.
+ *
+ * Author: Gene Myers
+ * Date : May 2013
+ * Modify: DB upgrade: now *add to* or create a DB depending on whether it exists, read
+ * multiple .fasta files (no longer a stdin pipe).
+ * Date : April 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <strings.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "DB.h"
+
+#ifdef HIDE_FILES              // PATHSEP is placed between the directory and the root name of the .bps/.idx/.hdr files
+#define PATHSEP "/."           // hidden variant: the file name starts with a '.'
+#else
+#define PATHSEP "/"            // plain variant: ordinary, visible file name
+#endif
+
+static char *Usage = "[-v] <path:string> ( -f<file> | <input:fasta> ... )";   // printed after the program name when the arguments are wrong
+
+static char number[128] =      // Lookup from 7-bit character code to base number: a->0, c->1, g->2, t->3, n->4, anything else->0
+ { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 0, 0, 0, 2,        // codes 64-71:   'C' (67) -> 1, 'G' (71) -> 2
+ 0, 0, 0, 0, 0, 0, 4, 0,        // codes 72-79:   'N' (78) -> 4, the value main treats as a contig separator
+ 0, 0, 0, 0, 3, 0, 0, 0,        // codes 80-87:   'T' (84) -> 3
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 0, 0, 0, 2,        // codes 96-103:  'c' (99) -> 1, 'g' (103) -> 2
+ 0, 0, 0, 0, 0, 0, 4, 0,        // codes 104-111: 'n' (110) -> 4
+ 0, 0, 0, 0, 3, 0, 0, 0,        // codes 112-119: 't' (116) -> 3
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ };
+
+typedef struct                 // Cursor over the input file names: either command-line arguments or the lines of a list file
+ { int argc;                   // argument count handed to init_file_iterator
+ char **argv;                  // argument vector handed to init_file_iterator
+ FILE *input;                  // open list of file names, or NULL when the names are taken from argv
+ int count;                    // next argv index to hand out when input is NULL, otherwise the 1-based number of the next list line
+ char *name;                   // file name set by the most recent call to next_file
+ } File_Iterator;
+
+//  Allocate and set up an iterator over the input file names.
+//    argc, argv:  the (flag-stripped) command line
+//    input:       open file holding one name per line, or NULL to take names from argv
+//    first:       index in argv of the first file name (only used when input is NULL)
+//  The iterator is heap allocated and the caller releases it with free().  If the
+//  allocation fails the program exits with status 1 instead of dereferencing NULL.
+
+File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first)
+{ File_Iterator *it;
+
+  it = Malloc(sizeof(File_Iterator),"Allocating file iterator");
+  if (it == NULL)                  //  Malloc can return NULL (main checks it for its own buffers)
+    exit (1);
+
+  it->argc  = argc;
+  it->argv  = argv;
+  it->input = input;
+  if (input == NULL)
+    it->count = first;             //  Names come from argv[first..argc-1]
+  else
+    { it->count = 1;               //  Names come from the list file: count is the line number
+      rewind(input);               //  The list is scanned more than once, so always start at the top
+    }
+  return (it);
+}
+
+//  Advance the iterator to the next input file name.
+//    Returns 0 when the names are exhausted, otherwise 1 with it->name set.
+//    When the names come from a list file and a line does not fit in the buffer, a
+//    message is printed and 1 is returned with it->name == NULL so the caller can abort.
+//    For list files it->name points into a static buffer that the next call overwrites.
+
+int next_file(File_Iterator *it)
+{ static char nbuffer[MAX_NAME+8];
+
+  if (it->input == NULL)
+    { if (it->count >= it->argc)
+        return (0);
+      it->name = it->argv[it->count++];
+      return (1);
+    }
+
+  if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL)
+    { if (feof(it->input))
+        return (0);
+      SYSTEM_ERROR;
+    }
+
+  { char *eol;
+
+    eol = strchr(nbuffer,'\n');
+    if (eol != NULL)
+      *eol = '\0';
+    else if ((int) strlen(nbuffer) >= MAX_NAME+7)
+
+      //  No newline and the buffer is full: the line is too long.  Return at once, the
+      //    original fell through and wrote through the NULL eol pointer.
+
+      { fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n",
+                       Prog_Name,it->count,MAX_NAME+7);
+        it->name = NULL;
+        return (1);
+      }
+
+    //  Otherwise no newline only means the last line of the list is unterminated,
+    //    which is a perfectly good name.
+  }
+
+  it->count += 1;
+  it->name   = nbuffer;
+  return (1);
+}
+
+
+//  fasta2DAM [-v] <path:dam> ( -f<file> | <input:fasta> ... )
+//
+//  Creates <path>.dam together with its .idx, .bps and .hdr files from the given .fasta
+//  files.  Every header line is copied to the .hdr file, and every maximal run of
+//  characters that the number[] table maps below 4 becomes one 2-bit compressed
+//  record ("contig") in the .bps/.idx files.  Exits with 0 on success; on a failure
+//  inside the main loop the four output files are removed and the exit status is 1.
+
+int main(int argc, char *argv[])
+{ FILE  *ostub;
+  char  *dbname;
+  char  *root, *pwd;
+
+  FILE  *bases, *indx, *hdrs;
+  int64  boff, hoff;
+
+  int    ifiles, ofiles;
+  char **flist;
+
+  HITS_DB db;
+  int     ureads;
+
+  int     VERBOSE;
+  FILE   *IFILE;
+
+  //  Process command line: flags are removed from argv, so afterwards argv[1] is the
+  //    database path and argv[2..argc-1] are the .fasta files (none if -f was given)
+
+  { int i, j, k;
+    int flags[128];
+
+    ARG_INIT("fasta2DAM")
+
+    IFILE = NULL;
+
+    j = 1;
+    for (i = 1; i < argc; i++)
+      if (argv[i][0] == '-')
+        switch (argv[i][1])
+        { default:
+            ARG_FLAGS("v")
+            break;
+          case 'f':
+            IFILE = fopen(argv[i]+2,"r");
+            if (IFILE == NULL)
+              { fprintf(stderr,"%s: Cannot open file of inputs '%s'\n",Prog_Name,argv[i]+2);
+                exit (1);
+              }
+            break;
+        }
+      else
+        argv[j++] = argv[i];
+    argc = j;
+
+    VERBOSE = flags['v'];
+
+    if ((IFILE == NULL && argc <= 2) || (IFILE != NULL && argc != 2))
+      { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+        exit (1);
+      }
+  }
+
+  //  Create the output files (all are opened for writing, so any existing content is
+  //  replaced).  Set up variables as follows:
+  //    dbname = full name of map index = <pwd>/<root>.dam
+  //    ostub  = the .dam stub file being written
+  //    bases  = .bps file
+  //    indx   = .idx file
+  //    hdrs   = .hdr file
+  //    ureads = # of reads written so far
+  //    boff   = offset in .bps at which to place next sequence
+  //    hoff   = offset in .hdr at which to place next header
+  //    ifiles = # of .fasta files to add
+  //    ofiles = # of .fasta files added so far
+  //    flist  = [0..ifiles] list of file names (root only) added so far
+
+  root   = Root(argv[1],".dam");
+  pwd    = PathTo(argv[1]);
+  dbname = Strdup(Catenate(pwd,"/",root,".dam"),"Allocating map index name");
+  if (dbname == NULL)
+    exit (1);
+
+  if (IFILE == NULL)
+    ifiles = argc-2;
+  else
+    { File_Iterator *ng;
+
+      //  Names come from the list file: count its lines with a throw-away iterator
+
+      ifiles = 0;
+      ng = init_file_iterator(argc,argv,IFILE,2);
+      while (next_file(ng))
+        ifiles += 1;
+      free(ng);
+    }
+  ofiles = 0;
+
+  bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"w");
+  indx  = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"w");
+  hdrs  = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"w");
+  if (bases == NULL || indx == NULL || hdrs == NULL)
+    exit (1);
+
+  flist = (char **) Malloc(sizeof(char *)*ifiles,"Allocating file list");
+
+  //  Reserve room for the db record at the head of .idx, the real values are written
+  //    over it once all the files have been processed
+
+  fwrite(&db,sizeof(HITS_DB),1,indx);
+
+  ureads = 0;
+  boff   = 0;
+  hoff   = 0;
+
+  //  A NULL flist is only an error if there is something to store in it
+
+  ostub = Fopen(dbname,"w+");
+  if (ostub == NULL || (flist == NULL && ifiles > 0))
+    exit (1);
+
+  //  ifiles, not argc-2: with -f<file> there are no file arguments on the command line,
+  //    so argc-2 would record a file count of 0 in the stub
+
+  fprintf(ostub,DB_NFILE,ifiles);
+
+  { int            maxlen;
+    int64          totlen, count[4];
+    int            rmax;
+    HITS_READ      prec;
+    char          *read;
+    int            c;
+    File_Iterator *ng;
+
+    //  Buffer for accumulating .fasta sequence over multiple lines
+
+    rmax = MAX_NAME + 60000;
+    read = (char *) Malloc(rmax+1,"Allocating line buffer");
+    if (read == NULL)
+      goto error;
+
+    totlen = 0;              //  total # of bases in new .fasta files
+    maxlen = 0;              //  longest read in new .fasta files
+    for (c = 0; c < 4; c++)  //  count of acgt in new .fasta files
+      count[c] = 0;
+
+    //  For each .fasta file do:
+
+    ng = init_file_iterator(argc,argv,IFILE,2);
+    while (next_file(ng))
+      { FILE *input;
+        char *path, *core;
+        int   nline, eof, rlen;
+
+        if (ng->name == NULL) goto error;
+
+        //  Open it: <path>/<core>.fasta, and check that it is not already in flist.
+
+        path = PathTo(ng->name);
+        core = Root(ng->name,".fasta");
+        if ((input = Fopen(Catenate(path,"/",core,".fasta"),"r")) == NULL)
+          goto error;
+        free(path);
+
+        { int j;
+
+          for (j = 0; j < ofiles; j++)
+            if (strcmp(core,flist[j]) == 0)
+              { fprintf(stderr,"%s: File %s.fasta is already in database %s.dam\n",
+                               Prog_Name,core,root);
+                goto error;
+              }
+        }
+
+        //  Get the header of the first line.  If the file is empty skip.
+
+        rlen  = 0;
+        nline = 1;
+        eof   = (fgets(read,MAX_NAME,input) == NULL);
+        if (eof || strlen(read) < 1)
+          { fprintf(stderr,"Skipping '%s', file is empty!\n",core);
+            fclose(input);
+            free(core);
+            continue;
+          }
+
+        //  Add the file name to flist
+
+        if (VERBOSE)
+          { fprintf(stderr,"Adding '%s' ...\n",core);
+            fflush(stderr);
+          }
+        flist[ofiles++] = core;
+
+        //  Check that the first line fit in the buffer and that it is a header line
+
+        if (read[strlen(read)-1] != '\n')
+          { fprintf(stderr,"File %s.fasta, Line 1: Fasta line is too long (> %d chars)\n",
+                           core,MAX_NAME-2);
+            goto error;
+          }
+        if (!eof && read[0] != '>')
+          { fprintf(stderr,"File %s.fasta, Line 1: First header in fasta file is missing\n",core);
+            goto error;
+          }
+
+        //  Read in all the sequences until end-of-file
+
+        { int i, x, n;
+
+          while (!eof)
+            { int hlen;
+
+              //  read+rlen holds the current header line.  Its '>' was overwritten by the
+              //    terminator of the previous sequence, so put it back before saving.
+
+              read[rlen] = '>';
+              hlen = strlen(read+rlen);
+              fwrite(read+rlen,1,hlen,hdrs);
+
+              //  Concatenate the sequence lines up to the next header or end of file.
+              //    Each new line is read on top of the newline of the previous one.
+
+              rlen = 0;
+              while (1)
+                { eof = (fgets(read+rlen,MAX_NAME,input) == NULL);
+                  nline += 1;
+                  x = strlen(read+rlen)-1;
+                  if (read[rlen+x] != '\n')
+                    { fprintf(stderr,"File %s.fasta, Line %d:",core,nline);
+                      fprintf(stderr," Fasta line is too long (> %d chars)\n",MAX_NAME-2);
+                      goto error;
+                    }
+                  if (eof || read[rlen] == '>')
+                    break;
+                  rlen += x;
+                  if (rlen + MAX_NAME > rmax)
+                    { rmax = ((int) (1.2 * rmax)) + 1000 + MAX_NAME;
+                      read = (char *) realloc(read,rmax+1);
+                      if (read == NULL)
+                        { fprintf(stderr,"File %s.fasta, Line %d:",core,nline);
+                          fprintf(stderr," Out of memory (Allocating line buffer)\n");
+                          goto error;
+                        }
+                    }
+                }
+              read[rlen] = '\0';
+
+              //  Cut the entry into contigs: maximal runs of characters that number[]
+              //    maps below 4.  n is the ordinal of the contig within this entry.
+              //  NOTE(review): number[] has 128 entries and is indexed with a plain char,
+              //    so input bytes outside 0..127 index outside the table -- confirm that
+              //    inputs are guaranteed to be 7-bit.
+
+              n = 0;
+              i = -1;
+              while (i < rlen)
+                { int pbeg, plen, clen;
+
+                  //  Skip ahead to the first character of the next contig
+
+                  while (i < rlen)
+                    if (number[(int) read[++i]] < 4)
+                      break;
+
+                  if (i >= rlen) break;
+
+                  pbeg = i;
+                  prec.fpulse = pbeg;
+                  prec.origin = n++;
+                  prec.boff   = boff;
+                  prec.coff   = hoff;
+                  prec.flags  = DB_BEST;
+
+                  //  Convert the contig to base numbers in place and tally base counts
+
+                  while (i < rlen)
+                    { x = number[(int) read[i]];
+                      if (x >= 4) break;
+                      count[x] += 1;
+                      read[i++] = (char) x;
+                    }
+                  prec.rlen = plen = i-pbeg;
+                  ureads += 1;
+                  totlen += plen;
+                  if (plen > maxlen)
+                    maxlen = plen;
+
+                  Compress_Read(plen,read+pbeg);
+                  clen = COMPRESSED_LEN(plen);
+                  fwrite(read+pbeg,1,clen,bases);
+                  boff += clen;
+
+                  fwrite(&prec,sizeof(HITS_READ),1,indx);
+                }
+              hoff += hlen;
+            }
+
+          fprintf(ostub,DB_FDATA,ureads,core,core);
+
+          fclose(input);
+        }
+      }
+
+    //  Update relevant fields in db record
+
+    db.ureads = ureads;
+    db.treads = ureads;
+    for (c = 0; c < 4; c++)
+      db.freq[c] = (float) ((1.*count[c])/totlen);
+    db.totlen = totlen;
+    db.maxlen = maxlen;
+    db.cutoff = -1;
+  }
+
+  rewind(indx);
+  fwrite(&db,sizeof(HITS_DB),1,indx);   //  Write the finalized db record into .idx
+
+  fclose(ostub);
+  fclose(indx);
+  fclose(bases);
+  fclose(hdrs);
+
+  exit (0);
+
+  //  Error exit:  Remove the .idx, .bps, .hdr, and .dam files
+
+error:
+  fclose(ostub);
+  fclose(indx);
+  fclose(hdrs);
+  fclose(bases);
+  unlink(Catenate(pwd,PATHSEP,root,".idx"));
+  unlink(Catenate(pwd,PATHSEP,root,".bps"));
+  unlink(Catenate(pwd,PATHSEP,root,".hdr"));
+  unlink(Catenate(pwd,"/",root,".dam"));
+
+  exit (1);
+}
diff --git a/fasta2DB.c b/fasta2DB.c
new file mode 100644
index 0000000..50061d5
--- /dev/null
+++ b/fasta2DB.c
@@ -0,0 +1,668 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
+ * Add .fasta files to a DB:
+ * Adds the given fasta files in the given order to <path>.db. If the db does not exist
+ * then it is created. All .fasta files added to a given data base must have the same
+ * header format and follow Pacbio's convention. A file cannot be added twice and this
+ * is enforced. The command either builds or appends to the .<path>.idx and .<path>.bps
+ * files, where the index file (.idx) contains information about each read and their offsets
+ * in the base-pair file (.bps) that holds the sequences where each base is compressed
+ * into 2-bits. The two files are hidden by virtue of their names beginning with a '.'.
+ * <path>.db is effectively a stub file with given name that contains an ASCII listing
+ * of the files added to the DB and possibly the block partitioning for the DB if DBsplit
+ * has been called upon it.
+ *
+ * Author: Gene Myers
+ * Date : May 2013
+ * Modify: DB upgrade: now *add to* or create a DB depending on whether it exists, read
+ * multiple .fasta files (no longer a stdin pipe).
+ * Date : April 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <strings.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "DB.h"
+
+#ifdef HIDE_FILES              // PATHSEP is placed between the directory and the root name of the .bps/.idx files
+#define PATHSEP "/."           // hidden variant: the file name starts with a '.'
+#else
+#define PATHSEP "/"            // plain variant: ordinary, visible file name
+#endif
+
+static char *Usage = "[-v] <path:string> ( -f<file> | <input:fasta> ... )";   // printed after the program name when the arguments are wrong
+
+static char number[128] =      // Lookup from 7-bit character code to base number: c->1, g->2, t->3, every other code (including a) -> 0
+ { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 0, 0, 0, 2,        // codes 64-71:   'C' (67) -> 1, 'G' (71) -> 2
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 3, 0, 0, 0,        // codes 80-87:   'T' (84) -> 3
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 0, 0, 0, 2,        // codes 96-103:  'c' (99) -> 1, 'g' (103) -> 2
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 3, 0, 0, 0,        // codes 112-119: 't' (116) -> 3
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ };
+
+typedef struct                 // Cursor over the input file names: either command-line arguments or the lines of a list file
+ { int argc;                   // argument count handed to init_file_iterator
+ char **argv;                  // argument vector handed to init_file_iterator
+ FILE *input;                  // open list of file names, or NULL when the names are taken from argv
+ int count;                    // next argv index to hand out when input is NULL, otherwise the 1-based number of the next list line
+ char *name;                   // file name set by the most recent call to next_file
+ } File_Iterator;
+
+//  Allocate and set up an iterator over the input file names.
+//    argc, argv:  the (flag-stripped) command line
+//    input:       open file holding one name per line, or NULL to take names from argv
+//    first:       index in argv of the first file name (only used when input is NULL)
+//  The iterator is heap allocated and the caller releases it with free().  If the
+//  allocation fails the program exits with status 1 instead of dereferencing NULL.
+
+File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first)
+{ File_Iterator *it;
+
+  it = Malloc(sizeof(File_Iterator),"Allocating file iterator");
+  if (it == NULL)                  //  Malloc can return NULL (main checks it for its own buffers)
+    exit (1);
+
+  it->argc  = argc;
+  it->argv  = argv;
+  it->input = input;
+  if (input == NULL)
+    it->count = first;             //  Names come from argv[first..argc-1]
+  else
+    { it->count = 1;               //  Names come from the list file: count is the line number
+      rewind(input);               //  The list is scanned more than once, so always start at the top
+    }
+  return (it);
+}
+
+//  Advance the iterator to the next input file name.
+//    Returns 0 when the names are exhausted, otherwise 1 with it->name set.
+//    When the names come from a list file and a line does not fit in the buffer, a
+//    message is printed and 1 is returned with it->name == NULL so the caller can abort.
+//    For list files it->name points into a static buffer that the next call overwrites.
+
+int next_file(File_Iterator *it)
+{ static char nbuffer[MAX_NAME+8];
+
+  if (it->input == NULL)
+    { if (it->count >= it->argc)
+        return (0);
+      it->name = it->argv[it->count++];
+      return (1);
+    }
+
+  if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL)
+    { if (feof(it->input))
+        return (0);
+      SYSTEM_ERROR;
+    }
+
+  { char *eol;
+
+    eol = strchr(nbuffer,'\n');
+    if (eol != NULL)
+      *eol = '\0';
+    else if ((int) strlen(nbuffer) >= MAX_NAME+7)
+
+      //  No newline and the buffer is full: the line is too long.  Return at once, the
+      //    original fell through and wrote through the NULL eol pointer.
+
+      { fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n",
+                       Prog_Name,it->count,MAX_NAME+7);
+        it->name = NULL;
+        return (1);
+      }
+
+    //  Otherwise no newline only means the last line of the list is unterminated,
+    //    which is a perfectly good name.
+  }
+
+  it->count += 1;
+  it->name   = nbuffer;
+  return (1);
+}
+
+
+int main(int argc, char *argv[])
+{ FILE *istub, *ostub;
+ char *dbname;
+ char *root, *pwd;
+
+ FILE *bases, *indx;
+ int64 boff, ioff;
+
+ int ifiles, ofiles;
+ char **flist;
+
+ HITS_DB db;
+ int ureads;
+ int64 offset;
+
+ FILE *IFILE;
+ int VERBOSE;
+
+ // Usage: [-v] <path:string> ( -f<file> | <input:fasta> ... )
+
+ { int i, j, k;
+ int flags[128];
+
+ ARG_INIT("fasta2DB")
+
+ IFILE = NULL;
+
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ switch (argv[i][1])
+ { default:
+ ARG_FLAGS("v")
+ break;
+ case 'f':
+ IFILE = fopen(argv[i]+2,"r");
+ if (IFILE == NULL)
+ { fprintf(stderr,"%s: Cannot open file of inputs '%s'\n",Prog_Name,argv[i]+2);
+ exit (1);
+ }
+ break;
+ }
+ else
+ argv[j++] = argv[i];
+ argc = j;
+
+ VERBOSE = flags['v'];
+
+ if ((IFILE == NULL && argc <= 2) || (IFILE != NULL && argc != 2))
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+ }
+
+ // Try to open DB file, if present then adding to DB, otherwise creating new DB. Set up
+ // variables as follows:
+ // dbname = full name of db = <pwd>/<root>.db
+ // istub = open db file (if adding) or NULL (if creating)
+ // ostub = new image of db file (will overwrite old image at end)
+ // bases = .bps file positioned for appending
+ // indx = .idx file positioned for appending
+ // ureads = # of reads currently in db
+ // offset = offset in .bps at which to place next sequence
+ // ioff = offset in .idx file to truncate to if command fails
+ // boff = offset in .bps file to truncate to if command fails
+ // ifiles = # of .fasta files to add
+ // ofiles = # of .fasta files already in db
+ // flist = [0..ifiles+ofiles] list of file names (root only) added to db so far
+
+ { int i;
+
+ root = Root(argv[1],".db");
+ pwd = PathTo(argv[1]);
+ dbname = Strdup(Catenate(pwd,"/",root,".db"),"Allocating db name");
+ if (dbname == NULL)
+ exit (1);
+
+ if (IFILE == NULL)
+ ifiles = argc-2;
+ else
+ { File_Iterator *ng;
+
+ ifiles = 0;
+ ng = init_file_iterator(argc,argv,IFILE,2);
+ while (next_file(ng))
+ ifiles += 1;
+ free(ng);
+ }
+
+ istub = fopen(dbname,"r");
+ if (istub == NULL)
+ { ofiles = 0;
+
+ bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"w+");
+ indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"w+");
+ if (bases == NULL || indx == NULL)
+ exit (1);
+
+ fwrite(&db,sizeof(HITS_DB),1,indx);
+
+ ureads = 0;
+ offset = 0;
+ boff = 0;
+ ioff = 0;
+ }
+ else
+ { if (fscanf(istub,DB_NFILE,&ofiles) != 1)
+ SYSTEM_ERROR
+
+ bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"r+");
+ indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+");
+ if (bases == NULL || indx == NULL)
+ exit (1);
+
+ if (fread(&db,sizeof(HITS_DB),1,indx) != 1)
+ SYSTEM_ERROR
+ fseeko(bases,0,SEEK_END);
+ fseeko(indx, 0,SEEK_END);
+
+ ureads = db.ureads;
+ offset = ftello(bases);
+ boff = offset;
+ ioff = ftello(indx);
+ }
+
+ flist = (char **) Malloc(sizeof(char *)*(ofiles+ifiles),"Allocating file list");
+ ostub = Fopen(Catenate(pwd,"/",root,".dbx"),"w+");
+ if (ostub == NULL || flist == NULL)
+ exit (1);
+
+ fprintf(ostub,DB_NFILE,ofiles+ifiles);
+ for (i = 0; i < ofiles; i++)
+ { int last;
+ char prolog[MAX_NAME], fname[MAX_NAME];
+
+ if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
+ SYSTEM_ERROR
+ if ((flist[i] = Strdup(fname,"Adding to file list")) == NULL)
+ goto error;
+ fprintf(ostub,DB_FDATA,last,fname,prolog);
+ }
+ }
+
+ { int maxlen;
+ int64 totlen, count[4];
+ int pmax, rmax;
+ HITS_READ *prec;
+ char *read;
+ int c;
+ File_Iterator *ng;
+
+ // Buffer for reads all in the same well
+
+ pmax = 100;
+ prec = (HITS_READ *) Malloc(sizeof(HITS_READ)*pmax,"Allocating record buffer");
+ if (prec == NULL)
+ goto error;
+
+ // Buffer for accumulating .fasta sequence over multiple lines
+
+ rmax = MAX_NAME + 60000;
+ read = (char *) Malloc(rmax+1,"Allocating line buffer");
+ if (read == NULL)
+ goto error;
+
+ totlen = 0; // total # of bases in new .fasta files
+ maxlen = 0; // longest read in new .fasta files
+ for (c = 0; c < 4; c++) // count of acgt in new .fasta files
+ count[c] = 0;
+
+ // For each new .fasta file do:
+
+ ng = init_file_iterator(argc,argv,IFILE,2);
+ while (next_file(ng))
+ { FILE *input;
+ char *path, *core, *prolog;
+ int nline, eof, rlen, pcnt;
+ int pwell;
+
+ if (ng->name == NULL) goto error;
+
+ // Open it: <path>/<core>.fasta, check that core is not too long,
+ // and checking that it is not already in flist.
+
+ path = PathTo(ng->name);
+ core = Root(ng->name,".fasta");
+ if ((input = Fopen(Catenate(path,"/",core,".fasta"),"r")) == NULL)
+ goto error;
+ free(path);
+ if (strlen(core) >= MAX_NAME)
+ { fprintf(stderr,"%s: File name over %d chars: '%.200s'\n",
+ Prog_Name,MAX_NAME,core);
+ goto error;
+ }
+
+ { int j;
+
+ for (j = 0; j < ofiles; j++)
+ if (strcmp(core,flist[j]) == 0)
+ { fprintf(stderr,"%s: File %s.fasta is already in database %s.db\n",
+ Prog_Name,core,Root(argv[1],".db"));
+ goto error;
+ }
+ }
+
+ // Get the header of the first line. If the file is empty skip.
+
+ pcnt = 0;
+ rlen = 0;
+ nline = 1;
+ eof = (fgets(read,MAX_NAME,input) == NULL);
+ if (eof || strlen(read) < 1)
+ { fprintf(stderr,"Skipping '%s', file is empty!\n",core);
+ fclose(input);
+ free(core);
+ continue;
+ }
+
+ // Add the file name to flist
+
+ if (VERBOSE)
+ { fprintf(stderr,"Adding '%s' ...\n",core);
+ fflush(stderr);
+ }
+ flist[ofiles++] = core;
+
+ // Check that the first line has PACBIO format, and record prolog in 'prolog'.
+
+ if (read[strlen(read)-1] != '\n')
+ { fprintf(stderr,"File %s.fasta, Line 1: Fasta line is too long (> %d chars)\n",
+ core,MAX_NAME-2);
+ goto error;
+ }
+ if (!eof && read[0] != '>')
+ { fprintf(stderr,"File %s.fasta, Line 1: First header in fasta file is missing\n",core);
+ goto error;
+ }
+
+ { char *find;
+ int well, beg, end, qv;
+
+ find = index(read+1,'/');
+ if (find != NULL && sscanf(find+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) >= 3)
+ { *find = '\0';
+ prolog = Strdup(read+1,"Extracting prolog");
+ *find = '/';
+ if (prolog == NULL) goto error;
+ }
+ else
+ { fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line format error\n",
+ core,nline);
+ goto error;
+ }
+ }
+
+ // Read in all the sequences until end-of-file
+
+ { int i, x;
+
+ pwell = -1;
+ while (!eof)
+ { int beg, end, clen, hline;
+ int well, qv;
+ char *find;
+
+ find = index(read+(rlen+1),'/');
+ if (find == NULL)
+ { fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line format error\n",
+ core,nline);
+ goto error;
+ }
+ *find = '\0';
+ if (strcmp(read+(rlen+1),prolog) != 0)
+ { fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line name inconsisten\n",
+ core,nline);
+ goto error;
+ }
+ *find = '/';
+ x = sscanf(find+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv);
+ if (x < 3)
+ { fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line format error\n",
+ core,nline);
+ goto error;
+ }
+ else if (x == 3)
+ qv = 0;
+
+ hline = nline;
+ rlen = 0;
+ while (1)
+ { eof = (fgets(read+rlen,MAX_NAME,input) == NULL);
+ nline += 1;
+ x = strlen(read+rlen)-1;
+ if (read[rlen+x] != '\n')
+ { fprintf(stderr,"File %s.fasta, Line %d:",core,nline);
+ fprintf(stderr," Fasta line is too long (> %d chars)\n",MAX_NAME-2);
+ goto error;
+ }
+ if (eof || read[rlen] == '>')
+ break;
+ rlen += x;
+ if (rlen + MAX_NAME > rmax)
+ { rmax = ((int) (1.2 * rmax)) + 1000 + MAX_NAME;
+ read = (char *) realloc(read,rmax+1);
+ if (read == NULL)
+ { fprintf(stderr,"File %s.fasta, Line %d:",core,nline);
+ fprintf(stderr," Out of memory (Allocating line buffer)\n");
+ goto error;
+ }
+ }
+ }
+ read[rlen] = '\0';
+
+ for (i = 0; i < rlen; i++)
+ { x = number[(int) read[i]];
+ count[x] += 1;
+ read[i] = (char) x;
+ }
+ ureads += 1;
+ totlen += rlen;
+ if (rlen > maxlen)
+ maxlen = rlen;
+
+ prec[pcnt].origin = well;
+ prec[pcnt].fpulse = beg;
+ prec[pcnt].rlen = rlen;
+ prec[pcnt].boff = offset;
+ prec[pcnt].coff = -1;
+ prec[pcnt].flags = qv;
+
+ Compress_Read(rlen,read);
+ clen = COMPRESSED_LEN(rlen);
+ fwrite(read,1,clen,bases);
+ offset += clen;
+
+ if (pwell == well)
+ { prec[pcnt].flags |= DB_CSS;
+ pcnt += 1;
+ if (pcnt >= pmax)
+ { pmax = ((int) (pcnt*1.2)) + 100;
+ prec = (HITS_READ *) realloc(prec,sizeof(HITS_READ)*pmax);
+ if (prec == NULL)
+ { fprintf(stderr,"File %s.fasta, Line %d: Out of memory",core,nline);
+ fprintf(stderr," (Allocating read records)\n");
+ goto error;
+ }
+ }
+ }
+ else if (pcnt == 0)
+ pcnt += 1;
+ else
+ { x = 0;
+ for (i = 1; i < pcnt; i++)
+ if (prec[i].rlen > prec[x].rlen)
+ x = i;
+ prec[x].flags |= DB_BEST;
+ fwrite(prec,sizeof(HITS_READ),pcnt,indx);
+ prec[0] = prec[pcnt];
+ pcnt = 1;
+ }
+ pwell = well;
+ }
+
+ // Complete processing of .fasta file: flush last well group, write file line
+ // in db image, free prolog, and close file
+
+ x = 0;
+ for (i = 1; i < pcnt; i++)
+ if (prec[i].rlen > prec[x].rlen)
+ x = i;
+ prec[x].flags |= DB_BEST;
+ fwrite(prec,sizeof(HITS_READ),pcnt,indx);
+
+ fprintf(ostub,DB_FDATA,ureads,core,prolog);
+ }
+
+ free(prolog);
+ fclose(input);
+ }
+
+ // Finished loading all sequences: update relevant fields in db record
+
+ db.ureads = ureads;
+ if (istub == NULL)
+ { for (c = 0; c < 4; c++)
+ db.freq[c] = (float) ((1.*count[c])/totlen);
+ db.totlen = totlen;
+ db.maxlen = maxlen;
+ db.cutoff = -1;
+ }
+ else
+ { for (c = 0; c < 4; c++)
+ db.freq[c] = (float) ((db.freq[c]*db.totlen + (1.*count[c]))/(db.totlen + totlen));
+ db.totlen += totlen;
+ if (maxlen > db.maxlen)
+ db.maxlen = maxlen;
+ }
+ }
+
+ // If db has been previously partitioned then calculate additional partition points and
+ // write to new db file image
+
+ if (db.cutoff >= 0)
+ { int64 totlen, dbpos, size;
+ int nblock, ireads, tfirst, rlen;
+ int ufirst, cutoff, allflag;
+ HITS_READ record;
+ int i;
+
+ if (VERBOSE)
+ { fprintf(stderr,"Updating block partition ...\n");
+ fflush(stderr);
+ }
+
+ // Read the block portion of the existing db image getting the indices of the first
+ // read in the last block of the existing db as well as the partition parameters.
+ // Copy the old image block information to the new block information (except for
+ // the indices of the last partial block)
+
+ if (fscanf(istub,DB_NBLOCK,&nblock) != 1)
+ SYSTEM_ERROR
+ dbpos = ftello(ostub);
+ fprintf(ostub,DB_NBLOCK,0);
+ if (fscanf(istub,DB_PARAMS,&size,&cutoff,&allflag) != 3)
+ SYSTEM_ERROR
+ fprintf(ostub,DB_PARAMS,size,cutoff,allflag);
+ if (allflag)
+ allflag = 0;
+ else
+ allflag = DB_BEST;
+ size *= 1000000ll;
+
+ nblock -= 1;
+ for (i = 0; i <= nblock; i++)
+ { if (fscanf(istub,DB_BDATA,&ufirst,&tfirst) != 2)
+ SYSTEM_ERROR
+ fprintf(ostub,DB_BDATA,ufirst,tfirst);
+ }
+
+ // Seek the first record of the last block of the existing db in .idx, and then
+ // compute and record partition indices for the rest of the db from this point
+ // forward.
+
+ fseeko(indx,sizeof(HITS_DB)+sizeof(HITS_READ)*ufirst,SEEK_SET);
+ totlen = 0;
+ ireads = 0;
+ for (i = ufirst; i < ureads; i++)
+ { if (fread(&record,sizeof(HITS_READ),1,indx) != 1)
+ SYSTEM_ERROR
+ rlen = record.rlen;
+ if (rlen >= cutoff && (record.flags & DB_BEST) >= allflag)
+ { ireads += 1;
+ tfirst += 1;
+ totlen += rlen;
+ if (totlen >= size)
+ { fprintf(ostub," %9d %9d\n",i+1,tfirst);
+ totlen = 0;
+ ireads = 0;
+ nblock += 1;
+ }
+ }
+ }
+
+ if (ireads > 0)
+ { fprintf(ostub,DB_BDATA,ureads,tfirst);
+ nblock += 1;
+ }
+
+ db.treads = tfirst;
+
+ fseeko(ostub,dbpos,SEEK_SET);
+ fprintf(ostub,DB_NBLOCK,nblock); // Rewind and record the new number of blocks
+ }
+ else
+ db.treads = ureads;
+
+ rewind(indx);
+ fwrite(&db,sizeof(HITS_DB),1,indx); // Write the finalized db record into .idx
+
+ rewind(ostub); // Rewrite the number of files actually added
+ fprintf(ostub,DB_NFILE,ofiles);
+
+ if (istub != NULL)
+ fclose(istub);
+ fclose(ostub);
+ fclose(indx);
+ fclose(bases);
+
+ rename(Catenate(pwd,"/",root,".dbx"),dbname); // New image replaces old image
+
+ exit (0);
+
+ // Error exit: Either truncate or remove the .idx and .bps files as appropriate.
+ // Remove the new image file <pwd>/<root>.dbx
+
+error:
+ if (ioff != 0)
+ { fseeko(indx,0,SEEK_SET);
+ if (ftruncate(fileno(indx),ioff) < 0)
+ SYSTEM_ERROR
+ }
+ if (boff != 0)
+ { fseeko(bases,0,SEEK_SET);
+ if (ftruncate(fileno(bases),boff) < 0)
+ SYSTEM_ERROR
+ }
+ fclose(indx);
+ fclose(bases);
+ if (ioff == 0)
+ unlink(Catenate(pwd,PATHSEP,root,".idx"));
+ if (boff == 0)
+ unlink(Catenate(pwd,PATHSEP,root,".bps"));
+
+ if (istub != NULL)
+ fclose(istub);
+ fclose(ostub);
+ unlink(Catenate(pwd,"/",root,".dbx"));
+
+ exit (1);
+}
diff --git a/quiva2DB.c b/quiva2DB.c
new file mode 100644
index 0000000..0ec2628
--- /dev/null
+++ b/quiva2DB.c
@@ -0,0 +1,384 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
+ * Adds the given .quiva files to an existing DB "path". The input files must be added in
+ * the same order as the .fasta files were and have the same root names, e.g. FOO.fasta
+ * and FOO.quiva. The files can be added incrementally but must be added in the same order
+ * as the .fasta files. This is enforced by the program. With the -l option set the
+ * compression scheme is a bit lossy to get more compression (see the description of dexqv
+ * in the DEXTRACTOR module).
+ *
+ * Author: Gene Myers
+ * Date : July 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <strings.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "DB.h"
+#include "QV.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."  // placed between directory and root for the .idx/.qvs names: dot-prefixed
+#else
+#define PATHSEP "/"  // placed between directory and root for the .idx/.qvs names: plain
+#endif
+ // Argument synopsis printed after "Usage: <program>" when the command line is malformed.
+static char *Usage = "[-vl] <path:string> ( -f<file> | <input:quiva> ... )";
+
+typedef struct  // Cursor over the input names, taken from argv or from a list file
+ { int argc;  // argument count handed to init_file_iterator
+ char **argv;  // argument vector; names are taken from it when input is NULL
+ FILE *input;  // list file with one name per line, or NULL to iterate over argv
+ int count;  // next argv index (argv mode) or 1-based line number (list-file mode)
+ char *name;  // name produced by the latest next_file call; callers test it for NULL
+ } File_Iterator;
+
+// Allocate and set up an iterator over the input file names.
+//   argc, argv : the (already option-stripped) command line.
+//   input      : if non-NULL, names are read from this file, one per line, and it is
+//                rewound here so that iteration always starts at its first line.
+//   first      : index in argv of the first name; only used when input is NULL.
+// Returns the new iterator, which the caller releases with free().  If the allocation
+// fails the program exits with status 1, because the callers in this file use the
+// result without checking it (Malloc is handed the message to report).
+
+File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first)
+{ File_Iterator *it;
+
+  it = Malloc(sizeof(File_Iterator),"Allocating file iterator");
+  if (it == NULL)
+    exit (1);
+
+  it->argc  = argc;
+  it->argv  = argv;
+  it->input = input;
+  it->name  = NULL;       //  No current name until next_file has been called
+  if (input == NULL)
+    it->count = first;
+  else
+    { it->count = 1;      //  Line number within the list file, for error messages
+      rewind(input);
+    }
+  return (it);
+}
+
+// Advance the iterator to the next input name.
+// Returns 0 when the names are exhausted and 1 otherwise.  After a return of 1,
+// it->name is the next name, or NULL if the next line of the list file did not fit in
+// the line buffer (a message has then been printed; callers test for NULL and abort).
+// In list-file mode the name lives in a static buffer that the next call overwrites.
+
+int next_file(File_Iterator *it)
+{ static char nbuffer[MAX_NAME+8];
+
+  if (it->input == NULL)
+    { if (it->count >= it->argc)
+        return (0);
+      it->name = it->argv[it->count++];
+    }
+  else
+    { char *eol;
+
+      if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL)
+        { if (feof(it->input))
+            return (0);
+          SYSTEM_ERROR;
+        }
+
+      eol = index(nbuffer,'\n');
+      if (eol != NULL)
+        { *eol = '\0';
+          it->name = nbuffer;
+        }
+      else if (feof(it->input))    //  Last line of the list simply lacks a newline
+        it->name = nbuffer;
+      else                         //  Line overflowed the buffer: flag it, and do not
+        {                          //    write through the NULL eol pointer
+          fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n",
+                         Prog_Name,it->count,MAX_NAME+7);
+          it->name = NULL;
+        }
+      it->count += 1;
+    }
+  return (1);
+}
+
+
+// quiva2DB entry point: [-vl] <path:string> ( -f<file> | <input:quiva> ... )
+//
+//   1. Parse the options (-v verbose, -l lossy, -f<file> list of inputs).
+//   2. Open the DB stub and .idx, load the read records, and verify that the given
+//      .quiva files name consecutive files of the DB, starting at the first file that
+//      has no quality values yet.
+//   3. For each .quiva file append its coding scheme and its compressed entries to the
+//      .qvs file, recording the .qvs offsets in the coff field of the read records.
+//   4. Rewrite the db record and the read records into .idx.
+//
+// Exits with 0 on success.  On an error in step 3 the .qvs file is truncated back to its
+// initial size (or removed if it was empty) and the exit status is 1; the .idx is then
+// left as it was because the records are only written back in step 4.
+
+int main(int argc, char *argv[])
+{ FILE      *istub, *quiva, *indx;
+  int64      coff;
+  int        ofile;
+  HITS_DB    db;
+  HITS_READ *reads;
+
+  int        VERBOSE;
+  int        LOSSY;
+  FILE      *IFILE;
+
+  //  Process command line
+
+  { int   i, j, k;
+    int   flags[128];
+
+    ARG_INIT("quiva2DB")
+
+    IFILE = NULL;
+
+    j = 1;
+    for (i = 1; i < argc; i++)
+      if (argv[i][0] == '-')
+        switch (argv[i][1])
+        { default:
+            ARG_FLAGS("vl")
+            break;
+          case 'f':
+            IFILE = fopen(argv[i]+2,"r");
+            if (IFILE == NULL)
+              { fprintf(stderr,"%s: Cannot open file of inputs '%s'\n",Prog_Name,argv[i]+2);
+                exit (1);
+              }
+            break;
+        }
+      else
+        argv[j++] = argv[i];
+    argc = j;
+
+    VERBOSE = flags['v'];
+    LOSSY   = flags['l'];
+
+    if ((IFILE == NULL && argc <= 2) || (IFILE != NULL && argc != 2))
+      { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+        exit (1);
+      }
+  }
+
+  //  Open DB stub file and index, load db and read records.  Confirm that the .fasta files
+  //    corresponding to the command line .quiva files are in the DB and in order where the
+  //    index of the first file is ofile and the index of its first read is first.
+  //    Record in coff the current size of the .qvs file in case an error occurs and it needs
+  //    to be truncated back to its size at the start.
+
+  { int            i;
+    char          *pwd, *root;
+    int            nfiles;
+    File_Iterator *ng;
+
+    root   = Root(argv[1],".db");
+    pwd    = PathTo(argv[1]);
+    istub  = Fopen(Catenate(pwd,"/",root,".db"),"r");
+    if (istub == NULL)
+      exit (1);
+
+    indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+");
+    if (indx == NULL)
+      exit (1);
+    if (fread(&db,sizeof(HITS_DB),1,indx) != 1)
+      SYSTEM_ERROR
+
+    reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*db.ureads,"Allocating DB index");
+    if (reads == NULL)
+      exit (1);
+    if (fread(reads,sizeof(HITS_READ),db.ureads,indx) != (size_t) (db.ureads))
+      SYSTEM_ERROR
+
+    { int   first, last;
+      char  prolog[MAX_NAME], fname[MAX_NAME];
+      char *core;
+
+      ng = init_file_iterator(argc,argv,IFILE,2);
+      if ( ! next_file(ng))
+        { fprintf(stderr,"%s: file list is empty!\n",Prog_Name);
+          exit (1);
+        }
+      if (ng->name == NULL) exit (1);
+
+      core = Root(ng->name,".quiva");
+
+      //  Locate the first .quiva file among the files of the DB
+
+      if (fscanf(istub,DB_NFILE,&nfiles) != 1)
+        SYSTEM_ERROR
+      first = 0;
+      for (i = 0; i < nfiles; i++)
+        { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
+            SYSTEM_ERROR
+          if (strcmp(core,fname) == 0)
+            break;
+          first = last;
+        }
+      if (i >= nfiles)
+        { fprintf(stderr,"%s: %s.fasta has never been added to DB\n",Prog_Name,core);
+          exit (1);
+        }
+
+      ofile = i;
+      if (first > 0 && reads[first-1].coff < 0)
+        { fprintf(stderr,"%s: Predecessor of %s.quiva has not been added yet\n",Prog_Name,core);
+          exit (1);
+        }
+      if (reads[first].coff >= 0)
+        { fprintf(stderr,"%s: %s.quiva has already been added\n",Prog_Name,core);
+          exit (1);
+        }
+
+      //  The remaining .quiva files must name the DB files that follow, in order
+
+      while (next_file(ng))
+        { if (ng->name == NULL)
+            exit (1);
+          free(core);                    //  Release the previous name before replacing it
+          core = Root(ng->name,".quiva");
+          if (++i >= nfiles)
+            { fprintf(stderr,"%s: %s.fasta has never been added to DB\n",Prog_Name,core);
+              exit (1);
+            }
+          if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
+            SYSTEM_ERROR
+          if (strcmp(core,fname) != 0)
+            { fprintf(stderr,"%s: Files not being added in order (expect %s, given %s)\n",
+                             Prog_Name,fname,core);
+              exit (1);
+            }
+        }
+
+      if (ofile == 0)
+        quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"w");
+      else
+        quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"r+");
+      if (quiva == NULL)
+        exit (1);
+
+      fseeko(quiva,0,SEEK_END);
+      coff = ftello(quiva);
+
+      free(core);
+      free(ng);
+    }
+
+    free(root);
+    free(pwd);
+  }
+
+  //  For each .quiva file, determine its compression scheme in a fast scan and append it to
+  //    the .qvs file.  Then compress every .quiva entry in the file, appending its compressed
+  //    form to the .qvs file as you go and recording the offset in the .qvs in the .coff field
+  //    of each read record (*except* the first, that points at the compression scheme
+  //    immediately preceding it).  Ensure that the # of .quiva entries matches the # of .fasta
+  //    entries in each added file.
+
+  { int            i;
+    int            last, cur;
+    File_Iterator *ng;
+
+    //  Skip the stub entries of the files preceding the first one being added; last ends
+    //    up as the index of the first read of file ofile
+
+    rewind(istub);
+    if (fscanf(istub,"files = %*d\n") != 0)
+      SYSTEM_ERROR
+
+    last = 0;
+    for (i = 0; i < ofile; i++)
+      if (fscanf(istub,"  %9d %*s %*s\n",&last) != 1)
+        SYSTEM_ERROR
+
+    //  For each .quiva file do:
+
+    ng  = init_file_iterator(argc,argv,IFILE,2);
+    cur = last;
+    while (next_file(ng))
+      { FILE     *input;
+        int64     qpos;
+        char     *pwd, *root;
+        QVcoding *coding;
+
+        if (ng->name == NULL)
+          goto error;
+
+        //  Open next .quiva file and create its compression scheme
+
+        pwd  = PathTo(ng->name);
+        root = Root(ng->name,".quiva");
+        if ((input = Fopen(Catenate(pwd,"/",root,".quiva"),"r")) == NULL)
+          goto error;
+
+        //  Fetch the index one past the last read of this file now, so that the
+        //    compression loop below can refuse to write beyond it
+
+        if (fscanf(istub,"  %9d %*s %*s\n",&last) != 1)
+          SYSTEM_ERROR
+
+        if (VERBOSE)
+          { fprintf(stderr,"Analyzing '%s' ...\n",root);
+            fflush(stderr);
+          }
+
+        QVcoding_Scan(input);
+        coding = Create_QVcoding(LOSSY);
+        coding->prefix = Strdup(".qvs","Allocating header prefix");
+        if (coding->prefix == NULL)
+          goto error;
+
+        qpos = ftello(quiva);
+        Write_QVcoding(quiva,coding);
+
+        //  Then compress and append to the .qvs each compressed QV entry
+
+        if (VERBOSE)
+          { fprintf(stderr,"Compressing '%s' ...\n",root);
+            fflush(stderr);
+          }
+
+        rewind(input);
+        while (Read_Lines(input,1) > 0)
+          { if (cur >= last || cur >= db.ureads)     //  More entries than reads: stop
+              {                                      //    before overrunning reads[]
+                fprintf(stderr,"%s: Number of reads in %s.quiva doesn't match number in %s.fasta\n",
+                               Prog_Name,root,root);
+                goto error;
+              }
+            reads[cur++].coff = qpos;
+            Compress_Next_QVentry(input,quiva,coding,LOSSY);
+            qpos = ftello(quiva);
+          }
+
+        if (last != cur)
+          { fprintf(stderr,"%s: Number of reads in %s.quiva doesn't match number in %s.fasta\n",
+                           Prog_Name,root,root);
+            goto error;
+          }
+
+        Free_QVcoding(coding);
+        fclose(input);
+        free(root);
+        free(pwd);
+      }
+
+    free(ng);
+  }
+
+  //  Write the db record and read index into .idx and clean up
+
+  rewind(indx);
+  fwrite(&db,sizeof(HITS_DB),1,indx);
+  fwrite(reads,sizeof(HITS_READ),db.ureads,indx);
+
+  fclose(istub);
+  fclose(indx);
+  fclose(quiva);
+
+  exit (0);
+
+  //  Error exit:  Either truncate or remove the .qvs file as appropriate.
+
+error:
+  if (coff != 0)
+    { fseeko(quiva,0,SEEK_SET);
+      if (ftruncate(fileno(quiva),coff) < 0)
+        SYSTEM_ERROR
+    }
+  fclose(istub);
+  fclose(indx);
+  fclose(quiva);
+  if (coff == 0)
+    { char *root = Root(argv[1],".db");
+      char *pwd  = PathTo(argv[1]);
+      unlink(Catenate(pwd,PATHSEP,root,".qvs"));
+      free(pwd);
+      free(root);
+    }
+
+  exit (1);
+}
diff --git a/simulator.c b/simulator.c
new file mode 100644
index 0000000..b16fb02
--- /dev/null
+++ b/simulator.c
@@ -0,0 +1,458 @@
+/************************************************************************************\
+* *
+* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
+* *
+* Redistribution and use in source and binary forms, with or without modification, *
+* are permitted provided that the following conditions are met: *
+* *
+* · Redistributions of source code must retain the above copyright notice, this *
+* list of conditions and the following disclaimer. *
+* *
+* · Redistributions in binary form must reproduce the above copyright notice, this *
+* list of conditions and the following disclaimer in the documentation and/or *
+* other materials provided with the distribution. *
+* *
+* · The name of EWM may not be used to endorse or promote products derived from *
+* this software without specific prior written permission. *
+* *
+* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
+* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
+* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
+* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+* *
+* For any issues regarding this software and its use, contact EWM at: *
+* *
+* Eugene W. Myers Jr. *
+* Bautzner Str. 122e *
+* 01099 Dresden *
+* GERMANY *
+* Email: gene.myers at gmail.com *
+* *
+\************************************************************************************/
+
+/*******************************************************************************************
+ *
+ * Synthetic DNA shotgun dataset simulator
+ * Generate a fake genome of size genlen*1Mb long, that has an AT-bias of -b. Then
+ * sample reads of mean length -m from a log-normal length distribution with
+ * standard deviation -s, but ignore reads of length less than -x. Collect enough
+ * reads to cover the genome -c times. Introduce -e fraction errors into each
+ * read where the ratio of insertions, deletions, and substitutions are set by
+ * defined constants INS_RATE and DEL_RATE within generate.c. One can also control
+ * the rate at which reads are picked from the forward and reverse strands by setting
+ * the defined constant FLIP_RATE.
+ *
+ * The -r parameter seeds the random number generator for the generation of the genome
+ * so that one can reproducibly produce the same underlying genome to sample from. If
+ * missing, then the process id of the invocation seeds the generator. The output is
+ * sent to the standard output (i.e. it is a UNIX pipe) in Pacbio .fasta format,
+ * suitable as input to fasta2DB.
+ *
+ * The -M option requests that the coordinates from which each read has been sampled are
+ * written to the indicated file, one line per read, ASCII encoded. This "map" file
+ * essentially tells one where every read belongs in an assembly and is very useful for
+ * debugging and testing purposes. If a read pair is say b,e then if b < e the read was
+ * sampled from [b,e] in the forward direction, and from [e,b] in the reverse direction
+ * otherwise.
+ *
+ * Author: Gene Myers
+ * Date : July 2013
+ * Mod : April 2014 (made independent of "mylib")
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+
+#include "DB.h"
+
+static char *Usage[] = { "<genlen:double> [-c<double(20.)>] [-b<double(.5)>] [-r<int>]",
+ " [-m<int(10000)>] [-s<int(2000)>] [-x<int(4000)>]",
+ " [-e<double(.15)>] [-M<file>]"
+ };
+ // Settings filled in by main from the command line and read by the generators below.
+static int GENOME; // <genlen> argument * 1Mbp, i.e. the genome length in bases
+static double COVERAGE; // -c option
+static double BIAS; // -b option (AT-bias, in [0,1])
+static int HASR = 0; // -r option is set?
+static int SEED; // -r option
+static int RMEAN; // -m option
+static int RSDEV; // -s option
+static int RSHORT; // -x option
+static double ERROR; // -e option
+static FILE *MAP; // -M option (NULL if not given)
+ // Each error draws x in [0,1): x < INS_RATE insert, else x < IDL_RATE delete, else substitute.
+#define INS_RATE .73333 // insert rate
+#define DEL_RATE .20000 // deletion rate (not referenced by the code shown here; see IDL_RATE)
+#define IDL_RATE .93333 // insert + delete rate
+#define FLIP_RATE .5 // orientation rate (equal)
+
+// Generate a random genome of GENOME symbols, each a value in 0..3, followed by a
+// terminating 4.  Symbols 0 and 3 are each drawn with probability BIAS/2 and symbols
+// 1 and 2 each with probability (1-BIAS)/2.  The generator is seeded with SEED when
+// -r was given (HASR) and with the process id otherwise.  Exits if allocation fails.
+
+static char *random_genome()
+{ char   *genome;
+  double  cut0, cut1, cut2;     //  Cumulative probability thresholds for 0, 1, 2
+  double  roll;
+  int     pos;
+
+  cut0 = BIAS/2.;
+  cut1 = (1.-BIAS)/2. + cut0;
+  cut2 = (1.-BIAS)/2. + cut1;
+
+  srand48(HASR ? SEED : getpid());
+
+  genome = (char *) Malloc(GENOME+1,"Allocating genome sequence");
+  if (genome == NULL)
+    exit (1);
+
+  pos = 0;
+  while (pos < GENOME)
+    { roll = drand48();
+      genome[pos++] = (roll < cut0) ? 0 : (roll < cut1) ? 1 : (roll < cut2) ? 2 : 3;
+    }
+  genome[GENOME] = 4;
+
+  return (genome);
+}
+
+// Reverse-complement, in place, the first *elen* symbols of *s*, where each symbol is a
+// value in 0..3 and the complement of x is 3-x.
+
+static void complement(int elen, char *s)
+{ int  lo, hi;
+  char head, tail;
+
+  //  Walk inwards from both ends, swapping and complementing each pair.  When the two
+  //    indices meet on a middle symbol it is simply complemented.
+
+  for (lo = 0, hi = elen-1; lo <= hi; lo++, hi--)
+    { head  = s[lo];
+      tail  = s[hi];
+      s[lo] = (char) (3-tail);
+      s[hi] = (char) (3-head);
+    }
+}
+
+#define UNORM_LEN 60000
+#define UNORM_MAX 6.0
+
+static double unorm_table[UNORM_LEN+1];  // Upper half of cdf of N(0,1)
+static double unorm_scale;               // Width of one table step
+
+// Fill unorm_table with a numerically integrated cdf of the x >= 0 half of the standard
+// normal density over [0,UNORM_MAX], in UNORM_LEN steps.  Entry 0 is 0, the entries below
+// UNORM_LEN are scaled so that they stay under .5, and the final entry is set to 1.
+
+static void init_unorm()
+{ double step, area, x;
+  int    k;
+
+  step        = UNORM_MAX / UNORM_LEN;
+  unorm_scale = step;
+
+  //  Integrate the pdf over the x >= 0 half only: entry k holds the area left of k*step
+
+  area = 0;
+  k    = 0;
+  while (k < UNORM_LEN)
+    { x = k * step;
+      unorm_table[k] = area;
+      area += exp(-.5*x*x) * step;
+      k += 1;
+    }
+
+  //  Normalize by twice the half-area, then cap the table with 1
+
+  area *= 2.;
+  for (k = 0; k < UNORM_LEN; k++)
+    unorm_table[k] /= area;
+  unorm_table[UNORM_LEN] = 1.;
+
+#ifdef DEBUG
+  printf("Truncated tail is < %g\n",
+          exp(-.5*UNORM_MAX*UNORM_MAX)/(area*(1.-exp(-UNORM_MAX))) );
+  printf("Diff between last two entries is %g\n",.5-unorm_table[UNORM_LEN-1]);
+
+  printf("\n  CDF:\n");
+  for (k = 0; k <= UNORM_LEN; k += 100)
+    printf("%6.2f: %10.9f\n",k*step,unorm_table[k]);
+#endif
+}
+
+// Binary search of the non-decreasing table tab[0..len] for min { r : y < tab[r] }.
+// Assumes y < 1, tab[0] = 0 and tab[len] = 1, so the returned index is in [1,len].
+
+static int bin_search(int len, double *tab, double y)
+{ int lo, hi, mid;
+
+  //  The answer always lies in [lo,hi]; halve the range until it is a single index
+
+  for (lo = 0, hi = len; lo < hi; )
+    { mid = (lo+hi) / 2;
+      if (tab[mid] > y)
+        hi = mid;
+      else
+        lo = mid+1;
+    }
+  return (hi);
+}
+
+// Map a random variable x in [0,1) to a sample of the table-driven normal distribution:
+// the distance of x from .5 is looked up in the upper-half cdf table (init_unorm must
+// have been called), the position is refined by linear interpolation between the two
+// bracketing table entries, and the result is negated when x is below .5.
+
+static double sample_unorm(double x)
+{ double y, above, below, back;
+  int    f;
+
+  y = (x >= .5) ? x-.5 : .5-x;      //  Fold [0,1) onto the upper half of the cdf
+
+  f = bin_search(UNORM_LEN,unorm_table,y);     //  First table entry exceeding y
+#ifdef DEBUG
+  printf("Normal search %g -> %g -> %d",x,y,f);
+#endif
+
+  //  Linear interpolate between table points f-1 and f
+
+  above = unorm_table[f];
+  below = unorm_table[f-1];
+  back  = (above-y) / (above - below);
+  y     = (f - back) * unorm_scale;
+
+  if (x < .5)                       //  Map upper-half var back to full range
+    y = -y;
+#ifdef DEBUG
+  printf(" -> %g\n",y);
+#endif
+
+  return (y);
+}
+
+
+// Generate reads (a) whose lengths are drawn as exp(nmean + nsdev*z), z sampled with
+// sample_unorm, where nmean and nsdev are derived from RMEAN and RSDEV (a log-normal
+// model), and (b) that are never shorter than RSHORT nor longer than GENOME (longer
+// draws are clipped, shorter ones are redrawn). Each read is a randomly placed interval
+// of *source* that has insertion, deletion, and/or substitution errors introduced into
+// it and which is reverse-complemented when drand48() >= FLIP_RATE. The number of errors
+// introduced is the interval length times ERROR, and each error becomes an insertion,
+// deletion, or substitution according to the thresholds INS_RATE and IDL_RATE. Reads are
+// generated until the sum of their (post-error) lengths reaches COVERAGE*GENOME. Each
+// read is printed as a fasta entry with header ">Sim/<read #>/0_<length> RQ=0.<qv>", and
+// if MAP is open the sampled interval is written to it as a "begin end" line.
+
+static void shotgun(char *source)
+{ int maxlen, nreads, qv;
+ int64 totlen, totbp;
+ char *rbuffer;
+ double nmean, nsdev;
+
+ nsdev = (1.*RSDEV)/RMEAN;  // Convert mean/std.dev. of the lengths into the parameters of
+ nsdev = log(1.+nsdev*nsdev);  //   the underlying normal: this is the variance ...
+ nmean = log(1.*RMEAN) - .5*nsdev;
+ nsdev = sqrt(nsdev);  //   ... and this its standard deviation
+
+ if (GENOME < RSHORT)
+ { fprintf(stderr,"Genome length is less than shortest read length !\n");
+ exit (1);
+ }
+
+ init_unorm();
+
+ qv = (int) (1000 * (1.-ERROR));  // Printed after "RQ=0." in every header
+
+ rbuffer = NULL;
+ maxlen = 0;
+ totlen = 0;
+ totbp = COVERAGE*GENOME;  // Total read length to reach
+ nreads = 0;
+ while (totlen < totbp)
+ { int len, sdl, ins, del, elen, rbeg, rend;
+ int j;
+ char *s, *t;
+
+ len = (int) exp(nmean + nsdev*sample_unorm(drand48())); // Determine length of read.
+ if (len > GENOME) len = GENOME;
+ if (len < RSHORT)
+ continue;
+
+ sdl = (int) (len*ERROR); // Determine number of inserts *ins*, deletions *del,
+ ins = del = 0; // and substitions+deletions *sdl*.
+ for (j = 0; j < sdl; j++)
+ { double x = drand48();
+ if (x < INS_RATE)
+ ins += 1;
+ else if (x < IDL_RATE)
+ del += 1;
+ }
+ sdl -= ins;  // Errors that are not inserts: substitutions + deletions
+ elen = len + (ins-del);  // Length of the read once the errors are applied
+ rbeg = (int) (drand48()*((GENOME-len)+.9999999));  // Random start in [0,GENOME-len]
+ rend = rbeg + len;
+
+ if (elen > maxlen)  // Grow the read buffer, with slack, when this read does not fit
+ { maxlen = ((int) (1.2*elen)) + 1000;
+ rbuffer = (char *) Realloc(rbuffer,maxlen+3,"Allocating read buffer");
+ if (rbuffer == NULL)
+ exit (1);
+ }
+
+ t = rbuffer;
+ s = source + rbeg;
+
+ // Generate the string with errors. NB that inserts occur randomly between source
+ // characters, while deletions and substitutions occur on source characters.
+
+ while ((len+1) * drand48() < ins)  // Inserts placed before the first source symbol
+ { *t++ = (char) (4.*drand48());
+ ins -= 1;
+ }
+ for ( ; len > 0; len--)
+ { if (len * drand48() >= sdl)  // Copy the source symbol unchanged
+ *t++ = *s;
+ else if (sdl * drand48() >= del)  // Substitute with one of the 3 other symbols
+ { double x = 3.*drand48();
+ if (x >= *s)
+ x += 1.;
+ *t++ = (char) x;
+ sdl -= 1;
+ }
+ else  // Delete: the source symbol is skipped
+ { del -= 1;
+ sdl -= 1;
+ }
+ s += 1;
+ while (len * drand48() < ins)  // Inserts placed after this source symbol
+ { *t++ = (char) (4.*drand48());
+ ins -= 1;
+ }
+ }
+ *t = 4;  // Terminate the read with the value 4
+
+ if (drand48() >= FLIP_RATE) // Reverse-complement with probability 1-FLIP_RATE.
+ { printf(">Sim/%d/%d_%d RQ=0.%d\n",nreads+1,0,elen,qv);
+ complement(elen,rbuffer);
+ j = rend;  // Swap begin and end so the map records the reverse orientation
+ rend = rbeg;
+ rbeg = j;
+ }
+ else
+ printf(">Sim/%d/%d_%d RQ=0.%d\n",nreads+1,0,elen,qv);
+
+ Lower_Read(rbuffer);
+ for (j = 0; j+80 < elen; j += 80)  // Print the sequence 80 symbols per line
+ printf("%.80s\n",rbuffer+j);
+ if (j < elen)
+ printf("%s\n",rbuffer+j);
+
+ if (MAP != NULL)
+ fprintf(MAP," %9d %9d\n",rbeg,rend);
+
+ totlen += elen;
+ nreads += 1;
+ }
+}
+
+// simulator entry point.  Sets the defaults of all options, parses the command line,
+// converts the <genlen> argument (in Mbp) into GENOME bases, generates the random
+// genome, and samples reads from it onto the standard output.  Exits with 1 on any
+// argument error and with 0 on success.
+
+int main(int argc, char *argv[])
+{ char *source;
+
+//  Usage: <GenomeLen:double> [-c<double(20.)>] [-b<double(.5)>] [-r<int>]
+//                            [-m<int(10000)>]  [-s<int(2000)>]  [-x<int(4000)>]
+//                            [-e<double(.15)>] [-M<file>]
+
+  { int    i, j;
+    char  *eptr;
+    double glen;
+
+    Prog_Name = Strdup("simulator","");
+
+    COVERAGE = 20.;
+    BIAS     = .5;
+    HASR     = 0;
+    RMEAN    = 10000;
+    RSDEV    = 2000;
+    RSHORT   = 4000;
+    ERROR    = .15;
+    MAP      = NULL;
+
+    //  Options are consumed here; the other arguments are compacted to the front of argv
+
+    j = 1;
+    for (i = 1; i < argc; i++)
+      if (argv[i][0] == '-')
+        switch (argv[i][1])
+        { default:
+            //  The option letter is argv[i][1]; index 2 is the wrong character and lies
+            //    beyond the end of the string for a bare "-"
+            fprintf(stderr,"%s: -%c is an illegal option\n",Prog_Name,argv[i][1]);
+            exit (1);
+          case 'c':
+            ARG_REAL(COVERAGE)
+            if (COVERAGE < 0.)
+              { fprintf(stderr,"%s: Coverage must be non-negative (%g)\n",Prog_Name,COVERAGE);
+                exit (1);
+              }
+            break;
+          case 'b':
+            ARG_REAL(BIAS)
+            if (BIAS < 0. || BIAS > 1.)
+              { fprintf(stderr,"%s: AT-bias must be in [0,1] (%g)\n",Prog_Name,BIAS);
+                exit (1);
+              }
+            break;
+          case 'r':
+            SEED = strtol(argv[i]+2,&eptr,10);
+            HASR = 1;
+            if (*eptr != '\0' || argv[i][2] == '\0')
+              { fprintf(stderr,"%s: -r argument is not an integer\n",Prog_Name);
+                exit (1);
+              }
+            break;
+          case 'M':
+            MAP = Fopen(argv[i]+2,"w");
+            if (MAP == NULL)
+              exit (1);
+            break;
+          case 'm':
+            ARG_POSITIVE(RMEAN,"Mean read length")
+            break;
+          case 's':
+            ARG_POSITIVE(RSDEV,"Read length standard deviation")
+            break;
+          case 'x':
+            ARG_NON_NEGATIVE(RSHORT,"Read length minimum")
+            break;
+          case 'e':
+            ARG_REAL(ERROR)
+            if (ERROR < 0. || ERROR > .5)
+              { fprintf(stderr,"%s: Error rate must be in [0,.5] (%g)\n",Prog_Name,ERROR);
+                exit (1);
+              }
+            break;
+        }
+      else
+        argv[j++] = argv[i];
+    argc = j;
+
+    if (argc != 2)
+      { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
+        fprintf(stderr,"       %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]);
+        fprintf(stderr,"       %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]);
+        exit (1);
+      }
+
+    //  The one remaining argument is the genome length in Mbp
+
+    glen = strtod(argv[1],&eptr);
+    if (*eptr != '\0')
+      { fprintf(stderr,"%s: genome length is not a real number\n",Prog_Name);
+        exit (1);
+      }
+    if (glen < 0.)
+      { fprintf(stderr,"%s: Genome length must be positive (%g)\n",Prog_Name,glen);
+        exit (1);
+      }
+    GENOME = (int) (glen*1000000.);
+  }
+
+  source = random_genome();
+
+  shotgun(source);
+
+  if (MAP != NULL)
+    fclose(MAP);
+
+  exit (0);
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/dazzdb.git
More information about the debian-med-commit
mailing list