[med-svn] [dazzdb] 01/10: Imported Upstream version 1.0+20160930
Afif Elghraoui
afif at moszumanska.debian.org
Wed Oct 12 09:04:23 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository dazzdb.
commit f9c46a18a31b3ee59aa4e76b12d60981c2dd8a26
Author: Afif Elghraoui <afif at debian.org>
Date: Tue Oct 11 21:24:32 2016 -0700
Imported Upstream version 1.0+20160930
---
Catrack.c | 57 ++---
DAM2fasta.c | 60 ++----
DB.c | 284 ++++++++++++++++++------
DB.h | 78 +++----
DB2fasta.c | 108 +++++-----
DB2quiva.c | 97 ++++-----
DBshow.c => DBdump.c | 504 +++++++++++++++++++++++++++++--------------
DBdust.c | 39 +---
DBrm.c | 37 ----
DBshow.c | 55 +----
DBsplit.c | 68 ++----
DBstats.c | 178 +++++++--------
DBupgrade.Dec.31.2014.c | 115 ----------
DBupgrade.Sep.25.2014.c | 125 -----------
DUSTupgrade.Jan.1.2015.c | 117 ----------
LICENSE | 34 +++
Makefile | 28 ++-
QV.c | 93 ++++----
QV.h | 55 ++---
README | 442 --------------------------------------
README.md | 501 +++++++++++++++++++++++++++++++++++++++++++
fasta2DAM.c | 477 ++++++++++++++++++++++++++++++-----------
fasta2DB.c | 290 +++++++++++++++----------
quiva2DB.c | 546 +++++++++++++++++++++++++++++------------------
rangen.c | 182 ++++++++++++++++
simulator.c | 473 ++++++++++++++++++++++++----------------
26 files changed, 2771 insertions(+), 2272 deletions(-)
diff --git a/Catrack.c b/Catrack.c
index 8df7e3e..14d9b64 100644
--- a/Catrack.c
+++ b/Catrack.c
@@ -1,40 +1,3 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/********************************************************************************************
*
* Concate in block order all "block tracks" <DB>.<track>.# into a single track
@@ -126,6 +89,7 @@ int main(int argc, char *argv[])
int nfiles;
char data[1024];
void *anno;
+ FILE *lfile = NULL;
anno = NULL;
trackoff = 0;
@@ -135,7 +99,7 @@ int main(int argc, char *argv[])
nfiles = 0;
while (1)
- { FILE *afile, *dfile;
+ { FILE *dfile, *afile;
int i, size, tracklen;
afile = fopen(Numbered_Suffix(prefix,nfiles+1,Catenate(".",argv[2],".","anno")),"r");
@@ -143,6 +107,10 @@ int main(int argc, char *argv[])
break;
dfile = fopen(Numbered_Suffix(prefix,nfiles+1,Catenate(".",argv[2],".","data")),"r");
+ if (nfiles > 0)
+ fclose(lfile);
+ lfile = afile;
+
if (VERBOSE)
{ fprintf(stderr,"Concatenating %s%d.%s ...\n",prefix,nfiles+1,argv[2]);
fflush(stderr);
@@ -247,7 +215,6 @@ int main(int argc, char *argv[])
tracktot += tracklen;
nfiles += 1;
if (dfile != NULL) fclose(dfile);
- fclose(afile);
}
if (nfiles == 0)
@@ -256,7 +223,9 @@ int main(int argc, char *argv[])
goto error;
}
else
- { if (dout != NULL)
+ { char *byte;
+
+ if (dout != NULL)
{ if (tracksiz == 4)
{ int anno4 = trackoff;
fwrite(&anno4,sizeof(int),1,aout);
@@ -266,10 +235,10 @@ int main(int argc, char *argv[])
fwrite(&anno8,sizeof(int64),1,aout);
}
}
- else
- { fwrite(anno,tracksiz,1,aout);
- free(anno);
- }
+ while (fread(&byte,1,1,lfile) == 1)
+ fwrite(&byte,1,1,aout);
+ fclose(lfile);
+
rewind(aout);
fwrite(&tracktot,sizeof(int),1,aout);
fwrite(&tracksiz,sizeof(int),1,aout);
diff --git a/DAM2fasta.c b/DAM2fasta.c
index 57ca2f7..9c6ffff 100644
--- a/DAM2fasta.c
+++ b/DAM2fasta.c
@@ -1,40 +1,3 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/********************************************************************************************
*
* Recreate all the .fasta files that are in a specified DAM.
@@ -160,12 +123,22 @@ int main(int argc, char *argv[])
if (fscanf(dbfile,DB_FDATA,&last,fname,prolog) != 3)
SYSTEM_ERROR
- if ((ofile = Fopen(Catenate(".","/",fname,".fasta"),"w")) == NULL)
- exit (1);
+ if (strcmp(fname,"stdout") == 0)
+ { ofile = stdout;
+
+ if (VERBOSE)
+ { fprintf(stderr,"Sending %d contigs to stdout ...\n",last-first);
+ fflush(stdout);
+ }
+ }
+ else
+ { if ((ofile = Fopen(Catenate(".","/",fname,".fasta"),"w")) == NULL)
+ exit (1);
- if (VERBOSE)
- { fprintf(stderr,"Creating %s.fasta ...\n",fname);
- fflush(stdout);
+ if (VERBOSE)
+ { fprintf(stderr,"Creating %s.fasta ...\n",fname);
+ fflush(stdout);
+ }
}
// For the relevant range of reads, write each to the file
@@ -225,6 +198,9 @@ int main(int argc, char *argv[])
if (wpos > 0)
fprintf(ofile,"\n");
+ if (ofile != stdout)
+ fclose(ofile);
+
first = last;
}
}
diff --git a/DB.c b/DB.c
index 27b202e..3764842 100644
--- a/DB.c
+++ b/DB.c
@@ -1,47 +1,9 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/*******************************************************************************************
*
* Compressed data base module. Auxiliary routines to open and manipulate a data base for
* which the sequence and read information are separated into two separate files, and the
* sequence is compressed into 2-bits for each base. Support for tracks of additional
- * information, and trimming according to the current partition. Eventually will also
- * support compressed quality information.
+ * information, and trimming according to the current partition.
*
* Author : Gene Myers
* Date : July 2013
@@ -92,7 +54,9 @@ void *Malloc(int64 size, char *mesg)
}
void *Realloc(void *p, int64 size, char *mesg)
-{ if ((p = realloc(p,size)) == NULL)
+{ if (size <= 0)
+ size = 1;
+ if ((p = realloc(p,size)) == NULL)
{ if (mesg == NULL)
EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name);
else
@@ -509,10 +473,12 @@ int Open_DB(char* path, HITS_DB *db)
nreads = ulast-ufirst;
if (part <= 0)
{ db->reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index");
+ if (db->reads == NULL)
+ goto error2;
db->reads += 1;
if (fread(db->reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads)
{ EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
- free(db->reads);
+ free(db->reads-1);
goto error2;
}
}
@@ -521,13 +487,15 @@ int Open_DB(char* path, HITS_DB *db)
int i, r, maxlen;
int64 totlen;
- reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index");
+ reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index");
+ if (reads == NULL)
+ goto error2;
reads += 1;
fseeko(index,sizeof(HITS_READ)*ufirst,SEEK_CUR);
if (fread(reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads)
{ EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
- free(reads);
+ free(reads-1);
goto error2;
}
@@ -679,6 +647,111 @@ void Trim_DB(HITS_DB *db)
}
}
+// The DB has already been trimmed, but a track over the untrimmed DB needs to be loaded.
+// Trim the track by rereading the untrimmed DB index from the file system.
+
+static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart)
+{ int i, j, r;
+ int allflag, cutoff;
+ int ureads;
+ char *root;
+ HITS_READ read;
+ FILE *indx;
+
+ if (!db->trimmed) return (0);
+
+ if (db->cutoff <= 0 && db->all) return (0);
+
+ cutoff = db->cutoff;
+ if (db->all)
+ allflag = 0;
+ else
+ allflag = DB_BEST;
+
+ root = rindex(db->path,'/') + 2;
+ indx = Fopen(Catenate(db->path,"","",".idx"),"r");
+ fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*db->ufirst,SEEK_SET);
+ if (ispart)
+ ureads = ((int *) (db->reads))[-1];
+ else
+ ureads = db->ureads;
+
+ if (strcmp(track->name,". at qvs") == 0)
+ { EPRINTF(EPLACE,"%s: Cannot load QV track after trimming\n",Prog_Name);
+ fclose(indx);
+ EXIT(1);
+ }
+
+ { int *anno4, size;
+ int64 *anno8;
+ char *anno, *data;
+
+ size = track->size;
+ data = (char *) track->data;
+ if (data == NULL)
+ { anno = (char *) track->anno;
+ j = r = 0;
+ for (i = r = 0; i < ureads; i++, r += size)
+ { if (fread(&read,sizeof(HITS_READ),1,indx) != 1)
+ { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
+ fclose(indx);
+ EXIT(1);
+ }
+ if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff)
+ { memmove(anno+j,anno+r,size);
+ j += size;
+ }
+ r += size;
+ }
+ memmove(anno+j,anno+r,size);
+ }
+ else if (size == 4)
+ { int ai;
+
+ anno4 = (int *) (track->anno);
+ j = anno4[0] = 0;
+ for (i = 0; i < ureads; i++)
+ { if (fread(&read,sizeof(HITS_READ),1,indx) != 1)
+ { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
+ fclose(indx);
+ EXIT(1);
+ }
+ if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff)
+ { ai = anno4[i];
+ anno4[j+1] = anno4[j] + (anno4[i+1]-ai);
+ memmove(data+anno4[j],data+ai,anno4[i+1]-ai);
+ j += 1;
+ }
+ }
+ track->data = Realloc(track->data,anno4[j],NULL);
+ }
+ else // size == 8
+ { int64 ai;
+
+ anno8 = (int64 *) (track->anno);
+ j = anno8[0] = 0;
+ for (i = 0; i < ureads; i++)
+ { if (fread(&read,sizeof(HITS_READ),1,indx) != 1)
+ { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
+ fclose(indx);
+ EXIT(1);
+ }
+ if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff)
+ { ai = anno8[i];
+ anno8[j+1] = anno8[j] + (anno8[i+1]-ai);
+ memmove(data+anno8[j],data+ai,anno8[i+1]-ai);
+ j += 1;
+ }
+ }
+ track->data = Realloc(track->data,anno8[j],NULL);
+ }
+ track->anno = Realloc(track->anno,track->size*(j+1),NULL);
+ }
+
+ fclose(indx);
+ return (0);
+}
+
// Shut down an open 'db' by freeing all associated space, including tracks and QV structures,
// and any open file pointers. The record pointed at by db however remains (the user
// supplied it and so should free it).
@@ -690,7 +763,8 @@ void Close_DB(HITS_DB *db)
free(((char *) (db->bases)) - 1);
else if (db->bases != NULL)
fclose((FILE *) db->bases);
- free(db->reads-1);
+ if (db->reads != NULL)
+ free(db->reads-1);
free(db->path);
Close_QVs(db);
@@ -704,6 +778,43 @@ void Close_DB(HITS_DB *db)
}
+// Return the size in bytes of the memory occupied by a given DB
+
+int64 sizeof_DB(HITS_DB *db)
+{ int64 s;
+ HITS_TRACK *t;
+
+ s = sizeof(HITS_DB)
+ + sizeof(HITS_READ)*(db->nreads+2)
+ + strlen(db->path)+1
+ + (db->totlen+db->nreads+4);
+
+ t = db->tracks;
+ if (t != NULL && strcmp(t->name,". at qvs") == 0)
+ { HITS_QV *q = (HITS_QV *) t;
+ s += sizeof(HITS_QV)
+ + sizeof(uint16) * db->nreads
+ + q->ncodes * sizeof(QVcoding)
+ + 6;
+ t = t->next;
+ }
+
+ for (; t != NULL; t = t->next)
+ { s += sizeof(HITS_TRACK)
+ + strlen(t->name)+1
+ + t->size * (db->nreads+1);
+ if (t->data != NULL)
+ { if (t->size == 8)
+ s += sizeof(int)*((int64 *) t->anno)[db->nreads];
+ else // t->size == 4
+ s += sizeof(int)*((int *) t->anno)[db->nreads];
+ }
+ }
+
+ return (s);
+}
+
+
/*******************************************************************************************
*
* QV LOAD & CLOSE ROUTINES
@@ -719,7 +830,7 @@ int Load_QVs(HITS_DB *db)
uint16 *table;
HITS_QV *qvtrk;
QVcoding *coding, *nx;
- int ncodes;
+ int ncodes = 0;
if (db->tracks != NULL && strcmp(db->tracks->name,". at qvs") == 0)
return (0);
@@ -730,8 +841,14 @@ int Load_QVs(HITS_DB *db)
}
if (db->reads[db->nreads-1].coff < 0)
- { EPRINTF(EPLACE,"%s: The requested QVs have not been added to the DB!\n",Prog_Name);
- EXIT(1);
+ { if (db->part > 0)
+ { EPRINTF(EPLACE,"%s: All QVs for this block have not been added to the DB!\n",Prog_Name);
+ EXIT(1);
+ }
+ else
+ { EPRINTF(EPLACE,"%s: All QVs for this DB have not been added!\n",Prog_Name);
+ EXIT(1);
+ }
}
// Open .qvs, .idx, and .db files
@@ -746,8 +863,14 @@ int Load_QVs(HITS_DB *db)
coding = NULL;
qvtrk = NULL;
- root = rindex(db->path,'/') + 2;
- istub = Fopen(Catenate(db->path,"/",root,".db"),"r");
+ root = rindex(db->path,'/');
+ if (root[1] == '.')
+ { *root = '\0';
+ istub = Fopen(Catenate(db->path,"/",root+2,".db"),"r");
+ *root = '/';
+ }
+ else
+ istub = Fopen(Catenate(db->path,"","",".db"),"r");
if (istub == NULL)
goto error;
@@ -805,16 +928,16 @@ int Load_QVs(HITS_DB *db)
// assign the tables # for each read in the block in "tables".
rewind(istub);
- fscanf(istub,DB_NFILE,&nfiles);
+ (void) fscanf(istub,DB_NFILE,&nfiles);
first = 0;
for (n = 0; n < fbeg; n++)
- { fscanf(istub,DB_FDATA,&last,fname,prolog);
+ { (void) fscanf(istub,DB_FDATA,&last,fname,prolog);
first = last;
}
for (n = fbeg; n < fend; n++)
- { fscanf(istub,DB_FDATA,&last,fname,prolog);
+ { (void) fscanf(istub,DB_FDATA,&last,fname,prolog);
i = n-fbeg;
if (first < pfirst)
@@ -969,9 +1092,9 @@ void Close_QVs(HITS_DB *db)
// -1: Track is not the right size of DB either trimmed or untrimmed
// -2: Could not find the track
-int Check_Track(HITS_DB *db, char *track)
+int Check_Track(HITS_DB *db, char *track, int *kind)
{ FILE *afile;
- int tracklen, ispart;
+ int tracklen, size, ispart;
int ureads, treads;
afile = NULL;
@@ -987,8 +1110,23 @@ int Check_Track(HITS_DB *db, char *track)
return (-2);
if (fread(&tracklen,sizeof(int),1,afile) != 1)
- return (-1);
+ { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track);
+ exit (1);
+ }
+ if (fread(&size,sizeof(int),1,afile) != 1)
+ { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track);
+ exit (1);
+ }
+ if (size == 0)
+ *kind = MASK_TRACK;
+ else if (size > 0)
+ *kind = CUSTOM_TRACK;
+ else
+ { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track);
+ exit (1);
+ }
+
fclose(afile);
if (ispart)
@@ -1000,10 +1138,10 @@ int Check_Track(HITS_DB *db, char *track)
treads = db->treads;
}
- if (tracklen == treads)
- return (1);
- else if (tracklen == ureads)
+ if (tracklen == ureads)
return (0);
+ else if (tracklen == treads)
+ return (1);
else
return (-1);
}
@@ -1067,10 +1205,13 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track)
{ EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
goto error;
}
- if (size <= 0)
+
+ if (size < 0)
{ EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
goto error;
}
+ if (size == 0)
+ size = 8;
if (ispart)
{ ureads = ((int *) (db->reads))[-1];
@@ -1082,22 +1223,32 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track)
}
if (db->trimmed)
- { if (tracklen != treads)
+ { if (tracklen != treads && tracklen != ureads)
{ EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track);
goto error;
}
if ( ! ispart && db->part > 0)
- fseeko(afile,size*db->tfirst,SEEK_CUR);
+ { if (tracklen == treads)
+ fseeko(afile,size*db->tfirst,SEEK_CUR);
+ else
+ fseeko(afile,size*db->ufirst,SEEK_CUR);
+ }
}
else
{ if (tracklen != ureads)
- { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track);
+ { if (tracklen == treads)
+ EPRINTF(EPLACE,"%s: Track '%s' is for a trimmed DB !\n",Prog_Name,track);
+ else
+ EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track);
goto error;
}
if ( ! ispart && db->part > 0)
fseeko(afile,size*db->ufirst,SEEK_CUR);
}
- nreads = db->nreads;
+ if (tracklen == treads)
+ nreads = ((int *) (db->reads))[-2];
+ else
+ nreads = ((int *) (db->reads))[-1];
anno = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector");
if (anno == NULL)
@@ -1166,6 +1317,11 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track)
record->anno = anno;
record->size = size;
+ if (db->trimmed && tracklen != treads)
+ { if (Late_Track_Trim(db,record,ispart))
+ goto error;
+ }
+
if (db->tracks != NULL && strcmp(db->tracks->name,". at qvs") == 0)
{ record->next = db->tracks->next;
db->tracks->next = record;
@@ -1178,7 +1334,7 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track)
return (record);
error:
- if (record == NULL)
+ if (record != NULL)
free(record);
if (data != NULL)
free(data);
diff --git a/DB.h b/DB.h
index 567a91a..983bee7 100644
--- a/DB.h
+++ b/DB.h
@@ -1,40 +1,3 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/*******************************************************************************************
*
* Compressed data base module. Auxiliary routines to open and manipulate a data base for
@@ -71,8 +34,6 @@
// value. For such routines that were previously void, they are now int, and
// return 1 if an error occured, 0 otherwise.
-#undef INTERACTIVE
-
#ifdef INTERACTIVE
#define EPRINTF sprintf
@@ -135,7 +96,8 @@ extern char Ebuffer[];
#define ARG_POSITIVE(var,name) \
var = strtol(argv[i]+2,&eptr,10); \
if (*eptr != '\0' || argv[i][2] == '\0') \
- { fprintf(stderr,"%s: -%c argument is not an integer\n",Prog_Name,argv[i][1]); \
+ { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n", \
+ Prog_Name,argv[i][1],argv[i]+2); \
exit (1); \
} \
if (var <= 0) \
@@ -146,7 +108,8 @@ extern char Ebuffer[];
#define ARG_NON_NEGATIVE(var,name) \
var = strtol(argv[i]+2,&eptr,10); \
if (*eptr != '\0' || argv[i][2] == '\0') \
- { fprintf(stderr,"%s: -%c argument is not an integer\n",Prog_Name,argv[i][1]); \
+ { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n", \
+ Prog_Name,argv[i][1],argv[i]+2); \
exit (1); \
} \
if (var < 0) \
@@ -157,7 +120,8 @@ extern char Ebuffer[];
#define ARG_REAL(var) \
var = strtod(argv[i]+2,&eptr); \
if (*eptr != '\0' || argv[i][2] == '\0') \
- { fprintf(stderr,"%s: -%c argument is not a real number\n",Prog_Name,argv[i][1]); \
+ { fprintf(stderr,"%s: -%c '%s' argument is not a real number\n", \
+ Prog_Name,argv[i][1],argv[i]+2); \
exit (1); \
}
@@ -211,14 +175,17 @@ void Number_Read(char *s); // Convert read from letters to numbers
#define DB_CSS 0x0400 // This is the second or later of a group of reads from a given insert
#define DB_BEST 0x0800 // This is the longest read of a given insert (may be the only 1)
+// Fields have different interpretations if a .db versus a .dam
+
typedef struct
- { int origin; // Well #
+ { int origin; // Well # (DB), Contig # (DAM)
int rlen; // Length of the sequence (Last pulse = fpulse + rlen)
- int fpulse; // First pulse
+ int fpulse; // First pulse (DB), left index of contig in scaffold (DAM)
int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of
// uncompressed bases in memory block
- int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file
- int flags; // QV of read + flags above
+ int64 coff; // Offset (in bytes) of compressed quiva streams in '.qvs' file (DB),
+ // Offset (in bytes) of scaffold header string in '.hdr' file (DAM)
+ int flags; // QV of read + flags above (DB only)
} HITS_READ;
// A track can be of 3 types:
@@ -299,7 +266,7 @@ typedef struct
#define DB_NFILE "files = %9d\n" // number of files
#define DB_FDATA " %9d %s %s\n" // last read index + 1, fasta prolog, file name
#define DB_NBLOCK "blocks = %9d\n" // number of blocks
-#define DB_PARAMS "size = %9lld cutoff = %9d all = %1d\n" // block size, len cutoff, all in well
+#define DB_PARAMS "size = %10lld cutoff = %9d all = %1d\n" // block size, len cutoff, all in well
#define DB_BDATA " %9d %9d\n" // First read index (untrimmed), first read index (trimmed)
@@ -342,6 +309,10 @@ void Trim_DB(HITS_DB *db);
void Close_DB(HITS_DB *db);
+ // Return the size in bytes of the given DB
+
+int64 sizeof_DB(HITS_DB *db);
+
// If QV pseudo track is not already in db's track list, then load it and set it up.
// The database must not have been trimmed yet. -1 is returned if a .qvs file is not
// present, and 1 is returned if an error (reported to EPLACE) occured and INTERACTIVE
@@ -358,8 +329,15 @@ void Close_QVs(HITS_DB *db);
// 0: Track is for untrimmed DB
// -1: Track is not the right size of DB either trimmed or untrimmed
// -2: Could not find the track
+ // In addition, if opened (0 or 1 returned), then kind points at an integer indicating
+ // the type of track as follows:
+ // CUSTOM 0 => a custom track
+ // MASK 1 => a mask track
+
+#define CUSTOM_TRACK 0
+#define MASK_TRACK 1
-int Check_Track(HITS_DB *db, char *track);
+int Check_Track(HITS_DB *db, char *track, int *kind);
// If track is not already in the db's track list, then allocate all the storage for it,
// read it in from the appropriate file, add it to the track list, and return a pointer
@@ -427,11 +405,11 @@ int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii);
int Read_All_Sequences(HITS_DB *db, int ascii);
- // For the DB or DAM "path" = "prefix/root[.db|.dam]", find all the files for that DB, i.e. all
+ // For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. all
// those of the form "prefix/[.]root.part" and call actor with the complete path to each file
// pointed at by path, and the suffix of the path by extension. The . proceeds the root
// name if the defined constant HIDE_FILES is set. Always the first call is with the
- // path "prefix/root.db" and extension "db". There will always be calls for
+ // path "prefix/root.[db|dam]" and extension "db" or "dam". There will always be calls for
// "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and
// so this routine gives one a way to know all the tracks associated with a given DB.
// -1 is returned if the path could not be found, and 1 is returned if an error (reported
diff --git a/DB2fasta.c b/DB2fasta.c
index 5080f88..eb769be 100644
--- a/DB2fasta.c
+++ b/DB2fasta.c
@@ -1,40 +1,3 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/********************************************************************************************
*
* Recreate all the .fasta files that have been loaded into a specified database.
@@ -55,7 +18,6 @@ static char *Usage = "[-vU] [-w<int(80)>] <path:db>";
int main(int argc, char *argv[])
{ HITS_DB _db, *db = &_db;
FILE *dbfile;
- int nfiles;
int VERBOSE, UPPER, WIDTH;
// Process arguments
@@ -92,9 +54,10 @@ int main(int argc, char *argv[])
}
}
- // Open db
+ // Open db, and db stub file
{ int status;
+ char *pwd, *root;
status = Open_DB(argv[1],db);
if (status < 0)
@@ -107,9 +70,6 @@ int main(int argc, char *argv[])
{ fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
exit (1);
}
- }
-
- { char *pwd, *root;
pwd = PathTo(argv[1]);
root = Root(argv[1],".db");
@@ -120,23 +80,22 @@ int main(int argc, char *argv[])
exit (1);
}
- // nfiles = # of files in data base
-
- if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
- SYSTEM_ERROR
-
- // For each file do:
+ // For each cell do:
{ HITS_READ *reads;
+ char lname[MAX_NAME];
+ FILE *ofile = NULL;
+ int f, first, last, ofirst, nfiles;
char *read;
- int f, first;
+
+ if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
+ SYSTEM_ERROR
reads = db->reads;
read = New_Read_Buffer(db);
- first = 0;
+ first = ofirst = 0;
for (f = 0; f < nfiles; f++)
- { int i, last;
- FILE *ofile;
+ { int i;
char prolog[MAX_NAME], fname[MAX_NAME];
// Scan db image file line, create .fasta file for writing
@@ -144,12 +103,36 @@ int main(int argc, char *argv[])
if (fscanf(dbfile,DB_FDATA,&last,fname,prolog) != 3)
SYSTEM_ERROR
- if ((ofile = Fopen(Catenate(".","/",fname,".fasta"),"w")) == NULL)
- exit (1);
-
- if (VERBOSE)
- { fprintf(stderr,"Creating %s.fasta ...\n",fname);
- fflush(stdout);
+ if (f == 0 || strcmp(fname,lname) != 0)
+ { if (f > 0)
+ { if (ofile == stdout)
+ { fprintf(stderr," %d reads\n",first-ofirst);
+ fflush(stderr);
+ }
+ else
+ fclose(ofile);
+ }
+
+ if (strcmp(fname,"stdout") == 0)
+ { ofile = stdout;
+ ofirst = first;
+
+ if (VERBOSE)
+ { fprintf(stderr,"Sending to stdout ...");
+ fflush(stdout);
+ }
+ }
+ else
+ { if ((ofile = Fopen(Catenate(".","/",fname,".fasta"),"w")) == NULL)
+ exit (1);
+
+ if (VERBOSE)
+ { fprintf(stderr,"Creating %s.fasta ...\n",fname);
+ fflush(stdout);
+ }
+ }
+
+ strcpy(lname,fname);
}
// For the relevant range of reads, write each to the file
@@ -179,6 +162,15 @@ int main(int argc, char *argv[])
first = last;
}
+
+ if (f > 0)
+ { if (ofile == stdout)
+ { fprintf(stderr," %d reads\n",first-ofirst);
+ fflush(stderr);
+ }
+ else
+ fclose(ofile);
+ }
}
fclose(dbfile);
diff --git a/DB2quiva.c b/DB2quiva.c
index 63c0b91..a7b93da 100644
--- a/DB2quiva.c
+++ b/DB2quiva.c
@@ -1,40 +1,3 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/********************************************************************************************
*
* Recreate all the .quiva files that have been loaded into a specified database.
@@ -115,22 +78,23 @@ int main(int argc, char *argv[])
exit (1);
}
- // For each file do:
+ // For each cell do:
{ HITS_READ *reads;
- int f, first, nfiles;
+ char lname[MAX_NAME];
+ FILE *ofile = NULL;
+ int f, first, last, ofirst, nfiles;
QVcoding *coding;
char **entry;
if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
SYSTEM_ERROR
- entry = New_QV_Buffer(db);
reads = db->reads;
- first = 0;
+ entry = New_QV_Buffer(db);
+ first = ofirst = 0;
for (f = 0; f < nfiles; f++)
- { int i, last;
- FILE *ofile;
+ { int i;
char prolog[MAX_NAME], fname[MAX_NAME];
// Scan db image file line, create .quiva file for writing
@@ -140,19 +104,43 @@ int main(int argc, char *argv[])
if (fscanf(dbfile,DB_FDATA,&last,fname,prolog) != 3)
SYSTEM_ERROR
- if ((ofile = Fopen(Catenate(".","/",fname,".quiva"),"w")) == NULL)
- exit (1);
+ if (f == 0 || strcmp(fname,lname) != 0)
+ { if (f > 0)
+ { if (ofile == stdout)
+ { fprintf(stderr," %d quivas\n",first-ofirst);
+ fflush(stderr);
+ }
+ else
+ fclose(ofile);
+ }
- if (VERBOSE)
- { fprintf(stderr,"Creating %s.quiva ...\n",fname);
- fflush(stderr);
- }
+ if (strcmp(fname,"stdout") == 0)
+ { ofile = stdout;
+ ofirst = first;
- coding = Read_QVcoding(quiva);
+ if (VERBOSE)
+ { fprintf(stderr,"Sending to stdout ...");
+ fflush(stdout);
+ }
+ }
+ else
+ { if ((ofile = Fopen(Catenate(".","/",fname,".quiva"),"w")) == NULL)
+ exit (1);
+
+ if (VERBOSE)
+ { fprintf(stderr,"Creating %s.quiva ...\n",fname);
+ fflush(stderr);
+ }
+ }
+
+ strcpy(lname,fname);
+ }
// For the relevant range of reads, write the header for each to the file
// and then uncompress and write the quiva entry for each
+ coding = Read_QVcoding(quiva);
+
for (i = first; i < last; i++)
{ int e, flags, qv, rlen;
HITS_READ *r;
@@ -182,6 +170,15 @@ int main(int argc, char *argv[])
first = last;
}
+
+ if (f > 0)
+ { if (ofile == stdout)
+ { fprintf(stderr," %d quivas\n",first-ofirst);
+ fflush(stderr);
+ }
+ else
+ fclose(ofile);
+ }
}
fclose(quiva);
diff --git a/DBshow.c b/DBdump.c
similarity index 52%
copy from DBshow.c
copy to DBdump.c
index 703fb14..735e726 100644
--- a/DBshow.c
+++ b/DBdump.c
@@ -1,51 +1,9 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/*******************************************************************************************
*
- * Display a specified set of reads of a database in fasta format.
+ * Display a portion of the data-base and selected information in 1-code format.
*
* Author: Gene Myers
- * Date : September 2013
- * Mod : With DB overhaul, made this a routine strictly for printing a selected subset
- * and created DB2fasta for recreating all the fasta files of a DB
- * Date : April 2014
- * Mod : Added options to display QV streams
- * Date : July 2014
+ * Date : November 2015
*
********************************************************************************************/
@@ -63,8 +21,8 @@
#endif
static char *Usage[] =
- { "[-unqUQ] [-w<int(80)>] [-m<track>]+",
- " <path:db|dam> [ <reads:FILE> | <reads:range> ... ]"
+ { "[-rhsiqp] [-uU] [-m<track>]+",
+ " <path:db|dam> [ <reads:FILE> | <reads:range> ... ]"
};
#define LAST_READ_SYMBOL '$'
@@ -116,35 +74,55 @@ int next_read(File_Iterator *it)
return (0);
}
+static int qv_map[51] =
+ { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
+ 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
+ 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D',
+ 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
+ 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
+ 'Y'
+ };
+
+static int prof_map[41] =
+ { '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
+ 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
+ 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C',
+ 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
+ 'N',
+ };
+
int main(int argc, char *argv[])
{ HITS_DB _db, *db = &_db;
FILE *hdrs = NULL;
+ int64 *qv_idx = NULL;
+ uint8 *qv_val = NULL;
+ int64 *pf_idx = NULL;
+ uint8 *pf_val = NULL;
int nfiles;
char **flist = NULL;
int *findx = NULL;
- int reps, *pts;
int input_pts;
- File_Iterator *iter;
- FILE *input;
+ int reps = 0;
+ int *pts = NULL;
+ File_Iterator *iter = NULL;
+ FILE *input = NULL;
int TRIM, UPPER;
- int DOSEQ, DOQVS, QUIVA, DAM;
- int WIDTH;
+ int DORED, DOSEQ, DOQVS, DOHDR, DOIQV, DOPRF, DAM;
- int MMAX, MTOP;
- char **MASK;
+ int MMAX, MTOP;
+ char **MASK;
+ HITS_TRACK **MTRACK;
// Process arguments
{ int i, j, k;
int flags[128];
- char *eptr;
- ARG_INIT("DBshow")
+ ARG_INIT("DBdump")
- WIDTH = 80;
MTOP = 0;
MMAX = 10;
MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array");
@@ -156,10 +134,7 @@ int main(int argc, char *argv[])
if (argv[i][0] == '-')
switch (argv[i][1])
{ default:
- ARG_FLAGS("unqUQ")
- break;
- case 'w':
- ARG_NON_NEGATIVE(WIDTH,"Line width")
+ ARG_FLAGS("hpqrsiuU")
break;
case 'm':
if (MTOP >= MMAX)
@@ -179,21 +154,26 @@ int main(int argc, char *argv[])
TRIM = 1-flags['u'];
UPPER = 1+flags['U'];
DOQVS = flags['q'];
- DOSEQ = 1-flags['n'];
- QUIVA = flags['Q'];
- if (QUIVA && (!DOSEQ || MTOP > 0))
- { fprintf(stderr,"%s: -Q (quiva) format request inconsistent with -n and -m options\n",
- Prog_Name);
- exit (1);
- }
- if (QUIVA)
- DOQVS = 1;
+ DORED = flags['r'];
+ DOSEQ = flags['s'];
+ DOHDR = flags['h'];
+ DOIQV = flags['i'];
+ DOPRF = flags['p'];
if (argc <= 1)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]);
exit (1);
}
+
+ if ( ! TRIM && DOIQV)
+ { fprintf(stderr,"%s: -i and -u are incompatible\n",Prog_Name);
+ exit (1);
+ }
+ if ( ! TRIM && DOPRF)
+ { fprintf(stderr,"%s: -p and -u are incompatible\n",Prog_Name);
+ exit (1);
+ }
}
// Open DB or DAM, and if a DAM open also .hdr file
@@ -208,11 +188,13 @@ int main(int argc, char *argv[])
{ root = Root(argv[1],".dam");
pwd = PathTo(argv[1]);
+ if (db->part > 0)
+ *rindex(root,'.') = '\0';
hdrs = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"r");
if (hdrs == NULL)
exit (1);
- DAM = 1;
- if (QUIVA || DOQVS)
+ DAM = 1;
+ if (DOQVS)
{ fprintf(stderr,"%s: -Q and -q options not compatible with a .dam DB\n",Prog_Name);
exit (1);
}
@@ -228,23 +210,39 @@ int main(int argc, char *argv[])
{ if (Load_QVs(db) < 0)
{ fprintf(stderr,"%s: QVs requested, but no .qvs for data base\n",Prog_Name);
exit (1);
- }
+ }
}
// Check tracks and load tracks for untrimmed DB
- { int i, status;
+ { int i, status, kind;
+
+ MTRACK = Malloc(sizeof(HITS_TRACK *)*MTOP,"Allocation of track pointer vector");
+ if (MTRACK == NULL)
+ exit (1);
for (i = 0; i < MTOP; i++)
- { status = Check_Track(db,MASK[i]);
+ { status = Check_Track(db,MASK[i],&kind);
if (status == -2)
- printf("%s: Warning: -m%s option given but no track found.\n",Prog_Name,MASK[i]);
+ { fprintf(stderr,"%s: Warning: -m%s option given but no track found.\n",
+ Prog_Name,MASK[i]);
+ exit (1);
+ }
else if (status == -1)
- printf("%s: Warning: %s track not sync'd with db.\n",Prog_Name,MASK[i]);
+ { fprintf(stderr,"%s: Warning: %s track not sync'd with db.\n",Prog_Name,MASK[i]);
+ exit (1);
+ }
+ else if (kind != MASK_TRACK)
+ { fprintf(stderr,"%s: Warning: %s track is not a mask track.\n",Prog_Name,MASK[i]);
+ exit (1);
+ }
else if (status == 0)
- Load_Track(db,MASK[i]);
+ MTRACK[i] = Load_Track(db,MASK[i]);
else if (status == 1 && !TRIM)
- printf("%s: Warning: %s track is for a trimmed db but -u is set.\n",Prog_Name,MASK[i]);
+ { fprintf(stderr,"%s: Warning: %s track is for a trimmed db but -u is set.\n",
+ Prog_Name,MASK[i]);
+ exit (1);
+ }
}
}
@@ -321,21 +319,57 @@ int main(int argc, char *argv[])
}
if (TRIM)
- { int i, status;
+ { int i, status, kind;
Trim_DB(db);
// Load tracks for trimmed DB
for (i = 0; i < MTOP; i++)
- { status = Check_Track(db,MASK[i]);
+ { status = Check_Track(db,MASK[i],&kind);
if (status < 0)
continue;
else if (status == 1)
- Load_Track(db,MASK[i]);
+ MTRACK[i] = Load_Track(db,MASK[i]);
}
}
+ if (DOIQV)
+ { int status, kind;
+ HITS_TRACK *track;
+
+ status = Check_Track(db,"qual",&kind);
+ if (status == -2)
+ { fprintf(stderr,"%s: .qual-track does not exist for this db.\n",Prog_Name);
+ exit (1);
+ }
+ if (status == -1)
+ { fprintf(stderr,"%s: .qual-track not sync'd with db.\n",Prog_Name);
+ exit (1);
+ }
+ track = Load_Track(db,"qual");
+ qv_idx = (int64 *) track->anno;
+ qv_val = (uint8 *) track->data;
+ }
+
+ if (DOPRF)
+ { int status, kind;
+ HITS_TRACK *track;
+
+ status = Check_Track(db,"prof",&kind);
+ if (status == -2)
+ { fprintf(stderr,"%s: .prof-track does not exist for this db.\n",Prog_Name);
+ exit (1);
+ }
+ if (status == -1)
+ { fprintf(stderr,"%s: .prof-track not sync'd with db.\n",Prog_Name);
+ exit (1);
+ }
+ track = Load_Track(db,"prof");
+ pf_idx = (int64 *) track->anno;
+ pf_val = (uint8 *) track->data;
+ }
+
// Process read index arguments into a list of read ranges
input_pts = 0;
@@ -368,7 +402,7 @@ int main(int argc, char *argv[])
iter = init_file_iterator(input);
}
else
- { pts = (int *) Malloc(sizeof(int)*2*(argc-1),"Allocating read parameters");
+ { pts = (int *) Malloc(sizeof(int)*2*(argc-1),"Allocating read parameters");
if (pts == NULL)
exit (1);
@@ -422,40 +456,186 @@ int main(int argc, char *argv[])
}
}
+ // Scan to count the size of things
+
+ { HITS_READ *reads;
+ int c, b, e, i, m;
+ int map, substr;
+ int64 noreads;
+ int64 seqmax, seqtot;
+ int64 iqvmax, iqvtot;
+ int64 prfmax, prftot;
+ int64 hdrmax, hdrtot;
+ int64 trkmax[MTOP], trktot[MTOP];
+
+ map = 0;
+ reads = db->reads;
+ substr = 0;
+
+ noreads = 0;
+ seqmax = 0;
+ seqtot = 0;
+ iqvmax = 0;
+ iqvtot = 0;
+ prfmax = 0;
+ prftot = 0;
+ hdrmax = 0;
+ hdrmax = 0;
+ hdrtot = 0;
+ for (m = 0; m < MTOP; m++)
+ { trkmax[m] = 0;
+ trktot[m] = 0;
+ }
+
+ c = 0;
+ while (1)
+ { if (input_pts)
+ { if (next_read(iter))
+ break;
+ e = iter->read;
+ b = e-1;
+ substr = (iter->beg >= 0);
+ }
+ else
+ { if (c >= reps)
+ break;
+ b = pts[c]-1;
+ e = pts[c+1];
+ if (e > db->nreads)
+ e = db->nreads;
+ c += 2;
+ }
+
+ for (i = b; i < e; i++)
+ { int len, ten;
+ int fst, lst;
+ HITS_READ *r;
+
+ r = reads + i;
+ len = r->rlen;
+
+ noreads += 1;
+
+ if (DOHDR)
+ { int ten;
+
+ if (DAM)
+ { char header[MAX_NAME];
+
+ fseeko(hdrs,r->coff,SEEK_SET);
+ fgets(header,MAX_NAME,hdrs);
+ header[strlen(header)-1] = '\0';
+ ten = strlen(header);
+ }
+ else
+ { while (i < findx[map-1])
+ map -= 1;
+ while (i >= findx[map])
+ map += 1;
+ ten = strlen(flist[map]);
+ }
+ if (hdrmax < ten)
+ hdrmax = ten;
+ hdrtot += ten;
+ }
+
+ for (m = 0; m < MTOP; m++)
+ { int64 *anno;
+
+ anno = (int64 *) MTRACK[m]->anno;
+ ten = ((anno[i+1]-anno[i]) >> 3);
+ if (ten > trkmax[m])
+ trkmax[m] = ten;
+ trktot[m] += ten;
+ }
+
+ if (substr)
+ { fst = iter->beg;
+ lst = iter->end;
+ if (DOIQV)
+ { fprintf(stderr,"%s: Cannot select subreads when -i is requested\n",Prog_Name);
+ exit (1);
+ }
+ if (DOPRF)
+ { fprintf(stderr,"%s: Cannot select subreads when -p is requested\n",Prog_Name);
+ exit (1);
+ }
+ }
+ else
+ { fst = 0;
+ lst = len;
+ }
+
+ if (DOSEQ | DOQVS)
+ { int ten = lst-fst;
+ if (ten > seqmax)
+ seqmax = ten;
+ seqtot += ten;
+ }
+ if (DOIQV)
+ { int ten = qv_idx[i+1] - qv_idx[i];
+ if (ten > iqvmax)
+ iqvmax = ten;
+ iqvtot += ten;
+ }
+ if (DOPRF)
+ { int ten = pf_idx[i+1] - pf_idx[i];
+ if (ten > prfmax)
+ prfmax = ten;
+ prftot += ten;
+ }
+ }
+ }
+
+ printf("+ R %lld\n",noreads);
+ printf("+ M %d\n",MTOP);
+ if (DOHDR)
+ { printf("+ H %lld\n",hdrtot);
+ printf("@ H %lld\n",hdrmax);
+ }
+ for (m = 0; m < MTOP; m++)
+ { printf("+ T%d %lld\n",m,trktot[m]);
+ printf("@ T%d %lld\n",m,trkmax[m]);
+ }
+ if (DOSEQ | DOQVS)
+ { printf("+ S %lld\n",seqtot);
+ printf("@ S %lld\n",seqmax);
+ }
+ if (DOIQV)
+ { printf("+ I %lld\n",iqvtot);
+ printf("@ I %lld\n",iqvmax);
+ }
+ if (DOPRF)
+ { printf("+ P %lld\n",prftot);
+ printf("@ P %lld\n",prfmax);
+ }
+ }
+
// Display each read (and/or QV streams) in the active DB according to the
// range pairs in pts[0..reps) and according to the display options.
{ HITS_READ *reads;
- HITS_TRACK *first;
char *read, **entry;
- int c, b, e, i;
- int hilight, substr;
+ int c, b, e, i, m;
+ int substr;
int map;
- int (*iscase)(int);
+ char qvname[5] = { 'd', 'c', 'i', 'm', 's' };
read = New_Read_Buffer(db);
if (DOQVS)
- { entry = New_QV_Buffer(db);
- first = db->tracks->next;
- }
+ entry = New_QV_Buffer(db);
else
- { entry = NULL;
- first = db->tracks;
- }
-
- if (UPPER == 1)
- { hilight = 'A'-'a';
- iscase = islower;
- }
- else
- { hilight = 'a'-'A';
- iscase = isupper;
- }
+ entry = NULL;
map = 0;
reads = db->reads;
substr = 0;
+ if (input_pts)
+ iter = init_file_iterator(input);
+ else
+ iter = NULL;
+
c = 0;
while (1)
{ if (input_pts)
@@ -480,65 +660,57 @@ int main(int argc, char *argv[])
int fst, lst;
int flags, qv;
HITS_READ *r;
- HITS_TRACK *track;
r = reads + i;
len = r->rlen;
+ if (DORED)
+ printf("R %d\n",i+1);
flags = r->flags;
qv = (flags & DB_QV);
- if (DAM)
- { char header[MAX_NAME];
-
- fseeko(hdrs,r->coff,SEEK_SET);
- fgets(header,MAX_NAME,hdrs);
- header[strlen(header)-1] = '\0';
- printf("%s :: Contig %d[%d,%d]",header,r->origin,r->fpulse,r->fpulse+len);
- }
- else
- { while (i < findx[map-1])
- map -= 1;
- while (i >= findx[map])
- map += 1;
- if (QUIVA)
- printf("@%s/%d/%d_%d",flist[map],r->origin,r->fpulse,r->fpulse+len);
+ if (DOHDR)
+ { if (DAM)
+ { char header[MAX_NAME];
+
+ fseeko(hdrs,r->coff,SEEK_SET);
+ fgets(header,MAX_NAME,hdrs);
+ header[strlen(header)-1] = '\0';
+ printf("H %ld %s\n",strlen(header),header);
+ printf("L %d %d %d\n",r->origin,r->fpulse,r->fpulse+len);
+ }
else
- printf(">%s/%d/%d_%d",flist[map],r->origin,r->fpulse,r->fpulse+len);
- if (qv > 0)
- printf(" RQ=0.%3d",qv);
+ { while (i < findx[map-1])
+ map -= 1;
+ while (i >= findx[map])
+ map += 1;
+ printf("H %ld %s\n",strlen(flist[map]),flist[map]);
+ printf("L %d %d %d\n",r->origin,r->fpulse,r->fpulse+len);
+ if (qv > 0)
+ printf("Q: %d\n",qv);
+ }
}
- printf("\n");
if (DOQVS)
Load_QVentry(db,i,entry,UPPER);
if (DOSEQ)
Load_Read(db,i,read,UPPER);
- for (track = first; track != NULL; track = track->next)
+ for (m = 0; m < MTOP; m++)
{ int64 *anno;
int *data;
int64 s, f, j;
- int bd, ed, m;
- anno = (int64 *) track->anno;
- data = (int *) track->data;
+ anno = (int64 *) MTRACK[m]->anno;
+ data = (int *) MTRACK[m]->data;
s = (anno[i] >> 2);
f = (anno[i+1] >> 2);
+ printf("T%d %lld ",m,(f-s)/2);
if (s < f)
{ for (j = s; j < f; j += 2)
- { bd = data[j];
- ed = data[j+1];
- if (DOSEQ)
- for (m = bd; m < ed; m++)
- if (iscase(read[m]))
- read[m] = (char) (read[m] + hilight);
- if (j == s)
- printf("> %s:",track->name);
- printf(" [%d,%d]",bd,ed);
- }
- printf("\n");
+ printf(" %d %d",data[j],data[j+1]);
}
+ printf("\n");
}
if (substr)
@@ -550,39 +722,39 @@ int main(int argc, char *argv[])
lst = len;
}
- if (QUIVA)
+ if (DOSEQ)
+ { printf("S %d ",lst-fst);
+ printf("%.*s\n",lst-fst,read+fst);
+ }
+
+ if (DOIQV)
+ { int64 k, e;
+
+ k = qv_idx[i];
+ e = qv_idx[i+1];
+ printf("I %lld ",e-k);
+ while (k < e)
+ putchar(qv_map[qv_val[k++]]);
+ printf("\n");
+ }
+
+ if (DOPRF)
+ { int64 k, e;
+
+ k = pf_idx[i];
+ e = pf_idx[i+1];
+ printf("P %lld ",e-k);
+ while (k < e)
+ putchar(prof_map[pf_val[k++]]);
+ printf("\n");
+ }
+
+ if (DOQVS)
{ int k;
for (k = 0; k < 5; k++)
- printf("%.*s\n",lst-fst,entry[k]+fst);
- }
- else
- { if (DOQVS)
- { int j, k;
-
- printf("\n");
- for (j = fst; j+WIDTH < lst; j += WIDTH)
- { if (DOSEQ)
- printf("%.*s\n",WIDTH,read+j);
- for (k = 0; k < 5; k++)
- printf("%.*s\n",WIDTH,entry[k]+j);
- printf("\n");
- }
- if (j < lst)
- { if (DOSEQ)
- printf("%.*s\n",lst-j,read+j);
- for (k = 0; k < 5; k++)
- printf("%.*s\n",lst-j,entry[k]+j);
- printf("\n");
- }
- }
- else if (DOSEQ)
- { int j;
-
- for (j = fst; j+WIDTH < lst; j += WIDTH)
- printf("%.*s\n",WIDTH,read+j);
- if (j < lst)
- printf("%.*s\n",lst-j,read+j);
+ { printf("%c %d ",qvname[k],lst-fst);
+ printf("%.*s\n",lst-fst,entry[k]+fst);
}
}
}
diff --git a/DBdust.c b/DBdust.c
index a63ddd4..3b75400 100644
--- a/DBdust.c
+++ b/DBdust.c
@@ -1,40 +1,3 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/*******************************************************************************************
*
* My implementation of the SDUST algorithm (Morgulis et al., JCB 13, 5 (2006), 1028-1040)
@@ -155,7 +118,7 @@ int main(int argc, char *argv[])
pwd = PathTo(argv[1]);
root = Root(argv[1],".db");
- size = 8;
+ size = 0;
fname = Catenate(pwd,PATHSEP,root,".dust.anno");
if ((afile = fopen(fname,"r+")) == NULL || db->part > 0)
diff --git a/DBrm.c b/DBrm.c
index 390d912..6bad19b 100644
--- a/DBrm.c
+++ b/DBrm.c
@@ -1,40 +1,3 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/********************************************************************************************
*
* Remove a list of .db databases
diff --git a/DBshow.c b/DBshow.c
index 703fb14..d4346a5 100644
--- a/DBshow.c
+++ b/DBshow.c
@@ -1,40 +1,3 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/*******************************************************************************************
*
* Display a specified set of reads of a database in fasta format.
@@ -126,7 +89,7 @@ int main(int argc, char *argv[])
int reps, *pts;
int input_pts;
- File_Iterator *iter;
+ File_Iterator *iter = NULL;
FILE *input;
int TRIM, UPPER;
@@ -208,10 +171,12 @@ int main(int argc, char *argv[])
{ root = Root(argv[1],".dam");
pwd = PathTo(argv[1]);
+ if (db->part > 0)
+ *rindex(root,'.') = '\0';
hdrs = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"r");
if (hdrs == NULL)
exit (1);
- DAM = 1;
+ DAM = 1;
if (QUIVA || DOQVS)
{ fprintf(stderr,"%s: -Q and -q options not compatible with a .dam DB\n",Prog_Name);
exit (1);
@@ -233,14 +198,16 @@ int main(int argc, char *argv[])
// Check tracks and load tracks for untrimmed DB
- { int i, status;
+ { int i, status, kind;
for (i = 0; i < MTOP; i++)
- { status = Check_Track(db,MASK[i]);
+ { status = Check_Track(db,MASK[i],&kind);
if (status == -2)
printf("%s: Warning: -m%s option given but no track found.\n",Prog_Name,MASK[i]);
else if (status == -1)
printf("%s: Warning: %s track not sync'd with db.\n",Prog_Name,MASK[i]);
+ else if (kind != MASK_TRACK)
+ printf("%s: Warning: %s track is not a mask track.\n",Prog_Name,MASK[i]);
else if (status == 0)
Load_Track(db,MASK[i]);
else if (status == 1 && !TRIM)
@@ -321,17 +288,17 @@ int main(int argc, char *argv[])
}
if (TRIM)
- { int i, status;
+ { int i, status, kind;
Trim_DB(db);
// Load tracks for trimmed DB
for (i = 0; i < MTOP; i++)
- { status = Check_Track(db,MASK[i]);
+ { status = Check_Track(db,MASK[i],&kind);
if (status < 0)
continue;
- else if (status == 1)
+ else if (status == 1 && kind == MASK_TRACK)
Load_Track(db,MASK[i]);
}
}
diff --git a/DBsplit.c b/DBsplit.c
index 6961228..6a218ec 100644
--- a/DBsplit.c
+++ b/DBsplit.c
@@ -1,40 +1,3 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/*******************************************************************************************
*
* Split a .db into a set of sub-database blocks for use by the Dazzler:
@@ -69,7 +32,7 @@
#define PATHSEP "/"
#endif
-static char *Usage = "[-a] [-x<int>] [-s<int(200)>] <path:db|dam>";
+static char *Usage = "[-af] [-x<int>] [-s<float(200.)>] <path:db|dam>";
int main(int argc, char *argv[])
{ HITS_DB db, dbs;
@@ -77,38 +40,46 @@ int main(int argc, char *argv[])
FILE *dbfile, *ixfile;
int status;
+ int FORCE;
int ALL;
int CUTOFF;
- int SIZE;
+ int64 SIZE;
{ int i, j, k;
int flags[128];
char *eptr;
+ float size;
ARG_INIT("DBsplit")
CUTOFF = 0;
- SIZE = 200;
+ size = 200;
j = 1;
for (i = 1; i < argc; i++)
if (argv[i][0] == '-')
switch (argv[i][1])
{ default:
- ARG_FLAGS("a")
+ ARG_FLAGS("af")
break;
case 'x':
ARG_NON_NEGATIVE(CUTOFF,"Min read length cutoff")
break;
case 's':
- ARG_POSITIVE(SIZE,"Block size")
+ ARG_REAL(size)
+ if (size <= 0.)
+ { fprintf(stderr,"%s: Block size must be a positive number\n",Prog_Name);
+ exit (1);
+ }
break;
}
else
argv[j++] = argv[i];
argc = j;
- ALL = flags['a'];
+ SIZE = size*1000000ll;
+ ALL = flags['a'];
+ FORCE = flags['f'];
if (argc != 2)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
@@ -155,7 +126,7 @@ int main(int argc, char *argv[])
if (fread(&dbs,sizeof(HITS_DB),1,ixfile) != 1)
SYSTEM_ERROR
- if (dbs.cutoff >= 0)
+ if (dbs.cutoff >= 0 && !FORCE)
{ printf("You are about to overwrite the current partition settings. This\n");
printf("will invalidate any tracks, overlaps, and other derivative files.\n");
printf("Are you sure you want to proceed? [Y/N] ");
@@ -173,16 +144,15 @@ int main(int argc, char *argv[])
dbpos = ftello(dbfile);
fseeko(dbfile,dbpos,SEEK_SET);
fprintf(dbfile,DB_NBLOCK,0);
- fprintf(dbfile,DB_PARAMS,(int64) SIZE,CUTOFF,ALL);
+ fprintf(dbfile,DB_PARAMS,SIZE,CUTOFF,ALL);
}
{ HITS_READ *reads = db.reads;
int nreads = db.ureads;
- int64 size, totlen;
+ int64 totlen;
int nblock, ireads, treads, rlen, fno;
int i;
- size = SIZE*1000000ll;
nblock = 0;
totlen = 0;
@@ -196,7 +166,7 @@ int main(int argc, char *argv[])
{ ireads += 1;
treads += 1;
totlen += rlen;
- if (totlen >= size)
+ if (totlen >= SIZE)
{ fprintf(dbfile,DB_BDATA,i+1,treads);
totlen = 0;
ireads = 0;
@@ -211,7 +181,7 @@ int main(int argc, char *argv[])
{ ireads += 1;
treads += 1;
totlen += rlen;
- if (totlen >= size)
+ if (totlen >= SIZE)
{ fprintf(dbfile,DB_BDATA,i+1,treads);
totlen = 0;
ireads = 0;
diff --git a/DBstats.c b/DBstats.c
index 542b77b..b0e46c2 100644
--- a/DBstats.c
+++ b/DBstats.c
@@ -1,40 +1,3 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/*******************************************************************************************
*
* Display statistics about the contents of a .db and a histogram of its read lengths.
@@ -116,7 +79,7 @@ int main(int argc, char *argv[])
}
}
- { int i, status;
+ { int i, status, kind;
// Open .db or .dam
@@ -128,11 +91,13 @@ int main(int argc, char *argv[])
// Check tracks and load tracks for untrimmed DB
for (i = 0; i < MTOP; i++)
- { status = Check_Track(db,MASK[i]);
+ { status = Check_Track(db,MASK[i],&kind);
if (status == -2)
fprintf(stderr,"%s: Warning: -m%s option given but no track found.\n",Prog_Name,MASK[i]);
else if (status == -1)
fprintf(stderr,"%s: Warning: %s track not sync'd with db.\n",Prog_Name,MASK[i]);
+ else if (kind != MASK_TRACK)
+ fprintf(stderr,"%s: Warning: %s track is not a mask track.\n",Prog_Name,MASK[i]);
else if (status == 0)
Load_Track(db,MASK[i]);
else if (status == 1 && !TRIM)
@@ -149,7 +114,7 @@ int main(int argc, char *argv[])
// Load tracks for trimmed DB
for (i = 0; i < MTOP; i++)
- { status = Check_Track(db,MASK[i]);
+ { status = Check_Track(db,MASK[i],&kind);
if (status < 0)
continue;
else if (status == 1)
@@ -161,7 +126,6 @@ int main(int argc, char *argv[])
{ int i;
int64 totlen;
int nreads, maxlen;
- int64 ave, dev;
HITS_READ *reads;
nreads = db->nreads;
@@ -169,7 +133,7 @@ int main(int argc, char *argv[])
maxlen = db->maxlen;
reads = db->reads;
- nbin = (maxlen-1)/BIN + 1;
+ nbin = maxlen/BIN + 1;
hist = (int *) Malloc(sizeof(int)*nbin,"Allocating histograms");
bsum = (int64 *) Malloc(sizeof(int64)*nbin,"Allocating histograms");
if (hist == NULL || bsum == NULL)
@@ -186,15 +150,6 @@ int main(int argc, char *argv[])
bsum[rlen/BIN] += rlen;
}
- nbin = (maxlen-1)/BIN + 1;
- ave = totlen/nreads;
- dev = 0;
- for (i = 0; i < nreads; i++)
- { int rlen = reads[i].rlen;
- dev += (rlen-ave)*(rlen-ave);
- }
- dev = (int64) sqrt((1.*dev)/nreads);
-
if (dam)
printf("\nStatistics for all contigs");
else if (db->all || !TRIM)
@@ -219,7 +174,10 @@ int main(int argc, char *argv[])
if (TRIM)
{ printf(" out of ");
Print_Number((int64 ) oreads,15,stdout);
- printf(" (%5.1f%%)",(100.*nreads)/oreads);
+ if (oreads <= 0)
+ printf(" (100.0%%)");
+ else
+ printf(" (%5.1f%%)",(100.*nreads)/oreads);
}
printf("\n");
@@ -228,17 +186,40 @@ int main(int argc, char *argv[])
if (TRIM)
{ printf(" out of ");
Print_Number(ototal,15,stdout);
- printf(" (%5.1f%%)",(100.*totlen)/ototal);
+ if (ototal <= 0)
+ printf(" (100.0%%)");
+ else
+ printf(" (%5.1f%%)",(100.*totlen)/ototal);
}
printf("\n\n");
- Print_Number(ave,15,stdout);
- if (dam)
- printf(" average contig length\n");
- else
- { printf(" average read length\n");
- Print_Number(dev,15,stdout);
- printf(" standard deviation\n");
+ if (nreads > 0)
+ { int64 ave, dev;
+
+ ave = totlen/nreads;
+ Print_Number(ave,15,stdout);
+ if (dam)
+ printf(" average contig length\n");
+ else
+ { printf(" average read length\n");
+
+ dev = 0;
+ for (i = 0; i < nreads; i++)
+ { int rlen = reads[i].rlen;
+ dev += (rlen-ave)*(rlen-ave);
+ }
+ dev = (int64) sqrt((1.*dev)/nreads);
+
+ Print_Number(dev,15,stdout);
+ printf(" standard deviation\n");
+ }
+ }
+
+ if (totlen <= 0)
+ { free(hist);
+ free(bsum);
+ Close_DB(db);
+ exit (0);
}
printf("\n Base composition: %.3f(A) %.3f(C) %.3f(G) %.3f(T)\n",
@@ -246,7 +227,7 @@ int main(int argc, char *argv[])
if (!NONE)
{ int64 btot;
- int cum, skip;
+ int cum, skip, avg;
printf("\n Distribution of Read Lengths (Bin size = ");
Print_Number((int64) BIN,0,stdout);
@@ -264,8 +245,11 @@ int main(int argc, char *argv[])
{ Print_Number((int64) (i*BIN),11,stdout);
printf(":");
Print_Number((int64) hist[i],11,stdout);
- printf(" %5.1f %5.1f %9lld\n",(100.*cum)/nreads,
- (100.*btot)/totlen,btot/cum);
+ if (cum > 0)
+ avg = btot/cum;
+ else
+ avg = 0;
+ printf(" %5.1f %5.1f %9d\n",(100.*cum)/nreads,(100.*btot)/totlen,avg);
}
if (cum == nreads) break;
}
@@ -274,14 +258,14 @@ int main(int argc, char *argv[])
{ int64 totlen;
int numint, maxlen;
- int64 ave, dev;
HITS_TRACK *track;
for (track = db->tracks; track != NULL; track = track->next)
{ char *data = track->data;
int64 *anno = (int64 *) track->anno;
- int k, rlen;
int *idata, *edata;
+ int64 ave, dev, btot;
+ int k, rlen, cum;
totlen = 0;
numint = 0;
@@ -297,14 +281,23 @@ int main(int argc, char *argv[])
}
}
- nbin = (maxlen-1)/BIN + 1;
+ printf("\n\nStatistics for %s-track\n",track->name);
+
+ printf("\n There are ");
+ Print_Number(numint,0,stdout);
+ printf(" intervals totaling ");
+ Print_Number(totlen,0,stdout);
+ printf(" bases (%.1f%% of all data)\n",(100.*totlen)/db->totlen);
+
+ if (numint <= 0) continue;
+ nbin = maxlen/BIN + 1;
for (k = 0; k < nbin; k++)
{ hist[k] = 0;
bsum[k] = 0;
}
- ave = totlen/numint;
+ ave = totlen/numint;
dev = 0;
for (k = 0; k < db->nreads; k++)
{ edata = (int *) (data + anno[k+1]);
@@ -317,36 +310,31 @@ int main(int argc, char *argv[])
}
dev = (int64) sqrt((1.*dev)/numint);
- printf("\n\nStatistics for %s-track\n",track->name);
+ printf("\n");
+ Print_Number(ave,15,stdout);
+ printf(" average interval length\n");
+ Print_Number(dev,15,stdout);
+ printf(" standard deviation\n");
- printf("\n There are ");
- Print_Number(numint,0,stdout);
- printf(" intervals totaling ");
- Print_Number(totlen,0,stdout);
- printf(" bases (%.1f%% of all data)\n",(100.*totlen)/db->totlen);
- { int64 btot;
- int cum;
-
- printf("\n Distribution of %s intervals (Bin size = ",track->name);
- Print_Number((int64) BIN,0,stdout);
- printf(")\n\n Bin: Count %% Intervals %% Bases Average\n");
- cum = 0;
- btot = 0;
- for (k = nbin-1; k >= 0; k--)
- { cum += hist[k];
- btot += bsum[k];
- if (hist[k] > 0)
- { Print_Number((int64) (k*BIN),11,stdout);
- printf(":");
- Print_Number((int64) hist[k],11,stdout);
- printf(" %5.1f %5.1f %9lld\n",(100.*cum)/numint,
- (100.*btot)/totlen,btot/cum);
- if (cum == numint) break;
- }
- }
- printf("\n");
- }
+ printf("\n Distribution of %s intervals (Bin size = ",track->name);
+ Print_Number((int64) BIN,0,stdout);
+ printf(")\n\n Bin: Count %% Intervals %% Bases Average\n");
+ cum = 0;
+ btot = 0;
+ for (k = nbin-1; k >= 0; k--)
+ { cum += hist[k];
+ btot += bsum[k];
+ if (hist[k] > 0)
+ { Print_Number((int64) (k*BIN),11,stdout);
+ printf(":");
+ Print_Number((int64) hist[k],11,stdout);
+ printf(" %5.1f %5.1f %9lld\n",(100.*cum)/numint,
+ (100.*btot)/totlen,btot/cum);
+ if (cum == numint) break;
+ }
+ }
+ printf("\n");
}
}
diff --git a/DBupgrade.Dec.31.2014.c b/DBupgrade.Dec.31.2014.c
deleted file mode 100644
index 05513a4..0000000
--- a/DBupgrade.Dec.31.2014.c
+++ /dev/null
@@ -1,115 +0,0 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
-/*******************************************************************************************
- *
- * Interim code: upgrade previous db to have fpulse,rlen fields
- *
- * Author: Gene Myers
- * Date : December 2014
- *
- ********************************************************************************************/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-
-#include "DB.h"
-
-#ifdef HIDE_FILES
-#define PATHSEP "/."
-#else
-#define PATHSEP "/"
-#endif
-
-typedef struct
- { int origin; // Well #
- int beg; // First pulse
- int end; // Last pulse
- int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of
- // uncompressed bases in memory block
- int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file
- int flags; // QV of read + flags above
- } HITS_OLD;
-
-int main(int argc, char *argv[])
-{ HITS_DB db;
- FILE *nxfile, *ixfile;
- char *pwd, *root;
- int i;
-
- if (argc != 2)
- { fprintf(stderr,"Usage: %s <path:db>\n",argv[0]);
- exit (1);
- }
-
- pwd = PathTo(argv[1]);
- root = Root(argv[1],".db");
- ixfile = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r");
- nxfile = Fopen(Catenate(pwd,PATHSEP,root,".ndx"),"w");
- if (ixfile == NULL || nxfile == NULL)
- exit (1);
- free(pwd);
- free(root);
-
- if (fread(&db,sizeof(HITS_DB),1,ixfile) != 1)
- SYSTEM_ERROR
- fwrite(&db,sizeof(HITS_DB),1,nxfile);
-
- for (i = 0; i < db.oreads; i++)
- { HITS_OLD orec;
- HITS_READ nrec;
-
- if (fread(&orec,sizeof(HITS_OLD),1,ixfile) != 1)
- SYSTEM_ERROR
-
- nrec.origin = orec.origin;
- nrec.fpulse = orec.beg;
- nrec.rlen = orec.end-orec.beg;
- nrec.boff = orec.boff;
- nrec.coff = orec.coff;
- nrec.flags = orec.flags;
-
- fwrite(&nrec,sizeof(HITS_READ),1,nxfile);
- }
-
- fclose(ixfile);
- fclose(nxfile);
-
- exit (0);
-}
diff --git a/DBupgrade.Sep.25.2014.c b/DBupgrade.Sep.25.2014.c
deleted file mode 100644
index 70bbe16..0000000
--- a/DBupgrade.Sep.25.2014.c
+++ /dev/null
@@ -1,125 +0,0 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
-/*******************************************************************************************
- *
- * Interim code: upgrade previous db to have int's for pulse positions.
- *
- * Author: Gene Myers
- * Date : September 2014
- *
- ********************************************************************************************/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-
-#include "DB.h"
-
-#ifdef HIDE_FILES
-#define PATHSEP "/."
-#else
-#define PATHSEP "/"
-#endif
-
-typedef struct
- { int origin; // Well #
- uint16 beg; // First pulse
- uint16 end; // Last pulse
- int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of
- // uncompressed bases in memory block
- int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file
- int flags; // QV of read + flags above
- } HITS_OLD;
-
-typedef struct
- { int origin; // Well #
- int beg; // First pulse
- int end; // Last pulse
- int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of
- // uncompressed bases in memory block
- int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file
- int flags; // QV of read + flags above
- } HITS_NEW;
-
-int main(int argc, char *argv[])
-{ HITS_DB db;
- FILE *nxfile, *ixfile;
- char *pwd, *root;
- int i;
-
- if (argc != 2)
- { fprintf(stderr,"Usage: %s <path:db>\n",argv[0]);
- exit (1);
- }
-
- pwd = PathTo(argv[1]);
- root = Root(argv[1],".db");
- ixfile = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r");
- nxfile = Fopen(Catenate(pwd,PATHSEP,root,".ndx"),"w");
- if (ixfile == NULL || nxfile == NULL)
- exit (1);
- free(pwd);
- free(root);
-
- if (fread(&db,sizeof(HITS_DB),1,ixfile) != 1)
- SYSTEM_ERROR
- fwrite(&db,sizeof(HITS_DB),1,nxfile);
-
- for (i = 0; i < db.oreads; i++)
- { HITS_OLD orec;
- HITS_NEW nrec;
-
- if (fread(&orec,sizeof(HITS_OLD),1,ixfile) != 1)
- SYSTEM_ERROR
-
- nrec.origin = orec.origin;
- nrec.beg = orec.beg;
- nrec.end = orec.end;
- nrec.boff = orec.boff;
- nrec.coff = orec.coff;
- nrec.flags = orec.flags;
-
- fwrite(&nrec,sizeof(HITS_NEW),1,nxfile);
- }
-
- fclose(ixfile);
- fclose(nxfile);
-
- exit (0);
-}
diff --git a/DUSTupgrade.Jan.1.2015.c b/DUSTupgrade.Jan.1.2015.c
deleted file mode 100644
index c53a66f..0000000
--- a/DUSTupgrade.Jan.1.2015.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
-/*******************************************************************************************
- *
- * Interim code: upgrade previous db to have fpulse,rlen fields
- *
- * Author: Gene Myers
- * Date : December 2014
- *
- ********************************************************************************************/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-
-#include "DB.h"
-
-#ifdef HIDE_FILES
-#define PATHSEP "/."
-#else
-#define PATHSEP "/"
-#endif
-
-int main(int argc, char *argv[])
-{ FILE *afile, *dfile;
- FILE *nafile, *ndfile;
- char *pwd, *root;
- int size, tracklen;
- int i, vint, dint;
- int64 vlong;
-
- if (argc != 2)
- { fprintf(stderr,"Usage: %s <path:db>\n",argv[0]);
- exit (1);
- }
-
- pwd = PathTo(argv[1]);
- root = Root(argv[1],".db");
- afile = Fopen(Catenate(pwd,PATHSEP,root,".dust.anno"),"r");
- dfile = Fopen(Catenate(pwd,PATHSEP,root,".dust.data"),"r");
- nafile = Fopen(Catenate(pwd,PATHSEP,root,".next.anno"),"w");
- ndfile = Fopen(Catenate(pwd,PATHSEP,root,".next.data"),"w");
- if (afile == NULL || dfile == NULL || nafile == NULL || ndfile == NULL)
- exit (1);
- free(pwd);
- free(root);
-
- if (fread(&tracklen,sizeof(int),1,afile) != 1)
- SYSTEM_ERROR
- fwrite(&tracklen,sizeof(int),1,nafile);
-
- if (fread(&size,sizeof(int),1,afile) != 1)
- SYSTEM_ERROR
- size = 8;
- fwrite(&size,sizeof(int),1,nafile);
-
- for (i = 0; i <= tracklen; i++)
- { if (fread(&vint,sizeof(int),1,afile) != 1)
- SYSTEM_ERROR
- vlong = vint;
- fwrite(&vlong,sizeof(int64),1,nafile);
- }
-
- vint >>= 2;
- for (i = 0; i < vint; i += 2)
- { if (fread(&dint,sizeof(int),1,dfile) != 1)
- SYSTEM_ERROR
- fwrite(&dint,sizeof(int),1,ndfile);
- if (fread(&dint,sizeof(int),1,dfile) != 1)
- SYSTEM_ERROR
- dint += 1;
- fwrite(&dint,sizeof(int),1,ndfile);
- }
-
- fclose(nafile);
- fclose(ndfile);
- fclose(afile);
- fclose(dfile);
-
- exit (0);
-}
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..9aa819c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,34 @@
+
+ Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ · Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ · Redistributions in binary form must reproduce the above copyright notice, this
+ list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+ · The name of EWM may not be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ For any issues regarding this software and its use, contact EWM at:
+
+ Eugene W. Myers Jr.
+ Bautzner Str. 122e
+ 01099 Dresden
+ GERMANY
+ Email: gene.myers at gmail.com
+
diff --git a/Makefile b/Makefile
index d42eeae..cf0afb8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,9 @@
-CFLAGS = -O3 -Wall -Wextra -fno-strict-aliasing
+DEST_DIR = ~/bin
+
+CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing
ALL = fasta2DB DB2fasta quiva2DB DB2quiva DBsplit DBdust Catrack DBshow DBstats DBrm simulator \
- fasta2DAM DAM2fasta
+ fasta2DAM DAM2fasta DBdump rangen
all: $(ALL)
@@ -12,7 +14,7 @@ DB2fasta: DB2fasta.c DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o DB2fasta DB2fasta.c DB.c QV.c -lm
quiva2DB: quiva2DB.c DB.c DB.h QV.c QV.h
- gcc $(CFLAGS) -o quiva2DB quiva2DB.c DB.c QV.c -lm
+ gcc $(CFLAGS) -DINTERACTIVE -o quiva2DB quiva2DB.c DB.c QV.c -lm
DB2quiva: DB2quiva.c DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o DB2quiva DB2quiva.c DB.c QV.c -lm
@@ -29,6 +31,9 @@ Catrack: Catrack.c DB.c DB.h QV.c QV.h
DBshow: DBshow.c DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o DBshow DBshow.c DB.c QV.c -lm
+DBdump: DBdump.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DBdump DBdump.c DB.c QV.c -lm
+
DBstats: DBstats.c DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o DBstats DBstats.c DB.c QV.c -lm
@@ -38,30 +43,23 @@ DBrm: DBrm.c DB.c DB.h QV.c QV.h
simulator: simulator.c DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o simulator simulator.c DB.c QV.c -lm
+rangen: rangen.c
+ gcc $(CFLAGS) -o rangen rangen.c
+
fasta2DAM: fasta2DAM.c DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o fasta2DAM fasta2DAM.c DB.c QV.c -lm
DAM2fasta: DAM2fasta.c DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o DAM2fasta DAM2fasta.c DB.c QV.c -lm
-DBupgrade.Sep.25.2014: DBupgrade.Sep.25.2014.c DB.c DB.h QV.c QV.h
- gcc $(CFLAGS) -o DBupgrade.Sep.25.2014 DBupgrade.Sep.25.2014.c DB.c QV.c -lm
-
-DBupgrade.Dec.31.2014: DBupgrade.Dec.31.2014.c DB.c DB.h QV.c QV.h
- gcc $(CFLAGS) -o DBupgrade.Dec.31.2014 DBupgrade.Dec.31.2014.c DB.c QV.c -lm
-
-DUSTupgrade.Jan.1.2015: DUSTupgrade.Jan.1.2015.c DB.c DB.h QV.c QV.h
- gcc $(CFLAGS) -o DUSTupgrade.Jan.1.2015 DUSTupgrade.Jan.1.2015.c DB.c QV.c -lm
-
clean:
rm -f $(ALL)
rm -fr *.dSYM
- rm -f DBupgrade.Sep.25.2014 DBupgrade.Dec.31.2014 DUSTupgrade.Jan.1.2015
rm -f dazz.db.tar.gz
install:
- cp $(ALL) ~/bin
+ cp $(ALL) $(DEST_DIR)
package:
make clean
- tar -zcf dazz.db.tar.gz README Makefile *.h *.c
+ tar -zcf dazz.db.tar.gz README.md Makefile *.h *.c
diff --git a/QV.c b/QV.c
index 38f6db4..cdb6a63 100644
--- a/QV.c
+++ b/QV.c
@@ -1,40 +1,3 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/*******************************************************************************************
*
* Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on
@@ -775,6 +738,12 @@ static int Nline; // Referred by: QVcoding_Scan
char *QVentry()
{ return (Read); }
+void Set_QV_Line(int line)
+{ Nline = line; }
+
+int Get_QV_Line()
+{ return (Nline); }
+
// If nlines == 1 trying to read a single header, nlines = 5 trying to read 5 QV/fasta lines
// for a sequence. Place line j at Read+j*Rmax and the length of every line is returned
// unless eof occurs in which case return -1. If any error occurs return -2.
@@ -884,8 +853,9 @@ static void Unpack_Tag(char *tags, int clen, char *qvs, int rlen, int rchar)
*
********************************************************************************************/
- // Read .quiva file from input, recording stats in the histograms. If zero is set then
- // start the stats anew with this file.
+ // Read up to the next num entries or until eof from the .quiva file on input and record
+ // frequency statistics. Copy these entries to the temporary file temp if != NULL.
+ // If there is an error then -1 is returned, otherwise the number of entries read.
static uint64 delHist[256], insHist[256], mrgHist[256], subHist[256], delRun[256], subRun[256];
static uint64 totChar;
@@ -893,9 +863,10 @@ static int delChar, subChar;
// Referred by: QVcoding_Scan, Create_QVcoding
-int QVcoding_Scan(FILE *input)
+int QVcoding_Scan(FILE *input, int num, FILE *temp)
{ char *slash;
int rlen;
+ int i, r;
// Zero histograms
@@ -904,11 +875,8 @@ int QVcoding_Scan(FILE *input)
bzero(insHist,sizeof(uint64)*256);
bzero(subHist,sizeof(uint64)*256);
- { int i;
-
- for (i = 0; i < 256; i++)
- delRun[i] = subRun[i] = 1;
- }
+ for (i = 0; i < 256; i++)
+ delRun[i] = subRun[i] = 1;
totChar = 0;
delChar = -1;
@@ -917,37 +885,48 @@ int QVcoding_Scan(FILE *input)
// Make a sweep through the .quiva entries, histogramming the relevant things
// and figuring out the run chars for the deletion and substition streams
- Nline = 0;
- while (1)
+ r = 0;
+ for (i = 0; i < num; i++)
{ int well, beg, end, qv;
rlen = Read_Lines(input,1);
if (rlen == -2)
- EXIT(1);
+ EXIT(-1);
if (rlen < 0)
break;
if (rlen == 0 || Read[0] != '@')
- { EPRINTF(EPLACE,"Line %d: Header in quiv file is missing\n",Nline);
- EXIT(1);
+ { EPRINTF(EPLACE,"Line %d: Header in quiva file is missing\n",Nline);
+ EXIT(-1);
}
slash = index(Read+1,'/');
if (slash == NULL)
{ EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n",
Prog_Name,Nline);
- EXIT(1);
+ EXIT(-1);
}
if (sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) != 4)
{ EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n",
Prog_Name,Nline);
- EXIT(1);
+ EXIT(-1);
}
+ if (temp != NULL)
+ fputs(Read,temp);
+
rlen = Read_Lines(input,5);
if (rlen < 0)
{ if (rlen == -1)
EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline);
- EXIT(1);
+ EXIT(-1);
+ }
+
+ if (temp != NULL)
+ { fputs(Read,temp);
+ fputs(Read+Rmax,temp);
+ fputs(Read+2*Rmax,temp);
+ fputs(Read+3*Rmax,temp);
+ fputs(Read+4*Rmax,temp);
}
Histogram_Seqs(delHist,(uint8 *) (Read),rlen);
@@ -980,9 +959,11 @@ int QVcoding_Scan(FILE *input)
}
if (subChar >= 0)
Histogram_Runs( subRun,(uint8 *) (Read+4*Rmax),rlen,subChar);
+
+ r += 1;
}
- return (0);
+ return (r);
}
// Using the statistics in the global stat tables, create the Huffman schemes and write
@@ -1312,7 +1293,7 @@ int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy
if (rlen < 0)
{ if (rlen == -1)
EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline);
- EXIT (1);
+ EXIT (-1);
}
if (coding->delChar < 0)
@@ -1347,7 +1328,7 @@ int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy
Encode_Run(coding->subScheme, coding->sRunScheme, output,
(uint8 *) (Read+4*Rmax), rlen, coding->subChar);
- return (0);
+ return (rlen);
}
int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen)
diff --git a/QV.h b/QV.h
index 35fbadc..532b2f4 100644
--- a/QV.h
+++ b/QV.h
@@ -1,40 +1,3 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/*******************************************************************************************
*
* Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on
@@ -50,6 +13,8 @@
#ifndef _QV_COMPRESSOR
+#include <stdio.h>
+
#define _QV_COMPRESSOR
// The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or
@@ -83,10 +48,16 @@ typedef struct
int Read_Lines(FILE *input, int nlines);
char *QVentry();
- // Read the .quiva file on input and record frequency statistics. If there is an error
- // then 1 is returned, otherwise 0.
+ // Get and set the line counter for error reporting
+
+void Set_QV_Line(int line);
+int Get_QV_Line();
-int QVcoding_Scan(FILE *input);
+ // Read up to the next num entries or until eof from the .quiva file on input and record
+ // frequency statistics. Copy these entries to the temporary file temp if != NULL.
+ // If there is an error then -1 is returned, otherwise the number of entries read.
+
+int QVcoding_Scan(FILE *input, int num, FILE *temp);
// Given QVcoding_Scan has been called at least once, create an encoding scheme based on
// the accumulated statistics and return a pointer to it. The returned encoding object
@@ -108,8 +79,8 @@ void Free_QVcoding(QVcoding *coding);
// Assuming the file pointer is positioned just beyond an entry header line, read the
// next set of 5 QV lines, compress them according to 'coding', and output. If lossy
- // is set then the scheme is a lossy one. A non-zero value is return only if an
- // error occured.
+ // is set then the scheme is a lossy one. A negative value is returned if an error
+ // occurred, and the sequence length otherwise.
int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy);
diff --git a/README b/README
deleted file mode 100644
index e6308e8..0000000
--- a/README
+++ /dev/null
@@ -1,442 +0,0 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
-/************************************************************************************\
-
-UPGRADE & DEVELOPER NOTES ! ! !
-
- If you have already built a big database and don't want to rebuild it, but do want
-to use a more recent version of the software that entails a change to the data
-structures (currently the updates on Sept 25, 2014 and December 31, 2014), please note
-the routines DBupgrade.Sep.25.2014 and DBupgrade.Dec.31.2014. These take a DB, say X,
-as an argument, and produce a file .X.ndx which you should then replace .X.idx with.
-To update a very old DB to today's version you will need to run both in sequence.
-
- Both of the upgrade programs can be made with "make" but are not by default created
-when make is called without an argument.
-
- For those interested in the details, on September 25, the "beg" and "end" fields went
-from shorts to ints, and on December 31, the "beg" and "end" fields became "fpulse" and
-"rlen", respectively where fpulse = beg and rlen = end-beg.
-
- Unfortunately, the .dust track formats also changed on Dec.31.2014 and Jan.1.2015. To
-upgrade said use DUSTupgrade.Jan.1.2015. This program takes a DB, say X as an argument
-and produces .X.next.anno and .X.next.data which you should then replace .X.dust.* with.
-Of course, it may, if the DB is not too big, be easier and simply to just rerun DBdust.
-
- Developers should also note carefully that the calling conventions to Open_DB have
-changed and there are new utility routines Number_Digits and Check_Track.
-
-\************************************************************************************/
-
-
- The Dazzler Database Library
-
- Author: Gene Myers
- First: July 17, 2013
- Current: December 31, 2014
-
- To facilitate the multiple phases of the dazzler assembler, we organize all the read
-data into what is effectively a "database" of the reads and their meta-information.
-The design goals for this data base are as follows:
-
-(1) The database stores the source Pacbio read information in such a way that it can
- recreate the original input data, thus permitting a user to remove the
- (effectively redundant) source files. This avoids duplicating the same data,
- once in the source file and once in the database.
-
-(2) The data base can be built up incrementally, that is new sequence data can be added
- to the data base over time.
-
-(3) The data base flexibly allows one to store any meta-data desired for reads. This
- is accomplished with the concept of *tracks* that implementors can add as they
- need them.
-
-(4) The data is held in a compressed form equivalent to the .dexta and .dexqv files of
- the data extraction module. Both the .fasta and .quiva information for each
- read is held in the data base and can be recreated from it. The .quiva
- information can be added separately and later on if desired.
-
-(5) To facilitate job parallel, cluster operation of the phases of our assembler, the
- data base has a concept of a *current partitioning* in which all the reads that
- are over a given length and optionally unique to a well, are divided up into
- *blocks* containing roughly a given number of bases, except possibly the last
- block which may have a short count. Often programs con be run on blocks or
- pairs of blocks and each such job is reasonably well balanced as the blocks are
- all the same size. One must be careful about changing the partition during an
- assembly as doing so can void the structural validity of any interim
- block-based results.
-
- A Dazzler DB consists of one named, *visible* file, e.g. FOO.db, and several
-*invisible* secondary files encoding various elements of the DB. The secondary files
-are "invisible" to the UNIX OS in the sense that they begin with a "." and hence are
-not listed by "ls" unless one specifies the -a flag. We chose to do this so that when
-a user lists the contents of a directory they just see a single name, e.g. FOO.db, that
-is the one used to refer to the DB in commands. The files associated with a database
-named, say FOO, are as follows:
-
-(a) "FOO.db": a text file containing
- (i) the list of input files added to the database so far, and
- (ii) how to partition the database into blocks (if the partition
- parameters have been set).
-
-(b) ".FOO.idx": a binary "index" of all the meta-data about each read allowing, for
- example, one to randomly access a read's sequence (in the store
- ".FOO.bps"). It is 28N + 88 bytes in size where N is the number of
- reads in the database.
-
-(c) ".FOO.bps": a binary compressed "store" of all the DNA sequences. It is M/4 bytes
- in size where M is the total number of base pairs in the database.
-
-(d) ".FOO.qvs": a binary compressed "store" of the 5 Pacbio quality value streams for
- the reads. Its size is roughly 5/3M bytes depending on the
- compression acheived. This file only exists if .quiva files have
- been added to the database.
-
-(e) ".FOO.<track>.anno": a *track* containing customized meta-data for each read. For
- ".FOO.<track>.data" example, the DBdust command annotates low complexity intervals
- of reads and records the intervals for each read in two files
- .FOO.dust.anno & .FOO.dust.data. Any kind of information
- about a read can be recorded, such as micro-sats, repeat
- intervals, corrected sequence, etc. Specific tracks will be
- described as modules that produce them are released.
-
-If one does not like the convention of the secondary files being invisible, then
-un-defining the constant HIDE_FILES in DB.h before compiling the library, creates
-commands that do not place a prefixing "." before secondary file names, e.g. FOO.idx
-instead of .FOO.idx. One then sees all the files realizing a DB when listing the
-contents of a directory with ls.
-
- While a Dazzler DB holds a collection of Pacbio reads, a Dazzler map DB or DAM holds
-a collection of contigs from a reference genome assembly. This special type of DB has
-been introduced in order to facilitate the mapping of reads to an assembly and has
-been given the suffix .dam to distinguish it from an ordinary DB. It is structurally
-identical to a .db except:
-
-(a) there is no concept of quality values, and hence no .FOO.qvs file.
-
-(b) every .fasta scaffold (a sequence with runs of N's between contigs estimating the
- length of the gap) is broken into a separate contig sequence in the DB and the
- header for each scaffold is retained in a new .FOO.hdr file.
-
-(c) the original and first and last pulse fields in the meta-data records held in
- .FOO.idx, hold instead the contig number and the interval of the contig within
- its original scaffold sequence.
-
-A map DB can equally well be the argument of many of the commands below that operate
-on normal DBs. In general, a .dam can be an argument anywhere a .db can, with the
-exception of routines or optioned calls to routines that involve quality values, or
-the special routines fasta2DAM and DAM2fasta that create a DAM and reverse said,
-just like the pair fasta2DB and DB2fasta do for a normal DB. So in general when we
-refer to a database we are referring to either a DB or a DAM.
-
- The command DBsplit sets or resets the current partition for a database which is
-determined by 3 parameters: (i) the total number of basepairs to place in each block,
-(ii) the minimum read length of reads to include within a block, and (iii) whether or
-not to only include the longest read from a given well or all reads from a well (NB:
-several reads of the same insert in a given well can be produced by the Pacbio
-instrument). Note that the length and uniqueness parameters effectively select a
-subset of the reads that contribute to the size of a block. We call this subset the
-*trimmed* data base. Some commands operate on the entire database, others on the
-trimmed database, and yet others have an option flag that permits them to operate on
-either at the users discretion. Therefore, one should note carefully to which version
-of the database a command refers to. This is especially important for any command that
-identifies reads by their index (ordinal position) in the database.
-
-Once the database has been split into blocks, the commands DBshow, DBstats, and DBdust
-below and commands yet to come, such as the local alignment finder dalign, can take a
-block or blocks as arguments. On the command line this is indicated by supplying the
-name of the DB followed by a period and then a block number, e.g. FOO.3.db or simply
-FOO.3, refers to the 3'rd block of DB FOO (assuming of course it has a current
-partition and said partition has a 3rd block). One should note carefully that a block
-is a contiguous range of reads such that once it is trimmed has a given size in base
-pairs (as set by DBsplit). Thus like an entire database, a block can be either
-untrimmed or trimmed and one needs to again be careful when giving a read index to
-a command such as DBshow.
-
-All programs add suffixes (e.g. .db) as needed. The commands of the database library
-are currently as follows:
-
-1. fasta2DB [-v] <path:db> ( -f<file> | <input:fasta> ... )
-
-Builds an initial data base, or adds to an existing database, the list of .fasta files
-following the database name argument, or if the -f option is used, the list of .fasta
-files in <file>. A given .fasta file can only be added once to the DB (this is checked
-by the command). The .fasta headers must be in the "Pacbio" format (i.e. the output
-of the Pacbio tools or our dextract program) and the well, pulse interval, and read
-quality are extracted from the header and kept with each read record. If the files
-are being added to an existing database, and the partition settings of the DB have
-already been set (see DBsplit below), then the partitioning of the database is updated
-to include the new data.
-
-2. DB2fasta [-vU] [-w<int(80)>] <path:db>
-
-The set of .fasta files for the given DB are recreated from the DB exactly as they were
-input. That is, this is a perfect inversion, including the reconstitution of the
-proper .fasta headers. Because of this property, one can, if desired, delete the
-.fasta source files once they are in the DB as they can always be recreated from it.
-By default the output sequences are in lower case and 80 chars per line. The -U option
-specifies upper case should be used, and the characters per line, or line width, can be
-set to any positive value with the -w option.
-
-3. quiva2DB [-vl] <path:db> ( -f<file> | <input:quiva> ... )
-
-Adds the given .quiva files on the command line or in the file specified by the
--f option to an existing DB "path". The input files must be added in the same order
-as the .fasta files were and have the same root names, e.g. FOO.fasta and FOO.quiva.
-The files can be added incrementally but must be added in the same order as the .fasta
-files. This is enforced by the program. With the -l option set the compression
-scheme is a bit lossy to get more compression (see the description of dexqv in the
-DEXTRACTOR module).
-
-4. DB2quiva [-vU] <path:db>
-
-The set of .quiva files within the given DB are recreated from the DB exactly as they
-were input. That is, this is a perfect inversion, including the reconstitution of the
-proper .quiva headers. Because of this property, one can, if desired, delete the
-.quiva source files once they are in the DB as they can always be recreated from it.
-By .fastq convention each QV vector is output as a line without new-lines, and by
-default the Deletion Tag entry is in lower case letters. The -U option specifies
-upper case letters should be used instead.
-
-5. fasta2DAM [-v] <path:dam> ( -f<file> | <input:fasta> ... )
-
-Builds a map DB or DAM from the list of .fasta files following the map database name
-argument, or if the -f option is used, the list of .fasta files in <file>. Any .fasta
-entry that has a run of N's in it will be split into separate "contig" entries and
-the interval of the contig in the original entry recorded. The header for each .fasta
-entry is saved with the contigs created from it.
-
-6. DAM2fasta [-vU] [-w<int(80)>] <path:dam>
-
-The set of .fasta files for the given map DB or DAM are recreated from the DAM
-exactly as they were input. That is, this is a perfect inversion, including the
-reconstitution of the proper .fasta headers and the concatenation of contigs with
-the proper number of N's between them. By default the output sequences are in lower
-case and 80 chars per line. The -U option specifies upper case should be used, and
-the characters per line, or line width, can be set to any positive value with
-the -w option.
-
-7. DBsplit [-a] [-x<int>] [-s<int(200)>] <path:db|dam>
-
-Divide the database <path>.db or <path>.dam conceptually into a series of blocks
-referable to on the command line as <path>.1, <path>.2, ... If the -x option is set
-then all reads less than the given length are ignored, and if the -a option is not
-set then secondary reads from a given well are also ignored. The remaining reads,
-constituting what we call the trimmed DB, are split amongst the blocks so that each
-block is of size -s * 1Mbp except for the last which necessarily contains a smaller
-residual. The default value for -s is 200Mbp because blocks of this size can be
-compared by our "overlapper" dalign in roughly 16Gb of memory. The blocks are very
-space efficient in that their sub-index of the master .idx is computed on the fly
-when loaded, and the .bps and .qvs files (if a .db) of base pairs and quality values,
-respectively, is shared with the master DB. Any relevant portions of tracks
-associated with the DB are also computed on the fly when loading a database block.
-
-8. DBdust [-b] [-w<int(64)>] [-t<double(2.)>] [-m<int(10)>] <path:db|dam>
-
-Runs the symmetric DUST algorithm over the reads in the untrimmed DB <path>.db or
-<path>.dam producing a track .<path>.dust[.anno,.data] that marks all intervals of low
-complexity sequence, where the scan window is of size -w, the threshold for being a
-low-complexity interval is -t, and only perfect intervals of size greater than -m are
-recorded. If the -b option is set then the definition of low complexity takes into
-account the frequency of a given base. The command is incremental if given a DB to
-which new data has been added since it was last run on the DB, then it will extend
-the track to include the new reads. It is important to set this flag for genomes with
-a strong AT/GC bias, albeit the code is a tad slower. The dust track, if present,
-is understood and used by DBshow, DBstats, and dalign.
-
-DBdust can also be run over an untriimmed DB block in which case it outputs a track
-encoding where the trace file names contain the block number, e.g. .FOO.3.dust.anno
-and .FOO.3.dust.data, given FOO.3 on the command line. We call this a *block track*.
-This permits job parallelism in block-sized chunks, and the resulting sequence of
-block tracks can then be merged into a track for the entire untrimmed DB with Catrack.
-
-9. Catrack [-v] <path:db|dam> <track:name>
-
-Find all block tracks of the form .<path>.#.<track>... and merge them into a single
-track, .<path>.<track>..., for the given DB or DAM. The block track files must all
-encode the same kind of track data (this is checked), and the files must exist for
-block 1, 2, 3, ... up to the last block number.
-
-10. DBshow [-unqUQ] [-w<int(80)>] [-m<track>]+
- <path:db|dam> [ <reads:FILE> | <reads:range> ... ]
-
-Displays the requested reads in the database <path>.db or <path>.dam. By default the
-command applies to the trimmed database, but if -u is set then the entire DB is used.
-If no read arguments are given then every read in the database or database block is
-displayed. Otherwise the input file or the list of supplied integer ranges give the
-ordinal positions in the actively loaded portion of the db. In the case of a file, it
-should simply contain a read index, one per line. In the other case, a read range is
-either a lone integer or the symbol $, in which case the read range consists of just
-that read (the last read in the database if $). One may also give two positive
-integers separated by a dash to indicate a range of integers, where again a $
-represents the index of the last read in the actively loaded db. For example,
-1 3-5 $ displays reads 1, 3, 4, 5, and the last read in the active db. As another
-example, 1-$ displays every read in the active db (the default).
-
-By default a .fasta file of the read sequences is displayed. If the -q option is
-set, then the QV streams are also displayed in a non-standard modification of the
-fasta format. If the -n option is set then the DNA sequence is *not* displayed.
-If the -Q option is set then a .quiva file is displayed and in this case the -n
-and -m options mayt not be set (and the -q and -w options have no effect).
-
-If one or more masks are set with the -m option then the track intervals are also
-displayed in an additional header line and the bases within an interval are displayed
-in the case opposite that used for all the other bases. By default the output
-sequences are in lower case and 80 chars per line. The -U option specifies upper
-case should be used, and the characters per line, or line width, can be set to any
-positive value with the -w option.
-
-The .fasta or .quiva files that are output can be converted into a DB by fasta2DB
-and quiva2DB (if the -q and -n options are not set and no -m options are set),
-giving one a simple way to make a DB of a subset of the reads for testing purposes.
-
-11. DBstats [-nu] [-b<int(1000)] [-m<track>]+ <path:db|dam>
-
-Show overview statistics for all the reads in the trimmed data base <path>.db or
-<path>.dam, including a histogram of read lengths where the bucket size is set
-with the -b option (default 1000). If the -u option is given then the untrimmed
-database is summarized. If the -n option is given then the histogran of read lengths
-is not displayed. Any track such as a "dust" track that gives a seried of
-intervals along the read can be specified with the -m option in which case a summary
-and a histogram of the interval lengths is displayed.
-
-12. DBrm <path:db|dam> ...
-
-Delete all the files for the given data bases. Do not use rm to remove a database, as
-there are at least two and often several secondary files for each DB including track
-files, and all of these are removed by DBrm.
-
-13. simulator <genlen:double> [-c<double(20.)>] [-b<double(.5)] [-r<int>]
- [-m<int(10000)>] [-s<int(2000)>]
- [-x<int(4000)>] [-e<double(.15)>]
- [-M<file>]
-
-In addition to the DB commands we include here, somewhat tangentially, a simple
-simulator that generates synthetic reads for a random genome. simulator first
-generates a fake genome of size genlen*1Mb long, that has an AT-bias of -b. It then
-generates sample reads of mean length -m from a log-normal length distribution with
-standard deviation -s, but ignores reads of length less than -x. It collects enough
-reads to cover the genome -c times and introduces -e fraction errors into each read
-where the ratio of insertions, deletions, and substitutions are set by defined
-constants INS_RATE (default 73%) and DEL_RATE (default 20%) within generate.c. One
-can also control the rate at which reads are picked from the forward and reverse
-strands by setting the defined constant FLIP_RATE (default 50/50). The -r option seeds
-the random number generator for the generation of the genome so that one can
-reproducibly generate the same underlying genome to sample from. If this parameter is
-missing, then the job id of the invocation seeds the random number generator. The
-output is sent to the standard output (i.e. it is a UNIX pipe). The output is in
-Pacbio .fasta format suitable as input to fasta2DB. Finally, the -M option requests
-that the coordinates from which each read has been sampled are written to the indicated
-file, one line per read, ASCII encoded. This "map" file essentially tells one where
-every read belongs in an assembly and is very useful for debugging and testing
-purposes. If a read pair is say b,e then if b < e the read was sampled from [b,e] in
-the forward direction, and if b > e from [e,b] in the reverse direction.
-
-Example:
-
- A small complete example of most of the commands above.
-
-> simulator 1.0 -c20. >G.fasta // Generate a 20x data sets of a 1Mb genome
-> fasta2DB G G.fasta // Create a compressed data base of the reads, G.db
-> rm G.fasta // Redundant, recreate any time with "DB2fasta G"
-> DBsplit -s11 G // Split G into 2 parts of size ~ 11MB each
-> DBdust G.1 // Produce a "dust" track on each part
-> DBdust G.2
-> Catrack G dust // Create one track for all of the DB
-> rm .G.*.dust.* // Clean up the sub-tracks
-> DBstats -mdust G // Take a look at the statistics for the database
-
-Statistics for all reads in the data set
-
- 1,836 reads out of 1,836 (100.0%)
- 20,007,090 base pairs out of 20,007,090 (100.0%)
-
- 10,897 average read length
- 2,192 standard deviation
-
- Base composition: 0.250(A) 0.250(C) 0.250(G) 0.250(T)
-
- Distribution of Read Lengths (Bin size = 1,000)
-
- Bin: Count % Reads % Bases Average
- 22,000: 1 0.1 0.1 22654
- 21,000: 0 0.1 0.1 22654
- 20,000: 1 0.1 0.2 21355
- 19,000: 0 0.1 0.2 21355
- 18,000: 4 0.3 0.6 19489
- 17,000: 8 0.8 1.3 18374
- 16,000: 19 1.8 2.8 17231
- 15,000: 43 4.1 6.2 16253
- 14,000: 81 8.6 12.0 15341
- 13,000: 146 16.5 21.9 14428
- 12,000: 200 27.4 34.4 13664
- 11,000: 315 44.6 52.4 12824
- 10,000: 357 64.0 71.2 12126
- 9,000: 306 80.7 85.8 11586
- 8,000: 211 92.2 94.8 11208
- 7,000: 95 97.3 98.4 11017
- 6,000: 43 99.7 99.8 10914
- 5,000: 6 100.0 100.0 10897
-
-
-Statistics for dust-track
-
- There are 158 intervals totaling 1,820 bases (0.0% of all data)
-
- Distribution of dust intervals (Bin size = 1,000)
-
- Bin: Count % Intervals % Bases Average
- 0: 158 100.0 100.0 11
-
-> ls -al
-total 66518744
-drwxr-xr-x+ 177 myersg staff 6018 Mar 2 13:28 .
-drwxr-xr-x+ 20 myersg staff 680 Feb 26 19:52 ..
--rw-r--r--+ 1 myersg staff 5002464 Mar 2 13:28 .G.bps
--rw-r--r--+ 1 myersg staff 14704 Mar 2 13:28 .G.dust.anno
--rw-r--r--+ 1 myersg staff 1264 Mar 2 13:28 .G.dust.data
--rw-r--r--+ 1 myersg staff 73552 Mar 2 13:28 .G.idx
--rw-r--r--+ 1 myersg staff 162 Mar 2 13:28 G.db
-> cat G.db
-files = 1
- 1836 G Sim
-blocks = 2
-size = 11 cutoff = 0 all = 0
- 0 0
- 1011 1011
- 1836 1836
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..515de32
--- /dev/null
+++ b/README.md
@@ -0,0 +1,501 @@
+# The Dazzler Database Library
+
+## _Author: Gene Myers_
+## _First: July 17, 2013_
+
+For typeset documentation, examples of use, and design philosophy please go to
+my [blog](https://dazzlerblog.wordpress.com/command-guides/dazz_db-command-guide).
+
+To facilitate the multiple phases of the dazzler assembler, we organize all the read
+data into what is effectively a "database" of the reads and their meta-information.
+The design goals for this data base are as follows:
+
+1. The database stores the source Pacbio read information in such a way that it can
+recreate the original input data, thus permitting a user to remove the
+(effectively redundant) source files. This avoids duplicating the same data,
+once in the source file and once in the database.
+
+2. The data base can be built up incrementally, that is new sequence data can be added
+to the data base over time.
+
+3. The data base flexibly allows one to store any meta-data desired for reads. This
+is accomplished with the concept of *tracks* that implementors can add as they
+need them.
+
+4. The data is held in a compressed form equivalent to the .dexta and .dexqv files of
+the data extraction module. Both the .fasta and .quiva information for each
+read is held in the data base and can be recreated from it. The .quiva
+information can be added separately and later on if desired.
+
+5. To facilitate job parallel, cluster operation of the phases of our assembler, the
+data base has a concept of a *current partitioning* in which all the reads that
+are over a given length and optionally unique to a well, are divided up into
+*blocks* containing roughly a given number of bases, except possibly the last
+block which may have a short count. Often programs con be run on blocks or
+pairs of blocks and each such job is reasonably well balanced as the blocks are
+all the same size. One must be careful about changing the partition during an
+assembly as doing so can void the structural validity of any interim
+block-based results.
+
+A Dazzler DB consists of one named, *visible* file, e.g. FOO.db, and several
+*invisible* secondary files encoding various elements of the DB. The secondary files
+are "invisible" to the UNIX OS in the sense that they begin with a "." and hence are
+not listed by "ls" unless one specifies the -a flag. We chose to do this so that when
+a user lists the contents of a directory they just see a single name, e.g. FOO.db, that
+is the one used to refer to the DB in commands. The files associated with a database
+named, say FOO, are as follows:
+
+* "FOO.db": a text file containing
+ 1. the list of input files added to the database so far, and
+ 2. how to partition the database into blocks (if the partition
+ parameters have been set).
+
+* ".FOO.idx": a binary "index" of all the meta-data about each read allowing, for
+ example, one to randomly access a read's sequence (in the store
+ ".FOO.bps"). It is 28N + 88 bytes in size where N is the number of
+ reads in the database.
+
+* ".FOO.bps": a binary compressed "store" of all the DNA sequences. It is M/4 bytes
+ in size where M is the total number of base pairs in the database.
+
+* ".FOO.qvs": a binary compressed "store" of the 5 Pacbio quality value streams for
+ the reads. Its size is roughly 5/3M bytes depending on the
+ compression acheived. This file only exists if .quiva files have
+ been added to the database.
+
+* ".FOO.\<track\>.[anno,data]": a *track* containing customized meta-data for each read. For
+ example, the DBdust command annotates low complexity intervals
+ of reads and records the intervals for each read in two files
+ .FOO.dust.anno & .FOO.dust.data. Any kind of information
+ about a read can be recorded, such as micro-sats, repeat
+ intervals, corrected sequence, etc. Specific tracks will be
+ described as modules that produce them are released.
+
+If one does not like the convention of the secondary files being invisible, then
+un-defining the constant HIDE_FILES in DB.h before compiling the library, creates
+commands that do not place a prefixing "." before secondary file names, e.g. FOO.idx
+instead of .FOO.idx. One then sees all the files realizing a DB when listing the
+contents of a directory with ls.
+
+While a Dazzler DB holds a collection of Pacbio reads, a Dazzler map DB or DAM holds
+a collection of contigs from a reference genome assembly. This special type of DB has
+been introduced in order to facilitate the mapping of reads to an assembly and has
+been given the suffix .dam to distinguish it from an ordinary DB. It is structurally
+identical to a .db except:
+
+* there is no concept of quality values, and hence no .FOO.qvs file.
+
+* every .fasta scaffold (a sequence with runs of N's between contigs estimating the
+ length of the gap) is broken into a separate contig sequence in the DB and the
+ header for each scaffold is retained in a new .FOO.hdr file.
+
+* the original and first and last pulse fields in the meta-data records held in
+ .FOO.idx, hold instead the contig number and the interval of the contig within
+ its original scaffold sequence.
+
+A map DB can equally well be the argument of many of the commands below that operate
+on normal DBs. In general, a .dam can be an argument anywhere a .db can, with the
+exception of routines or optioned calls to routines that involve quality values, or
+the special routines fasta2DAM and DAM2fasta that create a DAM and reverse said,
+just like the pair fasta2DB and DB2fasta do for a normal DB. So in general when we
+refer to a database we are referring to either a DB or a DAM.
+
+The command DBsplit sets or resets the current partition for a database which is
+determined by 3 parameters: (i) the total number of basepairs to place in each block,
+(ii) the minimum read length of reads to include within a block, and (iii) whether or
+not to only include the longest read from a given well or all reads from a well (NB:
+several reads of the same insert in a given well can be produced by the Pacbio
+instrument). Note that the length and uniqueness parameters effectively select a
+subset of the reads that contribute to the size of a block. We call this subset the
+*trimmed* data base. Some commands operate on the entire database, others on the
+trimmed database, and yet others have an option flag that permits them to operate on
+either at the users discretion. Therefore, one should note carefully to which version
+of the database a command refers to. This is especially important for any command that
+identifies reads by their index (ordinal position) in the database.
+
+Once the database has been split into blocks, the commands DBshow, DBstats, and DBdust
+below and commands yet to come, such as the local alignment finder dalign, can take a
+block or blocks as arguments. On the command line this is indicated by supplying the
+name of the DB followed by a period and then a block number, e.g. FOO.3.db or simply
+FOO.3, refers to the 3'rd block of DB FOO (assuming of course it has a current
+partition and said partition has a 3rd block). One should note carefully that a block
+is a contiguous range of reads such that once it is trimmed has a given size in base
+pairs (as set by DBsplit). Thus like an entire database, a block can be either
+untrimmed or trimmed and one needs to again be careful when giving a read index to
+a command such as DBshow.
+
+All programs add suffixes (e.g. .db) as needed. The commands of the database library
+are currently as follows:
+
+```
+1. fasta2DB [-v] <path:db> ( -f<file> | -i[<name>] | <input:fasta> ... )
+```
+
+Builds an initial data base, or adds to an existing database, either (a) the list of
+.fasta files following the database name argument, or (b) the list of .fasta files in
+\<file\> if the -f option is used, or (c) entries piped from the standard input if the
+-i option is used. If a faux file name, \<name\>, follows the -i option then all the
+input received is considered to have come from a file by the name of \<name\>.fasta by
+DB2fasta, otherwise it will be sent to the standard output by DB2fasta. The SMRT cells
+in a given named input (i.e. all sources other than -i without a name) can only be
+added consecutively to the DB (this is checked by the command). The .fasta headers must
+be in the "Pacbio" format (i.e. the output of the Pacbio tools or our dextract program)
+and the well, pulse interval, and read quality are extracted from the header and kept
+with each read record. If the files are being added to an existing database, and the
+partition settings of the DB have already been set (see DBsplit below), then the
+partitioning of the database is updated to include the new data. A file may contain
+the data from multiple SMRT cells provided the reads for each SMRT cell are consecutive
+in the file.
+
+```
+2. DB2fasta [-vU] [-w<int(80)>] <path:db>
+```
+
+The set of .fasta files for the given DB are recreated from the DB exactly as they were
+input. That is, this is a perfect inversion, including the reconstitution of the
+proper .fasta headers. Because of this property, one can, if desired, delete the
+.fasta source files once they are in the DB as they can always be recreated from it.
+Entries imported from the standard input will be place in the faux file name given on
+import, or to the standard output if no name was given.
+By default the output sequences are in lower case and 80 chars per line. The -U option
+specifies upper case should be used, and the characters per line, or line width, can be
+set to any positive value with the -w option.
+
+```
+3. quiva2DB [-vl] <path:db> ( -f<file> | -i | <input:quiva> ... )
+```
+
+Adds .quiva streams to an existing DB "path". The data comes from (a) the given .quiva
+files on the command line, or (b) those in the file specified by the -f option, or
+(c) the standard input if the -i option is given. The input files must be added in the
+same order as the .fasta files were and have the same root names, e.g. FOO.fasta and
+FOO.quiva. The files can be added incrementally but must be added in the same order as
+their corresponding .fasta files. This is enforced by the program. With the -l option
+set the compression scheme is a bit lossy to get more compression (see the description
+of dexqv in the DEXTRACTOR module here).
+
+```
+4. DB2quiva [-vU] <path:db>
+```
+
+The set of .quiva files within the given DB are recreated from the DB exactly as they
+were input. That is, this is a perfect inversion, including the reconstitution of the
+proper .quiva headers. Because of this property, one can, if desired, delete the
+.quiva source files once they are in the DB as they can always be recreated from it.
+Entries imported from the standard input will be place in the faux file name given on
+import, or to the standard output if no name was given.
+By .fastq convention each QV vector is output as a line without new-lines, and by
+default the Deletion Tag entry is in lower case letters. The -U option specifies
+upper case letters should be used instead.
+
+```
+5. fasta2DAM [-v] <path:dam> ( -f<file> | -i[<name>] | <input:fasta> ... )
+```
+
+Builds an initial map DB or DAM, or adds to an existing DAM, either (a) the list of
+.fasta files following the database name argument, or (b) the list of .fasta files in
+\<file\> if the -f option is used, or (c) entries piped from the standard input if the -i
+option is used. If a faux file name, \<name\>, follows the -i option then all the input
+received is considered to have come from a file by the name of \<name\>.fasta by
+DB2fasta, otherwise it will be sent to the standard output by DB2fasta. Any .fasta
+entry that has a run of N's in it will be split into separate "contig" entries and the
+interval of the contig in the original entry recorded. The header for each .fasta entry
+is saved with the contigs created from it.
+
+```
+6. DAM2fasta [-vU] [-w<int(80)>] <path:dam>
+```
+
+The set of .fasta files for the given map DB or DAM are recreated from the DAM
+exactly as they were input. That is, this is a perfect inversion, including the
+reconstitution of the proper .fasta headers and the concatenation of contigs with
+the proper number of N's between them to recreate scaffolds.
+Entries imported from the standard input will be place in the faux file name given on
+import, or to the standard output if no name was given. By default the output
+sequences are in lower case and 80 chars per line. The -U option specifies upper case
+should be used, and the characters per line, or line width, can be set to any positive
+value with the -w option.
+
+```
+7. DBsplit [-a] [-x<int>] [-s<int(200)>] <path:db|dam>
+```
+
+Divide the database \<path\>.db or \<path\>.dam conceptually into a series of blocks
+referable to on the command line as \<path\>.1, \<path\>.2, ... If the -x option is set
+then all reads less than the given length are ignored, and if the -a option is not
+set then secondary reads from a given well are also ignored. The remaining reads,
+constituting what we call the trimmed DB, are split amongst the blocks so that each
+block is of size -s * 1Mbp except for the last which necessarily contains a smaller
+residual. The default value for -s is 200Mbp because blocks of this size can be
+compared by our "overlapper" dalign in roughly 16Gb of memory. The blocks are very
+space efficient in that their sub-index of the master .idx is computed on the fly
+when loaded, and the .bps and .qvs files (if a .db) of base pairs and quality values,
+respectively, is shared with the master DB. Any relevant portions of tracks
+associated with the DB are also computed on the fly when loading a database block.
+If the -f option is set, the split is forced regardless of whether or not the DB in
+question has previously bin split, i.e. one is not interactively asked if they wish
+to proceed.
+
+```
+8. DBdust [-b] [-w<int(64)>] [-t<double(2.)>] [-m<int(10)>] <path:db|dam>
+```
+
+Runs the symmetric DUST algorithm over the reads in the untrimmed DB \<path\>.db or
+\<path\>.dam producing a track .\<path\>.dust[.anno,.data] that marks all intervals of low
+complexity sequence, where the scan window is of size -w, the threshold for being a
+low-complexity interval is -t, and only perfect intervals of size greater than -m are
+recorded. If the -b option is set then the definition of low complexity takes into
+account the frequency of a given base. The command is incremental if given a DB to
+which new data has been added since it was last run on the DB, then it will extend
+the track to include the new reads. It is important to set this flag for genomes with
+a strong AT/GC bias, albeit the code is a tad slower. The dust track, if present,
+is understood and used by DBshow, DBstats, and dalign.
+
+DBdust can also be run over an untriimmed DB block in which case it outputs a track
+encoding where the trace file names contain the block number, e.g. .FOO.3.dust.anno
+and .FOO.3.dust.data, given FOO.3 on the command line. We call this a *block track*.
+This permits job parallelism in block-sized chunks, and the resulting sequence of
+block tracks can then be merged into a track for the entire untrimmed DB with Catrack.
+
+```
+9. Catrack [-v] <path:db|dam> <track:name>
+```
+
+Find all block tracks of the form .\<path\>.#.\<track\>... and merge them into a single
+track, .\<path\>.\<track\>..., for the given DB or DAM. The block track files must all
+encode the same kind of track data (this is checked), and the files must exist for
+block 1, 2, 3, ... up to the last block number.
+
+```
+10. DBshow [-unqUQ] [-w<int(80)>] [-m<track>]+
+ <path:db|dam> [ <reads:FILE> | <reads:range> ... ]
+```
+
+Displays the requested reads in the database \<path\>.db or \<path\>.dam. By default the
+command applies to the trimmed database, but if -u is set then the entire DB is used.
+If no read arguments are given then every read in the database or database block is
+displayed. Otherwise the input file or the list of supplied integer ranges give the
+ordinal positions in the actively loaded portion of the db. In the case of a file, it
+should simply contain a read index, one per line. In the other case, a read range is
+either a lone integer or the symbol $, in which case the read range consists of just
+that read (the last read in the database if $). One may also give two positive
+integers separated by a dash to indicate a range of integers, where again a $
+represents the index of the last read in the actively loaded db. For example,
+1 3-5 $ displays reads 1, 3, 4, 5, and the last read in the active db. As another
+example, 1-$ displays every read in the active db (the default).
+
+By default a .fasta file of the read sequences is displayed. If the -q option is
+set, then the QV streams are also displayed in a non-standard modification of the
+fasta format. If the -n option is set then the DNA sequence is *not* displayed.
+If the -Q option is set then a .quiva file is displayed and in this case the -n
+and -m options mayt not be set (and the -q and -w options have no effect).
+
+If one or more masks are set with the -m option then the track intervals are also
+displayed in an additional header line and the bases within an interval are displayed
+in the case opposite that used for all the other bases. By default the output
+sequences are in lower case and 80 chars per line. The -U option specifies upper
+case should be used, and the characters per line, or line width, can be set to any
+positive value with the -w option.
+
+The .fasta or .quiva files that are output can be converted into a DB by fasta2DB
+and quiva2DB (if the -q and -n options are not set and no -m options are set),
+giving one a simple way to make a DB of a subset of the reads for testing purposes.
+
+```
+12. DBdump [-rhsiqp] [-uU] [-m<track>]+
+ <path:db|dam> [ <reads:FILE> | <reads:range> ... ]
+```
+
+Like DBshow, DBdump allows one to display a subset of the reads in the DB and select
+which information to show about them including any mask tracks. The difference is
+that the information is written in a very simple "1-code" ASCII format that makes it
+easy for one to read and parse the information for further use. -r requests that each
+read number be displayed (useful if only a subset of reads is requested). -h prints
+the header information which is the source file name, well #, and pulse range.
+-s requests the sequence be output, -i requests that the intrinsic quality values be
+output, -q requests that the 5 quiva sequences be output, -p requests the repeat
+profile be output (if available), and -m\<track\> requests that mask \<track\> be output.
+Set -u if you want data from the untrimmed database (the default is trimmed) and
+set -U if you'd like upper-case letter used in the DNA sequence strings.
+
+The format is very simple. Each requested piece of information occurs on a line. The
+first character of every line is a "1-code" character that tells you what information
+to expect on the line. The rest of the line contains information where each item is
+separated by a single blank space. Strings are output as first an integer giving the
+length of the string, a blank space, and then the string terminated by a new-line.
+Intrinsic quality values are between 0 and 50, inclusive, and a vector of said are
+displayed as an alphabetic string where 'a' is 0, 'b' is '1', ... 'z' is 25, 'A' is
+26, 'B' is 27, ... and 'Y' is 50. Repeat profiles are also displayed as string where
+'_' denotes 0 repetitions, and then 'a' through 'N' denote the values 1 through 40,
+respectively.
+
+```
+ R # - read number
+ H # string - original file name string (header)
+ L # # # - location: well, pulse start, pulse end
+ Tx #n (#b #e)^#n - x'th track on command line, #n intervals all on same line
+ S # string - sequence string
+ I # string - intrinsic quality vector (as an ASCII string)
+ P # string - repeat profile vector (as an ASCII string)
+ d # string - Quiva deletion values (as an ASCII string)
+ c # string - Quiva deletion character string
+ i # string - Quiva insertion value string
+ m # string - Quiva merge value string
+ s # string - Quiva substitution value string
+ + X # - Total amount of X (X = H or S or I or P or R or M or T#)
+ @ X # - Maximum amount of X (X = H or S or I or P or T#)
+```
+
+1-code lines that begin with + or @ are always the first lines in the output. They
+give size information about what is contained in the output. That is '+ X #' gives
+the number of reads (X=R), the number of masks (X=M), or the total number of
+characters in all headers (X=H), sequences (X=S), intrinsic quality vectors (X=I),
+read profile vector (X=P), or track (X=T#). And '@ X #' gives the maximum number of
+characters in any header (X=H), sequence (X=S), intrincic quality vector (X=I), read
+profile vector (X=P), or track (X=T#). The size numbers for the Quiva strings are
+identical to that for the sequence as they are all of the same length for any
+given entry.
+
+```
+12. DBstats [-nu] [-b<int(1000)] [-m<track>]+ <path:db|dam>
+```
+
+Show overview statistics for all the reads in the trimmed data base \<path\>.db or
+\<path\>.dam, including a histogram of read lengths where the bucket size is set
+with the -b option (default 1000). If the -u option is given then the untrimmed
+database is summarized. If the -n option is given then the histogran of read lengths
+is not displayed. Any track such as a "dust" track that gives a series of
+intervals along the read can be specified with the -m option in which case a summary
+and a histogram of the interval lengths is displayed.
+
+```
+13. DBrm <path:db|dam> ...
+```
+
+Delete all the files for the given data bases. Do not use rm to remove a database, as
+there are at least two and often several secondary files for each DB including track
+files, and all of these are removed by DBrm.
+
+```
+14. simulator <genome:dam> [-CU] [-m<int(10000)>] [-s<int(2000)>] [-e<double(.15)]
+ [-c<double(50.)>] [-f<double(.5)>] [-x<int(4000)>]
+ [-w<int(80)>] [-r<int>] [-M<file>]
+```
+
+In addition to the DB commands we include here, somewhat tangentially, a simple
+simulator that generates synthetic reads over a given genome reference contained in a
+supplied .dam DB. The simulator first reconstitutes the scaffolds of the reference
+genome and fills in their gaps (a run of N's in .fasta format indicating the estimate
+gap length) with a random sequence that follows the base distribution of the contigs.
+It will then sample reads from these scaffold sequences.
+
+The simulator generates sample reads of mean length -m from a log-normal length
+distribution with standard deviation -s, but ignores reads of length less than -x. It
+collects enough reads to cover the genome -c times and Introduces -e fraction errors
+into each read where the ratio of insertions, deletions, and substitutions are set by
+defined constants INS_RATE (default 73%) and DEL_RATE (default 20%) within generate.c.
+One can control the rate at which reads are picked from the forward and reverse
+strands with the -f option. The -r option seeds the random number generator for the
+generation process so that one can reproducibly generate the same dataset. If this
+parameter is missing, then the job id of the invocation seeds the random number
+generator effectively guaranteeing a different sampling with each invocation.
+
+The output is sent to the standard output (i.e. it is a UNIX pipe). The output is in
+Pacbio .fasta format suitable as input to fasta2DB. Uppercase letters are used if the
+-U option is given, and the width of each line can be controlled with the -w option.
+
+Finally, the -M option requests that the scaffold and coordinates within said scaffold
+from which each read has been sampled are written to the indicated file, one line per
+read, ASCII encoded. This "map" file essential tells one where every read belongs in
+an assembly and is very useful for debugging and testing purposes. If the map line for
+a read is say 's b e' then if b \< e the read is a perturbed copy of s[b,e] in the
+forward direction, and a perturbed copy s[e,b] in the reverse direction otherwise.
+
+```
+15. rangen <genlen:double> [-U] [-b<double(.5)>] [-w<int(80)>] [-r<int>]
+```
+
+Generate a random DNA sequence of length genlen*1Mbp that has an AT-bias of -b.
+Output the sequence to the standard output in .fasta format. Use uppercase letters if
+-U is set and -w base pairs per line (default 80). The result can then be converted
+into a .dam DB and given to the simulator to create a read database over a random
+synthetic sequence. The -r option seeds the random number generator for the
+generation process so that one can reproducibly generate the same sequence. If this
+parameter is missing, then the job id of the invocation seeds the random number
+generator effectively guaranteeing a different sequence with each invocation.
+
+Example: A small complete example of most of the commands above.
+
+```
+> rangen 1.0 >R.fasta // Generate a randome 1Mbp sequence R.fasta
+> fasta2DAM R R.fasta // Load it into a .dam DB R.dam
+> simulator R -c20. >G.fasta // Sample a 20x data sets of the random geneome R
+> fasta2DB G G.fasta // Create a compressed data base of the reads, G.db
+> rm G.fasta // Redundant, recreate any time with "DB2fasta G"
+> DBsplit -s11 G // Split G into 2 parts of size ~ 11MB each
+> DBdust G.1 // Produce a "dust" track on each part
+> DBdust G.2
+> Catrack G dust // Create one track for all of the DB
+> rm .G.*.dust.* // Clean up the sub-tracks
+> DBstats -mdust G // Take a look at the statistics for the database
+
+Statistics for all reads in the data set
+
+ 1,836 reads out of 1,836 (100.0%)
+ 20,007,090 base pairs out of 20,007,090 (100.0%)
+
+ 10,897 average read length
+ 2,192 standard deviation
+
+ Base composition: 0.250(A) 0.250(C) 0.250(G) 0.250(T)
+
+ Distribution of Read Lengths (Bin size = 1,000)
+
+ Bin: Count % Reads % Bases Average
+ 22,000: 1 0.1 0.1 22654
+ 21,000: 0 0.1 0.1 22654
+ 20,000: 1 0.1 0.2 21355
+ 19,000: 0 0.1 0.2 21355
+ 18,000: 4 0.3 0.6 19489
+ 17,000: 8 0.8 1.3 18374
+ 16,000: 19 1.8 2.8 17231
+ 15,000: 43 4.1 6.2 16253
+ 14,000: 81 8.6 12.0 15341
+ 13,000: 146 16.5 21.9 14428
+ 12,000: 200 27.4 34.4 13664
+ 11,000: 315 44.6 52.4 12824
+ 10,000: 357 64.0 71.2 12126
+ 9,000: 306 80.7 85.8 11586
+ 8,000: 211 92.2 94.8 11208
+ 7,000: 95 97.3 98.4 11017
+ 6,000: 43 99.7 99.8 10914
+ 5,000: 6 100.0 100.0 10897
+
+
+Statistics for dust-track
+
+ There are 158 intervals totaling 1,820 bases (0.0% of all data)
+
+ Distribution of dust intervals (Bin size = 1,000)
+
+ Bin: Count % Intervals % Bases Average
+ 0: 158 100.0 100.0 11
+
+> ls -al
+total 66518744
+drwxr-xr-x+ 177 myersg staff 6018 Mar 2 13:28 .
+drwxr-xr-x+ 20 myersg staff 680 Feb 26 19:52 ..
+-rw-r--r--+ 1 myersg staff 5002464 Mar 2 13:28 .G.bps
+-rw-r--r--+ 1 myersg staff 14704 Mar 2 13:28 .G.dust.anno
+-rw-r--r--+ 1 myersg staff 1264 Mar 2 13:28 .G.dust.data
+-rw-r--r--+ 1 myersg staff 73552 Mar 2 13:28 .G.idx
+-rw-r--r--+ 1 myersg staff 162 Mar 2 13:28 G.db
+> cat G.db
+files = 1
+ 1836 G Sim
+blocks = 2
+size = 11 cutoff = 0 all = 0
+ 0 0
+ 1011 1011
+ 1836 1836
+```
diff --git a/fasta2DAM.c b/fasta2DAM.c
index d7f75ec..15e074a 100644
--- a/fasta2DAM.c
+++ b/fasta2DAM.c
@@ -1,40 +1,3 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/*******************************************************************************************
*
* Add .fasta files to a DB:
@@ -72,7 +35,7 @@
#define PATHSEP "/"
#endif
-static char *Usage = "[-v] <path:string> ( -f<file> | <input:fasta> ... )";
+static char *Usage = "[-v] <path:dam> ( -f<file> | -i[<name>] | <input:fasta> ... )";
static char number[128] =
{ 0, 0, 0, 0, 0, 0, 0, 0,
@@ -105,6 +68,8 @@ File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first)
{ File_Iterator *it;
it = Malloc(sizeof(File_Iterator),"Allocating file iterator");
+ if (it == NULL)
+ return (NULL);
it->argc = argc;
it->argv = argv;
it->input = input;
@@ -131,12 +96,15 @@ int next_file(File_Iterator *it)
if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL)
{ if (feof(it->input))
return (0);
- SYSTEM_ERROR;
+ fprintf(stderr,"%s: IO error reading line %d of -f file of names\n",Prog_Name,it->count);
+ it->name = NULL;
+ return (1);
}
if ((eol = index(nbuffer,'\n')) == NULL)
{ fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n",
Prog_Name,it->count,MAX_NAME+7);
it->name = NULL;
+ return (1);
}
*eol = '\0';
it->count += 1;
@@ -147,21 +115,23 @@ int next_file(File_Iterator *it)
int main(int argc, char *argv[])
-{ FILE *ostub;
+{ FILE *istub, *ostub;
char *dbname;
char *root, *pwd;
FILE *bases, *indx, *hdrs;
- int64 boff, hoff;
+ int64 boff, ioff, hoff, noff;
int ifiles, ofiles;
char **flist;
HITS_DB db;
int ureads;
+ int64 offset, hdrset;
- int VERBOSE;
+ char *PIPE;
FILE *IFILE;
+ int VERBOSE;
// Process command line
@@ -171,6 +141,7 @@ int main(int argc, char *argv[])
ARG_INIT("fasta2DAM")
IFILE = NULL;
+ PIPE = NULL;
j = 1;
for (i = 1; i < argc; i++)
@@ -186,6 +157,20 @@ int main(int argc, char *argv[])
exit (1);
}
break;
+ case 'i':
+ PIPE = argv[i]+2;
+ if (PIPE[0] != '\0')
+ { FILE *temp;
+
+ temp = fopen(PIPE,"w");
+ if (temp == NULL)
+ { fprintf(stderr,"%s: Cannot create -i name '%s'\n",Prog_Name,argv[i]+2);
+ exit (1);
+ }
+ fclose(temp);
+ unlink(PIPE);
+ }
+ break;
}
else
argv[j++] = argv[i];
@@ -193,70 +178,148 @@ int main(int argc, char *argv[])
VERBOSE = flags['v'];
- if ((IFILE == NULL && argc <= 2) || (IFILE != NULL && argc != 2))
+ if (IFILE != NULL && PIPE != NULL)
+ { fprintf(stderr,"%s: Cannot use both -f and -i together\n",Prog_Name);
+ exit (1);
+ }
+
+ if ( (IFILE == NULL && PIPE == NULL && argc <= 2) ||
+ ((IFILE != NULL || PIPE != NULL) && argc != 2))
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
exit (1);
}
}
- // Try to open DB file, if present then adding to DB, otherwise creating new DB. Set up
+ // Try to open DAM file, if present then adding to DAM, otherwise creating new DAM. Set up
// variables as follows:
// dbname = full name of map index = <pwd>/<root>.dam
+ // istub = open db file (if adding) or NULL (if creating)
// ostub = new image of db file (will overwrite old image at end)
// bases = .bps file positioned for appending
// indx = .idx file positioned for appending
+ // hdrs = .hdr file positioned for appending
// ureads = # of reads currently in db
- // boff = offset in .bps at which to place next sequence
- // hoff = offset in .hdr at which to place next header prefix
+ // offset = offset in .bps at which to place next sequence
+ // hdrset = offset in .hdr at which to place next header
+ // ioff = offset in .idx file to truncate to if command fails
+ // boff = offset in .bps file to truncate to if command fails
+ // hoff = offset in .hdr file to truncate to if command fails
// ifiles = # of .fasta files to add
// ofiles = # of .fasta files added so far
- // flist = [0..ifiles] list of file names (root only) added to db so far
+ // flist = [0..ifiles+ofiles] list of file names (root only) added to dam so far
+
+ { int i;
+
+ root = Root(argv[1],".dam");
+ pwd = PathTo(argv[1]);
+ dbname = Strdup(Catenate(pwd,"/",root,".dam"),"Allocating map index name");
+ if (dbname == NULL)
+ exit (1);
+
+ if (PIPE != NULL)
+ ifiles = 1;
+ else if (IFILE == NULL)
+ ifiles = argc-2;
+ else
+ { File_Iterator *ng;
+
+ ifiles = 0;
+ ng = init_file_iterator(argc,argv,IFILE,2);
+ if (ng == NULL)
+ exit (1);
+ while (next_file(ng))
+ { if (ng->name == NULL)
+ exit (1);
+ ifiles += 1;
+ }
+ free(ng);
+ }
- root = Root(argv[1],".dam");
- pwd = PathTo(argv[1]);
- dbname = Strdup(Catenate(pwd,"/",root,".dam"),"Allocating map index name");
- if (dbname == NULL)
- exit (1);
+ bases = NULL;
+ indx = NULL;
+ hdrs = NULL;
+ ostub = NULL;
+ ioff = 0;
+ boff = 0;
+ hoff = 0;
+
+ istub = fopen(dbname,"r");
+ if (istub == NULL)
+ { ofiles = 0;
+
+ bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"w+");
+ indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"w+");
+ hdrs = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"w+");
+ if (bases == NULL || indx == NULL || hdrs == NULL)
+ goto error;
- if (IFILE == NULL)
- ifiles = argc-2;
- else
- { File_Iterator *ng;
+ fwrite(&db,sizeof(HITS_DB),1,indx);
- ifiles = 0;
- ng = init_file_iterator(argc,argv,IFILE,2);
- while (next_file(ng))
- ifiles += 1;
- free(ng);
- }
- ofiles = 0;
+ ureads = 0;
+ offset = 0;
+ hdrset = 0;
+ boff = 0;
+ ioff = 0;
+ hoff = 0;
+ }
+ else
+ { if (fscanf(istub,DB_NFILE,&ofiles) != 1)
+ { fprintf(stderr,"%s: %s.dam is corrupted, read failed 1\n",Prog_Name,root);
+ goto error;
+ }
- bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"w");
- indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"w");
- hdrs = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"w");
- if (bases == NULL || indx == NULL || hdrs == NULL)
- exit (1);
+ bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"r+");
+ indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+");
+ hdrs = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"r+");
+ if (bases == NULL || indx == NULL || hdrs == NULL)
+ goto error;
- flist = (char **) Malloc(sizeof(char *)*ifiles,"Allocating file list");
- fwrite(&db,sizeof(HITS_DB),1,indx);
+ if (fread(&db,sizeof(HITS_DB),1,indx) != 1)
+ { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",Prog_Name,root);
+ goto error;
+ }
+ fseeko(bases,0,SEEK_END);
+ fseeko(indx, 0,SEEK_END);
+ fseeko(hdrs, 0,SEEK_END);
+
+ ureads = db.ureads;
+ offset = ftello(bases);
+ hdrset = ftello(hdrs);
+ boff = offset;
+ ioff = ftello(indx);
+ hoff = hdrset;
+ }
- ureads = 0;
- boff = 0;
- hoff = 0;
+ flist = (char **) Malloc(sizeof(char *)*(ofiles+ifiles),"Allocating file list");
+ ostub = Fopen(Catenate(pwd,"/",root,".dbx"),"w+");
+ if (ostub == NULL || flist == NULL)
+ goto error;
- ostub = Fopen(dbname,"w+");
- if (ostub == NULL)
- exit (1);
+ fprintf(ostub,DB_NFILE,ofiles+ifiles);
+ noff = 0;
+ for (i = 0; i < ofiles; i++)
+ { int last;
+ char prolog[MAX_NAME], fname[MAX_NAME];
- fprintf(ostub,DB_NFILE,argc-2);
+ if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
+ { fprintf(stderr,"%s: %s.dam is corrupted, read failed 5(%d)\n",Prog_Name,root,i);
+ goto error;
+ }
+ if ((flist[i] = Strdup(fname,"Adding to file list")) == NULL)
+ goto error;
+ noff = ftello(ostub);
+ fprintf(ostub,DB_FDATA,last,fname,prolog);
+ }
+ }
{ int maxlen;
int64 totlen, count[4];
int rmax;
HITS_READ prec;
char *read;
+ int append;
int c;
- File_Iterator *ng;
+ File_Iterator *ng = NULL;
// Buffer for accumulating .fasta sequence over multiple lines
@@ -272,31 +335,66 @@ int main(int argc, char *argv[])
// For each .fasta file do:
- ng = init_file_iterator(argc,argv,IFILE,2);
- while (next_file(ng))
+ if (PIPE == NULL)
+ { ng = init_file_iterator(argc,argv,IFILE,2);
+ if (ng == NULL)
+ goto error;
+ }
+
+ while (PIPE != NULL || next_file(ng))
{ FILE *input;
char *path, *core;
int nline, eof, rlen;
- if (ng->name == NULL) goto error;
+ // Open it: <path>/<core>.fasta if file, stdin otherwise with core = PIPE or "stdout"
- // Open it: <path>/<core>.fasta, check that core is not too long,
- // and checking that it is not already in flist.
+ if (PIPE == NULL)
- path = PathTo(ng->name);
- core = Root(ng->name,".fasta");
- if ((input = Fopen(Catenate(path,"/",core,".fasta"),"r")) == NULL)
- goto error;
- free(path);
+ { if (ng->name == NULL) goto error;
+
+ path = PathTo(ng->name);
+ core = Root(ng->name,".fasta");
+ if ((input = Fopen(Catenate(path,"/",core,".fasta"),"r")) == NULL)
+ goto error;
+ free(path);
+ }
+
+ else
+
+ { if (PIPE[0] == '\0')
+ core = Strdup("stdout","Allocating file name");
+ else
+ core = Strdup(PIPE,"Allocating file name");
+ if (core == NULL)
+ goto error;
+ input = stdin;
+ }
+
+ // Check that core is not too long and name is unique or last source if PIPE'd
+ // If PIPE'd and last source, then overwrite last file line of new stub file.
+
+ if (strlen(core) >= MAX_NAME)
+ { fprintf(stderr,"%s: File name over %d chars: '%.200s'\n",
+ Prog_Name,MAX_NAME,core);
+ goto error;
+ }
{ int j;
- for (j = 0; j < ofiles; j++)
- if (strcmp(core,flist[j]) == 0)
- { fprintf(stderr,"%s: File %s.fasta is already in database %s.db\n",
- Prog_Name,core,Root(argv[1],".db"));
- goto error;
- }
+ append = 0;
+ if (PIPE == NULL || (strcmp(core,"stdout") != 0 &&
+ (ofiles == 0 || strcmp(core,flist[ofiles-1]) != 0)))
+ { for (j = 0; j < ofiles; j++)
+ if (strcmp(core,flist[j]) == 0)
+ { fprintf(stderr,"%s: File %s.fasta is already in database %s.dam\n",
+ Prog_Name,core,Root(argv[1],".dam"));
+ goto error;
+ }
+ }
+ else if (ofiles > 0 && strcmp(core,flist[ofiles-1]) == 0)
+ { fseeko(ostub,noff,SEEK_SET);
+ append = 1;
+ }
}
// Get the header of the first line. If the file is empty skip.
@@ -314,12 +412,17 @@ int main(int argc, char *argv[])
// Add the file name to flist
if (VERBOSE)
- { fprintf(stderr,"Adding '%s' ...\n",core);
+ { if (PIPE != NULL && PIPE[0] == '\0')
+ fprintf(stderr,"Adding scaffolds from stdio ...\n");
+ else
+ fprintf(stderr,"Adding '%s.fasta' ...\n",core);
fflush(stderr);
}
- flist[ofiles++] = core;
+
+ if (!append)
+ flist[ofiles++] = core;
- // Check that the first line has PACBIO format, and record prolog in 'prolog'.
+ // Check that the first line is a header line
if (read[strlen(read)-1] != '\n')
{ fprintf(stderr,"File %s.fasta, Line 1: Fasta line is too long (> %d chars)\n",
@@ -336,13 +439,12 @@ int main(int argc, char *argv[])
{ int i, x, n;
while (!eof)
- { int hlen, hline;
+ { int hlen;
read[rlen] = '>';
hlen = strlen(read+rlen);
fwrite(read+rlen,1,hlen,hdrs);
- hline = nline;
rlen = 0;
while (1)
{ eof = (fgets(read+rlen,MAX_NAME,input) == NULL);
@@ -382,8 +484,8 @@ int main(int argc, char *argv[])
pbeg = i;
prec.fpulse = pbeg;
prec.origin = n++;
- prec.boff = boff;
- prec.coff = hoff;
+ prec.boff = offset;
+ prec.coff = hdrset;
prec.flags = DB_BEST;
while (i < rlen)
{ x = number[(int) read[i]];
@@ -400,51 +502,186 @@ int main(int argc, char *argv[])
Compress_Read(plen,read+pbeg);
clen = COMPRESSED_LEN(plen);
fwrite(read+pbeg,1,clen,bases);
- boff += clen;
+ offset += clen;
fwrite(&prec,sizeof(HITS_READ),1,indx);
}
- hoff += hlen;
+ hdrset += hlen;
}
+ }
- fprintf(ostub,DB_FDATA,ureads,core,core);
+ fprintf(ostub,DB_FDATA,ureads,core,core);
+ if (PIPE == NULL)
fclose(input);
- }
+ else
+ break;
}
// Update relevant fields in db record
db.ureads = ureads;
- db.treads = ureads;
- for (c = 0; c < 4; c++)
- db.freq[c] = (float) ((1.*count[c])/totlen);
- db.totlen = totlen;
- db.maxlen = maxlen;
- db.cutoff = -1;
+ if (istub == NULL)
+ { for (c = 0; c < 4; c++)
+ db.freq[c] = (float) ((1.*count[c])/totlen);
+ db.totlen = totlen;
+ db.maxlen = maxlen;
+ db.cutoff = -1;
+ }
+ else
+ { for (c = 0; c < 4; c++)
+ db.freq[c] = (float) ((db.freq[c]*db.totlen + (1.*count[c]))/(db.totlen + totlen));
+ db.totlen += totlen;
+ if (maxlen > db.maxlen)
+ db.maxlen = maxlen;
+ }
}
+ // If db has been previously partitioned then calculate additional partition points and
+ // write to new db file image
+
+ if (db.cutoff >= 0)
+ { int64 totlen, dbpos, size;
+ int nblock, ireads, tfirst, rlen;
+ int ufirst, cutoff, allflag;
+ HITS_READ record;
+ int i;
+
+ if (VERBOSE)
+ { fprintf(stderr,"Updating block partition ...\n");
+ fflush(stderr);
+ }
+
+ // Read the block portion of the existing db image getting the indices of the first
+ // read in the last block of the exisiting db as well as the partition parameters.
+ // Copy the old image block information to the new block information (except for
+ // the indices of the last partial block)
+
+ if (fscanf(istub,DB_NBLOCK,&nblock) != 1)
+ { fprintf(stderr,"%s: %s.dam is corrupted, read failed 2\n",Prog_Name,root);
+ goto error;
+ }
+ dbpos = ftello(ostub);
+ fprintf(ostub,DB_NBLOCK,0);
+ if (fscanf(istub,DB_PARAMS,&size,&cutoff,&allflag) != 3)
+ { fprintf(stderr,"%s: %s.dam is corrupted, read failed 3\n",Prog_Name,root);
+ goto error;
+ }
+ fprintf(ostub,DB_PARAMS,size,cutoff,allflag);
+ if (allflag)
+ allflag = 0;
+ else
+ allflag = DB_BEST;
+
+ nblock -= 1;
+ for (i = 0; i <= nblock; i++)
+ { if (fscanf(istub,DB_BDATA,&ufirst,&tfirst) != 2)
+ { fprintf(stderr,"%s: %s.dam is corrupted, read failed 4\n",Prog_Name,root);
+ goto error;
+ }
+ fprintf(ostub,DB_BDATA,ufirst,tfirst);
+ }
+
+ // Seek the first record of the last block of the existing db in .idx, and then
+ // compute and record partition indices for the rest of the db from this point
+ // forward.
+
+ fseeko(indx,sizeof(HITS_DB)+sizeof(HITS_READ)*ufirst,SEEK_SET);
+ totlen = 0;
+ ireads = 0;
+ for (i = ufirst; i < ureads; i++)
+ { if (fread(&record,sizeof(HITS_READ),1,indx) != 1)
+ { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",Prog_Name,root);
+ goto error;
+ }
+ rlen = record.rlen;
+ if (rlen >= cutoff)
+ { ireads += 1;
+ tfirst += 1;
+ totlen += rlen;
+ if (totlen >= size)
+ { fprintf(ostub," %9d %9d\n",i+1,tfirst);
+ totlen = 0;
+ ireads = 0;
+ nblock += 1;
+ }
+ }
+ }
+
+ if (ireads > 0)
+ { fprintf(ostub,DB_BDATA,ureads,tfirst);
+ nblock += 1;
+ }
+
+ db.treads = tfirst;
+
+ fseeko(ostub,dbpos,SEEK_SET);
+
+ fprintf(ostub,DB_NBLOCK,nblock); // Rewind and record the new number of blocks
+ }
+ else
+ db.treads = ureads;
+
+ rewind(ostub);
+ fprintf(ostub,DB_NFILE,ofiles);
+
rewind(indx);
fwrite(&db,sizeof(HITS_DB),1,indx); // Write the finalized db record into .idx
+ if (istub != NULL)
+ fclose(istub);
fclose(ostub);
fclose(indx);
fclose(bases);
fclose(hdrs);
+ rename(Catenate(pwd,"/",root,".dbx"),dbname); // New image replaces old image
+
exit (0);
- // Error exit: Remove the .idx, .bps, and .dam files
+ // Error exit: Either truncate or remove the .idx, .bps, and .hdr files as appropriate.
+ // Remove the new image file <pwd>/<root>.dbx
error:
- fclose(ostub);
- fclose(indx);
- fclose(hdrs);
- fclose(bases);
- unlink(Catenate(pwd,PATHSEP,root,".idx"));
- unlink(Catenate(pwd,PATHSEP,root,".bps"));
- unlink(Catenate(pwd,PATHSEP,root,".hdr"));
- unlink(Catenate(pwd,"/",root,".dam"));
+ if (ioff != 0)
+ { fseeko(indx,0,SEEK_SET);
+ if (ftruncate(fileno(indx),ioff) < 0)
+ fprintf(stderr,"%s: Fatal: could not restore %s.idx after error, truncate failed\n",
+ Prog_Name,root);
+ }
+ if (boff != 0)
+ { fseeko(bases,0,SEEK_SET);
+ if (ftruncate(fileno(bases),boff) < 0)
+ fprintf(stderr,"%s: Fatal: could not restore %s.bps after error, truncate failed\n",
+ Prog_Name,root);
+ }
+ if (hoff != 0)
+ { fseeko(hdrs,0,SEEK_SET);
+ if (ftruncate(fileno(hdrs),hoff) < 0)
+ fprintf(stderr,"%s: Fatal: could not restore %s.hdr after error, truncate failed\n",
+ Prog_Name,root);
+ }
+ if (indx != NULL)
+ { fclose(indx);
+ if (ioff == 0)
+ unlink(Catenate(pwd,PATHSEP,root,".idx"));
+ }
+ if (bases != NULL)
+ { fclose(bases);
+ if (boff == 0)
+ unlink(Catenate(pwd,PATHSEP,root,".bps"));
+ }
+ if (hdrs != NULL)
+ { fclose(hdrs);
+ if (hoff == 0)
+ unlink(Catenate(pwd,PATHSEP,root,".hdr"));
+ }
+ if (ostub != NULL)
+ { fclose(ostub);
+ unlink(Catenate(pwd,"/",root,".dbx"));
+ }
+ if (istub != NULL)
+ fclose(istub);
exit (1);
}
diff --git a/fasta2DB.c b/fasta2DB.c
index 50061d5..071bbe0 100644
--- a/fasta2DB.c
+++ b/fasta2DB.c
@@ -1,40 +1,3 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/*******************************************************************************************
*
* Add .fasta files to a DB:
@@ -72,7 +35,7 @@
#define PATHSEP "/"
#endif
-static char *Usage = "[-v] <path:string> ( -f<file> | <input:fasta> ... )";
+static char *Usage = "[-v] <path:string> ( -f<file> | -i[<name>] | <input:fasta> ... )";
static char number[128] =
{ 0, 0, 0, 0, 0, 0, 0, 0,
@@ -105,6 +68,8 @@ File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first)
{ File_Iterator *it;
it = Malloc(sizeof(File_Iterator),"Allocating file iterator");
+ if (it == NULL)
+ return (NULL);
it->argc = argc;
it->argv = argv;
it->input = input;
@@ -131,12 +96,15 @@ int next_file(File_Iterator *it)
if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL)
{ if (feof(it->input))
return (0);
- SYSTEM_ERROR;
+ fprintf(stderr,"%s: IO error reading line %d of -f file of names\n",Prog_Name,it->count);
+ it->name = NULL;
+ return (1);
}
if ((eol = index(nbuffer,'\n')) == NULL)
{ fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n",
Prog_Name,it->count,MAX_NAME+7);
it->name = NULL;
+ return (1);
}
*eol = '\0';
it->count += 1;
@@ -154,17 +122,18 @@ int main(int argc, char *argv[])
FILE *bases, *indx;
int64 boff, ioff;
- int ifiles, ofiles;
+ int ifiles, ofiles, ocells;
char **flist;
HITS_DB db;
int ureads;
int64 offset;
+ char *PIPE;
FILE *IFILE;
int VERBOSE;
- // Usage: [-v] <path:string> ( -f<file> | <input:fasta> ... )
+ // Process command line
{ int i, j, k;
int flags[128];
@@ -172,6 +141,7 @@ int main(int argc, char *argv[])
ARG_INIT("fasta2DB")
IFILE = NULL;
+ PIPE = NULL;
j = 1;
for (i = 1; i < argc; i++)
@@ -187,6 +157,20 @@ int main(int argc, char *argv[])
exit (1);
}
break;
+ case 'i':
+ PIPE = argv[i]+2;
+ if (PIPE[0] != '\0')
+ { FILE *temp;
+
+ temp = fopen(PIPE,"w");
+ if (temp == NULL)
+ { fprintf(stderr,"%s: Cannot create -i name '%s'\n",Prog_Name,argv[i]+2);
+ exit (1);
+ }
+ fclose(temp);
+ unlink(PIPE);
+ }
+ break;
}
else
argv[j++] = argv[i];
@@ -194,7 +178,13 @@ int main(int argc, char *argv[])
VERBOSE = flags['v'];
- if ((IFILE == NULL && argc <= 2) || (IFILE != NULL && argc != 2))
+ if (IFILE != NULL && PIPE != NULL)
+ { fprintf(stderr,"%s: Cannot use both -f and -i together\n",Prog_Name);
+ exit (1);
+ }
+
+ if ( (IFILE == NULL && PIPE == NULL && argc <= 2) ||
+ ((IFILE != NULL || PIPE != NULL) && argc != 2))
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
exit (1);
}
@@ -212,10 +202,11 @@ int main(int argc, char *argv[])
// ioff = offset in .idx file to truncate to if command fails
// boff = offset in .bps file to truncate to if command fails
// ifiles = # of .fasta files to add
- // ofiles = # of .fasta files already in db
- // flist = [0..ifiles+ofiles] list of file names (root only) added to db so far
+ // ofiles = # of .fasta files added so far
+ // ocells = # of SMRT cells already in db
+ // flist = [0..ifiles+ocells] list of file names (root only) added to db so far
- { int i;
+ { int i;
root = Root(argv[1],".db");
pwd = PathTo(argv[1]);
@@ -223,26 +214,39 @@ int main(int argc, char *argv[])
if (dbname == NULL)
exit (1);
- if (IFILE == NULL)
+ if (PIPE != NULL)
+ ifiles = 1;
+ else if (IFILE == NULL)
ifiles = argc-2;
else
{ File_Iterator *ng;
ifiles = 0;
ng = init_file_iterator(argc,argv,IFILE,2);
+ if (ng == NULL)
+ exit (1);
while (next_file(ng))
- ifiles += 1;
+ { if (ng->name == NULL)
+ exit (1);
+ ifiles += 1;
+ }
free(ng);
}
+ bases = NULL;
+ indx = NULL;
+ ostub = NULL;
+ ioff = 0;
+ boff = 0;
+
istub = fopen(dbname,"r");
if (istub == NULL)
- { ofiles = 0;
+ { ocells = 0;
bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"w+");
indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"w+");
if (bases == NULL || indx == NULL)
- exit (1);
+ goto error;
fwrite(&db,sizeof(HITS_DB),1,indx);
@@ -252,16 +256,20 @@ int main(int argc, char *argv[])
ioff = 0;
}
else
- { if (fscanf(istub,DB_NFILE,&ofiles) != 1)
- SYSTEM_ERROR
+ { if (fscanf(istub,DB_NFILE,&ocells) != 1)
+ { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root);
+ goto error;
+ }
bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"r+");
indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+");
if (bases == NULL || indx == NULL)
- exit (1);
+ goto error;
if (fread(&db,sizeof(HITS_DB),1,indx) != 1)
- SYSTEM_ERROR
+ { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",Prog_Name,root);
+ goto error;
+ }
fseeko(bases,0,SEEK_END);
fseeko(indx, 0,SEEK_END);
@@ -271,20 +279,24 @@ int main(int argc, char *argv[])
ioff = ftello(indx);
}
- flist = (char **) Malloc(sizeof(char *)*(ofiles+ifiles),"Allocating file list");
+ flist = (char **) Malloc(sizeof(char *)*(ocells+ifiles),"Allocating file list");
ostub = Fopen(Catenate(pwd,"/",root,".dbx"),"w+");
if (ostub == NULL || flist == NULL)
- exit (1);
+ goto error;
- fprintf(ostub,DB_NFILE,ofiles+ifiles);
- for (i = 0; i < ofiles; i++)
+ fprintf(ostub,DB_NFILE,ocells+ifiles); // Will write again with correct value at end
+ ofiles = 0;
+ for (i = 0; i < ocells; i++)
{ int last;
char prolog[MAX_NAME], fname[MAX_NAME];
if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
- SYSTEM_ERROR
- if ((flist[i] = Strdup(fname,"Adding to file list")) == NULL)
- goto error;
+ { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root);
+ goto error;
+ }
+ if (ofiles == 0 || strcmp(flist[ofiles-1],fname) != 0)
+ if ((flist[ofiles++] = Strdup(fname,"Adding to file list")) == NULL)
+ goto error;
fprintf(ostub,DB_FDATA,last,fname,prolog);
}
}
@@ -295,7 +307,7 @@ int main(int argc, char *argv[])
HITS_READ *prec;
char *read;
int c;
- File_Iterator *ng;
+ File_Iterator *ng = NULL;
// Buffer for reads all in the same well
@@ -316,25 +328,47 @@ int main(int argc, char *argv[])
for (c = 0; c < 4; c++) // count of acgt in new .fasta files
count[c] = 0;
- // For each new .fasta file do:
+ // For each new input source do
- ng = init_file_iterator(argc,argv,IFILE,2);
- while (next_file(ng))
+ if (PIPE == NULL)
+ { ng = init_file_iterator(argc,argv,IFILE,2); // Setup to read .fasta's
+ if (ng == NULL) // from command line or file
+ goto error;
+ }
+
+ while (PIPE != NULL || next_file(ng))
{ FILE *input;
- char *path, *core, *prolog;
+ char prolog[MAX_NAME];
+ char *path, *core;
int nline, eof, rlen, pcnt;
int pwell;
- if (ng->name == NULL) goto error;
+ // Open it: <path>/<core>.fasta if file, stdin otherwise with core = PIPE or "stdout"
- // Open it: <path>/<core>.fasta, check that core is not too long,
- // and checking that it is not already in flist.
+ if (PIPE == NULL)
+
+ { if (ng->name == NULL) goto error;
+
+ path = PathTo(ng->name);
+ core = Root(ng->name,".fasta");
+ if ((input = Fopen(Catenate(path,"/",core,".fasta"),"r")) == NULL)
+ goto error;
+ free(path);
+ }
+
+ else
+
+ { if (PIPE[0] == '\0')
+ core = Strdup("stdout","Allocating file name");
+ else
+ core = Strdup(PIPE,"Allocating file name");
+ if (core == NULL)
+ goto error;
+ input = stdin;
+ }
+
+ // Check that core is not too long and name is unique or last source if PIPE'd
- path = PathTo(ng->name);
- core = Root(ng->name,".fasta");
- if ((input = Fopen(Catenate(path,"/",core,".fasta"),"r")) == NULL)
- goto error;
- free(path);
if (strlen(core) >= MAX_NAME)
{ fprintf(stderr,"%s: File name over %d chars: '%.200s'\n",
Prog_Name,MAX_NAME,core);
@@ -343,12 +377,15 @@ int main(int argc, char *argv[])
{ int j;
- for (j = 0; j < ofiles; j++)
- if (strcmp(core,flist[j]) == 0)
- { fprintf(stderr,"%s: File %s.fasta is already in database %s.db\n",
- Prog_Name,core,Root(argv[1],".db"));
- goto error;
- }
+
+ if (PIPE == NULL || (strcmp(core,"stdout") != 0 &&
+ (ofiles == 0 || strcmp(core,flist[ofiles-1]) != 0)))
+ for (j = 0; j < ofiles; j++)
+ if (strcmp(core,flist[j]) == 0)
+ { fprintf(stderr,"%s: File %s.fasta is already in database %s.db\n",
+ Prog_Name,core,Root(argv[1],".db"));
+ goto error;
+ }
}
// Get the header of the first line. If the file is empty skip.
@@ -359,6 +396,8 @@ int main(int argc, char *argv[])
eof = (fgets(read,MAX_NAME,input) == NULL);
if (eof || strlen(read) < 1)
{ fprintf(stderr,"Skipping '%s', file is empty!\n",core);
+ if (input == stdin)
+ break;
fclose(input);
free(core);
continue;
@@ -367,12 +406,15 @@ int main(int argc, char *argv[])
// Add the file name to flist
if (VERBOSE)
- { fprintf(stderr,"Adding '%s' ...\n",core);
+ { if (PIPE != NULL && PIPE[0] == '\0')
+ fprintf(stderr,"Adding reads from stdio ...\n");
+ else
+ fprintf(stderr,"Adding '%s.fasta' ...\n",core);
fflush(stderr);
}
flist[ofiles++] = core;
- // Check that the first line has PACBIO format, and record prolog in 'prolog'.
+ // Check that the first line is a header and has PACBIO format.
if (read[strlen(read)-1] != '\n')
{ fprintf(stderr,"File %s.fasta, Line 1: Fasta line is too long (> %d chars)\n",
@@ -390,9 +432,8 @@ int main(int argc, char *argv[])
find = index(read+1,'/');
if (find != NULL && sscanf(find+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) >= 3)
{ *find = '\0';
- prolog = Strdup(read+1,"Extracting prolog");
+ strcpy(prolog,read+1);
*find = '/';
- if (prolog == NULL) goto error;
}
else
{ fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line format error\n",
@@ -407,7 +448,7 @@ int main(int argc, char *argv[])
pwell = -1;
while (!eof)
- { int beg, end, clen, hline;
+ { int beg, end, clen;
int well, qv;
char *find;
@@ -419,9 +460,9 @@ int main(int argc, char *argv[])
}
*find = '\0';
if (strcmp(read+(rlen+1),prolog) != 0)
- { fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line name inconsisten\n",
- core,nline);
- goto error;
+ { fprintf(ostub,DB_FDATA,ureads,core,prolog);
+ ocells += 1;
+ strcpy(prolog,read+(rlen+1));
}
*find = '/';
x = sscanf(find+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv);
@@ -433,16 +474,20 @@ int main(int argc, char *argv[])
else if (x == 3)
qv = 0;
- hline = nline;
rlen = 0;
while (1)
{ eof = (fgets(read+rlen,MAX_NAME,input) == NULL);
nline += 1;
x = strlen(read+rlen)-1;
if (read[rlen+x] != '\n')
- { fprintf(stderr,"File %s.fasta, Line %d:",core,nline);
- fprintf(stderr," Fasta line is too long (> %d chars)\n",MAX_NAME-2);
- goto error;
+ { if (read[rlen] == '>')
+ { fprintf(stderr,"File %s.fasta, Line %d:",core,nline);
+ fprintf(stderr," Fasta header line is too long (> %d chars)\n",
+ MAX_NAME-2);
+ goto error;
+ }
+ else
+ x += 1;
}
if (eof || read[rlen] == '>')
break;
@@ -510,7 +555,7 @@ int main(int argc, char *argv[])
}
// Complete processing of .fasta file: flush last well group, write file line
- // in db image, free prolog, and close file
+ // in db image, and close file
x = 0;
for (i = 1; i < pcnt; i++)
@@ -518,12 +563,15 @@ int main(int argc, char *argv[])
x = i;
prec[x].flags |= DB_BEST;
fwrite(prec,sizeof(HITS_READ),pcnt,indx);
-
- fprintf(ostub,DB_FDATA,ureads,core,prolog);
}
- free(prolog);
- fclose(input);
+ fprintf(ostub,DB_FDATA,ureads,core,prolog);
+ ocells += 1;
+
+ if (input != stdin)
+ fclose(input);
+ else
+ break;
}
// Finished loading all sequences: update relevant fields in db record
@@ -566,22 +614,27 @@ int main(int argc, char *argv[])
// the indices of the last partial block)
if (fscanf(istub,DB_NBLOCK,&nblock) != 1)
- SYSTEM_ERROR
+ { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root);
+ goto error;
+ }
dbpos = ftello(ostub);
fprintf(ostub,DB_NBLOCK,0);
if (fscanf(istub,DB_PARAMS,&size,&cutoff,&allflag) != 3)
- SYSTEM_ERROR
- fprintf(ostub,DB_PARAMS,size,cutoff,allflag);
+ { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root);
+ goto error;
+ }
+ fprintf(ostub,DB_PARAMS,size,cutoff,allflag);
if (allflag)
allflag = 0;
else
allflag = DB_BEST;
- size *= 1000000ll;
nblock -= 1;
for (i = 0; i <= nblock; i++)
{ if (fscanf(istub,DB_BDATA,&ufirst,&tfirst) != 2)
- SYSTEM_ERROR
+ { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root);
+ goto error;
+ }
fprintf(ostub,DB_BDATA,ufirst,tfirst);
}
@@ -594,7 +647,9 @@ int main(int argc, char *argv[])
ireads = 0;
for (i = ufirst; i < ureads; i++)
{ if (fread(&record,sizeof(HITS_READ),1,indx) != 1)
- SYSTEM_ERROR
+ { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",Prog_Name,root);
+ goto error;
+ }
rlen = record.rlen;
if (rlen >= cutoff && (record.flags & DB_BEST) >= allflag)
{ ireads += 1;
@@ -626,7 +681,7 @@ int main(int argc, char *argv[])
fwrite(&db,sizeof(HITS_DB),1,indx); // Write the finalized db record into .idx
rewind(ostub); // Rewrite the number of files actually added
- fprintf(ostub,DB_NFILE,ofiles);
+ fprintf(ostub,DB_NFILE,ocells);
if (istub != NULL)
fclose(istub);
@@ -645,24 +700,31 @@ error:
if (ioff != 0)
{ fseeko(indx,0,SEEK_SET);
if (ftruncate(fileno(indx),ioff) < 0)
- SYSTEM_ERROR
+ fprintf(stderr,"%s: Fatal: could not restore %s.idx after error, truncate failed\n",
+ Prog_Name,root);
}
if (boff != 0)
{ fseeko(bases,0,SEEK_SET);
if (ftruncate(fileno(bases),boff) < 0)
- SYSTEM_ERROR
+ fprintf(stderr,"%s: Fatal: could not restore %s.bps after error, truncate failed\n",
+ Prog_Name,root);
+ }
+ if (indx != NULL)
+ { fclose(indx);
+ if (ioff == 0)
+ unlink(Catenate(pwd,PATHSEP,root,".idx"));
+ }
+ if (bases != NULL)
+ { fclose(bases);
+ if (boff == 0)
+ unlink(Catenate(pwd,PATHSEP,root,".bps"));
+ }
+ if (ostub != NULL)
+ { fclose(ostub);
+ unlink(Catenate(pwd,"/",root,".dbx"));
}
- fclose(indx);
- fclose(bases);
- if (ioff == 0)
- unlink(Catenate(pwd,PATHSEP,root,".idx"));
- if (boff == 0)
- unlink(Catenate(pwd,PATHSEP,root,".bps"));
-
if (istub != NULL)
fclose(istub);
- fclose(ostub);
- unlink(Catenate(pwd,"/",root,".dbx"));
exit (1);
}
diff --git a/quiva2DB.c b/quiva2DB.c
index 0ec2628..6d099f5 100644
--- a/quiva2DB.c
+++ b/quiva2DB.c
@@ -1,40 +1,3 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/*******************************************************************************************
*
* Adds the given .quiva files to an existing DB "path". The input files must be added in
@@ -59,13 +22,16 @@
#include "DB.h"
#include "QV.h"
+// Compiled in INTERACTIVE mode as all routines must return with an error
+// so that cleanup and restore is possible.
+
#ifdef HIDE_FILES
#define PATHSEP "/."
#else
#define PATHSEP "/"
#endif
-static char *Usage = "[-vl] <path:string> ( -f<file> | <input:quiva> ... )";
+static char *Usage = "[-vl] <path:string> ( -f<file> | -i | <input:quiva> ... )";
typedef struct
{ int argc;
@@ -79,6 +45,8 @@ File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first)
{ File_Iterator *it;
it = Malloc(sizeof(File_Iterator),"Allocating file iterator");
+ if (it == NULL)
+ return (NULL);
it->argc = argc;
it->argv = argv;
it->input = input;
@@ -105,12 +73,15 @@ int next_file(File_Iterator *it)
if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL)
{ if (feof(it->input))
return (0);
- SYSTEM_ERROR;
+ fprintf(stderr,"%s: IO error reading line %d of -f file of names\n",Prog_Name,it->count);
+ it->name = NULL;
+ return (1);
}
if ((eol = index(nbuffer,'\n')) == NULL)
{ fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n",
Prog_Name,it->count,MAX_NAME+7);
it->name = NULL;
+ return (1);
}
*eol = '\0';
it->count += 1;
@@ -121,15 +92,23 @@ int next_file(File_Iterator *it)
int main(int argc, char *argv[])
-{ FILE *istub, *quiva, *indx;
+{ FILE *istub;
+ char *root, *pwd;
+
+ FILE *quiva, *indx;
int64 coff;
- int ofile;
+
HITS_DB db;
HITS_READ *reads;
+ int nfiles;
+
+ FILE *temp;
+ char *tname;
int VERBOSE;
int LOSSY;
- FILE *IFILE;
+ int PIPE;
+ FILE *INFILE;
// Process command line
@@ -138,18 +117,18 @@ int main(int argc, char *argv[])
ARG_INIT("quiva2DB")
- IFILE = NULL;
+ INFILE = NULL;
j = 1;
for (i = 1; i < argc; i++)
if (argv[i][0] == '-')
switch (argv[i][1])
{ default:
- ARG_FLAGS("vl")
+ ARG_FLAGS("vli")
break;
case 'f':
- IFILE = fopen(argv[i]+2,"r");
- if (IFILE == NULL)
+ INFILE = fopen(argv[i]+2,"r");
+ if (INFILE == NULL)
{ fprintf(stderr,"%s: Cannot open file of inputs '%s'\n",Prog_Name,argv[i]+2);
exit (1);
}
@@ -161,192 +140,335 @@ int main(int argc, char *argv[])
VERBOSE = flags['v'];
LOSSY = flags['l'];
+ PIPE = flags['i'];
+
+ if (INFILE != NULL && PIPE)
+ { fprintf(stderr,"%s: Cannot use both -f and -i together\n",Prog_Name);
+ exit (1);
+ }
- if ((IFILE == NULL && argc <= 2) || (IFILE != NULL && argc != 2))
+ if ( (INFILE == NULL && ! PIPE && argc <= 2) ||
+ ((INFILE != NULL || PIPE) && argc != 2))
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
exit (1);
}
}
- // Open DB stub file and index, load db and read records. Confirm that the .fasta files
- // corresponding to the command line .quiva files are in the DB and in order where the
- // index of the first file is ofile and the index of the first read to be added is ofirst.
- // Record in coff the current size of the .qvs file in case an error occurs and it needs
- // to be truncated back to its size at the start.
-
- { int i;
- char *pwd, *root;
- int nfiles;
- File_Iterator *ng;
-
- root = Root(argv[1],".db");
- pwd = PathTo(argv[1]);
- istub = Fopen(Catenate(pwd,"/",root,".db"),"r");
- if (istub == NULL)
- exit (1);
+ // Open DB stub file, index, and .qvs file for appending. Load db and read records,
+ // get number of cells from stub file, and note current offset to end of .qvs
- indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+");
- if (indx == NULL)
+ root = Root(argv[1],".db");
+ pwd = PathTo(argv[1]);
+ istub = Fopen(Catenate(pwd,"/",root,".db"),"r");
+ if (istub == NULL)
+ { fprintf(stderr,"%s",Ebuffer);
+ exit (1);
+ }
+ if (fscanf(istub,DB_NFILE,&nfiles) != 1)
+ { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",root,Prog_Name);
exit (1);
- if (fread(&db,sizeof(HITS_DB),1,indx) != 1)
- SYSTEM_ERROR
+ }
- reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*db.ureads,"Allocating DB index");
- if (reads == NULL)
+ indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+");
+ if (indx == NULL)
+ { fprintf(stderr,"%s",Ebuffer);
+ exit (1);
+ }
+ if (fread(&db,sizeof(HITS_DB),1,indx) != 1)
+ { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",root,Prog_Name);
exit (1);
- if (fread(reads,sizeof(HITS_READ),db.ureads,indx) != (size_t) (db.ureads))
- SYSTEM_ERROR
+ }
- { int first, last;
- char prolog[MAX_NAME], fname[MAX_NAME];
- char *core;
+ reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*db.ureads,"Allocating DB index");
+ if (reads == NULL)
+ { fprintf(stderr,"%s",Ebuffer);
+ exit (1);
+ }
+ if (fread(reads,sizeof(HITS_READ),db.ureads,indx) != (size_t) (db.ureads))
+ { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",root,Prog_Name);
+ exit (1);
+ }
- ng = init_file_iterator(argc,argv,IFILE,2);
- if ( ! next_file(ng))
- { fprintf(stderr,"%s: file list is empty!\n",Prog_Name);
- exit (1);
- }
- if (ng->name == NULL) exit (1);
+ quiva = NULL;
+ temp = NULL;
+ coff = 0;
- core = Root(ng->name,".quiva");
+ if (reads[0].coff < 0)
+ quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"w");
+ else
+ quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"r+");
- if (fscanf(istub,DB_NFILE,&nfiles) != 1)
- SYSTEM_ERROR
- first = 0;
- for (i = 0; i < nfiles; i++)
- { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
- SYSTEM_ERROR
- if (strcmp(core,fname) == 0)
- break;
- first = last;
- }
- if (i >= nfiles)
- { fprintf(stderr,"%s: %s.fasta has never been added to DB\n",Prog_Name,core);
- exit (1);
- }
+ tname = Strdup(Catenate(".",PATHSEP,root,Numbered_Suffix("",getpid(),".tmp")),
+ "Allocating temporary name");
+ temp = Fopen(tname,"w+");
- ofile = i;
- if (first > 0 && reads[first-1].coff < 0)
- { fprintf(stderr,"%s: Predecessor of %s.quiva has not been added yet\n",Prog_Name,core);
- exit (1);
- }
- if (reads[first].coff >= 0)
- { fprintf(stderr,"%s: %s.quiva has already been added\n",Prog_Name,core);
- exit (1);
- }
+ if (quiva == NULL || temp == NULL)
+ { fprintf(stderr,"%s",Ebuffer);
+ goto error;
+ }
+ fseeko(quiva,0,SEEK_END);
+ coff = ftello(quiva);
+
+ // Do a merged traversal of cell lines in .db stub file and .quiva files to be
+ // imported, driving the loop with the cell line #
+
+ { FILE *input = NULL;
+ char *path = NULL;
+ char *core = NULL;
+ File_Iterator *ng = NULL;
+ char lname[MAX_NAME];
+ int first, last, cline;
+ int cell;
+
+ if (!PIPE)
+ { ng = init_file_iterator(argc,argv,INFILE,2);
+ if (ng == NULL)
+ { fprintf(stderr,"%s",Ebuffer);
+ goto error;
+ }
+ }
- while (next_file(ng))
- { if (ng->name == NULL)
- exit (1);
- core = Root(ng->name,".quiva");
- if (++i >= nfiles)
- { fprintf(stderr,"%s: %s.fasta has never been added to DB\n",Prog_Name,core);
- exit (1);
+ for (cell = 0; cell < nfiles; cell++)
+ { char prolog[MAX_NAME], fname[MAX_NAME];
+
+ if (cell == 0)
+
+ // First addition, a pipe: find the first cell that does not have .quiva's yet
+ // (error if none) and set input source to stdin.
+
+ if (PIPE)
+ { first = 0;
+ while (cell < nfiles)
+ { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
+ { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",core,Prog_Name);
+ goto error;
+ }
+ if (reads[first].coff < 0)
+ break;
+ first = last;
+ cell += 1;
+ }
+ if (cell >= nfiles)
+ { fprintf(stderr,"%s: All .quiva's have already been added !?\n",Prog_Name);
+ goto error;
+ }
+
+ input = stdin;
+
+ if (VERBOSE)
+ { fprintf(stderr,"Adding quiva's from stdin ...\n");
+ fflush(stderr);
+ }
+ cline = 0;
}
- if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
- SYSTEM_ERROR
- if (strcmp(core,fname) != 0)
- { fprintf(stderr,"%s: Files not being added in order (expect %s, given %s)",
- Prog_Name,fname,core);
- exit (1);
+
+ // First addition, not a pipe: then get first .quiva file name (error if not one) to
+ // add, find the first cell name whose file name matches (error if none), check that
+ // the previous .quiva's have been added and this is the next slot. Then open
+ // the .quiva file for compression
+
+ else
+ { if (! next_file(ng))
+ { fprintf(stderr,"%s: file list is empty!\n",Prog_Name);
+ goto error;
+ }
+ if (ng->name == NULL)
+ { fprintf(stderr,"%s",Ebuffer);
+ goto error;
+ }
+
+ core = Root(ng->name,".quiva");
+ path = PathTo(ng->name);
+ if ((input = Fopen(Catenate(path,"/",core,".quiva"),"r")) == NULL)
+ { fprintf(stderr,"%s",Ebuffer);
+ goto error;
+ }
+
+ first = 0;
+ while (cell < nfiles)
+ { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
+ { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",core,Prog_Name);
+ goto error;
+ }
+ if (strcmp(core,fname) == 0)
+ break;
+ first = last;
+ cell += 1;
+ }
+ if (cell >= nfiles)
+ { fprintf(stderr,"%s: %s.fasta has never been added to DB\n",Prog_Name,core);
+ goto error;
+ }
+
+ if (first > 0 && reads[first-1].coff < 0)
+ { fprintf(stderr,"%s: Predecessor of %s.quiva has not been added yet\n",
+ Prog_Name,core);
+ goto error;
+ }
+ if (reads[first].coff >= 0)
+ { fprintf(stderr,"%s: %s.quiva has already been added\n",Prog_Name,core);
+ goto error;
+ }
+
+ if (VERBOSE)
+ { fprintf(stderr,"Adding '%s.quiva' ...\n",core);
+ fflush(stderr);
+ }
+ cline = 0;
}
- }
- if (ofile == 0)
- quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"w");
- else
- quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"r+");
- if (quiva == NULL)
- exit (1);
+ // Not the first addition: get next cell line. If not a pipe and the file name is new,
+ // then close the current .quiva, open the next one and after ensuring the names
+ // match, open it for compression
- fseeko(quiva,0,SEEK_END);
- coff = ftello(quiva);
+ else
+ { first = last;
+ strcpy(lname,fname);
+ if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
+ { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",core,Prog_Name);
+ goto error;
+ }
+ if (PIPE)
+ { int c;
+ if ((c = fgetc(input)) == EOF)
+ break;
+ ungetc(c,input);
+ }
+ else if (strcmp(lname,fname) != 0)
+ { if (fgetc(input) != EOF)
+ { fprintf(stderr,"%s: Too many reads in %s.quiva while handling %s.fasta\n",
+ Prog_Name,core,fname);
+ goto error;
+ }
+
+ fclose(input);
+ free(path);
+ free(core);
+
+ if ( ! next_file(ng))
+ break;
+ if (ng->name == NULL)
+ { fprintf(stderr,"%s",Ebuffer);
+ goto error;
+ }
+
+ path = PathTo(ng->name);
+ core = Root(ng->name,".quiva");
+ if ((input = Fopen(Catenate(path,"/",core,".quiva"),"r")) == NULL)
+ { fprintf(stderr,"%s",Ebuffer);
+ goto error;
+ }
+
+ if (strcmp(core,fname) != 0)
+ { fprintf(stderr,"%s: Files not being added in order (expect %s, given %s)\n",
+ Prog_Name,fname,core);
+ goto error;
+ }
+
+ if (VERBOSE)
+ { fprintf(stderr,"Adding '%s.quiva' ...\n",core);
+ fflush(stderr);
+ }
+ cline = 0;
+ }
+ }
- free(core);
- free(ng);
- }
+ // Compress reads [first..last) from open .quiva appending to .qvs and record
+ // offset in .coff field of reads (offset of first in a cell is to the compression
+ // table).
- free(root);
- free(pwd);
- }
+ { int64 qpos;
+ QVcoding *coding;
+ int i, s;
- // For each .quiva file, determine its compression scheme in a fast scan and append it to
- // the .qvs file Then compress every .quiva entry in the file, appending its compressed
- // form to the .qvs file as you go and recording the offset in the .qvs in the .coff field
- // of each read record (*except* the first, that points at the compression scheme immediately
- // preceding it). Ensure that the # of .quiva entries matches the # of .fasta entries
- // in each added file.
-
- { int i;
- int last, cur;
- File_Iterator *ng;
-
- // For each .quiva file do:
-
- rewind(istub);
- if (fscanf(istub,"files = %*d\n") != 0)
- SYSTEM_ERROR
-
- last = 0;
- for (i = 0; i < ofile; i++)
- if (fscanf(istub," %9d %*s %*s\n",&last) != 1)
- SYSTEM_ERROR
-
- ng = init_file_iterator(argc,argv,IFILE,2);
- cur = last;
- while (next_file(ng))
- { FILE *input;
- int64 qpos;
- char *pwd, *root;
- QVcoding *coding;
-
- // Open next .quiva file and create its compression scheme
-
- pwd = PathTo(ng->name);
- root = Root(ng->name,".quiva");
- if ((input = Fopen(Catenate(pwd,"/",root,".quiva"),"r")) == NULL)
- goto error;
-
- if (VERBOSE)
- { fprintf(stderr,"Analyzing '%s' ...\n",root);
- fflush(stderr);
- }
+ rewind(temp);
+ if (ftruncate(fileno(temp),0) < 0)
+ { fprintf(stderr,"%s: System error: could not truncate temporary file\n",Prog_Name);
+ goto error;
+ }
+ Set_QV_Line(cline);
+ s = QVcoding_Scan(input,last-first,temp);
+ if (s < 0)
+ { fprintf(stderr,"%s",Ebuffer);
+ goto error;
+ }
+ if (s != last-first)
+ { if (PIPE)
+ fprintf(stderr,"%s: Insufficient # of reads on input while handling %s.fasta\n",
+ Prog_Name,fname);
+ else
+ fprintf(stderr,"%s: Insufficient # of reads in %s.quiva while handling %s.fasta\n",
+ Prog_Name,core,fname);
+ goto error;
+ }
- QVcoding_Scan(input);
- coding = Create_QVcoding(LOSSY);
- coding->prefix = Strdup(".qvs","Allocating header prefix");
+ coding = Create_QVcoding(LOSSY);
+ if (coding == NULL)
+ { fprintf(stderr,"%s",Ebuffer);
+ goto error;
+ }
- qpos = ftello(quiva);
- Write_QVcoding(quiva,coding);
+ coding->prefix = Strdup(".qvs","Allocating header prefix");
+ if (coding->prefix == NULL)
+ { fprintf(stderr,"%s",Ebuffer);
+ goto error;
+ }
- // Then compress and append to the .qvs each compressed QV entry
+ qpos = ftello(quiva);
+ Write_QVcoding(quiva,coding);
+
+ // Then compress and append to the .qvs each compressed QV entry
- if (VERBOSE)
- { fprintf(stderr,"Compressing '%s' ...\n",root);
- fflush(stderr);
- }
+ rewind(temp);
+ Set_QV_Line(cline);
+ for (i = first; i < last; i++)
+ { s = Read_Lines(temp,1);
+ if (s < -1)
+ { fprintf(stderr,"%s",Ebuffer);
+ goto error;
+ }
+ reads[i].coff = qpos;
+ s = Compress_Next_QVentry(temp,quiva,coding,LOSSY);
+ if (s < 0)
+ { fprintf(stderr,"%s",Ebuffer);
+ goto error;
+ }
+ if (s != reads[i].rlen)
+ { fprintf(stderr,"%s: Length of quiva %d is different than fasta in DB\n",
+ Prog_Name,i+1);
+ goto error;
+ }
+ qpos = ftello(quiva);
+ }
+ cline = Get_QV_Line();
- rewind(input);
- while (Read_Lines(input,1) > 0)
- { reads[cur++].coff = qpos;
- Compress_Next_QVentry(input,quiva,coding,LOSSY);
- qpos = ftello(quiva);
- }
+ Free_QVcoding(coding);
+ }
+ }
- if (fscanf(istub," %9d %*s %*s\n",&last) != 1)
- SYSTEM_ERROR
- if (last != cur)
- { fprintf(stderr,"%s: Number of reads in %s.quiva doesn't match number in %s.fasta\n",
- Prog_Name,root,root);
+ if (fgetc(input) != EOF)
+ { if (PIPE)
+ fprintf(stderr,"%s: Too many reads on input while handling %s.fasta\n",
+ Prog_Name,lname);
+ else
+ fprintf(stderr,"%s: Too many reads in %s.quiva while handling %s.fasta\n",
+ Prog_Name,core,lname);
+ goto error;
+ }
+ if ( ! PIPE && cell >= nfiles)
+ { fclose(input);
+ free(core);
+ free(path);
+ if (next_file(ng))
+ { if (ng->name == NULL)
+ { fprintf(stderr,"%s",Ebuffer);
+ goto error;
+ }
+ core = Root(ng->name,".quiva");
+ fprintf(stderr,"%s: %s.fasta has never been added to DB\n",Prog_Name,core);
goto error;
}
-
- Free_QVcoding(coding);
- free(root);
- free(pwd);
- }
-
- free(ng);
+ }
}
// Write the db record and read index into .idx and clean up
@@ -358,6 +480,8 @@ int main(int argc, char *argv[])
fclose(istub);
fclose(indx);
fclose(quiva);
+ fclose(temp);
+ unlink(tname);
exit (0);
@@ -367,18 +491,20 @@ error:
if (coff != 0)
{ fseeko(quiva,0,SEEK_SET);
if (ftruncate(fileno(quiva),coff) < 0)
- SYSTEM_ERROR
+ fprintf(stderr,"%s: Fatal: could not restore %s.qvs after error, truncate failed\n",
+ Prog_Name,root);
+ }
+ if (quiva != NULL)
+ { fclose(quiva);
+ if (coff == 0)
+ unlink(Catenate(pwd,PATHSEP,root,".qvs"));
+ }
+ if (temp != NULL)
+ { fclose(temp);
+ unlink(tname);
}
fclose(istub);
fclose(indx);
- fclose(quiva);
- if (coff == 0)
- { char *root = Root(argv[1],".db");
- char *pwd = PathTo(argv[1]);
- unlink(Catenate(pwd,PATHSEP,root,".qvs"));
- free(pwd);
- free(root);
- }
exit (1);
}
diff --git a/rangen.c b/rangen.c
new file mode 100644
index 0000000..27d8354
--- /dev/null
+++ b/rangen.c
@@ -0,0 +1,182 @@
+/*******************************************************************************************
+ *
+ * Synthetic DNA shotgun sequence generator
+ * Generate a fake genome of size genlen*1Mb long, that has an AT-bias of -b.
+ * The -r parameter seeds the random number generator for the generation of the genome
+ * so that one can reproducbile produce the same underlying genome to sample from. If
+ * missing, then the job id of the invocation seeds the generator. The sequence is
+ * sent to the standard output in .fasta format.
+ *
+ * Author: Gene Myers
+ * Date : April 2016
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+
+static char *Usage = "<genlen:double> [-U] [-b<double(.5)>] [-w<int(80)>] [-r<int>]";
+
+static int GENOME; // -g option * 1Mbp
+static double BIAS; // -b option
+static int HASR = 0; // -r option is set?
+static int SEED; // -r option
+static int WIDTH; // -w option
+static int UPPER; // -U option
+
+static char *Prog_Name;
+
+// Generate a random DNA sequence of length *len* with an AT-bias of BIAS.
+// Uppercase letters if UPPER is set, lowercase otherwise.
+
+static char *random_genome(int len)
+{ static char *seq = NULL;
+ static double x, PRA, PRC, PRG;
+ int i;
+
+ if (seq == NULL)
+ { PRA = BIAS/2.;
+ PRC = (1.-BIAS)/2. + PRA;
+ PRG = (1.-BIAS)/2. + PRC;
+ if ((seq = (char *) malloc(WIDTH+1)) == NULL)
+ { fprintf(stderr,"%s: Allocating genome sequence\n",Prog_Name);
+ exit (1);
+ }
+ }
+
+ if (UPPER)
+ for (i = 0; i < len; i++)
+ { x = drand48();
+ if (x < PRC)
+ if (x < PRA)
+ seq[i] = 'A';
+ else
+ seq[i] = 'C';
+ else
+ if (x < PRG)
+ seq[i] = 'G';
+ else
+ seq[i] = 'T';
+ }
+ else
+ for (i = 0; i < len; i++)
+ { x = drand48();
+ if (x < PRC)
+ if (x < PRA)
+ seq[i] = 'a';
+ else
+ seq[i] = 'c';
+ else
+ if (x < PRG)
+ seq[i] = 'g';
+ else
+ seq[i] = 't';
+ }
+ seq[len] = '\0';
+
+ return (seq);
+}
+
+int main(int argc, char *argv[])
+{ int i, j;
+ char *eptr;
+ double glen;
+
+ // Process command arguments
+ //
+ // Usage: <GenomeLen:double> [-b<double(.5)>] [-r<int>]
+
+ Prog_Name = strdup("rangen");
+
+ WIDTH = 80;
+ BIAS = .5;
+ HASR = 0;
+ UPPER = 0;
+
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ switch (argv[i][1])
+ { default:
+ fprintf(stderr,"%s: %s is an illegal option\n",Prog_Name,argv[i]);
+ exit (1);
+ case 'U':
+ if (argv[i][2] != '\0')
+ { fprintf(stderr,"%s: %s is an illegal option\n",Prog_Name,argv[i]);
+ exit (1);
+ }
+ UPPER = 1;
+ break;
+ case 'b':
+ BIAS = strtod(argv[i]+2,&eptr);
+ if (*eptr != '\0' || argv[i][2] == '\0')
+ { fprintf(stderr,"%s: -%c '%s' argument is not a real number\n",
+ Prog_Name,argv[i][1],argv[i]+2);
+ exit (1);
+ }
+ if (BIAS < 0. || BIAS > 1.)
+ { fprintf(stderr,"%s: AT-bias must be in [0,1] (%g)\n",Prog_Name,BIAS);
+ exit (1);
+ }
+ break;
+ case 'r':
+ SEED = strtol(argv[i]+2,&eptr,10);
+ HASR = 1;
+ if (*eptr != '\0' || argv[i][2] == '\0')
+ { fprintf(stderr,"%s: -r argument is not an integer\n",Prog_Name);
+ exit (1);
+ }
+ break;
+ case 'w':
+ WIDTH = strtol(argv[i]+2,&eptr,10);
+ if (*eptr != '\0' || argv[i][2] == '\0')
+ { fprintf(stderr,"%s: -w '%s' argument is not an integer\n",Prog_Name,argv[i]+2);
+ exit (1);
+ }
+ if (WIDTH < 0)
+ { fprintf(stderr,"%s: Line width must be non-negative (%d)\n",Prog_Name,WIDTH);
+ exit (1);
+ }
+ break;
+ }
+ else
+ argv[j++] = argv[i];
+ argc = j;
+
+ if (argc != 2)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+
+ glen = strtod(argv[1],&eptr);
+ if (*eptr != '\0')
+ { fprintf(stderr,"%s: genome length is not a real number\n",Prog_Name);
+ exit (1);
+ }
+ if (glen < 0.)
+ { fprintf(stderr,"%s: Genome length must be positive (%g)\n",Prog_Name,glen);
+ exit (1);
+ }
+ GENOME = (int) (glen*1000000.);
+
+ // Set up random number generator
+
+ if (HASR)
+ srand48(SEED);
+ else
+ srand48(getpid());
+
+ // Generate the sequence line at a time where all lines have width WDITH, save the last.
+
+
+ fprintf(stdout,">random len=%d bias=%g\n",GENOME,BIAS);
+ for (j = 0; j+WIDTH < GENOME; j += WIDTH)
+ fprintf(stdout,"%s\n",random_genome(WIDTH));
+ if (j < GENOME)
+ fprintf(stdout,"%s\n",random_genome(GENOME-j));
+
+ exit (0);
+}
diff --git a/simulator.c b/simulator.c
index b16fb02..cf1d694 100644
--- a/simulator.c
+++ b/simulator.c
@@ -1,51 +1,14 @@
-/************************************************************************************\
-* *
-* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
-* *
-* Redistribution and use in source and binary forms, with or without modification, *
-* are permitted provided that the following conditions are met: *
-* *
-* · Redistributions of source code must retain the above copyright notice, this *
-* list of conditions and the following disclaimer. *
-* *
-* · Redistributions in binary form must reproduce the above copyright notice, this *
-* list of conditions and the following disclaimer in the documentation and/or *
-* other materials provided with the distribution. *
-* *
-* · The name of EWM may not be used to endorse or promote products derived from *
-* this software without specific prior written permission. *
-* *
-* THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, *
-* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND *
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE *
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
-* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
-* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *
-* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
-* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN *
-* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
-* *
-* For any issues regarding this software and its use, contact EWM at: *
-* *
-* Eugene W. Myers Jr. *
-* Bautzner Str. 122e *
-* 01099 Dresden *
-* GERMANY *
-* Email: gene.myers at gmail.com *
-* *
-\************************************************************************************/
-
/*******************************************************************************************
*
* Synthetic DNA shotgun dataset simulator
- * Generate a fake genome of size genlen*1Mb long, that has an AT-bias of -b. Then
- * sample reads of mean length -m from a log-normal length distribution with
- * standard deviation -s, but ignore reads of length less than -x. Collect enough
- * reads to cover the genome -c times. Introduce -e fraction errors into each
- * read where the ratio of insertions, deletions, and substitutions are set by
- * defined constants INS_RATE and DEL_RATE within generate.c. One can also control
- * the rate at which reads are picked from the forward and reverse strands by setting
- * the defined constant FLIP_RATE.
+ * From a supplied reference genome in the form of a Dazzler .dam, sample reads of
+ * mean length -m from a log-normal length distribution with standard deviation -s,
+ * but ignore reads of length less than -x. Collect enough reads to cover the genome
+ * -c times. Introduce -e fraction errors into each read where the ratio of insertions,
+ * deletions, and substitutions are set by defined constants INS_RATE and DEL_RATE
+ * within generate.c. The fraction -f controls the rate at which reads are picked from
+ * the forward and reverse strands which defaults to 50%. If -C is set then assume the
+ * scaffolds are circular.
*
* The -r parameter seeds the random number generator for the generation of the genome
* so that one can reproducbile produce the same underlying genome to sample from. If
@@ -53,16 +16,22 @@
* to the standard output (i.e. it is a pipe). The output is in fasta format (i.e. it is
* a UNIX pipe). The output is in Pacbio .fasta format suitable as input to fasta2DB.
*
- * The -M option requests that the coordinates from which each read has been sampled are
- * written to the indicated file, one line per read, ASCII encoded. This "map" file
- * essentially tells one where every read belongs in an assembly and is very useful for
- * debugging and testing purposes. If a read pair is say b,e then if b < e the read was
- * sampled from [b,e] in the forward direction, and from [e,b] in the reverse direction
- * otherwise.
+ * The genome is considered a sequence of *scaffolds* (these are reconstituted from the
+ * Dazzler's internal encoding of a .dam), where the gaps are filled with a random
+ * sequence that follows the base distribution of the contigs of the genome. The program
+ * then samples these filled in scaffolds for reads. If the -C optioin is set then the
+ * program assumes each scaffold is a circular sequence.
+ *
+ * The -M option requests that the scaffold and coordinates from which each read has
+ * been sampled are written to the indicated file, one line per read, ASCII encoded.
+ * This "map" file essentially tells one where every read belongs in an assembly and
+ * is very useful for debugging and testing purposes. If a read pair is say b,e then
+ * if b < e the read was sampled from [b,e] in the forward direction, and from [e,b]
+ * in the reverse direction otherwise.
*
* Author: Gene Myers
* Date : July 2013
- * Mod : April 2014 (made independent of "mylib")
+ * Mod : April 2016 (generates reads w.r.t. a reference genome)
*
********************************************************************************************/
@@ -74,59 +43,27 @@
#include "DB.h"
-static char *Usage[] = { "<genlen:double> [-c<double(20.)>] [-b<double(.5)>] [-r<int>]",
- " [-m<int(10000)>] [-s<int(2000)>] [-x<int(4000)>]",
- " [-e<double(.15)>] [-M<file>]"
+static char *Usage[] = { "<genome:dam> [-CU] [-m<int(10000)>] [-s<int(2000)>] [-e<double(.15)>]",
+ " [-c<double(50.)>] [-f<double(.5)>] [-x<int(4000)>]",
+ " [-w<int(80)>] [-r<int>] [-M<file>]",
};
-static int GENOME; // -g option * 1Mbp
-static double COVERAGE; // -c option
-static double BIAS; // -b option
-static int HASR = 0; // -r option is set?
-static int SEED; // -r option
+static int CIRCULAR; // -C option
+static int UPPER; // -U option
static int RMEAN; // -m option
static int RSDEV; // -s option
-static int RSHORT; // -x option
static double ERROR; // -e option
+static double COVERAGE; // -c option
+static double FLIP_RATE; // -f option
+static int RSHORT; // -x option
+static int WIDTH; // -w option
+static int HASR; // -r option is set?
+static int SEED; // -r option
static FILE *MAP; // -M option
#define INS_RATE .73333 // insert rate
#define DEL_RATE .20000 // deletion rate
#define IDL_RATE .93333 // insert + delete rate
-#define FLIP_RATE .5 // orientation rate (equal)
-
-// Generate a random 4 letter string of length *len* with every letter having equal probability.
-
-static char *random_genome()
-{ char *seq;
- int i;
- double x, PRA, PRC, PRG;
-
- PRA = BIAS/2.;
- PRC = (1.-BIAS)/2. + PRA;
- PRG = (1.-BIAS)/2. + PRC;
-
- if (HASR)
- srand48(SEED);
- else
- srand48(getpid());
-
- if ((seq = (char *) Malloc(GENOME+1,"Allocating genome sequence")) == NULL)
- exit (1);
- for (i = 0; i < GENOME; i++)
- { x = drand48();
- if (x < PRA)
- seq[i] = 0;
- else if (x < PRC)
- seq[i] = 1;
- else if (x < PRG)
- seq[i] = 2;
- else
- seq[i] = 3;
- }
- seq[GENOME] = 4;
- return (seq);
-}
// Complement (in the DNA sense) string *s*.
@@ -144,6 +81,8 @@ static void complement(int elen, char *s)
}
}
+// A unit normal distribution random number generator
+
#define UNORM_LEN 60000
#define UNORM_MAX 6.0
@@ -226,54 +165,227 @@ static double sample_unorm(double x)
return (y);
}
+// Open and trim the reference genome *name*. Determine the number of scaffolds and sizes
+// of each scaffold (in nscaffs and the .coff field of the read records) in the dam. Then
+// create a sequence for each scaffold (index in the .boff field of the read records), that
+// consists of its contigs with a random sequence filling the gaps (generated according to
+// the bp frequency in db.freq[4]).
+
+HITS_DB *load_and_fill(char *name, int *pscaffs)
+{ static HITS_DB db;
+ HITS_READ *reads;
+ FILE *bases;
+ char *seq;
+ int nreads, nscaffs;
+ int i, c;
+ int64 ctot;
+ int64 o, u;
+ double PRA, PRC, PRG;
+
+ if (Open_DB(name,&db) != 1)
+ { fprintf(stderr,"%s: %s is not a Dazzler .dam\n",Prog_Name,name);
+ exit (1);
+ }
+ Trim_DB(&db);
+
+ PRA = db.freq[0];
+ PRC = PRA + db.freq[1];
+ PRG = PRC + db.freq[2];
+
+ nreads = db.nreads;
+ reads = db.reads;
+
+ nscaffs = 0;
+ for (i = 0; i < nreads; i++)
+ if (reads[i].origin == 0)
+ nscaffs += 1;
+
+ for (i = 0; i < nscaffs; i++)
+ reads[i].coff = 0;
+
+ c = -1;
+ for (i = 0; i < nreads; i++)
+ { if (reads[i].origin == 0)
+ c += 1;
+ reads[c].coff = reads[i].fpulse+reads[i].rlen;
+ }
+
+ ctot = 0;
+ for (i = 0; i < nscaffs; i++)
+ ctot += reads[i].coff+1;
+
+ bases = Fopen(Catenate(db.path,"","",".bps"),"r");
+ if (bases == NULL)
+ exit (1);
+
+ seq = (char *) Malloc(ctot+4,"Allocating space for genome");
+ if (seq == NULL)
+ exit (1);
+ *seq++ = 4;
+
+ c = -1;
+ o = u = 0;
+ for (i = 0; i < nreads; i++)
+ { int len, clen;
+ int64 off;
+
+ if (reads[i].origin == 0)
+ { if (c >= 0)
+ o += reads[c].coff + 1;
+ c += 1;
+ u = o;
+ }
+ else
+ { int64 p;
+ double x;
+
+ p = u + reads[i-1].rlen;
+ u = o + reads[i].fpulse;
+ while (p < u)
+ { x = drand48();
+ if (x < PRC)
+ if (x < PRA)
+ seq[p++] = 0;
+ else
+ seq[p++] = 1;
+ else
+ if (x < PRG)
+ seq[p++] = 2;
+ else
+ seq[p++] = 3;
+ }
+ }
+
+ len = reads[i].rlen;
+ off = reads[i].boff;
+ if (ftello(bases) != off)
+ fseeko(bases,off,SEEK_SET);
+ clen = COMPRESSED_LEN(len);
+ if (clen > 0)
+ { if (fread(seq+u,clen,1,bases) != 1)
+ { EPRINTF(EPLACE,"%s: Read of .bps file failed\n",Prog_Name);
+ exit (1);
+ }
+ }
+ Uncompress_Read(len,seq+u);
+ if (reads[i].origin == 0)
+ reads[c].boff = o;
+ }
+ reads[nscaffs].boff = ctot;
+
+ db.bases = (void *) seq;
+ db.loaded = 1;
+
+ *pscaffs = nscaffs;
+ return (&db);
+}
// Generate reads (a) whose lengths are exponentially distributed with mean *mean* and
-// standard deviation *stdev*, (b) that are never shorter than *shortest* and never
-// longer than the string *source*. Each read is a randomly sampled interval of
-// *source* (each interval is equally likely) that has insertion, deletion, and/or
-// substitution errors introduced into it and which is oriented in either the forward
-// or reverse strand direction with probability FLIP_RATE. The number of errors
-// introduced is the length of the string times *erate*, and the probability of an
-// insertion, deletion, or substitution is controlled by the defined constants INS_RATE
-// and DEL_RATE. Generate reads until the sum of the lengths of the reads is greater
-// than slen*coverage. The reads are output as fasta entries with a specific header
-// format that contains the sampling interval, read length, and a read id.
-
-static void shotgun(char *source)
-{ int maxlen, nreads, qv;
- int64 totlen, totbp;
- char *rbuffer;
- double nmean, nsdev;
+// standard deviation *stdev*, and (b) that are never shorter than *shortest*. Each
+// read is a randomly sampled interval of one of the filled scaffolds of *source*
+// (each interval is equally likely) that has insertion, deletion, and/or substitution
+// errors introduced into it and which is oriented in either the forward or reverse
+// strand direction with probability FLIP_RATE. The number of errors introduced is the
+// length of the string times *erate*, and the probability of an insertion, delection,
+// or substitution is controlled by the defined constants INS_RATE and DEL_RATE.
+// If the -C option is set then each scaffold is assumed to be circular and reads can
+// be sampled that span the origin. Reads are generated until the sum of the lengths of
+// the reads is greater thant coverage times the sum of the lengths of the scaffolds in
+// the reference (i.e. including filled scaffold gaps in the genome size). The reads are
+// output as fasta entries with the PacBio-specific header format that contains the
+// sampling interval, read length, and a read id.
+
+static void shotgun(HITS_DB *source, int nscaffs)
+{ HITS_READ *reads;
+ int gleng;
+ int maxlen, nreads, qv;
+ int64 totlen, totbp;
+ char *rbuffer, *bases;
+ double nmean, nsdev;
+ double *weights;
+ int scf;
nsdev = (1.*RSDEV)/RMEAN;
nsdev = log(1.+nsdev*nsdev);
nmean = log(1.*RMEAN) - .5*nsdev;
nsdev = sqrt(nsdev);
- if (GENOME < RSHORT)
+ bases = source->bases;
+ reads = source->reads;
+ gleng = reads[nscaffs].boff - nscaffs;
+ if (gleng <= RSHORT)
{ fprintf(stderr,"Genome length is less than shortest read length !\n");
exit (1);
}
init_unorm();
+ weights = (double *) Malloc(sizeof(double)*(nscaffs+1),"Allocating contig weights");
+ if (weights == NULL)
+ exit (1);
+
+ { double r;
+
+ r = 0.;
+ for (scf = 0; scf < nscaffs; scf++)
+ { weights[scf] = r/gleng;
+ r += reads[scf].coff;
+ }
+ weights[nscaffs] = 1.;
+ }
+
qv = (int) (1000 * (1.-ERROR));
rbuffer = NULL;
maxlen = 0;
totlen = 0;
- totbp = COVERAGE*GENOME;
+ totbp = COVERAGE*gleng;
nreads = 0;
while (totlen < totbp)
- { int len, sdl, ins, del, elen, rbeg, rend;
- int j;
- char *s, *t;
+ { int len, sdl, ins, del, elen, slen, rbeg, rend;
+ int j;
+ double uni;
+ char *s, *t;
+
+ scf = bin_search(nscaffs,weights,drand48()) - 1; // Pick a scaffold with probabilitye
+ // proportional to its length
- len = (int) exp(nmean + nsdev*sample_unorm(drand48())); // Determine length of read.
- if (len > GENOME) len = GENOME;
- if (len < RSHORT)
+ uni = drand48();
+ len = (int) exp(nmean + nsdev*sample_unorm(uni)); // Pick a read length
+ if (len <= RSHORT)
continue;
+ // New sampler:
+
+ slen = reads[scf].coff;
+ rbeg = (int) (drand48()*slen); // Pick a spot for read start
+ if (CIRCULAR)
+ rend = (rbeg + len) % slen; // Wrap if circular
+ else
+ { if (drand48() < .5) // Pick direction and trim if necessary
+ { rend = rbeg + len; // if not circular
+ if (rend > slen)
+ { rend = slen;
+ len = rend - rbeg;
+ }
+ }
+ else
+ { rend = rbeg;
+ rbeg = rbeg - len;
+ if (rbeg < 0)
+ { rbeg = 0;
+ len = rend;
+ }
+ }
+ if (len <= RSHORT)
+ continue;
+ }
+
+ // Old sampler:
+ //
+ // rbeg = (int) (drand48()*((reads[scf].coff-len)+.9999999));
+ // rend = rbeg + len;
+
sdl = (int) (len*ERROR); // Determine number of inserts *ins*, deletions *del,
ins = del = 0; // and substitions+deletions *sdl*.
for (j = 0; j < sdl; j++)
@@ -285,8 +397,6 @@ static void shotgun(char *source)
}
sdl -= ins;
elen = len + (ins-del);
- rbeg = (int) (drand48()*((GENOME-len)+.9999999));
- rend = rbeg + len;
if (elen > maxlen)
{ maxlen = ((int) (1.2*elen)) + 1000;
@@ -296,7 +406,7 @@ static void shotgun(char *source)
}
t = rbuffer;
- s = source + rbeg;
+ s = bases + (reads[scf].boff + rbeg);
// Generate the string with errors. NB that inserts occur randomly between source
// characters, while deletions and substitutions occur on source characters.
@@ -320,6 +430,8 @@ static void shotgun(char *source)
sdl -= 1;
}
s += 1;
+ if (*s == 4)
+ s = bases + reads[scf].boff;
while (len * drand48() < ins)
{ *t++ = (char) (4.*drand48());
ins -= 1;
@@ -328,23 +440,24 @@ static void shotgun(char *source)
*t = 4;
if (drand48() >= FLIP_RATE) // Complement the string with probability FLIP_RATE.
- { printf(">Sim/%d/%d_%d RQ=0.%d\n",nreads+1,0,elen,qv);
- complement(elen,rbuffer);
+ { complement(elen,rbuffer);
j = rend;
rend = rbeg;
rbeg = j;
}
- else
- printf(">Sim/%d/%d_%d RQ=0.%d\n",nreads+1,0,elen,qv);
- Lower_Read(rbuffer);
- for (j = 0; j+80 < elen; j += 80)
- printf("%.80s\n",rbuffer+j);
+ printf(">Sim/%d/%d_%d RQ=0.%d\n",nreads+1,0,elen,qv);
+ if (UPPER)
+ Upper_Read(rbuffer);
+ else
+ Lower_Read(rbuffer);
+ for (j = 0; j+WIDTH < elen; j += WIDTH)
+ printf("%.*s\n",WIDTH,rbuffer+j);
if (j < elen)
printf("%s\n",rbuffer+j);
if (MAP != NULL)
- fprintf(MAP," %9d %9d\n",rbeg,rend);
+ fprintf(MAP," %6d %9d %9d\n",scf,rbeg,rend);
totlen += elen;
nreads += 1;
@@ -352,34 +465,34 @@ static void shotgun(char *source)
}
int main(int argc, char *argv[])
-{ char *source;
+{ HITS_DB *source;
+ int nscaffs;
-// Usage: <GenomeLen:double> [-c<double(20.)>] [-b<double(.5)>] [-r<int>]
-// [-m<int(10000)>] [-s<int(2000)>] [-x<int(4000)>]
-// [-e<double(.15)>] [-M<file]"
+ // Process command line
- { int i, j;
+ { int i, j, k;
+ int flags[128];
char *eptr;
- double glen;
- Prog_Name = Strdup("simulator","");
+ ARG_INIT("simulator");
- COVERAGE = 20.;
- BIAS = .5;
- HASR = 0;
- RMEAN = 10000;
- RSDEV = 2000;
- RSHORT = 4000;
- ERROR = .15;
- MAP = NULL;
+ RMEAN = 10000;
+ RSDEV = 2000;
+ ERROR = .15;
+ COVERAGE = 50.;
+ FLIP_RATE = .5;
+ RSHORT = 4000;
+ HASR = 0;
+ MAP = NULL;
+ WIDTH = 80;
j = 1;
for (i = 1; i < argc; i++)
if (argv[i][0] == '-')
switch (argv[i][1])
{ default:
- fprintf(stderr,"%s: -%c is an illegal option\n",Prog_Name,argv[i][2]);
- exit (1);
+ ARG_FLAGS("CU");
+ break;
case 'c':
ARG_REAL(COVERAGE)
if (COVERAGE < 0.)
@@ -387,13 +500,23 @@ int main(int argc, char *argv[])
exit (1);
}
break;
- case 'b':
- ARG_REAL(BIAS)
- if (BIAS < 0. || BIAS > 1.)
- { fprintf(stderr,"%s: AT-bias must be in [0,1] (%g)\n",Prog_Name,BIAS);
+ case 'e':
+ ARG_REAL(ERROR)
+ if (ERROR < 0. || ERROR > .5)
+ { fprintf(stderr,"%s: Error rate must be in [0,.5] (%g)\n",Prog_Name,ERROR);
exit (1);
}
break;
+ case 'f':
+ ARG_REAL(FLIP_RATE)
+ if (FLIP_RATE < 0. || FLIP_RATE > 1.)
+ { fprintf(stderr,"%s: Error rate must be in [0,1] (%g)\n",Prog_Name,FLIP_RATE);
+ exit (1);
+ }
+ break;
+ case 'm':
+ ARG_POSITIVE(RMEAN,"Mean read length")
+ break;
case 'r':
SEED = strtol(argv[i]+2,&eptr,10);
HASR = 1;
@@ -402,54 +525,46 @@ int main(int argc, char *argv[])
exit (1);
}
break;
- case 'M':
- MAP = Fopen(argv[i]+2,"w");
- if (MAP == NULL)
- exit (1);
- break;
- case 'm':
- ARG_POSITIVE(RMEAN,"Mean read length")
- break;
case 's':
- ARG_POSITIVE(RSDEV,"Read length standard deviation")
+ ARG_NON_NEGATIVE(RSDEV,"Read length standard deviation")
break;
case 'x':
ARG_NON_NEGATIVE(RSHORT,"Read length minimum")
break;
- case 'e':
- ARG_REAL(ERROR)
- if (ERROR < 0. || ERROR > .5)
- { fprintf(stderr,"%s: Error rate must be in [0,.5] (%g)\n",Prog_Name,ERROR);
- exit (1);
- }
+ case 'w':
+ ARG_NON_NEGATIVE(WIDTH,"Line width")
+ break;
+ case 'M':
+ MAP = Fopen(argv[i]+2,"w");
+ if (MAP == NULL)
+ exit (1);
break;
}
else
argv[j++] = argv[i];
argc = j;
+ CIRCULAR = flags['C'];
+ UPPER = flags['U'];
+
if (argc != 2)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]);
fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]);
exit (1);
}
-
- glen = strtod(argv[1],&eptr);
- if (*eptr != '\0')
- { fprintf(stderr,"%s: genome length is not a real number\n",Prog_Name);
- exit (1);
- }
- if (glen < 0.)
- { fprintf(stderr,"%s: Genome length must be positive (%g)\n",Prog_Name,glen);
- exit (1);
- }
- GENOME = (int) (glen*1000000.);
}
- source = random_genome();
+ if (HASR)
+ srand48(SEED);
+ else
+ srand48(getpid());
+
+ // Read and generate
+
+ source = load_and_fill(argv[1],&nscaffs);
- shotgun(source);
+ shotgun(source,nscaffs);
if (MAP != NULL)
fclose(MAP);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/dazzdb.git
More information about the debian-med-commit
mailing list