[med-svn] [dazzdb] 01/05: Imported Upstream version 1.0+20161112
Afif Elghraoui
afif at moszumanska.debian.org
Thu Jan 19 07:01:25 UTC 2017
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository dazzdb.
commit 49a5b8b9525073504b07e6af19d52e65217de620
Author: Afif Elghraoui <afif at debian.org>
Date: Wed Jan 18 22:14:15 2017 -0800
Imported Upstream version 1.0+20161112
---
DB.c | 95 +++++++++++++-
DB.h | 14 ++-
DB2quiva.c => DB2arrow.c | 105 ++++++++--------
DB2quiva.c | 4 +
DBdump.c | 65 ++++++++--
DBshow.c | 113 +++++++++++++----
DBsplit.c | 3 +-
DBstats.c | 2 +-
DBwipe.c | 89 +++++++++++++
Makefile | 12 +-
QV.c | 94 ++++++++++++++
QV.h | 3 +
README.md | 169 ++++++++++++++++++-------
quiva2DB.c => arrow2DB.c | 319 +++++++++++++++++++++++++----------------------
fasta2DAM.c | 1 +
fasta2DB.c | 52 ++++----
quiva2DB.c | 12 +-
17 files changed, 824 insertions(+), 328 deletions(-)
diff --git a/DB.c b/DB.c
index 3764842..3afff14 100644
--- a/DB.c
+++ b/DB.c
@@ -314,6 +314,14 @@ void Upper_Read(char *s)
*s = '\0';
}
+void Letter_Arrow(char *s)
+{ static char letter[4] = { '1', '2', '3', '4' };
+
+ for ( ; *s != 4; s++)
+ *s = letter[(int) *s];
+ *s = '\0';
+}
+
// Convert read in ascii representation to [0-3] representation (end with 4)
void Number_Read(char *s)
@@ -341,6 +349,31 @@ void Number_Read(char *s)
*s = 4;
}
+void Number_Arrow(char *s)
+{ static char arrow[128] =
+ { 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 0, 1, 2, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ };
+
+ for ( ; *s != '\0'; s++)
+ *s = arrow[(int) *s];
+ *s = 4;
+}
+
/*******************************************************************************************
*
@@ -426,7 +459,7 @@ int Open_DB(char* path, HITS_DB *db)
if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1)
if (part == 0)
{ cutoff = 0;
- all = 1;
+ all = DB_ALL;
}
else
{ EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n",
@@ -466,7 +499,7 @@ int Open_DB(char* path, HITS_DB *db)
db->tracks = NULL;
db->part = part;
db->cutoff = cutoff;
- db->all = all;
+ db->allarr |= all;
db->ufirst = ufirst;
db->tfirst = tfirst;
@@ -557,10 +590,10 @@ void Trim_DB(HITS_DB *db)
if (db->trimmed) return;
- if (db->cutoff <= 0 && db->all) return;
+ if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return;
cutoff = db->cutoff;
- if (db->all)
+ if ((db->allarr & DB_ALL) != 0)
allflag = 0;
else
allflag = DB_BEST;
@@ -660,10 +693,10 @@ static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart)
if (!db->trimmed) return (0);
- if (db->cutoff <= 0 && db->all) return (0);
+ if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return (0);
cutoff = db->cutoff;
- if (db->all)
+ if ((db->allarr & DB_ALL) != 0)
allflag = 0;
else
allflag = DB_BEST;
@@ -1435,6 +1468,56 @@ int Load_Read(HITS_DB *db, int i, char *read, int ascii)
return (0);
}
+// Load into 'read' the i'th arrow in 'db'. As an ASCII string if ascii is 1,
+// and as a numeric string otherwise.
+//
+
+HITS_DB *Arrow_DB = NULL; // Last db/arw used by "Load_Arrow"
+FILE *Arrow_File = NULL; // Becomes invalid after closing
+
+int Load_Arrow(HITS_DB *db, int i, char *read, int ascii)
+{ FILE *arrow;
+ int64 off;
+ int len, clen;
+ HITS_READ *r = db->reads;
+
+ if (i >= db->nreads)
+ { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Arrow)\n",Prog_Name);
+ EXIT(1);
+ }
+ if (Arrow_DB != db)
+ { if (Arrow_File != NULL)
+ fclose(Arrow_File);
+ arrow = Fopen(Catenate(db->path,"","",".arw"),"r");
+ if (arrow == NULL)
+ EXIT(1);
+ Arrow_File = arrow;
+ Arrow_DB = db;
+ }
+ else
+ arrow = Arrow_File;
+
+ off = r[i].boff;
+ len = r[i].rlen;
+
+ if (ftello(arrow) != off)
+ fseeko(arrow,off,SEEK_SET);
+ clen = COMPRESSED_LEN(len);
+ if (clen > 0)
+ { if (fread(read,clen,1,arrow) != 1)
+ { EPRINTF(EPLACE,"%s: Failed read of .arw file (Load_Arrow)\n",Prog_Name);
+ EXIT(1);
+ }
+ }
+ Uncompress_Read(len,read);
+ if (ascii == 1)
+ { Letter_Arrow(read);
+ read[-1] = '\0';
+ }
+ else
+ read[-1] = 4;
+ return (0);
+}
+
char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii)
{ FILE *bases = (FILE *) db->bases;
int64 off;
diff --git a/DB.h b/DB.h
index 983bee7..dc281de 100644
--- a/DB.h
+++ b/DB.h
@@ -164,6 +164,9 @@ void Lower_Read(char *s); // Convert read from numbers to lowercase letters
void Upper_Read(char *s); // Convert read from numbers to uppercase letters (0-3 to ACGT)
void Number_Read(char *s); // Convert read from letters to numbers
+void Letter_Arrow(char *s); // Convert arrow pw's from numbers to digit chars (0-3 to 1234)
+void Number_Arrow(char *s); // Convert arrow pw string from letters to numbers
+
/*******************************************************************************************
*
@@ -175,6 +178,9 @@ void Number_Read(char *s); // Convert read from letters to numbers
#define DB_CSS 0x0400 // This is the second or later of a group of reads from a given insert
#define DB_BEST 0x0800 // This is the longest read of a given insert (may be the only 1)
+#define DB_ARROW 0x2 // DB is an arrow DB
+#define DB_ALL 0x1 // all wells are in the trimmed DB
+
// Fields have different interpretations if a .db versus a .dam
typedef struct
@@ -185,6 +191,7 @@ typedef struct
// uncompressed bases in memory block
int64 coff; // Offset (in bytes) of compressed quiva streams in '.qvs' file (DB),
// Offset (in bytes) of scaffold header string in '.hdr' file (DAM)
+ // 4 compressed shorts containing snr info if an arrow DB.
int flags; // QV of read + flags above (DB only)
} HITS_READ;
@@ -226,7 +233,7 @@ typedef struct
{ int ureads; // Total number of reads in untrimmed DB
int treads; // Total number of reads in trimmed DB
int cutoff; // Minimum read length in block (-1 if not yet set)
- int all; // Consider multiple reads from a given well
+ int allarr; // DB_ALL | DB_ARROW
float freq[4]; // frequency of A, C, G, T, respectively
// Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed)
@@ -368,6 +375,11 @@ char *New_Read_Buffer(HITS_DB *db);
int Load_Read(HITS_DB *db, int i, char *read, int ascii);
+ // Exactly the same as Load_Read, save the arrow information is loaded, not the DNA sequence,
+ // and there is only a choice between numeric (0) or ascii (1);
+
+int Load_Arrow(HITS_DB *db, int i, char *read, int ascii);
+
// Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to
// the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii
// string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string
diff --git a/DB2quiva.c b/DB2arrow.c
similarity index 59%
copy from DB2quiva.c
copy to DB2arrow.c
index a7b93da..9247ab4 100644
--- a/DB2quiva.c
+++ b/DB2arrow.c
@@ -1,9 +1,9 @@
/********************************************************************************************
*
- * Recreate all the .quiva files that have been loaded into a specified database.
+ * Recreate all the .arrow files that have been loaded into a specified database.
*
* Author: Gene Myers
- * Date : May 2014
+ * Date : October 2016
*
********************************************************************************************/
@@ -12,38 +12,40 @@
#include <string.h>
#include "DB.h"
-#include "QV.h"
-#ifdef HIDE_FILES
-#define PATHSEP "/."
-#else
-#define PATHSEP "/"
-#endif
-
-static char *Usage = "[-vU] <path:db>";
+static char *Usage = "[-v] [-w<int(80)>] <path:db>";
int main(int argc, char *argv[])
{ HITS_DB _db, *db = &_db;
- FILE *dbfile, *quiva;
- int VERBOSE, UPPER;
+ FILE *dbfile;
+ int VERBOSE, WIDTH;
// Process arguments
{ int i, j, k;
int flags[128];
+ char *eptr;
+
+ ARG_INIT("DB2arrow")
- ARG_INIT("DB2quiva")
+ WIDTH = 80;
j = 1;
for (i = 1; i < argc; i++)
if (argv[i][0] == '-')
- { ARG_FLAGS("vU") }
+ switch (argv[i][1])
+ { default:
+ ARG_FLAGS("vU")
+ break;
+ case 'w':
+ ARG_NON_NEGATIVE(WIDTH,"Line width")
+ break;
+ }
else
argv[j++] = argv[i];
argc = j;
VERBOSE = flags['v'];
- UPPER = flags['U'];
if (argc != 2)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
@@ -51,10 +53,10 @@ int main(int argc, char *argv[])
}
}
- // Open db, db stub file, and .qvs file
+ // Open db, and db stub file
- { char *pwd, *root;
- int status;
+ { int status;
+ char *pwd, *root;
status = Open_DB(argv[1],db);
if (status < 0)
@@ -67,14 +69,17 @@ int main(int argc, char *argv[])
{ fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
exit (1);
}
+ if ((db->allarr & DB_ARROW) == 0)
+ { fprintf(stderr,"%s: There is no Arrow information in the DB: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
pwd = PathTo(argv[1]);
root = Root(argv[1],".db");
dbfile = Fopen(Catenate(pwd,"/",root,".db"),"r");
- quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"r");
free(pwd);
free(root);
- if (dbfile == NULL || quiva == NULL)
+ if (dbfile == NULL)
exit (1);
}
@@ -84,22 +89,19 @@ int main(int argc, char *argv[])
char lname[MAX_NAME];
FILE *ofile = NULL;
int f, first, last, ofirst, nfiles;
- QVcoding *coding;
- char **entry;
+ char *read;
if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
SYSTEM_ERROR
reads = db->reads;
- entry = New_QV_Buffer(db);
+ read = New_Read_Buffer(db);
first = ofirst = 0;
for (f = 0; f < nfiles; f++)
{ int i;
char prolog[MAX_NAME], fname[MAX_NAME];
- // Scan db image file line, create .quiva file for writing
-
- if (reads[first].coff < 0) break;
+ // Scan db image file line, create .arrow file for writing
if (fscanf(dbfile,DB_FDATA,&last,fname,prolog) != 3)
SYSTEM_ERROR
@@ -107,7 +109,7 @@ int main(int argc, char *argv[])
if (f == 0 || strcmp(fname,lname) != 0)
{ if (f > 0)
{ if (ofile == stdout)
- { fprintf(stderr," %d quivas\n",first-ofirst);
+ { fprintf(stderr," %d reads\n",first-ofirst);
fflush(stderr);
}
else
@@ -124,48 +126,44 @@ int main(int argc, char *argv[])
}
}
else
- { if ((ofile = Fopen(Catenate(".","/",fname,".quiva"),"w")) == NULL)
+ { if ((ofile = Fopen(Catenate(".","/",fname,".arrow"),"w")) == NULL)
exit (1);
if (VERBOSE)
- { fprintf(stderr,"Creating %s.quiva ...\n",fname);
- fflush(stderr);
+ { fprintf(stderr,"Creating %s.arrow ...\n",fname);
+ fflush(stderr);
}
}
strcpy(lname,fname);
}
- // For the relevant range of reads, write the header for each to the file
- // and then uncompress and write the quiva entry for each
-
- coding = Read_QVcoding(quiva);
+ // For the relevant range of reads, write each to the file
+ // recreating the original headers with the index meta-data about each read
for (i = first; i < last; i++)
- { int e, flags, qv, rlen;
+ { int j, len;
+ uint64 big;
+ float snr[4];
HITS_READ *r;
r = reads + i;
- flags = r->flags;
- rlen = r->rlen;
- qv = (flags & DB_QV);
- fprintf(ofile,"@%s/%d/%d_%d",prolog,r->origin,r->fpulse,r->fpulse+rlen);
- if (qv > 0)
- fprintf(ofile," RQ=0.%3d",qv);
+ len = r->rlen;
+ big = *((uint64 *) &(r->coff));
+ for (j = 0; j < 4; j++)
+ { snr[3-j] = (big & 0xffff) / 100.;
+ big >>= 16;
+ }
+ fprintf(ofile,">%s",prolog);
+ fprintf(ofile," SN=%.2f,%.2f,%.2f,%.2f",snr[0],snr[1],snr[2],snr[3]);
fprintf(ofile,"\n");
- Uncompress_Next_QVentry(quiva,entry,coding,rlen);
-
- if (UPPER)
- { char *deltag = entry[1];
- int j;
-
- for (j = 0; j < rlen; j++)
- deltag[j] -= 32;
- }
+ Load_Arrow(db,i,read,1);
- for (e = 0; e < 5; e++)
- fprintf(ofile,"%.*s\n",rlen,entry[e]);
+ for (j = 0; j+WIDTH < len; j += WIDTH)
+ fprintf(ofile,"%.*s\n",WIDTH,read+j);
+ if (j < len)
+ fprintf(ofile,"%s\n",read+j);
}
first = last;
@@ -173,7 +171,7 @@ int main(int argc, char *argv[])
if (f > 0)
{ if (ofile == stdout)
- { fprintf(stderr," %d quivas\n",first-ofirst);
+ { fprintf(stderr," %d reads\n",first-ofirst);
fflush(stderr);
}
else
@@ -181,7 +179,6 @@ int main(int argc, char *argv[])
}
}
- fclose(quiva);
fclose(dbfile);
Close_DB(db);
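The loop above relies on the four channel SNR values having been packed, in hundredths, into the four 16-bit fields of the read's 64-bit coff field (see the HITS_READ comment in DB.h). The following helper pair is illustrative only (not part of the commit): unpack_snr mirrors the decode loop in DB2arrow.c, and pack_snr is its implied inverse; the rounding choice in the encoder is an assumption, since the importer in arrow2DB.c is not fully shown here.

#include <stdint.h>

//  Pack four SNR values (A,C,G,T order, as implied by DBdump's N-line) into
//  one 64-bit word: snr[0] ends up in the most significant 16 bits.

static uint64_t pack_snr(const float snr[4])
{ uint64_t big;
  int      j;

  big = 0;
  for (j = 0; j < 4; j++)
    big = (big << 16) | (uint16_t) (snr[j]*100. + .5);   //  store hundredths
  return (big);
}

//  Inverse of the above, identical in spirit to the loop in DB2arrow.c.

static void unpack_snr(uint64_t big, float snr[4])
{ int j;

  for (j = 0; j < 4; j++)
    { snr[3-j] = (big & 0xffff) / 100.;
      big >>= 16;
    }
}
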
diff --git a/DB2quiva.c b/DB2quiva.c
index a7b93da..45c5cff 100644
--- a/DB2quiva.c
+++ b/DB2quiva.c
@@ -67,6 +67,10 @@ int main(int argc, char *argv[])
{ fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
exit (1);
}
+ if (db->reads[0].coff < 0 || (db->allarr & DB_ARROW) != 0)
+ { fprintf(stderr,"%s: There is no Quiver information in the DB: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
pwd = PathTo(argv[1]);
root = Root(argv[1],".db");
diff --git a/DBdump.c b/DBdump.c
index 735e726..6227251 100644
--- a/DBdump.c
+++ b/DBdump.c
@@ -21,8 +21,8 @@
#endif
static char *Usage[] =
- { "[-rhsiqp] [-uU] [-m<track>]+",
- " <path:db|dam> [ <reads:FILE> | <reads:range> ... ]"
+ { "[-rhsaiqp] [-uU] [-m<track>]+",
+ " <path:db|dam> [ <reads:FILE> | <reads:range> ... ]"
};
#define LAST_READ_SYMBOL '$'
@@ -93,6 +93,7 @@ static int prof_map[41] =
int main(int argc, char *argv[])
{ HITS_DB _db, *db = &_db;
+ int Quiva_DB, Arrow_DB;
FILE *hdrs = NULL;
int64 *qv_idx = NULL;
uint8 *qv_val = NULL;
@@ -110,7 +111,7 @@ int main(int argc, char *argv[])
FILE *input = NULL;
int TRIM, UPPER;
- int DORED, DOSEQ, DOQVS, DOHDR, DOIQV, DOPRF, DAM;
+ int DORED, DOSEQ, DOARW, DOQVS, DOHDR, DOIQV, DOPRF, DAM;
int MMAX, MTOP;
char **MASK;
@@ -134,7 +135,7 @@ int main(int argc, char *argv[])
if (argv[i][0] == '-')
switch (argv[i][1])
{ default:
- ARG_FLAGS("hpqrsiuU")
+ ARG_FLAGS("hpqrsaiuU")
break;
case 'm':
if (MTOP >= MMAX)
@@ -156,6 +157,7 @@ int main(int argc, char *argv[])
DOQVS = flags['q'];
DORED = flags['r'];
DOSEQ = flags['s'];
+ DOARW = flags['a'];
DOHDR = flags['h'];
DOIQV = flags['i'];
DOPRF = flags['p'];
@@ -195,13 +197,32 @@ int main(int argc, char *argv[])
exit (1);
DAM = 1;
if (DOQVS)
- { fprintf(stderr,"%s: -Q and -q options not compatible with a .dam DB\n",Prog_Name);
+ { fprintf(stderr,"%s: -q option is not compatible with a .dam DB\n",Prog_Name);
+ exit (1);
+ }
+ if (DOARW)
+ { fprintf(stderr,"%s: -a option is not compatible with a .dam DB\n",Prog_Name);
exit (1);
}
free(root);
free(pwd);
}
+
+ Arrow_DB = ((db->allarr & DB_ARROW) != 0);
+ Quiva_DB = (db->reads[0].coff >= 0 && (db->allarr & DB_ARROW) == 0);
+ if (DOARW)
+ { if (!Arrow_DB)
+ { fprintf(stderr,"%s: -a option set but no Arrow data in DB\n",Prog_Name);
+ exit (1);
+ }
+ }
+ if (DOQVS)
+ { if (!Quiva_DB)
+ { fprintf(stderr,"%s: -q option set but no Quiver data in DB\n",Prog_Name);
+ exit (1);
+ }
+ }
}
// Load QVs if requested
@@ -294,7 +315,7 @@ int main(int argc, char *argv[])
reads = db->reads - db->ufirst;
cutoff = db->cutoff;
- if (db->all)
+ if ((db->allarr & DB_ALL) != 0)
allflag = 0;
else
allflag = DB_BEST;
@@ -566,7 +587,7 @@ int main(int argc, char *argv[])
lst = len;
}
- if (DOSEQ | DOQVS)
+ if (DOSEQ | DOQVS | DOARW)
{ int ten = lst-fst;
if (ten > seqmax)
seqmax = ten;
@@ -597,7 +618,7 @@ int main(int argc, char *argv[])
{ printf("+ T%d %lld\n",m,trktot[m]);
printf("@ T%d %lld\n",m,trkmax[m]);
}
- if (DOSEQ | DOQVS)
+ if (DOSEQ | DOQVS | DOARW)
{ printf("+ S %lld\n",seqtot);
printf("@ S %lld\n",seqmax);
}
@@ -615,7 +636,7 @@ int main(int argc, char *argv[])
// range pairs in pts[0..reps) and according to the display options.
{ HITS_READ *reads;
- char *read, **entry;
+ char *read, *arrow, **entry;
int c, b, e, i, m;
int substr;
int map;
@@ -626,6 +647,10 @@ int main(int argc, char *argv[])
entry = New_QV_Buffer(db);
else
entry = NULL;
+ if (DOARW)
+ arrow = New_Read_Buffer(db);
+ else
+ arrow = NULL;
map = 0;
reads = db->reads;
@@ -685,8 +710,19 @@ int main(int argc, char *argv[])
map += 1;
printf("H %ld %s\n",strlen(flist[map]),flist[map]);
printf("L %d %d %d\n",r->origin,r->fpulse,r->fpulse+len);
- if (qv > 0)
- printf("Q: %d\n",qv);
+ if (Quiva_DB && qv > 0)
+ printf("Q %d\n",qv);
+ else if (Arrow_DB)
+ { int j, snr[4];
+ int64 big;
+
+ big = *((uint64 *) &(r->coff));
+ for (j = 0; j < 4; j++)
+ { snr[3-j] = (big & 0xffff);
+ big >>= 16;
+ }
+ printf("N %d %d %d %d\n",snr[0],snr[1],snr[2],snr[3]);
+ }
}
}
@@ -694,6 +730,8 @@ int main(int argc, char *argv[])
Load_QVentry(db,i,entry,UPPER);
if (DOSEQ)
Load_Read(db,i,read,UPPER);
+ if (DOARW)
+ Load_Arrow(db,i,arrow,1);
for (m = 0; m < MTOP; m++)
{ int64 *anno;
@@ -727,6 +765,11 @@ int main(int argc, char *argv[])
printf("%.*s\n",lst-fst,read+fst);
}
+ if (DOARW)
+ { printf("A %d ",lst-fst);
+ printf("%.*s\n",lst-fst,arrow+fst);
+ }
+
if (DOIQV)
{ int64 k, e;
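With -a, DBdump's 1-code output gains two record types: an "N" line carrying the four SNR channel values in hundredths and an "A" line carrying the length and the pulse-width string (see the README changes below). A throw-away consumer sketch, purely hypothetical and not part of the commit, that extracts those two line types from a dump read on stdin:

#include <stdio.h>

int main(void)
{ static char line[1000000];   //  sketch only: real code would size the buffer
                               //  from the "@ S" maximum reported in the dump
  int snr[4], alen;

  while (fgets(line,sizeof(line),stdin) != NULL)
    { if (sscanf(line,"N %d %d %d %d",snr,snr+1,snr+2,snr+3) == 4)
        printf("SNR = %.2f,%.2f,%.2f,%.2f\n",
               snr[0]/100.,snr[1]/100.,snr[2]/100.,snr[3]/100.);
      else if (sscanf(line,"A %d",&alen) == 1)
        printf("arrow string of %d pulse-width chars\n",alen);
    }
  return (0);
}
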
diff --git a/DBshow.c b/DBshow.c
index d4346a5..c376ed4 100644
--- a/DBshow.c
+++ b/DBshow.c
@@ -26,8 +26,8 @@
#endif
static char *Usage[] =
- { "[-unqUQ] [-w<int(80)>] [-m<track>]+",
- " <path:db|dam> [ <reads:FILE> | <reads:range> ... ]"
+ { "[-unqaUQA] [-w<int(80)>] [-m<track>]+",
+ " <path:db|dam> [ <reads:FILE> | <reads:range> ... ]"
};
#define LAST_READ_SYMBOL '$'
@@ -93,7 +93,7 @@ int main(int argc, char *argv[])
FILE *input;
int TRIM, UPPER;
- int DOSEQ, DOQVS, QUIVA, DAM;
+ int DOSEQ, DOQVS, DOARR, QUIVA, ARROW, DAM;
int WIDTH;
int MMAX, MTOP;
@@ -119,7 +119,7 @@ int main(int argc, char *argv[])
if (argv[i][0] == '-')
switch (argv[i][1])
{ default:
- ARG_FLAGS("unqUQ")
+ ARG_FLAGS("unqaUQA")
break;
case 'w':
ARG_NON_NEGATIVE(WIDTH,"Line width")
@@ -142,15 +142,26 @@ int main(int argc, char *argv[])
TRIM = 1-flags['u'];
UPPER = 1+flags['U'];
DOQVS = flags['q'];
+ DOARR = flags['a'];
DOSEQ = 1-flags['n'];
QUIVA = flags['Q'];
- if (QUIVA && (!DOSEQ || MTOP > 0))
- { fprintf(stderr,"%s: -Q (quiva) format request inconsistent with -n and -m options\n",
+ ARROW = flags['A'];
+ if ((QUIVA || DOQVS) && (ARROW || DOARR))
+ { fprintf(stderr,"%s: Cannot request both Quiver (-Q,-q) and Arrow (-A,a) information\n",
Prog_Name);
exit (1);
}
+
if (QUIVA)
- DOQVS = 1;
+ { DOQVS = 1;
+ DOSEQ = 0;
+ MTOP = 0;
+ }
+ if (ARROW)
+ { DOARR = 1;
+ DOSEQ = 0;
+ MTOP = 0;
+ }
if (argc <= 1)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
@@ -177,14 +188,27 @@ int main(int argc, char *argv[])
if (hdrs == NULL)
exit (1);
DAM = 1;
- if (QUIVA || DOQVS)
- { fprintf(stderr,"%s: -Q and -q options not compatible with a .dam DB\n",Prog_Name);
+ if (DOQVS || DOARR)
+ { fprintf(stderr,"%s: -q, a, Q and A options not compatible with a .dam DB\n",Prog_Name);
exit (1);
}
free(root);
free(pwd);
}
+
+ if (DOQVS)
+ { if (db->reads[0].coff < 0 || (db->allarr & DB_ARROW) != 0)
+ { fprintf(stderr,"%s: -q or Q option but no Quiver data in DB!\n",Prog_Name);
+ exit (1);
+ }
+ }
+ if (DOARR)
+ { if ((db->allarr & DB_ARROW) == 0)
+ { fprintf(stderr,"%s: -a or A option but no Arrow data in DB!\n",Prog_Name);
+ exit (1);
+ }
+ }
}
// Load QVs if requested
@@ -263,11 +287,11 @@ int main(int argc, char *argv[])
reads = db->reads - db->ufirst;
cutoff = db->cutoff;
- if (db->all)
+ if ((db->allarr & DB_ALL) != 0)
allflag = 0;
else
allflag = DB_BEST;
-
+
nid = 0;
oid = db->ufirst;
lid = oid + db->nreads;
@@ -394,7 +418,7 @@ int main(int argc, char *argv[])
{ HITS_READ *reads;
HITS_TRACK *first;
- char *read, **entry;
+ char *read, *arrow, **entry;
int c, b, e, i;
int hilight, substr;
int map;
@@ -409,6 +433,8 @@ int main(int argc, char *argv[])
{ entry = NULL;
first = db->tracks;
}
+ if (DOARR)
+ arrow = New_Read_Buffer(db);
if (UPPER == 1)
{ hilight = 'A'-'a';
@@ -446,21 +472,40 @@ int main(int argc, char *argv[])
{ int len;
int fst, lst;
int flags, qv;
+ float snr[4];
HITS_READ *r;
HITS_TRACK *track;
r = reads + i;
len = r->rlen;
+ if (substr)
+ { fst = iter->beg;
+ lst = iter->end;
+ }
+ else
+ { fst = 0;
+ lst = len;
+ }
flags = r->flags;
qv = (flags & DB_QV);
+ if (DOARR)
+ { uint64 big;
+ int j;
+
+ big = *((uint64 *) &(r->coff));
+ for (j = 0; j < 4; j++)
+ { snr[3-j] = (big & 0xffff) / 100.;
+ big >>= 16;
+ }
+ }
if (DAM)
{ char header[MAX_NAME];
fseeko(hdrs,r->coff,SEEK_SET);
fgets(header,MAX_NAME,hdrs);
header[strlen(header)-1] = '\0';
- printf("%s :: Contig %d[%d,%d]",header,r->origin,r->fpulse,r->fpulse+len);
+ printf("%s :: Contig %d[%d,%d]",header,r->origin,r->fpulse+fst,r->fpulse+lst);
}
else
{ while (i < findx[map-1])
@@ -468,11 +513,15 @@ int main(int argc, char *argv[])
while (i >= findx[map])
map += 1;
if (QUIVA)
- printf("@%s/%d/%d_%d",flist[map],r->origin,r->fpulse,r->fpulse+len);
+ printf("@%s/%d/%d_%d",flist[map],r->origin,r->fpulse+fst,r->fpulse+lst);
+ else if (ARROW)
+ printf(">%s",flist[map]);
else
- printf(">%s/%d/%d_%d",flist[map],r->origin,r->fpulse,r->fpulse+len);
+ printf(">%s/%d/%d_%d",flist[map],r->origin,r->fpulse+fst,r->fpulse+lst);
if (qv > 0)
printf(" RQ=0.%3d",qv);
+ if (DOARR)
+ printf(" SN=%.2f,%.2f,%.2f,%.2f",snr[0],snr[1],snr[2],snr[3]);
}
printf("\n");
@@ -480,6 +529,8 @@ int main(int argc, char *argv[])
Load_QVentry(db,i,entry,UPPER);
if (DOSEQ)
Load_Read(db,i,read,UPPER);
+ if (DOARR)
+ Load_Arrow(db,i,arrow,1);
for (track = first; track != NULL; track = track->next)
{ int64 *anno;
@@ -508,21 +559,20 @@ int main(int argc, char *argv[])
}
}
- if (substr)
- { fst = iter->beg;
- lst = iter->end;
- }
- else
- { fst = 0;
- lst = len;
- }
-
if (QUIVA)
{ int k;
for (k = 0; k < 5; k++)
printf("%.*s\n",lst-fst,entry[k]+fst);
}
+ else if (ARROW)
+ { int k;
+
+ for (k = fst; k+WIDTH < lst; k += WIDTH)
+ printf("%.*s\n",WIDTH,arrow+k);
+ if (k < lst)
+ printf("%.*s\n",lst-k,arrow+k);
+ }
else
{ if (DOQVS)
{ int j, k;
@@ -543,6 +593,21 @@ int main(int argc, char *argv[])
printf("\n");
}
}
+ else if (DOARR)
+ { int j;
+
+ printf("\n");
+ for (j = fst; j+WIDTH < lst; j += WIDTH)
+ { if (DOSEQ)
+ printf("%.*s\n",WIDTH,read+j);
+ printf("%.*s\n\n",WIDTH,arrow+j);
+ }
+ if (j < lst)
+ { if (DOSEQ)
+ printf("%.*s\n",lst-j,read+j);
+ printf("%.*s\n\n",lst-j,arrow+j);
+ }
+ }
else if (DOSEQ)
{ int j;
diff --git a/DBsplit.c b/DBsplit.c
index 6a218ec..9ef8b77 100644
--- a/DBsplit.c
+++ b/DBsplit.c
@@ -202,7 +202,8 @@ int main(int argc, char *argv[])
fprintf(dbfile,DB_NBLOCK,nblock);
dbs.cutoff = CUTOFF;
- dbs.all = ALL;
+ if (ALL)
+ dbs.allarr |= DB_ALL;
dbs.treads = treads;
rewind(ixfile);
fwrite(&dbs,sizeof(HITS_DB),1,ixfile);
diff --git a/DBstats.c b/DBstats.c
index b0e46c2..4a3badf 100644
--- a/DBstats.c
+++ b/DBstats.c
@@ -152,7 +152,7 @@ int main(int argc, char *argv[])
if (dam)
printf("\nStatistics for all contigs");
- else if (db->all || !TRIM)
+ else if ((db->allarr & DB_ALL) != 0 || !TRIM)
printf("\nStatistics for all wells");
else
printf("\nStatistics for all reads");
diff --git a/DBwipe.c b/DBwipe.c
new file mode 100644
index 0000000..9730062
--- /dev/null
+++ b/DBwipe.c
@@ -0,0 +1,89 @@
+/*******************************************************************************************
+ *
+ * Remove any Arrow or Quiver data from the given database: the .arw or .qvs file is
+ * deleted and the read records in the .idx file are reset, converting an A-DB or Q-DB
+ * back into a simple sequence DB (S-DB). The sequence data itself is untouched.
+ *
+ * Author: Gene Myers
+ * Date : September 2013
+ * Mod : New splitting definition to support incrementality, and new stub file format
+ * Date : April 2014
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "DB.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#else
+#define PATHSEP "/"
+#endif
+
+static char *Usage = "<path:db>";
+
+int main(int argc, char *argv[])
+{ HITS_DB db;
+ int status;
+
+ Prog_Name = Strdup("DBwipe","Allocating Program Name");
+
+ if (argc != 2)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+
+ // Open db
+
+ status = Open_DB(argv[1],&db);
+ if (status < 0)
+ exit (1);
+ if (db.part > 0)
+ { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+ if (status)
+ { fprintf(stderr,"%s: Cannot be called on a .dam: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+
+ { char *pwd, *root;
+ FILE *index;
+ int i;
+
+ pwd = PathTo(argv[1]);
+ root = Root(argv[1],".db");
+ unlink(Catenate(pwd,PATHSEP,root,".arw"));
+ unlink(Catenate(pwd,PATHSEP,root,".qvs"));
+
+ for (i = 0; i < db.nreads; i++)
+ db.reads[i].coff = -1;
+ db.allarr &= ~DB_ARROW;
+
+ if ((index = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"w")) == NULL)
+ { fprintf(stderr,"%s: Cannot open %s%s%s.idx\n",Prog_Name,pwd,PATHSEP,root);
+ exit (1);
+ }
+
+ fwrite(&db,sizeof(HITS_DB),1,index);
+ fwrite(db.reads,sizeof(HITS_READ),db.nreads,index);
+
+ fclose(index);
+ }
+
+ Close_DB(&db);
+
+ exit (0);
+}
diff --git a/Makefile b/Makefile
index cf0afb8..873296f 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ DEST_DIR = ~/bin
CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing
ALL = fasta2DB DB2fasta quiva2DB DB2quiva DBsplit DBdust Catrack DBshow DBstats DBrm simulator \
- fasta2DAM DAM2fasta DBdump rangen
+ fasta2DAM DAM2fasta DBdump rangen arrow2DB DB2arrow DBwipe
all: $(ALL)
@@ -19,6 +19,12 @@ quiva2DB: quiva2DB.c DB.c DB.h QV.c QV.h
DB2quiva: DB2quiva.c DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o DB2quiva DB2quiva.c DB.c QV.c -lm
+DB2arrow: DB2arrow.c DB.c QV.c DB.h QV.h
+ gcc $(CFLAGS) -o DB2arrow DB2arrow.c DB.c QV.c -lz
+
+arrow2DB: arrow2DB.c DB.c QV.c DB.h QV.h
+ gcc $(CFLAGS) -o arrow2DB arrow2DB.c DB.c QV.c -lz
+
DBsplit: DBsplit.c DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o DBsplit DBsplit.c DB.c QV.c -lm
@@ -52,6 +58,10 @@ fasta2DAM: fasta2DAM.c DB.c DB.h QV.c QV.h
DAM2fasta: DAM2fasta.c DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o DAM2fasta DAM2fasta.c DB.c QV.c -lm
+DBwipe: DBwipe.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DBwipe DBwipe.c DB.c QV.c -lm
+
+
clean:
rm -f $(ALL)
rm -fr *.dSYM
diff --git a/QV.c b/QV.c
index cdb6a63..d7d7263 100644
--- a/QV.c
+++ b/QV.c
@@ -863,6 +863,62 @@ static int delChar, subChar;
// Referred by: QVcoding_Scan, Create_QVcoding
+void QVcoding_Scan1(int rlen, char *delQV, char *delTag, char *insQV, char *mergeQV, char *subQV)
+{
+ if (rlen == 0) // Initialization call
+ { int i;
+
+ // Zero histograms
+
+ bzero(delHist,sizeof(uint64)*256);
+ bzero(mrgHist,sizeof(uint64)*256);
+ bzero(insHist,sizeof(uint64)*256);
+ bzero(subHist,sizeof(uint64)*256);
+
+ for (i = 0; i < 256; i++)
+ delRun[i] = subRun[i] = 1;
+
+ totChar = 0;
+ delChar = -1;
+ subChar = -1;
+ return;
+ }
+
+ // Add streams to accumulating histograms and figure out the run chars
+ // for the deletion and substitution streams
+
+ Histogram_Seqs(delHist,(uint8 *) delQV,rlen);
+ Histogram_Seqs(insHist,(uint8 *) insQV,rlen);
+ Histogram_Seqs(mrgHist,(uint8 *) mergeQV,rlen);
+ Histogram_Seqs(subHist,(uint8 *) subQV,rlen);
+
+ if (delChar < 0)
+ { int k;
+
+ for (k = 0; k < rlen; k++)
+ if (delTag[k] == 'n' || delTag[k] == 'N')
+ { delChar = delQV[k];
+ break;
+ }
+ }
+ if (delChar >= 0)
+ Histogram_Runs( delRun,(uint8 *) delQV,rlen,delChar);
+ totChar += rlen;
+ if (subChar < 0)
+ { if (totChar >= 100000)
+ { int k;
+
+ subChar = 0;
+ for (k = 1; k < 256; k++)
+ if (subHist[k] > subHist[subChar])
+ subChar = k;
+ }
+ }
+ if (subChar >= 0)
+ Histogram_Runs( subRun,(uint8 *) subQV,rlen,subChar);
+ return;
+}
+
int QVcoding_Scan(FILE *input, int num, FILE *temp)
{ char *slash;
int rlen;
@@ -1284,6 +1340,44 @@ void Free_QVcoding(QVcoding *coding)
*
********************************************************************************************/
+void Compress_Next_QVentry1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub,
+ FILE *output, QVcoding *coding, int lossy)
+{ int clen;
+
+ if (coding->delChar < 0)
+ { Encode(coding->delScheme, output, (uint8 *) del, rlen);
+ clen = rlen;
+ }
+ else
+ { Encode_Run(coding->delScheme, coding->dRunScheme, output,
+ (uint8 *) del, rlen, coding->delChar);
+ clen = Pack_Tag(tag,del,rlen,coding->delChar);
+ }
+ Number_Read(tag);
+ Compress_Read(clen,tag);
+ fwrite(tag,1,COMPRESSED_LEN(clen),output);
+
+ if (lossy)
+ { uint8 *insert = (uint8 *) ins;
+ uint8 *merge = (uint8 *) mrg;
+ int k;
+
+ for (k = 0; k < rlen; k++)
+ { insert[k] = (uint8) ((insert[k] >> 1) << 1);
+ merge[k] = (uint8) (( merge[k] >> 2) << 2);
+ }
+ }
+
+ Encode(coding->insScheme, output, (uint8 *) ins, rlen);
+ Encode(coding->mrgScheme, output, (uint8 *) mrg, rlen);
+ if (coding->subChar < 0)
+ Encode(coding->subScheme, output, (uint8 *) sub, rlen);
+ else
+ Encode_Run(coding->subScheme, coding->sRunScheme, output,
+ (uint8 *) sub, rlen, coding->subChar);
+ return;
+}
+
int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy)
{ int rlen, clen;
diff --git a/QV.h b/QV.h
index 532b2f4..e5c9485 100644
--- a/QV.h
+++ b/QV.h
@@ -58,6 +58,7 @@ int Get_QV_Line();
// If there is an error then -1 is returned, otherwise the number of entries read.
int QVcoding_Scan(FILE *input, int num, FILE *temp);
+void QVcoding_Scan1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub);
// Given QVcoding_Scan has been called at least once, create an encoding scheme based on
// the accumulated statistics and return a pointer to it. The returned encoding object
@@ -83,6 +84,8 @@ void Free_QVcoding(QVcoding *coding);
// occurred, and the sequence length otherwise.
int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy);
+void Compress_Next_QVentry1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub,
+ FILE *output, QVcoding *coding, int lossy);
// Assuming the input is position just beyond the compressed encoding of an entry header,
// read the set of compressed encodings for the ensuing 5 QV vectors, decompress them,
diff --git a/README.md b/README.md
index 515de32..ba37408 100644
--- a/README.md
+++ b/README.md
@@ -22,12 +22,15 @@ to the data base over time.
is accomplished with the concept of *tracks* that implementors can add as they
need them.
-4. The data is held in a compressed form equivalent to the .dexta and .dexqv files of
-the data extraction module. Both the .fasta and .quiva information for each
-read is held in the data base and can be recreated from it. The .quiva
-information can be added separately and later on if desired.
+4. The data is held in a compressed form equivalent to the .dexta and .dexqv/.dexar
+files of the data extraction module.
-5. To facilitate job parallel, cluster operation of the phases of our assembler, the
+5. Quiver or Arrow information can be added separately from the sequence information
+and later on if desired, but a database can hold either Quiver or Arrow information,
+not both. The Arrow or Quiver information can be removed from the database at any
+time leaving a database just containing sequence information.
+
+6. To facilitate job parallel, cluster operation of the phases of our assembler, the
data base has a concept of a *current partitioning* in which all the reads that
are over a given length and optionally unique to a well, are divided up into
*blocks* containing roughly a given number of bases, except possibly the last
@@ -37,12 +40,18 @@ all the same size. One must be careful about changing the partition during an
assembly as doing so can void the structural validity of any interim
block-based results.
+A DB can contain the information needed by Quiver, or by Arrow, or neither, but
+not both. A DB containing neither Quiver nor Arrow information is termed a
+Sequence-DB (S-DB). A DB with Quiver information is a Quiver-DB (Q-DB) and
+a DB with Arrow information is an Arrow-DB (A-DB). All commands are aware of
+the state of a DB and respond to options according to their type.
+
A Dazzler DB consists of one named, *visible* file, e.g. FOO.db, and several
*invisible* secondary files encoding various elements of the DB. The secondary files
are "invisible" to the UNIX OS in the sense that they begin with a "." and hence are
not listed by "ls" unless one specifies the -a flag. We chose to do this so that when
a user lists the contents of a directory they just see a single name, e.g. FOO.db, that
-is the one used to refer to the DB in commands. The files associated with a database
+is used to refer to the DB in commands. The files associated with a database
named, say FOO, are as follows:
* "FOO.db": a text file containing
@@ -60,7 +69,11 @@ named, say FOO, are as follows:
* ".FOO.qvs": a binary compressed "store" of the 5 Pacbio quality value streams for
the reads. Its size is roughly 5/3M bytes depending on the
- compression acheived. This file only exists if .quiva files have
+ compression achieved. This file only exists if Quiver information has
+ been added to the database.
+
+* ".FOO.arw": a binary compressed "store" of the clipped pulse width stream for
+ the reads. Its size is roughly M/4 bytes. This file only exists if Arrow information has
been added to the database.
* ".FOO.\<track\>.[anno,data]": a *track* containing customized meta-data for each read. For
@@ -83,7 +96,7 @@ been introduced in order to facilitate the mapping of reads to an assembly and h
been given the suffix .dam to distinguish it from an ordinary DB. It is structurally
identical to a .db except:
-* there is no concept of quality values, and hence no .FOO.qvs file.
+* there is no concept of quality values, and hence no .FOO.qvs or .FOO.arw file.
* every .fasta scaffold (a sequence with runs of N's between contigs estimating the
length of the gap) is broken into a separate contig sequence in the DB and the
@@ -134,7 +147,9 @@ are currently as follows:
Builds an initial data base, or adds to an existing database, either (a) the list of
.fasta files following the database name argument, or (b) the list of .fasta files in
\<file\> if the -f option is used, or (c) entries piped from the standard input if the
--i option is used. If a faux file name, \<name\>, follows the -i option then all the
+-i option is used. If the DB is being created, it is established as a Sequence-DB (S-DB);
+otherwise its type is unchanged. If a faux file name, \<name\>, follows the -i option
+then all the
input received is considered to have come from a file by the name of \<name\>.fasta by
DB2fasta, otherwise it will be sent to the standard output by DB2fasta. The SMRT cells
in a given named input (i.e. all sources other than -i without a name) can only be
@@ -165,12 +180,13 @@ set to any positive value with the -w option.
3. quiva2DB [-vl] <path:db> ( -f<file> | -i | <input:quiva> ... )
```
-Adds .quiva streams to an existing DB "path". The data comes from (a) the given .quiva
+Adds .quiva streams to an existing DB "path". The DB must either be an S-DB or a
+Q-DB and upon completion the DB is a Q-DB. The data comes from (a) the given .quiva
files on the command line, or (b) those in the file specified by the -f option, or
-(c) the standard input if the -i option is given. The input files must be added in the
+(c) the standard input if the -i option is given. The input files can be added incrementally
+but must be added in the
same order as the .fasta files were and have the same root names, e.g. FOO.fasta and
-FOO.quiva. The files can be added incrementally but must be added in the same order as
-their corresponding .fasta files. This is enforced by the program. With the -l option
+FOO.quiva. This is enforced by the program. With the -l option
set the compression scheme is a bit lossy to get more compression (see the description
of dexqv in the DEXTRACTOR module here).
@@ -178,18 +194,44 @@ of dexqv in the DEXTRACTOR module here).
4. DB2quiva [-vU] <path:db>
```
-The set of .quiva files within the given DB are recreated from the DB exactly as they
+The set of .quiva files within the given Q-DB are recreated from the DB exactly as they
were input. That is, this is a perfect inversion, including the reconstitution of the
proper .quiva headers. Because of this property, one can, if desired, delete the
.quiva source files once they are in the DB as they can always be recreated from it.
-Entries imported from the standard input will be place in the faux file name given on
+Entries imported from the standard input will be placed in the faux file name given on
import, or to the standard output if no name was given.
By .fastq convention each QV vector is output as a line without new-lines, and by
default the Deletion Tag entry is in lower case letters. The -U option specifies
upper case letters should be used instead.
```
-5. fasta2DAM [-v] <path:dam> ( -f<file> | -i[<name>] | <input:fasta> ... )
+5. arrow2DB [-v] <path:db> ( -f<file> | -i | <input:arrow> ... )
+```
+
+Adds .arrow streams to an existing DB "path". The DB must either be an S-DB or an
+A-DB and upon completion the DB is an A-DB. The data comes from (a) the given .arrow
+files on the command line, or (b) those in the file specified by the -f option, or
+(c) the standard input if the -i option is given. The input files can be added
+incrementally but must be added in the
+same order as the .fasta files were and have the same root names, e.g. FOO.fasta and
+FOO.arrow. This is enforced by the program.
+
+```
+6. DB2arrow [-v] [-w<int(80)>] <path:db>
+```
+
+The set of .arrow files within the given A-DB are recreated from the DB exactly as they
+were input. That is, this is a perfect inversion, including the reconstitution of the
+proper .arrow headers. Because of this property, one can, if desired, delete the
+.arrow source files once they are in the DB as they can always be recreated from it.
+Entries imported from the standard input will be placed in the faux file name given on
+import, or to the standard output if no name was given.
+By default the output sequences are formatted 80 chars per line,
+but the characters per line, or line width, can be
+set to any positive value with the -w option.
+
+```
+7. fasta2DAM [-v] <path:dam> ( -f<file> | -i[<name>] | <input:fasta> ... )
```
Builds an initial map DB or DAM, or adds to an existing DAM, either (a) the list of
@@ -197,13 +239,13 @@ Builds an initial map DB or DAM, or adds to an existing DAM, either (a) the list
\<file\> if the -f option is used, or (c) entries piped from the standard input if the -i
option is used. If a faux file name, \<name\>, follows the -i option then all the input
received is considered to have come from a file by the name of \<name\>.fasta by
-DB2fasta, otherwise it will be sent to the standard output by DB2fasta. Any .fasta
+DAM2fasta, otherwise it will be sent to the standard output by DAM2fasta. Any .fasta
entry that has a run of N's in it will be split into separate "contig" entries and the
interval of the contig in the original entry recorded. The header for each .fasta entry
is saved with the contigs created from it.
```
-6. DAM2fasta [-vU] [-w<int(80)>] <path:dam>
+8. DAM2fasta [-vU] [-w<int(80)>] <path:dam>
```
The set of .fasta files for the given map DB or DAM are recreated from the DAM
@@ -217,7 +259,7 @@ should be used, and the characters per line, or line width, can be set to any po
value with the -w option.
```
-7. DBsplit [-a] [-x<int>] [-s<int(200)>] <path:db|dam>
+9. DBsplit [-af] [-x<int>] [-s<int(200)>] <path:db|dam>
```
Divide the database \<path\>.db or \<path\>.dam conceptually into a series of blocks
@@ -237,7 +279,7 @@ question has previously bin split, i.e. one is not interactively asked if they w
to proceed.
```
-8. DBdust [-b] [-w<int(64)>] [-t<double(2.)>] [-m<int(10)>] <path:db|dam>
+10. DBdust [-b] [-w<int(64)>] [-t<double(2.)>] [-m<int(10)>] <path:db|dam>
```
Runs the symmetric DUST algorithm over the reads in the untrimmed DB \<path\>.db or
@@ -258,7 +300,7 @@ This permits job parallelism in block-sized chunks, and the resulting sequence o
block tracks can then be merged into a track for the entire untrimmed DB with Catrack.
```
-9. Catrack [-v] <path:db|dam> <track:name>
+11. Catrack [-v] <path:db|dam> <track:name>
```
Find all block tracks of the form .\<path\>.#.\<track\>... and merge them into a single
@@ -267,8 +309,8 @@ encode the same kind of track data (this is checked), and the files must exist f
block 1, 2, 3, ... up to the last block number.
```
-10. DBshow [-unqUQ] [-w<int(80)>] [-m<track>]+
- <path:db|dam> [ <reads:FILE> | <reads:range> ... ]
+12. DBshow [-unqaUQA] [-w<int(80)>] [-m<track>]+
+ <path:db|dam> [ <reads:FILE> | <reads:range> ... ]
```
Displays the requested reads in the database \<path\>.db or \<path\>.dam. By default the
@@ -285,10 +327,14 @@ represents the index of the last read in the actively loaded db. For example,
example, 1-$ displays every read in the active db (the default).
By default a .fasta file of the read sequences is displayed. If the -q option is
-set, then the QV streams are also displayed in a non-standard modification of the
-fasta format. If the -n option is set then the DNA sequence is *not* displayed.
-If the -Q option is set then a .quiva file is displayed and in this case the -n
-and -m options mayt not be set (and the -q and -w options have no effect).
+set and the DB is a Q-DB, then the QV streams are also displayed in a non-standard
+modification of the fasta format.
+Similarly, if the -a option is set and the DB is an A-DB, then the pulse width stream is
+also displayed in a non-standard format.
+If the -n option is set then the DNA sequence is *not* displayed.
+If the -Q option is set then a .quiva file of the selected reads is displayed and
+all other options except -u and -U are ignored. If the -A option is set then a .arrow
+file of the selected reads is displayed and all other options except -u and -w are ignored.
If one or more masks are set with the -m option then the track intervals are also
displayed in an additional header line and the bases within an interval are displayed
@@ -297,44 +343,63 @@ sequences are in lower case and 80 chars per line. The -U option specifies uppe
case should be used, and the characters per line, or line width, can be set to any
positive value with the -w option.
-The .fasta or .quiva files that are output can be converted into a DB by fasta2DB
-and quiva2DB (if the -q and -n options are not set and no -m options are set),
-giving one a simple way to make a DB of a subset of the reads for testing purposes.
+The .fasta, .quiva, and .arrow files that are output can be used to build a new DB with
+fasta2DB, quiva2DB, and arrow2DB, giving one a simple way to make a DB of a subset of
+the reads for testing purposes.
```
-12. DBdump [-rhsiqp] [-uU] [-m<track>]+
- <path:db|dam> [ <reads:FILE> | <reads:range> ... ]
+13. DBdump [-rhsaqip] [-uU] [-m<track>]+
+ <path:db|dam> [ <reads:FILE> | <reads:range> ... ]
```
Like DBshow, DBdump allows one to display a subset of the reads in the DB and select
which information to show about them including any mask tracks. The difference is
that the information is written in a very simple "1-code" ASCII format that makes it
-easy for one to read and parse the information for further use. -r requests that each
-read number be displayed (useful if only a subset of reads is requested). -h prints
-the header information which is the source file name, well #, and pulse range.
--s requests the sequence be output, -i requests that the intrinsic quality values be
-output, -q requests that the 5 quiva sequences be output, -p requests the repeat
-profile be output (if available), and -m\<track\> requests that mask \<track\> be output.
+easy for one to read and parse the information for further use. The option flags determine
+which items of information are output as follows:
+
+* -r requests that each read number be displayed in an R-line (see below, useful if only a
+subset of reads is requested).
+
+* -h requests the header information be output as the source file name on an H-line, the
+well # and pulse range on an L-line, and optionally the quality of the read if given on a Q-line.
+
+* -s requests the sequence be output on an S-line.
+
+* -a requests the Arrow information be output as a pulse-width string on an A-line and
+the 4 SNR channel values on an N-line.
+
+* -q requests that the 5 Quiver quality streams be output on d-, c-, i-, m-, and s-lines.
+
+* -i requests that the intrinsic quality values be output on an I-line.
+
+* -p requests the repeat profile be output (if available) on a P-line.
+
+* -m\<track\> requests that mask \<track\> be output on a T-line.
+
Set -u if you want data from the untrimmed database (the default is trimmed) and
set -U if you'd like upper-case letter used in the DNA sequence strings.
-The format is very simple. Each requested piece of information occurs on a line. The
+The format is very simple. A requested unit of information occurs on a line. The
first character of every line is a "1-code" character that tells you what information
-to expect on the line. The rest of the line contains information where each item is
+to expect on the line. The rest of the line contains the information where each item is
separated by a single blank space. Strings are output as first an integer giving the
length of the string, a blank space, and then the string terminated by a new-line.
Intrinsic quality values are between 0 and 50, inclusive, and a vector of said are
displayed as an alphabetic string where 'a' is 0, 'b' is '1', ... 'z' is 25, 'A' is
26, 'B' is 27, ... and 'Y' is 50. Repeat profiles are also displayed as string where
'_' denotes 0 repetitions, and then 'a' through 'N' denote the values 1 through 40,
-respectively.
+respectively. The set of all possible lines is as follows:
```
R # - read number
H # string - original file name string (header)
L # # # - location: well, pulse start, pulse end
+ Q # - quality of read (#/1000)
+ N # # # # - SNR of ACGT channels (#/100)
Tx #n (#b #e)^#n - x'th track on command line, #n intervals all on same line
S # string - sequence string
+ A # string - arrow pulse-width string
I # string - intrinsic quality vector (as an ASCII string)
P # string - repeat profile vector (as an ASCII string)
d # string - Quiva deletion values (as an ASCII string)
@@ -352,12 +417,12 @@ the number of reads (X=R), the number of masks (X=M), or the total number of
characters in all headers (X=H), sequences (X=S), intrinsic quality vectors (X=I),
read profile vector (X=P), or track (X=T#). And '@ X #' gives the maximum number of
characters in any header (X=H), sequence (X=S), intrincic quality vector (X=I), read
-profile vector (X=P), or track (X=T#). The size numbers for the Quiva strings are
-identical to that for the sequence as they are all of the same length for any
-given entry.
+profile vector (X=P), or track (X=T#). The size numbers for the Quiva strings and
+Arrow pulse width strings are identical to that for the sequence as they are all of
+the same length for any given entry.
```
-12. DBstats [-nu] [-b<int(1000)] [-m<track>]+ <path:db|dam>
+14. DBstats [-nu] [-b<int(1000)>] [-m<track>]+ <path:db|dam>
```
Show overview statistics for all the reads in the trimmed data base \<path\>.db or
@@ -369,7 +434,7 @@ intervals along the read can be specified with the -m option in which case a sum
and a histogram of the interval lengths is displayed.
```
-13. DBrm <path:db|dam> ...
+15. DBrm <path:db|dam> ...
```
Delete all the files for the given data bases. Do not use rm to remove a database, as
@@ -377,7 +442,15 @@ there are at least two and often several secondary files for each DB including t
files, and all of these are removed by DBrm.
```
-14. simulator <genome:dam> [-CU] [-m<int(10000)>] [-s<int(2000)>] [-e<double(.15)]
+16. DBwipe <path:db|dam> ...
+```
+
+Delete any Arrow or Quiver data from the given databases. This removes the .arw or
+.qvs file and resets the information in the .idx file pertaining to Arrow or Quiver.
+Basically, this converts an A-DB or Q-DB back to a simple S-DB.
+
+```
+17. simulator <genome:dam> [-CU] [-m<int(10000)>] [-s<int(2000)>] [-e<double(.15)>]
[-c<double(50.)>] [-f<double(.5)>] [-x<int(4000)>]
[-w<int(80)>] [-r<int>] [-M<file>]
```
@@ -412,7 +485,7 @@ a read is say 's b e' then if b \< e the read is a perturbed copy of s[b,e] in t
forward direction, and a perturbed copy s[e,b] in the reverse direction otherwise.
```
-15. rangen <genlen:double> [-U] [-b<double(.5)>] [-w<int(80)>] [-r<int>]
+18. rangen <genlen:double> [-U] [-b<double(.5)>] [-w<int(80)>] [-r<int>]
```
Generate a random DNA sequence of length genlen*1Mbp that has an AT-bias of -b.
diff --git a/quiva2DB.c b/arrow2DB.c
similarity index 58%
copy from quiva2DB.c
copy to arrow2DB.c
index 6d099f5..19975aa 100644
--- a/quiva2DB.c
+++ b/arrow2DB.c
@@ -1,8 +1,8 @@
/*******************************************************************************************
*
- * Adds the given .quiva files to an existing DB "path". The input files must be added in
+ * Adds the given .arrow files to an existing DB "path". The input files must be added in
* the same order as the .fasta files were and have the same root names, e.g. FOO.fasta
- * and FOO.quiva. The files can be added incrementally but must be added in the same order
+ * and FOO.arrow. The files can be added incrementally but must be added in the same order
* as the .fasta files. This is enforced by the program. With the -l option set the
* compression scheme is a bit lossy to get more compression (see the description of dexqv
* in the DEXTRACTOR module).
@@ -31,7 +31,7 @@
#define PATHSEP "/"
#endif
-static char *Usage = "[-vl] <path:string> ( -f<file> | -i | <input:quiva> ... )";
+static char *Usage = "[-v] <path:db> ( -f<file> | -i | <input:arrow> ... )";
typedef struct
{ int argc;
@@ -95,18 +95,14 @@ int main(int argc, char *argv[])
{ FILE *istub;
char *root, *pwd;
- FILE *quiva, *indx;
- int64 coff;
+ FILE *arrow, *indx;
+ int64 boff;
HITS_DB db;
HITS_READ *reads;
int nfiles;
- FILE *temp;
- char *tname;
-
int VERBOSE;
- int LOSSY;
int PIPE;
FILE *INFILE;
@@ -115,7 +111,7 @@ int main(int argc, char *argv[])
{ int i, j, k;
int flags[128];
- ARG_INIT("quiva2DB")
+ ARG_INIT("arrow2DB")
INFILE = NULL;
@@ -139,7 +135,6 @@ int main(int argc, char *argv[])
argc = j;
VERBOSE = flags['v'];
- LOSSY = flags['l'];
PIPE = flags['i'];
if (INFILE != NULL && PIPE)
@@ -154,78 +149,77 @@ int main(int argc, char *argv[])
}
}
- // Open DB stub file, index, and .qvs file for appending. Load db and read records,
- // get number of cells from stub file, and note current offset to end of .qvs
+ // Open DB stub file, index, and .arw file for appending. Load db and read records,
+ // get number of cells from stub file, and note current offset to end of .arw
root = Root(argv[1],".db");
pwd = PathTo(argv[1]);
istub = Fopen(Catenate(pwd,"/",root,".db"),"r");
if (istub == NULL)
- { fprintf(stderr,"%s",Ebuffer);
- exit (1);
- }
+ exit (1);
if (fscanf(istub,DB_NFILE,&nfiles) != 1)
- { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",root,Prog_Name);
+ { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root);
exit (1);
}
indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+");
if (indx == NULL)
- { fprintf(stderr,"%s",Ebuffer);
- exit (1);
- }
+ exit (1);
if (fread(&db,sizeof(HITS_DB),1,indx) != 1)
- { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",root,Prog_Name);
+ { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",Prog_Name,root);
exit (1);
}
reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*db.ureads,"Allocating DB index");
if (reads == NULL)
- { fprintf(stderr,"%s",Ebuffer);
+ exit (1);
+ if (fread(reads,sizeof(HITS_READ),db.ureads,indx) != (size_t) (db.ureads))
+ { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",Prog_Name,root);
exit (1);
}
- if (fread(reads,sizeof(HITS_READ),db.ureads,indx) != (size_t) (db.ureads))
- { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",root,Prog_Name);
+
+ if (reads[0].coff >= 0 && (db.allarr & DB_ARROW) == 0)
+ { fprintf(stderr,"%s: Database %s has Quiver data!\n",Prog_Name,root);
exit (1);
}
- quiva = NULL;
- temp = NULL;
- coff = 0;
+ arrow = NULL;
+ boff = 0;
if (reads[0].coff < 0)
- quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"w");
+ arrow = Fopen(Catenate(pwd,PATHSEP,root,".arw"),"w");
else
- quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"r+");
+ arrow = Fopen(Catenate(pwd,PATHSEP,root,".arw"),"r+");
- tname = Strdup(Catenate(".",PATHSEP,root,Numbered_Suffix("",getpid(),".tmp")),
- "Allocating temporary name");
- temp = Fopen(tname,"w+");
+ if (arrow == NULL)
+ goto error;
+ fseeko(arrow,0,SEEK_END);
+ boff = ftello(arrow);
- if (quiva == NULL || temp == NULL)
- { fprintf(stderr,"%s",Ebuffer);
- goto error;
- }
- fseeko(quiva,0,SEEK_END);
- coff = ftello(quiva);
-
- // Do a merged traversal of cell lines in .db stub file and .quiva files to be
+ // Do a merged traversal of cell lines in .db stub file and .arrow files to be
// imported, driving the loop with the cell line #
{ FILE *input = NULL;
char *path = NULL;
char *core = NULL;
+ char *read;
+ int rmax, rlen, eof;
File_Iterator *ng = NULL;
char lname[MAX_NAME];
int first, last, cline;
int cell;
+ // Buffer for accumulating .arrow sequence over multiple lines
+
+ rmax = MAX_NAME + 60000;
+ read = (char *) Malloc(rmax+1,"Allocating line buffer");
+ if (read == NULL)
+ goto error;
+
if (!PIPE)
{ ng = init_file_iterator(argc,argv,INFILE,2);
if (ng == NULL)
- { fprintf(stderr,"%s",Ebuffer);
- goto error;
- }
+ goto error;
}
for (cell = 0; cell < nfiles; cell++)
@@ -233,7 +227,7 @@ int main(int argc, char *argv[])
if (cell == 0)
- // First addition, a pipe: find the first cell that does not have .quiva's yet
+ // First addition, a pipe: find the first cell that does not have .arrow's yet
// (error if none) and set input source to stdin.
if (PIPE)
@@ -249,23 +243,23 @@ int main(int argc, char *argv[])
cell += 1;
}
if (cell >= nfiles)
- { fprintf(stderr,"%s: All .quiva's have already been added !?\n",Prog_Name);
+ { fprintf(stderr,"%s: All .arrows's have already been added !?\n",Prog_Name);
goto error;
}
input = stdin;
if (VERBOSE)
- { fprintf(stderr,"Adding quiva's from stdin ...\n");
+ { fprintf(stderr,"Adding arrows's from stdin ...\n");
fflush(stderr);
}
cline = 0;
}
- // First addition, not a pipe: then get first .quiva file name (error if not one) to
+ // First addition, not a pipe: then get first .arrow file name (error if not one) to
// add, find the first cell name whose file name matches (error if none), check that
- // the previous .quiva's have been added and this is the next slot. Then open
- // the .quiva file for compression
+ // the previous .arrow's have been added and this is the next slot. Then open
+ // the .arrow file for compression
else
{ if (! next_file(ng))
@@ -273,16 +267,12 @@ int main(int argc, char *argv[])
goto error;
}
if (ng->name == NULL)
- { fprintf(stderr,"%s",Ebuffer);
- goto error;
- }
+ goto error;
- core = Root(ng->name,".quiva");
+ core = Root(ng->name,".arrow");
path = PathTo(ng->name);
- if ((input = Fopen(Catenate(path,"/",core,".quiva"),"r")) == NULL)
- { fprintf(stderr,"%s",Ebuffer);
- goto error;
- }
+ if ((input = Fopen(Catenate(path,"/",core,".arrow"),"r")) == NULL)
+ goto error;
first = 0;
while (cell < nfiles)
@@ -301,25 +291,25 @@ int main(int argc, char *argv[])
}
if (first > 0 && reads[first-1].coff < 0)
- { fprintf(stderr,"%s: Predecessor of %s.quiva has not been added yet\n",
+ { fprintf(stderr,"%s: Predecessor of %s.arrow has not been added yet\n",
Prog_Name,core);
goto error;
}
if (reads[first].coff >= 0)
- { fprintf(stderr,"%s: %s.quiva has already been added\n",Prog_Name,core);
+ { fprintf(stderr,"%s: %s.arrow has already been added\n",Prog_Name,core);
goto error;
}
if (VERBOSE)
- { fprintf(stderr,"Adding '%s.quiva' ...\n",core);
+ { fprintf(stderr,"Adding '%s.arrow' ...\n",core);
fflush(stderr);
}
cline = 0;
}
// Not the first addition: get next cell line. If not a pipe and the file name is new,
- // then close the current .quiva, open the next one and after ensuring the names
- // match, open it for compression
+ // then close the current .arrow, open the next one and after ensuring the names
+ // match, open it for incorporation
else
{ first = last;
@@ -335,8 +325,8 @@ int main(int argc, char *argv[])
ungetc(c,input);
}
else if (strcmp(lname,fname) != 0)
- { if (fgetc(input) != EOF)
- { fprintf(stderr,"%s: Too many reads in %s.quiva while handling %s.fasta\n",
+ { if ( ! eof)
+ { fprintf(stderr,"%s: Too many reads in %s.arrow while handling %s.fasta\n",
Prog_Name,core,fname);
goto error;
}
@@ -348,16 +338,12 @@ int main(int argc, char *argv[])
if ( ! next_file(ng))
break;
if (ng->name == NULL)
- { fprintf(stderr,"%s",Ebuffer);
- goto error;
- }
+ goto error;
path = PathTo(ng->name);
- core = Root(ng->name,".quiva");
- if ((input = Fopen(Catenate(path,"/",core,".quiva"),"r")) == NULL)
- { fprintf(stderr,"%s",Ebuffer);
- goto error;
- }
+ core = Root(ng->name,".arrow");
+ if ((input = Fopen(Catenate(path,"/",core,".arrow"),"r")) == NULL)
+ goto error;
if (strcmp(core,fname) != 0)
{ fprintf(stderr,"%s: Files not being added in order (expect %s, given %s)\n",
@@ -366,92 +352,128 @@ int main(int argc, char *argv[])
}
if (VERBOSE)
- { fprintf(stderr,"Adding '%s.quiva' ...\n",core);
+ { fprintf(stderr,"Adding '%s.arrow' ...\n",core);
fflush(stderr);
}
cline = 0;
}
}
- // Compress reads [first..last) from open .quiva appending to .qvs and record
- // offset in .coff field of reads (offset of first in a cell is to the compression
- // table).
+ // If first cell or source is a new file, then start IO
- { int64 qpos;
- QVcoding *coding;
- int i, s;
+ if (cline == 0)
+ {
+ // Read in first line and make sure it is a header in PACBIO format.
- rewind(temp);
- if (ftruncate(fileno(temp),0) < 0)
- { fprintf(stderr,"%s: System error: could not truncate temporary file\n",Prog_Name);
- goto error;
- }
- Set_QV_Line(cline);
- s = QVcoding_Scan(input,last-first,temp);
- if (s < 0)
- { fprintf(stderr,"%s",Ebuffer);
- goto error;
- }
- if (s != last-first)
- { if (PIPE)
- fprintf(stderr,"%s: Insufficient # of reads on input while handling %s.fasta\n",
- Prog_Name,fname);
- else
- fprintf(stderr,"%s: Insufficient # of reads in %s.quiva while handling %s.fasta\n",
- Prog_Name,core,fname);
- goto error;
- }
-
- coding = Create_QVcoding(LOSSY);
- if (coding == NULL)
- { fprintf(stderr,"%s",Ebuffer);
- goto error;
- }
+ rlen = 0;
+ eof = (fgets(read,MAX_NAME,input) == NULL);
+        if (!eof && read[strlen(read)-1] != '\n')
+ { fprintf(stderr,"File %s.arrow, Line 1: Fasta line is too long (> %d chars)\n",
+ core,MAX_NAME-2);
+ goto error;
+ }
+ if (!eof && read[0] != '>')
+ { fprintf(stderr,"File %s.arrow, Line 1: First header in arrow file is missing\n",
+ core);
+ goto error;
+ }
+ }
- coding->prefix = Strdup(".qvs","Allocating header prefix");
- if (coding->prefix == NULL)
- { fprintf(stderr,"%s",Ebuffer);
- goto error;
- }
+ // Compress reads [first..last) from open .arrow appending to .arw and record
+ // snr in .coff field of reads (offset is the same as for the DNA sequence, .boff)
- qpos = ftello(quiva);
- Write_QVcoding(quiva,coding);
+ { int i, x;
- // Then compress and append to the .qvs each compressed QV entry
-
- rewind(temp);
- Set_QV_Line(cline);
for (i = first; i < last; i++)
- { s = Read_Lines(temp,1);
- if (s < -1)
- { fprintf(stderr,"%s",Ebuffer);
+ { char *find;
+ int clen;
+ float snr[4];
+ uint16 cnr[4];
+
+ if (eof)
+ { if (PIPE)
+                fprintf(stderr,"%s: Insufficient # of reads on input while handling %s.fasta\n",
+ Prog_Name,fname);
+ else
+ { fprintf(stderr,"%s: Insufficient # of reads in %s.arrow while handling",
+ Prog_Name,core);
+                  fprintf(stderr," %s.fasta\n",fname);
+ }
+ goto error;
+ }
+
+ find = index(read+(rlen+1),' ');
+ if (find == NULL)
+ { fprintf(stderr,"File %s.arrow, Line %d: Pacbio header line format error\n",
+ core,cline);
goto error;
}
- reads[i].coff = qpos;
- s = Compress_Next_QVentry(temp,quiva,coding,LOSSY);
- if (s < 0)
- { fprintf(stderr,"%s",Ebuffer);
+ *find = '\0';
+ if (strcmp(read+(rlen+1),prolog) != 0)
+ { fprintf(stderr,"File %s.arrow, Line %d: Pacbio prolog doesn't match DB entry\n",
+ core,cline);
goto error;
}
- if (s != reads[i].rlen)
- { fprintf(stderr,"%s: Length of quiva %d is different than fasta in DB\n",
- Prog_Name,i+1);
+ *find = ' ';
+ x = sscanf(find+1," SN=%f,%f,%f,%f\n",snr,snr+1,snr+2,snr+3);
+ if (x != 4)
+ { fprintf(stderr,"File %s.arrow, Line %d: Pacbio header line format error\n",
+ core,cline);
goto error;
}
- qpos = ftello(quiva);
- }
- cline = Get_QV_Line();
- Free_QVcoding(coding);
+ rlen = 0;
+ while (1)
+ { eof = (fgets(read+rlen,MAX_NAME,input) == NULL);
+ cline += 1;
+ x = strlen(read+rlen)-1;
+ if (read[rlen+x] != '\n')
+ { if (read[rlen] == '>')
+ { fprintf(stderr,"File %s.arrow, Line %d:",core,cline);
+ fprintf(stderr," Fasta header line is too long (> %d chars)\n",
+ MAX_NAME-2);
+ goto error;
+ }
+ else
+ x += 1;
+ }
+ if (eof || read[rlen] == '>')
+ break;
+ rlen += x;
+ if (rlen + MAX_NAME > rmax)
+ { rmax = ((int) (1.2 * rmax)) + 1000 + MAX_NAME;
+ read = (char *) realloc(read,rmax+1);
+ if (read == NULL)
+ { fprintf(stderr,"File %s.arrow, Line %d:",core,cline);
+ fprintf(stderr," Out of memory (Allocating line buffer)\n");
+ goto error;
+ }
+ }
+ }
+ read[rlen] = '\0';
+
+ for (x = 0; x < 4; x++)
+          cnr[x] = (uint16) (snr[x] * 100.);
+
+ *((uint64 *) &(reads[i].coff)) = ((uint64) cnr[0]) << 48 |
+ ((uint64) cnr[1]) << 32 |
+ ((uint64) cnr[2]) << 16 |
+ ((uint64) cnr[3]);
+
+ Number_Arrow(read);
+ Compress_Read(rlen,read);
+ clen = COMPRESSED_LEN(rlen);
+ fwrite(read,1,clen,arrow);
+ }
}
}
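
Note on the hunk above: the four SN= values from each Pacbio header are scaled
to hundredths, stored as 16-bit counts, and packed into the otherwise unused
64-bit .coff field (the sequence offset itself is shared with .boff).  A
minimal sketch of that encoding and its inverse; pack_snr/unpack_snr are
illustrative names, not DAZZ_DB routines:

#include <stdint.h>

/* Sketch only: pack four SNR values, scaled x100 as above, into one 64-bit
   word (channel 0 in the high 16 bits) and recover them again.             */

static uint64_t pack_snr(const float snr[4])
{ uint64_t w;
  int      i;

  w = 0;
  for (i = 0; i < 4; i++)
    { uint16_t c = (uint16_t) (snr[i] * 100.);      /* e.g. 7.35 -> 735 */
      w |= ((uint64_t) c) << (16*(3-i));            /* snr[0] -> bits 48-63 */
    }
  return (w);
}

static void unpack_snr(uint64_t w, float snr[4])
{ int i;

  for (i = 0; i < 4; i++)
    snr[i] = ((w >> (16*(3-i))) & 0xffff) / 100.0f;
}
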
- if (fgetc(input) != EOF)
+ if (!eof)
{ if (PIPE)
fprintf(stderr,"%s: Too many reads on input while handling %s.fasta\n",
Prog_Name,lname);
else
- fprintf(stderr,"%s: Too many reads in %s.quiva while handling %s.fasta\n",
+ fprintf(stderr,"%s: Too many reads in %s.arrow while handling %s.fasta\n",
Prog_Name,core,lname);
goto error;
}
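
The Compress_Read / COMPRESSED_LEN calls earlier in this hunk store the
numeric arrow string in the same 2-bit-per-symbol form used for the DNA in
.bps, which is why a read's .boff offset locates its entry in .arw as well.
The sketch below shows the general shape of that packing, four codes per
byte; the real routines are defined in DB.c/DB.h, so treat this only as an
illustration:

/* Sketch only: in-place 2-bit packing of a numeric string (codes 0-3), four
   codes per byte with the first code in the high bits.                     */

static int compressed_len(int len)
{ return ((len+3) >> 2); }

static void pack_2bit(int len, char *s)
{ int i, j, k, byte;

  for (i = j = 0; i < len; i += 4, j += 1)
    { byte = 0;
      for (k = 0; k < 4 && i+k < len; k++)
        byte |= (s[i+k] & 0x3) << (6 - 2*k);        /* first code high */
      s[j] = (char) byte;
    }
}
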
@@ -461,10 +483,8 @@ int main(int argc, char *argv[])
free(path);
if (next_file(ng))
{ if (ng->name == NULL)
- { fprintf(stderr,"%s",Ebuffer);
- goto error;
- }
- core = Root(ng->name,".quiva");
+ goto error;
+ core = Root(ng->name,".arrow");
fprintf(stderr,"%s: %s.fasta has never been added to DB\n",Prog_Name,core);
goto error;
}
@@ -473,35 +493,30 @@ int main(int argc, char *argv[])
// Write the db record and read index into .idx and clean up
+ db.allarr |= DB_ARROW;
rewind(indx);
fwrite(&db,sizeof(HITS_DB),1,indx);
fwrite(reads,sizeof(HITS_READ),db.ureads,indx);
fclose(istub);
fclose(indx);
- fclose(quiva);
- fclose(temp);
- unlink(tname);
+ fclose(arrow);
exit (0);
- // Error exit: Either truncate or remove the .qvs file as appropriate.
+ // Error exit: Either truncate or remove the .arw file as appropriate.
error:
- if (coff != 0)
- { fseeko(quiva,0,SEEK_SET);
- if (ftruncate(fileno(quiva),coff) < 0)
- fprintf(stderr,"%s: Fatal: could not restore %s.qvs after error, truncate failed\n",
+ if (boff != 0)
+ { fseeko(arrow,0,SEEK_SET);
+ if (ftruncate(fileno(arrow),boff) < 0)
+ fprintf(stderr,"%s: Fatal: could not restore %s.arw after error, truncate failed\n",
Prog_Name,root);
}
- if (quiva != NULL)
- { fclose(quiva);
- if (coff == 0)
- unlink(Catenate(pwd,PATHSEP,root,".qvs"));
- }
- if (temp != NULL)
- { fclose(temp);
- unlink(tname);
+ if (arrow != NULL)
+ { fclose(arrow);
+ if (boff == 0)
+ unlink(Catenate(pwd,PATHSEP,root,".arw"));
}
fclose(istub);
fclose(indx);
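
The error path above either truncates .arw back to its pre-run length (boff)
or unlinks it if nothing had been appended, so a failed run leaves the
database untouched.  A self-contained sketch of that append-then-rollback
pattern; append_with_rollback is a hypothetical helper, not part of arrow2DB:

#include <stdio.h>
#include <unistd.h>

/* Sketch only: append to a data file, remembering its starting length so a
   failure can truncate the file back, or remove it entirely if it was empty
   to begin with -- the same discipline the error exit applies with boff.   */

int append_with_rollback(const char *path, const void *data, size_t len)
{ FILE *f;
  off_t start;

  f = fopen(path,"a+");
  if (f == NULL)
    return (-1);
  fseeko(f,0,SEEK_END);
  start = ftello(f);                       /* checkpoint, like boff */

  if (fwrite(data,1,len,f) != len)
    { if (start == 0)
        { fclose(f);
          unlink(path);                    /* nothing was there before */
        }
      else
        { if (ftruncate(fileno(f),start) < 0)
            fprintf(stderr,"could not roll back %s\n",path);
          fclose(f);
        }
      return (-1);
    }
  fclose(f);
  return (0);
}
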
diff --git a/fasta2DAM.c b/fasta2DAM.c
index 15e074a..af6722a 100644
--- a/fasta2DAM.c
+++ b/fasta2DAM.c
@@ -527,6 +527,7 @@ int main(int argc, char *argv[])
db.totlen = totlen;
db.maxlen = maxlen;
db.cutoff = -1;
+ db.allarr = 0;
}
else
{ for (c = 0; c < 4; c++)
diff --git a/fasta2DB.c b/fasta2DB.c
index 071bbe0..98298e3 100644
--- a/fasta2DB.c
+++ b/fasta2DB.c
@@ -35,7 +35,7 @@
#define PATHSEP "/"
#endif
-static char *Usage = "[-v] <path:string> ( -f<file> | -i[<name>] | <input:fasta> ... )";
+static char *Usage = "[-v] <path:db> ( -f<file> | -i[<name>] | <input:fasta> ... )";
static char number[128] =
{ 0, 0, 0, 0, 0, 0, 0, 0,
@@ -252,23 +252,21 @@ int main(int argc, char *argv[])
ureads = 0;
offset = 0;
- boff = 0;
- ioff = 0;
}
else
{ if (fscanf(istub,DB_NFILE,&ocells) != 1)
{ fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root);
- goto error;
+ exit (1);
}
bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"r+");
indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+");
if (bases == NULL || indx == NULL)
- goto error;
+ exit (1);
if (fread(&db,sizeof(HITS_DB),1,indx) != 1)
{ fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",Prog_Name,root);
- goto error;
+ exit (1);
}
fseeko(bases,0,SEEK_END);
fseeko(indx, 0,SEEK_END);
@@ -340,8 +338,7 @@ int main(int argc, char *argv[])
{ FILE *input;
char prolog[MAX_NAME];
char *path, *core;
- int nline, eof, rlen, pcnt;
- int pwell;
+ int eof;
// Open it: <path>/<core>.fasta if file, stdin otherwise with core = PIPE or "stdout"
@@ -367,6 +364,22 @@ int main(int argc, char *argv[])
input = stdin;
}
+ // Get the header of the first line. If the file is empty skip.
+
+ eof = (fgets(read,MAX_NAME,input) == NULL);
+ if (eof || strlen(read) < 1)
+        { fclose(input);
+          if (PIPE != NULL)
+            { fprintf(stderr,"Standard input is empty, terminating!\n");
+              free(core);
+              break;
+            }
+          else
+            { fprintf(stderr,"Skipping '%s', file is empty!\n",core);
+              free(core);
+              continue;
+            }
+        }
+
// Check that core is not too long and name is unique or last source if PIPE'd
if (strlen(core) >= MAX_NAME)
@@ -388,21 +401,6 @@ int main(int argc, char *argv[])
}
}
- // Get the header of the first line. If the file is empty skip.
-
- pcnt = 0;
- rlen = 0;
- nline = 1;
- eof = (fgets(read,MAX_NAME,input) == NULL);
- if (eof || strlen(read) < 1)
- { fprintf(stderr,"Skipping '%s', file is empty!\n",core);
- if (input == stdin)
- break;
- fclose(input);
- free(core);
- continue;
- }
-
// Add the file name to flist
if (VERBOSE)
@@ -436,8 +434,7 @@ int main(int argc, char *argv[])
*find = '/';
}
else
- { fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line format error\n",
- core,nline);
+ { fprintf(stderr,"File %s.fasta, Line 1: Pacbio header line format error\n",core);
goto error;
}
}
@@ -445,7 +442,11 @@ int main(int argc, char *argv[])
// Read in all the sequences until end-of-file
{ int i, x;
+ int nline, pwell, rlen, pcnt;
+ pcnt = 0;
+ rlen = 0;
+ nline = 1;
pwell = -1;
while (!eof)
{ int beg, end, clen;
@@ -583,6 +584,7 @@ int main(int argc, char *argv[])
db.totlen = totlen;
db.maxlen = maxlen;
db.cutoff = -1;
+ db.allarr = 0;
}
else
{ for (c = 0; c < 4; c++)
diff --git a/quiva2DB.c b/quiva2DB.c
index 6d099f5..55e54d6 100644
--- a/quiva2DB.c
+++ b/quiva2DB.c
@@ -31,7 +31,7 @@
#define PATHSEP "/"
#endif
-static char *Usage = "[-vl] <path:string> ( -f<file> | -i | <input:quiva> ... )";
+static char *Usage = "[-vl] <path:db> ( -f<file> | -i | <input:quiva> ... )";
typedef struct
{ int argc;
@@ -165,7 +165,7 @@ int main(int argc, char *argv[])
exit (1);
}
if (fscanf(istub,DB_NFILE,&nfiles) != 1)
- { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",root,Prog_Name);
+ { fprintf(stderr,"%s: %s.db is corrupted, read failed\n",Prog_Name,root);
exit (1);
}
@@ -175,7 +175,11 @@ int main(int argc, char *argv[])
exit (1);
}
if (fread(&db,sizeof(HITS_DB),1,indx) != 1)
- { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",root,Prog_Name);
+ { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",Prog_Name,root);
+ exit (1);
+ }
+ if ((db.allarr & DB_ARROW) != 0)
+ { fprintf(stderr,"%s: Database %s has Arrow data!\n",Prog_Name,root);
exit (1);
}
@@ -185,7 +189,7 @@ int main(int argc, char *argv[])
exit (1);
}
if (fread(reads,sizeof(HITS_READ),db.ureads,indx) != (size_t) (db.ureads))
- { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",root,Prog_Name);
+ { fprintf(stderr,"%s: %s.idx is corrupted, read failed\n",Prog_Name,root);
exit (1);
}
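
Together with the db.allarr |= DB_ARROW line written by arrow2DB above, this
new check keeps Quiver (.qvs) and Arrow (.arw) data mutually exclusive within
one database.  A toy sketch of the flag discipline; TOY_DB_ARROW is a
placeholder standing in for the real DB_ALL/DB_ARROW bits defined in DB.h:

#include <stdio.h>

#define TOY_DB_ARROW 0x2   /* placeholder value, not the DB.h constant */

typedef struct { int allarr; } ToyDB;

int can_add_quiva(ToyDB *db)
{ if ((db->allarr & TOY_DB_ARROW) != 0)
    { fprintf(stderr,"Database already holds Arrow data!\n");
      return (0);
    }
  return (1);
}

void mark_arrow_added(ToyDB *db)
{ db->allarr |= TOY_DB_ARROW; }   /* what arrow2DB does before rewriting .idx */
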
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/dazzdb.git