[med-svn] [dazzdb] 01/06: New upstream version 1.0+20171014
Afif Elghraoui
afif at moszumanska.debian.org
Fri Oct 20 04:02:32 UTC 2017
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository dazzdb.
commit 37db065aaaa30550bcfd55019929f682e22a4280
Author: Afif Elghraoui <afif at debian.org>
Date: Thu Oct 19 23:08:56 2017 -0400
New upstream version 1.0+20171014
---
Catrack.c | 10 ++--
DB.c | 3 +-
DBdump.c | 19 +++++++
DBshow.c | 2 +
DBtrim.c | 185 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Makefile | 5 +-
README.md | 29 ++++++----
arrow2DB.c | 1 +
simulator.c | 20 ++++++-
9 files changed, 257 insertions(+), 17 deletions(-)
diff --git a/Catrack.c b/Catrack.c
index 14d9b64..8ea6c38 100644
--- a/Catrack.c
+++ b/Catrack.c
@@ -21,12 +21,13 @@
#define PATHSEP "/"
#endif
-static char *Usage = "[-v] <path:db|dam> <track:name>";
+static char *Usage = "[-vf] <path:db|dam> <track:name>";
int main(int argc, char *argv[])
{ char *prefix;
FILE *aout, *dout;
int VERBOSE;
+ int FORCE;
// Process arguments
@@ -38,12 +39,13 @@ int main(int argc, char *argv[])
j = 1;
for (i = 1; i < argc; i++)
if (argv[i][0] == '-')
- { ARG_FLAGS("v") }
+ { ARG_FLAGS("vf") }
else
argv[j++] = argv[i];
argc = j;
VERBOSE = flags['v'];
+ FORCE = flags['f'];
if (argc != 3)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
@@ -65,14 +67,14 @@ int main(int argc, char *argv[])
free(root);
aout = fopen(Catenate(prefix,argv[2],".","anno"),"r");
- if (aout != NULL)
+ if (aout != NULL && !FORCE)
{ fprintf(stderr,"%s: Track file %s%s.anno already exists!\n",Prog_Name,prefix,argv[2]);
fclose(aout);
exit (1);
}
dout = fopen(Catenate(prefix,argv[2],".","data"),"r");
- if (dout != NULL)
+ if (dout != NULL && !FORCE)
{ fprintf(stderr,"%s: Track file %s%s.data already exists!\n",Prog_Name,prefix,argv[2]);
fclose(dout);
exit (1);
diff --git a/DB.c b/DB.c
index 3afff14..69060d0 100644
--- a/DB.c
+++ b/DB.c
@@ -1486,7 +1486,8 @@ int Load_Arrow(HITS_DB *db, int i, char *read, int ascii)
EXIT(1);
}
if (Arrow_DB != db)
- { fclose(Arrow_File);
+ { if (Arrow_File != NULL)
+ fclose(Arrow_File);
arrow = Fopen(Catenate(db->path,"","",".arw"),"r");
if (arrow == NULL)
EXIT(1);
diff --git a/DBdump.c b/DBdump.c
index 6227251..faef253 100644
--- a/DBdump.c
+++ b/DBdump.c
@@ -165,6 +165,25 @@ int main(int argc, char *argv[])
if (argc <= 1)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]);
+ fprintf(stderr,"\n");
+ fprintf(stderr," -r: R # - read number\n");
+ fprintf(stderr," -h: H # string - original file name string (header)\n");
+ fprintf(stderr," L # # # - location: well, pulse start, pulse end\n");
+ fprintf(stderr," Q # - quality of read (#/1000)\n");
+ fprintf(stderr," -s: S # string - sequence string\n");
+ fprintf(stderr," -a: N # # # # - SNR of ACGT channels (#/100)\n");
+ fprintf(stderr," A # string - arrow pulse-width string\n");
+ fprintf(stderr," -i: I # string - intrinsic quality vector (as an ASCII string)\n");
+ fprintf(stderr," -q: d # string - Quiva deletion values (as an ASCII string)\n");
+ fprintf(stderr," c # string - Quiva deletion character string\n");
+ fprintf(stderr," i # string - Quiva insertion value string\n");
+ fprintf(stderr," m # string - Quiva merge value string\n");
+ fprintf(stderr," s # string - Quiva substitution value string\n");
+ fprintf(stderr," -p: P # string - repeat profile vector (as an ASCII string)\n");
+ fprintf(stderr," -m: Tx #n (#b #e)^#n - x'th track on command line, #n intervals all on same line\n");
+ fprintf(stderr,"\n");
+ fprintf(stderr," -u: Dump entire untrimmed database.\n");
+ fprintf(stderr," -U: Output base pairs in upper case letters\n");
exit (1);
}
diff --git a/DBshow.c b/DBshow.c
index c376ed4..5114665 100644
--- a/DBshow.c
+++ b/DBshow.c
@@ -435,6 +435,8 @@ int main(int argc, char *argv[])
}
if (DOARR)
arrow = New_Read_Buffer(db);
+ else
+ arrow = NULL;
if (UPPER == 1)
{ hilight = 'A'-'a';
diff --git a/DBtrim.c b/DBtrim.c
new file mode 100644
index 0000000..778123d
--- /dev/null
+++ b/DBtrim.c
@@ -0,0 +1,185 @@
+/*******************************************************************************************
+ *
+ * Reset the trimming parameters for a .db:
+ * Rewrite the .db or .dam file with the new thresholds and the new read counts for
+ * each trimmed block.
+ *
+ * Author: Gene Myers
+ * Date : September 2017
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "DB.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#else
+#define PATHSEP "/"
+#endif
+
+static char *Usage = "[-af] [-x<int>] <path:db|dam>";
+
+int main(int argc, char *argv[])
+{ HITS_DB db, dbs;
+ int64 dbpos;
+ FILE *dbfile, *ixfile;
+ int nblocks;
+ int status;
+
+ int FORCE;
+ int ALL;
+ int CUTOFF;
+
+ { int i, j, k;
+ int flags[128];
+ char *eptr;
+ float size;
+
+ ARG_INIT("DBtrim")
+
+ CUTOFF = 0;
+ size = 200;
+
+ j = 1;
+ for (i = 1; i < argc; i++)
+ if (argv[i][0] == '-')
+ switch (argv[i][1])
+ { default:
+ ARG_FLAGS("af")
+ break;
+ case 'x':
+ ARG_NON_NEGATIVE(CUTOFF,"Min read length cutoff")
+ break;
+ }
+ else
+ argv[j++] = argv[i];
+ argc = j;
+
+ ALL = flags['a'];
+ FORCE = flags['f'];
+
+ if (argc != 2)
+ { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+ exit (1);
+ }
+ }
+
+ // Open db
+
+ status = Open_DB(argv[1],&db);
+ if (status < 0)
+ exit (1);
+ if (db.part > 0)
+ { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
+ exit (1);
+ }
+
+ { char *pwd, *root;
+ char buffer[2*MAX_NAME+100];
+ int nfiles;
+ int all, cutoff;
+ int64 size;
+ int i;
+
+ pwd = PathTo(argv[1]);
+ if (status)
+ { root = Root(argv[1],".dam");
+ dbfile = Fopen(Catenate(pwd,"/",root,".dam"),"r+");
+ }
+ else
+ { root = Root(argv[1],".db");
+ dbfile = Fopen(Catenate(pwd,"/",root,".db"),"r+");
+ }
+ ixfile = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+");
+ if (dbfile == NULL || ixfile == NULL)
+ exit (1);
+ free(pwd);
+ free(root);
+
+ if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
+ SYSTEM_ERROR
+ for (i = 0; i < nfiles; i++)
+ if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL)
+ SYSTEM_ERROR
+
+ if (fread(&dbs,sizeof(HITS_DB),1,ixfile) != 1)
+ SYSTEM_ERROR
+
+ if (dbs.cutoff >= 0)
+ { if (!FORCE)
+ { printf("You are about to reset the thresholds for the trimmed DB.\n");
+ printf("This will invalidate any .las files produced by daligner\n");
+ printf("Are you sure you want to proceed? [Y/N] ");
+ fflush(stdout);
+ if (fgets(buffer,100,stdin) == NULL)
+ SYSTEM_ERROR
+ if (index(buffer,'n') != NULL || index(buffer,'N') != NULL)
+ { printf("Aborted\n");
+ fflush(stdout);
+ fclose(dbfile);
+ exit (1);
+ }
+ }
+ }
+ else
+ { fprintf(stderr,"%s: DB has not yet been split, use DBsplit\n",Prog_Name);
+ exit (1);
+ }
+
+ fscanf(dbfile,DB_NBLOCK,&nblocks);
+
+ dbpos = ftello(dbfile);
+ fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all);
+ fseeko(dbfile,dbpos,SEEK_SET);
+ fprintf(dbfile,DB_PARAMS,size,CUTOFF,ALL);
+ }
+
+ { HITS_READ *reads = db.reads;
+ int uread, tread;
+ int rlen;
+ int b, u, t;
+
+ u = 0;
+ t = 0;
+ fprintf(dbfile,DB_BDATA,0,0);
+ for (b = 0; b < nblocks; b++)
+ { dbpos = ftello(dbfile);
+ fscanf(dbfile,DB_BDATA,&uread,&tread);
+
+ if (ALL)
+ while (u < uread)
+ { rlen = reads[u++].rlen;
+ if (rlen >= CUTOFF)
+ t += 1;
+ }
+ else
+ while (u < uread)
+ { rlen = reads[u].rlen;
+ if (rlen >= CUTOFF && (reads[u].flags & DB_BEST) != 0)
+ t += 1;
+ u += 1;
+ }
+
+ fseeko(dbfile,dbpos,SEEK_SET);
+ fprintf(dbfile,DB_BDATA,uread,t);
+ }
+
+ dbs.cutoff = CUTOFF;
+ if (ALL)
+ dbs.allarr |= DB_ALL;
+ dbs.treads = t;
+ rewind(ixfile);
+ fwrite(&dbs,sizeof(HITS_DB),1,ixfile);
+ }
+
+ fclose(ixfile);
+ fclose(dbfile);
+ Close_DB(&db);
+
+ exit (0);
+}
diff --git a/Makefile b/Makefile
index 873296f..8ed8f68 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ DEST_DIR = ~/bin
CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing
ALL = fasta2DB DB2fasta quiva2DB DB2quiva DBsplit DBdust Catrack DBshow DBstats DBrm simulator \
- fasta2DAM DAM2fasta DBdump rangen arrow2DB DB2arrow DBwipe
+ fasta2DAM DAM2fasta DBdump rangen arrow2DB DB2arrow DBwipe DBtrim
all: $(ALL)
@@ -28,6 +28,9 @@ arrow2DB: arrow2DB.c DB.c QV.c DB.h QV.h
DBsplit: DBsplit.c DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o DBsplit DBsplit.c DB.c QV.c -lm
+DBtrim: DBtrim.c DB.c DB.h QV.c QV.h
+ gcc $(CFLAGS) -o DBtrim DBtrim.c DB.c QV.c -lm
+
DBdust: DBdust.c DB.c DB.h QV.c QV.h
gcc $(CFLAGS) -o DBdust DBdust.c DB.c QV.c -lm
diff --git a/README.md b/README.md
index ba37408..0967a17 100644
--- a/README.md
+++ b/README.md
@@ -279,7 +279,14 @@ question has previously bin split, i.e. one is not interactively asked if they w
to proceed.
```
-10. DBdust [-b] [-w<int(64)>] [-t<double(2.)>] [-m<int(10)>] <path:db|dam>
+10. DBtrim [-af] [-x<int>] <path:db|dam>
+```
+
+Exactly like DBsplit except that it only resets the trimming parameters (and not the split
+partition itself).
+
+```
+11. DBdust [-b] [-w<int(64)>] [-t<double(2.)>] [-m<int(10)>] <path:db|dam>
```
Runs the symmetric DUST algorithm over the reads in the untrimmed DB \<path\>.db or
@@ -300,16 +307,18 @@ This permits job parallelism in block-sized chunks, and the resulting sequence o
block tracks can then be merged into a track for the entire untrimmed DB with Catrack.
```
-11. Catrack [-v] <path:db|dam> <track:name>
+12. Catrack [-vf] <path:db|dam> <track:name>
```
Find all block tracks of the form .\<path\>.#.\<track\>... and merge them into a single
track, .\<path\>.\<track\>..., for the given DB or DAM. The block track files must all
encode the same kind of track data (this is checked), and the files must exist for
-block 1, 2, 3, ... up to the last block number.
+block 1, 2, 3, ... up to the last block number. If the -f option is set, then the
+concatenation takes place regardless of whether or not the single, combined track
+exists or not.
```
-12. DBshow [-unqaUQA] [-w<int(80)>] [-m<track>]+
+13. DBshow [-unqaUQA] [-w<int(80)>] [-m<track>]+
<path:db|dam> [ <reads:FILE> | <reads:range> ... ]
```
@@ -348,7 +357,7 @@ fasta2DB, quiva2D, and arrow2DB, giving one a simple way to make a DB of a subse
the reads for testing purposes.
```
-13. DBdump [-rhsaqip] [-uU] [-m<track>]+
+14. DBdump [-rhsaqip] [-uU] [-m<track>]+
<path:db|dam> [ <reads:FILE> | <reads:range> ... ]
```
@@ -422,7 +431,7 @@ Arrow pulse width strings are identical to that for the sequence as they are all
the same length for any given entry.
```
-14. DBstats [-nu] [-b<int(1000)] [-m<track>]+ <path:db|dam>
+15. DBstats [-nu] [-b<int(1000)] [-m<track>]+ <path:db|dam>
```
Show overview statistics for all the reads in the trimmed data base \<path\>.db or
@@ -434,7 +443,7 @@ intervals along the read can be specified with the -m option in which case a sum
and a histogram of the interval lengths is displayed.
```
-15. DBrm <path:db|dam> ...
+16. DBrm <path:db|dam> ...
```
Delete all the files for the given data bases. Do not use rm to remove a database, as
@@ -442,7 +451,7 @@ there are at least two and often several secondary files for each DB including t
files, and all of these are removed by DBrm.
```
-16. DBwipe <path:db|dam> ...
+17. DBwipe <path:db|dam> ...
```
Delete any Arrow or Quiver data from the given databases. This removes the .arw or
@@ -450,7 +459,7 @@ Delete any Arrow or Quiver data from the given databases. This removes the .arw
or Quiver. Basically, converts an A-DB or Q-DB back to a simple S-DB.
```
-17. simulator <genome:dam> [-CU] [-m<int(10000)>] [-s<int(2000)>] [-e<double(.15)]
+18. simulator <genome:dam> [-CU] [-m<int(10000)>] [-s<int(2000)>] [-e<double(.15)]
[-c<double(50.)>] [-f<double(.5)>] [-x<int(4000)>]
[-w<int(80)>] [-r<int>] [-M<file>]
```
@@ -485,7 +494,7 @@ a read is say 's b e' then if b \< e the read is a perturbed copy of s[b,e] in t
forward direction, and a perturbed copy s[e,b] in the reverse direction otherwise.
```
-18. rangen <genlen:double> [-U] [-b<double(.5)>] [-w<int(80)>] [-r<int>]
+19. rangen <genlen:double> [-U] [-b<double(.5)>] [-w<int(80)>] [-r<int>]
```
Generate a random DNA sequence of length genlen*1Mbp that has an AT-bias of -b.
diff --git a/arrow2DB.c b/arrow2DB.c
index 19975aa..d7890e9 100644
--- a/arrow2DB.c
+++ b/arrow2DB.c
@@ -222,6 +222,7 @@ int main(int argc, char *argv[])
goto error;
}
+ eof = 0;
for (cell = 0; cell < nfiles; cell++)
{ char prolog[MAX_NAME], fname[MAX_NAME];
diff --git a/simulator.c b/simulator.c
index cf1d694..57cd956 100644
--- a/simulator.c
+++ b/simulator.c
@@ -41,6 +41,8 @@
#include <unistd.h>
#include <math.h>
+#define PACBIO
+
#include "DB.h"
static char *Usage[] = { "<genome:dam> [-CU] [-m<int(10000)>] [-s<int(2000)>] [-e<double(.15)>]",
@@ -61,10 +63,26 @@ static int HASR; // -r option is set?
static int SEED; // -r option
static FILE *MAP; // -M option
-#define INS_RATE .73333 // insert rate
+#ifdef PACBIO
+
+#define INS_RATE .73333 // insert rate (for PB data)
#define DEL_RATE .20000 // deletion rate
#define IDL_RATE .93333 // insert + delete rate
+#elif ILLUMINA
+
+#define INS_RATE .1 // insert rate (for Illumina data)
+#define DEL_RATE .1 // deletion rate
+#define IDL_RATE .2 // insert + delete rate
+
+#else
+
+#define INS_RATE .33333 // insert rate (equal weighting)
+#define DEL_RATE .33333 // deletion rate
+#define IDL_RATE .66666 // insert + delete rate
+
+#endif
+
// Complement (in the DNA sense) string *s*.
static void complement(int elen, char *s)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/dazzdb.git
More information about the debian-med-commit
mailing list