[med-svn] [Git][med-team/dazzdb][upstream] New upstream version 1.0+git20180908.0bd5e07

Andreas Tille gitlab at salsa.debian.org
Sun Oct 28 09:31:33 GMT 2018


Andreas Tille pushed to branch upstream at Debian Med / dazzdb


Commits:
d9350062 by Andreas Tille at 2018-10-28T09:26:00Z
New upstream version 1.0+git20180908.0bd5e07
- - - - -


13 changed files:

- Catrack.c
- DB.c
- DB.h
- DBdump.c
- DBdust.c
- DBshow.c
- DBsplit.c
- DBtrim.c
- README.md
- fasta2DAM.c
- fasta2DB.c
- rangen.c
- simulator.c


Changes:

=====================================
Catrack.c
=====================================
@@ -131,6 +131,7 @@ int main(int argc, char *argv[])
     int         nextra;
     int64       extail;
 
+    extra    = NULL;
     anno     = NULL;
     trackoff = 0;
     tracktot = tracksiz = 0;
@@ -144,7 +145,7 @@ int main(int argc, char *argv[])
     while (1)
       { FILE *dfile, *afile;
         char *dfile_name, *afile_name;
-        int   i, size, esize, tracklen;
+        int   i, apos, size, esize, tracklen;
 
         afile_name = Strdup(Numbered_Suffix(prefix,nfiles+1,Catenate(".",argv[2],".","anno")),
                             "Allocating .anno file name");
@@ -256,10 +257,11 @@ int main(int argc, char *argv[])
           }
 
         FSEEKO(afile,0,SEEK_END)
+        FTELLO(apos,afile)
         if (dfile != NULL)
-          extail = FTELLO(afile) - (esize*(tracklen+1) + 2*sizeof(int)); 
+          extail = apos - (esize*(tracklen+1) + 2*sizeof(int)); 
         else
-          extail = FTELLO(afile) - (esize*tracklen + 2*sizeof(int)); 
+          extail = apos - (esize*tracklen + 2*sizeof(int)); 
         FSEEKO(afile,-extail,SEEK_END)
 
         if (extail >= 20)
@@ -338,16 +340,16 @@ int main(int argc, char *argv[])
         FWRITE(&tracksiz,sizeof(int),1,aout)
       }
   }
-  
-  FCLOSE(aout);
-  if (dout != NULL)
-    FCLOSE(dout);
 
   if (nfiles != nblocks)
     { fprintf(stderr,"%s: Did not catenate all tracks of DB (nfiles %d != nblocks %d)\n",
 	             Prog_Name, nfiles, nblocks);
       goto error;
     }
+  
+  FCLOSE(aout);
+  if (dout != NULL)
+    FCLOSE(dout);
 
   if (DELETE)
     { int   i;


=====================================
DB.c
=====================================
@@ -17,6 +17,8 @@
 #include <ctype.h>
 #include <unistd.h>
 #include <dirent.h>
+#include <limits.h>
+#include <sys/stat.h>
 
 #include "DB.h"
 
@@ -599,8 +601,8 @@ error:
 //   for the retained reads.
 
 void Trim_DB(DAZZ_DB *db)
-{ int         i, j, r;
-  int         allflag, cutoff;
+{ int         i, j, r, f;
+  int         allflag, cutoff, css;
   int64       totlen;
   int         maxlen, nreads;
   DAZZ_TRACK *record;
@@ -676,14 +678,23 @@ void Trim_DB(DAZZ_DB *db)
         record->anno = Realloc(record->anno,record->size*(j+1),NULL);
       }
 
+  css    = 0;
   totlen = maxlen = 0;
   for (j = i = 0; i < nreads; i++)
-    { r = reads[i].rlen;
-      if ((reads[i].flags & DB_BEST) >= allflag && r >= cutoff)
+    { f = reads[i].flags;
+      if ((f & DB_CSS) == 0)
+        css = 0;
+      r = reads[i].rlen;
+      if ((f & DB_BEST) >= allflag && r >= cutoff)
         { totlen += r;
           if (r > maxlen)
             maxlen = r;
-          reads[j++] = reads[i];
+          reads[j] = reads[i];
+          if (css)
+            reads[j++].flags |= DB_CSS;
+          else
+            reads[j++].flags &= ~DB_CSS;
+          css = 1;
         }
     }
   
@@ -1989,3 +2000,196 @@ void Print_Read(char *s, int width)
       printf("\n");
     }
 }
+
+
+/*******************************************************************************************
+ *
+ *  COMMAND LINE BLOCK PARSER
+ *    Take a command line argument and interpret the '@' block number ranges.
+ *    Parse_Block_Arg produces an Block_Looper iterator object that can then
+ *    be invoked multiple times to iterate through all the files implied by
+ *    the @ pattern/range.
+ *
+ ********************************************************************************************/
+
+typedef struct
+  { int first, last, next;
+    char *root, *pwd, *ppnt;
+    char *slice;
+  } _Block_Looper;
+
+  //  Advance the iterator e_parse to the next file, open it, and return the file pointer
+  //   to it.  Return NULL if at the end of the list of files.
+
+int Next_Block_Exists(Block_Looper *e_parse)
+{ _Block_Looper *parse = (_Block_Looper *) e_parse;
+
+  char       *disp;
+  struct stat sts;
+
+  if (parse->next+1 > parse->last)
+    return (0);
+
+  if (parse->next < 0)
+    disp  = parse->root;
+  else
+    disp = Numbered_Suffix(parse->root,parse->next+1,parse->ppnt);
+
+  if (stat(Catenate(parse->pwd,"/",disp,".las"),&sts))
+    return (0);
+  else
+    return (1);
+}
+
+
+FILE *Next_Block_Arg(Block_Looper *e_parse)
+{ _Block_Looper *parse = (_Block_Looper *) e_parse;
+
+  char *disp;
+  FILE *input;
+
+  parse->next += 1;
+  if (parse->next > parse->last)
+    return (NULL);
+
+  if (parse->next < 0)
+    disp  = parse->root;
+  else
+    disp = Numbered_Suffix(parse->root,parse->next,parse->ppnt);
+
+  if ((input = fopen(Catenate(parse->pwd,"/",disp,".las"),"r")) == NULL)
+    { if (parse->last != INT_MAX)
+        { fprintf(stderr,"%s: %s.las is not present\n",Prog_Name,disp);
+          exit (1);
+        }
+      return (NULL);
+    }
+  return (input);
+}
+
+  //  Reset the iterator e_parse to the first file
+
+void Reset_Block_Arg(Block_Looper *e_parse)
+{ _Block_Looper *parse = (_Block_Looper *) e_parse;
+
+  parse->next = parse->first - 1;
+}
+
+  //  Return a pointer to the path for the current file
+
+char *Block_Arg_Path(Block_Looper *e_parse)
+{ _Block_Looper *parse = (_Block_Looper *) e_parse;
+
+  return (parse->pwd);
+}
+
+  //  Return a pointer to the root name for the current file
+
+char *Block_Arg_Root(Block_Looper *e_parse)
+{ _Block_Looper *parse = (_Block_Looper *) e_parse;
+
+  if (parse->next < 0)
+    return (parse->root);
+  else
+    return (Numbered_Suffix(parse->root,parse->next,parse->ppnt));
+}
+
+  //  Free the iterator
+
+void Free_Block_Arg(Block_Looper *e_parse)
+{ _Block_Looper *parse = (_Block_Looper *) e_parse;
+
+  free(parse->root);
+  free(parse->pwd);
+  free(parse->slice);
+  free(parse);
+}
+
+char *Next_Block_Slice(Block_Looper *e_parse, int slice)
+{ _Block_Looper *parse = (_Block_Looper *) e_parse;
+
+  if (parse->slice == NULL)
+    { int size = strlen(parse->pwd) + strlen(Block_Arg_Root(parse)) + 30;
+      parse->slice =  (char *)  Malloc(size,"Block argument slice");
+      if (parse->slice == NULL)
+        exit (1);
+    }
+
+  if (parse->first < 0)
+    sprintf(parse->slice,"%s/%s",parse->pwd,parse->root);
+  else
+    sprintf(parse->slice,"%s/%s%c%d-%d%s",parse->pwd,parse->root,BLOCK_SYMBOL,parse->next+1,
+                                          parse->next+slice,parse->ppnt);
+  parse->next += slice;
+  return (parse->slice);
+}
+
+  //  Parse the command line argument and return an iterator to move through the
+  //    file names, setting it up to report the first file.
+
+Block_Looper *Parse_Block_Arg(char *arg)
+{ _Block_Looper *parse;
+  char *pwd, *root;
+  char *ppnt, *cpnt;
+  int   first, last;
+
+  parse = (_Block_Looper *) Malloc(sizeof(_Block_Looper),"Allocating parse node");
+  pwd   = PathTo(arg);
+  root  = Root(arg,".las");
+  if (parse == NULL || pwd == NULL || root == NULL)
+    exit (1);
+
+  ppnt = index(root,BLOCK_SYMBOL);
+  if (ppnt == NULL)
+    first = last = -1;
+  else
+    { if (index(ppnt+1,BLOCK_SYMBOL) != NULL)
+        { fprintf(stderr,"%s: Two or more occurences of %c-sign in source name '%s'\n",
+                         Prog_Name,BLOCK_SYMBOL,root);
+          exit (1);
+        }
+      *ppnt++ = '\0';
+      first = strtol(ppnt,&cpnt,10);
+      if (cpnt == ppnt)
+        { first = 1;
+          last  = INT_MAX;
+        }
+      else
+        { if (first < 0)
+            { fprintf(stderr,
+                      "%s: Integer following %c-sigan is less than 0 in source name '%s'\n",
+                      Prog_Name,BLOCK_SYMBOL,root);
+              exit (1);
+            }
+          if (*cpnt == '-')
+            { ppnt = cpnt+1;
+              last = strtol(ppnt,&cpnt,10);
+              if (cpnt == ppnt)
+                { fprintf(stderr,"%s: Second integer must follow - in source name '%s'\n",
+                                 Prog_Name,root);
+                  exit (1);
+                }
+              if (last < first)
+                { fprintf(stderr,
+                          "%s: 2nd integer is less than 1st integer in source name '%s'\n",
+                          Prog_Name,root);
+                  exit (1);
+                }
+              ppnt = cpnt;
+            }
+          else
+            { last = INT_MAX;
+              ppnt = cpnt;
+            }
+        }
+    }
+
+  parse->pwd   = pwd;
+  parse->root  = root;
+  parse->ppnt  = ppnt;
+  parse->first = first;
+  parse->last  = last;
+  parse->next  = first-1;
+  parse->slice = NULL;
+  return ((Block_Looper *) parse);
+}


=====================================
DB.h
=====================================
@@ -59,6 +59,8 @@ typedef signed long long   int64;
 typedef float              float32;
 typedef double             float64;
 
+#define LAST_READ_SYMBOL '$'
+#define BLOCK_SYMBOL     '@'
 
 /*******************************************************************************************
  *
@@ -215,12 +217,11 @@ int Count_Args(char *arg);
       SYSTEM_READ_ERROR		\
   }
 
-#define FTELLO(file)		\
-  ( { int x = ftello(file);	\
-      if (x < 0)		\
-        SYSTEM_READ_ERROR	\
-      ; x; 			\
-  } )
+#define FTELLO(val,file)	\
+  { val = ftello(file);		\
+    if (val < 0)		\
+      SYSTEM_READ_ERROR		\
+  }
 
 /*******************************************************************************************
  *
@@ -567,4 +568,25 @@ int Read_All_Sequences(DAZZ_DB *db, int ascii);
 
 int List_DB_Files(char *path, void actor(char *path, char *extension));
 
+  //   Take a command line argument and interpret the '@' block number ranges.
+  //   Parse_Block_Arg produces a Block_Looper iterator object that can then
+  //   be invoked multiple times to iterate through all the files implied by
+  //   the @ pattern/range.  Next_Block_Slice returns a string encoing the next
+  //   slice files represented by an @-notation, and advances the iterator by
+  //   that many files.
+
+typedef void Block_Looper;
+
+Block_Looper *Parse_Block_Arg(char *arg);
+
+FILE *Next_Block_Arg(Block_Looper *e_parse);
+int   Next_Block_Exists(Block_Looper *e_parse);
+
+char *Next_Block_Slice(Block_Looper *e_parse,int slice);
+
+void  Reset_Block_Arg(Block_Looper *e_parse);  //  Reset iterator to first file
+char *Block_Arg_Path(Block_Looper *e_parse);   //  Path of current file
+char *Block_Arg_Root(Block_Looper *e_parse);   //  Root name of current file
+void  Free_Block_Arg(Block_Looper *e_parse);   //  Free the iterator
+
 #endif // _DAZZ_DB


=====================================
DBdump.c
=====================================
@@ -94,6 +94,7 @@ static int prof_map[41] =
 int main(int argc, char *argv[])
 { DAZZ_DB    _db, *db = &_db;
   int         Quiva_DB, Arrow_DB;
+  int         FirstRead;
   FILE       *hdrs      = NULL;
   char       *hdrs_name = NULL;
   int64      *qv_idx    = NULL;
@@ -377,7 +378,10 @@ int main(int argc, char *argv[])
           else if (status == 1)
             MTRACK[i] = Load_Track(db,MASK[i]);
         }
+      FirstRead = db->tfirst;
     }
+  else
+    FirstRead = db->ufirst;
 
   if (DOIQV)
     { int         status, kind;
@@ -713,7 +717,7 @@ int main(int argc, char *argv[])
             r   = reads + i;
             len = r->rlen;
             if (DORED)
-              printf("R %d\n",i+1);
+              printf("R %d\n",FirstRead + (i+1));
 
             flags = r->flags;
             qv    = (flags & DB_QV);


=====================================
DBdust.c
=====================================
@@ -155,7 +155,7 @@ int main(int argc, char *argv[])
         FWRITE(&size,sizeof(int),1,afile)
         FSEEKO(afile,0,SEEK_END)
         FSEEKO(dfile,0,SEEK_END)
-        indx = FTELLO(dfile);
+        FTELLO(indx,dfile)
       }
 
     free(pwd);


=====================================
DBshow.c
=====================================
@@ -175,8 +175,8 @@ int main(int argc, char *argv[])
         fprintf(stderr,"      -n: Do not show the default read DNA sequences.\n");
         fprintf(stderr,"      -m: Show mask intervals and highlight in sequence.\n");
         fprintf(stderr,"\n");
-        fprintf(stderr,"      -Q: Produce a .quiva file (ignore all other options but -uU.\n");
-        fprintf(stderr,"      -A: Produce a .arrow file (ignore all other options but -uw.\n");
+        fprintf(stderr,"      -Q: Produce a .quiva file (ignore all other options but -uU).\n");
+        fprintf(stderr,"      -A: Produce a .arrow file (ignore all other options but -uw).\n");
         fprintf(stderr,"\n");
         fprintf(stderr,"      -U: Use upper case for DNA (default is lower case).\n");
         fprintf(stderr,"      -w: Print -w bp per line (default is 80).\n");
@@ -349,14 +349,13 @@ int main(int argc, char *argv[])
   if (argc == 3)
     { if (argv[2][0] != LAST_READ_SYMBOL || argv[2][1] != '\0')
         { char *eptr, *fptr;
-          int   b, e;
 
-          b = strtol(argv[2],&eptr,10);
-          if (eptr > argv[2] && b > 0)
+          strtol(argv[2],&eptr,10);
+          if (eptr > argv[2])
             { if (*eptr == '-')
                 { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0')
-                    { e = strtol(eptr+1,&fptr,10);
-                      input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0);
+                    { strtol(eptr+1,&fptr,10);
+                      input_pts = (fptr <= eptr+1 || *fptr != '\0');
                     }
                 }
               else


=====================================
DBsplit.c
=====================================
@@ -32,7 +32,15 @@
 #define PATHSEP "/"
 #endif
 
-static char *Usage = "[-af] [-x<int>] [-s<double(200.)>] <path:db|dam>";
+static char *Usage = "[-aflm] [-x<int>] [-s<double(200.)>] <path:db|dam>";
+
+static DAZZ_READ *reads;
+
+static int MSORT(const void *l, const void *r)
+{ int x = *((int *) l);
+  int y = *((int *) r);
+  return (reads[x].rlen - reads[y].rlen);
+}
 
 int main(int argc, char *argv[])
 { DAZZ_DB    db, dbs;
@@ -40,11 +48,16 @@ int main(int argc, char *argv[])
   FILE      *dbfile, *ixfile;
   char      *dbfile_name, *ixfile_name;
   int        status;
+  int        nreads;
+
+  int        medmax = 0;
+  int       *msort = NULL;
 
   int        FORCE;
   int        ALL;
   int        CUTOFF;
   int64      SIZE;
+  int        SELECT;
 
   { int   i, j, k;
     int   flags[128];
@@ -53,6 +66,7 @@ int main(int argc, char *argv[])
 
     ARG_INIT("DBsplit")
 
+    SELECT = -1;
     CUTOFF = 0;
     size   = 200;
 
@@ -61,7 +75,7 @@ int main(int argc, char *argv[])
       if (argv[i][0] == '-')
         switch (argv[i][1])
         { default:
-            ARG_FLAGS("af")
+            ARG_FLAGS("aflm")
             break;
           case 'x':
             ARG_NON_NEGATIVE(CUTOFF,"Min read length cutoff")
@@ -82,6 +96,16 @@ int main(int argc, char *argv[])
     ALL   = flags['a'];
     FORCE = flags['f'];
 
+    if (flags['l'])
+      { SELECT = 0;
+        if (flags['m'])
+          { fprintf(stderr,"%s: Cannot set both -l and -m, mutually exclusive\n",Prog_Name);
+            exit (1);
+          }
+      }
+    else if (flags['m'])
+      SELECT = 1;
+
     if (argc != 2)
       { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
         fprintf(stderr,"\n");
@@ -89,6 +113,9 @@ int main(int argc, char *argv[])
         fprintf(stderr,"      -x: Trimmed DB has reads >= this threshold.\n");
         fprintf(stderr,"      -a: Trimmed DB contains all reads from a well (not just longest).\n");
         fprintf(stderr,"      -f: Force the split to occur even if already split.\n");
+        fprintf(stderr,"\n");
+        fprintf(stderr,"      -l: Set primary read for a well to be the longest.\n");
+        fprintf(stderr,"      -m: Set primary read for a well to be the median.\n");
         exit (1);
       }
   }
@@ -102,6 +129,8 @@ int main(int argc, char *argv[])
     { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
       exit (1);
     }
+  reads  = db.reads;
+  nreads = db.ureads;
 
   { char    *pwd, *root;
     char     buffer[2*MAX_NAME+100];
@@ -146,18 +175,59 @@ int main(int argc, char *argv[])
           }
       }
 
-    dbpos = FTELLO(dbfile);
+    FTELLO(dbpos,dbfile)
     FSEEKO(dbfile,dbpos,SEEK_SET)
     FPRINTF(dbfile,DB_NBLOCK,0)
     FPRINTF(dbfile,DB_PARAMS,SIZE,CUTOFF,ALL)
   }
 
-  { DAZZ_READ *reads  = db.reads;
-    int        nreads = db.ureads;
-    int64      totlen;
+  if (SELECT >= 0)
+    { int        i, j, k, h;
+      int        off;
+
+      off = (~ DB_BEST);
+
+      for (i = 0; i < nreads; i = j)
+        { j = i+1;
+          reads[i].flags &= off;
+          while ((reads[j].flags & DB_CSS) != 0)
+            reads[j++].flags &= off;
+  
+          if (j-i <= 1)
+            { reads[i].flags |= DB_BEST;
+              continue;
+            }
+  
+          if (SELECT == 0)
+            { int mlen;
+
+              mlen = reads[h = i].rlen;
+              for (k = i+1; k < j; k++)
+                if (mlen < reads[k].rlen)
+                  mlen = reads[h = k].rlen;
+            }
+          else
+            { if (j-i > medmax)
+                { medmax = 1.2*(j-i) + 1000;
+                  msort  = (int *) Realloc(msort,sizeof(int)*medmax,"Allocating Median Array");
+                  if (msort == NULL)
+                    exit (1);
+                }
+              for (k = i; k < j; k++)
+                msort[k-i] = k;
+              qsort(msort,j-i,sizeof(int),MSORT);
+              h = msort[(j-i)/2];
+            }
+          reads[h].flags |= DB_BEST;
+        } 
+    }
+
+
+  { int64      totlen;
     int        nblock, ireads, treads, rlen, fno;
-    int        i;
+    int        i, upos, css;
 
+    css    = 0;
     nblock = 0;
     totlen = 0;
     ireads = 0;
@@ -166,16 +236,19 @@ int main(int argc, char *argv[])
     if (ALL)
       for (i = 0; i < nreads; i++)
         { rlen = reads[i].rlen;
+          if ((reads[i].flags & DB_CSS) == 0)
+            css = 0;
           if (rlen >= CUTOFF)
-            { ireads += 1;
-              treads += 1;
-              totlen += rlen;
-              if (totlen >= SIZE)
-                { FPRINTF(dbfile,DB_BDATA,i+1,treads)
+            { if (css == 0 && totlen >= SIZE)
+                { FPRINTF(dbfile,DB_BDATA,i,treads)
                   totlen = 0;
                   ireads = 0;
                   nblock += 1;
                 }
+              ireads += 1;
+              treads += 1;
+              totlen += rlen;
+              css = 1;
             }
         }
     else
@@ -199,7 +272,8 @@ int main(int argc, char *argv[])
         nblock += 1;
       }
     fno = fileno(dbfile);
-    if (ftruncate(fno,FTELLO(dbfile)) < 0)
+    FTELLO(upos,dbfile)
+    if (ftruncate(fno,upos) < 0)
       SYSTEM_WRITE_ERROR
 
     FSEEKO(dbfile,dbpos,SEEK_SET)
@@ -208,9 +282,13 @@ int main(int argc, char *argv[])
     dbs.cutoff = CUTOFF;
     if (ALL)
       dbs.allarr |= DB_ALL;
+    else
+      dbs.allarr &= ~DB_ALL;
     dbs.treads = treads;
     FSEEKO(ixfile,0,SEEK_SET)
     FWRITE(&dbs,sizeof(DAZZ_DB),1,ixfile)
+    if (SELECT >= 0)
+      FWRITE(reads,sizeof(DAZZ_READ),nreads,ixfile)
   }
 
   FCLOSE(ixfile)


=====================================
DBtrim.c
=====================================
@@ -22,7 +22,15 @@
 #define PATHSEP "/"
 #endif
 
-static char *Usage = "[-af] [-x<int>] <path:db|dam>";
+static char *Usage = "[-aflm] [-x<int>] <path:db|dam>";
+
+static DAZZ_READ *reads;
+
+static int MSORT(const void *l, const void *r)
+{ int x = *((int *) l);
+  int y = *((int *) r);
+  return (reads[x].rlen - reads[y].rlen);
+}
 
 int main(int argc, char *argv[])
 { DAZZ_DB    db, dbs;
@@ -31,10 +39,15 @@ int main(int argc, char *argv[])
   char      *dbfile_name, *ixfile_name;
   int        nblocks;
   int        status;
+  int        nreads;
+
+  int        medmax = 0;
+  int       *msort = NULL;
 
   int        FORCE;
   int        ALL;
   int        CUTOFF;
+  int        SELECT;
 
   { int   i, j, k;
     int   flags[128];
@@ -42,6 +55,7 @@ int main(int argc, char *argv[])
 
     ARG_INIT("DBtrim")
 
+    SELECT = -1;
     CUTOFF = 0;
 
     j = 1;
@@ -49,7 +63,7 @@ int main(int argc, char *argv[])
       if (argv[i][0] == '-')
         switch (argv[i][1])
         { default:
-            ARG_FLAGS("af")
+            ARG_FLAGS("aflm")
             break;
           case 'x':
             ARG_NON_NEGATIVE(CUTOFF,"Min read length cutoff")
@@ -62,12 +76,25 @@ int main(int argc, char *argv[])
     ALL   = flags['a'];
     FORCE = flags['f'];
 
+    if (flags['l'])
+      { SELECT = 0;
+        if (flags['m'])
+          { fprintf(stderr,"%s: Cannot set both -l and -m, mutually exclusive\n",Prog_Name);
+            exit (1);
+          }
+      }
+    else if (flags['m'])
+      SELECT = 1;
+
     if (argc != 2)
       { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
         fprintf(stderr,"\n");
         fprintf(stderr,"      -x: Trimmed DB has reads >= this threshold.\n");
         fprintf(stderr,"      -a: Trimmed DB contains all reads from a well (not just longest).\n");
         fprintf(stderr,"      -f: Force the new trim setting even if already set.\n");
+        fprintf(stderr,"\n");
+        fprintf(stderr,"      -l: Set primary read for a well to be the longest.\n");
+        fprintf(stderr,"      -m: Set primary read for a well to be the median.\n");
         exit (1);
       }
   }
@@ -81,6 +108,8 @@ int main(int argc, char *argv[])
     { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
       exit (1);
     }
+  reads  = db.reads;
+  nreads = db.ureads;
 
   { char    *pwd, *root;
     char     buffer[2*MAX_NAME+100];
@@ -135,12 +164,53 @@ int main(int argc, char *argv[])
 
     FSCANF(dbfile,DB_NBLOCK,&nblocks)
 
-    dbpos = FTELLO(dbfile);
+    FTELLO(dbpos,dbfile)
     FSCANF(dbfile,DB_PARAMS,&size,&cutoff,&all)
     FSEEKO(dbfile,dbpos,SEEK_SET)
     FPRINTF(dbfile,DB_PARAMS,size,CUTOFF,ALL)
   }
 
+  if (SELECT >= 0)
+    { int        i, j, k, h;
+      int        off;
+
+      off = (~ DB_BEST);
+
+      for (i = 0; i < nreads; i = j)
+        { j = i+1;
+          reads[i].flags &= off;
+          while ((reads[j].flags & DB_CSS) != 0)
+            reads[j++].flags &= off;
+
+          if (j-i <= 1)
+            { reads[i].flags |= DB_BEST;
+              continue;
+            }
+
+          if (SELECT == 0)
+            { int mlen;
+
+              mlen = reads[h = i].rlen;
+              for (k = i+1; k < j; k++)
+                if (mlen < reads[k].rlen)
+                  mlen = reads[h = k].rlen;
+            }
+          else
+            { if (j-i > medmax)
+                { medmax = 1.2*(j-i) + 1000;
+                  msort  = (int *) Realloc(msort,sizeof(int)*medmax,"Allocating Median Array");
+                  if (msort == NULL)
+                    exit (1);
+                }
+              for (k = i; k < j; k++)
+                msort[k-i] = k;
+              qsort(msort,j-i,sizeof(int),MSORT);
+              h = msort[(j-i)/2];
+            }
+          reads[h].flags |= DB_BEST;
+        }
+    }
+
   { DAZZ_READ *reads  = db.reads;
     int        uread, tread;
     int        rlen;
@@ -150,7 +220,7 @@ int main(int argc, char *argv[])
     t = 0;
     fprintf(dbfile,DB_BDATA,0,0);
     for (b = 0; b < nblocks; b++)
-      { dbpos = FTELLO(dbfile);
+      { FTELLO(dbpos,dbfile);
         FSCANF(dbfile,DB_BDATA,&uread,&tread)
 
         if (ALL)
@@ -174,9 +244,13 @@ int main(int argc, char *argv[])
     dbs.cutoff = CUTOFF;
     if (ALL)
       dbs.allarr |= DB_ALL;
+    else
+      dbs.allarr &= ~DB_ALL;
     dbs.treads = t;
     FSEEKO(ixfile,0,SEEK_SET)
     FWRITE(&dbs,sizeof(DAZZ_DB),1,ixfile)
+    if (SELECT >= 0)
+      FWRITE(reads,sizeof(DAZZ_READ),nreads,ixfile)
   }
 
   FCLOSE(ixfile)


=====================================
README.md
=====================================
@@ -116,8 +116,9 @@ refer to a database we are referring to either a DB or a DAM.
 The command DBsplit sets or resets the current partition for a database which is
 determined by 3 parameters: (i) the total number of basepairs to place in each block,
 (ii) the minimum read length of reads to include within a block, and (iii) whether or
-not to only include the longest read from a given well or all reads from a well (NB:
-several reads of the same insert in a given well can be produced by the Pacbio
+not to only include the longest (-l) or median (-m) read from a given well or all
+reads from a well (-a) (NB: several reads of the same insert in a given well can be
+produced by the Pacbio
 instrument).  Note that the length and uniqueness parameters effectively select a
 subset of the reads that contribute to the size of a block.  We call this subset the
 *trimmed* data base.  Some commands operate on the entire database, others on the
@@ -259,7 +260,7 @@ should be used, and the characters per line, or line width, can be set to any po
 value with the -w option.
 
 ```
-9. DBsplit [-af] [-x<int>] [-s<double(200.)>] <path:db|dam>
+9. DBsplit [-aflm] [-x<int>] [-s<double(200.)>] <path:db|dam>
 ```
 
 Divide the database \<path\>.db or \<path\>.dam conceptually into a series of blocks
@@ -278,6 +279,12 @@ If the -f option is set, the split is forced regardless of whether or not the DB
 question has previously bin split, i.e. one is not interactively asked if they wish
 to proceed.
 
+By default, the primary read for each well consists of the longest subread of the insert
+from the well.  By setting the -m parameter, the subread of median length becomes the
+primary read instead.  One can at any later time change this back to the default longest
+by splitting again with the -l parameter set.  The setting of the primary reads occurs
+regardless of whether the -a parameter is set or not.
+
 ```
 10. DBtrim [-af] [-x<int>] <path:db|dam>
 ```


=====================================
fasta2DAM.c
=====================================
@@ -407,8 +407,12 @@ int main(int argc, char *argv[])
         nline = 1;
         eof   = (fgets(read,MAX_NAME,input) == NULL);
         if (eof || strlen(read) < 1)
-          { fprintf(stderr,"Skipping '%s', file is empty!\n",core);
-            fclose(input);
+          { fclose(input);
+            if (PIPE != NULL)
+              { fprintf(stderr,"Standard input is empty, terminating!\n");
+                break;
+              }
+            fprintf(stderr,"Skipping '%s', file is empty!\n",core);
             free(core);
             continue;
           }


=====================================
fasta2DB.c
=====================================
@@ -390,16 +390,14 @@ int main(int argc, char *argv[])
 
         eof = (fgets(read,MAX_NAME,input) == NULL);
         if (eof || strlen(read) < 1)
-          { free(core);
-            fclose(input);
+          { fclose(input);
             if (PIPE != NULL)
               { fprintf(stderr,"Standard input is empty, terminating!\n");
                 break;
               }
-            else
-              { fprintf(stderr,"Skipping '%s', file is empty!\n",core);
-                continue;
-              }
+            fprintf(stderr,"Skipping '%s', file is empty!\n",core);
+            free(core);
+            continue;
           }
 
         //  Check that core is not too long and name is unique or last source if PIPE'd


=====================================
rangen.c
=====================================
@@ -148,6 +148,11 @@ int main(int argc, char *argv[])
 
   if (argc != 2)
     { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+      fprintf(stderr,"\n");
+      fprintf(stderr,"      -r: Random number generator seed (default is process id).\n");
+      fprintf(stderr,"      -b: AT vs GC ratio or bias.\n");
+      fprintf(stderr,"      -w: Print -w bp per line (default is 80).\n");
+      fprintf(stderr,"      -U: Output sequence in upper case (default is lower case).\n");
       exit (1);
     }
 


=====================================
simulator.c
=====================================
@@ -567,6 +567,19 @@ int main(int argc, char *argv[])
       { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
         fprintf(stderr,"       %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]);
         fprintf(stderr,"       %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]);
+        fprintf(stderr,"\n");
+        fprintf(stderr,"      -m: average read length (log normal distribution).\n");
+        fprintf(stderr,"      -s: standard deviation of read lengths (log normal)\n");
+        fprintf(stderr,"      -x: ignore reads below this length\n");
+        fprintf(stderr,"      -f: forward/reverse strand sampling fraction\n");
+        fprintf(stderr,"      -e: error rate\n");
+        fprintf(stderr,"      -c: coverage of genome\n");
+        fprintf(stderr,"      -C: assume genome is circular (default is linear)\n");
+        fprintf(stderr,"\n");
+        fprintf(stderr,"      -r: Random number generator seed (default is process id).\n");
+        fprintf(stderr,"      -w: Print -w bp per line (default is 80).\n");
+        fprintf(stderr,"      -U: Use upper case for DNA (default is lower case).\n");
+        fprintf(stderr,"      -M: create a map file that indicates where every read was sampled\n");
         exit (1);
       }
   }



View it on GitLab: https://salsa.debian.org/med-team/dazzdb/commit/d9350062a729e5294b2ca4cd106570d00dfe4719

-- 
View it on GitLab: https://salsa.debian.org/med-team/dazzdb/commit/d9350062a729e5294b2ca4cd106570d00dfe4719
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20181028/62bbcc29/attachment-0001.html>


More information about the debian-med-commit mailing list