[med-svn] [Git][med-team/dwgsim][upstream] New upstream version 0.1.13

Andreas Tille (@tille) gitlab at salsa.debian.org
Sun Jan 16 16:16:01 GMT 2022



Andreas Tille pushed to branch upstream at Debian Med / dwgsim


Commits:
386d48d6 by Andreas Tille at 2022-01-16T17:01:25+01:00
New upstream version 0.1.13
- - - - -


9 changed files:

- Makefile
- src/dwgsim.c
- src/dwgsim_opt.c
- src/dwgsim_opt.h
- src/mut_txt.c
- testdata/ex1.test.bfast.fastq.gz
- testdata/ex1.test.bwa.read1.fastq.gz
- testdata/ex1.test.bwa.read2.fastq.gz
- testdata/test.sh


Changes:

=====================================
Makefile
=====================================
@@ -1,4 +1,4 @@
-PACKAGE_VERSION="0.1.11"
+PACKAGE_VERSION="0.1.13"
 CC=			gcc
 CFLAGS=		-g -Wall -O3 #-m64 #-arch ppc
 DFLAGS=		-D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -DPACKAGE_VERSION=\\\"${PACKAGE_VERSION}\\\"


=====================================
src/dwgsim.c
=====================================
@@ -41,6 +41,7 @@
 #include <stdint.h>
 #include <ctype.h>
 #include <string.h>
+#include <zlib.h>
 #include "contigs.h"
 #include "mut.h"
 #include "mut_txt.h"
@@ -71,33 +72,33 @@ uint8_t nst_nt4_table[256] = {
     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
 };
 
-#define __gen_read(x, start, iter) do {									\
-    for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) {	\
-        mut_t c = currseq->s[i], mut_type = c & mutmsk;			\
-        if (ext_coor[x] < 0) {								\
+#define __gen_read(x, start, iter) do {                                    \
+    for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) {    \
+        mut_t c = currseq->s[i], mut_type = c & mutmsk;            \
+        if (ext_coor[x] < 0) {                                \
             if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
-            ext_coor[x] = i;								\
+            ext_coor[x] = i;                                \
             if(1 == strand[x]) ext_coor[x] -= s[x]-1; \
-        }													\
+        }                                                    \
         if (mut_type == DELETE) { \
-            ++n_indel[x];				\
+            ++n_indel[x];                \
             if(1 == strand[x]) ext_coor[x]--; \
             if(0 == k) n_indel_first[x]++; \
         } \
         else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
-            tmp_seq[x][k++] = c & 0xf;						\
+            tmp_seq[x][k++] = c & 0xf;                        \
             if (mut_type == SUBSTITUTE) { \
-                ++n_sub[x];			\
+                ++n_sub[x];            \
                 if(0 == k) n_sub_first[x]++; \
-            } 												\
-        } else {											\
-            mut_t n, ins;										\
+            }                                                 \
+        } else {                                            \
+            mut_t n, ins;                                        \
             assert(mut_type == INSERT); \
-            ++n_indel[x];									\
-            n_indel_first[x]++;							\
+            ++n_indel[x];                                    \
+            n_indel_first[x]++;                            \
             if(1 == mut_get_ins(currseq, i, &n, &ins)) { \
                 if(0 == strand[x]) { \
-                    if(k < s[x]) tmp_seq[x][k++] = c & 0xf;						\
+                    if(k < s[x]) tmp_seq[x][k++] = c & 0xf;                        \
                     while(n > 0 && k < s[x]) { \
                         tmp_seq[x][k++] = ins & 0x3;                \
                         --n, ins >>= 2; \
@@ -108,7 +109,7 @@ uint8_t nst_nt4_table[256] = {
                         tmp_seq[x][k++] = (ins >> ((n-1) << 1) & 0x3);                \
                         --n; \
                     } \
-                    if (k < s[x]) tmp_seq[x][k++] = c & 0xf;						\
+                    if (k < s[x]) tmp_seq[x][k++] = c & 0xf;                        \
                 } \
             } else { \
                 int32_t byte_index, bit_index; \
@@ -117,7 +118,7 @@ uint8_t nst_nt4_table[256] = {
                 insertion = mut_get_ins_long_n(currseq->ins[ins], &num_ins); \
                 if(0 == strand[x]) { \
                     byte_index = mut_packed_len(num_ins) - 1; bit_index = (num_ins-1) & 0x3 ; \
-                    if(k < s[x]) tmp_seq[x][k++] = c & 0xf;						\
+                    if(k < s[x]) tmp_seq[x][k++] = c & 0xf;                        \
                     while(num_ins > 0 && k < s[x]) { \
                         assert(0 <= byte_index); \
                         tmp_seq[x][k++] = (insertion[byte_index] >> (bit_index << 1)) & 0x3;                \
@@ -140,19 +141,18 @@ uint8_t nst_nt4_table[256] = {
                             byte_index++; \
                         } \
                     } \
-                    if (k < s[x]) tmp_seq[x][k++] = c & 0xf;						\
+                    if (k < s[x]) tmp_seq[x][k++] = c & 0xf;                        \
                 } \
-            }													\
+            }                                                    \
         } \
-    }														\
-    if (k != s[x]) ext_coor[x] = -10;						\
+    }                                                        \
+    if (k != s[x]) ext_coor[x] = -10;                        \
     if (1 == strand[x]) { \
         for (k = 0; k < s[x]; ++k) tmp_seq[x][k] = tmp_seq[x][k] < 4? 3 - tmp_seq[x][k] : 4; \
-    } 														\
+    }                                                         \
 } while (0)
 
 /* Simple normal random number generator, copied from genran.c */
-
 double ran_normal()
 { 
   static int iset = 0; 
@@ -174,6 +174,11 @@ double ran_normal()
   }
 }
 
+void unreachable(char *message) {
+    fprintf(stderr, "\n[dwgsim_core] Error: %s\n", message);
+    exit(1);
+}
+
 int32_t ran_num(double prob, int32_t n)
 {
   int32_t i, r;
@@ -532,8 +537,7 @@ void dwgsim_core(dwgsim_opt_t * opt)
               }
           }
       }
-
-      if(0 == opt->muts_only) {
+      else {
           if(0 == n_ref && opt->C < 0) {
               n_pairs = opt->N - n_sim;
           }
@@ -605,6 +609,7 @@ void dwgsim_core(dwgsim_opt_t * opt)
               continue;
           }
           else if (n_pairs < 0) { // NB: this should not happen
+              fprintf(stderr, "[dwgsim_core] #4 skip sequence '%s' as not enough pairs found\n", name);
               // not enough pairs
               continue;
           }
@@ -617,6 +622,7 @@ void dwgsim_core(dwgsim_opt_t * opt)
       mut_print(name, &seq, mutseq[0], mutseq[1], opt->fp_mut, opt->fp_vcf);
 
       if(0 == opt->muts_only) {
+          int num_failed = 0;
           for (ii = 0; ii != n_pairs; ++ii, ++ctr) { // the core loop
               if(0 == (ctr % 10000)) {
                   fprintf(stderr, "\r[dwgsim_core] %llu",
@@ -685,23 +691,27 @@ void dwgsim_core(dwgsim_opt_t * opt)
                   n_sub_first[0] = n_sub_first[1] = n_indel_first[0] = n_indel_first[1] = n_err_first[0] = n_err_first[1] = 0;
                   num_n[0]=num_n[1]=0;
 
-                  // strand
-                  if(2 == opt->strandedness || (0 == opt->strandedness && ILLUMINA == opt->data_type)) {
-                      // opposite strand by default for Illumina
-                      strand[0] = 0; strand[1] = 1; 
-                  }
-                  else if(1 == opt->strandedness || (0 == opt->strandedness && (SOLID == opt->data_type || IONTORRENT == opt->data_type))) {
-                      // same strands by default for SOLiD
-                      strand[0] = 0; strand[1] = 0; 
-                  }
-                  else {
-                      // should not reach here
-                      assert(1 == 0);
+                  // set read one's strand
+                  switch(opt->read_one_strand) {
+                      case 0: strand[0] = (drand48() < 0.5) ? 1 : 0; break;
+                      case 1: strand[0] = 0; break;
+                      case 2: strand[0] = 1; break;
+                      default: unreachable("read strand was not between 0-2");
                   }
-                  if (drand48() < 0.5) { // which strand ?
-                      // Flip strands 
-                      strand[0] = (1 + strand[0]) % 2;
-                      strand[1] = (1 + strand[1]) % 2;
+
+                  // set read two's strand
+                  switch(opt->strandedness) {
+                      case 0: // default to data type
+                          switch (opt->data_type) {
+                              case ILLUMINA: strand[1] = 1 - strand[0]; break; // paired end for Illumina (opposite strand)
+                              case SOLID: 
+                              case IONTORRENT: strand[1] = strand[0]; break; // mate pair for SOLiD and IonTorrent (same strand)
+                              default: unreachable("data type was not between 0-2");
+                          }
+                          break;
+                      case 1: strand[1] = strand[0]; break; // mate pair (same strand)
+                      case 2: strand[1] = 1 - strand[0]; break; // paired end (opposite strand)
+                      default: unreachable("strandedness was not between 0-2");
                   }
 
                   // generate the reads in base space
@@ -712,12 +722,8 @@ void dwgsim_core(dwgsim_opt_t * opt)
                                * 5' E2 -----> .... E1 -----> 3'
                                * 3'           ....           5'
                                */
-                              if(0 == opt->is_inner) {
-                                  __gen_read(0, pos + d - s[0], ++i); 
-                              }
-                              else {
-                                  __gen_read(0, pos + s[1] + d, ++i); 
-                              }
+                              int r0_pos = (0 == opt->is_inner) ? (pos + d - s[0]) : (pos + s[1] + d - 1);
+                              __gen_read(0, r0_pos, ++i); 
                               __gen_read(1, pos, ++i);
                           }
                           else { // - strand
@@ -725,13 +731,9 @@ void dwgsim_core(dwgsim_opt_t * opt)
                                * 3'           ....            5'
                                * 5' <----- E1 .... <----- E2  3'
                                */
-                              __gen_read(0, pos + s[0], --i);
-                              if(0 == opt->is_inner) {
-                                  __gen_read(1, pos + d, --i);
-                              }
-                              else {
-                                  __gen_read(1, pos + s[0] + d + s[1], --i);
-                              }
+                              int r1_pos = (0 == opt->is_inner) ? (pos + d - 1) : (pos + s[0] + d + s[1] - 1);
+                              __gen_read(0, pos + s[0] - 1, --i);
+                              __gen_read(1, r1_pos, --i);
                           }
                       }
                       else { // opposite strand
@@ -740,25 +742,17 @@ void dwgsim_core(dwgsim_opt_t * opt)
                                * 5' E1 -----> ....           3'
                                * 3'           .... <----- E2 5'
                                */
+                              int r1_pos = (0 == opt->is_inner) ? (pos + d - 1) : (pos + s[0] + d + s[1] - 1);
                               __gen_read(0, pos, ++i);
-                              if(0 == opt->is_inner) {
-                                  __gen_read(1, pos + d, --i);
-                              }
-                              else {
-                                  __gen_read(1, pos + s[0] + d + s[1], --i);
-                              }
+                              __gen_read(1, r1_pos, --i);
                           }
                           else { // - strand
                               /*
                                * 5' E2 -----> ....           3'
                                * 3'           .... <----- E1 5'
                                */
-                              if(0 == opt->is_inner) {
-                                  __gen_read(0, pos + d, --i);
-                              }
-                              else {
-                                  __gen_read(0, pos + s[1] + d + s[0], --i); 
-                              }
+                              int r0_pos = (0 == opt->is_inner) ? (pos + d - 1) : (pos + s[1] + d + s[0] - 1); 
+                              __gen_read(0, r0_pos, --i);
                               __gen_read(1, pos, i++);
                           }
                       }
@@ -785,8 +779,14 @@ void dwgsim_core(dwgsim_opt_t * opt)
                   if (ext_coor[0] < 0 || ext_coor[1] < 0 || opt->max_n < num_n[0] || opt->max_n < num_n[1]) { // fail to generate the read(s)
                       --ii;
                       --ctr;
+                      num_failed++;
+                      if (num_failed > 10000) {
+                          fprintf(stderr, "\r[dwgsim_core] failed to generate a read after %d trials\n", num_failed);
+                          exit(1);
+                      }
                       continue;
                   }
+                  num_failed = 0;
 
                   if(SOLID == opt->data_type) {
                       // Convert to color sequence, use the first base as the adaptor
@@ -858,61 +858,65 @@ void dwgsim_core(dwgsim_opt_t * opt)
                       qstr[i] = 0;
                       // BWA
                       FILE *fpo = (0 == j) ? opt->fp_bwa1: opt->fp_bwa2;
-                      if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
-                          fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n", 
-                                  (NULL == opt->read_prefix) ? "" : opt->read_prefix,
-                                  (NULL == opt->read_prefix) ? "" : "_",
-                                  name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0,
-                                  n_err[0], n_sub[0], n_indel[0],
-                                  n_err[1], n_sub[1],n_indel[1],
-                                  (unsigned long long)ii, j+1);
-                          for (i = 0; i < s[j]; ++i)
-                            fputc("ACGTN"[(int)tmp_seq[j][i]], fpo);
-                          fprintf(fpo, "\n+\n%s\n", qstr);
+                      if (NULL != fpo) {
+                        if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
+                            gzprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n", 
+                                    (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+                                    (NULL == opt->read_prefix) ? "" : "_",
+                                    name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0,
+                                    n_err[0], n_sub[0], n_indel[0],
+                                    n_err[1], n_sub[1],n_indel[1],
+                                    (unsigned long long)ii, j+1);
+                            for (i = 0; i < s[j]; ++i)
+                              gzputc(fpo, "ACGTN"[(int)tmp_seq[j][i]]);
+                            gzprintf(fpo, "\n+\n%s\n", qstr);
+                        }
+                        else {
+                            // Note: BWA ignores the adapter and the first color, so this is a misrepresentation 
+                            // in samtools.  We must first skip the first color.  Basically, a 50 color read is a 
+                            // 49 color read for BWA.
+                            //
+                            // Note: BWA outputs F3 to read1, annotated as read "2", and outputs R3 to read2,
+                            // annotated as read "1".
+                            gzprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n", 
+                                    (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+                                    (NULL == opt->read_prefix) ? "" : "_",
+                                    name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0,
+                                    n_err[0] - n_err_first[0], n_sub[0] - n_sub_first[0], n_indel[0] - n_indel_first[0], 
+                                    n_err[1] - n_err_first[1], n_sub[1] - n_sub_first[1], n_indel[1] - n_indel_first[1],
+                                    (unsigned long long)ii, 2 - j);
+                            //gzputc(fpo, 'A');
+                            for (i = 1; i < s[j]; ++i)
+                              gzputc(fpo, "ACGTN"[(int)tmp_seq[j][i]]);
+                            gzprintf(fpo, "\n+\n");
+                            for (i = 1; i < s[j]; ++i) 
+                              gzputc(fpo, qstr[i]);
+                            gzprintf(fpo, "\n");
+                        }
                       }
-                      else {
-                          // Note: BWA ignores the adapter and the first color, so this is a misrepresentation 
-                          // in samtools.  We must first skip the first color.  Basically, a 50 color read is a 
-                          // 49 color read for BWA.
-                          //
-                          // Note: BWA outputs F3 to read1, annotated as read "2", and outputs R3 to read2,
-                          // annotated as read "1".
-                          fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n", 
+
+                      // BFAST output
+                      if (NULL != opt->fp_bfast) {
+                          gzprintf(opt->fp_bfast, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx\n", 
                                   (NULL == opt->read_prefix) ? "" : opt->read_prefix,
                                   (NULL == opt->read_prefix) ? "" : "_",
                                   name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0,
-                                  n_err[0] - n_err_first[0], n_sub[0] - n_sub_first[0], n_indel[0] - n_indel_first[0], 
-                                  n_err[1] - n_err_first[1], n_sub[1] - n_sub_first[1], n_indel[1] - n_indel_first[1],
-                                  (unsigned long long)ii, 2 - j);
-                          //fputc('A', fpo);
-                          for (i = 1; i < s[j]; ++i)
-                            fputc("ACGTN"[(int)tmp_seq[j][i]], fpo);
-                          fprintf(fpo, "\n+\n");
-                          for (i = 1; i < s[j]; ++i) 
-                            fputc(qstr[i], fpo);
-                          fprintf(fpo, "\n");
-                      }
-
-                      // BFAST output
-                      fprintf(opt->fp_bfast, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx\n", 
-                              (NULL == opt->read_prefix) ? "" : opt->read_prefix,
-                              (NULL == opt->read_prefix) ? "" : "_",
-                              name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0,
-                              n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
-                              (unsigned long long)ii);
-                      if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
-                          for (i = 0; i < s[j]; ++i)
-                            fputc("ACGTN"[(int)tmp_seq[j][i]], opt->fp_bfast);
-                          fprintf(opt->fp_bfast, "\n+\n%s\n", qstr);
-                      }
-                      else {
-                          fputc('A', opt->fp_bfast);
-                          for (i = 0; i < s[j]; ++i)
-                            fputc("01234"[(int)tmp_seq[j][i]], opt->fp_bfast);
-                          fprintf(opt->fp_bfast, "\n+\n");
-                          for (i = 0; i < s[j]; ++i) 
-                            fputc(qstr[i], opt->fp_bfast);
-                          fprintf(opt->fp_bfast, "\n");
+                                  n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
+                                  (unsigned long long)ii);
+                          if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
+                              for (i = 0; i < s[j]; ++i)
+                                gzputc(opt->fp_bfast, "ACGTN"[(int)tmp_seq[j][i]]);
+                              gzprintf(opt->fp_bfast, "\n+\n%s\n", qstr);
+                          }
+                          else {
+                              gzputc(opt->fp_bfast, 'A');
+                              for (i = 0; i < s[j]; ++i)
+                                gzputc(opt->fp_bfast, "01234"[(int)tmp_seq[j][i]]);
+                              gzprintf(opt->fp_bfast, "\n+\n");
+                              for (i = 0; i < s[j]; ++i) 
+                                gzputc(opt->fp_bfast, qstr[i]);
+                              gzprintf(opt->fp_bfast, "\n");
+                          }
                       }
                   }
               }
@@ -962,61 +966,65 @@ void dwgsim_core(dwgsim_opt_t * opt)
                       }
                       // BWA
                       FILE *fpo = (0 == j) ? opt->fp_bwa1: opt->fp_bwa2;
-                      if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
-                          fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n", 
-                                  (NULL == opt->read_prefix) ? "" : opt->read_prefix,
-                                  (NULL == opt->read_prefix) ? "" : "_",
-                                  "rand", 0, 0, 0, 0, 1, 1,
-                                  0, 0, 0, 0, 0, 0,
-                                  (unsigned long long)rand_ii,
-                                  j+1);
-                          for (i = 0; i < s[j]; ++i)
-                            fputc("ACGTN"[(int)tmp_seq[j][i]], fpo);
-                          fprintf(fpo, "\n+\n%s\n", qstr);
+                      if (NULL != fpo) {
+                          if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
+                              gzprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n", 
+                                      (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+                                      (NULL == opt->read_prefix) ? "" : "_",
+                                      "rand", 0, 0, 0, 0, 1, 1,
+                                      0, 0, 0, 0, 0, 0,
+                                      (unsigned long long)rand_ii,
+                                      j+1);
+                              for (i = 0; i < s[j]; ++i)
+                                gzputc(fpo, "ACGTN"[(int)tmp_seq[j][i]]);
+                              gzprintf(fpo, "\n+\n%s\n", qstr);
+                          }
+                          else {
+                              // Note: BWA ignores the adapter and the first color, so this is a misrepresentation 
+                              // in samtools.  We must first skip the first color.  Basically, a 50 color read is a 
+                              // 49 color read for BWA.
+                              //
+                              // Note: BWA outputs F3 to read1, annotated as read "2", and outputs R3 to read2,
+                              // annotated as read "1".
+                              gzprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n", 
+                                      (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+                                      (NULL == opt->read_prefix) ? "" : "_",
+                                      "rand", 0, 0, 0, 0, 1, 1,
+                                      0, 0, 0, 0, 0, 0,
+                                      (unsigned long long)rand_ii,
+                                      2 - j);
+                              //gzputc(fpo, 'A');
+                              for (i = 1; i < s[j]; ++i)
+                                gzputc(fpo, "ACGTN"[(int)tmp_seq[j][i]]);
+                              gzprintf(fpo, "\n+\n");
+                              for (i = 1; i < s[j]; ++i) 
+                                gzputc(fpo, qstr[i]);
+                              gzprintf(fpo, "\n");
+                          }
                       }
-                      else {
-                          // Note: BWA ignores the adapter and the first color, so this is a misrepresentation 
-                          // in samtools.  We must first skip the first color.  Basically, a 50 color read is a 
-                          // 49 color read for BWA.
-                          //
-                          // Note: BWA outputs F3 to read1, annotated as read "2", and outputs R3 to read2,
-                          // annotated as read "1".
-                          fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n", 
+
+                      // BFAST output
+                      if (NULL != opt->fp_bfast) {
+                          gzprintf(opt->fp_bfast, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx\n", 
                                   (NULL == opt->read_prefix) ? "" : opt->read_prefix,
                                   (NULL == opt->read_prefix) ? "" : "_",
                                   "rand", 0, 0, 0, 0, 1, 1,
                                   0, 0, 0, 0, 0, 0,
-                                  (unsigned long long)rand_ii,
-                                  2 - j);
-                          //fputc('A', fpo);
-                          for (i = 1; i < s[j]; ++i)
-                            fputc("ACGTN"[(int)tmp_seq[j][i]], fpo);
-                          fprintf(fpo, "\n+\n");
-                          for (i = 1; i < s[j]; ++i) 
-                            fputc(qstr[i], fpo);
-                          fprintf(fpo, "\n");
-                      }
-
-                      // BFAST output
-                      fprintf(opt->fp_bfast, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx\n", 
-                              (NULL == opt->read_prefix) ? "" : opt->read_prefix,
-                              (NULL == opt->read_prefix) ? "" : "_",
-                              "rand", 0, 0, 0, 0, 1, 1,
-                              0, 0, 0, 0, 0, 0,
-                              (unsigned long long)rand_ii);
-                      if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
-                          for (i = 0; i < s[j]; ++i)
-                            fputc("ACGTN"[(int)tmp_seq[j][i]], opt->fp_bfast);
-                          fprintf(opt->fp_bfast, "\n+\n%s\n", qstr);
-                      }
-                      else {
-                          fputc('A', opt->fp_bfast);
-                          for (i = 0; i < s[j]; ++i)
-                            fputc("01234"[(int)tmp_seq[j][i]], opt->fp_bfast);
-                          fprintf(opt->fp_bfast, "\n+\n");
-                          for (i = 0; i < s[j]; ++i) 
-                            fputc(qstr[i], opt->fp_bfast);
-                          fprintf(opt->fp_bfast, "\n");
+                                  (unsigned long long)rand_ii);
+                          if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
+                              for (i = 0; i < s[j]; ++i)
+                                gzputc(opt->fp_bfast, "ACGTN"[(int)tmp_seq[j][i]]);
+                              gzprintf(opt->fp_bfast, "\n+\n%s\n", qstr);
+                          }
+                          else {
+                              gzputc(opt->fp_bfast, 'A');
+                              for (i = 0; i < s[j]; ++i)
+                                gzputc(opt->fp_bfast, "01234"[(int)tmp_seq[j][i]]);
+                              gzprintf(opt->fp_bfast, "\n+\n");
+                              for (i = 0; i < s[j]; ++i) 
+                                gzputc(opt->fp_bfast, qstr[i]);
+                              gzprintf(opt->fp_bfast, "\n");
+                          }
                       }
                   }
                   rand_ii++;
@@ -1063,7 +1071,7 @@ int main(int argc, char *argv[])
   }
 
   // Open files
-  opt->fp_fa =	xopen(argv[optind+0], "r");
+  opt->fp_fa = xopen(argv[optind+0], "r");
   strcpy(fn_fai, argv[optind+0]); strcat(fn_fai, ".fai");
   opt->fp_fai = fopen(fn_fai, "r"); // NB: depends on returning NULL;
   strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".mutations.txt");
@@ -1071,12 +1079,16 @@ int main(int argc, char *argv[])
   strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".mutations.vcf");
   opt->fp_vcf = xopen(fn_tmp, "w");
   if(0 == opt->muts_only) {
-      strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bfast.fastq");
-      opt->fp_bfast = xopen(fn_tmp, "w");
-      strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bwa.read1.fastq");
-      opt->fp_bwa1 = xopen(fn_tmp, "w");
-      strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bwa.read2.fastq");
-      opt->fp_bwa2 = xopen(fn_tmp, "w");
+      if (opt->output_type != OUTPUT_TYPE_BWA) {
+          strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bfast.fastq.gz");
+          opt->fp_bfast = gzopen(fn_tmp, "w");
+      }
+      if (opt->output_type != OUTPUT_TYPE_BFAST) {
+          strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bwa.read1.fastq.gz");
+          opt->fp_bwa1 = gzopen(fn_tmp, "w");
+          strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bwa.read2.fastq.gz");
+          opt->fp_bwa2 = gzopen(fn_tmp, "w");
+      }
   }
 
   // Run simulation
@@ -1084,7 +1096,13 @@ int main(int argc, char *argv[])
 
   // Close files
   if(0 == opt->muts_only) {
-      fclose(opt->fp_fa); fclose(opt->fp_bfast); fclose(opt->fp_bwa1); fclose(opt->fp_bwa2); 
+      if (opt->output_type != OUTPUT_TYPE_BWA) {
+          gzclose(opt->fp_bfast); 
+      }
+      if (opt->output_type != OUTPUT_TYPE_BFAST) {
+          gzclose(opt->fp_bwa1); gzclose(opt->fp_bwa2); 
+      }
+      fclose(opt->fp_fa); 
   }
   if(NULL != opt->fp_fai) fclose(opt->fp_fai);
   fclose(opt->fp_mut);


=====================================
src/dwgsim_opt.c
=====================================
@@ -32,6 +32,7 @@
 #include <stdint.h>
 #include <ctype.h>
 #include <string.h>
+#include <zlib.h>
 #include "mut.h"
 #include "dwgsim.h"
 #include "dwgsim_opt.h"
@@ -56,6 +57,7 @@ dwgsim_opt_t* dwgsim_opt_init()
   opt->rand_read = 0.05;
   opt->data_type = ILLUMINA;
   opt->strandedness = 0;
+  opt->read_one_strand = 0;
   opt->max_n = 0;
   opt->flow_order = NULL;
   opt->flow_order_len = 0;
@@ -67,7 +69,8 @@ dwgsim_opt_t* dwgsim_opt_init()
   opt->fn_muts_input = NULL;
   opt->fn_muts_input_type = -1;
   opt->fn_regions_bed = NULL;
-  opt->fp_mut = opt->fp_bfast = opt->fp_bwa1 = opt->fp_bwa2 = NULL;
+  opt->fp_mut = NULL;
+  opt->fp_bfast = opt->fp_bwa1 = opt->fp_bwa2 = NULL;
   opt->fp_fa = opt->fp_fai = NULL;
   opt->read_prefix = NULL;
 
@@ -106,7 +109,7 @@ int dwgsim_opt_usage(dwgsim_opt_t *opt)
   fprintf(stderr, "         -r FLOAT      rate of mutations [%.4f]\n", opt->mut_rate);
   fprintf(stderr, "         -F FLOAT      frequency of given mutation to simulate low fequency somatic mutations [%.4f]\n", opt->mut_freq);
   fprintf(stderr, "                           NB: freqeuncy F refers to the first strand of mutation, therefore mutations \n"); 
-  fprintf(stderr, "                           on the second strand occour with a frequency of 1-F \n");
+  fprintf(stderr, "                           on the second strand occur with a frequency of 1-F \n");
   fprintf(stderr, "         -R FLOAT      fraction of mutations that are indels [%.2f]\n", opt->indel_frac);
   fprintf(stderr, "         -X FLOAT      probability an indel is extended [%.2f]\n", opt->indel_extend);
   fprintf(stderr, "         -I INT        the minimum length indel [%d]\n", opt->indel_min);
@@ -116,10 +119,14 @@ int dwgsim_opt_usage(dwgsim_opt_t *opt)
   fprintf(stderr, "                           0: Illumina\n");
   fprintf(stderr, "                           1: SOLiD\n");
   fprintf(stderr, "                           2: Ion Torrent\n");
-  fprintf(stderr, "         -S INT        generate reads [%d]:\n", opt->strandedness);
+  fprintf(stderr, "         -S INT        generate paired end reads with orientation [%d]:\n", opt->strandedness);
   fprintf(stderr, "                           0: default (opposite strand for Illumina, same strand for SOLiD/Ion Torrent)\n");
   fprintf(stderr, "                           1: same strand (mate pair)\n");
   fprintf(stderr, "                           2: opposite strand (paired end)\n");
+  fprintf(stderr, "         -A INT        generate paired end reads with read one [%d]:\n", opt->read_one_strand);
+  fprintf(stderr, "                           0: default (both, random)\n");
+  fprintf(stderr, "                           1: forward genomic strand\n");
+  fprintf(stderr, "                           2: reverse genomic strand\n");
   fprintf(stderr, "         -f STRING     the flow order for Ion Torrent data [%s]\n", (char*)opt->flow_order);
   fprintf(stderr, "         -B            use a per-base error rate for Ion Torrent data [%s]\n", __IS_TRUE(opt->use_base_error));
   fprintf(stderr, "         -H            haploid mode [%s]\n", __IS_TRUE(opt->is_hap));
@@ -133,6 +140,10 @@ int dwgsim_opt_usage(dwgsim_opt_t *opt)
   fprintf(stderr, "         -q STRING     a fixed base quality to apply (single character) [%s]\n", (NULL == opt->fixed_quality) ? "not using" : opt->fixed_quality);
   fprintf(stderr, "         -Q FLOAT      standard deviation of the base quality scores [%.2lf]\n", (NULL == opt->fixed_quality) ? opt->quality_std : 0.0);
   fprintf(stderr, "         -s INT        standard deviation of the distance for pairs [%.3f]\n", opt->std_dev);
+  fprintf(stderr, "         -o INT        output type for the FASTQ files [%d]:\n", opt->output_type);
+  fprintf(stderr, "                           0: interleaved (bfast) and per-read-end (bwa)\n");
+  fprintf(stderr, "                           1: per-read-end (bwa) only\n");
+  fprintf(stderr, "                           2: interleaved (bfast) only\n");
   fprintf(stderr, "         -h            print this message\n");
   fprintf(stderr, "\n");
   fprintf(stderr, "Note: For SOLiD mate pair reads and BFAST, the first read is F3 and the second is R3. For SOLiD mate pair reads\n");
@@ -192,7 +203,7 @@ dwgsim_opt_parse(dwgsim_opt_t *opt, int argc, char *argv[])
   int c;
   int muts_input_type = 0;
   
-  while ((c = getopt(argc, argv, "id:s:N:C:1:2:e:E:r:F:R:X:I:c:S:n:y:BHf:z:Mm:b:v:x:P:q:Q:h")) >= 0) {
+  while ((c = getopt(argc, argv, "id:s:N:C:1:2:e:E:r:F:R:X:I:c:S:A:n:y:BHf:z:Mm:b:v:x:P:q:Q:o:h")) >= 0) {
       switch (c) {
         case 'i': opt->is_inner = 1; break;
         case 'd': opt->dist = dwgsim_atoi(optarg, 'd', 0); break;
@@ -210,6 +221,7 @@ dwgsim_opt_parse(dwgsim_opt_t *opt, int argc, char *argv[])
         case 'I': opt->indel_min = dwgsim_atoi(optarg, 'I', 0); break;
         case 'c': opt->data_type = dwgsim_atoi(optarg, 'c', 0); break;
         case 'S': opt->strandedness = dwgsim_atoi(optarg, 'S', 0); break;
+        case 'A': opt->read_one_strand = dwgsim_atoi(optarg, 'A', 0); break;
         case 'n': opt->max_n = dwgsim_atoi(optarg, 'n', 0); break;
         case 'y': opt->rand_read = atof(optarg); break;
         case 'f': 
@@ -228,6 +240,7 @@ dwgsim_opt_parse(dwgsim_opt_t *opt, int argc, char *argv[])
         case 'P': free(opt->read_prefix); opt->read_prefix = strdup(optarg); break;
         case 'q': opt->fixed_quality = strdup(optarg); break;
         case 'Q': opt->quality_std = atof(optarg); break;
+        case 'o': opt->output_type = atoi(optarg); break;
         default: fprintf(stderr, "Unrecognized option: -%c\n", c); return 0;
       }
   }
@@ -277,6 +290,7 @@ dwgsim_opt_parse(dwgsim_opt_t *opt, int argc, char *argv[])
   __check_option(opt->indel_min, 1, INT32_MAX, "-I");
   __check_option(opt->data_type, 0, 2, "-c");
   __check_option(opt->strandedness, 0, 2, "-S");
+  __check_option(opt->read_one_strand, 0, 2, "-A");
   __check_option(opt->max_n, 0, INT32_MAX, "-n");
   __check_option(opt->rand_read, 0, 1.0, "-y");
   if(IONTORRENT == opt->data_type && NULL == opt->flow_order) {
@@ -296,17 +310,7 @@ dwgsim_opt_parse(dwgsim_opt_t *opt, int argc, char *argv[])
       fprintf(stderr, "Warning: remember to use the -P option with dwgsim_eval\n");
   }
 
-  if(0 < opt->length[1]){ //paired end / mate pair
-    int s = opt->length[0] + opt->length[1] + opt->is_inner;
-    double r = (s-opt->dist)*1.0/opt->std_dev;
-    if(r>6){
-      fprintf(stderr, "Error: command line option -d is too small for the read length (%d)\n",opt->dist);
-      return 0;
-    }
-    if(r>4){
-      fprintf(stderr, "Warning: command line option -d is small for the read length (%d). Generation speed could be affected.\n",opt->dist);
-    }
-  }
+  __check_option(opt->output_type, 0, 2, "-o");
 
   switch(muts_input_type) {
     case 0x0:


=====================================
src/dwgsim_opt.h
=====================================
@@ -1,8 +1,15 @@
 #ifndef DWGSIM_OPT_H
 #define DWGSIM_OPT_H
 
+#include "zlib.h"
+
 #define ERROR_RATE_NUM_RANDOM_READS 1000000
 
+#define OUTPUT_TYPE_ALL 0
+#define OUTPUT_TYPE_BWA 1
+#define OUTPUT_TYPE_BFAST 2
+
+
 typedef struct {
     double start, by, end;
 } error_t;
@@ -24,6 +31,7 @@ typedef struct {
     int32_t max_n;
     int32_t data_type;
     int32_t strandedness;
+    int32_t read_one_strand;
     int8_t *flow_order;
     int32_t flow_order_len;
     int32_t use_base_error;
@@ -37,12 +45,13 @@ typedef struct {
     char *fn_regions_bed;
     FILE *fp_mut;
     FILE *fp_vcf;
-    FILE *fp_bfast;
-    FILE *fp_bwa1;
-    FILE *fp_bwa2;
+    gzFile *fp_bfast;
+    gzFile *fp_bwa1;
+    gzFile *fp_bwa2;
     FILE *fp_fa;
     FILE *fp_fai;
     char *read_prefix;
+    int32_t output_type;
 } dwgsim_opt_t;
 
 dwgsim_opt_t* 


=====================================
src/mut_txt.c
=====================================
@@ -58,7 +58,7 @@ muts_txt_t *muts_txt_init(FILE *fp, contigs_t *c)
           prev_pos = 0;
       }
       if(c->n == i) {
-          fprintf(stderr, "Error: contig not found [%s]\n", name);
+          fprintf(stderr, "Error: mutation contig not found or out of order [%s]\n", name);
           exit(1);
       }
       else if(pos <= 0 || c->contigs[i].len < pos) {


=====================================
testdata/ex1.test.bfast.fastq.gz
=====================================
Binary files a/testdata/ex1.test.bfast.fastq.gz and b/testdata/ex1.test.bfast.fastq.gz differ


=====================================
testdata/ex1.test.bwa.read1.fastq.gz
=====================================
Binary files a/testdata/ex1.test.bwa.read1.fastq.gz and b/testdata/ex1.test.bwa.read1.fastq.gz differ


=====================================
testdata/ex1.test.bwa.read2.fastq.gz
=====================================
Binary files a/testdata/ex1.test.bwa.read2.fastq.gz and b/testdata/ex1.test.bwa.read2.fastq.gz differ


=====================================
testdata/test.sh
=====================================
@@ -15,14 +15,17 @@ popd
 mkdir tmp
 
 # Generate the new test data
-./dwgsim -z 13 -N 10000 samtools/examples/ex1.fa ex1.test
+./dwgsim -z 13 -N 10000 samtools/examples/ex1.fa tmp/ex1.test
 
 # Test the differences
-for FILE in $(ls -1 ex1.test*)
+for GZFILE in $(ls -1 tmp/ex1.test*gz)
 do 
-	diff -q ${FILE} ${TESTDATADIR}/${FILE}
+	gunzip $GZFILE;
+    FILE=$(basename $GZFILE .gz);
+	diff -q tmp/${FILE} ${TESTDATADIR}/${FILE}
 done
 
 # Clean up the testdata
 find ${TESTDATADIR} \! -name "*gz" -type f | grep -v sh$ | xargs rm; 
-rm -r tmp;
+
+rm -r tmp



View it on GitLab: https://salsa.debian.org/med-team/dwgsim/-/commit/386d48d63429e49e80d06b119dfa4689747ea353

-- 
View it on GitLab: https://salsa.debian.org/med-team/dwgsim/-/commit/386d48d63429e49e80d06b119dfa4689747ea353
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220116/26e1bbf6/attachment-0001.htm>


More information about the debian-med-commit mailing list