[med-svn] [Git][med-team/dwgsim][upstream] New upstream version 0.1.13
Andreas Tille (@tille)
gitlab at salsa.debian.org
Sun Jan 16 16:16:01 GMT 2022
Andreas Tille pushed to branch upstream at Debian Med / dwgsim
Commits:
386d48d6 by Andreas Tille at 2022-01-16T17:01:25+01:00
New upstream version 0.1.13
- - - - -
9 changed files:
- Makefile
- src/dwgsim.c
- src/dwgsim_opt.c
- src/dwgsim_opt.h
- src/mut_txt.c
- testdata/ex1.test.bfast.fastq.gz
- testdata/ex1.test.bwa.read1.fastq.gz
- testdata/ex1.test.bwa.read2.fastq.gz
- testdata/test.sh
Changes:
=====================================
Makefile
=====================================
@@ -1,4 +1,4 @@
-PACKAGE_VERSION="0.1.11"
+PACKAGE_VERSION="0.1.13"
CC= gcc
CFLAGS= -g -Wall -O3 #-m64 #-arch ppc
DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -DPACKAGE_VERSION=\\\"${PACKAGE_VERSION}\\\"
=====================================
src/dwgsim.c
=====================================
@@ -41,6 +41,7 @@
#include <stdint.h>
#include <ctype.h>
#include <string.h>
+#include <zlib.h>
#include "contigs.h"
#include "mut.h"
#include "mut_txt.h"
@@ -71,33 +72,33 @@ uint8_t nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
};
-#define __gen_read(x, start, iter) do { \
- for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \
- mut_t c = currseq->s[i], mut_type = c & mutmsk; \
- if (ext_coor[x] < 0) { \
+#define __gen_read(x, start, iter) do { \
+ for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \
+ mut_t c = currseq->s[i], mut_type = c & mutmsk; \
+ if (ext_coor[x] < 0) { \
if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
- ext_coor[x] = i; \
+ ext_coor[x] = i; \
if(1 == strand[x]) ext_coor[x] -= s[x]-1; \
- } \
+ } \
if (mut_type == DELETE) { \
- ++n_indel[x]; \
+ ++n_indel[x]; \
if(1 == strand[x]) ext_coor[x]--; \
if(0 == k) n_indel_first[x]++; \
} \
else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
- tmp_seq[x][k++] = c & 0xf; \
+ tmp_seq[x][k++] = c & 0xf; \
if (mut_type == SUBSTITUTE) { \
- ++n_sub[x]; \
+ ++n_sub[x]; \
if(0 == k) n_sub_first[x]++; \
- } \
- } else { \
- mut_t n, ins; \
+ } \
+ } else { \
+ mut_t n, ins; \
assert(mut_type == INSERT); \
- ++n_indel[x]; \
- n_indel_first[x]++; \
+ ++n_indel[x]; \
+ n_indel_first[x]++; \
if(1 == mut_get_ins(currseq, i, &n, &ins)) { \
if(0 == strand[x]) { \
- if(k < s[x]) tmp_seq[x][k++] = c & 0xf; \
+ if(k < s[x]) tmp_seq[x][k++] = c & 0xf; \
while(n > 0 && k < s[x]) { \
tmp_seq[x][k++] = ins & 0x3; \
--n, ins >>= 2; \
@@ -108,7 +109,7 @@ uint8_t nst_nt4_table[256] = {
tmp_seq[x][k++] = (ins >> ((n-1) << 1) & 0x3); \
--n; \
} \
- if (k < s[x]) tmp_seq[x][k++] = c & 0xf; \
+ if (k < s[x]) tmp_seq[x][k++] = c & 0xf; \
} \
} else { \
int32_t byte_index, bit_index; \
@@ -117,7 +118,7 @@ uint8_t nst_nt4_table[256] = {
insertion = mut_get_ins_long_n(currseq->ins[ins], &num_ins); \
if(0 == strand[x]) { \
byte_index = mut_packed_len(num_ins) - 1; bit_index = (num_ins-1) & 0x3 ; \
- if(k < s[x]) tmp_seq[x][k++] = c & 0xf; \
+ if(k < s[x]) tmp_seq[x][k++] = c & 0xf; \
while(num_ins > 0 && k < s[x]) { \
assert(0 <= byte_index); \
tmp_seq[x][k++] = (insertion[byte_index] >> (bit_index << 1)) & 0x3; \
@@ -140,19 +141,18 @@ uint8_t nst_nt4_table[256] = {
byte_index++; \
} \
} \
- if (k < s[x]) tmp_seq[x][k++] = c & 0xf; \
+ if (k < s[x]) tmp_seq[x][k++] = c & 0xf; \
} \
- } \
+ } \
} \
- } \
- if (k != s[x]) ext_coor[x] = -10; \
+ } \
+ if (k != s[x]) ext_coor[x] = -10; \
if (1 == strand[x]) { \
for (k = 0; k < s[x]; ++k) tmp_seq[x][k] = tmp_seq[x][k] < 4? 3 - tmp_seq[x][k] : 4; \
- } \
+ } \
} while (0)
/* Simple normal random number generator, copied from genran.c */
-
double ran_normal()
{
static int iset = 0;
@@ -174,6 +174,11 @@ double ran_normal()
}
}
+void unreachable(char *message) {
+ fprintf(stderr, "\n[dwgsim_core] Error: %s\n", message);
+ exit(1);
+}
+
int32_t ran_num(double prob, int32_t n)
{
int32_t i, r;
@@ -532,8 +537,7 @@ void dwgsim_core(dwgsim_opt_t * opt)
}
}
}
-
- if(0 == opt->muts_only) {
+ else {
if(0 == n_ref && opt->C < 0) {
n_pairs = opt->N - n_sim;
}
@@ -605,6 +609,7 @@ void dwgsim_core(dwgsim_opt_t * opt)
continue;
}
else if (n_pairs < 0) { // NB: this should not happen
+ fprintf(stderr, "[dwgsim_core] #4 skip sequence '%s' as not enough pairs found\n", name);
// not enough pairs
continue;
}
@@ -617,6 +622,7 @@ void dwgsim_core(dwgsim_opt_t * opt)
mut_print(name, &seq, mutseq[0], mutseq[1], opt->fp_mut, opt->fp_vcf);
if(0 == opt->muts_only) {
+ int num_failed = 0;
for (ii = 0; ii != n_pairs; ++ii, ++ctr) { // the core loop
if(0 == (ctr % 10000)) {
fprintf(stderr, "\r[dwgsim_core] %llu",
@@ -685,23 +691,27 @@ void dwgsim_core(dwgsim_opt_t * opt)
n_sub_first[0] = n_sub_first[1] = n_indel_first[0] = n_indel_first[1] = n_err_first[0] = n_err_first[1] = 0;
num_n[0]=num_n[1]=0;
- // strand
- if(2 == opt->strandedness || (0 == opt->strandedness && ILLUMINA == opt->data_type)) {
- // opposite strand by default for Illumina
- strand[0] = 0; strand[1] = 1;
- }
- else if(1 == opt->strandedness || (0 == opt->strandedness && (SOLID == opt->data_type || IONTORRENT == opt->data_type))) {
- // same strands by default for SOLiD
- strand[0] = 0; strand[1] = 0;
- }
- else {
- // should not reach here
- assert(1 == 0);
+ // set read one's strand
+ switch(opt->read_one_strand) {
+ case 0: strand[0] = (drand48() < 0.5) ? 1 : 0; break;
+ case 1: strand[0] = 0; break;
+ case 2: strand[0] = 1; break;
+ default: unreachable("read strand was not between 0-2");
}
- if (drand48() < 0.5) { // which strand ?
- // Flip strands
- strand[0] = (1 + strand[0]) % 2;
- strand[1] = (1 + strand[1]) % 2;
+
+ // set read two's strand
+ switch(opt->strandedness) {
+ case 0: // default to data type
+ switch (opt->data_type) {
+ case ILLUMINA: strand[1] = 1 - strand[0]; break; // paired end for Illumina (opposite strand)
+ case SOLID:
+ case IONTORRENT: strand[1] = strand[0]; break; // mate pair for SOLiD and IonTorrent (same strand)
+ default: unreachable("data type was not between 0-2");
+ }
+ break;
+ case 1: strand[1] = strand[0]; break; // mate pair (same strand)
+ case 2: strand[1] = 1 - strand[0]; break; // paired end (opposite strand)
+ default: unreachable("strandedness was not between 0-2");
}
// generate the reads in base space
@@ -712,12 +722,8 @@ void dwgsim_core(dwgsim_opt_t * opt)
* 5' E2 -----> .... E1 -----> 3'
* 3' .... 5'
*/
- if(0 == opt->is_inner) {
- __gen_read(0, pos + d - s[0], ++i);
- }
- else {
- __gen_read(0, pos + s[1] + d, ++i);
- }
+ int r0_pos = (0 == opt->is_inner) ? (pos + d - s[0]) : (pos + s[1] + d - 1);
+ __gen_read(0, r0_pos, ++i);
__gen_read(1, pos, ++i);
}
else { // - strand
@@ -725,13 +731,9 @@ void dwgsim_core(dwgsim_opt_t * opt)
* 3' .... 5'
* 5' <----- E1 .... <----- E2 3'
*/
- __gen_read(0, pos + s[0], --i);
- if(0 == opt->is_inner) {
- __gen_read(1, pos + d, --i);
- }
- else {
- __gen_read(1, pos + s[0] + d + s[1], --i);
- }
+ int r1_pos = (0 == opt->is_inner) ? (pos + d - 1) : (pos + s[0] + d + s[1] - 1);
+ __gen_read(0, pos + s[0] - 1, --i);
+ __gen_read(1, r1_pos, --i);
}
}
else { // opposite strand
@@ -740,25 +742,17 @@ void dwgsim_core(dwgsim_opt_t * opt)
* 5' E1 -----> .... 3'
* 3' .... <----- E2 5'
*/
+ int r1_pos = (0 == opt->is_inner) ? (pos + d - 1) : (pos + s[0] + d + s[1] - 1);
__gen_read(0, pos, ++i);
- if(0 == opt->is_inner) {
- __gen_read(1, pos + d, --i);
- }
- else {
- __gen_read(1, pos + s[0] + d + s[1], --i);
- }
+ __gen_read(1, r1_pos, --i);
}
else { // - strand
/*
* 5' E2 -----> .... 3'
* 3' .... <----- E1 5'
*/
- if(0 == opt->is_inner) {
- __gen_read(0, pos + d, --i);
- }
- else {
- __gen_read(0, pos + s[1] + d + s[0], --i);
- }
+ int r0_pos = (0 == opt->is_inner) ? (pos + d - 1) : (pos + s[1] + d + s[0] - 1);
+ __gen_read(0, r0_pos, --i);
__gen_read(1, pos, i++);
}
}
@@ -785,8 +779,14 @@ void dwgsim_core(dwgsim_opt_t * opt)
if (ext_coor[0] < 0 || ext_coor[1] < 0 || opt->max_n < num_n[0] || opt->max_n < num_n[1]) { // fail to generate the read(s)
--ii;
--ctr;
+ num_failed++;
+ if (num_failed > 10000) {
+ fprintf(stderr, "\r[dwgsim_core] failed to generate a read after %d trials\n", num_failed);
+ exit(1);
+ }
continue;
}
+ num_failed = 0;
if(SOLID == opt->data_type) {
// Convert to color sequence, use the first base as the adaptor
@@ -858,61 +858,65 @@ void dwgsim_core(dwgsim_opt_t * opt)
qstr[i] = 0;
// BWA
FILE *fpo = (0 == j) ? opt->fp_bwa1: opt->fp_bwa2;
- if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
- fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n",
- (NULL == opt->read_prefix) ? "" : opt->read_prefix,
- (NULL == opt->read_prefix) ? "" : "_",
- name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0,
- n_err[0], n_sub[0], n_indel[0],
- n_err[1], n_sub[1],n_indel[1],
- (unsigned long long)ii, j+1);
- for (i = 0; i < s[j]; ++i)
- fputc("ACGTN"[(int)tmp_seq[j][i]], fpo);
- fprintf(fpo, "\n+\n%s\n", qstr);
+ if (NULL != fpo) {
+ if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
+ gzprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n",
+ (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+ (NULL == opt->read_prefix) ? "" : "_",
+ name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0,
+ n_err[0], n_sub[0], n_indel[0],
+ n_err[1], n_sub[1],n_indel[1],
+ (unsigned long long)ii, j+1);
+ for (i = 0; i < s[j]; ++i)
+ gzputc(fpo, "ACGTN"[(int)tmp_seq[j][i]]);
+ gzprintf(fpo, "\n+\n%s\n", qstr);
+ }
+ else {
+ // Note: BWA ignores the adapter and the first color, so this is a misrepresentation
+ // in samtools. We must first skip the first color. Basically, a 50 color read is a
+ // 49 color read for BWA.
+ //
+ // Note: BWA outputs F3 to read1, annotated as read "2", and outputs R3 to read2,
+ // annotated as read "1".
+ gzprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n",
+ (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+ (NULL == opt->read_prefix) ? "" : "_",
+ name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0,
+ n_err[0] - n_err_first[0], n_sub[0] - n_sub_first[0], n_indel[0] - n_indel_first[0],
+ n_err[1] - n_err_first[1], n_sub[1] - n_sub_first[1], n_indel[1] - n_indel_first[1],
+ (unsigned long long)ii, 2 - j);
+ //gzputc(fpo, 'A');
+ for (i = 1; i < s[j]; ++i)
+ gzputc(fpo, "ACGTN"[(int)tmp_seq[j][i]]);
+ gzprintf(fpo, "\n+\n");
+ for (i = 1; i < s[j]; ++i)
+ gzputc(fpo, qstr[i]);
+ gzprintf(fpo, "\n");
+ }
}
- else {
- // Note: BWA ignores the adapter and the first color, so this is a misrepresentation
- // in samtools. We must first skip the first color. Basically, a 50 color read is a
- // 49 color read for BWA.
- //
- // Note: BWA outputs F3 to read1, annotated as read "2", and outputs R3 to read2,
- // annotated as read "1".
- fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n",
+
+ // BFAST output
+ if (NULL != opt->fp_bfast) {
+ gzprintf(opt->fp_bfast, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx\n",
(NULL == opt->read_prefix) ? "" : opt->read_prefix,
(NULL == opt->read_prefix) ? "" : "_",
name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0,
- n_err[0] - n_err_first[0], n_sub[0] - n_sub_first[0], n_indel[0] - n_indel_first[0],
- n_err[1] - n_err_first[1], n_sub[1] - n_sub_first[1], n_indel[1] - n_indel_first[1],
- (unsigned long long)ii, 2 - j);
- //fputc('A', fpo);
- for (i = 1; i < s[j]; ++i)
- fputc("ACGTN"[(int)tmp_seq[j][i]], fpo);
- fprintf(fpo, "\n+\n");
- for (i = 1; i < s[j]; ++i)
- fputc(qstr[i], fpo);
- fprintf(fpo, "\n");
- }
-
- // BFAST output
- fprintf(opt->fp_bfast, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx\n",
- (NULL == opt->read_prefix) ? "" : opt->read_prefix,
- (NULL == opt->read_prefix) ? "" : "_",
- name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0,
- n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
- (unsigned long long)ii);
- if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
- for (i = 0; i < s[j]; ++i)
- fputc("ACGTN"[(int)tmp_seq[j][i]], opt->fp_bfast);
- fprintf(opt->fp_bfast, "\n+\n%s\n", qstr);
- }
- else {
- fputc('A', opt->fp_bfast);
- for (i = 0; i < s[j]; ++i)
- fputc("01234"[(int)tmp_seq[j][i]], opt->fp_bfast);
- fprintf(opt->fp_bfast, "\n+\n");
- for (i = 0; i < s[j]; ++i)
- fputc(qstr[i], opt->fp_bfast);
- fprintf(opt->fp_bfast, "\n");
+ n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
+ (unsigned long long)ii);
+ if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
+ for (i = 0; i < s[j]; ++i)
+ gzputc(opt->fp_bfast, "ACGTN"[(int)tmp_seq[j][i]]);
+ gzprintf(opt->fp_bfast, "\n+\n%s\n", qstr);
+ }
+ else {
+ gzputc(opt->fp_bfast, 'A');
+ for (i = 0; i < s[j]; ++i)
+ gzputc(opt->fp_bfast, "01234"[(int)tmp_seq[j][i]]);
+ gzprintf(opt->fp_bfast, "\n+\n");
+ for (i = 0; i < s[j]; ++i)
+ gzputc(opt->fp_bfast, qstr[i]);
+ gzprintf(opt->fp_bfast, "\n");
+ }
}
}
}
@@ -962,61 +966,65 @@ void dwgsim_core(dwgsim_opt_t * opt)
}
// BWA
FILE *fpo = (0 == j) ? opt->fp_bwa1: opt->fp_bwa2;
- if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
- fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n",
- (NULL == opt->read_prefix) ? "" : opt->read_prefix,
- (NULL == opt->read_prefix) ? "" : "_",
- "rand", 0, 0, 0, 0, 1, 1,
- 0, 0, 0, 0, 0, 0,
- (unsigned long long)rand_ii,
- j+1);
- for (i = 0; i < s[j]; ++i)
- fputc("ACGTN"[(int)tmp_seq[j][i]], fpo);
- fprintf(fpo, "\n+\n%s\n", qstr);
+ if (NULL != fpo) {
+ if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
+ gzprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n",
+ (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+ (NULL == opt->read_prefix) ? "" : "_",
+ "rand", 0, 0, 0, 0, 1, 1,
+ 0, 0, 0, 0, 0, 0,
+ (unsigned long long)rand_ii,
+ j+1);
+ for (i = 0; i < s[j]; ++i)
+ gzputc(fpo, "ACGTN"[(int)tmp_seq[j][i]]);
+ gzprintf(fpo, "\n+\n%s\n", qstr);
+ }
+ else {
+ // Note: BWA ignores the adapter and the first color, so this is a misrepresentation
+ // in samtools. We must first skip the first color. Basically, a 50 color read is a
+ // 49 color read for BWA.
+ //
+ // Note: BWA outputs F3 to read1, annotated as read "2", and outputs R3 to read2,
+ // annotated as read "1".
+ gzprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n",
+ (NULL == opt->read_prefix) ? "" : opt->read_prefix,
+ (NULL == opt->read_prefix) ? "" : "_",
+ "rand", 0, 0, 0, 0, 1, 1,
+ 0, 0, 0, 0, 0, 0,
+ (unsigned long long)rand_ii,
+ 2 - j);
+ //gzputc(fpo, 'A');
+ for (i = 1; i < s[j]; ++i)
+ gzputc(fpo, "ACGTN"[(int)tmp_seq[j][i]]);
+ gzprintf(fpo, "\n+\n");
+ for (i = 1; i < s[j]; ++i)
+ gzputc(fpo, qstr[i]);
+ gzprintf(fpo, "\n");
+ }
}
- else {
- // Note: BWA ignores the adapter and the first color, so this is a misrepresentation
- // in samtools. We must first skip the first color. Basically, a 50 color read is a
- // 49 color read for BWA.
- //
- // Note: BWA outputs F3 to read1, annotated as read "2", and outputs R3 to read2,
- // annotated as read "1".
- fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n",
+
+ // BFAST output
+ if (NULL != opt->fp_bfast) {
+ gzprintf(opt->fp_bfast, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx\n",
(NULL == opt->read_prefix) ? "" : opt->read_prefix,
(NULL == opt->read_prefix) ? "" : "_",
"rand", 0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 0,
- (unsigned long long)rand_ii,
- 2 - j);
- //fputc('A', fpo);
- for (i = 1; i < s[j]; ++i)
- fputc("ACGTN"[(int)tmp_seq[j][i]], fpo);
- fprintf(fpo, "\n+\n");
- for (i = 1; i < s[j]; ++i)
- fputc(qstr[i], fpo);
- fprintf(fpo, "\n");
- }
-
- // BFAST output
- fprintf(opt->fp_bfast, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx\n",
- (NULL == opt->read_prefix) ? "" : opt->read_prefix,
- (NULL == opt->read_prefix) ? "" : "_",
- "rand", 0, 0, 0, 0, 1, 1,
- 0, 0, 0, 0, 0, 0,
- (unsigned long long)rand_ii);
- if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
- for (i = 0; i < s[j]; ++i)
- fputc("ACGTN"[(int)tmp_seq[j][i]], opt->fp_bfast);
- fprintf(opt->fp_bfast, "\n+\n%s\n", qstr);
- }
- else {
- fputc('A', opt->fp_bfast);
- for (i = 0; i < s[j]; ++i)
- fputc("01234"[(int)tmp_seq[j][i]], opt->fp_bfast);
- fprintf(opt->fp_bfast, "\n+\n");
- for (i = 0; i < s[j]; ++i)
- fputc(qstr[i], opt->fp_bfast);
- fprintf(opt->fp_bfast, "\n");
+ (unsigned long long)rand_ii);
+ if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) {
+ for (i = 0; i < s[j]; ++i)
+ gzputc(opt->fp_bfast, "ACGTN"[(int)tmp_seq[j][i]]);
+ gzprintf(opt->fp_bfast, "\n+\n%s\n", qstr);
+ }
+ else {
+ gzputc(opt->fp_bfast, 'A');
+ for (i = 0; i < s[j]; ++i)
+ gzputc(opt->fp_bfast, "01234"[(int)tmp_seq[j][i]]);
+ gzprintf(opt->fp_bfast, "\n+\n");
+ for (i = 0; i < s[j]; ++i)
+ gzputc(opt->fp_bfast, qstr[i]);
+ gzprintf(opt->fp_bfast, "\n");
+ }
}
}
rand_ii++;
@@ -1063,7 +1071,7 @@ int main(int argc, char *argv[])
}
// Open files
- opt->fp_fa = xopen(argv[optind+0], "r");
+ opt->fp_fa = xopen(argv[optind+0], "r");
strcpy(fn_fai, argv[optind+0]); strcat(fn_fai, ".fai");
opt->fp_fai = fopen(fn_fai, "r"); // NB: depends on returning NULL;
strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".mutations.txt");
@@ -1071,12 +1079,16 @@ int main(int argc, char *argv[])
strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".mutations.vcf");
opt->fp_vcf = xopen(fn_tmp, "w");
if(0 == opt->muts_only) {
- strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bfast.fastq");
- opt->fp_bfast = xopen(fn_tmp, "w");
- strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bwa.read1.fastq");
- opt->fp_bwa1 = xopen(fn_tmp, "w");
- strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bwa.read2.fastq");
- opt->fp_bwa2 = xopen(fn_tmp, "w");
+ if (opt->output_type != OUTPUT_TYPE_BWA) {
+ strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bfast.fastq.gz");
+ opt->fp_bfast = gzopen(fn_tmp, "w");
+ }
+ if (opt->output_type != OUTPUT_TYPE_BFAST) {
+ strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bwa.read1.fastq.gz");
+ opt->fp_bwa1 = gzopen(fn_tmp, "w");
+ strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bwa.read2.fastq.gz");
+ opt->fp_bwa2 = gzopen(fn_tmp, "w");
+ }
}
// Run simulation
@@ -1084,7 +1096,13 @@ int main(int argc, char *argv[])
// Close files
if(0 == opt->muts_only) {
- fclose(opt->fp_fa); fclose(opt->fp_bfast); fclose(opt->fp_bwa1); fclose(opt->fp_bwa2);
+ if (opt->output_type != OUTPUT_TYPE_BWA) {
+ gzclose(opt->fp_bfast);
+ }
+ if (opt->output_type != OUTPUT_TYPE_BFAST) {
+ gzclose(opt->fp_bwa1); gzclose(opt->fp_bwa2);
+ }
+ fclose(opt->fp_fa);
}
if(NULL != opt->fp_fai) fclose(opt->fp_fai);
fclose(opt->fp_mut);
=====================================
src/dwgsim_opt.c
=====================================
@@ -32,6 +32,7 @@
#include <stdint.h>
#include <ctype.h>
#include <string.h>
+#include <zlib.h>
#include "mut.h"
#include "dwgsim.h"
#include "dwgsim_opt.h"
@@ -56,6 +57,7 @@ dwgsim_opt_t* dwgsim_opt_init()
opt->rand_read = 0.05;
opt->data_type = ILLUMINA;
opt->strandedness = 0;
+ opt->read_one_strand = 0;
opt->max_n = 0;
opt->flow_order = NULL;
opt->flow_order_len = 0;
@@ -67,7 +69,8 @@ dwgsim_opt_t* dwgsim_opt_init()
opt->fn_muts_input = NULL;
opt->fn_muts_input_type = -1;
opt->fn_regions_bed = NULL;
- opt->fp_mut = opt->fp_bfast = opt->fp_bwa1 = opt->fp_bwa2 = NULL;
+ opt->fp_mut = NULL;
+ opt->fp_bfast = opt->fp_bwa1 = opt->fp_bwa2 = NULL;
opt->fp_fa = opt->fp_fai = NULL;
opt->read_prefix = NULL;
@@ -106,7 +109,7 @@ int dwgsim_opt_usage(dwgsim_opt_t *opt)
fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", opt->mut_rate);
fprintf(stderr, " -F FLOAT frequency of given mutation to simulate low fequency somatic mutations [%.4f]\n", opt->mut_freq);
fprintf(stderr, " NB: freqeuncy F refers to the first strand of mutation, therefore mutations \n");
- fprintf(stderr, " on the second strand occour with a frequency of 1-F \n");
+ fprintf(stderr, " on the second strand occur with a frequency of 1-F \n");
fprintf(stderr, " -R FLOAT fraction of mutations that are indels [%.2f]\n", opt->indel_frac);
fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", opt->indel_extend);
fprintf(stderr, " -I INT the minimum length indel [%d]\n", opt->indel_min);
@@ -116,10 +119,14 @@ int dwgsim_opt_usage(dwgsim_opt_t *opt)
fprintf(stderr, " 0: Illumina\n");
fprintf(stderr, " 1: SOLiD\n");
fprintf(stderr, " 2: Ion Torrent\n");
- fprintf(stderr, " -S INT generate reads [%d]:\n", opt->strandedness);
+ fprintf(stderr, " -S INT generate paired end reads with orientation [%d]:\n", opt->strandedness);
fprintf(stderr, " 0: default (opposite strand for Illumina, same strand for SOLiD/Ion Torrent)\n");
fprintf(stderr, " 1: same strand (mate pair)\n");
fprintf(stderr, " 2: opposite strand (paired end)\n");
+ fprintf(stderr, " -A INT generate paired end reads with read one [%d]:\n", opt->read_one_strand);
+ fprintf(stderr, " 0: default (both, random)\n");
+ fprintf(stderr, " 1: forward genomic strand\n");
+ fprintf(stderr, " 2: reverse genomic strand\n");
fprintf(stderr, " -f STRING the flow order for Ion Torrent data [%s]\n", (char*)opt->flow_order);
fprintf(stderr, " -B use a per-base error rate for Ion Torrent data [%s]\n", __IS_TRUE(opt->use_base_error));
fprintf(stderr, " -H haploid mode [%s]\n", __IS_TRUE(opt->is_hap));
@@ -133,6 +140,10 @@ int dwgsim_opt_usage(dwgsim_opt_t *opt)
fprintf(stderr, " -q STRING a fixed base quality to apply (single character) [%s]\n", (NULL == opt->fixed_quality) ? "not using" : opt->fixed_quality);
fprintf(stderr, " -Q FLOAT standard deviation of the base quality scores [%.2lf]\n", (NULL == opt->fixed_quality) ? opt->quality_std : 0.0);
fprintf(stderr, " -s INT standard deviation of the distance for pairs [%.3f]\n", opt->std_dev);
+ fprintf(stderr, " -o INT output type for the FASTQ files [%d]:\n", opt->output_type);
+ fprintf(stderr, " 0: interleaved (bfast) and per-read-end (bwa)\n");
+ fprintf(stderr, " 1: per-read-end (bwa) only\n");
+ fprintf(stderr, " 2: interleaved (bfast) only\n");
fprintf(stderr, " -h print this message\n");
fprintf(stderr, "\n");
fprintf(stderr, "Note: For SOLiD mate pair reads and BFAST, the first read is F3 and the second is R3. For SOLiD mate pair reads\n");
@@ -192,7 +203,7 @@ dwgsim_opt_parse(dwgsim_opt_t *opt, int argc, char *argv[])
int c;
int muts_input_type = 0;
- while ((c = getopt(argc, argv, "id:s:N:C:1:2:e:E:r:F:R:X:I:c:S:n:y:BHf:z:Mm:b:v:x:P:q:Q:h")) >= 0) {
+ while ((c = getopt(argc, argv, "id:s:N:C:1:2:e:E:r:F:R:X:I:c:S:A:n:y:BHf:z:Mm:b:v:x:P:q:Q:o:h")) >= 0) {
switch (c) {
case 'i': opt->is_inner = 1; break;
case 'd': opt->dist = dwgsim_atoi(optarg, 'd', 0); break;
@@ -210,6 +221,7 @@ dwgsim_opt_parse(dwgsim_opt_t *opt, int argc, char *argv[])
case 'I': opt->indel_min = dwgsim_atoi(optarg, 'I', 0); break;
case 'c': opt->data_type = dwgsim_atoi(optarg, 'c', 0); break;
case 'S': opt->strandedness = dwgsim_atoi(optarg, 'S', 0); break;
+ case 'A': opt->read_one_strand = dwgsim_atoi(optarg, 'A', 0); break;
case 'n': opt->max_n = dwgsim_atoi(optarg, 'n', 0); break;
case 'y': opt->rand_read = atof(optarg); break;
case 'f':
@@ -228,6 +240,7 @@ dwgsim_opt_parse(dwgsim_opt_t *opt, int argc, char *argv[])
case 'P': free(opt->read_prefix); opt->read_prefix = strdup(optarg); break;
case 'q': opt->fixed_quality = strdup(optarg); break;
case 'Q': opt->quality_std = atof(optarg); break;
+ case 'o': opt->output_type = atoi(optarg); break;
default: fprintf(stderr, "Unrecognized option: -%c\n", c); return 0;
}
}
@@ -277,6 +290,7 @@ dwgsim_opt_parse(dwgsim_opt_t *opt, int argc, char *argv[])
__check_option(opt->indel_min, 1, INT32_MAX, "-I");
__check_option(opt->data_type, 0, 2, "-c");
__check_option(opt->strandedness, 0, 2, "-S");
+ __check_option(opt->read_one_strand, 0, 2, "-A");
__check_option(opt->max_n, 0, INT32_MAX, "-n");
__check_option(opt->rand_read, 0, 1.0, "-y");
if(IONTORRENT == opt->data_type && NULL == opt->flow_order) {
@@ -296,17 +310,7 @@ dwgsim_opt_parse(dwgsim_opt_t *opt, int argc, char *argv[])
fprintf(stderr, "Warning: remember to use the -P option with dwgsim_eval\n");
}
- if(0 < opt->length[1]){ //paired end / mate pair
- int s = opt->length[0] + opt->length[1] + opt->is_inner;
- double r = (s-opt->dist)*1.0/opt->std_dev;
- if(r>6){
- fprintf(stderr, "Error: command line option -d is too small for the read length (%d)\n",opt->dist);
- return 0;
- }
- if(r>4){
- fprintf(stderr, "Warning: command line option -d is small for the read length (%d). Generation speed could be affected.\n",opt->dist);
- }
- }
+ __check_option(opt->output_type, 0, 2, "-o");
switch(muts_input_type) {
case 0x0:
=====================================
src/dwgsim_opt.h
=====================================
@@ -1,8 +1,15 @@
#ifndef DWGSIM_OPT_H
#define DWGSIM_OPT_H
+#include "zlib.h"
+
#define ERROR_RATE_NUM_RANDOM_READS 1000000
+#define OUTPUT_TYPE_ALL 0
+#define OUTPUT_TYPE_BWA 1
+#define OUTPUT_TYPE_BFAST 2
+
+
typedef struct {
double start, by, end;
} error_t;
@@ -24,6 +31,7 @@ typedef struct {
int32_t max_n;
int32_t data_type;
int32_t strandedness;
+ int32_t read_one_strand;
int8_t *flow_order;
int32_t flow_order_len;
int32_t use_base_error;
@@ -37,12 +45,13 @@ typedef struct {
char *fn_regions_bed;
FILE *fp_mut;
FILE *fp_vcf;
- FILE *fp_bfast;
- FILE *fp_bwa1;
- FILE *fp_bwa2;
+ gzFile *fp_bfast;
+ gzFile *fp_bwa1;
+ gzFile *fp_bwa2;
FILE *fp_fa;
FILE *fp_fai;
char *read_prefix;
+ int32_t output_type;
} dwgsim_opt_t;
dwgsim_opt_t*
=====================================
src/mut_txt.c
=====================================
@@ -58,7 +58,7 @@ muts_txt_t *muts_txt_init(FILE *fp, contigs_t *c)
prev_pos = 0;
}
if(c->n == i) {
- fprintf(stderr, "Error: contig not found [%s]\n", name);
+ fprintf(stderr, "Error: mutation contig not found or out of order [%s]\n", name);
exit(1);
}
else if(pos <= 0 || c->contigs[i].len < pos) {
=====================================
testdata/ex1.test.bfast.fastq.gz
=====================================
Binary files a/testdata/ex1.test.bfast.fastq.gz and b/testdata/ex1.test.bfast.fastq.gz differ
=====================================
testdata/ex1.test.bwa.read1.fastq.gz
=====================================
Binary files a/testdata/ex1.test.bwa.read1.fastq.gz and b/testdata/ex1.test.bwa.read1.fastq.gz differ
=====================================
testdata/ex1.test.bwa.read2.fastq.gz
=====================================
Binary files a/testdata/ex1.test.bwa.read2.fastq.gz and b/testdata/ex1.test.bwa.read2.fastq.gz differ
=====================================
testdata/test.sh
=====================================
@@ -15,14 +15,17 @@ popd
mkdir tmp
# Generate the new test data
-./dwgsim -z 13 -N 10000 samtools/examples/ex1.fa ex1.test
+./dwgsim -z 13 -N 10000 samtools/examples/ex1.fa tmp/ex1.test
# Test the differences
-for FILE in $(ls -1 ex1.test*)
+for GZFILE in $(ls -1 tmp/ex1.test*gz)
do
- diff -q ${FILE} ${TESTDATADIR}/${FILE}
+ gunzip $GZFILE;
+ FILE=$(basename $GZFILE .gz);
+ diff -q tmp/${FILE} ${TESTDATADIR}/${FILE}
done
# Clean up the testdata
find ${TESTDATADIR} \! -name "*gz" -type f | grep -v sh$ | xargs rm;
-rm -r tmp;
+
+rm -r tmp
View it on GitLab: https://salsa.debian.org/med-team/dwgsim/-/commit/386d48d63429e49e80d06b119dfa4689747ea353
--
View it on GitLab: https://salsa.debian.org/med-team/dwgsim/-/commit/386d48d63429e49e80d06b119dfa4689747ea353
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220116/26e1bbf6/attachment-0001.htm>
More information about the debian-med-commit
mailing list