[med-svn] [bwa] 01/02: Imported Upstream version 0.7.8
Andreas Tille
tille at debian.org
Sun Apr 13 07:00:33 UTC 2014
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository bwa.
commit e513f4dfd9dc01da946b063b2b09f331944d0fb2
Author: Andreas Tille <tille at debian.org>
Date: Sun Apr 13 08:52:51 2014 +0200
Imported Upstream version 0.7.8
---
Makefile | 2 +-
NEWS | 32 +++++++++++++
bwa.1 | 26 +++++++---
bwa.c | 32 +++++++++----
bwa.h | 2 +
bwamem.c | 97 +++++++++++++++++++++++++++-----------
bwamem.h | 5 +-
bwamem_pair.c | 16 ++++---
bwt.h | 8 ++--
bwt_lite.c | 2 +-
bwt_lite.h | 6 +--
fastmap.c | 125 +++++++++++++++++++++++++++++++++---------------
ksw.c | 149 +++++++++++++++++++++++++++++++++++-----------------------
ksw.h | 3 ++
main.c | 2 +-
15 files changed, 354 insertions(+), 153 deletions(-)
diff --git a/Makefile b/Makefile
index ff48a20..6490932 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
CC= gcc
#CC= clang --analyze
-CFLAGS= -g -Wall -O2
+CFLAGS= -g -Wall -O2 -Wno-unused-function
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
diff --git a/NEWS b/NEWS
index a7c64ed..40f4433 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,35 @@
+Release 0.7.8 (31 March, 2014)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Changes in BWA-MEM:
+
+ * Bugfix: off-diagonal X-dropoff (option -d) not working as intended.
+ Short-read alignment is not affected.
+
+ * Bugfix: unnecessarily large bandwidth used during global alignment,
+ which reduces the mapping speed by ~5% for short reads. Results are not
+ affected.
+
+ * Bugfix: when the matching score is not one, paired-end mapping quality is
+ inaccurate.
+
+ * When the matching score (option -A) is changed, scale all score-related
+ options accordingly unless overridden by users.
+
+ * Allow to specify different gap open (or extension) penalties for deletions
+ and insertions separately.
+
+ * Allow to specify the insert size distribution.
+
+ * Better and more detailed debugging information.
+
+With the default setting, 0.7.8 and 0.7.7 gave identical output on one million
+100bp read pairs.
+
+(0.7.8: 31 March 2014, r455)
+
+
+
Release 0.7.7 (25 Feburary, 2014)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/bwa.1 b/bwa.1
index 601a529..b6354e5 100644
--- a/bwa.1
+++ b/bwa.1
@@ -1,4 +1,4 @@
-.TH bwa 1 "25 Feburary 2014" "bwa-0.7.7" "Bioinformatics tools"
+.TH bwa 1 "31 March 2014" "bwa-0.7.8" "Bioinformatics tools"
.SH NAME
.PP
bwa - Burrows-Wheeler Alignment Tool
@@ -148,9 +148,10 @@ not work with split alignments. One may consider to use option
.B -M
to flag shorter split hits as secondary.
-.B OPTIONS:
.RS
.TP 10
+.B ALGORITHM OPTIONS:
+.TP
.BI -t \ INT
Number of threads [1]
.TP
@@ -202,11 +203,14 @@ Matching score. [1]
.BI -B \ INT
Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]
.TP
-.BI -O \ INT
-Gap open penalty. [6]
+.BI -O \ INT[,INT]
+Gap open penalty. If two numbers are specified, the first is the penalty of
+openning a deletion and the second for openning an insertion. [6]
.TP
-.BI -E \ INT
-Gap extension penalty. A gap of length k costs O + k*E (i.e.
+.BI -E \ INT[,INT]
+Gap extension penalty. If two numbers are specified, the first is the penalty
+of extending a deletion and second for extending an insertion. A gap of length
+k costs O + k*E (i.e.
.B -O
is for opening a zero-length gap). [1]
.TP
@@ -224,6 +228,9 @@ Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as
and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these
two scores to determine whether we should force pairing. A larger value leads to
more aggressive read pair. [17]
+
+.TP
+.B INPUT/OUTPUT OPTIONS:
.TP
.B -p
Assume the first input query file is interleaved paired-end FASTA/Q. See the
@@ -260,6 +267,13 @@ supported throughout BWA. Ideally, a value 0 for disabling all the output to
stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for
all normal messages; 4 or higher for debugging. When this option takes value
4, the output is not SAM. [3]
+.TP
+.BI -I \ FLOAT[,FLOAT[,INT[,INT]]]
+Specify the mean, standard deviation (10% of the mean if absent), max (4 sigma
+from the mean if absent) and min (4 sigma if absent) of the insert size
+distribution. Only applicable to the FR orientation. By default, BWA-MEM infers
+these numbers and the pair orientations given enough reads. [inferred]
+
.RE
.TP
diff --git a/bwa.c b/bwa.c
index 140d57e..0e9e606 100644
--- a/bwa.c
+++ b/bwa.c
@@ -86,7 +86,7 @@ void bwa_fill_scmat(int a, int b, int8_t mat[25])
}
// Generate CIGAR when the alignment end points are known
-uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
+uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
{
uint32_t *cigar = 0;
uint8_t tmp, *rseq;
@@ -106,24 +106,30 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
}
if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP
+ // FIXME: due to an issue in mem_reg2aln(), we never come to this block. This does not affect accuracy, but it hurts performance.
cigar = malloc(4);
cigar[0] = l_query<<4 | 0;
*n_cigar = 1;
for (i = 0, *score = 0; i < l_query; ++i)
*score += mat[rseq[i]*5 + query[i]];
} else {
- int w, max_gap, min_w;
- //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
- //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
+ int w, max_gap, max_ins, max_del, min_w;
// set the band-width
- max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.);
+ max_ins = (int)((double)(((l_query+1)>>1) * mat[0] - o_ins) / e_ins + 1.);
+ max_del = (int)((double)(((l_query+1)>>1) * mat[0] - o_del) / e_del + 1.);
+ max_gap = max_ins > max_del? max_ins : max_del;
max_gap = max_gap > 1? max_gap : 1;
w = (max_gap + abs(rlen - l_query) + 1) >> 1;
w = w < w_? w : w_;
min_w = abs(rlen - l_query) + 3;
w = w > min_w? w : min_w;
// NW alignment
- *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
+ if (bwa_verbose >= 4) {
+ printf("* Global bandwidth: %d\n", w);
+ printf("* Global ref: "); for (i = 0; i < rlen; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
+ printf("* Global query: "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
+ }
+ *score = ksw_global2(l_query, query, rlen, rseq, 5, mat, o_del, e_del, o_ins, e_ins, w, n_cigar, &cigar);
}
{// compute NM and MD
int k, x, y, u, n_mm = 0, n_gap = 0;
@@ -165,7 +171,12 @@ ret_gen_cigar:
return cigar;
}
-int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
+uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
+{
+ return bwa_gen_cigar2(mat, q, r, q, r, w_, l_pac, pac, l_query, query, rb, re, score, n_cigar, NM);
+}
+
+int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
{
int is_rev;
int64_t cb, ce, fm;
@@ -184,7 +195,7 @@ int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns,
int64_t x;
cb = cb > *rb? cb : *rb;
ce = ce < *re? ce : *re;
- cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM);
+ cigar = bwa_gen_cigar2(mat, o_del, e_del, o_ins, e_ins, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM);
for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) {
int op = cigar[i]&0xf, len = cigar[i]>>4;
if (op == 0) {
@@ -210,6 +221,11 @@ int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns,
return (*qb == *qe || *rb == *re)? -2 : 0;
}
+int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
+{
+ return bwa_fix_xref2(mat, q, r, q, r, w, bns, pac, query, qb, qe, rb, re);
+}
+
/*********************
* Full index reader *
*********************/
diff --git a/bwa.h b/bwa.h
index 9d5b2aa..8d46e58 100644
--- a/bwa.h
+++ b/bwa.h
@@ -32,7 +32,9 @@ extern "C" {
void bwa_fill_scmat(int a, int b, int8_t mat[25]);
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
+ uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
+ int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
char *bwa_idx_infer_prefix(const char *hint);
bwt_t *bwa_idx_load_bwt(const char *hint);
diff --git a/bwamem.c b/bwamem.c
index 19ca561..33b86ad 100644
--- a/bwamem.c
+++ b/bwamem.c
@@ -47,7 +47,10 @@ mem_opt_t *mem_opt_init()
mem_opt_t *o;
o = calloc(1, sizeof(mem_opt_t));
o->flag = 0;
- o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
+ o->a = 1; o->b = 4;
+ o->o_del = o->o_ins = 6;
+ o->e_del = o->e_ins = 1;
+ o->w = 100;
o->T = 30;
o->zdrop = 100;
o->pen_unpaired = 17;
@@ -111,7 +114,7 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query)
itr->len = len;
}
-const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
+const bwtintv_v *smem_next2(smem_i *itr, int split_len, int split_width, int start_width)
{
int i, max, max_i, ori_start;
itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0;
@@ -119,7 +122,7 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
if (itr->start == itr->len) return 0;
ori_start = itr->start;
- itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM
+ itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, start_width, itr->matches, itr->tmpvec); // search for SMEM
if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
bwtintv_t *p = &itr->matches->a[i];
@@ -152,6 +155,11 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
return itr->matches;
}
+const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
+{
+ return smem_next2(itr, split_len, split_width, 1);
+}
+
/********************************
* Chaining while finding SMEMs *
********************************/
@@ -200,8 +208,9 @@ static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) *
{
const bwtintv_v *a;
int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
+ int start_width = (opt->flag & MEM_F_NO_EXACT)? 2 : 1;
split_len = split_len < itr->len? split_len : itr->len;
- while ((a = smem_next(itr, split_len, opt->split_width)) != 0) { // to find all SMEM and some internal MEM
+ while ((a = smem_next2(itr, split_len, opt->split_width, start_width)) != 0) { // to find all SMEM and some internal MEM
int i;
for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start
bwtintv_t *p = &a->a[i];
@@ -215,7 +224,7 @@ static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) *
s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
s.qbeg = p->info>>32;
s.len = slen;
- if (bwa_verbose >= 5) printf("SEED l=%d,qb=%d,rb=%ld\n", s.len, s.qbeg, (long)s.rbeg);
+ if (bwa_verbose >= 5) printf("* Found SEED: length=%d,query_beg=%d,ref_beg=%ld\n", s.len, s.qbeg, (long)s.rbeg);
if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip
if (kb_size(tree)) {
kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
@@ -257,14 +266,14 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
int i, j;
for (i = 0; i < chn->n; ++i) {
mem_chain_t *p = &chn->a[i];
- err_printf("CHAIN(%d) n=%d w=%d", i, p->n, mem_chain_weight(p));
+ err_printf("* Found CHAIN(%d): n=%d; weight=%d", i, p->n, mem_chain_weight(p));
for (j = 0; j < p->n; ++j) {
bwtint_t pos;
int is_rev, ref_id;
pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev);
if (is_rev) pos -= p->seeds[j].len - 1;
bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id);
- err_printf("\t%d,%d,%ld(%s:%c%ld)", p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
+ err_printf("\t%d;%d,%ld(%s:%c%ld)", p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
}
err_putchar('\n');
}
@@ -425,6 +434,13 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun)
return m;
}
+int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int qlen)
+{
+ if (!(opt->flag & MEM_F_NO_EXACT) || n == 0 || a->truesc != qlen * opt->a) return n;
+ memmove(a, a + 1, (n - 1) * sizeof(mem_alnreg_t));
+ return n - 1;
+}
+
void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) // IMPORTANT: must run mem_sort_and_dedup() before calling this function
{ // similar to the loop in mem_chain_flt()
int i, k, tmp;
@@ -433,7 +449,9 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i
kv_init(z);
for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i);
ks_introsort(mem_ars_hash, n, a);
- tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r;
+ tmp = opt->a + opt->b;
+ tmp = opt->o_del + opt->e_del > tmp? opt->o_del + opt->e_del : tmp;
+ tmp = opt->o_ins + opt->e_ins > tmp? opt->o_ins + opt->e_ins : tmp;
kv_push(int, z, 0);
for (i = 1; i < n; ++i) {
for (k = 0; k < z.n; ++k) {
@@ -509,7 +527,7 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac,
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
assert(rlen == re - rb);
xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
- x = ksw_align(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->q, opt->r, xtra, 0);
+ x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0);
free(rseq);
if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1;
@@ -518,13 +536,15 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac,
a.score = x.score;
a.csub = x.score2;
kv_push(mem_alnreg_t, *av, a);
- if (bwa_verbose >= 4) printf("chain2aln(short): [%d,%d) <=> [%ld,%ld)\n", a.qb, a.qe, (long)a.rb, (long)a.re);
+ if (bwa_verbose >= 4) printf("** Added alignment region via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld)\n", a.qb, a.qe, (long)a.rb, (long)a.re);
return 0;
}
static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
{
- int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.);
+ int l_del = (int)((double)(qlen * opt->a - opt->o_del) / opt->e_del + 1.);
+ int l_ins = (int)((double)(qlen * opt->a - opt->o_ins) / opt->e_ins + 1.);
+ int l = l_del > l_ins? l_del : l_ins;
l = l > 1? l : 1;
return l < opt->w<<1? l : opt->w<<1;
}
@@ -584,7 +604,9 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
w = max_gap < opt->w? max_gap : opt->w;
if (qd - rd < w && rd - qd < w) break;
}
- if (i < av->n) { // the seed is (almost) contained in an existing alignment
+ if (i < av->n) { // the seed is (almost) contained in an existing alignment; further testing is needed to confirm it is not leading to a different aln
+ if (bwa_verbose >= 4)
+ printf("** Seed(%d) [%ld;%ld,%ld] is almost contained in an existing alignment. Confirming whether extension is needed...\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg);
for (i = k + 1; i < c->n; ++i) { // check overlapping seeds in the same chain
const mem_seed_t *t;
if (srt[i] == 0) continue;
@@ -597,6 +619,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
srt[k] = 0; // mark that seed extension has not been performed
continue;
}
+ if (bwa_verbose >= 4)
+ printf("** Seed(%d) might lead to a different alignment even though it is contained. Extension will be performed.\n", k);
}
a = kv_pushp(mem_alnreg_t, *av);
@@ -604,7 +628,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
a->w = aw[0] = aw[1] = opt->w;
a->score = a->truesc = -1;
- if (bwa_verbose >= 4) err_printf("Extending from seed [%ld,%ld,%ld]\n", (long)s->len, (long)s->qbeg, (long)s->rbeg);
+ if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg);
if (s->qbeg) { // left extension
uint8_t *rs, *qs;
int qle, tle, gtle, gscore;
@@ -616,8 +640,13 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
for (i = 0; i < MAX_BAND_TRY; ++i) {
int prev = a->score;
aw[0] = opt->w << i;
- a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]);
- if (bwa_verbose >= 4) { printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); }
+ if (bwa_verbose >= 4) {
+ int j;
+ printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rs[j]]); putchar('\n');
+ printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)qs[j]]); putchar('\n');
+ }
+ a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]);
+ if (bwa_verbose >= 4) { printf("*** Left extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); }
if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break;
}
// check whether we prefer to reach the end of the query
@@ -639,8 +668,13 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
for (i = 0; i < MAX_BAND_TRY; ++i) {
int prev = a->score;
aw[1] = opt->w << i;
- a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]);
- if (bwa_verbose >= 4) { printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); }
+ if (bwa_verbose >= 4) {
+ int j;
+ printf("*** Right ref: "); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[re+j]]); putchar('\n');
+ printf("*** Right query: "); for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[qe+j]]); putchar('\n');
+ }
+ a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]);
+ if (bwa_verbose >= 4) { printf("*** Right extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); }
if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break;
}
// similar to the above
@@ -652,7 +686,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
a->truesc += gscore - sc0;
}
} else a->qe = l_query, a->re = s->rbeg + s->len;
- if (bwa_verbose >= 4) { printf("[%d]\taw={%d,%d}\tscore=%d\t[%d,%d) <=> [%ld,%ld)\n", k, aw[0], aw[1], a->score, a->qb, a->qe, (long)a->rb, (long)a->re); fflush(stdout); }
+ if (bwa_verbose >= 4) printf("*** Added alignment region: [%d,%d) <=> [%ld,%ld); score=%d; {left,right}_bandwidth={%d,%d}\n", a->qb, a->qe, (long)a->rb, (long)a->re, a->score, aw[0], aw[1]);
// compute seedcov
for (i = 0, a->seedcov = 0; i < c->n; ++i) {
@@ -673,7 +707,7 @@ static inline int infer_bw(int l1, int l2, int score, int a, int q, int r)
{
int w;
if (l1 == l2 && l1 * a - score < (q + r - a)<<1) return 0; // to get equal alignment length, we need at least two gaps
- w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 1.);
+ w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 2.);
if (w < abs(l1 - l2)) w = abs(l1 - l2);
return w;
}
@@ -887,13 +921,15 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
for (i = 0; i < chn.n; ++i) {
mem_chain_t *p = &chn.a[i];
int ret;
- if (bwa_verbose >= 4) err_printf("===> Processing chain(%d) <===\n", i);
+ if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i);
ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s);
if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s);
free(chn.a[i].seeds);
}
free(chn.a);
regs.n = mem_sort_and_dedup(regs.n, regs.a, opt->mask_level_redun);
+ if (opt->flag & MEM_F_NO_EXACT)
+ regs.n = mem_test_and_remove_exact(opt, regs.n, regs.a, l_seq);
return regs;
}
@@ -913,7 +949,7 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
{
mem_aln_t a;
- int i, w2, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD;
+ int i, w2, tmp, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD;
int64_t pos, rb, re;
uint8_t *query;
@@ -929,19 +965,21 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]];
a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0;
if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment
- if (bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) {
+ if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) {
fprintf(stderr, "[E::%s] If you see this message, please let the developer know. Abort. Sorry.\n", __func__);
exit(1);
}
- w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->q, opt->r);
- if (bwa_verbose >= 4) printf("Band width: infer=%d, opt=%d, alnreg=%d\n", w2, opt->w, ar->w);
+ tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del);
+ w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins);
+ w2 = w2 > tmp? w2 : tmp;
+ if (bwa_verbose >= 4) printf("* Band width: inferred=%d, cmd_opt=%d, alnreg=%d\n", w2, opt->w, ar->w);
if (w2 > opt->w) w2 = w2 < ar->w? w2 : ar->w;
- else w2 = opt->w;
+// else w2 = opt->w; // TODO: check if we need this line on long reads. On 1-800bp reads, it does not matter and it should be.
i = 0; a.cigar = 0;
do {
free(a.cigar);
- a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM);
- if (bwa_verbose >= 4) printf("Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc);
+ a.cigar = bwa_gen_cigar2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM);
+ if (bwa_verbose >= 4) printf("* Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc);
if (score == last_sc) break; // it is possible that global alignment and local alignment give different scores
last_sc = score;
w2 <<= 1;
@@ -997,9 +1035,12 @@ static void worker1(void *data, int i, int tid)
{
worker_t *w = (worker_t*)data;
if (!(w->opt->flag&MEM_F_PE)) {
+ if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name);
w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
} else {
+ if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name);
w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
+ if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name);
w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
}
}
@@ -1009,10 +1050,12 @@ static void worker2(void *data, int i, int tid)
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]);
worker_t *w = (worker_t*)data;
if (!(w->opt->flag&MEM_F_PE)) {
+ if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name);
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
free(w->regs[i].a);
} else {
+ if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name);
mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]);
free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a);
}
diff --git a/bwamem.h b/bwamem.h
index 8b24c51..5291491 100644
--- a/bwamem.h
+++ b/bwamem.h
@@ -16,9 +16,12 @@ typedef struct __smem_i smem_i;
#define MEM_F_ALL 0x8
#define MEM_F_NO_MULTI 0x10
#define MEM_F_NO_RESCUE 0x20
+#define MEM_F_NO_EXACT 0x40
typedef struct {
- int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
+ int a, b; // match score and mismatch penalty
+ int o_del, e_del;
+ int o_ins, e_ins;
int pen_unpaired; // phred-scaled penalty for unpaired reads
int pen_clip5,pen_clip3;// clipping penalty. This score is not deducted from the DP score.
int w; // band width
diff --git a/bwamem_pair.c b/bwamem_pair.c
index f1aa73a..4a7cdf3 100644
--- a/bwamem_pair.c
+++ b/bwamem_pair.c
@@ -144,8 +144,8 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me
if (len == re - rb) { // no funny things happening
kswr_t aln;
mem_alnreg_t b;
- int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | opt->min_seed_len;
- aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0);
+ int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
+ aln = ksw_align2(l_ms, seq, len, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0);
memset(&b, 0, sizeof(mem_alnreg_t));
if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0
b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb;
@@ -219,7 +219,9 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_
y[v.a[i].y&3] = i;
}
if (u.n) { // found at least one proper pair
- int tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r;
+ int tmp = opt->a + opt->b;
+ tmp = tmp > opt->o_del + opt->e_del? tmp : opt->o_del + opt->e_del;
+ tmp = tmp > opt->o_ins + opt->e_ins? tmp : opt->o_ins + opt->e_ins;
ks_introsort_128(u.n, u.a);
i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32;
z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair
@@ -233,6 +235,8 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_
return ret;
}
+#define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499))
+
int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2])
{
extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
@@ -274,7 +278,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired;
//q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0;
subo = subo > score_un? subo : score_un;
- q_pe = (o - subo) * 6;
+ q_pe = raw_mapq(o - subo, opt->a);
if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499);
if (q_pe < 0) q_pe = 0;
if (q_pe > 60) q_pe = 60;
@@ -291,8 +295,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40;
extra_flag |= 2;
// cap at the tandem repeat score
- q_se[0] = q_se[0] < (c[0]->score - c[0]->csub) * 6? q_se[0] : (c[0]->score - c[0]->csub) * 6;
- q_se[1] = q_se[1] < (c[1]->score - c[1]->csub) * 6? q_se[1] : (c[1]->score - c[1]->csub) * 6;
+ q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a)? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a);
+ q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a)? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a);
} else { // the unpaired alignment is preferred
z[0] = z[1] = 0;
q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);
diff --git a/bwt.h b/bwt.h
index c36bf9b..d2ff0ac 100644
--- a/bwt.h
+++ b/bwt.h
@@ -96,14 +96,14 @@ extern "C" {
void bwt_bwtupdate_core(bwt_t *bwt);
- inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c);
- inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
+ bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c);
+ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k);
// more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
void bwt_gen_cnt_table(bwt_t *bwt);
- inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol);
- inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]);
+ void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol);
+ void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]);
int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end);
int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0);
diff --git a/bwt_lite.c b/bwt_lite.c
index 6cd3b1d..9b47270 100644
--- a/bwt_lite.c
+++ b/bwt_lite.c
@@ -56,7 +56,7 @@ bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq)
}
return b;
}
-inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c)
+uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c)
{
uint32_t n, b;
if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
diff --git a/bwt_lite.h b/bwt_lite.h
index 0096b93..4fadcce 100644
--- a/bwt_lite.h
+++ b/bwt_lite.h
@@ -17,9 +17,9 @@ extern "C" {
#endif
bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq);
- inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c);
- inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]);
- inline void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]);
+ uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c);
+ void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]);
+ void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]);
void bwtl_destroy(bwtl_t *bwt);
#ifdef __cplusplus
diff --git a/fastmap.c b/fastmap.c
index 72d850c..093fb7b 100644
--- a/fastmap.c
+++ b/fastmap.c
@@ -2,6 +2,7 @@
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
+#include <string.h>
#include <ctype.h>
#include <math.h>
#include "bwa.h"
@@ -19,50 +20,83 @@ int kclose(void *a);
int main_mem(int argc, char *argv[])
{
- mem_opt_t *opt;
+ mem_opt_t *opt, opt0;
int fd, fd2, i, c, n, copy_comment = 0;
gzFile fp, fp2 = 0;
kseq_t *ks, *ks2 = 0;
bseq1_t *seqs;
bwaidx_t *idx;
- char *rg_line = 0;
+ char *p, *rg_line = 0;
void *ko = 0, *ko2 = 0;
int64_t n_processed = 0;
+ mem_pestat_t pes[4], *pes0 = 0;
+
+ memset(pes, 0, 4 * sizeof(mem_pestat_t));
+ for (i = 0; i < 4; ++i) pes[i].failed = 1;
opt = mem_opt_init();
- while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:")) >= 0) {
+ opt0.a = opt0.b = opt0.o_del = opt0.e_del = opt0.o_ins = opt0.e_ins = opt0.pen_unpaired = -1;
+ opt0.pen_clip5 = opt0.pen_clip3 = opt0.zdrop = opt0.T = -1;
+ while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg);
else if (c == 'w') opt->w = atoi(optarg);
- else if (c == 'A') opt->a = atoi(optarg);
- else if (c == 'B') opt->b = atoi(optarg);
- else if (c == 'O') opt->q = atoi(optarg);
- else if (c == 'E') opt->r = atoi(optarg);
- else if (c == 'T') opt->T = atoi(optarg);
- else if (c == 'U') opt->pen_unpaired = atoi(optarg);
+ else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1;
+ else if (c == 'B') opt->b = atoi(optarg), opt0.b = 1;
+ else if (c == 'T') opt->T = atoi(optarg), opt0.T = 1;
+ else if (c == 'U') opt->pen_unpaired = atoi(optarg), opt0.pen_unpaired = 1;
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
else if (c == 'a') opt->flag |= MEM_F_ALL;
else if (c == 'p') opt->flag |= MEM_F_PE;
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
+ else if (c == 'e') opt->flag |= MEM_F_NO_EXACT;
else if (c == 'c') opt->max_occ = atoi(optarg);
- else if (c == 'd') opt->zdrop = atoi(optarg);
+ else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1;
else if (c == 'v') bwa_verbose = atoi(optarg);
else if (c == 'r') opt->split_factor = atof(optarg);
else if (c == 'D') opt->chain_drop_ratio = atof(optarg);
else if (c == 'm') opt->max_matesw = atoi(optarg);
+ else if (c == 's') opt->split_width = atoi(optarg);
else if (c == 'C') copy_comment = 1;
else if (c == 'Q') {
opt->mapQ_coef_len = atoi(optarg);
opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? log(opt->mapQ_coef_len) : 0;
+ } else if (c == 'O') {
+ opt0.o_del = opt0.o_ins = 1;
+ opt->o_del = opt->o_ins = strtol(optarg, &p, 10);
+ if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+ opt->o_ins = strtol(p+1, &p, 10);
+ } else if (c == 'E') {
+ opt0.e_del = opt0.e_ins = 1;
+ opt->e_del = opt->e_ins = strtol(optarg, &p, 10);
+ if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+ opt->e_ins = strtol(p+1, &p, 10);
} else if (c == 'L') {
- char *p;
+ opt0.pen_clip5 = opt0.pen_clip3 = 1;
opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10);
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
opt->pen_clip3 = strtol(p+1, &p, 10);
} else if (c == 'R') {
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak
- } else if (c == 's') opt->split_width = atoi(optarg);
+ } else if (c == 'I') { // specify the insert size distribution
+ pes0 = pes;
+ pes[1].failed = 0;
+ pes[1].avg = strtod(optarg, &p);
+ pes[1].std = pes[1].avg * .1;
+ if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+ pes[1].std = strtod(p+1, &p);
+ pes[1].high = (int)(pes[1].avg + 4. * pes[1].std + .499);
+ pes[1].low = (int)(pes[1].avg - 4. * pes[1].std + .499);
+ if (pes[1].low < 1) pes[1].low = 1;
+ if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+ pes[1].high = (int)(strtod(p+1, &p) + .499);
+ if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+ pes[1].low = (int)(strtod(p+1, &p) + .499);
+ if (bwa_verbose >= 3)
+ fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n",
+ __func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low);
+ }
else return 1;
}
if (opt->n_threads < 1) opt->n_threads = 1;
@@ -70,38 +104,55 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
fprintf(stderr, "Algorithm options:\n\n");
- fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
- fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
- fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
- fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
- fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
-// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
- fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
- fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->chain_drop_ratio);
- fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw);
- fprintf(stderr, " -S skip mate rescue\n");
- fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n");
- fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a);
- fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
- fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q);
- fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r);
- fprintf(stderr, " -L INT penalty for clipping [%d]\n", opt->pen_clip5);
- fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
+ fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
+ fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
+ fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
+ fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
+ fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
+// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
+ fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
+ fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->chain_drop_ratio);
+ fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw);
+ fprintf(stderr, " -S skip mate rescue\n");
+ fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n");
+ fprintf(stderr, " -e discard full-length exact matches\n");
+ fprintf(stderr, " -A INT score for a sequence match, which scales [-TdBOELU] unless overridden [%d]\n", opt->a);
+ fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
+ fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins);
+ fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins);
+ fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3);
+ fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
fprintf(stderr, "\nInput/output options:\n\n");
- fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
- fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
+ fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
+ fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
fprintf(stderr, "\n");
- fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
- fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T);
- fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
- fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
- fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n");
+ fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
+ fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T);
+ fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
+ fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
+ fprintf(stderr, " -M mark shorter split hits as secondary\n\n");
+ fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n");
+ fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n");
+ fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n");
+ fprintf(stderr, " FR orientation only. [inferred]\n");
fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n");
fprintf(stderr, "\n");
free(opt);
return 1;
}
+ if (opt0.a == 1) { // matching score is changed
+ if (opt0.b != 1) opt->b *= opt->a;
+ if (opt0.T != 1) opt->T *= opt->a;
+ if (opt0.o_del != 1) opt->o_del *= opt->a;
+ if (opt0.e_del != 1) opt->e_del *= opt->a;
+ if (opt0.o_ins != 1) opt->o_ins *= opt->a;
+ if (opt0.e_ins != 1) opt->e_ins *= opt->a;
+ if (opt0.zdrop != 1) opt->zdrop *= opt->a;
+ if (opt0.pen_clip5 != 1) opt->pen_clip5 *= opt->a;
+ if (opt0.pen_clip3 != 1) opt->pen_clip3 *= opt->a;
+ if (opt0.pen_unpaired != 1) opt->pen_unpaired *= opt->a;
+ }
bwa_fill_scmat(opt->a, opt->b, opt->mat);
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
@@ -142,7 +193,7 @@ int main_mem(int argc, char *argv[])
for (i = 0; i < n; ++i) size += seqs[i].l_seq;
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size);
- mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, 0);
+ mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0);
n_processed += n;
for (i = 0; i < n; ++i) {
err_fputs(seqs[i].sam, stdout);
diff --git a/ksw.c b/ksw.c
index db018fa..bb055bb 100644
--- a/ksw.c
+++ b/ksw.c
@@ -107,11 +107,11 @@ kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t
return q;
}
-kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
+kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
{
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
uint64_t *b;
- __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax;
+ __m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax;
kswr_t r;
#define __max_16(ret, xx) do { \
@@ -128,8 +128,10 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape,
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
m_b = n_b = 0; b = 0;
zero = _mm_set1_epi32(0);
- gapoe = _mm_set1_epi8(_gapo + _gape);
- gape = _mm_set1_epi8(_gape);
+ oe_del = _mm_set1_epi8(_o_del + _e_del);
+ e_del = _mm_set1_epi8(_e_del);
+ oe_ins = _mm_set1_epi8(_o_ins + _e_ins);
+ e_ins = _mm_set1_epi8(_e_ins);
shift = _mm_set1_epi8(q->shift);
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
slen = q->slen;
@@ -141,7 +143,7 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape,
// the core loop
for (i = 0; i < tlen; ++i) {
int j, k, cmp, imax;
- __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+ __m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
for (j = 0; LIKELY(j < slen); ++j) {
@@ -159,13 +161,14 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape,
max = _mm_max_epu8(max, h); // set max
_mm_store_si128(H1 + j, h); // save to H'(i,j)
// now compute E'(i+1,j)
- h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo
- e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape
- e = _mm_max_epu8(e, h); // e=E'(i+1,j)
+ e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del
+ t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del
+ e = _mm_max_epu8(e, t); // e=E'(i+1,j)
_mm_store_si128(E + j, e); // save to E'(i+1,j)
// now compute F'(i,j+1)
- f = _mm_subs_epu8(f, gape);
- f = _mm_max_epu8(f, h);
+ f = _mm_subs_epu8(f, e_ins);
+ t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins
+ f = _mm_max_epu8(f, t);
// get H'(i-1,j) and prepare for the next j
h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
}
@@ -176,8 +179,8 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape,
h = _mm_load_si128(H1 + j);
h = _mm_max_epu8(h, f); // h=H'(i,j)
_mm_store_si128(H1 + j, h);
- h = _mm_subs_epu8(h, gapoe);
- f = _mm_subs_epu8(f, gape);
+ h = _mm_subs_epu8(h, oe_ins);
+ f = _mm_subs_epu8(f, e_ins);
cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
if (UNLIKELY(cmp == 0xffff)) goto end_loop16;
}
@@ -225,11 +228,11 @@ end_loop16:
return r;
}
-kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
+kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
{
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
uint64_t *b;
- __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
+ __m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax;
kswr_t r;
#define __max_8(ret, xx) do { \
@@ -245,8 +248,10 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape,
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
m_b = n_b = 0; b = 0;
zero = _mm_set1_epi32(0);
- gapoe = _mm_set1_epi16(_gapo + _gape);
- gape = _mm_set1_epi16(_gape);
+ oe_del = _mm_set1_epi16(_o_del + _e_del);
+ e_del = _mm_set1_epi16(_e_del);
+ oe_ins = _mm_set1_epi16(_o_ins + _e_ins);
+ e_ins = _mm_set1_epi16(_e_ins);
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
slen = q->slen;
for (i = 0; i < slen; ++i) {
@@ -257,7 +262,7 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape,
// the core loop
for (i = 0; i < tlen; ++i) {
int j, k, imax;
- __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+ __m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
h = _mm_slli_si128(h, 2);
for (j = 0; LIKELY(j < slen); ++j) {
@@ -267,12 +272,13 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape,
h = _mm_max_epi16(h, f);
max = _mm_max_epi16(max, h);
_mm_store_si128(H1 + j, h);
- h = _mm_subs_epu16(h, gapoe);
- e = _mm_subs_epu16(e, gape);
- e = _mm_max_epi16(e, h);
+ e = _mm_subs_epu16(e, e_del);
+ t = _mm_subs_epu16(h, oe_del);
+ e = _mm_max_epi16(e, t);
_mm_store_si128(E + j, e);
- f = _mm_subs_epu16(f, gape);
- f = _mm_max_epi16(f, h);
+ f = _mm_subs_epu16(f, e_ins);
+ t = _mm_subs_epu16(h, oe_ins);
+ f = _mm_max_epi16(f, t);
h = _mm_load_si128(H0 + j);
}
for (k = 0; LIKELY(k < 16); ++k) {
@@ -281,8 +287,8 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape,
h = _mm_load_si128(H1 + j);
h = _mm_max_epi16(h, f);
_mm_store_si128(H1 + j, h);
- h = _mm_subs_epu16(h, gapoe);
- f = _mm_subs_epu16(f, gape);
+ h = _mm_subs_epu16(h, oe_ins);
+ f = _mm_subs_epu16(f, e_ins);
if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
}
}
@@ -326,30 +332,30 @@ end_loop8:
return r;
}
-static void revseq(int l, uint8_t *s)
+static inline void revseq(int l, uint8_t *s)
{
int i, t;
for (i = 0; i < l>>1; ++i)
t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t;
}
-kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry)
+kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry)
{
int size;
kswq_t *q;
kswr_t r, rr;
- kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int);
+ kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int, int, int);
q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat);
if (qry && *qry == 0) *qry = q;
func = q->size == 2? ksw_i16 : ksw_u8;
size = q->size;
- r = func(q, tlen, target, gapo, gape, xtra);
+ r = func(q, tlen, target, o_del, e_del, o_ins, e_ins, xtra);
if (qry == 0) free(q);
if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r;
revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end
q = ksw_qinit(size, r.qe + 1, query, m, mat);
- rr = func(q, tlen, target, gapo, gape, KSW_XSTOP | r.score);
+ rr = func(q, tlen, target, o_del, e_del, o_ins, e_ins, KSW_XSTOP | r.score);
revseq(r.qe + 1, query); revseq(r.te + 1, target);
free(q);
if (r.score == rr.score)
@@ -357,6 +363,11 @@ kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, con
return r;
}
+kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry)
+{
+ return ksw_align2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, xtra, qry);
+}
+
/********************
*** SW extension ***
********************/
@@ -365,11 +376,11 @@ typedef struct {
int32_t h, e;
} eh_t;
-int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
+int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
{
eh_t *eh; // score array
int8_t *qp; // query profile
- int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore, max_off;
+ int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
if (h0 < 0) h0 = 0;
// allocate memory
qp = malloc(qlen * m);
@@ -380,25 +391,28 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
}
// fill the first row
- eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0;
- for (j = 2; j <= qlen && eh[j-1].h > gape; ++j)
- eh[j].h = eh[j-1].h - gape;
+ eh[0].h = h0; eh[1].h = h0 > oe_ins? h0 - oe_ins : 0;
+ for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j)
+ eh[j].h = eh[j-1].h - e_ins;
// adjust $w if it is too large
k = m * m;
for (i = 0, max = 0; i < k; ++i) // get the max score
max = max > mat[i]? max : mat[i];
- max_gap = (int)((double)(qlen * max + end_bonus - gapo) / gape + 1.);
- max_gap = max_gap > 1? max_gap : 1;
- w = w < max_gap? w : max_gap;
+ max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
+ max_ins = max_ins > 1? max_ins : 1;
+ w = w < max_ins? w : max_ins;
+ max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
+ max_del = max_del > 1? max_del : 1;
+ w = w < max_del? w : max_del; // TODO: is this necessary?
// DP loop
max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
max_off = 0;
beg = 0, end = qlen;
for (i = 0; LIKELY(i < tlen); ++i) {
- int f = 0, h1, m = 0, mj = -1;
+ int t, f = 0, h1, m = 0, mj = -1;
int8_t *q = &qp[target[i] * qlen];
// compute the first column
- h1 = h0 - (gapo + gape * (i + 1));
+ h1 = h0 - (o_del + e_del * (i + 1));
if (h1 < 0) h1 = 0;
// apply the band and the constraint (if provided)
if (beg < i - w) beg = i - w;
@@ -419,23 +433,31 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
h1 = h; // save H(i,j) to h1 for the next column
mj = m > h? mj : j; // record the position where max score is achieved
m = m > h? m : h; // m is stored at eh[mj+1]
- h -= gapoe;
- h = h > 0? h : 0;
- e -= gape;
- e = e > h? e : h; // computed E(i+1,j)
+ t = h - oe_del;
+ t = t > 0? t : 0;
+ e -= e_del;
+ e = e > t? e : t; // computed E(i+1,j)
p->e = e; // save E(i+1,j) for the next row
- f -= gape;
- f = f > h? f : h; // computed F(i,j+1)
+ t = h - oe_ins;
+ t = t > 0? t : 0;
+ f -= e_ins;
+ f = f > t? f : t; // computed F(i,j+1)
}
eh[end].h = h1; eh[end].e = 0;
if (j == qlen) {
max_ie = gscore > h1? max_ie : i;
gscore = gscore > h1? gscore : h1;
}
- if (m == 0 || (zdrop > 0 && max - m - abs((i - max_i) - (j - max_j)) * gape > zdrop)) break; // drop to zero, or below Z-dropoff
+ if (m == 0) break;
if (m > max) {
max = m, max_i = i, max_j = mj;
max_off = max_off > abs(mj - i)? max_off : abs(mj - i);
+ } else if (zdrop > 0) {
+ if (i - max_i > mj - max_j) {
+ if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break;
+ } else {
+ if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break;
+ }
}
// update beg and end for the next round
for (j = mj; j >= beg && eh[j].h; --j);
@@ -453,6 +475,11 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
return max;
}
+int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off)
+{
+ return ksw_extend2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, end_bonus, zdrop, h0, qle, tle, gtle, gscore, max_off);
+}
+
/********************
* Global alignment *
********************/
@@ -471,11 +498,11 @@ static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar,
return cigar;
}
-int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_)
+int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar_, uint32_t **cigar_)
{
eh_t *eh;
int8_t *qp; // query profile
- int i, j, k, gapoe = gapo + gape, score, n_col;
+ int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, score, n_col;
uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
if (n_cigar_) *n_cigar_ = 0;
// allocate memory
@@ -491,16 +518,16 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
// fill the first row
eh[0].h = 0; eh[0].e = MINUS_INF;
for (j = 1; j <= qlen && j <= w; ++j)
- eh[j].h = -(gapo + gape * j), eh[j].e = MINUS_INF;
+ eh[j].h = -(o_ins + e_ins * j), eh[j].e = MINUS_INF;
for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band
// DP loop
for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop
- int32_t f = MINUS_INF, h1, beg, end;
+ int32_t f = MINUS_INF, h1, beg, end, t;
int8_t *q = &qp[target[i] * qlen];
uint8_t *zi = &z[i * n_col];
beg = i > w? i - w : 0;
end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence
- h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF;
+ h1 = beg == 0? -(o_del + e_del * (i + 1)) : MINUS_INF;
for (j = beg; LIKELY(j < end); ++j) {
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
// Cells are computed in the following order:
@@ -522,14 +549,15 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
d = h >= f? d : 2;
h = h >= f? h : f;
h1 = h;
- m -= gapoe;
- e -= gape;
- d |= e > m? 1<<2 : 0;
- e = e > m? e : m;
+ t = m - oe_del;
+ e -= e_del;
+ d |= e > t? 1<<2 : 0;
+ e = e > t? e : t;
p->e = e;
- f -= gape;
- d |= f > m? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two
- f = f > m? f : m;
+ t = m - oe_ins;
+ f -= e_ins;
+ d |= f > t? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two
+ f = f > t? f : t;
zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
}
eh[end].h = h1; eh[end].e = MINUS_INF;
@@ -555,6 +583,11 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
return score;
}
+int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_)
+{
+ return ksw_global2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, n_cigar_, cigar_);
+}
+
/*******************************************
* Main function (not compiled by default) *
*******************************************/
diff --git a/ksw.h b/ksw.h
index 97559fd..5d45a67 100644
--- a/ksw.h
+++ b/ksw.h
@@ -61,6 +61,7 @@ extern "C" {
* query profile will be deallocated in ksw_align().
*/
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
+ kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry);
/**
* Banded global alignment
@@ -80,6 +81,7 @@ extern "C" {
* @return score of the alignment
*/
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar, uint32_t **cigar);
+ int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar, uint32_t **cigar);
/**
* Extend alignment
@@ -103,6 +105,7 @@ extern "C" {
* @return best semi-local alignment score
*/
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
+ int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
#ifdef __cplusplus
}
diff --git a/main.c b/main.c
index a8df9c0..0ae7978 100644
--- a/main.c
+++ b/main.c
@@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.7.7-r441"
+#define PACKAGE_VERSION "0.7.8-r455"
#endif
int bwa_fa2pac(int argc, char *argv[]);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/bwa.git
More information about the debian-med-commit
mailing list