[med-svn] [bwa] 01/02: Imported Upstream version 0.7.8

Sun Apr 13 07:00:33 UTC 2014

This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository bwa.

commit e513f4dfd9dc01da946b063b2b09f331944d0fb2
Author: Andreas Tille <tille at debian.org>
Date:   Sun Apr 13 08:52:51 2014 +0200

    Imported Upstream version 0.7.8
---
 Makefile      |   2 +-
 NEWS          |  32 +++++++++++++
 bwa.1         |  26 +++++++---
 bwa.c         |  32 +++++++++----
 bwa.h         |   2 +
 bwamem.c      |  97 +++++++++++++++++++++++++++-----------
 bwamem.h      |   5 +-
 bwamem_pair.c |  16 ++++---
 bwt.h         |   8 ++--
 bwt_lite.c    |   2 +-
 bwt_lite.h    |   6 +--
 fastmap.c     | 125 +++++++++++++++++++++++++++++++++---------------
 ksw.c         | 149 +++++++++++++++++++++++++++++++++++-----------------------
 ksw.h         |   3 ++
 main.c        |   2 +-
 15 files changed, 354 insertions(+), 153 deletions(-)

diff --git a/Makefile b/Makefile
index ff48a20..6490932 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 CC=			gcc
 #CC=			clang --analyze
-CFLAGS=		-g -Wall -O2
+CFLAGS=		-g -Wall -O2 -Wno-unused-function
 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
 AR=			ar
 DFLAGS=		-DHAVE_PTHREAD $(WRAP_MALLOC)
diff --git a/NEWS b/NEWS
index a7c64ed..40f4433 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,35 @@
+Release 0.7.8 (31 March, 2014)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Changes in BWA-MEM:
+
+ * Bugfix: off-diagonal X-dropoff (option -d) not working as intended.
+   Short-read alignment is not affected.
+
+ * Bugfix: unnecessarily large bandwidth used during global alignment,
+   which reduces the mapping speed by ~5% for short reads. Results are not
+   affected.
+
+ * Bugfix: when the matching score is not one, paired-end mapping quality is
+   inaccurate.
+
+ * When the matching score (option -A) is changed, scale all score-related
+   options accordingly unless overridden by users.
+
+ * Allow to specify different gap open (or extension) penalties for deletions
+   and insertions separately.
+
+ * Allow to specify the insert size distribution.
+
+ * Better and more detailed debugging information.
+
+With the default setting, 0.7.8 and 0.7.7 gave identical output on one million
+100bp read pairs.
+
+(0.7.8: 31 March 2014, r455)
+
+
+
 Release 0.7.7 (25 Feburary, 2014)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/bwa.1 b/bwa.1
index 601a529..b6354e5 100644
--- a/bwa.1
+++ b/bwa.1
@@ -1,4 +1,4 @@
-.TH bwa 1 "25 Feburary 2014" "bwa-0.7.7" "Bioinformatics tools"
+.TH bwa 1 "31 March 2014" "bwa-0.7.8" "Bioinformatics tools"
 .SH NAME
 .PP
 bwa - Burrows-Wheeler Alignment Tool
@@ -148,9 +148,10 @@ not work with split alignments. One may consider to use option
 .B -M
 to flag shorter split hits as secondary.
 
-.B OPTIONS:
 .RS
 .TP 10
+.B ALGORITHM OPTIONS:
+.TP
 .BI -t \ INT
 Number of threads [1]
 .TP
@@ -202,11 +203,14 @@ Matching score. [1]
 .BI -B \ INT
 Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]
 .TP
-.BI -O \ INT
-Gap open penalty. [6]
+.BI -O \ INT[,INT]
+Gap open penalty. If two numbers are specified, the first is the penalty of
+openning a deletion and the second for openning an insertion. [6]
 .TP
-.BI -E \ INT
-Gap extension penalty. A gap of length k costs O + k*E (i.e.
+.BI -E \ INT[,INT]
+Gap extension penalty. If two numbers are specified, the first is the penalty
+of extending a deletion and second for extending an insertion. A gap of length
+k costs O + k*E (i.e.
 .B -O
 is for opening a zero-length gap). [1]
 .TP
@@ -224,6 +228,9 @@ Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as
 and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these
 two scores to determine whether we should force pairing. A larger value leads to
 more aggressive read pair. [17]
+
+.TP
+.B INPUT/OUTPUT OPTIONS:
 .TP
 .B -p
 Assume the first input query file is interleaved paired-end FASTA/Q. See the
@@ -260,6 +267,13 @@ supported throughout BWA. Ideally, a value 0 for disabling all the output to
 stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for
 all normal messages; 4 or higher for debugging. When this option takes value
 4, the output is not SAM. [3]
+.TP
+.BI -I \ FLOAT[,FLOAT[,INT[,INT]]]
+Specify the mean, standard deviation (10% of the mean if absent), max (4 sigma
+from the mean if absent) and min (4 sigma if absent) of the insert size
+distribution. Only applicable to the FR orientation. By default, BWA-MEM infers
+these numbers and the pair orientations given enough reads. [inferred]
+
 .RE
 
 .TP
diff --git a/bwa.c b/bwa.c
index 140d57e..0e9e606 100644
--- a/bwa.c
+++ b/bwa.c
@@ -86,7 +86,7 @@ void bwa_fill_scmat(int a, int b, int8_t mat[25])
 }
 
 // Generate CIGAR when the alignment end points are known
-uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
+uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
 {
 	uint32_t *cigar = 0;
 	uint8_t tmp, *rseq;
@@ -106,24 +106,30 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
 			tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
 	}
 	if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP
+		// FIXME: due to an issue in mem_reg2aln(), we never come to this block. This does not affect accuracy, but it hurts performance.
 		cigar = malloc(4);
 		cigar[0] = l_query<<4 | 0;
 		*n_cigar = 1;
 		for (i = 0, *score = 0; i < l_query; ++i)
 			*score += mat[rseq[i]*5 + query[i]];
 	} else {
-		int w, max_gap, min_w;
-		//printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
-		//printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
+		int w, max_gap, max_ins, max_del, min_w;
 		// set the band-width
-		max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.);
+		max_ins = (int)((double)(((l_query+1)>>1) * mat[0] - o_ins) / e_ins + 1.);
+		max_del = (int)((double)(((l_query+1)>>1) * mat[0] - o_del) / e_del + 1.);
+		max_gap = max_ins > max_del? max_ins : max_del;
 		max_gap = max_gap > 1? max_gap : 1;
 		w = (max_gap + abs(rlen - l_query) + 1) >> 1;
 		w = w < w_? w : w_;
 		min_w = abs(rlen - l_query) + 3;
 		w = w > min_w? w : min_w;
 		// NW alignment
-		*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
+		if (bwa_verbose >= 4) {
+			printf("* Global bandwidth: %d\n", w);
+			printf("* Global ref:   "); for (i = 0; i < rlen; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
+			printf("* Global query: "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
+		}
+		*score = ksw_global2(l_query, query, rlen, rseq, 5, mat, o_del, e_del, o_ins, e_ins, w, n_cigar, &cigar);
 	}
 	{// compute NM and MD
 		int k, x, y, u, n_mm = 0, n_gap = 0;
@@ -165,7 +171,12 @@ ret_gen_cigar:
 	return cigar;
 }
 
-int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
+uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
+{
+	return bwa_gen_cigar2(mat, q, r, q, r, w_, l_pac, pac, l_query, query, rb, re, score, n_cigar, NM);
+}
+
+int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
 {
 	int is_rev;
 	int64_t cb, ce, fm;
@@ -184,7 +195,7 @@ int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns,
 		int64_t x;
 		cb = cb > *rb? cb : *rb;
 		ce = ce < *re? ce : *re;
-		cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM);
+		cigar = bwa_gen_cigar2(mat, o_del, e_del, o_ins, e_ins, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM);
 		for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) {
 			int op = cigar[i]&0xf, len = cigar[i]>>4;
 			if (op == 0) {
@@ -210,6 +221,11 @@ int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns,
 	return (*qb == *qe || *rb == *re)? -2 : 0;
 }
 
+int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
+{
+	return bwa_fix_xref2(mat, q, r, q, r, w, bns, pac, query, qb, qe, rb, re);
+}
+
 /*********************
  * Full index reader *
  *********************/
diff --git a/bwa.h b/bwa.h
index 9d5b2aa..8d46e58 100644
--- a/bwa.h
+++ b/bwa.h
@@ -32,7 +32,9 @@ extern "C" {
 
 	void bwa_fill_scmat(int a, int b, int8_t mat[25]);
 	uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
+	uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
 	int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
+	int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
 
 	char *bwa_idx_infer_prefix(const char *hint);
 	bwt_t *bwa_idx_load_bwt(const char *hint);
diff --git a/bwamem.c b/bwamem.c
index 19ca561..33b86ad 100644
--- a/bwamem.c
+++ b/bwamem.c
@@ -47,7 +47,10 @@ mem_opt_t *mem_opt_init()
 	mem_opt_t *o;
 	o = calloc(1, sizeof(mem_opt_t));
 	o->flag = 0;
-	o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
+	o->a = 1; o->b = 4;
+	o->o_del = o->o_ins = 6;
+	o->e_del = o->e_ins = 1;
+	o->w = 100;
 	o->T = 30;
 	o->zdrop = 100;
 	o->pen_unpaired = 17;
@@ -111,7 +114,7 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query)
 	itr->len = len;
 }
 
-const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
+const bwtintv_v *smem_next2(smem_i *itr, int split_len, int split_width, int start_width)
 {
 	int i, max, max_i, ori_start;
 	itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0;
@@ -119,7 +122,7 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
 	while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
 	if (itr->start == itr->len) return 0;
 	ori_start = itr->start;
-	itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM
+	itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, start_width, itr->matches, itr->tmpvec); // search for SMEM
 	if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
 	for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
 		bwtintv_t *p = &itr->matches->a[i];
@@ -152,6 +155,11 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
 	return itr->matches;
 }
 
+const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
+{
+	return smem_next2(itr, split_len, split_width, 1);
+}
+
 /********************************
  * Chaining while finding SMEMs *
  ********************************/
@@ -200,8 +208,9 @@ static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) *
 {
 	const bwtintv_v *a;
 	int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
+	int start_width = (opt->flag & MEM_F_NO_EXACT)? 2 : 1;
 	split_len = split_len < itr->len? split_len : itr->len;
-	while ((a = smem_next(itr, split_len, opt->split_width)) != 0) { // to find all SMEM and some internal MEM
+	while ((a = smem_next2(itr, split_len, opt->split_width, start_width)) != 0) { // to find all SMEM and some internal MEM
 		int i;
 		for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start
 			bwtintv_t *p = &a->a[i];
@@ -215,7 +224,7 @@ static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) *
 				s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
 				s.qbeg = p->info>>32;
 				s.len  = slen;
-				if (bwa_verbose >= 5) printf("SEED l=%d,qb=%d,rb=%ld\n", s.len, s.qbeg, (long)s.rbeg);
+				if (bwa_verbose >= 5) printf("* Found SEED: length=%d,query_beg=%d,ref_beg=%ld\n", s.len, s.qbeg, (long)s.rbeg);
 				if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip
 				if (kb_size(tree)) {
 					kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
@@ -257,14 +266,14 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
 	int i, j;
 	for (i = 0; i < chn->n; ++i) {
 		mem_chain_t *p = &chn->a[i];
-		err_printf("CHAIN(%d) n=%d w=%d", i, p->n, mem_chain_weight(p));
+		err_printf("* Found CHAIN(%d): n=%d; weight=%d", i, p->n, mem_chain_weight(p));
 		for (j = 0; j < p->n; ++j) {
 			bwtint_t pos;
 			int is_rev, ref_id;
 			pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev);
 			if (is_rev) pos -= p->seeds[j].len - 1;
 			bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id);
-			err_printf("\t%d,%d,%ld(%s:%c%ld)", p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
+			err_printf("\t%d;%d,%ld(%s:%c%ld)", p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
 		}
 		err_putchar('\n');
 	}
@@ -425,6 +434,13 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun)
 	return m;
 }
 
+int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int qlen)
+{
+	if (!(opt->flag & MEM_F_NO_EXACT) || n == 0 || a->truesc != qlen * opt->a) return n;
+	memmove(a, a + 1, (n - 1) * sizeof(mem_alnreg_t));
+	return n - 1;
+}
+
 void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) // IMPORTANT: must run mem_sort_and_dedup() before calling this function
 { // similar to the loop in mem_chain_flt()
 	int i, k, tmp;
@@ -433,7 +449,9 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i
 	kv_init(z);
 	for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i);
 	ks_introsort(mem_ars_hash, n, a);
-	tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r;
+	tmp = opt->a + opt->b;
+	tmp = opt->o_del + opt->e_del > tmp? opt->o_del + opt->e_del : tmp;
+	tmp = opt->o_ins + opt->e_ins > tmp? opt->o_ins + opt->e_ins : tmp;
 	kv_push(int, z, 0);
 	for (i = 1; i < n; ++i) {
 		for (k = 0; k < z.n; ++k) {
@@ -509,7 +527,7 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac,
 	rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
 	assert(rlen == re - rb);
 	xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
-	x = ksw_align(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->q, opt->r, xtra, 0);
+	x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0);
 	free(rseq);
 	if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1;
 
@@ -518,13 +536,15 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac,
 	a.score = x.score;
 	a.csub = x.score2;
 	kv_push(mem_alnreg_t, *av, a);
-	if (bwa_verbose >= 4) printf("chain2aln(short): [%d,%d) <=> [%ld,%ld)\n", a.qb, a.qe, (long)a.rb, (long)a.re);
+	if (bwa_verbose >= 4) printf("** Added alignment region via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld)\n", a.qb, a.qe, (long)a.rb, (long)a.re);
 	return 0;
 }
 
 static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
 {
-	int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.);
+	int l_del = (int)((double)(qlen * opt->a - opt->o_del) / opt->e_del + 1.);
+	int l_ins = (int)((double)(qlen * opt->a - opt->o_ins) / opt->e_ins + 1.);
+	int l = l_del > l_ins? l_del : l_ins;
 	l = l > 1? l : 1;
 	return l < opt->w<<1? l : opt->w<<1;
 }
@@ -584,7 +604,9 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
 			w = max_gap < opt->w? max_gap : opt->w;
 			if (qd - rd < w && rd - qd < w) break;
 		}
-		if (i < av->n) { // the seed is (almost) contained in an existing alignment
+		if (i < av->n) { // the seed is (almost) contained in an existing alignment; further testing is needed to confirm it is not leading to a different aln
+			if (bwa_verbose >= 4)
+				printf("** Seed(%d) [%ld;%ld,%ld] is almost contained in an existing alignment. Confirming whether extension is needed...\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg);
 			for (i = k + 1; i < c->n; ++i) { // check overlapping seeds in the same chain
 				const mem_seed_t *t;
 				if (srt[i] == 0) continue;
@@ -597,6 +619,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
 				srt[k] = 0; // mark that seed extension has not been performed
 				continue;
 			}
+			if (bwa_verbose >= 4)
+				printf("** Seed(%d) might lead to a different alignment even though it is contained. Extension will be performed.\n", k);
 		}
 
 		a = kv_pushp(mem_alnreg_t, *av);
@@ -604,7 +628,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
 		a->w = aw[0] = aw[1] = opt->w;
 		a->score = a->truesc = -1;
 
-		if (bwa_verbose >= 4) err_printf("Extending from seed [%ld,%ld,%ld]\n", (long)s->len, (long)s->qbeg, (long)s->rbeg);
+		if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg);
 		if (s->qbeg) { // left extension
 			uint8_t *rs, *qs;
 			int qle, tle, gtle, gscore;
@@ -616,8 +640,13 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
 			for (i = 0; i < MAX_BAND_TRY; ++i) {
 				int prev = a->score;
 				aw[0] = opt->w << i;
-				a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]);
-				if (bwa_verbose >= 4) { printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); }
+				if (bwa_verbose >= 4) {
+					int j;
+					printf("*** Left ref:   "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rs[j]]); putchar('\n');
+					printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)qs[j]]); putchar('\n');
+				}
+				a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]);
+				if (bwa_verbose >= 4) { printf("*** Left extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); }
 				if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break;
 			}
 			// check whether we prefer to reach the end of the query
@@ -639,8 +668,13 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
 			for (i = 0; i < MAX_BAND_TRY; ++i) {
 				int prev = a->score;
 				aw[1] = opt->w << i;
-				a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]);
-				if (bwa_verbose >= 4) { printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); }
+				if (bwa_verbose >= 4) {
+					int j;
+					printf("*** Right ref:   "); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[re+j]]); putchar('\n');
+					printf("*** Right query: "); for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[qe+j]]); putchar('\n');
+				}
+				a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]);
+				if (bwa_verbose >= 4) { printf("*** Right extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); }
 				if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break;
 			}
 			// similar to the above
@@ -652,7 +686,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
 				a->truesc += gscore - sc0;
 			}
 		} else a->qe = l_query, a->re = s->rbeg + s->len;
-		if (bwa_verbose >= 4) { printf("[%d]\taw={%d,%d}\tscore=%d\t[%d,%d) <=> [%ld,%ld)\n", k, aw[0], aw[1], a->score, a->qb, a->qe, (long)a->rb, (long)a->re); fflush(stdout); }
+		if (bwa_verbose >= 4) printf("*** Added alignment region: [%d,%d) <=> [%ld,%ld); score=%d; {left,right}_bandwidth={%d,%d}\n", a->qb, a->qe, (long)a->rb, (long)a->re, a->score, aw[0], aw[1]);
 
 		// compute seedcov
 		for (i = 0, a->seedcov = 0; i < c->n; ++i) {
@@ -673,7 +707,7 @@ static inline int infer_bw(int l1, int l2, int score, int a, int q, int r)
 {
 	int w;
 	if (l1 == l2 && l1 * a - score < (q + r - a)<<1) return 0; // to get equal alignment length, we need at least two gaps
-	w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 1.);
+	w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 2.);
 	if (w < abs(l1 - l2)) w = abs(l1 - l2);
 	return w;
 }
@@ -887,13 +921,15 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
 	for (i = 0; i < chn.n; ++i) {
 		mem_chain_t *p = &chn.a[i];
 		int ret;
-		if (bwa_verbose >= 4) err_printf("===> Processing chain(%d) <===\n", i);
+		if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i);
 		ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs);
 		if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs);
 		free(chn.a[i].seeds);
 	}
 	free(chn.a);
 	regs.n = mem_sort_and_dedup(regs.n, regs.a, opt->mask_level_redun);
+	if (opt->flag & MEM_F_NO_EXACT)
+		regs.n = mem_test_and_remove_exact(opt, regs.n, regs.a, l_seq);
 	return regs;
 }
 
@@ -913,7 +949,7 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *
 mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
 {
 	mem_aln_t a;
-	int i, w2, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD;
+	int i, w2, tmp, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD;
 	int64_t pos, rb, re;
 	uint8_t *query;
 
@@ -929,19 +965,21 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
 		query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]];
 	a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0;
 	if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment
-	if (bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) {
+	if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) {
 		fprintf(stderr, "[E::%s] If you see this message, please let the developer know. Abort. Sorry.\n", __func__);
 		exit(1);
 	}
-	w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->q, opt->r);
-	if (bwa_verbose >= 4) printf("Band width: infer=%d, opt=%d, alnreg=%d\n", w2, opt->w, ar->w);
+	tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del);
+	w2  = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins);
+	w2 = w2 > tmp? w2 : tmp;
+	if (bwa_verbose >= 4) printf("* Band width: inferred=%d, cmd_opt=%d, alnreg=%d\n", w2, opt->w, ar->w);
 	if (w2 > opt->w) w2 = w2 < ar->w? w2 : ar->w;
-	else w2 = opt->w;
+//	else w2 = opt->w; // TODO: check if we need this line on long reads. On 1-800bp reads, it does not matter and it should be.
 	i = 0; a.cigar = 0;
 	do {
 		free(a.cigar);
-		a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM);
-		if (bwa_verbose >= 4) printf("Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc);
+		a.cigar = bwa_gen_cigar2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM);
+		if (bwa_verbose >= 4) printf("* Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc);
 		if (score == last_sc) break; // it is possible that global alignment and local alignment give different scores
 		last_sc = score;
 		w2 <<= 1;
@@ -997,9 +1035,12 @@ static void worker1(void *data, int i, int tid)
 {
 	worker_t *w = (worker_t*)data;
 	if (!(w->opt->flag&MEM_F_PE)) {
+		if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name);
 		w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
 	} else {
+		if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name);
 		w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
+		if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name);
 		w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
 	}
 }
@@ -1009,10 +1050,12 @@ static void worker2(void *data, int i, int tid)
 	extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]);
 	worker_t *w = (worker_t*)data;
 	if (!(w->opt->flag&MEM_F_PE)) {
+		if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name);
 		mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
 		mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
 		free(w->regs[i].a);
 	} else {
+		if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name);
 		mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]);
 		free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a);
 	}
diff --git a/bwamem.h b/bwamem.h
index 8b24c51..5291491 100644
--- a/bwamem.h
+++ b/bwamem.h
@@ -16,9 +16,12 @@ typedef struct __smem_i smem_i;
 #define MEM_F_ALL       0x8
 #define MEM_F_NO_MULTI  0x10
 #define MEM_F_NO_RESCUE 0x20
+#define MEM_F_NO_EXACT  0x40
 
 typedef struct {
-	int a, b, q, r;         // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
+	int a, b;               // match score and mismatch penalty
+	int o_del, e_del;
+	int o_ins, e_ins;
 	int pen_unpaired;       // phred-scaled penalty for unpaired reads
 	int pen_clip5,pen_clip3;// clipping penalty. This score is not deducted from the DP score.
 	int w;                  // band width
diff --git a/bwamem_pair.c b/bwamem_pair.c
index f1aa73a..4a7cdf3 100644
--- a/bwamem_pair.c
+++ b/bwamem_pair.c
@@ -144,8 +144,8 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me
 		if (len == re - rb) { // no funny things happening
 			kswr_t aln;
 			mem_alnreg_t b;
-			int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | opt->min_seed_len;
-			aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0);
+			int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
+			aln = ksw_align2(l_ms, seq, len, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0);
 			memset(&b, 0, sizeof(mem_alnreg_t));
 			if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0
 				b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb;                                                                                                                                                                              
@@ -219,7 +219,9 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_
 		y[v.a[i].y&3] = i;
 	}
 	if (u.n) { // found at least one proper pair
-		int tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r;
+		int tmp = opt->a + opt->b;
+		tmp = tmp > opt->o_del + opt->e_del? tmp : opt->o_del + opt->e_del;
+		tmp = tmp > opt->o_ins + opt->e_ins? tmp : opt->o_ins + opt->e_ins;
 		ks_introsort_128(u.n, u.a);
 		i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32;
 		z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair
@@ -233,6 +235,8 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_
 	return ret;
 }
 
+#define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499))
+
 int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2])
 {
 	extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
@@ -274,7 +278,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
 		score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired;
 		//q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0;
 		subo = subo > score_un? subo : score_un;
-		q_pe = (o - subo) * 6;
+		q_pe = raw_mapq(o - subo, opt->a);
 		if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499);
 		if (q_pe < 0) q_pe = 0;
 		if (q_pe > 60) q_pe = 60;
@@ -291,8 +295,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
 			q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40;
 			extra_flag |= 2;
 			// cap at the tandem repeat score
-			q_se[0] = q_se[0] < (c[0]->score - c[0]->csub) * 6? q_se[0] : (c[0]->score - c[0]->csub) * 6;
-			q_se[1] = q_se[1] < (c[1]->score - c[1]->csub) * 6? q_se[1] : (c[1]->score - c[1]->csub) * 6;
+			q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a)? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a);
+			q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a)? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a);
 		} else { // the unpaired alignment is preferred
 			z[0] = z[1] = 0;
 			q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);
diff --git a/bwt.h b/bwt.h
index c36bf9b..d2ff0ac 100644
--- a/bwt.h
+++ b/bwt.h
@@ -96,14 +96,14 @@ extern "C" {
 
 	void bwt_bwtupdate_core(bwt_t *bwt);
 
-	inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c);
-	inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
+	bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c);
+	void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
 	bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k);
 
 	// more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
 	void bwt_gen_cnt_table(bwt_t *bwt);
-	inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol);
-	inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]);
+	void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol);
+	void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]);
 
 	int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end);
 	int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0);
diff --git a/bwt_lite.c b/bwt_lite.c
index 6cd3b1d..9b47270 100644
--- a/bwt_lite.c
+++ b/bwt_lite.c
@@ -56,7 +56,7 @@ bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq)
 	}
 	return b;
 }
-inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c)
+uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c)
 {
 	uint32_t n, b;
 	if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
diff --git a/bwt_lite.h b/bwt_lite.h
index 0096b93..4fadcce 100644
--- a/bwt_lite.h
+++ b/bwt_lite.h
@@ -17,9 +17,9 @@ extern "C" {
 #endif
 
 	bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq);
-	inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c);
-	inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]);
-	inline void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]);
+	uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c);
+	void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]);
+	void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]);
 	void bwtl_destroy(bwtl_t *bwt);
 
 #ifdef __cplusplus
diff --git a/fastmap.c b/fastmap.c
index 72d850c..093fb7b 100644
--- a/fastmap.c
+++ b/fastmap.c
@@ -2,6 +2,7 @@
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
+#include <string.h>
 #include <ctype.h>
 #include <math.h>
 #include "bwa.h"
@@ -19,50 +20,83 @@ int kclose(void *a);
 
 int main_mem(int argc, char *argv[])
 {
-	mem_opt_t *opt;
+	mem_opt_t *opt, opt0;
 	int fd, fd2, i, c, n, copy_comment = 0;
 	gzFile fp, fp2 = 0;
 	kseq_t *ks, *ks2 = 0;
 	bseq1_t *seqs;
 	bwaidx_t *idx;
-	char *rg_line = 0;
+	char *p, *rg_line = 0;
 	void *ko = 0, *ko2 = 0;
 	int64_t n_processed = 0;
+	mem_pestat_t pes[4], *pes0 = 0;
+
+	memset(pes, 0, 4 * sizeof(mem_pestat_t));
+	for (i = 0; i < 4; ++i) pes[i].failed = 1;
 
 	opt = mem_opt_init();
-	while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:")) >= 0) {
+	opt0.a = opt0.b = opt0.o_del = opt0.e_del = opt0.o_ins = opt0.e_ins = opt0.pen_unpaired = -1;
+	opt0.pen_clip5 = opt0.pen_clip3 = opt0.zdrop = opt0.T = -1;
+	while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:")) >= 0) {
 		if (c == 'k') opt->min_seed_len = atoi(optarg);
 		else if (c == 'w') opt->w = atoi(optarg);
-		else if (c == 'A') opt->a = atoi(optarg);
-		else if (c == 'B') opt->b = atoi(optarg);
-		else if (c == 'O') opt->q = atoi(optarg);
-		else if (c == 'E') opt->r = atoi(optarg);
-		else if (c == 'T') opt->T = atoi(optarg);
-		else if (c == 'U') opt->pen_unpaired = atoi(optarg);
+		else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1;
+		else if (c == 'B') opt->b = atoi(optarg), opt0.b = 1;
+		else if (c == 'T') opt->T = atoi(optarg), opt0.T = 1;
+		else if (c == 'U') opt->pen_unpaired = atoi(optarg), opt0.pen_unpaired = 1;
 		else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
 		else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
 		else if (c == 'a') opt->flag |= MEM_F_ALL;
 		else if (c == 'p') opt->flag |= MEM_F_PE;
 		else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
 		else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
+		else if (c == 'e') opt->flag |= MEM_F_NO_EXACT;
 		else if (c == 'c') opt->max_occ = atoi(optarg);
-		else if (c == 'd') opt->zdrop = atoi(optarg);
+		else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1;
 		else if (c == 'v') bwa_verbose = atoi(optarg);
 		else if (c == 'r') opt->split_factor = atof(optarg);
 		else if (c == 'D') opt->chain_drop_ratio = atof(optarg);
 		else if (c == 'm') opt->max_matesw = atoi(optarg);
+		else if (c == 's') opt->split_width = atoi(optarg);
 		else if (c == 'C') copy_comment = 1;
 		else if (c == 'Q') {
 			opt->mapQ_coef_len = atoi(optarg);
 			opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? log(opt->mapQ_coef_len) : 0;
+		} else if (c == 'O') {
+			opt0.o_del = opt0.o_ins = 1;
+			opt->o_del = opt->o_ins = strtol(optarg, &p, 10);
+			if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+				opt->o_ins = strtol(p+1, &p, 10);
+		} else if (c == 'E') {
+			opt0.e_del = opt0.e_ins = 1;
+			opt->e_del = opt->e_ins = strtol(optarg, &p, 10);
+			if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+				opt->e_ins = strtol(p+1, &p, 10);
 		} else if (c == 'L') {
-			char *p;
+			opt0.pen_clip5 = opt0.pen_clip3 = 1;
 			opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10);
 			if (*p != 0 && ispunct(*p) && isdigit(p[1]))
 				opt->pen_clip3 = strtol(p+1, &p, 10);
 		} else if (c == 'R') {
 			if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak
-		} else if (c == 's') opt->split_width = atoi(optarg);
+		} else if (c == 'I') { // specify the insert size distribution
+			pes0 = pes;
+			pes[1].failed = 0;
+			pes[1].avg = strtod(optarg, &p);
+			pes[1].std = pes[1].avg * .1;
+			if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+				pes[1].std = strtod(p+1, &p);
+			pes[1].high = (int)(pes[1].avg + 4. * pes[1].std + .499);
+			pes[1].low  = (int)(pes[1].avg - 4. * pes[1].std + .499);
+			if (pes[1].low < 1) pes[1].low = 1;
+			if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+				pes[1].high = (int)(strtod(p+1, &p) + .499);
+			if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+				pes[1].low  = (int)(strtod(p+1, &p) + .499);
+			if (bwa_verbose >= 3)
+				fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n",
+						__func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low);
+		}
 		else return 1;
 	}
 	if (opt->n_threads < 1) opt->n_threads = 1;
@@ -70,38 +104,55 @@ int main_mem(int argc, char *argv[])
 		fprintf(stderr, "\n");
 		fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
 		fprintf(stderr, "Algorithm options:\n\n");
-		fprintf(stderr, "       -t INT     number of threads [%d]\n", opt->n_threads);
-		fprintf(stderr, "       -k INT     minimum seed length [%d]\n", opt->min_seed_len);
-		fprintf(stderr, "       -w INT     band width for banded alignment [%d]\n", opt->w);
-		fprintf(stderr, "       -d INT     off-diagonal X-dropoff [%d]\n", opt->zdrop);
-		fprintf(stderr, "       -r FLOAT   look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
-//		fprintf(stderr, "       -s INT     look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
-		fprintf(stderr, "       -c INT     skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
-		fprintf(stderr, "       -D FLOAT   drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->chain_drop_ratio);
-		fprintf(stderr, "       -m INT     perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw);
-		fprintf(stderr, "       -S         skip mate rescue\n");
-		fprintf(stderr, "       -P         skip pairing; mate rescue performed unless -S also in use\n");
-		fprintf(stderr, "       -A INT     score for a sequence match [%d]\n", opt->a);
-		fprintf(stderr, "       -B INT     penalty for a mismatch [%d]\n", opt->b);
-		fprintf(stderr, "       -O INT     gap open penalty [%d]\n", opt->q);
-		fprintf(stderr, "       -E INT     gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r);
-		fprintf(stderr, "       -L INT     penalty for clipping [%d]\n", opt->pen_clip5);
-		fprintf(stderr, "       -U INT     penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
+		fprintf(stderr, "       -t INT        number of threads [%d]\n", opt->n_threads);
+		fprintf(stderr, "       -k INT        minimum seed length [%d]\n", opt->min_seed_len);
+		fprintf(stderr, "       -w INT        band width for banded alignment [%d]\n", opt->w);
+		fprintf(stderr, "       -d INT        off-diagonal X-dropoff [%d]\n", opt->zdrop);
+		fprintf(stderr, "       -r FLOAT      look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
+//		fprintf(stderr, "       -s INT        look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
+		fprintf(stderr, "       -c INT        skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
+		fprintf(stderr, "       -D FLOAT      drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->chain_drop_ratio);
+		fprintf(stderr, "       -m INT        perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw);
+		fprintf(stderr, "       -S            skip mate rescue\n");
+		fprintf(stderr, "       -P            skip pairing; mate rescue performed unless -S also in use\n");
+		fprintf(stderr, "       -e            discard full-length exact matches\n");
+		fprintf(stderr, "       -A INT        score for a sequence match, which scales [-TdBOELU] unless overridden [%d]\n", opt->a);
+		fprintf(stderr, "       -B INT        penalty for a mismatch [%d]\n", opt->b);
+		fprintf(stderr, "       -O INT[,INT]  gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins);
+		fprintf(stderr, "       -E INT[,INT]  gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins);
+		fprintf(stderr, "       -L INT[,INT]  penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3);
+		fprintf(stderr, "       -U INT        penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
 		fprintf(stderr, "\nInput/output options:\n\n");
-		fprintf(stderr, "       -p         first query file consists of interleaved paired-end sequences\n");
-		fprintf(stderr, "       -R STR     read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
+		fprintf(stderr, "       -p            first query file consists of interleaved paired-end sequences\n");
+		fprintf(stderr, "       -R STR        read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
 		fprintf(stderr, "\n");
-		fprintf(stderr, "       -v INT     verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
-		fprintf(stderr, "       -T INT     minimum score to output [%d]\n", opt->T);
-		fprintf(stderr, "       -a         output all alignments for SE or unpaired PE\n");
-		fprintf(stderr, "       -C         append FASTA/FASTQ comment to SAM output\n");
-		fprintf(stderr, "       -M         mark shorter split hits as secondary (for Picard/GATK compatibility)\n");
+		fprintf(stderr, "       -v INT        verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
+		fprintf(stderr, "       -T INT        minimum score to output [%d]\n", opt->T);
+		fprintf(stderr, "       -a            output all alignments for SE or unpaired PE\n");
+		fprintf(stderr, "       -C            append FASTA/FASTQ comment to SAM output\n");
+		fprintf(stderr, "       -M            mark shorter split hits as secondary\n\n");
+		fprintf(stderr, "       -I FLOAT[,FLOAT[,INT[,INT]]]\n");
+		fprintf(stderr, "                     specify the mean, standard deviation (10%% of the mean if absent), max\n");
+		fprintf(stderr, "                     (4 sigma from the mean if absent) and min of the insert size distribution.\n");
+		fprintf(stderr, "                     FR orientation only. [inferred]\n");
 		fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n");
 		fprintf(stderr, "\n");
 		free(opt);
 		return 1;
 	}
 
+	if (opt0.a == 1) { // matching score is changed
+		if (opt0.b != 1) opt->b *= opt->a;
+		if (opt0.T != 1) opt->T *= opt->a;
+		if (opt0.o_del != 1) opt->o_del *= opt->a;
+		if (opt0.e_del != 1) opt->e_del *= opt->a;
+		if (opt0.o_ins != 1) opt->o_ins *= opt->a;
+		if (opt0.e_ins != 1) opt->e_ins *= opt->a;
+		if (opt0.zdrop != 1) opt->zdrop *= opt->a;
+		if (opt0.pen_clip5 != 1) opt->pen_clip5 *= opt->a;
+		if (opt0.pen_clip3 != 1) opt->pen_clip3 *= opt->a;
+		if (opt0.pen_unpaired != 1) opt->pen_unpaired *= opt->a;
+	}
 	bwa_fill_scmat(opt->a, opt->b, opt->mat);
 	if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
 
@@ -142,7 +193,7 @@ int main_mem(int argc, char *argv[])
 		for (i = 0; i < n; ++i) size += seqs[i].l_seq;
 		if (bwa_verbose >= 3)
 			fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size);
-		mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, 0);
+		mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0);
 		n_processed += n;
 		for (i = 0; i < n; ++i) {
 			err_fputs(seqs[i].sam, stdout);
diff --git a/ksw.c b/ksw.c
index db018fa..bb055bb 100644
--- a/ksw.c
+++ b/ksw.c
@@ -107,11 +107,11 @@ kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t
 	return q;
 }
 
-kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
+kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
 {
 	int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
 	uint64_t *b;
-	__m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax;
+	__m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax;
 	kswr_t r;
 
 #define __max_16(ret, xx) do { \
@@ -128,8 +128,10 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape,
 	endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
 	m_b = n_b = 0; b = 0;
 	zero = _mm_set1_epi32(0);
-	gapoe = _mm_set1_epi8(_gapo + _gape);
-	gape = _mm_set1_epi8(_gape);
+	oe_del = _mm_set1_epi8(_o_del + _e_del);
+	e_del = _mm_set1_epi8(_e_del);
+	oe_ins = _mm_set1_epi8(_o_ins + _e_ins);
+	e_ins = _mm_set1_epi8(_e_ins);
 	shift = _mm_set1_epi8(q->shift);
 	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
 	slen = q->slen;
@@ -141,7 +143,7 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape,
 	// the core loop
 	for (i = 0; i < tlen; ++i) {
 		int j, k, cmp, imax;
-		__m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+		__m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
 		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
 		h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
 		for (j = 0; LIKELY(j < slen); ++j) {
@@ -159,13 +161,14 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape,
 			max = _mm_max_epu8(max, h); // set max
 			_mm_store_si128(H1 + j, h); // save to H'(i,j)
 			// now compute E'(i+1,j)
-			h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo
-			e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape
-			e = _mm_max_epu8(e, h); // e=E'(i+1,j)
+			e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del
+			t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del
+			e = _mm_max_epu8(e, t); // e=E'(i+1,j)
 			_mm_store_si128(E + j, e); // save to E'(i+1,j)
 			// now compute F'(i,j+1)
-			f = _mm_subs_epu8(f, gape);
-			f = _mm_max_epu8(f, h);
+			f = _mm_subs_epu8(f, e_ins);
+			t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins
+			f = _mm_max_epu8(f, t);
 			// get H'(i-1,j) and prepare for the next j
 			h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
 		}
@@ -176,8 +179,8 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape,
 				h = _mm_load_si128(H1 + j);
 				h = _mm_max_epu8(h, f); // h=H'(i,j)
 				_mm_store_si128(H1 + j, h);
-				h = _mm_subs_epu8(h, gapoe);
-				f = _mm_subs_epu8(f, gape);
+				h = _mm_subs_epu8(h, oe_ins);
+				f = _mm_subs_epu8(f, e_ins);
 				cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
 				if (UNLIKELY(cmp == 0xffff)) goto end_loop16;
 			}
@@ -225,11 +228,11 @@ end_loop16:
 	return r;
 }
 
-kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
+kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
 {
 	int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
 	uint64_t *b;
-	__m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
+	__m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax;
 	kswr_t r;
 
 #define __max_8(ret, xx) do { \
@@ -245,8 +248,10 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape,
 	endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
 	m_b = n_b = 0; b = 0;
 	zero = _mm_set1_epi32(0);
-	gapoe = _mm_set1_epi16(_gapo + _gape);
-	gape = _mm_set1_epi16(_gape);
+	oe_del = _mm_set1_epi16(_o_del + _e_del);
+	e_del = _mm_set1_epi16(_e_del);
+	oe_ins = _mm_set1_epi16(_o_ins + _e_ins);
+	e_ins = _mm_set1_epi16(_e_ins);
 	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
 	slen = q->slen;
 	for (i = 0; i < slen; ++i) {
@@ -257,7 +262,7 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape,
 	// the core loop
 	for (i = 0; i < tlen; ++i) {
 		int j, k, imax;
-		__m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+		__m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
 		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
 		h = _mm_slli_si128(h, 2);
 		for (j = 0; LIKELY(j < slen); ++j) {
@@ -267,12 +272,13 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape,
 			h = _mm_max_epi16(h, f);
 			max = _mm_max_epi16(max, h);
 			_mm_store_si128(H1 + j, h);
-			h = _mm_subs_epu16(h, gapoe);
-			e = _mm_subs_epu16(e, gape);
-			e = _mm_max_epi16(e, h);
+			e = _mm_subs_epu16(e, e_del);
+			t = _mm_subs_epu16(h, oe_del);
+			e = _mm_max_epi16(e, t);
 			_mm_store_si128(E + j, e);
-			f = _mm_subs_epu16(f, gape);
-			f = _mm_max_epi16(f, h);
+			f = _mm_subs_epu16(f, e_ins);
+			t = _mm_subs_epu16(h, oe_ins);
+			f = _mm_max_epi16(f, t);
 			h = _mm_load_si128(H0 + j);
 		}
 		for (k = 0; LIKELY(k < 16); ++k) {
@@ -281,8 +287,8 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape,
 				h = _mm_load_si128(H1 + j);
 				h = _mm_max_epi16(h, f);
 				_mm_store_si128(H1 + j, h);
-				h = _mm_subs_epu16(h, gapoe);
-				f = _mm_subs_epu16(f, gape);
+				h = _mm_subs_epu16(h, oe_ins);
+				f = _mm_subs_epu16(f, e_ins);
 				if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
 			}
 		}
@@ -326,30 +332,30 @@ end_loop8:
 	return r;
 }
 
-static void revseq(int l, uint8_t *s)
+static inline void revseq(int l, uint8_t *s)
 {
 	int i, t;
 	for (i = 0; i < l>>1; ++i)
 		t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t;
 }
 
-kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry)
+kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry)
 {
 	int size;
 	kswq_t *q;
 	kswr_t r, rr;
-	kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int);
+	kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int, int, int);
 
 	q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat);
 	if (qry && *qry == 0) *qry = q;
 	func = q->size == 2? ksw_i16 : ksw_u8;
 	size = q->size;
-	r = func(q, tlen, target, gapo, gape, xtra);
+	r = func(q, tlen, target, o_del, e_del, o_ins, e_ins, xtra);
 	if (qry == 0) free(q);
 	if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r;
 	revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end
 	q = ksw_qinit(size, r.qe + 1, query, m, mat);
-	rr = func(q, tlen, target, gapo, gape, KSW_XSTOP | r.score);
+	rr = func(q, tlen, target, o_del, e_del, o_ins, e_ins, KSW_XSTOP | r.score);
 	revseq(r.qe + 1, query); revseq(r.te + 1, target);
 	free(q);
 	if (r.score == rr.score)
@@ -357,6 +363,11 @@ kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, con
 	return r;
 }
 
+kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry)
+{
+	return ksw_align2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, xtra, qry);
+}
+
 /********************
  *** SW extension ***
  ********************/
@@ -365,11 +376,11 @@ typedef struct {
 	int32_t h, e;
 } eh_t;
 
-int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
+int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
 {
 	eh_t *eh; // score array
 	int8_t *qp; // query profile
-	int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore, max_off;
+	int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
 	if (h0 < 0) h0 = 0;
 	// allocate memory
 	qp = malloc(qlen * m);
@@ -380,25 +391,28 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
 		for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
 	}
 	// fill the first row
-	eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0;
-	for (j = 2; j <= qlen && eh[j-1].h > gape; ++j)
-		eh[j].h = eh[j-1].h - gape;
+	eh[0].h = h0; eh[1].h = h0 > oe_ins? h0 - oe_ins : 0;
+	for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j)
+		eh[j].h = eh[j-1].h - e_ins;
 	// adjust $w if it is too large
 	k = m * m;
 	for (i = 0, max = 0; i < k; ++i) // get the max score
 		max = max > mat[i]? max : mat[i];
-	max_gap = (int)((double)(qlen * max + end_bonus - gapo) / gape + 1.);
-	max_gap = max_gap > 1? max_gap : 1;
-	w = w < max_gap? w : max_gap;
+	max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
+	max_ins = max_ins > 1? max_ins : 1;
+	w = w < max_ins? w : max_ins;
+	max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
+	max_del = max_del > 1? max_del : 1;
+	w = w < max_del? w : max_del; // TODO: is this necessary?
 	// DP loop
 	max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
 	max_off = 0;
 	beg = 0, end = qlen;
 	for (i = 0; LIKELY(i < tlen); ++i) {
-		int f = 0, h1, m = 0, mj = -1;
+		int t, f = 0, h1, m = 0, mj = -1;
 		int8_t *q = &qp[target[i] * qlen];
 		// compute the first column
-		h1 = h0 - (gapo + gape * (i + 1));
+		h1 = h0 - (o_del + e_del * (i + 1));
 		if (h1 < 0) h1 = 0;
 		// apply the band and the constraint (if provided)
 		if (beg < i - w) beg = i - w;
@@ -419,23 +433,31 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
 			h1 = h;             // save H(i,j) to h1 for the next column
 			mj = m > h? mj : j; // record the position where max score is achieved
 			m = m > h? m : h;   // m is stored at eh[mj+1]
-			h -= gapoe;
-			h = h > 0? h : 0;
-			e -= gape;
-			e = e > h? e : h;   // computed E(i+1,j)
+			t = h - oe_del;
+			t = t > 0? t : 0;
+			e -= e_del;
+			e = e > t? e : t;   // computed E(i+1,j)
 			p->e = e;           // save E(i+1,j) for the next row
-			f -= gape;
-			f = f > h? f : h;   // computed F(i,j+1)
+			t = h - oe_ins;
+			t = t > 0? t : 0;
+			f -= e_ins;
+			f = f > t? f : t;   // computed F(i,j+1)
 		}
 		eh[end].h = h1; eh[end].e = 0;
 		if (j == qlen) {
 			max_ie = gscore > h1? max_ie : i;
 			gscore = gscore > h1? gscore : h1;
 		}
-		if (m == 0 || (zdrop > 0 && max - m - abs((i - max_i) - (j - max_j)) * gape > zdrop)) break; // drop to zero, or below Z-dropoff
+		if (m == 0) break;
 		if (m > max) {
 			max = m, max_i = i, max_j = mj;
 			max_off = max_off > abs(mj - i)? max_off : abs(mj - i);
+		} else if (zdrop > 0) {
+			if (i - max_i > mj - max_j) {
+				if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break;
+			} else {
+				if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break;
+			}
 		}
 		// update beg and end for the next round
 		for (j = mj; j >= beg && eh[j].h; --j);
@@ -453,6 +475,11 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
 	return max;
 }
 
+int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off)
+{
+	return ksw_extend2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, end_bonus, zdrop, h0, qle, tle, gtle, gscore, max_off);
+}
+
 /********************
  * Global alignment *
  ********************/
@@ -471,11 +498,11 @@ static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar,
 	return cigar;
 }
 
-int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_)
+int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar_, uint32_t **cigar_)
 {
 	eh_t *eh;
 	int8_t *qp; // query profile
-	int i, j, k, gapoe = gapo + gape, score, n_col;
+	int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, score, n_col;
 	uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
 	if (n_cigar_) *n_cigar_ = 0;
 	// allocate memory
@@ -491,16 +518,16 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
 	// fill the first row
 	eh[0].h = 0; eh[0].e = MINUS_INF;
 	for (j = 1; j <= qlen && j <= w; ++j)
-		eh[j].h = -(gapo + gape * j), eh[j].e = MINUS_INF;
+		eh[j].h = -(o_ins + e_ins * j), eh[j].e = MINUS_INF;
 	for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band
 	// DP loop
 	for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop
-		int32_t f = MINUS_INF, h1, beg, end;
+		int32_t f = MINUS_INF, h1, beg, end, t;
 		int8_t *q = &qp[target[i] * qlen];
 		uint8_t *zi = &z[i * n_col];
 		beg = i > w? i - w : 0;
 		end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence
-		h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF;
+		h1 = beg == 0? -(o_del + e_del * (i + 1)) : MINUS_INF;
 		for (j = beg; LIKELY(j < end); ++j) {
 			// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
 			// Cells are computed in the following order:
@@ -522,14 +549,15 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
 			d = h >= f? d : 2;
 			h = h >= f? h : f;
 			h1 = h;
-			m -= gapoe;
-			e -= gape;
-			d |= e > m? 1<<2 : 0;
-			e  = e > m? e    : m;
+			t = m - oe_del;
+			e -= e_del;
+			d |= e > t? 1<<2 : 0;
+			e  = e > t? e    : t;
 			p->e = e;
-			f -= gape;
-			d |= f > m? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two
-			f  = f > m? f    : m;
+			t = m - oe_ins;
+			f -= e_ins;
+			d |= f > t? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two
+			f  = f > t? f    : t;
 			zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
 		}
 		eh[end].h = h1; eh[end].e = MINUS_INF;
@@ -555,6 +583,11 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
 	return score;
 }
 
+int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_)
+{
+	return ksw_global2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, n_cigar_, cigar_);
+}
+
 /*******************************************
  * Main function (not compiled by default) *
  *******************************************/
diff --git a/ksw.h b/ksw.h
index 97559fd..5d45a67 100644
--- a/ksw.h
+++ b/ksw.h
@@ -61,6 +61,7 @@ extern "C" {
 	 * query profile will be deallocated in ksw_align().
 	 */
 	kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
+	kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry);
 
 	/**
 	 * Banded global alignment
@@ -80,6 +81,7 @@ extern "C" {
 	 * @return        score of the alignment
 	 */
 	int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar, uint32_t **cigar);
+	int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar, uint32_t **cigar);
 
 	/**
 	 * Extend alignment
@@ -103,6 +105,7 @@ extern "C" {
 	 * @return        best semi-local alignment score
 	 */
 	int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
+	int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
 
 #ifdef __cplusplus
 }
diff --git a/main.c b/main.c
index a8df9c0..0ae7978 100644
--- a/main.c
+++ b/main.c
@@ -4,7 +4,7 @@
 #include "utils.h"
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.7.7-r441"
+#define PACKAGE_VERSION "0.7.8-r455"
 #endif
 
 int bwa_fa2pac(int argc, char *argv[]);

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/bwa.git