[med-svn] [Git][med-team/minimap2][upstream] New upstream version 2.27+dfsg

Michael R. Crusoe (@crusoe) gitlab at salsa.debian.org
Mon Mar 18 16:39:41 GMT 2024



Michael R. Crusoe pushed to branch upstream at Debian Med / minimap2


Commits:
81bbb78f by Michael R. Crusoe at 2024-03-18T17:11:54+01:00
New upstream version 2.27+dfsg
- - - - -


14 changed files:

- NEWS.md
- README.md
- align.c
- cookbook.md
- format.c
- index.c
- main.c
- minimap.h
- minimap2.1
- misc/paftools.js
- options.c
- python/README.rst
- python/mappy.pyx
- setup.py


Changes:

=====================================
NEWS.md
=====================================
@@ -1,9 +1,47 @@
+Release 2.27-r1193 (12 March 2024)
+----------------------------------
+
+Notable changes to minimap2:
+
+ * New feature: added the `lr:hq` preset for accurate long reads at ~1% error
+   rate. This was suggested by Oxford Nanopore developers (#1127). It is not
+   clear if this preset also works well for PacBio HiFi reads.
+
+ * New feature: added the `map-iclr` preset for Illumina Complete Long Reads
+   (#1069), provided by Illumina developers.
+
+ * New feature: added option `-b` to specify mismatch penalty for base
+   transitions (i.e. A-to-G or C-to-T changes).
+
+ * New feature: added option `--ds` to generate a new `ds:Z` tag that
+   indicates uncertainty in INDEL positions. It is an extension to `cs`. The
+   `mgutils-es6.js` script in minigraph parses `ds`.
+
+ * Bugfix: avoided a NULL pointer dereference (#1154). This would not have an
+   effect on most systems but would still be good to fix.
+
+ * Bugfix: reverted the value of `ms:i` to pre-2.22 versions (#1146). This was
+   an oversight. See fcd4df2 for details.
+
+Notable changes to paftools.js and mappy:
+
+ * New feature: expose `bw_long` to mappy's Aligner class (#1124).
+
+ * Bugfix: fixed several compatibility issues with k8 v1.0 (#1161 and #1166).
+   Subcommands "call", "pbsim2fq" and "mason2fq" were not working with v1.0.
+
+Minimap2 should output identical alignments to v2.26, except the ms tag.
+
+(2.27: 12 March 2024, r1193)
+
+
+
 Release 2.26-r1175 (29 April 2023)
 ----------------------------------
 
 Fixed the broken Python package. This is the only change.
 
-(2.25: 25 April 2023, r1173)
+(2.26: 25 April 2023, r1173)
 
 
 


=====================================
README.md
=====================================
@@ -15,7 +15,7 @@ cd minimap2 && make
 ./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam       # PacBio CLR genomic reads
 ./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam         # Oxford Nanopore genomic reads
 ./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or later)
-./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam    # PacBio HiFi/CCS genomic reads (v2.18 or earlier)
+./minimap2 -ax lr:hq ref.fa ont-Q20.fq.gz > aln.sam       # Nanopore Q20 genomic reads (v2.27 or later)
 ./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam      # short genomic paired-end reads
 ./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam       # spliced long reads (strand unknown)
 ./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam  # noisy Nanopore Direct RNA-seq
@@ -74,8 +74,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the
 Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from
 the [release page][release] with:
 ```sh
-curl -L https://github.com/lh3/minimap2/releases/download/v2.26/minimap2-2.26_x64-linux.tar.bz2 | tar -jxvf -
-./minimap2-2.26_x64-linux/minimap2
+curl -L https://github.com/lh3/minimap2/releases/download/v2.27/minimap2-2.27_x64-linux.tar.bz2 | tar -jxvf -
+./minimap2-2.27_x64-linux/minimap2
 ```
 If you want to compile from the source, you need to have a C compiler, GNU make
 and zlib development files installed. Then type `make` in the source code
@@ -139,12 +139,15 @@ parameters at the same time. The default setting is the same as `map-ont`.
 ```sh
 minimap2 -ax map-pb  ref.fa pacbio-reads.fq > aln.sam   # for PacBio CLR reads
 minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam      # for Oxford Nanopore reads
+minimap2 -ax map-iclr ref.fa iclr-reads.fq > aln.sam    # for Illumina Complete Long Reads
 ```
 The difference between `map-pb` and `map-ont` is that `map-pb` uses
 homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary
-minimizers as seeds. Emperical evaluation suggests HPC minimizers improve
+minimizers as seeds. Empirical evaluation suggests HPC minimizers improve
 performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning
-Nanopore reads.
+Nanopore reads. `map-iclr` uses an adjusted alignment scoring matrix that
+accounts for the low overall error rate in the reads, with transversion errors
+being less frequent than transitions.
 
 #### <a name="map-long-splice"></a>Map long mRNA/cDNA reads
 


=====================================
align.c
=====================================
@@ -21,6 +21,18 @@ static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc
 		mat[(m - 1) * m + j] = sc_ambi;
 }
 
+static void ksw_gen_ts_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t transition, int8_t sc_ambi)
+{
+	assert(m == 5);
+	ksw_gen_simple_mat(m, mat, a, b, sc_ambi);
+	if (transition == 0 || transition == b) return;
+	transition = transition > 0? -transition : transition;
+	mat[0 * m + 2] = transition;  // A->G
+	mat[1 * m + 3] = transition;  // C->T
+	mat[2 * m + 0] = transition;  // G->A
+	mat[3 * m + 1] = transition;  // T->C
+}
+
 static inline void mm_seq_rev(uint32_t len, uint8_t *seq)
 {
 	uint32_t i;
@@ -283,7 +295,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts
 			toff += len;
 		}
 	}
-	p->dp_max = (int32_t)(max + .499);
+	p->dp_max = p->dp_max0 = (int32_t)(max + .499);
 	assert(qoff == r->qe - r->qs && toff == r->re - r->rs);
 	if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned
 }
@@ -323,6 +335,8 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
 		for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr);
 		fputc('\n', stderr);
 	}
+	if (opt->transition != 0 && opt->b != opt->transition)
+		flag |= KSW_EZ_GENERIC_SC;
 	if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
 		ksw_reset_extz(ez);
 		ez->zdropped = 1;
@@ -586,7 +600,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
 
 	r2->cnt = 0;
 	if (r->cnt == 0) return;
-	ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
+	ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
 	bw = (int)(opt->bw * 1.5 + 1.);
 	bw_long = (int)(opt->bw_long * 1.5 + 1.);
 	if (bw_long < bw) bw_long = bw;
@@ -844,7 +858,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
 	if (ql < opt->min_chain_score || ql > opt->max_gap) return 0;
 	if (tl < opt->min_chain_score || tl > opt->max_gap) return 0;
 
-	ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
+	ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
 	tseq = (uint8_t*)kmalloc(km, tl);
 	mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq);
 	qseq = r1->rev? &qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs];


=====================================
cookbook.md
=====================================
@@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools,
 please follow the command lines below:
 ```sh
 # install minimap2 executables
-curl -L https://github.com/lh3/minimap2/releases/download/v2.26/minimap2-2.26_x64-linux.tar.bz2 | tar jxf -
-cp minimap2-2.26_x64-linux/{minimap2,k8,paftools.js} .  # copy executables
+curl -L https://github.com/lh3/minimap2/releases/download/v2.27/minimap2-2.27_x64-linux.tar.bz2 | tar jxf -
+cp minimap2-2.27_x64-linux/{minimap2,k8,paftools.js} .  # copy executables
 export PATH="$PATH:"`pwd`                               # put the current directory on PATH
 # download example datasets
 curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf -


=====================================
format.c
=====================================
@@ -139,10 +139,48 @@ int mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int a
 	return ret;
 }
 
-static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int write_tag)
+static void write_indel_ds(kstring_t *str, int64_t len, const uint8_t *seq, int64_t ll, int64_t lr) // write an indel to ds; adapted from minigraph
 {
-	int i, q_off, t_off;
-	if (write_tag) mm_sprintf_lite(s, "\tcs:Z:");
+	int64_t i;
+	if (ll + lr >= len) {
+		mm_sprintf_lite(str, "[");
+		for (i = 0; i < len; ++i)
+			mm_sprintf_lite(str, "%c", "acgtn"[seq[i]]);
+		mm_sprintf_lite(str, "]");
+	} else {
+		int64_t k = 0;
+		if (ll > 0) {
+			mm_sprintf_lite(str, "[");
+			for (i = 0; i < ll; ++i)
+				mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
+			mm_sprintf_lite(str, "]");
+			k += ll;
+		}
+		for (i = 0; i < len - lr - ll; ++i)
+			mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
+		k += len - lr - ll;
+		if (lr > 0) {
+			mm_sprintf_lite(str, "[");
+			for (i = 0; i < lr; ++i)
+				mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]);
+			mm_sprintf_lite(str, "]");
+		}
+	}
+}
+
+static void write_cs_ds_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int is_ds, int write_tag)
+{
+	int i, q_off, t_off, q_len = 0, t_len = 0;
+	if (write_tag) mm_sprintf_lite(s, "\t%cs:Z:", is_ds? 'd' : 'c');
+	for (i = 0; i < (int)r->p->n_cigar; ++i) {
+		int op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
+		if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH)
+			q_len += len, t_len += len;
+		else if (op == MM_CIGAR_INS)
+			q_len += len;
+		else if (op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP)
+			t_len += len;
+	}
 	for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) {
 		int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
 		assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH);
@@ -168,14 +206,42 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
 			}
 			q_off += len, t_off += len;
 		} else if (op == MM_CIGAR_INS) {
-			for (j = 0, tmp[len] = 0; j < len; ++j)
-				tmp[j] = "acgtn"[qseq[q_off + j]];
-			mm_sprintf_lite(s, "+%s", tmp);
+			if (is_ds) {
+				int z, ll, lr, y = q_off;
+				for (z = 1; z <= len; ++z)
+					if (y - z < 0 || qseq[y + len - z] != qseq[y - z])
+						break;
+				lr = z - 1;
+				for (z = 0; z < len; ++z)
+					if (y + len + z >= q_len || qseq[y + len + z] != qseq[y + z])
+						break;
+				ll = z;
+				mm_sprintf_lite(s, "+");
+				write_indel_ds(s, len, &qseq[y], ll, lr);
+			} else {
+				for (j = 0, tmp[len] = 0; j < len; ++j)
+					tmp[j] = "acgtn"[qseq[q_off + j]];
+				mm_sprintf_lite(s, "+%s", tmp);
+			}
 			q_off += len;
 		} else if (op == MM_CIGAR_DEL) {
-			for (j = 0, tmp[len] = 0; j < len; ++j)
-				tmp[j] = "acgtn"[tseq[t_off + j]];
-			mm_sprintf_lite(s, "-%s", tmp);
+			if (is_ds) {
+				int z, ll, lr, x = t_off;
+				for (z = 1; z <= len; ++z)
+					if (x - z < 0 || tseq[x + len - z] != tseq[x - z])
+						break;
+				lr = z - 1;
+				for (z = 0; z < len; ++z)
+					if (x + len + z >= t_len || tseq[x + z] != tseq[x + len + z])
+						break;
+				ll = z;
+				mm_sprintf_lite(s, "-");
+				write_indel_ds(s, len, &tseq[x], ll, lr);
+			} else {
+				for (j = 0, tmp[len] = 0; j < len; ++j)
+					tmp[j] = "acgtn"[tseq[t_off + j]];
+				mm_sprintf_lite(s, "-%s", tmp);
+			}
 			t_off += len;
 		} else { // intron
 			assert(len >= 2);
@@ -218,7 +284,7 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
 	assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
 }
 
-static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int write_tag, int is_qstrand)
+static void write_cs_ds_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int is_ds, int write_tag, int is_qstrand)
 {
 	extern unsigned char seq_nt4_table[256];
 	int i;
@@ -244,8 +310,8 @@ static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_
 			}
 		}
 	}
-	if (is_MD) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
-	else write_cs_core(s, tseq, qseq, r, tmp, no_iden, write_tag);
+	if (is_MD == 1) write_MD_core(s, tseq, qseq, r, tmp, write_tag);
+	else write_cs_ds_core(s, tseq, qseq, r, tmp, no_iden, is_ds, write_tag);
 	kfree(km, qseq); kfree(km, tseq); kfree(km, tmp);
 }
 
@@ -256,7 +322,7 @@ int mm_gen_cs_or_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, cons
 	str.s = *buf, str.l = 0, str.m = *max_len;
 	t.l_seq = strlen(seq);
 	t.seq = (char*)seq;
-	write_cs_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0, is_qstrand);
+	write_cs_ds_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0, 0, is_qstrand);
 	*max_len = str.m;
 	*buf = str.s;
 	return str.l;
@@ -278,7 +344,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
 	if (r->id == r->parent) type = r->inv? 'I' : 'P';
 	else type = r->inv? 'i' : 'S';
 	if (r->p) {
-		mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->dp_max, r->p->dp_score, r->p->n_ambi);
+		mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->dp_max0, r->p->dp_score, r->p->n_ambi);
 		if (r->p->trans_strand == 1 || r->p->trans_strand == 2)
 			mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]);
 	}
@@ -326,8 +392,8 @@ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const
 		for (k = 0; k < r->p->n_cigar; ++k)
 			mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]);
 	}
-	if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
-		write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1, !!(opt_flag&MM_F_QSTRAND));
+	if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD)))
+		write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, !!(opt_flag&MM_F_OUT_DS), 1, !!(opt_flag&MM_F_QSTRAND));
 	if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
 		mm_sprintf_lite(s, "\t%s", t->comment);
 }
@@ -535,8 +601,8 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
 				}
 			}
 		}
-		if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
-			write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1, 0);
+		if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD)))
+			write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, !!(opt_flag&MM_F_OUT_DS), 1, 0);
 		if (cigar_in_tag)
 			write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag);
 	}


=====================================
index.c
=====================================
@@ -192,6 +192,7 @@ int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
 	if (f <= 0.) return INT32_MAX;
 	for (i = 0; i < 1<<mi->b; ++i)
 		if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
+	if (n == 0) return INT32_MAX;
 	a = (uint32_t*)malloc(n * 4);
 	for (i = n = 0; i < 1<<mi->b; ++i) {
 		idxhash_t *h = (idxhash_t*)mi->B[i].h;


=====================================
main.c
=====================================
@@ -77,6 +77,7 @@ static ko_longopt_t long_options[] = {
 	{ "print-chains",   ko_no_argument,       352 },
 	{ "no-hash-name",   ko_no_argument,       353 },
 	{ "secondary-seq",  ko_no_argument,       354 },
+	{ "ds",             ko_no_argument,       355 },
 	{ "help",           ko_no_argument,       'h' },
 	{ "max-intron-len", ko_required_argument, 'G' },
 	{ "version",        ko_no_argument,       'V' },
@@ -120,7 +121,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const
 
 int main(int argc, char *argv[])
 {
-	const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
+	const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:b:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
 	ketopt_t o = KETOPT_INIT;
 	mm_mapopt_t opt;
 	mm_idxopt_t ipt;
@@ -178,6 +179,7 @@ int main(int argc, char *argv[])
 		else if (c == 'm') opt.min_chain_score = atoi(o.arg);
 		else if (c == 'A') opt.a = atoi(o.arg);
 		else if (c == 'B') opt.b = atoi(o.arg);
+		else if (c == 'b') opt.transition = atoi(o.arg);
 		else if (c == 's') opt.min_dp_max = atoi(o.arg);
 		else if (c == 'C') opt.noncan = atoi(o.arg);
 		else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
@@ -242,6 +244,7 @@ int main(int argc, char *argv[])
 		else if (c == 352) mm_dbg_flag |= MM_DBG_PRINT_CHAIN; // --print-chains
 		else if (c == 353) opt.flag |= MM_F_NO_HASH_NAME; // --no-hash-name
 		else if (c == 354) opt.flag |= MM_F_SECONDARY_SEQ; // --secondary-seq
+		else if (c == 355) opt.flag |= MM_F_OUT_DS; // --ds
 		else if (c == 330) {
 			fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
 		} else if (c == 314) { // --frag
@@ -358,6 +361,7 @@ int main(int argc, char *argv[])
 		fprintf(fp_help, "    -R STR       SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n");
 		fprintf(fp_help, "    -c           output CIGAR in PAF\n");
 		fprintf(fp_help, "    --cs[=STR]   output the cs tag; STR is 'short' (if absent) or 'long' [none]\n");
+		fprintf(fp_help, "    --ds         output the ds tag, which is an extension to cs\n");
 		fprintf(fp_help, "    --MD         output the MD tag\n");
 		fprintf(fp_help, "    --eqx        write =/X CIGAR operators\n");
 		fprintf(fp_help, "    -Y           use soft clipping for supplementary alignments\n");
@@ -367,12 +371,12 @@ int main(int argc, char *argv[])
 		fprintf(fp_help, "    --version    show version number\n");
 		fprintf(fp_help, "  Preset:\n");
 		fprintf(fp_help, "    -x STR       preset (always applied before other options; see minimap2.1 for details) []\n");
-		fprintf(fp_help, "                 - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n");
-		fprintf(fp_help, "                 - map-hifi - PacBio HiFi reads vs reference mapping\n");
-		fprintf(fp_help, "                 - ava-pb/ava-ont - PacBio/Nanopore read overlap\n");
+		fprintf(fp_help, "                 - lr:hq - accurate long reads (error rate <1%%) against a reference genome\n");
+		fprintf(fp_help, "                 - splice/splice:hq - spliced alignment for long reads/accurate long reads\n");
 		fprintf(fp_help, "                 - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");
-		fprintf(fp_help, "                 - splice/splice:hq - long-read/Pacbio-CCS spliced alignment\n");
-		fprintf(fp_help, "                 - sr - genomic short-read mapping\n");
+		fprintf(fp_help, "                 - sr - short reads against a reference\n");
+		fprintf(fp_help, "                 - map-pb/map-hifi/map-ont/map-iclr - CLR/HiFi/Nanopore/ICLR vs reference mapping\n");
+		fprintf(fp_help, "                 - ava-pb/ava-ont - PacBio CLR/Nanopore read overlap\n");
 		fprintf(fp_help, "\nSee `man ./minimap2.1' for detailed description of these and other advanced command-line options.\n");
 		return fp_help == stdout? 0 : 1;
 	}


=====================================
minimap.h
=====================================
@@ -5,7 +5,7 @@
 #include <stdio.h>
 #include <sys/types.h>
 
-#define MM_VERSION "2.26-r1175"
+#define MM_VERSION "2.27-r1193"
 
 #define MM_F_NO_DIAG       (0x001LL) // no exact diagonal hit
 #define MM_F_NO_DUAL       (0x002LL) // skip pairs where query name is lexicographically larger than target name
@@ -44,6 +44,7 @@
 #define MM_F_NO_HASH_NAME  (0x400000000LL)
 #define MM_F_SPLICE_OLD    (0x800000000LL)
 #define MM_F_SECONDARY_SEQ (0x1000000000LL)	//output SEQ field for seqondary alignments using hard clipping
+#define MM_F_OUT_DS        (0x2000000000LL)
 
 #define MM_I_HPC          0x1
 #define MM_I_NO_SEQ       0x2
@@ -97,6 +98,7 @@ typedef struct {
 typedef struct {
 	uint32_t capacity;                  // the capacity of cigar[]
 	int32_t dp_score, dp_max, dp_max2;  // DP score; score of the max-scoring segment; score of the best alternate mappings
+	int32_t dp_max0;                    // DP score before mm_update_dp_max() adjustment
 	uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
 	uint32_t n_cigar;                   // number of cigar operations in cigar[]
 	uint32_t cigar[];
@@ -153,6 +155,7 @@ typedef struct {
 	float alt_drop;
 
 	int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
+	int transition; // transition mismatch score (A:G, C:T)
 	int sc_ambi; // score when one or both bases are "N"
 	int noncan;      // cost of non-canonical splicing sites
 	int junc_bonus;


=====================================
minimap2.1
=====================================
@@ -1,4 +1,4 @@
-.TH minimap2 1 "29 April 2023" "minimap2-2.26 (r1175)" "Bioinformatics tools"
+.TH minimap2 1 "12 March 2024" "minimap2-2.27 (r1193)" "Bioinformatics tools"
 .SH NAME
 .PP
 minimap2 - mapping and alignment between collections of DNA sequences
@@ -343,6 +343,10 @@ Matching score [2]
 .BI -B \ INT
 Mismatching penalty [4]
 .TP
+.BI -b \ INT
+Mismatching penalty for transitions [same as
+.BR -B ].
+.TP
 .BI -O \ INT1[,INT2]
 Gap open penalty [4,24]. If
 .I INT2
@@ -356,10 +360,19 @@ costs
 .RI min{ O1 + k * E1 , O2 + k * E2 }.
 In the splice mode, the second gap penalties are not used.
 .TP
+.BI -J \ INT
+Splice model [1]. 0 for the original minimap2 splice model that always penalizes non-GT-AG splicing;
+1 for the miniprot model that considers non-GT-AG. Option
+.B -C
+has no effect with the default
+.BR -J1 .
+.BR -J0 .
+.TP
 .BI -C \ INT
 Cost for a non-canonical GT-AG splicing (effective with
-.BR --splice )
-[0]
+.B --splice
+.BR -J0 )
+[0].
 .TP
 .BI -z \ INT1[,INT2]
 Truncate an alignment if the running alignment score drops too quickly along
@@ -506,6 +519,9 @@ Output =/X CIGAR operators for sequence match/mismatch.
 .B -Y
 In SAM output, use soft clipping for supplementary alignments.
 .TP
+.B --secondary-seq
+In SAM output, show query sequences for secondary alignments.
+.TP
 .BI --seed \ INT
 Integer seed for randomizing equally best hits. Minimap2 hashes
 .I INT
@@ -566,15 +582,43 @@ are:
 Align noisy long reads of ~10% error rate to a reference genome. This is the
 default mode.
 .TP
+.B lr:hq
+Align accurate long reads (error rate <1%) to a reference genome
+.RB ( -k19
+.B -w19 -U50,500
+.BR -g10k ).
+This was recommended by ONT developers for recent Nanopore reads
+produced with chemistry v14 that can reach ~99% in accuracy.
+It was shown to work better for accurate Nanopore reads
+than
+.BR map-hifi .
+.TP
 .B map-hifi
 Align PacBio high-fidelity (HiFi) reads to a reference genome
-.RB ( -k19
-.B -w19 -U50,500 -g10k -A1 -B4 -O6,26 -E2,1
+.RB ( -xlr:hq
+.B -A1 -B4 -O6,26 -E2,1
 .BR -s200 ).
+It differs from
+.B lr:hq
+only in scoring. It has not been tested whether
+.B lr:hq
+would work better for PacBio HiFi reads.
 .TP
 .B map-pb
 Align older PacBio continuous long (CLR) reads to a reference genome
 .RB ( -Hk19 ).
+Note that this data type is effectively deprecated by HiFi.
+Unless you work on very old data, you probably want to use
+.B map-hifi
+or
+.BR lr:hq .
+.TP
+.B map-iclr
+Align Illumina Complete Long Reads (ICLR) to a reference genome
+.RB ( -k19
+.B -B6 -b4
+.BR -O10,50 ).
+This was recommended by Illumina developers.
 .TP
 .B asm5
 Long assembly to reference mapping
@@ -582,21 +626,21 @@ Long assembly to reference mapping
 .B -w19 -U50,500 --rmq -r1k,100k -g10k -A1 -B19 -O39,81 -E3,1 -s200 -z200
 .BR -N50 ).
 Typically, the alignment will not extend to regions with 5% or higher sequence
-divergence. Only use this preset if the average divergence is far below 5%.
+divergence. Use this preset if the average divergence is not much higher than 0.1%.
 .TP
 .B asm10
 Long assembly to reference mapping
 .RB ( -k19
 .B -w19 -U50,500 --rmq -r1k,100k -g10k -A1 -B9 -O16,41 -E2,1 -s200 -z200
 .BR -N50 ).
-Up to 10% sequence divergence.
+Use this if the average divergence is around 1%.
 .TP
 .B asm20
 Long assembly to reference mapping
 .RB ( -k19
 .B -w10 -U50,500 --rmq -r1k,100k -g10k -A1 -B4 -O6,26 -E2,1 -s200 -z200
 .BR -N50 ).
-Up to 20% sequence divergence.
+Use this if the average divergence is around several percent.
 .TP
 .B splice
 Long-read spliced alignment
@@ -612,13 +656,13 @@ costs are different during chaining; 4) the computation of the
 tag ignores introns to demote hits to pseudogenes.
 .TP
 .B splice:hq
-Long-read splice alignment for PacBio CCS reads
+Spliced alignment for accurate long RNA-seq reads such as PacBio iso-seq
 .RB ( -xsplice
 .B -C5 -O6,24
 .BR -B4 ).
 .TP
 .B sr
-Short single-end reads without splicing
+Short-read alignment without splicing
 .RB ( -k21
 .B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m25
 .B -s40 -g100 -2K50m --heap-sort=yes


=====================================
misc/paftools.js
=====================================
@@ -1,6 +1,6 @@
 #!/usr/bin/env k8
 
-var paftools_version = '2.26-r1175';
+var paftools_version = '2.27-r1193';
 
 /*****************************
  ***** Library functions *****
@@ -133,26 +133,50 @@ Interval.find_ovlp = function(a, st, en)
 
 function fasta_read(fn)
 {
-	var h = {}, gt = '>'.charCodeAt(0);
+	var h = {}, seqlen = [];
+	var buf = new Bytes();
 	var file = fn == '-'? new File() : new File(fn);
-	var buf = new Bytes(), seq = null, name = null, seqlen = [];
-	while (file.readline(buf) >= 0) {
-		if (buf[0] == gt) {
-			if (seq != null && name != null) {
-				seqlen.push([name, seq.length]);
-				h[name] = seq;
-				name = seq = null;
-			}
-			var m, line = buf.toString();
-			if ((m = /^>(\S+)/.exec(line)) != null) {
-				name = m[1];
-				seq = new Bytes();
-			}
-		} else seq.set(buf);
-	}
-	if (seq != null && name != null) {
-		seqlen.push([name, seq.length]);
-		h[name] = seq;
+	if (typeof k8_version == "undefined") { // for k8-0.x
+		var seq = null, name = null, gt = '>'.charCodeAt(0);
+		while (file.readline(buf) >= 0) {
+			if (buf[0] == gt) {
+				if (seq != null && name != null) {
+					seqlen.push([name, seq.length]);
+					h[name] = seq;
+					name = seq = null;
+				}
+				var m, line = buf.toString();
+				if ((m = /^>(\S+)/.exec(line)) != null) {
+					name = m[1];
+					seq = new Bytes();
+				}
+			} else seq.set(buf);
+		}
+		if (seq != null && name != null) {
+			seqlen.push([name, seq.length]);
+			h[name] = seq;
+		}
+	} else { // for k8-1.x
+		var seq = null, name = null;
+		while (file.readline(buf) >= 0) {
+			var line = buf.toString();
+			if (line[0] == ">") {
+				if (seq != null && name != null) {
+					seqlen.push([name, seq.length]);
+					h[name] = new Uint8Array(seq.buffer);
+					name = seq = null;
+				}
+				var m;
+				if ((m = /^>(\S+)/.exec(line)) != null) {
+					name = m[1];
+					seq = new Bytes();
+				}
+			} else seq.set(line);
+		}
+		if (seq != null && name != null) {
+			seqlen.push([name, seq.length]);
+			h[name] = new Uint8Array(seq.buffer);
+		}
 	}
 	buf.destroy();
 	file.close();
@@ -161,16 +185,27 @@ function fasta_read(fn)
 
 function fasta_free(fa)
 {
-	for (var name in fa)
-		fa[name].destroy();
+	if (typeof k8_version == "undefined")
+		for (var name in fa)
+			fa[name].destroy();
+	// FIXME: for k8-1.0, sequences are not freed. This is ok for now but not general.
 }
 
 Bytes.prototype.reverse = function()
 {
-	for (var i = 0; i < this.length>>1; ++i) {
-		var tmp = this[i];
-		this[i] = this[this.length - i - 1];
-		this[this.length - i - 1] = tmp;
+	if (typeof k8_version === "undefined") { // k8-0.x
+		for (var i = 0; i < this.length>>1; ++i) {
+			var tmp = this[i];
+			this[i] = this[this.length - i - 1];
+			this[this.length - i - 1] = tmp;
+		}
+	} else { // k8-1.x
+		var buf = new Uint8Array(this.buffer);
+		for (var i = 0; i < buf.length>>1; ++i) {
+			var tmp = buf[i];
+			buf[i] = buf[buf.length - i - 1];
+			buf[buf.length - i - 1] = tmp;
+		}
 	}
 }
 
@@ -185,13 +220,24 @@ Bytes.prototype.revcomp = function()
 		for (var i = 0; i < s1.length; ++i)
 			Bytes.rctab[s1.charCodeAt(i)] = s2.charCodeAt(i);
 	}
-	for (var i = 0; i < this.length>>1; ++i) {
-		var tmp = this[this.length - i - 1];
-		this[this.length - i - 1] = Bytes.rctab[this[i]];
-		this[i] = Bytes.rctab[tmp];
+	if (typeof k8_version === "undefined") { // k8-0.x
+		for (var i = 0; i < this.length>>1; ++i) {
+			var tmp = this[this.length - i - 1];
+			this[this.length - i - 1] = Bytes.rctab[this[i]];
+			this[i] = Bytes.rctab[tmp];
+		}
+		if (this.length&1)
+			this[this.length>>1] = Bytes.rctab[this[this.length>>1]];
+	} else { // k8-1.x
+		var buf = new Uint8Array(this.buffer);
+		for (var i = 0; i < buf.length>>1; ++i) {
+			var tmp = buf[buf.length - i - 1];
+			buf[buf.length - i - 1] = Bytes.rctab[buf[i]];
+			buf[i] = Bytes.rctab[tmp];
+		}
+		if (buf.length&1)
+			buf[buf.length>>1] = Bytes.rctab[buf[buf.length>>1]];
 	}
-	if (this.length&1)
-		this[this.length>>1] = Bytes.rctab[this[this.length>>1]];
 }
 
 /********************
@@ -2051,7 +2097,7 @@ function paf_mapeval(args)
 		warn("Usage: paftools.js mapeval [options] <in.paf>|<in.sam>");
 		warn("Options:");
 		warn("  -r FLOAT   mapping correct if overlap_length/union_length>FLOAT [" + ovlp_ratio + "]");
-		warn("  -Q INT     print wrong mappings with mapQ>INT [don't print]");
+		warn("  -Q INT     print wrong mappings with mapQ>=INT [don't print]");
 		warn("  -m INT     0: eval the longest aln only; 1: first aln only; 2: all primary aln [0]");
 		exit(1);
 	}


=====================================
options.c
=====================================
@@ -45,6 +45,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
 	opt->alt_drop = 0.15f;
 
 	opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
+	opt->transition = 0;
 	opt->sc_ambi = 1;
 	opt->zdrop = 400, opt->zdrop_inv = 200;
 	opt->end_bonus = -1;
@@ -90,7 +91,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
 	if (preset == 0) {
 		mm_idxopt_init(io);
 		mm_mapopt_init(mo);
-	} else if (strcmp(preset, "map-ont") == 0) { // this is the same as the default
+	} else if (strcmp(preset, "lr") == 0 || strcmp(preset, "map-ont") == 0) { // this is the same as the default
 	} else if (strcmp(preset, "ava-ont") == 0) {
 		io->flag = 0, io->k = 15, io->w = 5;
 		mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN;
@@ -105,13 +106,22 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
 		mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25;
 		mo->bw_long = mo->bw;
 		mo->occ_dist = 0;
-	} else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
+	} else if (strcmp(preset, "lr:hq") == 0 || strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
 		io->flag = 0, io->k = 19, io->w = 19;
 		mo->max_gap = 10000;
-		mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1;
-		mo->occ_dist = 500;
 		mo->min_mid_occ = 50, mo->max_mid_occ = 500;
-		mo->min_dp_max = 200;
+		if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) {
+			mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1;
+			mo->min_dp_max = 200;
+		}
+	} else if (strcmp(preset, "map-iclr-prerender") == 0) {
+		io->flag = 0, io->k = 15;
+		mo->b = 6, mo->transition = 1;
+		mo->q = 10, mo->q2 = 50;
+	} else if (strcmp(preset, "map-iclr") == 0) {
+		io->flag = 0, io->k = 19;
+		mo->b = 6, mo->transition = 4;
+		mo->q = 10, mo->q2 = 50;
 	} else if (strncmp(preset, "asm", 3) == 0) {
 		io->flag = 0, io->k = 19, io->w = 19;
 		mo->bw = 1000, mo->bw_long = 100000;
@@ -156,7 +166,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
 		mo->junc_bonus = 9;
 		mo->zdrop = 200, mo->zdrop_inv = 100; // because mo->a is halved
 		if (strcmp(preset, "splice:hq") == 0)
-			mo->junc_bonus = 5, mo->b = 4, mo->q = 6, mo->q2 = 24;
+			mo->noncan = 5, mo->b = 4, mo->q = 6, mo->q2 = 24;
 	} else return -1;
 	return 0;
 }


=====================================
python/README.rst
=====================================
@@ -77,7 +77,9 @@ This constructor accepts the following arguments:
 
 * **min_chain_score**: minimum chaing score
 
-* **bw**: chaining and alignment band width
+* **bw**: chaining and alignment band width (initial chaining and extension)
+
+* **bw_long**: chaining and alignment band width (RMQ-based rechaining and closing gaps)
 
 * **best_n**: max number of alignments to return
 


=====================================
python/mappy.pyx
=====================================
@@ -3,7 +3,7 @@ from libc.stdlib cimport free
 cimport cmappy
 import sys
 
-__version__ = '2.26'
+__version__ = '2.27'
 
 cmappy.mm_reset_timer()
 
@@ -112,7 +112,7 @@ cdef class Aligner:
 	cdef cmappy.mm_idxopt_t idx_opt
 	cdef cmappy.mm_mapopt_t map_opt
 
-	def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None):
+	def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, bw_long=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None):
 		self._idx = NULL
 		cmappy.mm_set_opt(NULL, &self.idx_opt, &self.map_opt) # set the default options
 		if preset is not None:
@@ -125,6 +125,7 @@ cdef class Aligner:
 		if min_chain_score is not None: self.map_opt.min_chain_score = min_chain_score
 		if min_dp_score is not None: self.map_opt.min_dp_max = min_dp_score
 		if bw is not None: self.map_opt.bw = bw
+		if bw_long is not None: self.map_opt.bw_long = bw_long
 		if best_n is not None: self.map_opt.best_n = best_n
 		if max_frag_len is not None: self.map_opt.max_frag_len = max_frag_len
 		if extra_flags is not None: self.map_opt.flag |= extra_flags


=====================================
setup.py
=====================================
@@ -23,7 +23,7 @@ def readme():
 
 setup(
 	name = 'mappy',
-	version = '2.26',
+	version = '2.27',
 	url = 'https://github.com/lh3/minimap2',
 	description = 'Minimap2 python binding',
 	long_description = readme(),



View it on GitLab: https://salsa.debian.org/med-team/minimap2/-/commit/81bbb78fa551456823287d362c4fb29d29de1fc3

-- 
View it on GitLab: https://salsa.debian.org/med-team/minimap2/-/commit/81bbb78fa551456823287d362c4fb29d29de1fc3
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20240318/bb43e6d4/attachment-0001.htm>


More information about the debian-med-commit mailing list