[med-svn] [Git][med-team/miniasm][master] 4 commits: New upstream version 0.3+dfsg

Andreas Tille gitlab at salsa.debian.org
Tue Jul 24 05:59:29 BST 2018


Andreas Tille pushed to branch master at Debian Med / miniasm


Commits:
89afb8f0 by Andreas Tille at 2018-07-24T06:51:39+02:00
New upstream version 0.3+dfsg
- - - - -
d3005085 by Andreas Tille at 2018-07-24T06:51:39+02:00
Update upstream source from tag 'upstream/0.3+dfsg'

Update to upstream version '0.3+dfsg'
with Debian dir ab22263aaae78edd3503ea4153ecddcad210d45e
- - - - -
70eba5a5 by Andreas Tille at 2018-07-24T06:55:51+02:00
New upstream version

- - - - -
d7b6c4f3 by Andreas Tille at 2018-07-24T06:57:48+02:00
Upload to unstable

- - - - -


19 changed files:

- Makefile
- PAF.md
- README.md
- asm.c
- common.c
- debian/changelog
- debian/patches/hardening
- debian/patches/spelling
- dotter.c
- hit.c
- main.c
- miniasm.1
- miniasm.h
- misc/da2paf.pl
- misc/mhap2paf.pl
- + misc/ov-sen.js
- + misc/paftop.js
- misc/sam2paf.js
- + misc/wt2paf.pl


Changes:

=====================================
Makefile
=====================================
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 CC=			gcc
-CFLAGS=		-g -Wall -O2 -Wc++-compat -Wno-unused-function
+CFLAGS=		-g -Wall -O2 -Wc++-compat
 CPPFLAGS=
 INCLUDES=	-I.
 OBJS=		sys.o sdict.o paf.o asg.o common.o hit.o asm.o


=====================================
PAF.md
=====================================
--- a/PAF.md
+++ b/PAF.md
@@ -22,7 +22,7 @@ following predefined fields:
 If PAF is generated from an alignment, column 10 equals the number of sequence
 matches, and column 11 equals the total number of sequence matches, mismatches,
 insertions and deletions in the alignment. If alignment is not available,
-column 10 and 11 are still required but can be approximate.
+column 10 and 11 are still required but may be highly inaccurate.
 
 A PAF file may optionally contain SAM-like typed key-value pairs at the end of
 each line.


=====================================
README.md
=====================================
--- a/README.md
+++ b/README.md
@@ -1,5 +1,3 @@
-*Warning: since r104, miniasm only works with minimap-r122 or later*
-
 ## Getting Started
 
 ```sh
@@ -7,10 +5,10 @@
 wget -O- http://www.cbcb.umd.edu/software/PBcR/data/selfSampleData.tar.gz | tar zxf -
 ln -s selfSampleData/pacbio_filtered.fastq reads.fq
 # Install minimap and miniasm (requiring gcc and zlib)
-git clone https://github.com/lh3/minimap && (cd minimap && make)
-git clone https://github.com/lh3/miniasm && (cd miniasm && make)
-# Overlap
-minimap/minimap -Sw5 -L100 -m0 -t8 reads.fq reads.fq | gzip -1 > reads.paf.gz
+git clone https://github.com/lh3/minimap2 && (cd minimap2 && make)
+git clone https://github.com/lh3/miniasm  && (cd miniasm  && make)
+# Overlap for PacBio reads (or use "-x map-ont" for nanopore read overlapping)
+minimap2/minimap2 -x map-pb -t8 pb-reads.fq pb-reads.fq | gzip -1 > reads.paf.gz
 # Layout
 miniasm/miniasm -f reads.fq reads.paf.gz > reads.gfa
 ```


=====================================
asm.c
=====================================
--- a/asm.c
+++ b/asm.c
@@ -45,10 +45,10 @@ void ma_sg_print(const asg_t *g, const sdict_t *d, const ma_sub_t *sub, FILE *fp
 		const asg_arc_t *p = &g->arc[i];
 		if (sub) {
 			const ma_sub_t *sq = &sub[p->ul>>33], *st = &sub[p->v>>1];
-			fprintf(fp, "L\t%s:%d-%d\t%c\t%s:%d-%d\t%c\t%dM\tSD:i:%d\n", d->seq[p->ul>>33].name, sq->s + 1, sq->e, "+-"[p->ul>>32&1],
+			fprintf(fp, "L\t%s:%d-%d\t%c\t%s:%d-%d\t%c\t%d:\tL1:i:%d\n", d->seq[p->ul>>33].name, sq->s + 1, sq->e, "+-"[p->ul>>32&1],
 					d->seq[p->v>>1].name, st->s + 1, st->e, "+-"[p->v&1], p->ol, (uint32_t)p->ul);
 		} else {
-			fprintf(fp, "L\t%s\t%c\t%s\t%c\t%dM\tSD:i:%d\n", d->seq[p->ul>>33].name, "+-"[p->ul>>32&1],
+			fprintf(fp, "L\t%s\t%c\t%s\t%c\t%d:\tL1:i:%d\n", d->seq[p->ul>>33].name, "+-"[p->ul>>32&1],
 					d->seq[p->v>>1].name, "+-"[p->v&1], p->ol, (uint32_t)p->ul);
 		}
 	}


=====================================
common.c
=====================================
--- a/common.c
+++ b/common.c
@@ -8,7 +8,6 @@ void ma_opt_init(ma_opt_t *opt)
 	opt->min_match = 100;
 	opt->min_dp = 3;
 	opt->min_iden = .05;
-	opt->cov_ratio = 0.;
 
 	opt->max_hang = 1000;
 	opt->min_ovlp = opt->min_span;


=====================================
debian/changelog
=====================================
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,10 +1,9 @@
-miniasm (0.2+dfsg-4) UNRELEASED; urgency=medium
+miniasm (0.3+dfsg-1) unstable; urgency=medium
 
-  * Stumbled upon code copy of some git commit in third party software
-    (Unicycler) and asked for clarification about releases in
-      https://github.com/lh3/miniasm/issues/51
+  * Team upload
+  * New upstream version
 
- -- Andreas Tille <tille at debian.org>  Tue, 17 Jul 2018 08:42:58 +0200
+ -- Andreas Tille <tille at debian.org>  Tue, 24 Jul 2018 06:56:06 +0200
 
 miniasm (0.2+dfsg-3) unstable; urgency=medium
 


=====================================
debian/patches/hardening
=====================================
--- a/debian/patches/hardening
+++ b/debian/patches/hardening
@@ -5,13 +5,13 @@ Last-Update: 2016-01-14
 +++ b/Makefile
 @@ -1,6 +1,5 @@
  CC=			gcc
--CFLAGS=		-g -Wall -O2 -Wc++-compat -Wno-unused-function
+-CFLAGS=		-g -Wall -O2 -Wc++-compat
 -CPPFLAGS=
-+CFLAGS+=		-g -Wall -O2 -Wc++-compat -Wno-unused-function
++CFLAGS+=		-g -Wall -O2 -Wc++-compat
  INCLUDES=	-I.
  OBJS=		sys.o sdict.o paf.o asg.o common.o hit.o asm.o
  PROG=		miniasm minidot
-@@ -9,15 +8,15 @@
+@@ -9,15 +8,15 @@ LIBS=		-lm -lz -lpthread
  .SUFFIXES:.c .o
  
  .c.o:


=====================================
debian/patches/spelling
=====================================
--- a/debian/patches/spelling
+++ b/debian/patches/spelling
@@ -2,7 +2,7 @@ Description: spelling
 Author: Sascha Steinbiss <satta at debian.org>
 --- a/miniasm.1
 +++ b/miniasm.1
-@@ -138,7 +138,7 @@
+@@ -153,7 +153,7 @@ sequences.
  Max and min overlap drop ratio [0.7,0.5]. Let overlap(v->w) be the overlap
  length of edge v->w and maxovlp(v)=max_w{overlap(v->w)} be the length of
  largest overlap. Miniasm drops overlap v->w if overlap(v->w)/maxovlp(v) is below


=====================================
dotter.c
=====================================
--- a/dotter.c
+++ b/dotter.c
@@ -91,19 +91,23 @@ int main(int argc, char *argv[])
 	d[1] = sd_init();
 
 	f = paf_open(argv[optind]);
+	if (!f) {
+		fprintf(stderr, "[E::%s] could not open PAF file %s\n", __func__, argv[optind]);
+		return 1;
+	}
 	while (paf_read(f, &r) >= 0) {
 		dt_hit_t *s;
 		if (r.qe - r.qs < min_span || r.te - r.ts < min_span || r.ml < min_match) continue;
 		if (r.ml < r.bl * min_iden) continue;
 		kv_pushp(dt_hit_t, h, &s);
-		s->qn = sd_put(d[0], r.qn, r.ql), s->qs = r.qs, s->qe = r.qe;
-		s->tn = sd_put(d[1], r.tn, r.tl);
+		s->qn = sd_put(d[1], r.qn, r.ql), s->qs = r.qs, s->qe = r.qe;
+		s->tn = sd_put(d[0], r.tn, r.tl);
 		s->ts = r.rev? r.te : r.ts, s->te = r.rev? r.ts : r.te;
 		s->ml = r.ml;
 	}
 	paf_close(f);
 
-	for (i = 0; i < 2; ++i) {
+	for (i = 0; i < 2; ++i) { // 0 for target; 1 for query
 		uint32_t n = d[i]->n_seq;
 		uint64_t l = 0;
 		a[i] = (srtaux_t*)calloc(n + 1, sizeof(srtaux_t));
@@ -113,22 +117,17 @@ int main(int argc, char *argv[])
 			ks_introsort_dtx(n, a[i]);
 		} else {
 			srtaux_t *b = a[i];
-			uint32_t *inv;
-			inv = (uint32_t*)calloc(d[0]->n_seq, 4);
-			for (j = 0; j < d[0]->n_seq; ++j)
-				inv[a[0][j].i] = j;
 			for (j = 0; j < n; ++j)
 				b[j].name = d[i]->seq[j].name, b[j].tot = b[j].w = 0, b[j].i = j;
 			for (j = 0; j < h.n; ++j) {
 				uint64_t w, coor;
 				dt_hit_t *p = &h.a[j];
-				srtaux_t *q = &b[p->tn];
-				coor = acclen[0][inv[p->qn]] + (p->qs + p->qe) / 2;
+				srtaux_t *q = &b[p->qn];
+				coor = acclen[0][p->tn] + (p->ts + p->te) / 2;
 				w = (uint64_t)(.01 * p->ml * p->ml + .499);
 				q->tot += (double)coor * w;
 				q->w += w;
 			}
-			free(inv);
 			for (j = 0; j < n; ++j) b[j].tot /= b[j].w;
 			ks_introsort_dty(n, b);
 		}
@@ -175,11 +174,11 @@ int main(int argc, char *argv[])
 		for (i = 0; i < h.n; ++i) {
 			dt_hit_t *p = &h.a[i];
 			double x0, y0, x1, y1;
-			uint64_t xo = acclen[0][p->qn], yo = acclen[1][p->tn];
+			uint64_t xo = acclen[0][p->tn], yo = acclen[1][p->qn];
 			if (j == 0 && p->ts > p->te) continue;
 			if (j == 1 && p->ts < p->te) continue;
-			x0 = (p->qs + xo) * sx, y0 = (p->ts + yo) * sy;
-			x1 = (p->qe + xo) * sx, y1 = (p->te + yo) * sy;
+			x0 = (p->ts + xo) * sx, y0 = (p->qs + yo) * sy;
+			x1 = (p->te + xo) * sx, y1 = (p->qe + yo) * sy;
 			eps_line(stdout, x0, y0, x1, y1);
 		}
 		eps_stroke(stdout);


=====================================
hit.c
=====================================
--- a/hit.c
+++ b/hit.c
@@ -14,12 +14,14 @@ KRADIX_SORT_INIT(hit, ma_hit_t, ma_hit_key, 8)
 
 KSORT_INIT_GENERIC(uint32_t)
 
+typedef kvec_t(uint32_t) uint32_v;
+
 void ma_hit_sort(size_t n, ma_hit_t *a)
 {
 	radix_sort_hit(a, a + n);
 }
 
-void ma_hit_mark_unused(sdict_t *d, int n, const ma_hit_t *a)
+void ma_hit_mark_unused(sdict_t *d, size_t n, const ma_hit_t *a)
 {
 	size_t i;
 	for (i = 0; i < d->n_seq; ++i)
@@ -33,7 +35,39 @@ void ma_hit_mark_unused(sdict_t *d, int n, const ma_hit_t *a)
 	}
 }
 
-ma_hit_t *ma_hit_read(const char *fn, int min_span, int min_match, sdict_t *d, size_t *n, int bi_dir)
+sdict_t *ma_hit_no_cont(const char *fn, int min_span, int min_match, int max_hang, float int_frac)
+{
+	paf_file_t *fp;
+	paf_rec_t r;
+	sdict_t *d;
+
+	fp = paf_open(fn);
+	if (!fp) {
+		fprintf(stderr, "[E::%s] could not open PAF file %s\n", __func__, fn);
+		exit(1);
+	}
+	d = sd_init();
+	while (paf_read(fp, &r) >= 0) {
+		int l5, l3;
+		if (r.qe - r.qs < min_span || r.te - r.ts < min_span || r.ml < min_match) continue;
+		l5 = r.rev? r.tl - r.te : r.ts;
+		l3 = r.rev? r.ts : r.tl - r.te;
+		if (r.ql>>1 > r.tl) {
+			if (l5 > max_hang>>2 || l3 > max_hang>>2 || r.te - r.ts < r.tl * int_frac) continue; // internal match
+			if ((int)r.qs - l5 > max_hang<<1 && (int)(r.ql - r.qe) - l3 > max_hang<<1)
+				sd_put(d, r.tn, r.tl);
+		} else if (r.ql < r.tl>>1) {
+			if (r.qs > max_hang>>2 || r.ql - r.qe > max_hang>>2 || r.qe - r.qs < r.ql * int_frac) continue; // internal
+			if (l5 - (int)r.qs > max_hang<<1 && l3 - (int)(r.ql - r.qe) > max_hang<<1)
+				sd_put(d, r.qn, r.ql);
+		}
+	}
+	paf_close(fp);
+	if (ma_verbose >= 3) fprintf(stderr, "[M::%s::%s] dropped %d contained reads\n", __func__, sys_timestamp(), d->n_seq);
+	return d;
+}
+
+ma_hit_t *ma_hit_read(const char *fn, int min_span, int min_match, sdict_t *d, size_t *n, int bi_dir, const sdict_t *excl)
 {
 	paf_file_t *fp;
 	paf_rec_t r;
@@ -41,10 +75,15 @@ ma_hit_t *ma_hit_read(const char *fn, int min_span, int min_match, sdict_t *d, s
 	size_t i, tot = 0, tot_len = 0;
 
 	fp = paf_open(fn);
+	if (!fp) {
+		fprintf(stderr, "[E::%s] could not open PAF file %s\n", __func__, fn);
+		exit(1);
+	}
 	while (paf_read(fp, &r) >= 0) {
 		ma_hit_t *p;
 		++tot;
 		if (r.qe - r.qs < min_span || r.te - r.ts < min_span || r.ml < min_match) continue;
+		if (excl && (sd_get(excl, r.qn) >= 0 || sd_get(excl, r.tn) >= 0)) continue;
 		kv_pushp(ma_hit_t, h, &p);
 		p->qns = (uint64_t)sd_put(d, r.qn, r.ql)<<32 | r.qs;
 		p->qe = r.qe;


=====================================
main.c
=====================================
--- a/main.c
+++ b/main.c
@@ -8,7 +8,7 @@
 #include "sdict.h"
 #include "miniasm.h"
 
-#define MA_VERSION "0.2-r128"
+#define MA_VERSION "0.3-r179"
 
 static void print_subs(const sdict_t *d, const ma_sub_t *sub)
 {
@@ -32,16 +32,16 @@ static void print_hits(size_t n_hits, const ma_hit_t *hit, const sdict_t *d, con
 int main(int argc, char *argv[])
 {
 	ma_opt_t opt;
-	int i, c, stage = 100, no_first = 0, no_second = 0, bi_dir = 1, o_set = 0;
-	sdict_t *d;
+	int i, c, stage = 100, no_first = 0, no_second = 0, bi_dir = 1, o_set = 0, no_cont = 0;
+	sdict_t *d, *excl = 0;
 	ma_sub_t *sub = 0;
 	ma_hit_t *hit;
 	size_t n_hits;
-	float cov;
+	float cov = 40.0;
 	char *fn_reads = 0, *outfmt = "ug";
 
 	ma_opt_init(&opt);
-	while ((c = getopt(argc, argv, "n:m:s:c:C:S:i:d:g:o:h:I:r:f:e:p:12VBbF:")) >= 0) {
+	while ((c = getopt(argc, argv, "n:m:s:c:S:i:d:g:o:h:I:r:f:e:p:12VBRbF:")) >= 0) {
 		if (c == 'm') opt.min_match = atoi(optarg);
 		else if (c == 'i') opt.min_iden = atof(optarg);
 		else if (c == 's') opt.min_span = atoi(optarg);
@@ -58,9 +58,9 @@ int main(int argc, char *argv[])
 		else if (c == '1') no_first = 1;
 		else if (c == '2') no_second = 1;
 		else if (c == 'n') opt.n_rounds = atoi(optarg) - 1;
-		else if (c == 'C') opt.cov_ratio = atof(optarg);
 		else if (c == 'B') bi_dir = 1;
 		else if (c == 'b') bi_dir = 0;
+		else if (c == 'R') no_cont = 1;
 		else if (c == 'F') opt.final_ovlp_drop_ratio = atof(optarg);
 		else if (c == 'V') {
 			printf("%s\n", MA_VERSION);
@@ -76,6 +76,7 @@ int main(int argc, char *argv[])
 		fprintf(stderr, "Usage: miniasm [options] <in.paf>\n");
 		fprintf(stderr, "Options:\n");
 		fprintf(stderr, "  Pre-selection:\n");
+		fprintf(stderr, "    -R          prefilter clearly contained reads (2-pass required)\n");
 		fprintf(stderr, "    -m INT      min match length [%d]\n", opt.min_match);
 		fprintf(stderr, "    -i FLOAT    min identity [%.2g]\n", opt.min_iden);
 		fprintf(stderr, "    -s INT      min span [%d]\n", opt.min_span);
@@ -107,8 +108,13 @@ int main(int argc, char *argv[])
 	sys_init();
 	d = sd_init();
 
+	if (no_cont) {
+		fprintf(stderr, "[M::%s] ===> Step 0: removing contained reads <===\n", __func__);
+		excl = ma_hit_no_cont(argv[optind], opt.min_span, opt.min_match, opt.max_hang, opt.int_frac);
+	}
+
 	fprintf(stderr, "[M::%s] ===> Step 1: reading read mappings <===\n", __func__);
-	hit = ma_hit_read(argv[optind], opt.min_span, opt.min_match, d, &n_hits, bi_dir);
+	hit = ma_hit_read(argv[optind], opt.min_span, opt.min_match, d, &n_hits, bi_dir, excl);
 
 	if (!no_first) {
 		fprintf(stderr, "[M::%s] ===> Step 2: 1-pass (crude) read selection <===\n", __func__);
@@ -123,12 +129,14 @@ int main(int argc, char *argv[])
 		fprintf(stderr, "[M::%s] ===> Step 3: 2-pass (fine) read selection <===\n", __func__);
 		if (stage >= 4) {
 			ma_sub_t *sub2;
-			int min_dp = (int)(cov * opt.cov_ratio + .499) - 1;
-			min_dp = min_dp > opt.min_dp? min_dp : opt.min_dp;
-			sub2 = ma_hit_sub(min_dp, opt.min_iden, opt.min_span/2, n_hits, hit, d->n_seq);
+			sub2 = ma_hit_sub(opt.min_dp, opt.min_iden, opt.min_span/2, n_hits, hit, d->n_seq);
 			n_hits = ma_hit_cut(sub2, opt.min_span, n_hits, hit);
-			ma_sub_merge(d->n_seq, sub, sub2);
-			free(sub2);
+			if (!no_first) {
+				ma_sub_merge(d->n_seq, sub, sub2);
+				free(sub2);
+			} else {
+				sub = sub2;
+			}
 		}
 		if (stage >= 5) n_hits = ma_hit_contained(&opt, d, sub, n_hits, hit);
 	}
@@ -139,7 +147,7 @@ int main(int argc, char *argv[])
 		print_subs(d, sub);
 	} else if (strcmp(outfmt, "paf") == 0) {
 		print_hits(n_hits, hit, d, sub);
-	} if (strcmp(outfmt, "ug") == 0 || strcmp(outfmt, "sg") == 0) {
+	} else if (strcmp(outfmt, "ug") == 0 || strcmp(outfmt, "sg") == 0) {
 		asg_t *sg = 0;
 		ma_ug_t *ug = 0;
 
@@ -192,6 +200,7 @@ int main(int argc, char *argv[])
 
 	free(sub); free(hit);
 	sd_destroy(d);
+	if (excl) sd_destroy(excl);
 
 	fprintf(stderr, "[M::%s] Version: %s\n", __func__, MA_VERSION);
 	fprintf(stderr, "[M::%s] CMD:", __func__);


=====================================
miniasm.1
=====================================
--- a/miniasm.1
+++ b/miniasm.1
@@ -1,4 +1,4 @@
-.TH minimap 1 "06 December 2015" "miniasm-0.2" "Bioinformatics tools"
+.TH miniasm 1 "23 July 2018" "miniasm-0.3 (r179)" "Bioinformatics tools"
 
 .SH NAME
 .PP
@@ -7,7 +7,7 @@ miniasm - de novo assembler for long read sequences
 .SH SYNOPSIS
 .PP
 miniasm
-.RB [ -b12V ]
+.RB [ -b12VR ]
 .RB [ -m
 .IR minMatch ]
 .RB [ -i
@@ -56,6 +56,21 @@ to the raw input reads.
 .SS Preselection options
 
 .TP 10
+.BI -R
+Pre-filter clearly contained short reads. In this mode,
+.I mapping.paf
+is read twice. The first pass identifies contained reads without loading hits
+to RAM; the second pass skips contained reads and load the rest into RAM. Due
+to the 2-pass behavior, the peak RAM is greatly reduced, but
+.I mapping.paf
+has to be a normal file, not a stream. When this option is in use, it is
+recommended to reduce
+.B -c
+to 2, as there are fewer reads after pre-filtering. Applying
+.BI -Rc 2
+sometimes improves assembly.
+
+.TP
 .BI -m \ INT
 Drop mappings having less than
 .I INT
@@ -231,9 +246,20 @@ _
 H	Header	N/A
 S	Segment	segName segSeq
 L	Overlap	segName1 segOri1 segName2 segOri2 ovlpCIGAR
-a	Golden path	utgName utgStart readName:start-end readOri length
+a	Golden path	utgName utgStart readName:rStart-rEnd readOri incLen
 .TE
 
+.PP
+An `a' line indicates that the unitig subsequence in
+.RI [ utgStart , utgStart + incLen )
+is taken from read
+.I readName
+in region
+.RI [ rStart -1, rStart -1+ incLen ).
+It is not a
+standard GFA line. An `x' line gives a brief summary of each unitig, which can
+be inferred from `S' and `a' lines.
+
 .SH SEE ALSO
 .PP
 minimap(1)


=====================================
miniasm.h
=====================================
--- a/miniasm.h
+++ b/miniasm.h
@@ -14,7 +14,6 @@ typedef struct {
 	int min_match;
 	int min_dp;
 	float min_iden;
-	float cov_ratio;
 
 	int max_hang;
 	int min_ovlp;
@@ -60,7 +59,8 @@ extern "C" {
 #endif
 
 void ma_opt_init(ma_opt_t *opt);
-ma_hit_t *ma_hit_read(const char *fn, int min_span, int min_match, sdict_t *d, size_t *n, int bi_dir);
+sdict_t *ma_hit_no_cont(const char *fn, int min_span, int min_match, int max_hang, float int_frac);
+ma_hit_t *ma_hit_read(const char *fn, int min_span, int min_match, sdict_t *d, size_t *n, int bi_dir, const sdict_t *excl);
 ma_sub_t *ma_hit_sub(int min_dp, float min_iden, int end_clip, size_t n, const ma_hit_t *a, size_t n_sub);
 size_t ma_hit_cut(const ma_sub_t *reg, int min_span, size_t n, ma_hit_t *a);
 size_t ma_hit_flt(const ma_sub_t *sub, int max_hang, int min_ovlp, size_t n, ma_hit_t *a, float *cov);
@@ -87,7 +87,6 @@ static inline int ma_hit2arc(const ma_hit_t *h, int ql, int tl, int max_hang, fl
 {
 	int32_t tl5, tl3, ext5, ext3, qs = (int32_t)h->qns;
 	uint32_t u, v, l; // u: query end; v: target end; l: length from u to v
-	u = v = l = UINT32_MAX;
 	if (h->rev) tl5 = tl - h->te, tl3 = h->ts; // tl5: 5'-end overhang (on the query strand); tl3: similar
 	else tl5 = h->ts, tl3 = tl - h->te;
 	ext5 = qs < tl5? qs : tl5;


=====================================
misc/da2paf.pl
=====================================
--- a/misc/da2paf.pl
+++ b/misc/da2paf.pl
@@ -5,19 +5,23 @@ use warnings;
 use Getopt::Std;
 
 my %opts;
-getopts("2", \%opts);
-die("Usage: ls *.las | xargs -i LAdump -cd reads.db {} | da2paf.pl [-2] <(DBdump -rh reads.db)\n") if @ARGV < 1;
+getopts("2n", \%opts);
+die("Usage: ls *.las | xargs -i LAdump -cd reads.db {} | da2paf.pl [-2n] <(DBdump -rh reads.db)\n") if @ARGV < 1;
 my $is_dbl = defined($opts{2});
+my $with_name = defined($opts{n});
 
 warn("Reading sequence lengths...\n");
 my $fn = shift(@ARGV);
 open(FH, $fn) || die;
-my ($id, @len);
+my ($id, $pre, @len, @name);
 while (<FH>) {
 	if (/^R\s(\d+)/) {
 		$id = $1;
-	} elsif (/^L\s\S+\s(\d+)\s(\d+)/) {
-		$len[$id] = $2 - $1;
+	} elsif (/^H\s\S+\s(\S+)/) {
+		$pre = $1;
+	} elsif (/^L\s(\S+)\s(\d+)\s(\d+)/) {
+		$len[$id] = $3 - $2;
+		$name[$id] = "$pre/$1/$2_$3";
 	}
 }
 close(FH);
@@ -33,11 +37,12 @@ while (<>) {
 	} elsif (!$skip && /^D\s(\d+)/) {
 		my $bl = $ae - $ab > $be - $bb? $ae - $ab : $be - $bb;
 		my $ml = $bl - $1;
+		my ($n0, $n1) = $with_name? ($name[$id0], $name[$id1]) : ($id0, $id1);
 		if ($strand eq '+') {
-			print join("\t", $id0, $len[$id0], $ab, $ae, '+', $id1, $len[$id1], $bb, $be, $ml, $bl, 255), "\n";
+			print join("\t", $n0, $len[$id0], $ab, $ae, '+', $n1, $len[$id1], $bb, $be, $ml, $bl, 255), "\n";
 		} else {
 			my $l = $len[$id1];
-			print join("\t", $id0, $len[$id0], $ab, $ae, '-', $id1, $l, $l - $be, $l - $bb, $ml, $bl, 255), "\n";
+			print join("\t", $n0, $len[$id0], $ab, $ae, '-', $n1, $l, $l - $be, $l - $bb, $ml, $bl, 255), "\n";
 		}
 	}
 }


=====================================
misc/mhap2paf.pl
=====================================
--- a/misc/mhap2paf.pl
+++ b/misc/mhap2paf.pl
@@ -5,10 +5,21 @@ use warnings;
 use Getopt::Std;
 
 my %opts = ();
-getopts("2", \%opts);
-my $is_dbl = defined($opts{2});
+getopts("2f:l:", \%opts);
+die("Usage: mhap2paf.pl [-2] [-f name_list] [-l min_len] <in.mhap>\n") if (@ARGV == 0 && -t STDIN);
 
-die("Usage: mhap2paf.pl [-2] <in.mhap>\n") if (@ARGV == 0 && -t STDIN);
+my $is_dbl = defined($opts{2});
+my $min_blen = defined($opts{l})? $opts{l} : 0;
+my @a = ();
+if (defined $opts{f}) {
+	open(FH, $opts{f} =~ /\.gz$/? "gzip -dc $opts{f} |" : $opts{f}) || die;
+	while (<FH>) {
+		chomp;
+		my @t = split;
+		push(@a, $t[0]);
+	}
+	close(FH);
+}
 
 while (<>) {
 	chomp;
@@ -18,6 +29,11 @@ while (<>) {
 	my $ml = int($bl * ($r <= 1.? $r : .01 * $r) + .499);
 	my $cm = "cm:i:" . int($t[3] + .499);
 	my $rev = $t[4] == $t[8]? '+' : '-';
+	next if $bl < $min_blen;
+	if (@a) {
+		$t[0] = $a[$t[0]-1];
+		$t[1] = $a[$t[1]-1];
+	}
 	print(join("\t", @t[0,7,5,6], $rev, @t[1,11,9,10], $ml, $bl, 255, $cm), "\n");
 	print(join("\t", @t[1,11,9,10], $rev, @t[0,7,5,6], $ml, $bl, 255, $cm), "\n") if ($is_dbl);
 }


=====================================
misc/ov-sen.js
=====================================
--- /dev/null
+++ b/misc/ov-sen.js
@@ -0,0 +1,114 @@
+var getopt = function(args, ostr) {
+	var oli; // option letter list index
+	if (typeof(getopt.place) == 'undefined')
+		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
+	if (getopt.place == -1) { // update scanning pointer
+		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
+			getopt.place = -1;
+			return null;
+		}
+		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
+			++getopt.ind;
+			getopt.place = -1;
+			return null;
+		}
+	}
+	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
+	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
+		if (optopt == '-') return null; //  if the user didn't specify '-' as an option, assume it means null.
+		if (getopt.place < 0) ++getopt.ind;
+		return '?';
+	}
+	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
+		getopt.arg = null;
+		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
+	} else { // need an argument
+		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
+			getopt.arg = args[getopt.ind].substr(getopt.place);
+		else if (args.length <= ++getopt.ind) { // no arg
+			getopt.place = -1;
+			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
+			return '?';
+		} else getopt.arg = args[getopt.ind]; // white space
+		getopt.place = -1;
+		++getopt.ind;
+	}
+	return optopt;
+}
+
+var c, min_len = 2000, min_mapq = 10;
+while ((c = getopt(arguments, "l:q:")) != null)
+	if (c == 'l') min_len = parseInt(getopt.arg);
+	else if (c == 'q') min_mapq = parseInt(getopt.arg);
+
+if (arguments.length - getopt.ind < 2) {
+	warn("Usage: k8 ov-sen.js [options] <in.ref-sorted.paf> <in.ovlp.paf>");
+	warn("Options:");
+	warn("  -l INT     min overlap length [2000]");
+	warn("  -q INT     min mapping quality [10]");
+	exit(1);
+}
+
+var file, buf = new Bytes();
+
+var h = {};
+
+file = new File(arguments[getopt.ind]);
+var a = [];
+while (file.readline(buf) >= 0) {
+	var t = buf.toString().split("\t");
+	if (parseInt(t[11]) < min_mapq) continue;
+	if (parseInt(t[10]) < min_len) continue;
+	var st = parseInt(t[7]), en = parseInt(t[8]);
+
+	var n_shift = 0;
+	if (a.length > 0) {
+		for (var i = 0; i < a.length; ++i) {
+			if (t[5] != a[i][1]) {
+				++n_shift;
+			} else {
+				var min_en = a[i][3] < en? a[i][3] : en;
+				if (min_en - st >= min_len) break;
+				++n_shift;
+			}
+		}
+	}
+	if (n_shift > 0) {
+		for (var i = 0; i < n_shift; ++i)
+			a.shift();
+	}
+	if (a.length > 0) {
+		for (var i = 0; i < a.length; ++i) {
+			if (t[5] != a[i][1]) continue;
+			var min_en = a[i][3] < en? a[i][3] : en;
+			if (min_en - st < min_len) continue;
+			h[a[i][0] + "\t" + t[0]] = 0;
+			//print(a[i][0], t[0], min_en - st);
+		}
+	}
+	a.push([t[0], t[5], st, en]);
+}
+file.close();
+
+file = new File(arguments[getopt.ind+1]);
+while (file.readline(buf) >= 0) {
+	var t = buf.toString().split("\t");
+	var key = t[0] + "\t" + t[5];
+	if (h[key] != null) ++h[key];
+	else {
+		key = t[5] + "\t" + t[0];
+		if (h[key] != null) ++h[key];
+	}
+}
+file.close();
+
+buf.destroy();
+
+var n_ovlp = 0, n_missed = 0;
+for (var key in h) {
+	++n_ovlp;
+	if (h[key] == 0) ++n_missed;
+}
+print(n_ovlp + " overlaps");
+print(n_missed + " missed");
+print((1 - n_missed/n_ovlp).toFixed(4) + " sensitivity");


=====================================
misc/paftop.js
=====================================
--- /dev/null
+++ b/misc/paftop.js
@@ -0,0 +1,142 @@
+var getopt = function(args, ostr) {
+	var oli; // option letter list index
+	if (typeof(getopt.place) == 'undefined')
+		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
+	if (getopt.place == -1) { // update scanning pointer
+		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
+			getopt.place = -1;
+			return null;
+		}
+		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
+			++getopt.ind;
+			getopt.place = -1;
+			return null;
+		}
+	}
+	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
+	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
+		if (optopt == '-') return null; //  if the user didn't specify '-' as an option, assume it means null.
+		if (getopt.place < 0) ++getopt.ind;
+		return '?';
+	}
+	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
+		getopt.arg = null;
+		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
+	} else { // need an argument
+		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
+			getopt.arg = args[getopt.ind].substr(getopt.place);
+		else if (args.length <= ++getopt.ind) { // no arg
+			getopt.place = -1;
+			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
+			return '?';
+		} else getopt.arg = args[getopt.ind]; // white space
+		getopt.place = -1;
+		++getopt.ind;
+	}
+	return optopt;
+}
+
+function pafmask(a, mask_level)
+{
+	var k = 1;
+	for (var i = 1; i < a.length; ++i) {
+		var j, ai = a[i];
+		for (j = 0; j < k; ++j) {
+			var ol = 0, aj = a[j];
+			if (ai[2] < aj[2]) {
+				if (ai[3] > aj[2])
+					ol = ai[3] - aj[2];
+			} else {
+				if (aj[3] > ai[2])
+					ol = aj[3] - ai[2];
+			}
+			var min_l = ai[3] - ai[2] < aj[3] - aj[2]? ai[3] - ai[2] : aj[3] - aj[2];
+			if (ol > min_l * mask_level)
+				break;
+		}
+		if (j == k) a[k++] = ai;
+	}
+	a.length = k;
+}
+
+function pafmerge(a, max_gap)
+{
+	for (var i = 1; i < a.length; ++i) {
+		var ai = a[i];
+		for (var j = 0; j < i; ++j) {
+			var aj = a[j];
+			if (aj[4] != ai[4] || aj[5] != ai[5]) continue; // diff strand or chr
+			var ts = [ai[7], aj[7]], te = [ai[8], aj[8]];
+			var qs = [ai[2], aj[2]], qe = [ai[3], aj[3]];
+			if (qs[0] > qs[1]) {
+				qs = [aj[2], ai[2]], qe = [aj[3], ai[3]];
+				ts = [aj[7], ai[7]], te = [aj[8], ai[8]];
+				if (ai[4] == '-') {
+					ts = [aj[6] - aj[8], ai[6] - ai[8]];
+					te = [aj[6] - aj[7], ai[6] - ai[7]];
+				}
+			} else {
+				if (ai[4] == '-') {
+					ts = [ai[6] - ai[8], aj[6] - aj[8]];
+					te = [ai[6] - ai[7], aj[6] - aj[7]];
+				}
+			}
+			if (qe[0] > qe[1]) continue; // contained
+			if (ts[0] > ts[1]) continue;
+			var qg = qs[1] - qe[0], tg = ts[1] - te[0];
+			if ((qg < 0 && tg < 0) || Math.abs(tg - qg) < max_gap) {
+				//print("Merged: ["+ai[2]+","+ai[3]+") <=> ["+aj[2]+","+aj[3]+") "+ai[4]+" ["+ai[7]+","+ai[8]+") <=> ["+aj[7]+","+aj[8]+")");
+				aj[2] = qs[0], aj[3] = qe[1];
+				if (aj[4] == '+') {
+					aj[7] = ts[0], aj[8] = te[1];
+				} else {
+					aj[7] = aj[6] - te[1], aj[8] = aj[6] - ts[0];
+				}
+				aj[9] += ai[9], aj[10] += ai[10];
+				aj[11] = aj[11] > ai[11]? aj[11] : ai[11];
+				a[i] = [];
+				break;
+			}
+		}
+	}
+	var k = 0;
+	for (var i = 0; i < a.length; ++i)
+		if (a[i].length != 0) a[k++] = a[i];
+	a.length = k;
+}
+
+function paftop(a, mask_level, max_gap)
+{
+	for (var i = 0; i < a.length; ++i) {
+		for (var j = 1; j <= 3; ++j) a[i][j] = parseInt(a[i][j]);
+		for (var j = 6; j <= 11; ++j) a[i][j] = parseInt(a[i][j]);
+	}
+	a.sort(function(x,y){return y[9]-x[9];});
+	pafmask(a, mask_level);
+	pafmerge(a, max_gap);
+	pafmask(a, mask_level);
+	for (var i = 0; i < a.length; ++i)
+		if (a[i].length) print(a[i].join("\t"));
+}
+
+var c, mask_level = .5, max_gap = 1000;
+while ((c = getopt(arguments, 'm:g:')) != null)
+	if (c == 'm') mask_level = parseFloat(getopt.arg);
+	else if (c == 'g') max_gap = parseInt(getopt.arg);
+
+var file = arguments.length == getopt.ind? new File() : new File(arguments[getopt.ind]);
+var buf = new Bytes();
+
+var last = null, a = [];
+while (file.readline(buf) >= 0) {
+	var t = buf.toString().split("\t");
+	if (t[0] != last) {
+		if (a.length) paftop(a, mask_level, max_gap);
+		a = [], last = t[0];
+	}
+	a.push(t);
+}
+if (a.length) paftop(a, mask_level, max_gap);
+
+buf.destroy();
+file.close();


=====================================
misc/sam2paf.js
=====================================
--- a/misc/sam2paf.js
+++ b/misc/sam2paf.js
@@ -1,10 +1,52 @@
-var file = arguments.length? new File(arguments[0]) : new File();
+var getopt = function(args, ostr) {
+	var oli; // option letter list index
+	if (typeof(getopt.place) == 'undefined')
+		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
+	if (getopt.place == -1) { // update scanning pointer
+		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
+			getopt.place = -1;
+			return null;
+		}
+		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
+			++getopt.ind;
+			getopt.place = -1;
+			return null;
+		}
+	}
+	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
+	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
+		if (optopt == '-') return null; //  if the user didn't specify '-' as an option, assume it means null.
+		if (getopt.place < 0) ++getopt.ind;
+		return '?';
+	}
+	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
+		getopt.arg = null;
+		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
+	} else { // need an argument
+		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
+			getopt.arg = args[getopt.ind].substr(getopt.place);
+		else if (args.length <= ++getopt.ind) { // no arg
+			getopt.place = -1;
+			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
+			return '?';
+		} else getopt.arg = args[getopt.ind]; // white space
+		getopt.place = -1;
+		++getopt.ind;
+	}
+	return optopt;
+}
+
+var c, pri_only = false;
+while ((c = getopt(arguments, "p")) != null)
+	if (c == 'p') pri_only = true;
+
+var file = arguments.length == getopt.ind? new File() : new File(arguments[getopt.ind]);
 var buf = new Bytes();
-var re = /(\d+)([MIDSHN])/g;
+var re = /(\d+)([MIDSHNX=])/g;
 
 var len = {}, lineno = 0;
 while (file.readline(buf) >= 0) {
-	var m, line = buf.toString();
+	var m, n_cigar = 0, line = buf.toString();
 	++lineno;
 	if (line.charAt(0) == '@') {
 		if (/^@SQ/.test(line)) {
@@ -16,25 +58,43 @@ while (file.readline(buf) >= 0) {
 	}
 	var t = line.split("\t");
 	var flag = parseInt(t[1]);
+	if (t[9] != '*' && t[10] != '*' && t[9].length != t[10].length) throw Error("ERROR at line " + lineno + ": inconsistent SEQ and QUAL lengths - " + t[9].length + " != " + t[10].length);
 	if (t[2] == '*' || (flag&4)) continue;
+	if (pri_only && (flag&0x100)) continue;
 	var tlen = len[t[2]];
 	if (tlen == null) throw Error("ERROR at line " + lineno + ": can't find the length of contig " + t[2]);
+	var nn = (m = /\tnn:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0;
 	var NM = (m = /\tNM:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : null;
-	if (NM == null) throw Error("ERROR at line " + lineno + ": no NM tag");
-	var clip = [0, 0], I = [0, 0], D = [0, 0], M = 0, N = 0;
+	var have_NM = NM == null? false : true;
+	NM += nn;
+	var clip = [0, 0], I = [0, 0], D = [0, 0], M = 0, N = 0, ql = 0, tl = 0, mm = 0, ext_cigar = false;
 	while ((m = re.exec(t[5])) != null) {
 		var l = parseInt(m[1]);
-		if (m[2] == 'M') M += l;
-		else if (m[2] == 'I') ++I[0], I[1] += l;
-		else if (m[2] == 'D') ++D[0], D[1] += l;
-		else if (m[2] == 'N') N += l;
-		else if (m[2] == 'S' || m[2] == 'H')
-			clip[M == 0? 0 : 1] = l;
-	}
-	if (NM < I[1] + D[1]) {
-		warn("WARNING at line " + lineno + ": NM is less than the total number of gaps; skipped");
+		if (m[2] == 'M') M += l, ql += l, tl += l, ext_cigar = false;
+		else if (m[2] == 'I') ++I[0], I[1] += l, ql += l;
+		else if (m[2] == 'D') ++D[0], D[1] += l, tl += l;
+		else if (m[2] == 'N') N += l, tl += l;
+		else if (m[2] == 'S') clip[M == 0? 0 : 1] = l, ql += l;
+		else if (m[2] == 'H') clip[M == 0? 0 : 1] = l;
+		else if (m[2] == '=') M += l, ql += l, tl += l, ext_cigar = true;
+		else if (m[2] == 'X') M += l, ql += l, tl += l, mm += l, ext_cigar = true;
+		++n_cigar;
+	}
+	if (n_cigar > 65535)
+		warn("WARNING at line " + lineno + ": " + n_cigar + " CIGAR operations");
+	if (tl + parseInt(t[3]) - 1 > tlen) {
+		warn("WARNING at line " + lineno + ": alignment end position larger than ref length; skipped");
 		continue;
 	}
+	if (t[9] != '*' && t[9].length != ql) {
+		warn("WARNING at line " + lineno + ": SEQ length inconsistent with CIGAR (" + t[9].length + " != " + ql + "); skipped");
+		continue;
+	}
+	if (!have_NM || ext_cigar) NM = I[1] + D[1] + mm;
+	if (NM < I[1] + D[1] + mm) {
+		warn("WARNING at line " + lineno + ": NM is less than the total number of gaps (" + NM + " < " + (I[1]+D[1]+mm) + ")");
+		NM = I[1] + D[1] + mm;
+	}
 	var extra = ["mm:i:"+(NM-I[1]-D[1]), "io:i:"+I[0], "in:i:"+I[1], "do:i:"+D[0], "dn:i:"+D[1]];
 	var match = M - (NM - I[1] - D[1]);
 	var blen = M + I[1] + D[1];


=====================================
misc/wt2paf.pl
=====================================
--- /dev/null
+++ b/misc/wt2paf.pl
@@ -0,0 +1,17 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+while (<>) {
+	chomp;
+	my @t = split("\t");
+	if ($t[4] eq '-') {
+		@t[3,4] = ($t[2] - $t[4], $t[2] - $t[3]);
+	}
+	if ($t[6] eq '-') {
+		@t[8,9] = ($t[7] - $t[9], $t[7] - $t[8]);
+	}
+	my $bl = $t[12] + $t[13] + $t[14] + $t[15];
+	print join("\t", @t[0,2..4], $t[1] eq $t[6]? '+' : '-', @t[5,7..9,12], $bl, 255), "\n";
+}



View it on GitLab: https://salsa.debian.org/med-team/miniasm/compare/c23b2121ec3f8082bbb02499edea827181a76fda...d7b6c4f3557ddd7602088589e02ff4ef9e874444

-- 
View it on GitLab: https://salsa.debian.org/med-team/miniasm/compare/c23b2121ec3f8082bbb02499edea827181a76fda...d7b6c4f3557ddd7602088589e02ff4ef9e874444
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20180724/999412b1/attachment-0001.html>


More information about the debian-med-commit mailing list