[med-svn] [Git][med-team/fermi-lite][upstream] New upstream version 0.1+git20190320.b499514

Fri Jan 8 10:14:53 GMT 2021


Andreas Tille pushed to branch upstream at Debian Med / fermi-lite


Commits:
64754d1a by Andreas Tille at 2021-01-08T10:42:56+01:00
New upstream version 0.1+git20190320.b499514
- - - - -


7 changed files:

- README.md
- bubble.c
- example.c
- fml.h
- mag.c
- mag.h
- misc.c


Changes:

=====================================
README.md
=====================================
@@ -18,6 +18,11 @@ is able to retain heterozygous events and thus can be used to assemble diploid
 regions for the purpose of variant calling. It is one of the limited choices
 for local re-assembly and arguably the easiest to interface.
 
+If you use fermi-lite in your work, please cite the FermiKit paper:
+
+> Li H (2015) FermiKit: assembly-based variant calling for Illumina
+> resequencing data, *Bioinformatics*, **31**:3694-6.
+
 ## Usage
 
 For now, see [example.c][example] for the basic use of the library. Here is a
@@ -42,8 +47,9 @@ int main(int argc, char *argv[])
 	return 0;
 }
 ```
-The direct assembly output is in fact a graph. You may have a look at the
-[header file][header] for details.
+The `fml_assemble()` output is in fact a graph. You may have a look at the
+`fml_utg_print_gfa()` function in [misc.c][misc] about how to derive a
+[GFA][gfa] representation from an array of `fml_utg_t` objects.
 
 ## Overview of the Assembly Algorithm
 
@@ -86,3 +92,5 @@ after these procedure is the final output. Sequences in this graph are unitigs.
 [fk]: http://github.com/lh3/fermikit
 [example]: https://github.com/lh3/fermi-lite/blob/master/example.c
 [header]: https://github.com/lh3/fermi-lite/blob/master/fml.h
+[misc]: https://github.com/lh3/fermi-lite/blob/master/misc.c
+[gfa]: https://github.com/pmelsted/GFA-spec


=====================================
bubble.c
=====================================
@@ -16,8 +16,6 @@ typedef khash_t(64) hash64_t;
 #define edge_mark_del(_x) ((_x).x = (uint64_t)-2, (_x).y = 0)
 #define edge_is_del(_x)   ((_x).x == (uint64_t)-2 || (_x).y == 0)
 
-static int fm_verbose = 1;
-
 /******************
  * Closed bubbles *
  ******************/
@@ -178,7 +176,7 @@ void mag_g_simplify_bubble(mag_t *g, int max_vtx, int max_dist)
 	mag_g_merge(g, 0, 0);
 }
 
-int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int aggressive)
+int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int max_bdiff, int aggressive)
 {
 	magv_t *p = &g->v.a[idd>>1], *q[2];
 	ku128_v *r;
@@ -195,9 +193,10 @@ int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int
 		dir[j] = x&1;
 		q[j] = &g->v.a[x>>1];
 		if (q[j]->nei[0].n != 1 || q[j]->nei[1].n != 1) return ret; // no bubble
-		l[j] = q[j]->len - (int)(q[j]->nei[0].a->y + q[j]->nei[1].a->y);
+		l[j] = q[j]->len - (int)(q[j]->nei[0].a->y + q[j]->nei[1].a->y); // bubble length, excluding overlaps
 	}
 	if (q[0]->nei[dir[0]^1].a->x != q[1]->nei[dir[1]^1].a->x) return ret; // no bubble
+	if (l[0] - l[1] > max_bdiff || l[1] - l[0] > max_bdiff) return 1; // huge bubble differences
 	for (j = 0; j < 2; ++j) { // set seq[] and cov[], and compute avg[]
 		if (l[j] > 0) {
 			seq[j] = malloc(l[j]<<1);
@@ -254,16 +253,16 @@ int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int
 	return ret;
 }
 
-void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int aggressive)
+void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int max_bdiff, int aggressive)
 {
 	int64_t i, n_examined = 0, n_popped = 0;
 	int ret;
 
 	for (i = 0; i < g->v.n; ++i) {
-		ret = mag_vh_pop_simple(g, i<<1|0, max_cov, max_frac, aggressive);
+		ret = mag_vh_pop_simple(g, i<<1|0, max_cov, max_frac, max_bdiff, aggressive);
 		if (ret >= 1) ++n_examined;
 		if (ret >= 2) ++n_popped;
-		ret = mag_vh_pop_simple(g, i<<1|1, max_cov, max_frac, aggressive);
+		ret = mag_vh_pop_simple(g, i<<1|1, max_cov, max_frac, max_bdiff, aggressive);
 		if (ret >= 1) ++n_examined;
 		if (ret >= 2) ++n_popped;
 	}


=====================================
example.c
=====================================
@@ -6,17 +6,21 @@
 int main(int argc, char *argv[])
 {
 	fml_opt_t opt;
-	int c, n_seqs, n_utg;
+	int c, n_seqs, n_utg, gfa_out = 0;
 	bseq1_t *seqs;
 	fml_utg_t *utg;
 
 	fml_opt_init(&opt);
-	while ((c = getopt(argc, argv, "Ae:l:r:t:c:")) >= 0) {
+	while ((c = getopt(argc, argv, "gOAe:l:r:t:c:d:v:")) >= 0) {
 		if (c == 'e') opt.ec_k = atoi(optarg);
 		else if (c == 'l') opt.min_asm_ovlp = atoi(optarg);
 		else if (c == 'r') opt.mag_opt.min_dratio1 = atof(optarg);
 		else if (c == 'A') opt.mag_opt.flag |= MAG_F_AGGRESSIVE;
+		else if (c == 'O') opt.mag_opt.flag &= ~MAG_F_POPOPEN;
+		else if (c == 'd') opt.mag_opt.max_bdiff = atoi(optarg);
 		else if (c == 't') opt.n_threads = atoi(optarg);
+		else if (c == 'g') gfa_out = 1;
+		else if (c == 'v') fm_verbose = atoi(optarg);
 		else if (c == 'c') {
 			char *p;
 			opt.min_cnt = strtol(optarg, &p, 10);
@@ -31,12 +35,16 @@ int main(int argc, char *argv[])
 		fprintf(stderr, "  -l INT          min overlap length during initial assembly [%d]\n", opt.min_asm_ovlp);
 		fprintf(stderr, "  -r FLOAT        drop an overlap if its length is below maxOvlpLen*FLOAT [%g]\n", opt.mag_opt.min_dratio1);
 		fprintf(stderr, "  -t INT          number of threads (don't use multi-threading for small data sets) [%d]\n", opt.n_threads);
-		fprintf(stderr, "  -A              discard heterozygotes (apply this to assemble bacterial genomes)\n");
+		fprintf(stderr, "  -d INT          retain a bubble if one side is longer than the other side by >INT-bp [%d]\n", opt.mag_opt.max_bdiff);
+		fprintf(stderr, "  -A              discard heterozygotes (apply this to assemble bacterial genomes; override -O)\n");
+		fprintf(stderr, "  -O              don't apply aggressive tip trimming\n");
+		fprintf(stderr, "  -g              output the assembly graph in the GFA format\n");
 		return 1;
 	}
 	seqs = bseq_read(argv[optind], &n_seqs);
 	utg = fml_assemble(&opt, n_seqs, seqs, &n_utg);
-	fml_utg_print(n_utg, utg);
+	if (!gfa_out) fml_utg_print(n_utg, utg);
+	else fml_utg_print_gfa(n_utg, utg);
 	fml_utg_destroy(n_utg, utg);
 	return 0;
 }


=====================================
fml.h
=====================================
@@ -1,20 +1,21 @@
 #ifndef FML_H
 #define FML_H
 
-#define FML_VERSION "r41"
+#define FML_VERSION "r55"
 
 #include <stdint.h>
 
 typedef struct {
 	int32_t l_seq;
-	char *seq, *qual;
+	char *seq, *qual; // NULL-terminated strings; length expected to match $l_seq
 } bseq1_t;
 
-#define MAG_F_AGGRESSIVE 0x20
-#define MAG_F_NO_SIMPL   0x80
+#define MAG_F_AGGRESSIVE 0x20 // pop variant bubbles (not default)
+#define MAG_F_POPOPEN    0x40 // aggressive tip trimming (default)
+#define MAG_F_NO_SIMPL   0x80 // skip bubble simplification (default)
 
 typedef struct {
-	int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bvtx, min_merge_len, trim_len, trim_depth;
+	int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth;
 	float min_dratio1, max_bcov, max_bfrac;
 } magopt_t;
 
@@ -31,8 +32,8 @@ struct rld_t;
 struct mag_t;
 
 typedef struct {
-	uint32_t tid;
-	uint32_t len:31, from:1;
+	uint32_t len:31, from:1; // $from and $to: 0 meaning overlapping 5'-end; 1 overlapping 3'-end
+	uint32_t id:31, to:1;    // $id: unitig number
 } fml_ovlp_t;
 
 typedef struct {
@@ -44,6 +45,8 @@ typedef struct {
 	fml_ovlp_t *ovlp; // overlaps, of size n_ovlp[0]+n_ovlp[1]
 } fml_utg_t;
 
+extern int fm_verbose;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -121,7 +124,7 @@ float fml_fltuniq(const fml_opt_t *opt, int n, bseq1_t *seq);
  * @param n         number of sequences
  * @param seq       array of sequences; FREED on return
  *
- * @return FMD-index
+ * @return FMD-index on success; NULL if all input sequences are zero in length
  */
 struct rld_t *fml_seq2fmi(const fml_opt_t *opt, int n, bseq1_t *seq);
 
@@ -161,6 +164,14 @@ fml_utg_t *fml_mag2utg(struct mag_t *g, int *n_utg);
  */
 void fml_utg_print(int n_utgs, const fml_utg_t *utg);
 
+/**
+ * Output unitig graph in the GFA format
+ *
+ * @param n_utg     number of unitigs
+ * @param utg       array of unitigs
+ */
+void fml_utg_print_gfa(int n, const fml_utg_t *utg);
+
 /**
  * Deallocate an FM-index
  *


=====================================
mag.c
=====================================
@@ -28,7 +28,7 @@ KSORT_INIT_GENERIC(uint64_t)
 #define edge_mark_del(_x) ((_x).x = (uint64_t)-2, (_x).y = 0)
 #define edge_is_del(_x)   ((_x).x == (uint64_t)-2 || (_x).y == 0)
 
-static int fm_verbose = 1;
+int fm_verbose = 1;
 
 /*********************
  * Vector operations *
@@ -553,6 +553,7 @@ void mag_init_opt(magopt_t *o)
 	o->max_bfrac = 0.15;
 	o->max_bvtx = 64;
 	o->max_bdist = 512;
+	o->max_bdiff = 50;
 }
 
 void mag_g_clean(mag_t *g, const magopt_t *opt)
@@ -568,15 +569,15 @@ void mag_g_clean(mag_t *g, const magopt_t *opt)
 	for (j = 2; j <= opt->min_ensr; ++j)
 		mag_g_rm_vext(g, opt->min_elen, j);
 	mag_g_merge(g, 0, opt->min_merge_len);
-	if (opt->flag & MAG_F_AGGRESSIVE) mag_g_pop_open(g, opt->min_elen);
+	if ((opt->flag & MAG_F_AGGRESSIVE) || (opt->flag & MAG_F_POPOPEN)) mag_g_pop_open(g, opt->min_elen);
 	if (!(opt->flag & MAG_F_NO_SIMPL)) mag_g_simplify_bubble(g, opt->max_bvtx, opt->max_bdist);
-	mag_g_pop_simple(g, opt->max_bcov, opt->max_bfrac, opt->min_merge_len, opt->flag & MAG_F_AGGRESSIVE);
+	mag_g_pop_simple(g, opt->max_bcov, opt->max_bfrac, opt->min_merge_len, opt->max_bdiff, opt->flag & MAG_F_AGGRESSIVE);
 	mag_g_rm_vint(g, opt->min_elen, opt->min_insr, g->min_ovlp);
 	mag_g_rm_edge(g, g->min_ovlp, opt->min_dratio1, opt->min_elen, opt->min_ensr);
 	mag_g_merge(g, 1, opt->min_merge_len);
 	mag_g_rm_vext(g, opt->min_elen, opt->min_ensr);
 	mag_g_merge(g, 0, opt->min_merge_len);
-	if (opt->flag & MAG_F_AGGRESSIVE) mag_g_pop_open(g, opt->min_elen);
+	if ((opt->flag & MAG_F_AGGRESSIVE) || (opt->flag & MAG_F_POPOPEN)) mag_g_pop_open(g, opt->min_elen);
 	mag_g_rm_vext(g, opt->min_elen, opt->min_ensr);
 	mag_g_merge(g, 0, opt->min_merge_len);
 }


=====================================
mag.h
=====================================
@@ -49,7 +49,7 @@ extern "C" {
 	void mag_g_rm_edge(mag_t *g, int min_ovlp, double min_ratio, int min_len, int min_nsr);
 	void mag_g_merge(mag_t *g, int rmdup, int min_merge_len);
 	void mag_g_simplify_bubble(mag_t *g, int max_vtx, int max_dist);
-	void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int aggressive);
+	void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int max_bdiff, int aggressive);
 	void mag_g_pop_open(mag_t *g, int min_elen);
 	void mag_g_trim_open(mag_t *g, const magopt_t *opt);
 


=====================================
misc.c
=====================================
@@ -37,7 +37,7 @@ void fml_opt_init(fml_opt_t *opt)
 	opt->min_asm_ovlp = 33;
 	opt->min_merge_len = 0;
 	mag_init_opt(&opt->mag_opt);
-	opt->mag_opt.flag = MAG_F_NO_SIMPL;
+	opt->mag_opt.flag = MAG_F_NO_SIMPL | MAG_F_POPOPEN;
 }
 
 void fml_opt_adjust(fml_opt_t *opt, int n_seqs, const bseq1_t *seqs)
@@ -72,6 +72,11 @@ struct rld_t *fml_fmi_gen(int n, bseq1_t *seq, int is_mt)
 	rld_t *e = 0;
 	int k;
 
+	for (k = 0; k < n; ++k)
+		if (seq[k].l_seq > 0)
+			break;
+	if (k == n) return 0;
+
 	mr = mr_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN, MR_SO_RCLO);
 	for (k = 0; k < n; ++k) {
 		int i;
@@ -193,7 +198,8 @@ fml_utg_t *fml_mag2utg(struct mag_t *g, int *n)
 				t = &q->ovlp[a++];
 				k = kh_get(64, h, s->x);
 				assert(k != kh_end(h));
-				t->tid = kh_val(h, k);
+				t->id = kh_val(h, k) >> 1;
+				t->to = kh_val(h, k) & 1;
 				t->len = s->y;
 				t->from = from;
 			}
@@ -217,13 +223,13 @@ void fml_utg_print(int n, const fml_utg_t *utg)
 		kputc('\t', &out); kputw(u->nsr, &out);
 		kputc('\t', &out);
 		for (j = 0; j < u->n_ovlp[0]; ++j) {
-			kputw(u->ovlp[j].tid, &out); kputc(',', &out);
+			kputw(u->ovlp[j].id<<1|u->ovlp[j].to, &out); kputc(',', &out);
 			kputw(u->ovlp[j].len, &out); kputc(';', &out);
 		}
 		if (u->n_ovlp[0] == 0) kputc('.', &out);
 		kputc('\t', &out);
 		for (; j < u->n_ovlp[0] + u->n_ovlp[1]; ++j) {
-			kputw(u->ovlp[j].tid, &out); kputc(',', &out);
+			kputw(u->ovlp[j].id<<1|u->ovlp[j].to, &out); kputc(',', &out);
 			kputw(u->ovlp[j].len, &out); kputc(';', &out);
 		}
 		if (u->n_ovlp[1] == 0) kputc('.', &out);
@@ -238,6 +244,26 @@ void fml_utg_print(int n, const fml_utg_t *utg)
 	free(out.s);
 }
 
+void fml_utg_print_gfa(int n, const fml_utg_t *utg)
+{
+	int i, j;
+	FILE *fp = stdout;
+	fputs("H\tVN:Z:1.0\n", fp);
+	for (i = 0; i < n; ++i) {
+		const fml_utg_t *u = &utg[i];
+		fprintf(fp, "S\t%d\t", i);
+		fputs(u->seq, fp);
+		fprintf(fp, "\tLN:i:%d\tRC:i:%d\tPD:Z:", u->len, u->nsr);
+		fputs(u->cov, fp);
+		fputc('\n', fp);
+		for (j = 0; j < u->n_ovlp[0] + u->n_ovlp[1]; ++j) {
+			fml_ovlp_t *o = &u->ovlp[j];
+			if (i < o->id)
+				fprintf(fp, "L\t%d\t%c\t%d\t%c\t%dM\n", i, "+-"[!o->from], o->id, "+-"[o->to], o->len);
+		}
+	}
+}
+
 void fml_utg_destroy(int n, fml_utg_t *utg)
 {
 	int i;
@@ -259,10 +285,12 @@ fml_utg_t *fml_assemble(const fml_opt_t *opt0, int n_seqs, bseq1_t *seqs, int *n
 	fml_opt_t opt = *opt0;
 	float kcov;
 
+	*n_utg = 0;
 	fml_opt_adjust(&opt, n_seqs, seqs);
 	if (opt.ec_k >= 0) fml_correct(&opt, n_seqs, seqs);
 	kcov = fml_fltuniq(&opt, n_seqs, seqs);
 	e = fml_seq2fmi(&opt, n_seqs, seqs);
+	if (e == 0) return 0; // this may happen when all sequences are filtered out
 	g = fml_fmi2mag(&opt, e);
 	opt.mag_opt.min_ensr = opt.mag_opt.min_ensr > kcov * MAG_MIN_NSR_COEF? opt.mag_opt.min_ensr : (int)(kcov * MAG_MIN_NSR_COEF + .499);
 	opt.mag_opt.min_ensr = opt.mag_opt.min_ensr < opt0->max_cnt? opt.mag_opt.min_ensr : opt0->max_cnt;



View it on GitLab: https://salsa.debian.org/med-team/fermi-lite/-/commit/64754d1a69ca6caa081d83627a6988ecd7fb09d8

-- 
View it on GitLab: https://salsa.debian.org/med-team/fermi-lite/-/commit/64754d1a69ca6caa081d83627a6988ecd7fb09d8
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210108/508117e0/attachment-0001.html>