[med-svn] [Git][med-team/fermi-lite][upstream] New upstream version 0.1+git20190320.b499514
Andreas Tille
gitlab at salsa.debian.org
Fri Jan 8 10:14:53 GMT 2021
Andreas Tille pushed to branch upstream at Debian Med / fermi-lite
Commits:
64754d1a by Andreas Tille at 2021-01-08T10:42:56+01:00
New upstream version 0.1+git20190320.b499514
- - - - -
7 changed files:
- README.md
- bubble.c
- example.c
- fml.h
- mag.c
- mag.h
- misc.c
Changes:
=====================================
README.md
=====================================
@@ -18,6 +18,11 @@ is able to retain heterozygous events and thus can be used to assemble diploid
regions for the purpose of variant calling. It is one of the limited choices
for local re-assembly and arguably the easiest to interface.
+If you use fermi-lite in your work, please cite the FermiKit paper:
+
+> Li H (2015) FermiKit: assembly-based variant calling for Illumina
+> resequencing data, *Bioinformatics*, **31**:3694-6.
+
## Usage
For now, see [example.c][example] for the basic use of the library. Here is a
@@ -42,8 +47,9 @@ int main(int argc, char *argv[])
return 0;
}
```
-The direct assembly output is in fact a graph. You may have a look at the
-[header file][header] for details.
+The `fml_assemble()` output is in fact a graph. You may have a look at the
+`fml_utg_print_gfa()` function in [misc.c][misc] about how to derive a
+[GFA][gfa] representation from an array of `fml_utg_t` objects.
## Overview of the Assembly Algorithm
@@ -86,3 +92,5 @@ after these procedure is the final output. Sequences in this graph are unitigs.
[fk]: http://github.com/lh3/fermikit
[example]: https://github.com/lh3/fermi-lite/blob/master/example.c
[header]: https://github.com/lh3/fermi-lite/blob/master/fml.h
+[misc]: https://github.com/lh3/fermi-lite/blob/master/misc.c
+[gfa]: https://github.com/pmelsted/GFA-spec
=====================================
bubble.c
=====================================
@@ -16,8 +16,6 @@ typedef khash_t(64) hash64_t;
#define edge_mark_del(_x) ((_x).x = (uint64_t)-2, (_x).y = 0)
#define edge_is_del(_x) ((_x).x == (uint64_t)-2 || (_x).y == 0)
-static int fm_verbose = 1;
-
/******************
* Closed bubbles *
******************/
@@ -178,7 +176,7 @@ void mag_g_simplify_bubble(mag_t *g, int max_vtx, int max_dist)
mag_g_merge(g, 0, 0);
}
-int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int aggressive)
+int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int max_bdiff, int aggressive)
{
magv_t *p = &g->v.a[idd>>1], *q[2];
ku128_v *r;
@@ -195,9 +193,10 @@ int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int
dir[j] = x&1;
q[j] = &g->v.a[x>>1];
if (q[j]->nei[0].n != 1 || q[j]->nei[1].n != 1) return ret; // no bubble
- l[j] = q[j]->len - (int)(q[j]->nei[0].a->y + q[j]->nei[1].a->y);
+ l[j] = q[j]->len - (int)(q[j]->nei[0].a->y + q[j]->nei[1].a->y); // bubble length, excluding overlaps
}
if (q[0]->nei[dir[0]^1].a->x != q[1]->nei[dir[1]^1].a->x) return ret; // no bubble
+ if (l[0] - l[1] > max_bdiff || l[1] - l[0] > max_bdiff) return 1; // huge bubble differences
for (j = 0; j < 2; ++j) { // set seq[] and cov[], and compute avg[]
if (l[j] > 0) {
seq[j] = malloc(l[j]<<1);
@@ -254,16 +253,16 @@ int mag_vh_pop_simple(mag_t *g, uint64_t idd, float max_cov, float max_frac, int
return ret;
}
-void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int aggressive)
+void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int max_bdiff, int aggressive)
{
int64_t i, n_examined = 0, n_popped = 0;
int ret;
for (i = 0; i < g->v.n; ++i) {
- ret = mag_vh_pop_simple(g, i<<1|0, max_cov, max_frac, aggressive);
+ ret = mag_vh_pop_simple(g, i<<1|0, max_cov, max_frac, max_bdiff, aggressive);
if (ret >= 1) ++n_examined;
if (ret >= 2) ++n_popped;
- ret = mag_vh_pop_simple(g, i<<1|1, max_cov, max_frac, aggressive);
+ ret = mag_vh_pop_simple(g, i<<1|1, max_cov, max_frac, max_bdiff, aggressive);
if (ret >= 1) ++n_examined;
if (ret >= 2) ++n_popped;
}
=====================================
example.c
=====================================
@@ -6,17 +6,21 @@
int main(int argc, char *argv[])
{
fml_opt_t opt;
- int c, n_seqs, n_utg;
+ int c, n_seqs, n_utg, gfa_out = 0;
bseq1_t *seqs;
fml_utg_t *utg;
fml_opt_init(&opt);
- while ((c = getopt(argc, argv, "Ae:l:r:t:c:")) >= 0) {
+ while ((c = getopt(argc, argv, "gOAe:l:r:t:c:d:v:")) >= 0) {
if (c == 'e') opt.ec_k = atoi(optarg);
else if (c == 'l') opt.min_asm_ovlp = atoi(optarg);
else if (c == 'r') opt.mag_opt.min_dratio1 = atof(optarg);
else if (c == 'A') opt.mag_opt.flag |= MAG_F_AGGRESSIVE;
+ else if (c == 'O') opt.mag_opt.flag &= ~MAG_F_POPOPEN;
+ else if (c == 'd') opt.mag_opt.max_bdiff = atoi(optarg);
else if (c == 't') opt.n_threads = atoi(optarg);
+ else if (c == 'g') gfa_out = 1;
+ else if (c == 'v') fm_verbose = atoi(optarg);
else if (c == 'c') {
char *p;
opt.min_cnt = strtol(optarg, &p, 10);
@@ -31,12 +35,16 @@ int main(int argc, char *argv[])
fprintf(stderr, " -l INT min overlap length during initial assembly [%d]\n", opt.min_asm_ovlp);
fprintf(stderr, " -r FLOAT drop an overlap if its length is below maxOvlpLen*FLOAT [%g]\n", opt.mag_opt.min_dratio1);
fprintf(stderr, " -t INT number of threads (don't use multi-threading for small data sets) [%d]\n", opt.n_threads);
- fprintf(stderr, " -A discard heterozygotes (apply this to assemble bacterial genomes)\n");
+ fprintf(stderr, " -d INT retain a bubble if one side is longer than the other side by >INT-bp [%d]\n", opt.mag_opt.max_bdiff);
+ fprintf(stderr, " -A discard heterozygotes (apply this to assemble bacterial genomes; override -O)\n");
+ fprintf(stderr, " -O don't apply aggressive tip trimming\n");
+ fprintf(stderr, " -g output the assembly graph in the GFA format\n");
return 1;
}
seqs = bseq_read(argv[optind], &n_seqs);
utg = fml_assemble(&opt, n_seqs, seqs, &n_utg);
- fml_utg_print(n_utg, utg);
+ if (!gfa_out) fml_utg_print(n_utg, utg);
+ else fml_utg_print_gfa(n_utg, utg);
fml_utg_destroy(n_utg, utg);
return 0;
}
=====================================
fml.h
=====================================
@@ -1,20 +1,21 @@
#ifndef FML_H
#define FML_H
-#define FML_VERSION "r41"
+#define FML_VERSION "r55"
#include <stdint.h>
typedef struct {
int32_t l_seq;
- char *seq, *qual;
+ char *seq, *qual; // NULL-terminated strings; length expected to match $l_seq
} bseq1_t;
-#define MAG_F_AGGRESSIVE 0x20
-#define MAG_F_NO_SIMPL 0x80
+#define MAG_F_AGGRESSIVE 0x20 // pop variant bubbles (not default)
+#define MAG_F_POPOPEN 0x40 // aggressive tip trimming (default)
+#define MAG_F_NO_SIMPL 0x80 // skip bubble simplification (default)
typedef struct {
- int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bvtx, min_merge_len, trim_len, trim_depth;
+ int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth;
float min_dratio1, max_bcov, max_bfrac;
} magopt_t;
@@ -31,8 +32,8 @@ struct rld_t;
struct mag_t;
typedef struct {
- uint32_t tid;
- uint32_t len:31, from:1;
+ uint32_t len:31, from:1; // $from and $to: 0 meaning overlapping 5'-end; 1 overlapping 3'-end
+ uint32_t id:31, to:1; // $id: unitig number
} fml_ovlp_t;
typedef struct {
@@ -44,6 +45,8 @@ typedef struct {
fml_ovlp_t *ovlp; // overlaps, of size n_ovlp[0]+n_ovlp[1]
} fml_utg_t;
+extern int fm_verbose;
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -121,7 +124,7 @@ float fml_fltuniq(const fml_opt_t *opt, int n, bseq1_t *seq);
* @param n number of sequences
* @param seq array of sequences; FREED on return
*
- * @return FMD-index
+ * @return FMD-index on success; NULL if all input sequences are zero in length
*/
struct rld_t *fml_seq2fmi(const fml_opt_t *opt, int n, bseq1_t *seq);
@@ -161,6 +164,14 @@ fml_utg_t *fml_mag2utg(struct mag_t *g, int *n_utg);
*/
void fml_utg_print(int n_utgs, const fml_utg_t *utg);
+/**
+ * Output unitig graph in the GFA format
+ *
+ * @param n_utg number of unitigs
+ * @param utg array of unitigs
+ */
+void fml_utg_print_gfa(int n, const fml_utg_t *utg);
+
/**
* Deallocate an FM-index
*
=====================================
mag.c
=====================================
@@ -28,7 +28,7 @@ KSORT_INIT_GENERIC(uint64_t)
#define edge_mark_del(_x) ((_x).x = (uint64_t)-2, (_x).y = 0)
#define edge_is_del(_x) ((_x).x == (uint64_t)-2 || (_x).y == 0)
-static int fm_verbose = 1;
+int fm_verbose = 1;
/*********************
* Vector operations *
@@ -553,6 +553,7 @@ void mag_init_opt(magopt_t *o)
o->max_bfrac = 0.15;
o->max_bvtx = 64;
o->max_bdist = 512;
+ o->max_bdiff = 50;
}
void mag_g_clean(mag_t *g, const magopt_t *opt)
@@ -568,15 +569,15 @@ void mag_g_clean(mag_t *g, const magopt_t *opt)
for (j = 2; j <= opt->min_ensr; ++j)
mag_g_rm_vext(g, opt->min_elen, j);
mag_g_merge(g, 0, opt->min_merge_len);
- if (opt->flag & MAG_F_AGGRESSIVE) mag_g_pop_open(g, opt->min_elen);
+ if ((opt->flag & MAG_F_AGGRESSIVE) || (opt->flag & MAG_F_POPOPEN)) mag_g_pop_open(g, opt->min_elen);
if (!(opt->flag & MAG_F_NO_SIMPL)) mag_g_simplify_bubble(g, opt->max_bvtx, opt->max_bdist);
- mag_g_pop_simple(g, opt->max_bcov, opt->max_bfrac, opt->min_merge_len, opt->flag & MAG_F_AGGRESSIVE);
+ mag_g_pop_simple(g, opt->max_bcov, opt->max_bfrac, opt->min_merge_len, opt->max_bdiff, opt->flag & MAG_F_AGGRESSIVE);
mag_g_rm_vint(g, opt->min_elen, opt->min_insr, g->min_ovlp);
mag_g_rm_edge(g, g->min_ovlp, opt->min_dratio1, opt->min_elen, opt->min_ensr);
mag_g_merge(g, 1, opt->min_merge_len);
mag_g_rm_vext(g, opt->min_elen, opt->min_ensr);
mag_g_merge(g, 0, opt->min_merge_len);
- if (opt->flag & MAG_F_AGGRESSIVE) mag_g_pop_open(g, opt->min_elen);
+ if ((opt->flag & MAG_F_AGGRESSIVE) || (opt->flag & MAG_F_POPOPEN)) mag_g_pop_open(g, opt->min_elen);
mag_g_rm_vext(g, opt->min_elen, opt->min_ensr);
mag_g_merge(g, 0, opt->min_merge_len);
}
=====================================
mag.h
=====================================
@@ -49,7 +49,7 @@ extern "C" {
void mag_g_rm_edge(mag_t *g, int min_ovlp, double min_ratio, int min_len, int min_nsr);
void mag_g_merge(mag_t *g, int rmdup, int min_merge_len);
void mag_g_simplify_bubble(mag_t *g, int max_vtx, int max_dist);
- void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int aggressive);
+ void mag_g_pop_simple(mag_t *g, float max_cov, float max_frac, int min_merge_len, int max_bdiff, int aggressive);
void mag_g_pop_open(mag_t *g, int min_elen);
void mag_g_trim_open(mag_t *g, const magopt_t *opt);
=====================================
misc.c
=====================================
@@ -37,7 +37,7 @@ void fml_opt_init(fml_opt_t *opt)
opt->min_asm_ovlp = 33;
opt->min_merge_len = 0;
mag_init_opt(&opt->mag_opt);
- opt->mag_opt.flag = MAG_F_NO_SIMPL;
+ opt->mag_opt.flag = MAG_F_NO_SIMPL | MAG_F_POPOPEN;
}
void fml_opt_adjust(fml_opt_t *opt, int n_seqs, const bseq1_t *seqs)
@@ -72,6 +72,11 @@ struct rld_t *fml_fmi_gen(int n, bseq1_t *seq, int is_mt)
rld_t *e = 0;
int k;
+ for (k = 0; k < n; ++k)
+ if (seq[k].l_seq > 0)
+ break;
+ if (k == n) return 0;
+
mr = mr_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN, MR_SO_RCLO);
for (k = 0; k < n; ++k) {
int i;
@@ -193,7 +198,8 @@ fml_utg_t *fml_mag2utg(struct mag_t *g, int *n)
t = &q->ovlp[a++];
k = kh_get(64, h, s->x);
assert(k != kh_end(h));
- t->tid = kh_val(h, k);
+ t->id = kh_val(h, k) >> 1;
+ t->to = kh_val(h, k) & 1;
t->len = s->y;
t->from = from;
}
@@ -217,13 +223,13 @@ void fml_utg_print(int n, const fml_utg_t *utg)
kputc('\t', &out); kputw(u->nsr, &out);
kputc('\t', &out);
for (j = 0; j < u->n_ovlp[0]; ++j) {
- kputw(u->ovlp[j].tid, &out); kputc(',', &out);
+ kputw(u->ovlp[j].id<<1|u->ovlp[j].to, &out); kputc(',', &out);
kputw(u->ovlp[j].len, &out); kputc(';', &out);
}
if (u->n_ovlp[0] == 0) kputc('.', &out);
kputc('\t', &out);
for (; j < u->n_ovlp[0] + u->n_ovlp[1]; ++j) {
- kputw(u->ovlp[j].tid, &out); kputc(',', &out);
+ kputw(u->ovlp[j].id<<1|u->ovlp[j].to, &out); kputc(',', &out);
kputw(u->ovlp[j].len, &out); kputc(';', &out);
}
if (u->n_ovlp[1] == 0) kputc('.', &out);
@@ -238,6 +244,26 @@ void fml_utg_print(int n, const fml_utg_t *utg)
free(out.s);
}
+void fml_utg_print_gfa(int n, const fml_utg_t *utg)
+{
+ int i, j;
+ FILE *fp = stdout;
+ fputs("H\tVN:Z:1.0\n", fp);
+ for (i = 0; i < n; ++i) {
+ const fml_utg_t *u = &utg[i];
+ fprintf(fp, "S\t%d\t", i);
+ fputs(u->seq, fp);
+ fprintf(fp, "\tLN:i:%d\tRC:i:%d\tPD:Z:", u->len, u->nsr);
+ fputs(u->cov, fp);
+ fputc('\n', fp);
+ for (j = 0; j < u->n_ovlp[0] + u->n_ovlp[1]; ++j) {
+ fml_ovlp_t *o = &u->ovlp[j];
+ if (i < o->id)
+ fprintf(fp, "L\t%d\t%c\t%d\t%c\t%dM\n", i, "+-"[!o->from], o->id, "+-"[o->to], o->len);
+ }
+ }
+}
+
void fml_utg_destroy(int n, fml_utg_t *utg)
{
int i;
@@ -259,10 +285,12 @@ fml_utg_t *fml_assemble(const fml_opt_t *opt0, int n_seqs, bseq1_t *seqs, int *n
fml_opt_t opt = *opt0;
float kcov;
+ *n_utg = 0;
fml_opt_adjust(&opt, n_seqs, seqs);
if (opt.ec_k >= 0) fml_correct(&opt, n_seqs, seqs);
kcov = fml_fltuniq(&opt, n_seqs, seqs);
e = fml_seq2fmi(&opt, n_seqs, seqs);
+ if (e == 0) return 0; // this may happen when all sequences are filtered out
g = fml_fmi2mag(&opt, e);
opt.mag_opt.min_ensr = opt.mag_opt.min_ensr > kcov * MAG_MIN_NSR_COEF? opt.mag_opt.min_ensr : (int)(kcov * MAG_MIN_NSR_COEF + .499);
opt.mag_opt.min_ensr = opt.mag_opt.min_ensr < opt0->max_cnt? opt.mag_opt.min_ensr : opt0->max_cnt;
View it on GitLab: https://salsa.debian.org/med-team/fermi-lite/-/commit/64754d1a69ca6caa081d83627a6988ecd7fb09d8
--
View it on GitLab: https://salsa.debian.org/med-team/fermi-lite/-/commit/64754d1a69ca6caa081d83627a6988ecd7fb09d8
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210108/508117e0/attachment-0001.html>
More information about the debian-med-commit
mailing list