[med-svn] [minimap] 01/02: Imported Upstream version 0.2
Sascha Steinbiss
sascha at steinbiss.name
Fri Jan 8 13:52:43 UTC 2016
This is an automated email from the git hooks/post-receive script.
sascha-guest pushed a commit to branch master
in repository minimap.
commit e97a5a4a3728ecf82c659bc110d8f090af431f84
Author: Sascha Steinbiss <sascha at steinbiss.name>
Date: Fri Jan 8 10:56:18 2016 +0000
Imported Upstream version 0.2
---
.gitignore | 3 +
LICENSE.txt | 23 +++
Makefile | 46 +++++
README.md | 98 ++++++++++
bseq.c | 65 +++++++
bseq.h | 19 ++
example.c | 52 +++++
index.c | 352 ++++++++++++++++++++++++++++++++++
kdq.h | 128 +++++++++++++
khash.h | 619 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
kseq.h | 248 ++++++++++++++++++++++++
ksort.h | 159 ++++++++++++++++
kthread.c | 146 ++++++++++++++
kvec.h | 110 +++++++++++
main.c | 145 ++++++++++++++
map.c | 374 ++++++++++++++++++++++++++++++++++++
minimap.1 | 222 ++++++++++++++++++++++
minimap.h | 104 ++++++++++
misc.c | 26 +++
sdust.c | 209 ++++++++++++++++++++
sdust.h | 23 +++
sketch.c | 102 ++++++++++
22 files changed, 3273 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f98c8b2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.*.swp
+*.o
+*.a
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..beb04ef
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,23 @@
+The MIT License
+
+Copyright (c) 2015 Broad Institute
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..00bdae6
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,46 @@
+CC= gcc
+CFLAGS= -g -Wall -O2 -Wc++-compat -Wno-unused-function
+CPPFLAGS=
+INCLUDES= -I.
+OBJS= kthread.o misc.o bseq.o sketch.o sdust.o index.o map.o
+PROG= minimap
+PROG_EXTRA= sdust minimap-lite
+LIBS= -lm -lz -lpthread
+
+.SUFFIXES:.c .o
+
+.c.o:
+ $(CC) -c $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $< -o $@
+
+all:$(PROG)
+
+extra:all $(PROG_EXTRA)
+
+minimap:main.o libminimap.a
+ $(CC) $(CFLAGS) $< -o $@ -L. -lminimap $(LIBS)
+
+minimap-lite:example.o libminimap.a
+ $(CC) $(CFLAGS) $< -o $@ -L. -lminimap $(LIBS)
+
+libminimap.a:$(OBJS)
+ $(AR) -csru $@ $(OBJS)
+
+sdust:sdust.c kdq.h kvec.h kseq.h sdust.h
+ $(CC) -D_SDUST_MAIN $(CFLAGS) $< -o $@ -lz
+
+clean:
+ rm -fr gmon.out *.o a.out $(PROG) $(PROG_EXTRA) *~ *.a *.dSYM session*
+
+depend:
+ (LC_ALL=C; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c)
+
+# DO NOT DELETE
+
+bseq.o: bseq.h kseq.h
+example.o: minimap.h bseq.h kseq.h
+index.o: minimap.h bseq.h kvec.h khash.h
+main.o: minimap.h bseq.h
+map.o: bseq.h kvec.h minimap.h sdust.h ksort.h
+misc.o: minimap.h bseq.h ksort.h
+sdust.o: kdq.h kvec.h sdust.h
+sketch.o: kvec.h minimap.h bseq.h
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a0f2ce7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,98 @@
+## Introduction
+
+Minimap is an *experimental* tool to efficiently find multiple approximate
+mapping positions between two sets of long sequences, such as between reads and
+reference genomes, between genomes and between long noisy reads. By default, it
+is tuned to have high sensitivity to 2kb matches around 20% divergence but with
+low specificity. Minimap does not generate alignments as of now; because of
+this, it is usually tens of times faster than mainstream *aligners*. With four
+CPU cores, minimap can map 1.6Gbp of PacBio reads to the human genome in 2.5
+minutes, map 1Gbp of PacBio E. coli reads to 9.6Gbp of pre-indexed bacterial
+genomes in 3 minutes or to a pre-indexed >100Gbp nt database in ~1 hour (of
+which ~20 minutes are spent loading the index from the network filesystem;
+peak RAM: 10GB), map 2800 bacteria to themselves in 1 hour, and map 1Gbp of
+E. coli reads against themselves in a couple of minutes.
+
+Minimap does not replace mainstream aligners, but it can be useful when you
+want to quickly identify long approximate matches at moderate divergence among
+a huge collection of sequences. For this task, it is much faster than most
+existing tools.
+
+## Usage
+
+* Map two sets of long sequences:
+ ```sh
+ minimap target.fa.gz query.fa.gz > out.mini
+ ```
+  The output is TAB-delimited, with each line consisting of query name, length,
+  0-based start, end, strand, target name, length, start, end, the number of
+  matching bases, the number of co-linear minimizers in the match, and the
+  fraction of matching bases (a small filtering sketch follows this list).
+
+* All-vs-all PacBio read self-mapping for [miniasm][miniasm]:
+ ```sh
+ minimap -Sw5 -L100 -m0 reads.fa reads.fa | gzip -1 > reads.paf.gz
+ ```
+
+* Prebuild the index and then map:
+ ```sh
+ minimap -d target.mmi target.fa.gz
+ minimap -l target.mmi query.fa.gz > out.mini
+ ```
+  Minimap indexing is very fast (1 minute for the human genome; 50 minutes for
+  the >100Gbp nt database retrieved on 2015-09-30), but for huge, repeatedly
+  used databases, prebuilding the index is still preferred.
+
+* Map sequences against themselves without diagonal matches:
+ ```sh
+ minimap -S sequences.fa sequences.fa > self-match.mini
+ ```
+ The output may still contain overlapping matches in repetitive regions.
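+
+As a sketch of post-processing this output with standard Unix tools (assuming
+the column order given in the first example, with the number of matching bases
+in column 10), one can, for example, keep matches with at least 100 matching
+bases and rank them:
+```sh
+awk '$10 >= 100' out.mini | sort -k10,10nr > top-matches.mini
+```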
+
+## Algorithm Overview
+
+1. Indexing. Collect all [(*w*,*k*)-minimizers][mini] in a batch (**-I**=4
+   billion bp) of target sequences and store them in a hash table. Mark the top
+   **-f**=0.1% most frequent minimizers as repeats. Minimap uses an
+   [invertible hash function][invhash] to avoid taking poly-A runs as
+   minimizers (a sketch of such a hash follows this list).
+
+2. For each query, collect all (*w*,*k*)-minimizers and look them up in the
+   hash table to find matches (*q<sub>i</sub>*,*t<sub>i</sub>*,*s<sub>i</sub>*), where
+ *q<sub>i</sub>* is the query position, *t<sub>i</sub>* the target position
+ and *s<sub>i</sub>* indicates whether the minimizer match is on the same
+ strand.
+
+3. For matches on the same strand, sort by {*q<sub>i</sub>*-*t<sub>i</sub>*}
+   and then cluster matches within a **-r**=500bp window. Minimap merges two
+   windows if **-m**=50% of their minimizer matches overlap. For matches on
+   different strands, sort by {*q<sub>i</sub>*+*t<sub>i</sub>*} and apply a
+   similar clustering procedure. This is inspired by the [Hough transform][hough].
+
+4. For each cluster, sort (*q<sub>i</sub>*,*t<sub>i</sub>*) by *q<sub>i</sub>*
+   and solve a [longest increasing subsequence problem][lis] for *t<sub>i</sub>*. This
+ finds the longest co-linear matching chain. Break the chain whenever there
+ is a gap longer than **-g**=10000.
+
+5. Output the start and end of the chain if it contains **-c**=4 or more
+ minimizer matches and the matching length is no less than **-L**=40.
+
+6. Go to step 1 and rewind to the first query record if there are more target
+   sequences; otherwise stop.
+
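+As an illustration of the invertible hash mentioned in step 1, here is a
+minimal sketch of the Thomas Wang-style 64-bit integer mix; the hash in this
+tree's sketch.c follows the same scheme, additionally masked to 2*k* bits:
+
+```c
+#include <stdint.h>
+
+// Every step is a bijection on 64-bit integers (odd-constant multiply,
+// xor-shift, add-shift), so distinct k-mers keep distinct hashes, while
+// poly-A (an all-zero encoding) no longer sorts first within a window.
+static inline uint64_t hash64(uint64_t key)
+{
+    key = ~key + (key << 21);              // key = (key << 21) - key - 1
+    key = key ^ (key >> 24);
+    key = (key + (key << 3)) + (key << 8); // key * 265
+    key = key ^ (key >> 14);
+    key = (key + (key << 2)) + (key << 4); // key * 21
+    key = key ^ (key >> 28);
+    key = key + (key << 31);
+    return key;
+}
+```
+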
+To increase sensitivity, we may decrease **-w** to index more minimizers;
+we may also decrease **-k**, though this may greatly impact performance for
+mammalian genomes.
+
+Also note that by default, if the total length of the target sequences is less
+than 4Gbp (1G = 1 billion; controlled by **-I**), minimap creates one index and
+streams all the query sequences in one go. The multiple hits of a query
+sequence are then adjacent to each other in the output. If the total length is
+greater than 4Gbp, minimap needs to read the query sequences multiple times,
+and the multiple hits of a query may not be adjacent.
+
+[mini]: http://bioinformatics.oxfordjournals.org/content/20/18/3363.abstract
+[lis]: https://en.wikipedia.org/wiki/Longest_increasing_subsequence
+[hough]: https://en.wikipedia.org/wiki/Hough_transform
+[invhash]: https://gist.github.com/lh3/974ced188be2f90422cc
+[miniasm]: https://github.com/lh3/miniasm
diff --git a/bseq.c b/bseq.c
new file mode 100644
index 0000000..3cde6c8
--- /dev/null
+++ b/bseq.c
@@ -0,0 +1,65 @@
+#include <zlib.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "bseq.h"
+#include "kseq.h"
+KSEQ_INIT(gzFile, gzread)
+
+extern unsigned char seq_nt4_table[256];
+
+struct bseq_file_s {
+ int is_eof;
+ gzFile fp;
+ kseq_t *ks;
+};
+
+bseq_file_t *bseq_open(const char *fn)
+{
+ bseq_file_t *fp;
+ gzFile f;
+ f = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ if (f == 0) return 0;
+ fp = (bseq_file_t*)calloc(1, sizeof(bseq_file_t));
+ fp->fp = f;
+ fp->ks = kseq_init(fp->fp);
+ return fp;
+}
+
+void bseq_close(bseq_file_t *fp)
+{
+ kseq_destroy(fp->ks);
+ gzclose(fp->fp);
+ free(fp);
+}
+
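+// Read FASTA/Q records until at least chunk_size bases have been accumulated
+// (or EOF). Returns a malloc'd array of *n_ records; the caller frees the
+// names, the sequences and the array itself.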
+bseq1_t *bseq_read(bseq_file_t *fp, int chunk_size, int *n_)
+{
+ int size = 0, m, n;
+ bseq1_t *seqs;
+ kseq_t *ks = fp->ks;
+ m = n = 0; seqs = 0;
+ while (kseq_read(ks) >= 0) {
+ bseq1_t *s;
+ assert(ks->seq.l <= INT32_MAX);
+ if (n >= m) {
+ m = m? m<<1 : 256;
+ seqs = (bseq1_t*)realloc(seqs, m * sizeof(bseq1_t));
+ }
+ s = &seqs[n];
+ s->name = strdup(ks->name.s);
+ s->seq = strdup(ks->seq.s);
+ s->l_seq = ks->seq.l;
+ size += seqs[n++].l_seq;
+ if (size >= chunk_size) break;
+ }
+ if (n == 0) fp->is_eof = 1;
+ *n_ = n;
+ return seqs;
+}
+
+int bseq_eof(bseq_file_t *fp)
+{
+ return fp->is_eof;
+}
diff --git a/bseq.h b/bseq.h
new file mode 100644
index 0000000..daf1fc5
--- /dev/null
+++ b/bseq.h
@@ -0,0 +1,19 @@
+#ifndef MM_BSEQ_H
+#define MM_BSEQ_H
+
+#include <stdint.h>
+
+struct bseq_file_s;
+typedef struct bseq_file_s bseq_file_t;
+
+typedef struct {
+ int l_seq, rid;
+ char *name, *seq;
+} bseq1_t;
+
+bseq_file_t *bseq_open(const char *fn);
+void bseq_close(bseq_file_t *fp);
+bseq1_t *bseq_read(bseq_file_t *fp, int chunk_size, int *n_);
+int bseq_eof(bseq_file_t *fp);
+
+#endif
diff --git a/example.c b/example.c
new file mode 100644
index 0000000..ecc5463
--- /dev/null
+++ b/example.c
@@ -0,0 +1,52 @@
+// To compile (libminimap.a uses pthreads and libm, so link them as in the Makefile):
+//   gcc -g -O2 example.c libminimap.a -lm -lz -lpthread
+
+#include <stdlib.h>
+#include <assert.h>
+#include <stdio.h>
+#include <zlib.h>
+#include "minimap.h"
+#include "kseq.h"
+KSEQ_INIT(gzFile, gzread)
+
+int main(int argc, char *argv[])
+{
+ if (argc < 3) {
+ fprintf(stderr, "Usage: minimap-lite <target.fa> <query.fa>\n");
+ return 1;
+ }
+
+ // open query file for reading; you may use your favorite FASTA/Q parser
+ gzFile f = gzopen(argv[2], "r");
+ assert(f);
+ kseq_t *ks = kseq_init(f);
+
+ // create index for target; we are creating one index for all target sequences
+ int n_threads = 4, w = 10, k = 15;
+ mm_idx_t *mi = mm_idx_build(argv[1], w, k, n_threads);
+ assert(mi);
+
+ // mapping
+ mm_mapopt_t opt;
+ mm_mapopt_init(&opt); // initialize mapping parameters
+ mm_tbuf_t *tbuf = mm_tbuf_init(); // thread buffer; for multi-threading, allocate one tbuf for each thread
+ while (kseq_read(ks) >= 0) { // each kseq_read() call reads one query sequence
+ const mm_reg1_t *reg;
+ int j, n_reg;
+ // get all hits for the query
+ reg = mm_map(mi, ks->seq.l, ks->seq.s, &n_reg, tbuf, &opt, 0);
+ // traverse hits and print them out
+ for (j = 0; j < n_reg; ++j) {
+ const mm_reg1_t *r = &reg[j];
+ printf("%s\t%d\t%d\t%d\t%c\t", ks->name.s, ks->seq.l, r->qs, r->qe, "+-"[r->rev]);
+ printf("%s\t%d\t%d\t%d\t%d\t%d\n", mi->name[r->rid], mi->len[r->rid], r->rs, r->re, r->len, r->cnt);
+ }
+ }
+ mm_tbuf_destroy(tbuf);
+
+ // deallocate index and close the query file
+ mm_idx_destroy(mi);
+ kseq_destroy(ks);
+ gzclose(f);
+ return 0;
+}
diff --git a/index.c b/index.c
new file mode 100644
index 0000000..4edc433
--- /dev/null
+++ b/index.c
@@ -0,0 +1,352 @@
+#include <stdlib.h>
+#include <assert.h>
+#include <stdio.h>
+#include "minimap.h"
+#include "kvec.h"
+#include "khash.h"
+
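+// Hash keys reserve bit 0 as a flag (set when a minimizer occurs exactly once,
+// in which case its position is stored inline in the value), so hashing and
+// equality ignore that bit.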
+#define idx_hash(a) ((a)>>1)
+#define idx_eq(a, b) ((a)>>1 == (b)>>1)
+KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq)
+typedef khash_t(idx) idxhash_t;
+
+void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n);
+
+mm_idx_t *mm_idx_init(int w, int k, int b)
+{
+ mm_idx_t *mi;
+ if (k*2 < b) b = k * 2;
+ if (w < 1) w = 1;
+ mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t));
+ mi->w = w, mi->k = k, mi->b = b;
+ mi->max_occ = UINT32_MAX;
+ mi->B = (mm_idx_bucket_t*)calloc(1<<b, sizeof(mm_idx_bucket_t));
+ return mi;
+}
+
+void mm_idx_destroy(mm_idx_t *mi)
+{
+ int i;
+ if (mi == 0) return;
+ for (i = 0; i < 1<<mi->b; ++i) {
+ free(mi->B[i].p);
+ free(mi->B[i].a.a);
+ kh_destroy(idx, (idxhash_t*)mi->B[i].h);
+ }
+ free(mi->B);
+ if (mi->name)
+ for (i = 0; i < mi->n; ++i) free(mi->name[i]);
+ free(mi->len); free(mi->name);
+ free(mi);
+}
+
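+// Look up a minimizer: the low b bits select a bucket, the remaining bits form
+// the hash key; returns a pointer to *n packed position entries (or to the
+// value itself when the minimizer occurs exactly once).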
+const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n)
+{
+ int mask = (1<<mi->b) - 1;
+ khint_t k;
+ mm_idx_bucket_t *b = &mi->B[minier&mask];
+ idxhash_t *h = (idxhash_t*)b->h;
+ *n = 0;
+ if (h == 0) return 0;
+ k = kh_get(idx, h, minier>>mi->b<<1);
+ if (k == kh_end(h)) return 0;
+ if (kh_key(h, k)&1) {
+ *n = 1;
+ return &kh_val(h, k);
+ } else {
+ *n = (uint32_t)kh_val(h, k);
+ return &b->p[kh_val(h, k)>>32];
+ }
+}
+
+uint32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
+{
+ int i;
+ size_t n = 0;
+ uint32_t thres;
+ khint_t *a, k;
+ if (f <= 0.) return UINT32_MAX;
+ for (i = 0; i < 1<<mi->b; ++i)
+ if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
+ a = (uint32_t*)malloc(n * 4);
+ for (i = n = 0; i < 1<<mi->b; ++i) {
+ idxhash_t *h = (idxhash_t*)mi->B[i].h;
+ if (h == 0) continue;
+ for (k = 0; k < kh_end(h); ++k) {
+ if (!kh_exist(h, k)) continue;
+ a[n++] = kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k);
+ }
+ }
+ thres = ks_ksmall_uint32_t(n, a, (uint32_t)((1. - f) * n)) + 1;
+ free(a);
+ return thres;
+}
+
+void mm_idx_set_max_occ(mm_idx_t *mi, float f)
+{
+ mi->freq_thres = f;
+ mi->max_occ = mm_idx_cal_max_occ(mi, f);
+}
+
+/*********************************
+ * Sort and generate hash tables *
+ *********************************/
+
+static void worker_post(void *g, long i, int tid)
+{
+ int j, start_a, start_p, n, n_keys;
+ idxhash_t *h;
+ mm_idx_t *mi = (mm_idx_t*)g;
+ mm_idx_bucket_t *b = &mi->B[i];
+ if (b->a.n == 0) return;
+
+ // sort by minimizer
+ radix_sort_128x(b->a.a, b->a.a + b->a.n);
+
+ // count and preallocate
+ for (j = 1, n = 1, n_keys = 0, b->n = 0; j <= b->a.n; ++j) {
+ if (j == b->a.n || b->a.a[j].x != b->a.a[j-1].x) {
+ ++n_keys;
+ if (n > 1) b->n += n;
+ n = 1;
+ } else ++n;
+ }
+ h = kh_init(idx);
+ kh_resize(idx, h, n_keys);
+ b->p = (uint64_t*)calloc(b->n, 8);
+
+ // create the hash table
+ for (j = 1, n = 1, start_a = start_p = 0; j <= b->a.n; ++j) {
+ if (j == b->a.n || b->a.a[j].x != b->a.a[j-1].x) {
+ khint_t itr;
+ int absent;
+ mm128_t *p = &b->a.a[j-1];
+ itr = kh_put(idx, h, p->x>>mi->b<<1, &absent);
+ assert(absent && j - start_a == n);
+ if (n == 1) {
+ kh_key(h, itr) |= 1;
+ kh_val(h, itr) = p->y;
+ } else {
+ int k;
+ for (k = 0; k < n; ++k)
+ b->p[start_p + k] = b->a.a[start_a + k].y;
+ kh_val(h, itr) = (uint64_t)start_p<<32 | n;
+ start_p += n;
+ }
+ start_a = j, n = 1;
+ } else ++n;
+ }
+ b->h = h;
+ assert(b->n == start_p);
+
+ // deallocate and clear b->a
+ free(b->a.a);
+ b->a.n = b->a.m = 0, b->a.a = 0;
+}
+
+static void mm_idx_post(mm_idx_t *mi, int n_threads)
+{
+ kt_for(n_threads, worker_post, mi, 1<<mi->b);
+}
+
+/******************
+ * Generate index *
+ ******************/
+
+#include <string.h>
+#include <zlib.h>
+#include "bseq.h"
+
+void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);
+
+typedef struct {
+ int tbatch_size, n_processed, keep_name;
+ bseq_file_t *fp;
+ uint64_t ibatch_size, n_read;
+ mm_idx_t *mi;
+} pipeline_t;
+
+typedef struct {
+ int n_seq;
+ bseq1_t *seq;
+ mm128_v a;
+} step_t;
+
+static void mm_idx_add(mm_idx_t *mi, int n, const mm128_t *a)
+{
+ int i, mask = (1<<mi->b) - 1;
+ for (i = 0; i < n; ++i) {
+ mm128_v *p = &mi->B[a[i].x&mask].a;
+ kv_push(mm128_t, *p, a[i]);
+ }
+}
+
+static void *worker_pipeline(void *shared, int step, void *in)
+{
+ int i;
+ pipeline_t *p = (pipeline_t*)shared;
+ if (step == 0) { // step 0: read sequences
+ step_t *s;
+ if (p->n_read > p->ibatch_size) return 0;
+ s = (step_t*)calloc(1, sizeof(step_t));
+ s->seq = bseq_read(p->fp, p->tbatch_size, &s->n_seq);
+ if (s->seq) {
+ uint32_t old_m = p->mi->n, m, n;
+ assert((uint64_t)p->n_processed + s->n_seq <= INT32_MAX);
+ m = n = p->mi->n + s->n_seq;
+ kroundup32(m); kroundup32(old_m);
+ if (old_m != m) {
+ if (p->keep_name)
+ p->mi->name = (char**)realloc(p->mi->name, m * sizeof(char*));
+ p->mi->len = (int*)realloc(p->mi->len, m * sizeof(int));
+ }
+ for (i = 0; i < s->n_seq; ++i) {
+ if (p->keep_name) {
+ assert(strlen(s->seq[i].name) <= 254);
+ p->mi->name[p->mi->n] = strdup(s->seq[i].name);
+ }
+ p->mi->len[p->mi->n++] = s->seq[i].l_seq;
+ s->seq[i].rid = p->n_processed++;
+ p->n_read += s->seq[i].l_seq;
+ }
+ return s;
+ } else free(s);
+ } else if (step == 1) { // step 1: compute sketch
+ step_t *s = (step_t*)in;
+ for (i = 0; i < s->n_seq; ++i) {
+ bseq1_t *t = &s->seq[i];
+ mm_sketch(t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, &s->a);
+ free(t->seq); free(t->name);
+ }
+ free(s->seq); s->seq = 0;
+ return s;
+ } else if (step == 2) { // dispatch sketch to buckets
+ step_t *s = (step_t*)in;
+ mm_idx_add(p->mi, s->a.n, s->a.a);
+ free(s->a.a); free(s);
+ }
+ return 0;
+}
+
+mm_idx_t *mm_idx_gen(bseq_file_t *fp, int w, int k, int b, int tbatch_size, int n_threads, uint64_t ibatch_size, int keep_name)
+{
+ pipeline_t pl;
+ memset(&pl, 0, sizeof(pipeline_t));
+ pl.tbatch_size = tbatch_size;
+ pl.keep_name = keep_name;
+ pl.ibatch_size = ibatch_size;
+ pl.fp = fp;
+ if (pl.fp == 0) return 0;
+ pl.mi = mm_idx_init(w, k, b);
+
+ kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);
+ if (mm_verbose >= 3)
+ fprintf(stderr, "[M::%s::%.3f*%.2f] collected minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
+
+ mm_idx_post(pl.mi, n_threads);
+ if (mm_verbose >= 3)
+ fprintf(stderr, "[M::%s::%.3f*%.2f] sorted minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
+
+ return pl.mi;
+}
+
+mm_idx_t *mm_idx_build(const char *fn, int w, int k, int n_threads) // a simpler interface
+{
+ bseq_file_t *fp;
+ mm_idx_t *mi;
+ fp = bseq_open(fn);
+ if (fp == 0) return 0;
+ mi = mm_idx_gen(fp, w, k, MM_IDX_DEF_B, 1<<18, n_threads, UINT64_MAX, 1);
+ mm_idx_set_max_occ(mi, 0.001);
+ bseq_close(fp);
+ return mi;
+}
+
+/*************
+ * index I/O *
+ *************/
+
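+// On-disk index layout: 4-byte magic; six uint32s (w, k, b, n, has-names
+// flag, max_occ); freq_thres as a float; per-sequence lengths; optional
+// length-prefixed names; then, for each bucket, its p array and hash key/value pairs.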
+#define MM_IDX_MAGIC "MMI\1"
+
+void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
+{
+ uint32_t x[6];
+ int i;
+ x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n, x[4] = mi->name? 1 : 0, x[5] = mi->max_occ;
+ fwrite(MM_IDX_MAGIC, 1, 4, fp);
+ fwrite(x, 4, 6, fp);
+ fwrite(&mi->freq_thres, sizeof(float), 1, fp);
+ fwrite(mi->len, 4, mi->n, fp);
+ if (mi->name) {
+ for (i = 0; i < mi->n; ++i) {
+ uint8_t l;
+ l = strlen(mi->name[i]);
+ fwrite(&l, 1, 1, fp);
+ fwrite(mi->name[i], 1, l, fp);
+ }
+ }
+ for (i = 0; i < 1<<mi->b; ++i) {
+ mm_idx_bucket_t *b = &mi->B[i];
+ khint_t k;
+ idxhash_t *h = (idxhash_t*)b->h;
+ uint32_t size = h? h->size : 0;
+ fwrite(&b->n, 4, 1, fp);
+ fwrite(b->p, 8, b->n, fp);
+ fwrite(&size, 4, 1, fp);
+ if (size == 0) continue;
+ for (k = 0; k < kh_end(h); ++k) {
+ uint64_t x[2];
+ if (!kh_exist(h, k)) continue;
+ x[0] = kh_key(h, k), x[1] = kh_val(h, k);
+ fwrite(x, 8, 2, fp);
+ }
+ }
+}
+
+mm_idx_t *mm_idx_load(FILE *fp)
+{
+ int i;
+ char magic[4];
+ uint32_t x[6];
+ mm_idx_t *mi;
+ if (fread(magic, 1, 4, fp) != 4) return 0;
+ if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0;
+ if (fread(x, 4, 6, fp) != 6) return 0;
+ mi = mm_idx_init(x[0], x[1], x[2]);
+ mi->n = x[3], mi->max_occ = x[5];
+ fread(&mi->freq_thres, sizeof(float), 1, fp);
+ mi->len = (int32_t*)malloc(mi->n * 4);
+ fread(mi->len, 4, mi->n, fp);
+ if (x[4]) { // has names
+ mi->name = (char**)calloc(mi->n, sizeof(char*));
+ for (i = 0; i < mi->n; ++i) {
+ uint8_t l;
+ fread(&l, 1, 1, fp);
+ mi->name[i] = (char*)malloc(l + 1);
+ fread(mi->name[i], 1, l, fp);
+ mi->name[i][l] = 0;
+ }
+ }
+ for (i = 0; i < 1<<mi->b; ++i) {
+ mm_idx_bucket_t *b = &mi->B[i];
+ uint32_t j, size;
+ khint_t k;
+ idxhash_t *h;
+ fread(&b->n, 4, 1, fp);
+ b->p = (uint64_t*)malloc(b->n * 8);
+ fread(b->p, 8, b->n, fp);
+ fread(&size, 4, 1, fp);
+ if (size == 0) continue;
+ b->h = h = kh_init(idx);
+ kh_resize(idx, h, size);
+ for (j = 0; j < size; ++j) {
+ uint64_t x[2];
+ int absent;
+ fread(x, 8, 2, fp);
+ k = kh_put(idx, h, x[0], &absent);
+ assert(absent);
+ kh_val(h, k) = x[1];
+ }
+ }
+ return mi;
+}
diff --git a/kdq.h b/kdq.h
new file mode 100644
index 0000000..edd55b5
--- /dev/null
+++ b/kdq.h
@@ -0,0 +1,128 @@
+#ifndef __AC_KDQ_H
+#define __AC_KDQ_H
+
+#include <stdlib.h>
+#include <string.h>
+
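+// kdq: a generic double-ended queue on a power-of-two circular buffer;
+// 'front' and 'mask' arithmetic wraps indices without a modulo operation.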
+#define __KDQ_TYPE(type) \
+ typedef struct { \
+ size_t front:58, bits:6, count, mask; \
+ type *a; \
+ } kdq_##type##_t;
+
+#define kdq_t(type) kdq_##type##_t
+#define kdq_size(q) ((q)->count)
+#define kdq_first(q) ((q)->a[(q)->front])
+#define kdq_last(q) ((q)->a[((q)->front + (q)->count - 1) & (q)->mask])
+#define kdq_at(q, i) ((q)->a[((q)->front + (i)) & (q)->mask])
+
+#define __KDQ_IMPL(type, SCOPE) \
+ SCOPE kdq_##type##_t *kdq_init_##type() \
+ { \
+ kdq_##type##_t *q; \
+ q = (kdq_##type##_t*)calloc(1, sizeof(kdq_##type##_t)); \
+ q->bits = 2, q->mask = (1ULL<<q->bits) - 1; \
+ q->a = (type*)malloc((1<<q->bits) * sizeof(type)); \
+ return q; \
+ } \
+ SCOPE void kdq_destroy_##type(kdq_##type##_t *q) \
+ { \
+ if (q == 0) return; \
+ free(q->a); free(q); \
+ } \
+ SCOPE int kdq_resize_##type(kdq_##type##_t *q, int new_bits) \
+ { \
+ size_t new_size = 1ULL<<new_bits, old_size = 1ULL<<q->bits; \
+ if (new_size < q->count) { /* not big enough */ \
+ int i; \
+ for (i = 0; i < 64; ++i) \
+ if (1ULL<<i > q->count) break; \
+ new_bits = i, new_size = 1ULL<<new_bits; \
+ } \
+ if (new_bits == q->bits) return q->bits; /* unchanged */ \
+ if (new_bits > q->bits) q->a = (type*)realloc(q->a, (1ULL<<new_bits) * sizeof(type)); \
+ if (q->front + q->count <= old_size) { /* unwrapped */ \
+ if (q->front + q->count > new_size) /* only happens for shrinking */ \
+ memmove(q->a, q->a + new_size, (q->front + q->count - new_size) * sizeof(type)); \
+ } else { /* wrapped */ \
+ memmove(q->a + (new_size - (old_size - q->front)), q->a + q->front, (old_size - q->front) * sizeof(type)); \
+ q->front = new_size - (old_size - q->front); \
+ } \
+ q->bits = new_bits, q->mask = (1ULL<<q->bits) - 1; \
+ if (new_bits < q->bits) q->a = (type*)realloc(q->a, (1ULL<<new_bits) * sizeof(type)); \
+ return q->bits; \
+ } \
+ SCOPE type *kdq_pushp_##type(kdq_##type##_t *q) \
+ { \
+ if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
+ return &q->a[((q->count++) + q->front) & (q)->mask]; \
+ } \
+ SCOPE void kdq_push_##type(kdq_##type##_t *q, type v) \
+ { \
+ if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
+ q->a[((q->count++) + q->front) & (q)->mask] = v; \
+ } \
+ SCOPE type *kdq_unshiftp_##type(kdq_##type##_t *q) \
+ { \
+ if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
+ ++q->count; \
+ q->front = q->front? q->front - 1 : (1ULL<<q->bits) - 1; \
+ return &q->a[q->front]; \
+ } \
+ SCOPE void kdq_unshift_##type(kdq_##type##_t *q, type v) \
+ { \
+ type *p; \
+ p = kdq_unshiftp_##type(q); \
+ *p = v; \
+ } \
+ SCOPE type *kdq_pop_##type(kdq_##type##_t *q) \
+ { \
+ return q->count? &q->a[((--q->count) + q->front) & q->mask] : 0; \
+ } \
+ SCOPE type *kdq_shift_##type(kdq_##type##_t *q) \
+ { \
+ type *d = 0; \
+ if (q->count == 0) return 0; \
+ d = &q->a[q->front++]; \
+ q->front &= q->mask; \
+ --q->count; \
+ return d; \
+ }
+
+#define KDQ_INIT2(type, SCOPE) \
+ __KDQ_TYPE(type) \
+ __KDQ_IMPL(type, SCOPE)
+
+#ifndef klib_unused
+#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
+#define klib_unused __attribute__ ((__unused__))
+#else
+#define klib_unused
+#endif
+#endif /* klib_unused */
+
+#define KDQ_INIT(type) KDQ_INIT2(type, static inline klib_unused)
+
+#define KDQ_DECLARE(type) \
+ __KDQ_TYPE(type) \
+ kdq_##type##_t *kdq_init_##type(); \
+ void kdq_destroy_##type(kdq_##type##_t *q); \
+ int kdq_resize_##type(kdq_##type##_t *q, int new_bits); \
+ type *kdq_pushp_##type(kdq_##type##_t *q); \
+ void kdq_push_##type(kdq_##type##_t *q, type v); \
+ type *kdq_unshiftp_##type(kdq_##type##_t *q); \
+ void kdq_unshift_##type(kdq_##type##_t *q, type v); \
+ type *kdq_pop_##type(kdq_##type##_t *q); \
+ type *kdq_shift_##type(kdq_##type##_t *q);
+
+#define kdq_init(type) kdq_init_##type()
+#define kdq_destroy(type, q) kdq_destroy_##type(q)
+#define kdq_resize(type, q, new_bits) kdq_resize_##type(q, new_bits)
+#define kdq_pushp(type, q) kdq_pushp_##type(q)
+#define kdq_push(type, q, v) kdq_push_##type(q, v)
+#define kdq_pop(type, q) kdq_pop_##type(q)
+#define kdq_unshiftp(type, q) kdq_unshiftp_##type(q)
+#define kdq_unshift(type, q, v) kdq_unshift_##type(q, v)
+#define kdq_shift(type, q) kdq_shift_##type(q)
+
+#endif
diff --git a/khash.h b/khash.h
new file mode 100644
index 0000000..5e55088
--- /dev/null
+++ b/khash.h
@@ -0,0 +1,619 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/*
+ An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+ int ret, is_missing;
+ khiter_t k;
+ khash_t(32) *h = kh_init(32);
+ k = kh_put(32, h, 5, &ret);
+ kh_value(h, k) = 10;
+ k = kh_get(32, h, 10);
+ is_missing = (k == kh_end(h));
+ k = kh_get(32, h, 5);
+ kh_del(32, h, k);
+ for (k = kh_begin(h); k != kh_end(h); ++k)
+ if (kh_exist(h, k)) kh_value(h, k) = 1;
+ kh_destroy(32, h);
+ return 0;
+}
+*/
+
+/*
+ 2013-05-02 (0.2.8):
+
+ * Use quadratic probing. When the capacity is power of 2, stepping function
+ i*(i+1)/2 guarantees to traverse each bucket. It is better than double
+ hashing on cache performance and is more robust than linear probing.
+
+ In theory, double hashing should be more robust than quadratic probing.
+ However, my implementation is probably not for large hash tables, because
+ the second hash function is closely tied to the first hash function,
+ which reduces the effectiveness of double hashing.
+
+ Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
+
+ 2011-12-29 (0.2.7):
+
+ * Minor code clean up; no actual effect.
+
+ 2011-09-16 (0.2.6):
+
+ * The capacity is a power of 2. This seems to dramatically improve the
+ speed for simple keys. Thanks to Zilong Tan for the suggestion. Reference:
+
+ - http://code.google.com/p/ulib/
+ - http://nothings.org/computer/judy/
+
+ * Allow to optionally use linear probing which usually has better
+ performance for random input. Double hashing is still the default as it
+ is more robust to certain non-random input.
+
+ * Added Wang's integer hash function (not used by default). This hash
+ function is more robust to certain non-random input.
+
+ 2011-02-14 (0.2.5):
+
+ * Allow to declare global functions.
+
+ 2009-09-26 (0.2.4):
+
+ * Improve portability
+
+ 2008-09-19 (0.2.3):
+
+ * Corrected the example
+ * Improved interfaces
+
+ 2008-09-11 (0.2.2):
+
+ * Improved speed a little in kh_put()
+
+ 2008-09-10 (0.2.1):
+
+ * Added kh_clear()
+ * Fixed a compiling error
+
+ 2008-09-02 (0.2.0):
+
+ * Changed to token concatenation which increases flexibility.
+
+ 2008-08-31 (0.1.2):
+
+ * Fixed a bug in kh_get(), which has not been tested previously.
+
+ 2008-08-31 (0.1.1):
+
+ * Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+ @header
+
+ Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.8"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+/* compiler specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifndef kh_inline
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+#endif /* kh_inline */
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
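+/* Each bucket carries two metadata bits (empty, deleted), packed 16 buckets
+   per khint32_t word in the flags array; the macros below read and write them. */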
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kcalloc
+#define kcalloc(N,Z) calloc(N,Z)
+#endif
+#ifndef kmalloc
+#define kmalloc(Z) malloc(Z)
+#endif
+#ifndef krealloc
+#define krealloc(P,Z) realloc(P,Z)
+#endif
+#ifndef kfree
+#define kfree(P) free(P)
+#endif
+
+static const double __ac_HASH_UPPER = 0.77;
+
+#define __KHASH_TYPE(name, khkey_t, khval_t) \
+ typedef struct kh_##name##_s { \
+ khint_t n_buckets, size, n_occupied, upper_bound; \
+ khint32_t *flags; \
+ khkey_t *keys; \
+ khval_t *vals; \
+ } kh_##name##_t;
+
+#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
+ extern kh_##name##_t *kh_init_##name(void); \
+ extern void kh_destroy_##name(kh_##name##_t *h); \
+ extern void kh_clear_##name(kh_##name##_t *h); \
+ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
+ extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+ extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ SCOPE kh_##name##_t *kh_init_##name(void) { \
+ return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
+ } \
+ SCOPE void kh_destroy_##name(kh_##name##_t *h) \
+ { \
+ if (h) { \
+ kfree((void *)h->keys); kfree(h->flags); \
+ kfree((void *)h->vals); \
+ kfree(h); \
+ } \
+ } \
+ SCOPE void kh_clear_##name(kh_##name##_t *h) \
+ { \
+ if (h && h->flags) { \
+ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
+ h->size = h->n_occupied = 0; \
+ } \
+ } \
+ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
+ { \
+ if (h->n_buckets) { \
+ khint_t k, i, last, mask, step = 0; \
+ mask = h->n_buckets - 1; \
+ k = __hash_func(key); i = k & mask; \
+ last = i; \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ i = (i + (++step)) & mask; \
+ if (i == last) return h->n_buckets; \
+ } \
+ return __ac_iseither(h->flags, i)? h->n_buckets : i; \
+ } else return 0; \
+ } \
+ SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+ { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+ khint32_t *new_flags = 0; \
+ khint_t j = 1; \
+ { \
+ kroundup32(new_n_buckets); \
+ if (new_n_buckets < 4) new_n_buckets = 4; \
+ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
+ else { /* hash table size to be changed (shrink or expand); rehash */ \
+ new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+ if (!new_flags) return -1; \
+ memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+ if (h->n_buckets < new_n_buckets) { /* expand */ \
+ khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (!new_keys) { kfree(new_flags); return -1; } \
+ h->keys = new_keys; \
+ if (kh_is_map) { \
+ khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+ if (!new_vals) { kfree(new_flags); return -1; } \
+ h->vals = new_vals; \
+ } \
+ } /* otherwise shrink */ \
+ } \
+ } \
+ if (j) { /* rehashing is needed */ \
+ for (j = 0; j != h->n_buckets; ++j) { \
+ if (__ac_iseither(h->flags, j) == 0) { \
+ khkey_t key = h->keys[j]; \
+ khval_t val; \
+ khint_t new_mask; \
+ new_mask = new_n_buckets - 1; \
+ if (kh_is_map) val = h->vals[j]; \
+ __ac_set_isdel_true(h->flags, j); \
+ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+ khint_t k, i, step = 0; \
+ k = __hash_func(key); \
+ i = k & new_mask; \
+ while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
+ __ac_set_isempty_false(new_flags, i); \
+ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+ __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+ } else { /* write the element and jump out of the loop */ \
+ h->keys[i] = key; \
+ if (kh_is_map) h->vals[i] = val; \
+ break; \
+ } \
+ } \
+ } \
+ } \
+ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+ h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+ } \
+ kfree(h->flags); /* free the working space */ \
+ h->flags = new_flags; \
+ h->n_buckets = new_n_buckets; \
+ h->n_occupied = h->size; \
+ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+ } \
+ return 0; \
+ } \
+ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+ { \
+ khint_t x; \
+ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+ if (h->n_buckets > (h->size<<1)) { \
+ if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
+ *ret = -1; return h->n_buckets; \
+ } \
+ } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
+ *ret = -1; return h->n_buckets; \
+ } \
+ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
+ { \
+ khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
+ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
+ else { \
+ last = i; \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ if (__ac_isdel(h->flags, i)) site = i; \
+ i = (i + (++step)) & mask; \
+ if (i == last) { x = site; break; } \
+ } \
+ if (x == h->n_buckets) { \
+ if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+ else x = i; \
+ } \
+ } \
+ } \
+ if (__ac_isempty(h->flags, x)) { /* not present at all */ \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; ++h->n_occupied; \
+ *ret = 1; \
+ } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; \
+ *ret = 2; \
+ } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+ return x; \
+ } \
+ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
+ { \
+ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
+ __ac_set_isdel_true(h->flags, x); \
+ --h->size; \
+ } \
+ }
+
+#define KHASH_DECLARE(name, khkey_t, khval_t) \
+ __KHASH_TYPE(name, khkey_t, khval_t) \
+ __KHASH_PROTOTYPES(name, khkey_t, khval_t)
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ __KHASH_TYPE(name, khkey_t, khval_t) \
+ __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+ @abstract Integer hash function
+ @param key The integer [khint32_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+ @abstract Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract 64-bit integer hash function
+ @param key The integer [khint64_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+ @abstract 64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract const char* hash function
+ @param s Pointer to a null terminated string
+ @return The hash value
+ */
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
+{
+ khint_t h = (khint_t)*s;
+ if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
+ return h;
+}
+/*! @function
+ @abstract Another interface to const char* hash function
+ @param key Pointer to a null terminated string [const char*]
+ @return The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+ @abstract Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+static kh_inline khint_t __ac_Wang_hash(khint_t key)
+{
+ key += ~(key << 15);
+ key ^= (key >> 10);
+ key += (key << 3);
+ key ^= (key >> 6);
+ key += ~(key << 11);
+ key ^= (key >> 16);
+ return key;
+}
+#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+ @abstract Type of the hash table.
+ @param name Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+ @abstract Initiate a hash table.
+ @param name Name of the hash table [symbol]
+ @return Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+ @abstract Destroy a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+ @abstract Reset a hash table without deallocating memory.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+ @abstract Resize a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param s New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+ @abstract Insert a key to the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @param r Extra return code: -1 if the operation failed;
+ 0 if the key is present in the hash table;
+ 1 if the bucket is empty (never used); 2 if the element in
+ the bucket has been deleted [int*]
+ @return Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+ @abstract Retrieve a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+ @abstract Remove a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+ @abstract Test whether a bucket contains data.
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return 1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+ @abstract Get key given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+ @abstract Get value given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Value [type of values]
+ @discussion For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Get the start iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+ @abstract Get the end iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+ @abstract Get the number of elements in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+ @abstract Get the number of buckets in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/*! @function
+ @abstract Iterate over the entries in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param kvar Variable to which key will be assigned
+ @param vvar Variable to which value will be assigned
+ @param code Block of code to execute
+ */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
+ for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
+ if (!kh_exist(h,__i)) continue; \
+ (kvar) = kh_key(h,__i); \
+ (vvar) = kh_val(h,__i); \
+ code; \
+ } }
+
+/*! @function
+ @abstract Iterate over the values in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param vvar Variable to which value will be assigned
+ @param code Block of code to execute
+ */
+#define kh_foreach_value(h, vvar, code) { khint_t __i; \
+ for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
+ if (!kh_exist(h,__i)) continue; \
+ (vvar) = kh_val(h,__i); \
+ code; \
+ } }
+
+/* More convenient interfaces */
+
+/*! @function
+ @abstract Instantiate a hash set containing integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name) \
+ KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t) \
+ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name) \
+ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t) \
+ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+ @abstract Instantiate a hash map containing const char* keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name) \
+ KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing const char* keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t) \
+ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/kseq.h b/kseq.h
new file mode 100644
index 0000000..6b98e51
--- /dev/null
+++ b/kseq.h
@@ -0,0 +1,248 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Last Modified: 05MAR2012 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB 1 // isspace() && !' '
+#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX 2
+
+#define __KS_TYPE(type_t) \
+ typedef struct __kstream_t { \
+ int begin, end; \
+ int is_eof:2, bufsize:30; \
+ type_t f; \
+ unsigned char *buf; \
+ } kstream_t;
+
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+#define __KS_BASIC(SCOPE, type_t, __bufsize) \
+ SCOPE kstream_t *ks_init(type_t f) \
+ { \
+ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
+ ks->f = f; ks->bufsize = __bufsize; \
+ ks->buf = (unsigned char*)malloc(__bufsize); \
+ return ks; \
+ } \
+ SCOPE void ks_destroy(kstream_t *ks) \
+ { \
+ if (!ks) return; \
+ free(ks->buf); \
+ free(ks); \
+ }
+
+#define __KS_INLINED(__read) \
+ static inline int ks_getc(kstream_t *ks) \
+ { \
+ if (ks->is_eof && ks->begin >= ks->end) return -1; \
+ if (ks->begin >= ks->end) { \
+ ks->begin = 0; \
+ ks->end = __read(ks->f, ks->buf, ks->bufsize); \
+ if (ks->end < ks->bufsize) ks->is_eof = 1; \
+ if (ks->end == 0) return -1; \
+ } \
+ return (int)ks->buf[ks->begin++]; \
+ } \
+ static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+ { return ks_getuntil2(ks, delimiter, str, dret, 0); }
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ unsigned l, m;
+ char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#define __KS_GETUNTIL(SCOPE, __read) \
+ SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
+ { \
+ if (dret) *dret = 0; \
+ str->l = append? str->l : 0; \
+ if (ks->begin >= ks->end && ks->is_eof) return -1; \
+ for (;;) { \
+ int i; \
+ if (ks->begin >= ks->end) { \
+ if (!ks->is_eof) { \
+ ks->begin = 0; \
+ ks->end = __read(ks->f, ks->buf, ks->bufsize); \
+ if (ks->end < ks->bufsize) ks->is_eof = 1; \
+ if (ks->end == 0) break; \
+ } else break; \
+ } \
+ if (delimiter == KS_SEP_LINE) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (ks->buf[i] == '\n') break; \
+ } else if (delimiter > KS_SEP_MAX) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (ks->buf[i] == delimiter) break; \
+ } else if (delimiter == KS_SEP_SPACE) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i])) break; \
+ } else if (delimiter == KS_SEP_TAB) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+ } else i = 0; /* never come to here! */ \
+ if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
+ str->m = str->l + (i - ks->begin) + 1; \
+ kroundup32(str->m); \
+ str->s = (char*)realloc(str->s, str->m); \
+ } \
+ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+ str->l = str->l + (i - ks->begin); \
+ ks->begin = i + 1; \
+ if (i < ks->end) { \
+ if (dret) *dret = ks->buf[i]; \
+ break; \
+ } \
+ } \
+ if (str->s == 0) { \
+ str->m = 1; \
+ str->s = (char*)calloc(1, 1); \
+ } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
+ str->s[str->l] = '\0'; \
+ return str->l; \
+ }
+
+#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \
+ __KS_TYPE(type_t) \
+ __KS_BASIC(SCOPE, type_t, __bufsize) \
+ __KS_GETUNTIL(SCOPE, __read) \
+ __KS_INLINED(__read)
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize)
+
+#define KSTREAM_DECLARE(type_t, __read) \
+ __KS_TYPE(type_t) \
+ extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
+ extern kstream_t *ks_init(type_t f); \
+ extern void ks_destroy(kstream_t *ks); \
+ __KS_INLINED(__read)
+
+/******************
+ * FASTA/Q parser *
+ ******************/
+
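+/* Typical use (mirrors bseq.c and example.c in this tree):
+ *   KSEQ_INIT(gzFile, gzread)
+ *   kseq_t *ks = kseq_init(fp);
+ *   while (kseq_read(ks) >= 0) { ... use ks->name.s, ks->seq.s, ks->qual.s ... }
+ *   kseq_destroy(ks);
+ */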
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+#define __KSEQ_BASIC(SCOPE, type_t) \
+ SCOPE kseq_t *kseq_init(type_t fd) \
+ { \
+ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
+ s->f = ks_init(fd); \
+ return s; \
+ } \
+ SCOPE void kseq_destroy(kseq_t *ks) \
+ { \
+ if (!ks) return; \
+ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
+ ks_destroy(ks->f); \
+ free(ks); \
+ }
+
+/* Return value:
+ >=0 length of the sequence (normal)
+ -1 end-of-file
+ -2 truncated quality string
+ */
+#define __KSEQ_READ(SCOPE) \
+ SCOPE int kseq_read(kseq_t *seq) \
+ { \
+ int c; \
+ kstream_t *ks = seq->f; \
+ if (seq->last_char == 0) { /* then jump to the next header line */ \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+ if (c == -1) return -1; /* end of file */ \
+ seq->last_char = c; \
+ } /* else: the first header char has been read in the previous call */ \
+ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+ seq->seq.m = 256; \
+ seq->seq.s = (char*)malloc(seq->seq.m); \
+ } \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+ if (c == '\n') continue; /* skip empty lines */ \
+ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+ } \
+ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
+ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+ seq->seq.m = seq->seq.l + 2; \
+ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+ } \
+ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
+ if (c != '+') return seq->seq.l; /* FASTA */ \
+ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
+ seq->qual.m = seq->seq.m; \
+ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+ } \
+ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+ if (c == -1) return -2; /* error: no quality string */ \
+ while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
+ seq->last_char = 0; /* we have not come to the next header line */ \
+ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+ return seq->seq.l; \
+ }
+
+#define __KSEQ_TYPE(type_t) \
+ typedef struct { \
+ kstring_t name, comment, seq, qual; \
+ int last_char; \
+ kstream_t *f; \
+ } kseq_t;
+
+#define KSEQ_INIT2(SCOPE, type_t, __read) \
+ KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \
+ __KSEQ_TYPE(type_t) \
+ __KSEQ_BASIC(SCOPE, type_t) \
+ __KSEQ_READ(SCOPE)
+
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+#define KSEQ_DECLARE(type_t) \
+ __KS_TYPE(type_t) \
+ __KSEQ_TYPE(type_t) \
+ extern kseq_t *kseq_init(type_t fd); \
+ void kseq_destroy(kseq_t *ks); \
+ int kseq_read(kseq_t *seq);
+
+#endif
diff --git a/ksort.h b/ksort.h
new file mode 100644
index 0000000..0190f50
--- /dev/null
+++ b/ksort.h
@@ -0,0 +1,159 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2011 Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+// This is a simplified version of ksort.h
+
+#ifndef AC_KSORT_H
+#define AC_KSORT_H
+
+#include <stdlib.h>
+#include <string.h>
+
+typedef struct {
+ void *left, *right;
+ int depth;
+} ks_isort_stack_t;
+
+#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
+
+#define KSORT_INIT(name, type_t, __sort_lt) \
+ size_t ks_lis_##name(size_t n, const type_t *a, size_t *b, size_t *_p) \
+ { /* translated from: http://www.algorithmist.com/index.php/Longest_Increasing_Subsequence.cpp */ \
+ size_t i, u, v, *top = b, *p; \
+ if (n == 0) return 0; \
+ p = _p? _p : (size_t*)malloc(n * sizeof(size_t)); \
+ *top++ = 0; \
+ for (i = 1; i < n; i++) { \
+ if (__sort_lt(a[*(top-1)], a[i])) { \
+ p[i] = *(top-1); \
+ *top++ = i; \
+ continue; \
+ } \
+ for (u = 0, v = top - b - 1; u < v;) { \
+ size_t c = (u + v) >> 1; \
+ if (__sort_lt(a[b[c]], a[i])) u = c + 1; \
+ else v = c; \
+ } \
+ if (__sort_lt(a[i], a[b[u]])) { \
+ if (u > 0) p[i] = b[u-1]; \
+ b[u] = i; \
+ } \
+ } \
+ for (u = top - b, v = *(top-1); u--; v = p[v]) b[u] = v; \
+ if (!_p) free(p); \
+ return top - b; \
+ } \
+ type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \
+ { \
+ type_t *low, *high, *k, *ll, *hh, *mid; \
+ low = arr; high = arr + n - 1; k = arr + kk; \
+ for (;;) { \
+ if (high <= low) return *k; \
+ if (high == low + 1) { \
+ if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+ return *k; \
+ } \
+ mid = low + (high - low) / 2; \
+ if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
+ if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+ if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
+ KSORT_SWAP(type_t, *mid, *(low+1)); \
+ ll = low + 1; hh = high; \
+ for (;;) { \
+ do ++ll; while (__sort_lt(*ll, *low)); \
+ do --hh; while (__sort_lt(*low, *hh)); \
+ if (hh < ll) break; \
+ KSORT_SWAP(type_t, *ll, *hh); \
+ } \
+ KSORT_SWAP(type_t, *low, *hh); \
+ if (hh <= k) low = ll; \
+ if (hh >= k) high = hh - 1; \
+ } \
+ } \
+
+#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
+
+#define ks_lt_generic(a, b) ((a) < (b))
+#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
+
+typedef const char *ksstr_t;
+
+#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
+#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
+
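+/* Example (a sketch): KSORT_INIT_GENERIC(uint32_t), as instantiated in misc.c,
+   generates ks_lis_uint32_t() and ks_ksmall_uint32_t(). The latter is a
+   quickselect: expected O(n), but it partially reorders the input array.
+
+     KSORT_INIT_GENERIC(uint32_t)
+     uint32_t median_u32(size_t n, uint32_t *a)  // illustrative helper
+     {
+         return ks_ksmall(uint32_t, n, a, n>>1); // k-th smallest with k = n/2
+     }
+*/
+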
+#define RS_MIN_SIZE 64
+
+#define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \
+ typedef struct { \
+ rstype_t *b, *e; \
+ } rsbucket_##name##_t; \
+ void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \
+ { \
+ rstype_t *i; \
+ for (i = beg + 1; i < end; ++i) \
+ if (rskey(*i) < rskey(*(i - 1))) { \
+ rstype_t *j, tmp = *i; \
+ for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) \
+ *j = *(j - 1); \
+ *j = tmp; \
+ } \
+ } \
+ void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \
+ { \
+ rstype_t *i; \
+ int size = 1<<n_bits, m = size - 1; \
+ rsbucket_##name##_t *k, b[size], *be = b + size; \
+ for (k = b; k != be; ++k) k->b = k->e = beg; \
+ for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; \
+ for (k = b + 1; k != be; ++k) \
+ k->e += (k-1)->e - beg, k->b = (k-1)->e; \
+ for (k = b; k != be;) { \
+ if (k->b != k->e) { \
+ rsbucket_##name##_t *l; \
+ if ((l = b + (rskey(*k->b)>>s&m)) != k) { \
+ rstype_t tmp = *k->b, swap; \
+ do { \
+ swap = tmp; tmp = *l->b; *l->b++ = swap; \
+ l = b + (rskey(tmp)>>s&m); \
+ } while (l != k); \
+ *k->b++ = tmp; \
+ } else ++k->b; \
+ } else ++k; \
+ } \
+ for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; \
+ if (s) { \
+ s = s > n_bits? s - n_bits : 0; \
+ for (k = b; k != be; ++k) \
+ if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \
+ else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \
+ } \
+ } \
+ void radix_sort_##name(rstype_t *beg, rstype_t *end) \
+ { \
+ if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \
+ else rs_sort_##name(beg, end, 8, sizeof_key * 8 - 8); \
+ }
+
+#endif
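
For reference, a sketch of using the radix-sort generator; the instantiation below mirrors the one map.c uses for plain 64-bit keys, while sort_u64 is an illustrative wrapper:

    #include <stddef.h>
    #include <stdint.h>
    #include "ksort.h"

    #define sort_key_64(a) (a)                      // the key is the value itself
    KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8)  // 8 = sizeof(uint64_t)

    void sort_u64(uint64_t *a, size_t n)
    {
        radix_sort_64(a, a + n);  // MSD radix sort, one byte per pass; insertion sort below RS_MIN_SIZE
    }
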
diff --git a/kthread.c b/kthread.c
new file mode 100644
index 0000000..1b13494
--- /dev/null
+++ b/kthread.c
@@ -0,0 +1,146 @@
+#include <pthread.h>
+#include <stdlib.h>
+#include <limits.h>
+
+/************
+ * kt_for() *
+ ************/
+
+struct kt_for_t;
+
+typedef struct {
+ struct kt_for_t *t;
+ long i;
+} ktf_worker_t;
+
+typedef struct kt_for_t {
+ int n_threads;
+ long n;
+ ktf_worker_t *w;
+ void (*func)(void*,long,int);
+ void *data;
+} kt_for_t;
+
+static inline long steal_work(kt_for_t *t)
+{
+ int i, min_i = -1;
+ long k, min = LONG_MAX;
+ for (i = 0; i < t->n_threads; ++i)
+ if (min > t->w[i].i) min = t->w[i].i, min_i = i;
+ k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads);
+ return k >= t->n? -1 : k;
+}
+
+static void *ktf_worker(void *data)
+{
+ ktf_worker_t *w = (ktf_worker_t*)data;
+ long i;
+ for (;;) {
+ i = __sync_fetch_and_add(&w->i, w->t->n_threads);
+ if (i >= w->t->n) break;
+ w->t->func(w->t->data, i, w - w->t->w);
+ }
+ while ((i = steal_work(w->t)) >= 0)
+ w->t->func(w->t->data, i, w - w->t->w);
+ pthread_exit(0);
+}
+
+void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n)
+{
+ int i;
+ kt_for_t t;
+ pthread_t *tid;
+ t.func = func, t.data = data, t.n_threads = n_threads, t.n = n;
+ t.w = (ktf_worker_t*)alloca(n_threads * sizeof(ktf_worker_t));
+ tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t));
+ for (i = 0; i < n_threads; ++i)
+ t.w[i].t = &t, t.w[i].i = i;
+ for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]);
+ for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
+}
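+
+/* Example callback (a sketch; sq_data_t and sq_worker are illustrative names):
+   the callback receives the shared data pointer, the iteration index and the
+   calling worker's id; steal_work() above rebalances uneven iterations.
+
+     typedef struct { const double *in; double *out; } sq_data_t;
+
+     static void sq_worker(void *data, long i, int tid) // tid can index per-thread buffers
+     {
+         sq_data_t *d = (sq_data_t*)data;
+         d->out[i] = d->in[i] * d->in[i];
+     }
+
+   usage: sq_data_t d = { in, out }; kt_for(4, sq_worker, &d, n);
+*/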
+
+/*****************
+ * kt_pipeline() *
+ *****************/
+
+struct ktp_t;
+
+typedef struct {
+ struct ktp_t *pl;
+ int64_t index;
+ int step;
+ void *data;
+} ktp_worker_t;
+
+typedef struct ktp_t {
+ void *shared;
+ void *(*func)(void*, int, void*);
+ int64_t index;
+ int n_workers, n_steps;
+ ktp_worker_t *workers;
+ pthread_mutex_t mutex;
+ pthread_cond_t cv;
+} ktp_t;
+
+static void *ktp_worker(void *data)
+{
+ ktp_worker_t *w = (ktp_worker_t*)data;
+ ktp_t *p = w->pl;
+ while (w->step < p->n_steps) {
+ // test whether we can kick off the job with this worker
+ pthread_mutex_lock(&p->mutex);
+ for (;;) {
+ int i;
+ // test whether another worker is doing the same step
+ for (i = 0; i < p->n_workers; ++i) {
+ if (w == &p->workers[i]) continue; // ignore itself
+ if (p->workers[i].step <= w->step && p->workers[i].index < w->index)
+ break;
+ }
+ if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps
+ pthread_cond_wait(&p->cv, &p->mutex);
+ }
+ pthread_mutex_unlock(&p->mutex);
+
+ // working on w->step
+ w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL
+
+ // update step and let other workers know
+ pthread_mutex_lock(&p->mutex);
+ w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps;
+ if (w->step == 0) w->index = p->index++;
+ pthread_cond_broadcast(&p->cv);
+ pthread_mutex_unlock(&p->mutex);
+ }
+ pthread_exit(0);
+}
+
+void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps)
+{
+ ktp_t aux;
+ pthread_t *tid;
+ int i;
+
+ if (n_threads < 1) n_threads = 1;
+ aux.n_workers = n_threads;
+ aux.n_steps = n_steps;
+ aux.func = func;
+ aux.shared = shared_data;
+ aux.index = 0;
+ pthread_mutex_init(&aux.mutex, 0);
+ pthread_cond_init(&aux.cv, 0);
+
+ aux.workers = (ktp_worker_t*)alloca(n_threads * sizeof(ktp_worker_t));
+ for (i = 0; i < n_threads; ++i) {
+ ktp_worker_t *w = &aux.workers[i];
+ w->step = 0; w->pl = &aux; w->data = 0;
+ w->index = aux.index++;
+ }
+
+ tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t));
+ for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]);
+ for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
+
+ pthread_mutex_destroy(&aux.mutex);
+ pthread_cond_destroy(&aux.cv);
+}
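
A sketch of a two-step kt_pipeline() client: step 0 is always called with a NULL input and produces a batch; returning NULL from step 0 retires the calling worker, so the pipeline drains once input is exhausted. The mutex/condition-variable logic above guarantees that batches pass through every later step in input order. read_next_batch() and process_batch() are hypothetical helpers.

    #include <stdlib.h>

    void *read_next_batch(void *shared);            // hypothetical: a batch, or NULL at end of input
    void process_batch(void *shared, void *batch);  // hypothetical consumer

    static void *pl_func(void *shared, int step, void *in)
    {
        if (step == 0) return read_next_batch(shared); // NULL here ends the pipeline
        process_batch(shared, in);                     // step 1: the last step; its return value is ignored
        free(in);
        return 0;
    }

    // usage: kt_pipeline(2, pl_func, &my_state, 2);
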
diff --git a/kvec.h b/kvec.h
new file mode 100644
index 0000000..632fce4
--- /dev/null
+++ b/kvec.h
@@ -0,0 +1,110 @@
+/* The MIT License
+
+ Copyright (c) 2008, by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/*
+ An example:
+
+#include "kvec.h"
+int main() {
+ kvec_t(int) array;
+ kv_init(array);
+ kv_push(int, array, 10); // append
+ kv_a(int, array, 20) = 5; // dynamic
+ kv_A(array, 20) = 4; // static
+ kv_destroy(array);
+ return 0;
+}
+*/
+
+/*
+ 2008-09-22 (0.1.0):
+
+ * The initial version.
+
+*/
+
+#ifndef AC_KVEC_H
+#define AC_KVEC_H
+
+#include <stdlib.h>
+
+#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+
+#define kvec_t(type) struct { size_t n, m; type *a; }
+#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
+#define kv_destroy(v) free((v).a)
+#define kv_A(v, i) ((v).a[(i)])
+#define kv_pop(v) ((v).a[--(v).n])
+#define kv_size(v) ((v).n)
+#define kv_max(v) ((v).m)
+
+#define kv_resize(type, v, s) do { \
+ if ((v).m < (s)) { \
+ (v).m = (s); \
+ kv_roundup32((v).m); \
+ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
+ } \
+ } while (0)
+
+#define kv_copy(type, v1, v0) do { \
+ if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \
+ (v1).n = (v0).n; \
+ memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
+ } while (0) \
+
+#define kv_push(type, v, x) do { \
+ if ((v).n == (v).m) { \
+ (v).m = (v).m? (v).m<<1 : 2; \
+ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
+ } \
+ (v).a[(v).n++] = (x); \
+ } while (0)
+
+#define kv_pushp(type, v, p) do { \
+ if ((v).n == (v).m) { \
+ (v).m = (v).m? (v).m<<1 : 2; \
+ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
+ } \
+ *(p) = &(v).a[(v).n++]; \
+ } while (0)
+
+#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \
+ ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
+ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
+ : (v).n <= (size_t)(i)? (v).n = (i) \
+ : 0), (v).a[(i)]
+
+#define kv_reverse(type, v, start) do { \
+ if ((v).m > 0 && (v).n > (start)) { \
+ size_t __i, __end = (v).n - (start); \
+ type *__a = (v).a + (start); \
+ for (__i = 0; __i < __end>>1; ++__i) { \
+ type __t = __a[__end - 1 - __i]; \
+ __a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \
+ } \
+ } \
+ } while (0)
+
+#endif
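
One idiom the example at the top of this header does not show is kv_pushp(), which map.c relies on: it appends an uninitialized slot and hands back a pointer to it, so multi-word elements can be filled in place instead of copied. A sketch, assuming the mm128_t/mm128_v pair from minimap.h:

    #include "kvec.h"
    #include "minimap.h"            // for mm128_t and the mm128_v vector type

    void pushp_demo(mm128_v *v)     // illustrative helper
    {
        mm128_t *p;
        kv_pushp(mm128_t, *v, &p);  // grow by one; p points at the new slot
        p->x = 0, p->y = 0;         // fill in place, no struct copy
    }
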
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..d2060be
--- /dev/null
+++ b/main.c
@@ -0,0 +1,145 @@
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include "minimap.h"
+
+#define MM_VERSION "0.2-r123"
+
+void liftrlimit()
+{
+#ifdef __linux__
+ struct rlimit r;
+ getrlimit(RLIMIT_AS, &r);
+ r.rlim_cur = r.rlim_max;
+ setrlimit(RLIMIT_AS, &r);
+#endif
+}
+
+int main(int argc, char *argv[])
+{
+ mm_mapopt_t opt;
+ int i, c, k = 15, w = -1, b = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx = 0;
+ int tbatch_size = 100000000;
+ uint64_t ibatch_size = 4000000000ULL;
+ float f = 0.001;
+ bseq_file_t *fp = 0;
+ char *fnw = 0;
+ FILE *fpr = 0, *fpw = 0;
+
+ liftrlimit();
+ mm_realtime0 = realtime();
+ mm_mapopt_init(&opt);
+
+ while ((c = getopt(argc, argv, "w:k:B:b:t:r:c:f:Vv:NOg:I:d:lRPST:m:L:Dx:")) >= 0) {
+ if (c == 'w') w = atoi(optarg);
+ else if (c == 'k') k = atoi(optarg);
+ else if (c == 'b') b = atoi(optarg);
+ else if (c == 'r') opt.radius = atoi(optarg);
+ else if (c == 'c') opt.min_cnt = atoi(optarg);
+ else if (c == 'm') opt.merge_frac = atof(optarg);
+ else if (c == 'f') f = atof(optarg);
+ else if (c == 't') n_threads = atoi(optarg);
+ else if (c == 'v') mm_verbose = atoi(optarg);
+ else if (c == 'g') opt.max_gap = atoi(optarg);
+ else if (c == 'N') keep_name = 0;
+ else if (c == 'd') fnw = optarg;
+ else if (c == 'l') is_idx = 1;
+ else if (c == 'R') opt.flag |= MM_F_WITH_REP;
+ else if (c == 'P') opt.flag &= ~MM_F_WITH_REP;
+ else if (c == 'D') opt.flag |= MM_F_NO_SELF;
+ else if (c == 'O') opt.flag |= MM_F_NO_ISO;
+ else if (c == 'S') opt.flag |= MM_F_AVA | MM_F_NO_SELF;
+ else if (c == 'T') opt.sdust_thres = atoi(optarg);
+ else if (c == 'L') opt.min_match = atoi(optarg);
+ else if (c == 'V') {
+ puts(MM_VERSION);
+ return 0;
+ } else if (c == 'B' || c == 'I') {
+ double x;
+ char *p;
+ x = strtod(optarg, &p);
+ if (*p == 'G' || *p == 'g') x *= 1e9;
+ else if (*p == 'M' || *p == 'm') x *= 1e6;
+ else if (*p == 'K' || *p == 'k') x *= 1e3;
+ if (c == 'B') tbatch_size = (uint64_t)(x + .499);
+ else ibatch_size = (uint64_t)(x + .499);
+ } else if (c == 'x') {
+ if (strcmp(optarg, "ava10k") == 0) {
+ opt.flag |= MM_F_AVA | MM_F_NO_SELF;
+ opt.min_match = 100;
+ opt.merge_frac = 0.0;
+ w = 5;
+ }
+ }
+ }
+ if (w < 0) w = (int)(.6666667 * k + .499);
+
+ if (argc == optind) {
+ fprintf(stderr, "Usage: minimap [options] <target.fa> [query.fa] [...]\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " Indexing:\n");
+ fprintf(stderr, " -k INT k-mer size [%d]\n", k);
+ fprintf(stderr, " -w INT minimizer window size [{-k}*2/3]\n");
+ fprintf(stderr, " -I NUM split index for every ~NUM input bases [4G]\n");
+ fprintf(stderr, " -d FILE dump index to FILE []\n");
+ fprintf(stderr, " -l the 1st argument is an index file (overriding -k, -w and -I)\n");
+// fprintf(stderr, " -b INT bucket bits [%d]\n", b); // most users would care about this
+ fprintf(stderr, " Mapping:\n");
+ fprintf(stderr, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%.3f]\n", f);
+ fprintf(stderr, " -r INT bandwidth [%d]\n", opt.radius);
+ fprintf(stderr, " -m FLOAT merge two chains if FLOAT fraction of minimizers are shared [%.2f]\n", opt.merge_frac);
+ fprintf(stderr, " -c INT retain a mapping if it consists of >=INT minimizers [%d]\n", opt.min_cnt);
+ fprintf(stderr, " -L INT min matching length [%d]\n", opt.min_match);
+ fprintf(stderr, " -g INT split a mapping if there is a gap longer than INT [%d]\n", opt.max_gap);
+ fprintf(stderr, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres);
+// fprintf(stderr, " -D skip self mappings but keep dual mappings\n"); // too confusing to expose to end users
+ fprintf(stderr, " -S skip self and dual mappings\n");
+ fprintf(stderr, " -O drop isolated hits before chaining (EXPERIMENTAL)\n");
+ fprintf(stderr, " -P filter potential repeats after mapping (EXPERIMENTAL)\n");
+// fprintf(stderr, " -R skip post-mapping repeat filtering\n"); // deprecated option for backward compatibility
+ fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n");
+ fprintf(stderr, " ava10k: -Sw5 -L100 -m0 (PacBio/ONT all-vs-all read mapping)\n");
+ fprintf(stderr, " Input/Output:\n");
+ fprintf(stderr, " -t INT number of threads [%d]\n", n_threads);
+// fprintf(stderr, " -B NUM process ~NUM bp in each batch [100M]\n");
+// fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose);
+// fprintf(stderr, " -N use integer as target names\n");
+ fprintf(stderr, " -V show version number\n");
+ fprintf(stderr, "\nSee minimap.1 for detailed description of the command-line options.\n");
+ return 1;
+ }
+
+ if (is_idx) fpr = fopen(argv[optind], "rb");
+ else fp = bseq_open(argv[optind]);
+ if (fnw) fpw = fopen(fnw, "wb");
+ for (;;) {
+ mm_idx_t *mi = 0;
+ if (fpr) mi = mm_idx_load(fpr);
+ else if (!bseq_eof(fp))
+ mi = mm_idx_gen(fp, w, k, b, tbatch_size, n_threads, ibatch_size, keep_name);
+ if (mi == 0) break;
+ if (mm_verbose >= 3)
+ fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n",
+ __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n);
+ mm_idx_set_max_occ(mi, f);
+ if (mm_verbose >= 3)
+ fprintf(stderr, "[M::%s] max occurrences of a minimizer to consider: %d\n", __func__, mi->max_occ);
+ if (fpw) mm_idx_dump(fpw, mi);
+ for (i = optind + 1; i < argc; ++i)
+ mm_map_file(mi, argv[i], &opt, n_threads, tbatch_size);
+ mm_idx_destroy(mi);
+ }
+ if (fpw) fclose(fpw);
+ if (fpr) fclose(fpr);
+ if (fp) bseq_close(fp);
+
+ fprintf(stderr, "[M::%s] Version: %s\n", __func__, MM_VERSION);
+ fprintf(stderr, "[M::%s] CMD:", __func__);
+ for (i = 0; i < argc; ++i)
+ fprintf(stderr, " %s", argv[i]);
+ fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - mm_realtime0, cputime());
+ return 0;
+}
diff --git a/map.c b/map.c
new file mode 100644
index 0000000..0c68595
--- /dev/null
+++ b/map.c
@@ -0,0 +1,374 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "bseq.h"
+#include "kvec.h"
+#include "minimap.h"
+#include "sdust.h"
+
+void mm_mapopt_init(mm_mapopt_t *opt)
+{
+ opt->radius = 500;
+ opt->max_gap = 10000;
+ opt->min_cnt = 4;
+ opt->min_match = 40;
+ opt->sdust_thres = 0;
+ opt->flag = MM_F_WITH_REP;
+ opt->merge_frac = .5;
+}
+
+/*****************************
+ * Find approximate mappings *
+ ****************************/
+
+struct mm_tbuf_s { // per-thread buffer
+ mm128_v mini; // query minimizers
+ mm128_v coef; // Hough transform coefficient
+ mm128_v intv; // intervals on sorted coef
+ uint32_v reg2mini;
+ uint32_v rep_aux;
+ sdust_buf_t *sdb;
+ // the following are for computing LIS
+ uint32_t n, m;
+ uint64_t *a;
+ size_t *b, *p;
+ // final output
+ kvec_t(mm_reg1_t) reg;
+};
+
+mm_tbuf_t *mm_tbuf_init()
+{
+ mm_tbuf_t *b;
+ b = (mm_tbuf_t*)calloc(1, sizeof(mm_tbuf_t));
+ b->sdb = sdust_buf_init();
+ return b;
+}
+
+void mm_tbuf_destroy(mm_tbuf_t *b)
+{
+ if (b == 0) return;
+ free(b->mini.a); free(b->coef.a); free(b->intv.a); free(b->reg.a); free(b->reg2mini.a); free(b->rep_aux.a);
+ free(b->a); free(b->b); free(b->p);
+ sdust_buf_destroy(b->sdb);
+ free(b);
+}
+
+#include "ksort.h"
+#define sort_key_64(a) (a)
+KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8)
+#define lt_low32(a, b) ((uint32_t)(a) < (uint32_t)(b))
+KSORT_INIT(low32lt, uint64_t, lt_low32)
+#define gt_low32(a, b) ((uint32_t)(a) > (uint32_t)(b))
+KSORT_INIT(low32gt, uint64_t, gt_low32)
+
+/* TODO: drop_rep() is not robust. For all-vs-all mapping but without the -S
+ * flag, all minimizers have at least one hit. The _thres_ computed below will
+ * be highly skewed. Some improvements need to be made. */
+
+static void drop_rep(mm_tbuf_t *b, int min_cnt)
+{
+ int i, j, n, m;
+ uint32_t thres;
+ b->rep_aux.n = 0;
+ for (i = 0; i < b->mini.n; ++i)
+ if (b->mini.a[i].y>>32)
+ kv_push(uint32_t, b->rep_aux, b->mini.a[i].y>>32);
+ if (b->rep_aux.n < 3) return;
+ thres = (uint32_t)(ks_ksmall_uint32_t(b->rep_aux.n, b->rep_aux.a, b->rep_aux.n>>1) * MM_DEREP_Q50 + .499);
+ for (i = n = m = 0; i < b->reg.n; ++i) {
+ int cnt = 0, all_cnt = b->reg.a[i].cnt;
+ for (j = 0; j < all_cnt; ++j)
+ if (b->mini.a[b->reg2mini.a[m + j]].y>>32 <= thres)
+ ++cnt;
+ if (cnt >= min_cnt)
+ b->reg.a[n++] = b->reg.a[i];
+ m += all_cnt;
+ }
+// printf("%ld=>%d\t%d\n", b->reg.n, n, thres);
+ b->reg.n = n;
+}
+
+static void proc_intv(mm_tbuf_t *b, int which, int k, int min_cnt, int max_gap)
+{
+ int i, j, l_lis, rid = -1, rev = 0, start = b->intv.a[which].y, end = start + b->intv.a[which].x;
+
+ // make room for arrays needed by LIS (longest increasing sequence)
+ if (end - start > b->m) {
+ b->m = end - start;
+ kv_roundup32(b->m);
+ b->a = (uint64_t*)realloc(b->a, b->m * 8);
+ b->b = (size_t*)realloc(b->b, b->m * sizeof(size_t));
+ b->p = (size_t*)realloc(b->p, b->m * sizeof(size_t));
+ }
+
+ // prepare the input array _a_ for LIS
+ b->n = 0;
+ for (i = start; i < end; ++i)
+ if (b->coef.a[i].x != UINT64_MAX)
+ b->a[b->n++] = b->coef.a[i].y, rid = b->coef.a[i].x << 1 >> 33, rev = b->coef.a[i].x >> 63;
+ if (b->n < min_cnt) return;
+ radix_sort_64(b->a, b->a + b->n);
+
+ // find the longest increasing sequence
+ l_lis = rev? ks_lis_low32gt(b->n, b->a, b->b, b->p) : ks_lis_low32lt(b->n, b->a, b->b, b->p); // LIS
+ if (l_lis < min_cnt) return;
+ for (i = 1, j = 1; i < l_lis; ++i) // squeeze out minimizers reused in the LIS sequence
+ if (b->a[b->b[i]]>>32 != b->a[b->b[i-1]]>>32)
+ b->a[b->b[j++]] = b->a[b->b[i]];
+ l_lis = j;
+ if (l_lis < min_cnt) return;
+
+ // convert LISes to regions; possibly break an LIS at long gaps
+ for (i = 1, start = 0; i <= l_lis; ++i) {
+ int32_t qgap = i == l_lis? 0 : ((uint32_t)b->mini.a[b->a[b->b[i]]>>32].y>>1) - ((uint32_t)b->mini.a[b->a[b->b[i-1]]>>32].y>>1);
+ if (i == l_lis || (qgap > max_gap && abs((int32_t)b->a[b->b[i]] - (int32_t)b->a[b->b[i-1]]) > max_gap)) {
+ if (i - start >= min_cnt) {
+ uint32_t lq = 0, lr = 0, eq = 0, er = 0, sq = 0, sr = 0;
+ mm_reg1_t *r;
+ kv_pushp(mm_reg1_t, b->reg, &r);
+ r->rid = rid, r->rev = rev, r->cnt = i - start, r->rep = 0;
+ r->qs = ((uint32_t)b->mini.a[b->a[b->b[start]]>>32].y>>1) - (k - 1);
+ r->qe = ((uint32_t)b->mini.a[b->a[b->b[i-1]]>>32].y>>1) + 1;
+ r->rs = rev? (uint32_t)b->a[b->b[i-1]] : (uint32_t)b->a[b->b[start]];
+ r->re = rev? (uint32_t)b->a[b->b[start]] : (uint32_t)b->a[b->b[i-1]];
+ r->rs -= k - 1;
+ r->re += 1;
+ for (j = start; j < i; ++j) { // count the number of times each minimizer is used
+ int jj = b->a[b->b[j]]>>32;
+ b->mini.a[jj].y += 1ULL<<32;
+ kv_push(uint32_t, b->reg2mini, jj); // keep minimizer<=>reg mapping for derep
+ }
+ for (j = start; j < i; ++j) { // compute ->len
+ uint32_t q = ((uint32_t)b->mini.a[b->a[b->b[j]]>>32].y>>1) - (k - 1);
+ uint32_t r = (uint32_t)b->a[b->b[j]];
+ r = !rev? r - (k - 1) : (0x80000000U - r);
+ if (r > er) lr += er - sr, sr = r, er = sr + k;
+ else er = r + k;
+ if (q > eq) lq += eq - sq, sq = q, eq = sq + k;
+ else eq = q + k;
+ }
+ lr += er - sr, lq += eq - sq;
+ r->len = lr < lq? lr : lq;
+ }
+ start = i;
+ }
+ }
+}
+
+// merge or add a Hough interval; only used by get_reg()
+static inline void push_intv(mm128_v *intv, int start, int end, float merge_frac)
+{
+ mm128_t *p;
+ if (intv->n > 0) { // test overlap
+ int last_start, last_end, min;
+ p = &intv->a[intv->n-1];
+ last_start = p->y, last_end = p->x + last_start;
+ min = end - start < last_end - last_start? end - start : last_end - last_start;
+ if (last_end > start && last_end - start > min * merge_frac) { // large overlap; then merge
+ p->x = end - last_start;
+ return;
+ }
+ }
+ kv_pushp(mm128_t, *intv, &p); // a new interval
+ p->x = end - start, p->y = start;
+}
+
+// find mapping regions from a list of minimizer hits
+static void get_reg(mm_tbuf_t *b, int radius, int k, int min_cnt, int max_gap, float merge_frac, int flag)
+{
+ const uint64_t v_kept = ~(1ULL<<31), v_dropped = 1ULL<<31;
+ mm128_v *c = &b->coef;
+ int i, j, start = 0, iso_dist = radius * 2;
+
+ if (c->n < min_cnt) return;
+
+ // drop isolated minimizer hits
+ if (flag&MM_F_NO_ISO) {
+ for (i = 0; i < c->n; ++i) c->a[i].y |= v_dropped;
+ for (i = 1; i < c->n; ++i) {
+ uint64_t x = c->a[i].x;
+ int32_t rpos = (uint32_t)c->a[i].y;
+ for (j = i - 1; j >= 0 && x - c->a[j].x < radius; --j) {
+ int32_t y = c->a[j].y;
+ if (abs(y - rpos) < iso_dist) {
+ c->a[i].y &= v_kept, c->a[j].y &= v_kept;
+ break;
+ }
+ }
+ }
+ for (i = j = 0; i < c->n; ++i) // squeeze out hits still marked as v_dropped
+ if ((c->a[i].y&v_dropped) == 0)
+ c->a[j++] = c->a[i];
+ c->n = j;
+ }
+
+ // identify (possibly overlapping) intervals within _radius_; an interval is a cluster of hits
+ b->intv.n = 0;
+ for (i = 1; i < c->n; ++i) {
+ if (c->a[i].x - c->a[start].x > radius) {
+ if (i - start >= min_cnt) push_intv(&b->intv, start, i, merge_frac);
+ for (++start; start < i && c->a[i].x - c->a[start].x > radius; ++start);
+ }
+ }
+ if (i - start >= min_cnt) push_intv(&b->intv, start, i, merge_frac);
+
+ // sort by the size of the interval
+ radix_sort_128x(b->intv.a, b->intv.a + b->intv.n);
+
+ // generate hits, starting from the largest interval
+ b->reg2mini.n = 0;
+ for (i = b->intv.n - 1; i >= 0; --i) proc_intv(b, i, k, min_cnt, max_gap);
+
+ // post repeat removal
+ if (!(flag&MM_F_WITH_REP)) drop_rep(b, min_cnt);
+}
+
+const mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name)
+{
+ int j, n_dreg = 0, u = 0;
+ const uint64_t *dreg = 0;
+
+ b->mini.n = b->coef.n = 0;
+ mm_sketch(seq, l_seq, mi->w, mi->k, 0, &b->mini);
+ if (opt->sdust_thres > 0)
+ dreg = sdust_core((const uint8_t*)seq, l_seq, opt->sdust_thres, 64, &n_dreg, b->sdb);
+ for (j = 0; j < b->mini.n; ++j) {
+ int k, n;
+ const uint64_t *r;
+ int32_t qpos = (uint32_t)b->mini.a[j].y>>1, strand = b->mini.a[j].y&1;
+ b->mini.a[j].y = b->mini.a[j].y<<32>>32; // clear the rid field
+ if (dreg && n_dreg) { // test complexity
+ int s = qpos - (mi->k - 1), e = s + mi->k;
+ while (u < n_dreg && (uint32_t)dreg[u] <= s) ++u;
+ if (u < n_dreg && dreg[u]>>32 < e) {
+ int v, l = 0;
+ for (v = u; v < n_dreg && dreg[v]>>32 < e; ++v) { // iterate over LCRs overlapping this minimizer
+ int ss = s > dreg[v]>>32? s : dreg[v]>>32;
+ int ee = e < (uint32_t)dreg[v]? e : (uint32_t)dreg[v];
+ l += ee - ss;
+ }
+ if (l > mi->k>>1) continue;
+ }
+ }
+ r = mm_idx_get(mi, b->mini.a[j].x, &n);
+ if (n > mi->max_occ) continue;
+ for (k = 0; k < n; ++k) {
+ int32_t rpos = (uint32_t)r[k] >> 1;
+ mm128_t *p;
+ if (name && (opt->flag&MM_F_NO_SELF) && mi->name && strcmp(name, mi->name[r[k]>>32]) == 0 && rpos == qpos)
+ continue;
+ if (name && (opt->flag&MM_F_AVA) && mi->name && strcmp(name, mi->name[r[k]>>32]) > 0)
+ continue;
+ kv_pushp(mm128_t, b->coef, &p);
+ if ((r[k]&1) == strand) { // forward strand
+ p->x = (uint64_t)r[k] >> 32 << 32 | (0x80000000U + rpos - qpos);
+ p->y = (uint64_t)j << 32 | rpos;
+ } else { // reverse strand
+ p->x = (uint64_t)r[k] >> 32 << 32 | (rpos + qpos) | 1ULL<<63;
+ p->y = (uint64_t)j << 32 | rpos;
+ }
+ }
+ }
+ radix_sort_128x(b->coef.a, b->coef.a + b->coef.n);
+ b->reg.n = 0;
+ get_reg(b, opt->radius, mi->k, opt->min_cnt, opt->max_gap, opt->merge_frac, opt->flag);
+ *n_regs = b->reg.n;
+ return b->reg.a;
+}
+
+/**************************
+ * Multi-threaded mapping *
+ **************************/
+
+void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n);
+void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);
+
+typedef struct {
+ int batch_size, n_processed, n_threads;
+ const mm_mapopt_t *opt;
+ bseq_file_t *fp;
+ const mm_idx_t *mi;
+} pipeline_t;
+
+typedef struct {
+ const pipeline_t *p;
+ int n_seq;
+ bseq1_t *seq;
+ int *n_reg;
+ mm_reg1_t **reg;
+ mm_tbuf_t **buf;
+} step_t;
+
+static void worker_for(void *_data, long i, int tid) // kt_for() callback
+{
+ step_t *step = (step_t*)_data;
+ const mm_reg1_t *regs;
+ int n_regs;
+
+ regs = mm_map(step->p->mi, step->seq[i].l_seq, step->seq[i].seq, &n_regs, step->buf[tid], step->p->opt, step->seq[i].name);
+ step->n_reg[i] = n_regs;
+ if (n_regs > 0) {
+ step->reg[i] = (mm_reg1_t*)malloc(n_regs * sizeof(mm_reg1_t));
+ memcpy(step->reg[i], regs, n_regs * sizeof(mm_reg1_t));
+ }
+}
+
+static void *worker_pipeline(void *shared, int step, void *in)
+{
+ int i, j;
+ pipeline_t *p = (pipeline_t*)shared;
+ if (step == 0) { // step 0: read sequences
+ step_t *s;
+ s = (step_t*)calloc(1, sizeof(step_t));
+ s->seq = bseq_read(p->fp, p->batch_size, &s->n_seq);
+ if (s->seq) {
+ s->p = p;
+ for (i = 0; i < s->n_seq; ++i)
+ s->seq[i].rid = p->n_processed++;
+ s->buf = (mm_tbuf_t**)calloc(p->n_threads, sizeof(mm_tbuf_t*));
+ for (i = 0; i < p->n_threads; ++i)
+ s->buf[i] = mm_tbuf_init();
+ s->n_reg = (int*)calloc(s->n_seq, sizeof(int));
+ s->reg = (mm_reg1_t**)calloc(s->n_seq, sizeof(mm_reg1_t*));
+ return s;
+ } else free(s);
+ } else if (step == 1) { // step 1: map
+ kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_seq);
+ return in;
+ } else if (step == 2) { // step 2: output
+ step_t *s = (step_t*)in;
+ const mm_idx_t *mi = p->mi;
+ for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]);
+ free(s->buf);
+ for (i = 0; i < s->n_seq; ++i) {
+ bseq1_t *t = &s->seq[i];
+ for (j = 0; j < s->n_reg[i]; ++j) {
+ mm_reg1_t *r = &s->reg[i][j];
+ if (r->len < p->opt->min_match) continue;
+ printf("%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]);
+ if (mi->name) fputs(mi->name[r->rid], stdout);
+ else printf("%d", r->rid + 1);
+ printf("\t%d\t%d\t%d\t%d\t%d\t255\tcm:i:%d\n", mi->len[r->rid], r->rs, r->re, r->len,
+ r->re - r->rs > r->qe - r->qs? r->re - r->rs : r->qe - r->qs, r->cnt);
+ }
+ free(s->reg[i]);
+ free(s->seq[i].seq); free(s->seq[i].name);
+ }
+ free(s->reg); free(s->n_reg); free(s->seq);
+ free(s);
+ }
+ return 0;
+}
+
+int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads, int tbatch_size)
+{
+ pipeline_t pl;
+ memset(&pl, 0, sizeof(pipeline_t));
+ pl.fp = bseq_open(fn);
+ if (pl.fp == 0) return -1;
+ pl.opt = opt, pl.mi = idx;
+ pl.n_threads = n_threads, pl.batch_size = tbatch_size;
+ kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3);
+ bseq_close(pl.fp);
+ return 0;
+}
diff --git a/minimap.1 b/minimap.1
new file mode 100644
index 0000000..03886fb
--- /dev/null
+++ b/minimap.1
@@ -0,0 +1,222 @@
+.TH minimap 1 "06 December 2015" "minimap-0.2" "Bioinformatics tools"
+
+.SH NAME
+.PP
+minimap - fast mapping between long DNA sequences
+
+.SH SYNOPSIS
+.PP
+minimap
+.RB [ -lSOV ]
+.RB [ -k
+.IR kmer ]
+.RB [ -w
+.IR winSize ]
+.RB [ -I
+.IR batchSize ]
+.RB [ -d
+.IR dumpFile ]
+.RB [ -f
+.IR occThres ]
+.RB [ -r
+.IR bandWidth ]
+.RB [ -m
+.IR minShared ]
+.RB [ -c
+.IR minCount ]
+.RB [ -L
+.IR minMatch ]
+.RB [ -g
+.IR maxGap ]
+.RB [ -T
+.IR dustThres ]
+.RB [ -t
+.IR nThreads ]
+.RB [ -x
+.IR preset ]
+.I target.fa
+.I query.fa
+>
+.I output.paf
+
+.SH DESCRIPTION
+.PP
+Minimap is a tool to efficiently find multiple approximate mapping positions
+between two sets of long sequences, such as between reads and reference
+genomes, between genomes and between long noisy reads. Minimap has an indexing
+and a mapping phase. In the indexing phase, it collects all minimizers of a
+large batch of target sequences in a hash table; in the mapping phase, it
+identifies good clusters of colinear minimizer hits. Minimap does not generate
+detailed alignments between the target and the query sequences. It only outputs
+the approximate start and end coordinates of these clusters.
+
+.SH OPTIONS
+
+.SS Indexing options
+
+.TP 10
+.BI -k \ INT
+Minimizer k-mer length [15]
+
+.TP
+.BI -w \ INT
+Minimizer window size [2/3 of k-mer length]. A minimizer is the smallest k-mer
+in a window of w consecutive k-mers.
+
+.TP
+.BI -I \ NUM
+Load at most
+.I NUM
+target bases into RAM for indexing [4G]. If there are more than
+.I NUM
+bases in
+.IR target.fa ,
+minimap needs to read
+.I query.fa
+multiple times to map it against each batch of target sequences.
+.I NUM
+may end with a k/K/m/M/g/G suffix.
+
+.TP
+.BI -d \ FILE
+Dump minimizer index to
+.I FILE
+[no dump]
+
+.TP
+.B -l
+Indicate that
+.I target.fa
+is in fact a minimizer index generated by option
+.BR -d ,
+not a FASTA or FASTQ file.
+
+.SS Mapping options
+
+.TP 10
+.BI -f \ FLOAT
+Ignore top
+.I FLOAT
+fraction of most occurring minimizers [0.001]
+
+.TP
+.BI -r \ INT
+Approximate bandwidth for initial minimizer hits clustering [500]. A
+.I minimizer hit
+is a minimizer present in both the target and query sequences. A
+.I minimizer hit cluster
+is a group of potentially colinear minimizer hits between a target and a query
+sequence.
+
+.TP
+.BI -m \ FLOAT
+Merge initial minimizer hit clusters if
+.I FLOAT
+or higher fraction of minimizers are shared between the clusters [0.5]
+
+.TP
+.BI -c \ INT
+Retain a minimizer hit cluster if it contains
+.I INT
+or more minimizer hits [4]
+
+.TP
+.BI -L \ INT
+Discard a minimizer hit cluster if, after colinearization, the number of matching bases is below
+.I INT
+[40]. This option mainly reduces the size of output. It has little effect on
+the speed and peak memory.
+
+.TP
+.BI -g \ INT
+Split a minimizer hit cluster at a gap
+.IR INT -bp
+or longer that does not contain any minimizer hits [10000]
+
+.TP
+.BI -T \ INT
+Mask regions on query sequences with SDUST score threshold
+.IR INT ;
+0 to disable [0]. SDUST is an algorithm
+to identify low-complexity subsequences. It is not enabled by default. If SDUST
+is preferred, a value between 20 and 25 is recommended. A higher threshold masks
+fewer sequences.
+
+.TP
+.B -S
+Perform all-vs-all mapping. In this mode, if the query sequence name is
+lexicographically larger than the target sequence name, the hits between them
+will be suppressed; if the query sequence name is the same as the target name,
+diagonal minimizer hits will also be suppressed.
+
+.TP
+.B -O
+Drop a minimizer hit if it is far away from other hits (EXPERIMENTAL). This
+option is useful for mapping long chromosomes from two diverged species.
+
+.TP
+.BI -x \ STR
+Change multiple settings based on
+.I STR
+[not set]. It is recommended to apply this option before other options, so
+that options given later can override the settings modified by the preset.
+
+.RS
+.TP 8
+.B ava10k
+for PacBio or Oxford Nanopore all-vs-all read mapping (-Sw5 -L100 -m0).
+.RE
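+.PP
+For example, with this preset the following two command lines are equivalent;
+.I reads.fa
+is given as both target and query for all-vs-all read mapping:
+.PP
+.nf
+minimap -x ava10k reads.fa reads.fa > overlaps.paf
+minimap -Sw5 -L100 -m0 reads.fa reads.fa > overlaps.paf
+.fi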
+
+.SS Input/output options
+
+.TP 10
+.BI -t \ INT
+Number of threads [3]. Minimap uses at most three threads when collecting
+minimizers on target sequences, and uses up to
+.IR INT +1
+threads when mapping (the extra thread is for I/O, which is frequently idle and
+takes little CPU time).
+
+.TP
+.B -V
+Print version number to stdout
+
+.SH OUTPUT FORMAT
+
+.PP
+Minimap outputs mapping positions in the Pairwise mApping Format (PAF). PAF is
+a TAB-delimited text format with each line consisting of at least 12 fields as
+described in the following table:
+
+.TS
+center box;
+cb | cb | cb
+r | c | l .
+Col Type Description
+_
+1 string Query sequence name
+2 int Query sequence length
+3 int Query start coordinate (0-based)
+4 int Query end coordinate (0-based)
+5 char `+' if query and target on the same strand; `-' if opposite
+6 string Target sequence name
+7 int Target sequence length
+8 int Target start coordinate on the original strand
+9 int Target end coordinate on the original strand
+10 int Number of matching bases in the mapping
+11 int Number of bases, including gaps, in the mapping
+12 int Mapping quality (0-255 with 255 for missing)
+.TE
+
+.PP
+When the alignment is available, column 11 gives the total number of sequence
+matches, mismatches and gaps in the alignment; column 10 divided by column 11
+gives the alignment identity. As minimap does not generate detailed alignments,
+these two columns are approximate. PAF may optionally have additional fields in
+the SAM-like typed key-value format. Minimap writes the number of minimizer
+hits in a cluster to the cm tag.
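+.PP
+A hypothetical record for illustration (TABs shown as spaces; all values are
+made up):
+.PP
+.nf
+read001  8000  55  7780  +  chr1  5000000  123456  131300  4500  7844  255  cm:i:42
+.fi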
+
+.SH SEE ALSO
+.PP
+miniasm(1)
diff --git a/minimap.h b/minimap.h
new file mode 100644
index 0000000..84cd159
--- /dev/null
+++ b/minimap.h
@@ -0,0 +1,104 @@
+#ifndef MINIMAP_H
+#define MINIMAP_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include "bseq.h"
+
+#define MM_IDX_DEF_B 14
+#define MM_DEREP_Q50 5.0
+
+#define MM_F_WITH_REP 0x1
+#define MM_F_NO_SELF 0x2
+#define MM_F_NO_ISO 0x4
+#define MM_F_AVA 0x8
+
+typedef struct {
+ uint64_t x, y;
+} mm128_t;
+
+typedef struct { size_t n, m; mm128_t *a; } mm128_v;
+typedef struct { size_t n, m; uint64_t *a; } uint64_v;
+typedef struct { size_t n, m; uint32_t *a; } uint32_v;
+
+typedef struct {
+ mm128_v a; // (minimizer, position) array
+ int32_t n; // size of the _p_ array
+ uint64_t *p; // position array for minimizers appearing >1 times
+ void *h; // hash table indexing _p_ and minimizers appearing once
+} mm_idx_bucket_t;
+
+typedef struct {
+ int b, w, k;
+ uint32_t n; // number of reference sequences
+ mm_idx_bucket_t *B;
+ uint32_t max_occ;
+ float freq_thres;
+ int32_t *len; // length of each reference sequence
+ char **name; // TODO: if this uses too much RAM, switch to one concatenated string
+} mm_idx_t;
+
+typedef struct {
+ uint32_t cnt:31, rev:1;
+ uint32_t rid:31, rep:1;
+ uint32_t len;
+ int32_t qs, qe, rs, re;
+} mm_reg1_t;
+
+typedef struct {
+ int radius; // bandwidth to cluster hits
+ int max_gap; // break a chain if there are no minimizers in a max_gap window
+ int min_cnt; // minimum number of minimizers to start a chain
+ int min_match;
+ int sdust_thres; // score threshold for SDUST; 0 to disable
+ int flag; // see MM_F_* macros
+ float merge_frac; // merge two chains if merge_frac fraction of minimizers are shared between the chains
+} mm_mapopt_t;
+
+extern int mm_verbose;
+extern double mm_realtime0;
+
+struct mm_tbuf_s;
+typedef struct mm_tbuf_s mm_tbuf_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// compute minimizers
+void mm_sketch(const char *str, int len, int w, int k, uint32_t rid, mm128_v *p);
+
+// minimizer indexing
+mm_idx_t *mm_idx_init(int w, int k, int b);
+void mm_idx_destroy(mm_idx_t *mi);
+mm_idx_t *mm_idx_gen(bseq_file_t *fp, int w, int k, int b, int tbatch_size, int n_threads, uint64_t ibatch_size, int keep_name);
+void mm_idx_set_max_occ(mm_idx_t *mi, float f);
+const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
+
+mm_idx_t *mm_idx_build(const char *fn, int w, int k, int n_threads);
+
+// minimizer index I/O
+void mm_idx_dump(FILE *fp, const mm_idx_t *mi);
+mm_idx_t *mm_idx_load(FILE *fp);
+
+// mapping
+void mm_mapopt_init(mm_mapopt_t *opt);
+mm_tbuf_t *mm_tbuf_init(void);
+void mm_tbuf_destroy(mm_tbuf_t *b);
+const mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name);
+
+int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads, int tbatch_size);
+
+// private functions (may be moved to a "mmpriv.h" in future)
+double cputime(void);
+double realtime(void);
+void radix_sort_128x(mm128_t *beg, mm128_t *end);
+void radix_sort_64(uint64_t *beg, uint64_t *end);
+uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
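
Putting the declarations above together, a minimal library client might look like the sketch below (map_one is an illustrative name, the w/k/thread values are arbitrary, and error checking is omitted):

    #include <string.h>
    #include <stdio.h>
    #include "minimap.h"

    int map_one(const char *ref_fn, const char *seq, const char *qname)
    {
        mm_mapopt_t opt;
        mm_tbuf_t *tbuf;
        const mm_reg1_t *reg;
        int i, n_regs;
        mm_idx_t *mi = mm_idx_build(ref_fn, 10, 15, 2);  // w=10, k=15, 2 threads
        mm_idx_set_max_occ(mi, 0.001f);                  // same default as the command-line -f
        mm_mapopt_init(&opt);
        tbuf = mm_tbuf_init();
        reg = mm_map(mi, strlen(seq), seq, &n_regs, tbuf, &opt, qname);
        for (i = 0; i < n_regs; ++i)
            printf("%s\t%c\t%d\t%d\n", qname, "+-"[reg[i].rev], reg[i].qs, reg[i].qe);
        mm_tbuf_destroy(tbuf);
        mm_idx_destroy(mi);
        return n_regs;
    }
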
diff --git a/misc.c b/misc.c
new file mode 100644
index 0000000..3f8bf52
--- /dev/null
+++ b/misc.c
@@ -0,0 +1,26 @@
+#include <sys/resource.h>
+#include <sys/time.h>
+#include "minimap.h"
+
+int mm_verbose = 3;
+double mm_realtime0;
+
+double cputime()
+{
+ struct rusage r;
+ getrusage(RUSAGE_SELF, &r);
+ return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec);
+}
+
+double realtime()
+{
+ struct timeval tp;
+ struct timezone tzp;
+ gettimeofday(&tp, &tzp);
+ return tp.tv_sec + tp.tv_usec * 1e-6;
+}
+
+#include "ksort.h"
+#define sort_key_128x(a) ((a).x)
+KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8)
+KSORT_INIT_GENERIC(uint32_t)
diff --git a/sdust.c b/sdust.c
new file mode 100644
index 0000000..e6032c3
--- /dev/null
+++ b/sdust.c
@@ -0,0 +1,209 @@
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include "kdq.h"
+#include "kvec.h"
+#include "sdust.h"
+
+#define SD_WLEN 3
+#define SD_WTOT (1<<(SD_WLEN<<1))
+#define SD_WMSK (SD_WTOT - 1)
+
+typedef struct {
+ int start, finish;
+ int r, l;
+} perf_intv_t;
+
+typedef kvec_t(perf_intv_t) perf_intv_v;
+typedef kvec_t(uint64_t) uint64_v;
+
+KDQ_INIT(int)
+
+#if defined(_NO_NT4_TBL) || defined(_SDUST_MAIN)
+unsigned char seq_nt4_table[256] = {
+ 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+};
+#else
+extern unsigned char seq_nt4_table[256];
+#endif
+
+struct sdust_buf_s {
+ kdq_t(int) *w;
+ perf_intv_v P; // the list of perfect intervals for the current window, sorted by descending start and then by ascending finish
+ uint64_v res; // the result
+};
+
+sdust_buf_t *sdust_buf_init(void)
+{
+ sdust_buf_t *buf;
+ buf = (sdust_buf_t*)calloc(1, sizeof(sdust_buf_t));
+ buf->w = kdq_init(int);
+ return buf;
+}
+
+void sdust_buf_destroy(sdust_buf_t *buf)
+{
+ if (buf == 0) return;
+ kdq_destroy(int, buf->w);
+ free(buf->P.a); free(buf->res.a);
+ free(buf);
+}
+
+static inline void shift_window(int t, kdq_t(int) *w, int T, int W, int *L, int *rw, int *rv, int *cw, int *cv)
+{
+ int s;
+ if (kdq_size(w) >= W - SD_WLEN + 1) { // TODO: is this right for SD_WLEN!=3?
+ s = *kdq_shift(int, w);
+ *rw -= --cw[s];
+ if (*L > kdq_size(w))
+ --*L, *rv -= --cv[s];
+ }
+ kdq_push(int, w, t);
+ ++*L;
+ *rw += cw[t]++;
+ *rv += cv[t]++;
+ if (cv[t] * 10 > T<<1) {
+ do {
+ s = kdq_at(w, kdq_size(w) - *L);
+ *rv -= --cv[s];
+ --*L;
+ } while (s != t);
+ }
+}
+
+static inline void save_masked_regions(uint64_v *res, perf_intv_v *P, int start)
+{
+ int i, saved = 0;
+ perf_intv_t *p;
+ if (P->n == 0 || P->a[P->n - 1].start >= start) return;
+ p = &P->a[P->n - 1];
+ if (res->n) {
+ int s = res->a[res->n - 1]>>32, f = (uint32_t)res->a[res->n - 1];
+ if (p->start <= f) // if overlapping with or adjacent to the previous interval
+ saved = 1, res->a[res->n - 1] = (uint64_t)s<<32 | (f > p->finish? f : p->finish);
+ }
+ if (!saved) kv_push(uint64_t, *res, (uint64_t)p->start<<32|p->finish);
+ for (i = P->n - 1; i >= 0 && P->a[i].start < start; --i); // remove perfect intervals that have fallen out of the window
+ P->n = i + 1;
+}
+
+static void find_perfect(perf_intv_v *P, const kdq_t(int) *w, int T, int start, int L, int rv, const int *cv)
+{
+ int c[SD_WTOT], r = rv, i, max_r = 0, max_l = 0;
+ memcpy(c, cv, SD_WTOT * sizeof(int));
+ for (i = (long)kdq_size(w) - L - 1; i >= 0; --i) {
+ int j, t = kdq_at(w, i), new_r, new_l;
+ r += c[t]++;
+ new_r = r, new_l = kdq_size(w) - i - 1;
+ if (new_r * 10 > T * new_l) {
+ for (j = 0; j < P->n && P->a[j].start >= i + start; ++j) { // find insertion position
+ perf_intv_t *p = &P->a[j];
+ if (max_r == 0 || p->r * max_l > max_r * p->l)
+ max_r = p->r, max_l = p->l;
+ }
+ if (max_r == 0 || new_r * max_l >= max_r * new_l) { // then insert
+ max_r = new_r, max_l = new_l;
+ if (P->n == P->m) kv_resize(perf_intv_t, *P, P->n + 1);
+ memmove(&P->a[j+1], &P->a[j], (P->n - j) * sizeof(perf_intv_t)); // make room
+ ++P->n;
+ P->a[j].start = i + start, P->a[j].finish = kdq_size(w) + (SD_WLEN - 1) + start;
+ P->a[j].r = new_r, P->a[j].l = new_l;
+ }
+ }
+ }
+}
+
+const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf)
+{
+ int rv = 0, rw = 0, L = 0, cv[SD_WTOT], cw[SD_WTOT];
+ int i, start, l; // _start_: start of the current window; _l_: length of a contiguous A/C/G/T (sub)sequence
+ unsigned t; // current word
+
+ buf->P.n = buf->res.n = 0;
+ buf->w->front = buf->w->count = 0;
+ memset(cv, 0, SD_WTOT * sizeof(int));
+ memset(cw, 0, SD_WTOT * sizeof(int));
+ if (l_seq < 0) l_seq = strlen((const char*)seq);
+ for (i = l = t = 0; i <= l_seq; ++i) {
+ int b = i < l_seq? seq_nt4_table[seq[i]] : 4;
+ if (b < 4) { // an A/C/G/T base
+ ++l, t = (t<<2 | b) & SD_WMSK;
+ if (l >= SD_WLEN) { // we have seen a word
+ start = (l - W > 0? l - W : 0) + (i + 1 - l); // set the start of the current window
+ save_masked_regions(&buf->res, &buf->P, start); // save intervals falling out of the current window?
+ shift_window(t, buf->w, T, W, &L, &rw, &rv, cw, cv);
+ if (rw * 10 > L * T)
+ find_perfect(&buf->P, buf->w, T, start, L, rv, cv);
+ }
+ } else { // N or the end of sequence; N effectively breaks input into pieces of independent sequences
+ start = (l - W + 1 > 0? l - W + 1 : 0) + (i + 1 - l);
+ while (buf->P.n) save_masked_regions(&buf->res, &buf->P, start++); // clear up unsaved perfect intervals
+ l = t = 0;
+ }
+ }
+ *n = buf->res.n;
+ return buf->res.a;
+}
+
+uint64_t *sdust(const uint8_t *seq, int l_seq, int T, int W, int *n)
+{
+ uint64_t *ret;
+ sdust_buf_t *buf;
+ buf = sdust_buf_init();
+ ret = (uint64_t*)sdust_core(seq, l_seq, T, W, n, buf);
+ buf->res.a = 0;
+ sdust_buf_destroy(buf);
+ return ret;
+}
+
+#ifdef _SDUST_MAIN
+#include <zlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include "kseq.h"
+KSEQ_INIT(gzFile, gzread)
+
+int main(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *ks;
+ int W = 64, T = 20, c;
+
+ while ((c = getopt(argc, argv, "w:t:")) >= 0) {
+ if (c == 'w') W = atoi(optarg);
+ else if (c == 't') T = atoi(optarg);
+ }
+ if (optind == argc) {
+ fprintf(stderr, "Usage: sdust [-w %d] [-t %d] <in.fa>\n", W, T);
+ return 1;
+ }
+ fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+ ks = kseq_init(fp);
+ while (kseq_read(ks) >= 0) {
+ uint64_t *r;
+ int i, n;
+ r = sdust((uint8_t*)ks->seq.s, -1, T, W, &n);
+ for (i = 0; i < n; ++i)
+ printf("%s\t%d\t%d\n", ks->name.s, (int)(r[i]>>32), (int)r[i]);
+ free(r);
+ }
+ kseq_destroy(ks);
+ gzclose(fp);
+ return 0;
+}
+#endif
diff --git a/sdust.h b/sdust.h
new file mode 100644
index 0000000..52a13ff
--- /dev/null
+++ b/sdust.h
@@ -0,0 +1,23 @@
+#ifndef SDUST_H
+#define SDUST_H
+
+struct sdust_buf_s;
+typedef struct sdust_buf_s sdust_buf_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// the simple interface
+uint64_t *sdust(const uint8_t *seq, int l_seq, int T, int W, int *n);
+
+// the following interface dramatically reduces heap allocations when sdust is called frequently.
+sdust_buf_t *sdust_buf_init(void);
+void sdust_buf_destroy(sdust_buf_t *buf);
+const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sketch.c b/sketch.c
new file mode 100644
index 0000000..04fbb4c
--- /dev/null
+++ b/sketch.c
@@ -0,0 +1,102 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include "kvec.h"
+#include "minimap.h"
+
+unsigned char seq_nt4_table[256] = {
+ 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+};
+
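+// invertible integer hash (Wang-style bit mixing); makes minimizer choice pseudo-random w.r.t. the sequence, with _mask_ restricting values to 2k bits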
+static inline uint64_t hash64(uint64_t key, uint64_t mask)
+{
+ key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1;
+ key = key ^ key >> 24;
+ key = ((key + (key << 3)) + (key << 8)) & mask; // key * 265
+ key = key ^ key >> 14;
+ key = ((key + (key << 2)) + (key << 4)) & mask; // key * 21
+ key = key ^ key >> 28;
+ key = (key + (key << 31)) & mask;
+ return key;
+}
+
+/**
+ * Find symmetric (w,k)-minimizers on a DNA sequence
+ *
+ * @param str DNA sequence
+ * @param len length of $str
+ * @param w find a minimizer for every $w consecutive k-mers
+ * @param k k-mer size
+ * @param rid reference ID; will be copied to the output $p array
+ * @param p minimizers; p->a[i].x is the 2k-bit hash value;
+ * p->a[i].y = rid<<32 | lastPos<<1 | strand
+ * where lastPos is the position of the last base of the i-th minimizer,
+ * and strand indicates whether the minimizer comes from the top or the bottom strand.
+ * Callers may want to set "p->n = 0"; otherwise results are appended to p
+ */
+void mm_sketch(const char *str, int len, int w, int k, uint32_t rid, mm128_v *p)
+{
+ uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, kmer[2] = {0,0};
+ int i, j, l, buf_pos, min_pos;
+ mm128_t *buf, min = { UINT64_MAX, UINT64_MAX };
+
+ assert(len > 0 && w > 0 && k > 0);
+ buf = (mm128_t*)alloca(w * 16);
+ memset(buf, 0xff, w * 16);
+
+ for (i = l = buf_pos = min_pos = 0; i < len; ++i) {
+ int c = seq_nt4_table[(uint8_t)str[i]];
+ mm128_t info = { UINT64_MAX, UINT64_MAX };
+ if (c < 4) { // not an ambiguous base
+ int z;
+ kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer
+ kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer
+ if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know their strand
+ z = kmer[0] < kmer[1]? 0 : 1; // strand
+ if (++l >= k)
+ info.x = hash64(kmer[z], mask), info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z;
+ } else l = 0;
+ buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
+ if (l == w + k - 1) { // special case for the first window - because identical k-mers are not stored yet
+ for (j = buf_pos + 1; j < w; ++j)
+ if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, *p, buf[j]);
+ for (j = 0; j < buf_pos; ++j)
+ if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, *p, buf[j]);
+ }
+ if (info.x <= min.x) { // a new minimum; then write the old min
+ if (l >= w + k) kv_push(mm128_t, *p, min);
+ min = info, min_pos = buf_pos;
+ } else if (buf_pos == min_pos) { // old min has moved outside the window
+ if (l >= w + k - 1) kv_push(mm128_t, *p, min);
+ for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
+ if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
+ for (j = 0; j <= buf_pos; ++j)
+ if (min.x >= buf[j].x) min = buf[j], min_pos = j;
+ if (l >= w + k - 1) { // write identical k-mers
+ for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
+ if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, *p, buf[j]);
+ for (j = 0; j <= buf_pos; ++j)
+ if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, *p, buf[j]);
+ }
+ }
+ if (++buf_pos == w) buf_pos = 0;
+ }
+ if (min.x != UINT64_MAX)
+ kv_push(mm128_t, *p, min);
+}
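
A minimal caller sketch for mm_sketch() (print_minimizers is an illustrative name; w=5 and k=15 are arbitrary, and the string must hold at least w+k-1 bases for any window to exist):

    #include <stdlib.h>
    #include <string.h>
    #include <stdio.h>
    #include "minimap.h"

    void print_minimizers(const char *s)
    {
        size_t i;
        mm128_v mv = {0, 0, 0};                       // n = m = 0, a = NULL
        mm_sketch(s, (int)strlen(s), 5, 15, 0, &mv);  // w=5, k=15, rid=0
        for (i = 0; i < mv.n; ++i)                    // y = rid<<32 | lastPos<<1 | strand
            printf("%016llx\t%u\t%c\n", (unsigned long long)mv.a[i].x,
                   (uint32_t)mv.a[i].y >> 1, "+-"[mv.a[i].y & 1]);
        free(mv.a);
    }
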
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/minimap.git