[med-svn] [seqtk] 02/07: Imported Upstream version 1.2
Andreas Tille
tille at debian.org
Mon May 30 08:23:49 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository seqtk.
commit 52f429030f3714f8a6cc8d925aafb62fa4bafa8c
Author: Andreas Tille <tille at debian.org>
Date: Mon May 30 10:13:43 2016 +0200
Imported Upstream version 1.2
---
LICENSE | 19 ++
Makefile | 6 +-
README.md | 16 +-
kseq.h | 10 +-
ksort.h | 298 ++++++++++++++++++++++++++++
kstring.h | 169 ++++++++++++++++
kvec.h | 90 +++++++++
seqtk.c | 663 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
8 files changed, 1204 insertions(+), 67 deletions(-)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..0380c7e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
+The MIT License
+Copyright (c) 20082-2012 by Heng Li <lh3 at me.com>
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/Makefile b/Makefile
index 287a36e..12bd575 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,10 @@
CC=gcc
-CFLAGS=-g -Wall -O2
+CFLAGS=-g -Wall -O2 -Wno-unused-function
+
+all:seqtk
seqtk:seqtk.c khash.h kseq.h
$(CC) $(CFLAGS) seqtk.c -o $@ -lz -lm
clean:
- rm -fr gmon.out *.o ext/*.o a.out seqtk *~ *.a *.dSYM session*
+ rm -fr gmon.out *.o ext/*.o a.out seqtk trimadap *~ *.a *.dSYM session*
diff --git a/README.md b/README.md
index 0de5349..6dec1cc 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,17 @@
Introduction
------------
-Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format. It seamlessly parses both FASTA and FASTQ files which can also be optionally compressed by gzip.
-
-Examples
---------
+Seqtk is a fast and lightweight tool for processing sequences in the FASTA or
+FASTQ format. It seamlessly parses both FASTA and FASTQ files which can also be
+optionally compressed by gzip. To install `seqtk`,
+```sh
+git clone https://github.com/lh3/seqtk.git;
+cd seqtk; make
+```
+The only library dependency is zlib.
+
+Seqtk Examples
+--------------
* Convert FASTQ to FASTA:
@@ -51,3 +58,4 @@ Examples
* Trim 5bp from the left end of each read and 10bp from the right end:
seqtk trimfq -b 5 -e 10 in.fa > out.fa
+
diff --git a/kseq.h b/kseq.h
index a5cec7c..b2238d1 100644
--- a/kseq.h
+++ b/kseq.h
@@ -70,8 +70,7 @@
if (ks->begin >= ks->end) { \
ks->begin = 0; \
ks->end = __read(ks->f, ks->buf, __bufsize); \
- if (ks->end < __bufsize) ks->is_eof = 1; \
- if (ks->end == 0) return -1; \
+ if (ks->end == 0) { ks->is_eof = 1; return -1;} \
} \
return (int)ks->buf[ks->begin++]; \
}
@@ -91,17 +90,16 @@ typedef struct __kstring_t {
#define __KS_GETUNTIL(__read, __bufsize) \
static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
{ \
+ int gotany = 0; \
if (dret) *dret = 0; \
str->l = append? str->l : 0; \
- if (ks->begin >= ks->end && ks->is_eof) return -1; \
for (;;) { \
int i; \
if (ks->begin >= ks->end) { \
if (!ks->is_eof) { \
ks->begin = 0; \
ks->end = __read(ks->f, ks->buf, __bufsize); \
- if (ks->end < __bufsize) ks->is_eof = 1; \
- if (ks->end == 0) break; \
+ if (ks->end == 0) { ks->is_eof = 1; break; } \
} else break; \
} \
if (delimiter == KS_SEP_LINE) { \
@@ -122,6 +120,7 @@ typedef struct __kstring_t {
kroundup32(str->m); \
str->s = (char*)realloc(str->s, str->m); \
} \
+ gotany = 1; \
memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
str->l = str->l + (i - ks->begin); \
ks->begin = i + 1; \
@@ -130,6 +129,7 @@ typedef struct __kstring_t {
break; \
} \
} \
+ if (!gotany && ks_eof(ks)) return -1; \
if (str->s == 0) { \
str->m = 1; \
str->s = (char*)calloc(1, 1); \
diff --git a/ksort.h b/ksort.h
new file mode 100644
index 0000000..4da7a13
--- /dev/null
+++ b/ksort.h
@@ -0,0 +1,298 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2011 Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/*
+ 2011-04-10 (0.1.6):
+
+ * Added sample
+
+ 2011-03 (0.1.5):
+
+ * Added shuffle/permutation
+
+ 2008-11-16 (0.1.4):
+
+ * Fixed a bug in introsort() that happens in rare cases.
+
+ 2008-11-05 (0.1.3):
+
+ * Fixed a bug in introsort() for complex comparisons.
+
+ * Fixed a bug in mergesort(). The previous version is not stable.
+
+ 2008-09-15 (0.1.2):
+
+ * Accelerated introsort. On my Mac (not on another Linux machine),
+ my implementation is as fast as std::sort on random input.
+
+ * Added combsort and in introsort, switch to combsort if the
+ recursion is too deep.
+
+ 2008-09-13 (0.1.1):
+
+ * Added k-small algorithm
+
+ 2008-09-05 (0.1.0):
+
+ * Initial version
+
+*/
+
+#ifndef AC_KSORT_H
+#define AC_KSORT_H
+
+#include <stdlib.h>
+#include <string.h>
+
+typedef struct {
+ void *left, *right;
+ int depth;
+} ks_isort_stack_t;
+
+#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
+
+#define KSORT_INIT(name, type_t, __sort_lt) \
+ void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \
+ { \
+ type_t *a2[2], *a, *b; \
+ int curr, shift; \
+ \
+ a2[0] = array; \
+ a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \
+ for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) { \
+ a = a2[curr]; b = a2[1-curr]; \
+ if (shift == 0) { \
+ type_t *p = b, *i, *eb = a + n; \
+ for (i = a; i < eb; i += 2) { \
+ if (i == eb - 1) *p++ = *i; \
+ else { \
+ if (__sort_lt(*(i+1), *i)) { \
+ *p++ = *(i+1); *p++ = *i; \
+ } else { \
+ *p++ = *i; *p++ = *(i+1); \
+ } \
+ } \
+ } \
+ } else { \
+ size_t i, step = 1ul<<shift; \
+ for (i = 0; i < n; i += step<<1) { \
+ type_t *p, *j, *k, *ea, *eb; \
+ if (n < i + step) { \
+ ea = a + n; eb = a; \
+ } else { \
+ ea = a + i + step; \
+ eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
+ } \
+ j = a + i; k = a + i + step; p = b + i; \
+ while (j < ea && k < eb) { \
+ if (__sort_lt(*k, *j)) *p++ = *k++; \
+ else *p++ = *j++; \
+ } \
+ while (j < ea) *p++ = *j++; \
+ while (k < eb) *p++ = *k++; \
+ } \
+ } \
+ curr = 1 - curr; \
+ } \
+ if (curr == 1) { \
+ type_t *p = a2[0], *i = a2[1], *eb = array + n; \
+ for (; p < eb; ++i) *p++ = *i; \
+ } \
+ if (temp == 0) free(a2[1]); \
+ } \
+ void ks_heapadjust_##name(size_t i, size_t n, type_t l[]) \
+ { \
+ size_t k = i; \
+ type_t tmp = l[i]; \
+ while ((k = (k << 1) + 1) < n) { \
+ if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k; \
+ if (__sort_lt(l[k], tmp)) break; \
+ l[i] = l[k]; i = k; \
+ } \
+ l[i] = tmp; \
+ } \
+ void ks_heapmake_##name(size_t lsize, type_t l[]) \
+ { \
+ size_t i; \
+ for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) \
+ ks_heapadjust_##name(i, lsize, l); \
+ } \
+ void ks_heapsort_##name(size_t lsize, type_t l[]) \
+ { \
+ size_t i; \
+ for (i = lsize - 1; i > 0; --i) { \
+ type_t tmp; \
+ tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
+ } \
+ } \
+ static inline void __ks_insertsort_##name(type_t *s, type_t *t) \
+ { \
+ type_t *i, *j, swap_tmp; \
+ for (i = s + 1; i < t; ++i) \
+ for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \
+ swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \
+ } \
+ } \
+ void ks_combsort_##name(size_t n, type_t a[]) \
+ { \
+ const double shrink_factor = 1.2473309501039786540366528676643; \
+ int do_swap; \
+ size_t gap = n; \
+ type_t tmp, *i, *j; \
+ do { \
+ if (gap > 2) { \
+ gap = (size_t)(gap / shrink_factor); \
+ if (gap == 9 || gap == 10) gap = 11; \
+ } \
+ do_swap = 0; \
+ for (i = a; i < a + n - gap; ++i) { \
+ j = i + gap; \
+ if (__sort_lt(*j, *i)) { \
+ tmp = *i; *i = *j; *j = tmp; \
+ do_swap = 1; \
+ } \
+ } \
+ } while (do_swap || gap > 2); \
+ if (gap != 1) __ks_insertsort_##name(a, a + n); \
+ } \
+ void ks_introsort_##name(size_t n, type_t a[]) \
+ { \
+ int d; \
+ ks_isort_stack_t *top, *stack; \
+ type_t rp, swap_tmp; \
+ type_t *s, *t, *i, *j, *k; \
+ \
+ if (n < 1) return; \
+ else if (n == 2) { \
+ if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
+ return; \
+ } \
+ for (d = 2; 1ul<<d < n; ++d); \
+ stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
+ top = stack; s = a; t = a + (n-1); d <<= 1; \
+ while (1) { \
+ if (s < t) { \
+ if (--d == 0) { \
+ ks_combsort_##name(t - s + 1, s); \
+ t = s; \
+ continue; \
+ } \
+ i = s; j = t; k = i + ((j-i)>>1) + 1; \
+ if (__sort_lt(*k, *i)) { \
+ if (__sort_lt(*k, *j)) k = j; \
+ } else k = __sort_lt(*j, *i)? i : j; \
+ rp = *k; \
+ if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \
+ for (;;) { \
+ do ++i; while (__sort_lt(*i, rp)); \
+ do --j; while (i <= j && __sort_lt(rp, *j)); \
+ if (j <= i) break; \
+ swap_tmp = *i; *i = *j; *j = swap_tmp; \
+ } \
+ swap_tmp = *i; *i = *t; *t = swap_tmp; \
+ if (i-s > t-i) { \
+ if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
+ s = t-i > 16? i+1 : t; \
+ } else { \
+ if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
+ t = i-s > 16? i-1 : s; \
+ } \
+ } else { \
+ if (top == stack) { \
+ free(stack); \
+ __ks_insertsort_##name(a, a+n); \
+ return; \
+ } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
+ } \
+ } \
+ } \
+ /* This function is adapted from: http://ndevilla.free.fr/median/ */ \
+ /* 0 <= kk < n */ \
+ type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \
+ { \
+ type_t *low, *high, *k, *ll, *hh, *mid; \
+ low = arr; high = arr + n - 1; k = arr + kk; \
+ for (;;) { \
+ if (high <= low) return *k; \
+ if (high == low + 1) { \
+ if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+ return *k; \
+ } \
+ mid = low + (high - low) / 2; \
+ if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
+ if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+ if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
+ KSORT_SWAP(type_t, *mid, *(low+1)); \
+ ll = low + 1; hh = high; \
+ for (;;) { \
+ do ++ll; while (__sort_lt(*ll, *low)); \
+ do --hh; while (__sort_lt(*low, *hh)); \
+ if (hh < ll) break; \
+ KSORT_SWAP(type_t, *ll, *hh); \
+ } \
+ KSORT_SWAP(type_t, *low, *hh); \
+ if (hh <= k) low = ll; \
+ if (hh >= k) high = hh - 1; \
+ } \
+ } \
+ void ks_shuffle_##name(size_t n, type_t a[]) \
+ { \
+ int i, j; \
+ for (i = n; i > 1; --i) { \
+ type_t tmp; \
+ j = (int)(drand48() * i); \
+ tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \
+ } \
+ } \
+ void ks_sample_##name(size_t n, size_t r, type_t a[]) /* FIXME: NOT TESTED!!! */ \
+ { /* reference: http://code.activestate.com/recipes/272884/ */ \
+ int i, k, pop = n; \
+ for (i = (int)r, k = 0; i >= 0; --i) { \
+ double z = 1., x = drand48(); \
+ type_t tmp; \
+ while (x < z) z -= z * i / (pop--); \
+ if (k != n - pop - 1) tmp = a[k], a[k] = a[n-pop-1], a[n-pop-1] = tmp; \
+ ++k; \
+ } \
+ }
+
+#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
+#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
+#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
+#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
+#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
+#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
+#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
+#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a)
+
+#define ks_lt_generic(a, b) ((a) < (b))
+#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
+
+typedef const char *ksstr_t;
+
+#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
+#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
+
+#endif
diff --git a/kstring.h b/kstring.h
new file mode 100644
index 0000000..6639bd8
--- /dev/null
+++ b/kstring.h
@@ -0,0 +1,169 @@
+/* The MIT License
+
+ Copyright (c) by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#ifndef KSTRING_H
+#define KSTRING_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ uint32_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+typedef struct {
+ uint64_t tab[4];
+ int sep, finished;
+ const char *p; // end of the current token
+} ks_tokaux_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ int ksprintf(kstring_t *s, const char *fmt, ...);
+ int ksprintf_fast(kstring_t *s, const char *fmt, ...);
+ int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
+ char *kstrstr(const char *str, const char *pat, int **_prep);
+ char *kstrnstr(const char *str, const char *pat, int n, int **_prep);
+ void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep);
+
+ /* kstrtok() is similar to strtok_r() except that str is not
+ * modified and both str and sep can be NULL. For efficiency, it is
+ * actually recommended to set both to NULL in the subsequent calls
+ * if sep is not changed. */
+ char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux);
+
+#ifdef __cplusplus
+}
+#endif
+
+static inline void ks_resize(kstring_t *s, size_t size)
+{
+ if (s->m < size) {
+ s->m = size;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+}
+
+static inline int kputsn(const char *p, int l, kstring_t *s)
+{
+ if (s->l + l + 1 >= s->m) {
+ s->m = s->l + l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+ memcpy(s->s + s->l, p, l);
+ s->l += l;
+ s->s[s->l] = 0;
+ return l;
+}
+
+static inline int kputs(const char *p, kstring_t *s)
+{
+ return kputsn(p, strlen(p), s);
+}
+
+static inline int kputc(int c, kstring_t *s)
+{
+ if (s->l + 1 >= s->m) {
+ s->m = s->l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+ s->s[s->l++] = c;
+ s->s[s->l] = 0;
+ return c;
+}
+
+static inline int kputw(int c, kstring_t *s)
+{
+ char buf[16];
+ int l, x;
+ if (c == 0) return kputc('0', s);
+ for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+ if (c < 0) buf[l++] = '-';
+ if (s->l + l + 1 >= s->m) {
+ s->m = s->l + l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+ for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
+ s->s[s->l] = 0;
+ return 0;
+}
+
+static inline int kputuw(unsigned c, kstring_t *s)
+{
+ char buf[16];
+ int l, i;
+ unsigned x;
+ if (c == 0) return kputc('0', s);
+ for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+ if (s->l + l + 1 >= s->m) {
+ s->m = s->l + l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+ for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
+ s->s[s->l] = 0;
+ return 0;
+}
+
+static inline int kputl(long c, kstring_t *s)
+{
+ char buf[32];
+ long l, x;
+ if (c == 0) return kputc('0', s);
+ for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+ if (c < 0) buf[l++] = '-';
+ if (s->l + l + 1 >= s->m) {
+ s->m = s->l + l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+ for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
+ s->s[s->l] = 0;
+ return 0;
+}
+
+static inline int *ksplit(kstring_t *s, int delimiter, int *n)
+{
+ int max = 0, *offsets = 0;
+ *n = ksplit_core(s->s, delimiter, &max, &offsets);
+ return offsets;
+}
+
+#endif
diff --git a/kvec.h b/kvec.h
new file mode 100644
index 0000000..676be8b
--- /dev/null
+++ b/kvec.h
@@ -0,0 +1,90 @@
+/* The MIT License
+
+ Copyright (c) 2008, by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/*
+ An example:
+
+#include "kvec.h"
+int main() {
+ kvec_t(int) array;
+ kv_init(array);
+ kv_push(int, array, 10); // append
+ kv_a(int, array, 20) = 5; // dynamic
+ kv_A(array, 20) = 4; // static
+ kv_destroy(array);
+ return 0;
+}
+*/
+
+/*
+ 2008-09-22 (0.1.0):
+
+ * The initial version.
+
+*/
+
+#ifndef AC_KVEC_H
+#define AC_KVEC_H
+
+#include <stdlib.h>
+
+#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+
+#define kvec_t(type) struct { size_t n, m; type *a; }
+#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
+#define kv_destroy(v) free((v).a)
+#define kv_A(v, i) ((v).a[(i)])
+#define kv_pop(v) ((v).a[--(v).n])
+#define kv_size(v) ((v).n)
+#define kv_max(v) ((v).m)
+
+#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m))
+
+#define kv_copy(type, v1, v0) do { \
+ if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \
+ (v1).n = (v0).n; \
+ memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
+ } while (0) \
+
+#define kv_push(type, v, x) do { \
+ if ((v).n == (v).m) { \
+ (v).m = (v).m? (v).m<<1 : 2; \
+ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
+ } \
+ (v).a[(v).n++] = (x); \
+ } while (0)
+
+#define kv_pushp(type, v) (((v).n == (v).m)? \
+ ((v).m = ((v).m? (v).m<<1 : 2), \
+ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
+ : 0), ((v).a + ((v).n++))
+
+#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \
+ ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
+ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
+ : (v).n <= (size_t)(i)? (v).n = (i) + 1 \
+ : 0), (v).a[(i)])
+
+#endif
diff --git a/seqtk.c b/seqtk.c
index 9cbcfad..feaeda8 100644
--- a/seqtk.c
+++ b/seqtk.c
@@ -27,10 +27,12 @@
#include <ctype.h>
#include <stdlib.h>
#include <stdint.h>
+#include <inttypes.h>
#include <zlib.h>
#include <string.h>
#include <unistd.h>
#include <limits.h>
+#include <assert.h>
#include <math.h>
#include "kseq.h"
@@ -43,6 +45,7 @@ typedef struct {
#include "khash.h"
KHASH_MAP_INIT_STR(reg, reglist_t)
+KHASH_SET_INIT_INT64(64)
typedef kh_reg_t reghash_t;
@@ -129,6 +132,26 @@ unsigned char seq_nt16_table[256] = {
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
};
+unsigned char seq_nt6_table[256] = {
+ 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
+};
+
char *seq_nt16_rev_table = "XACMGRSVTWYHKDBN";
unsigned char seq_nt16to4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
unsigned char seq_nt16comp_table[] = { 0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 };
@@ -146,7 +169,7 @@ char comp_tab[] = {
static void stk_printstr(const kstring_t *s, unsigned line_len)
{
- if (line_len != UINT_MAX) {
+ if (line_len != UINT_MAX && line_len != 0) {
int i, rest = s->l;
for (i = 0; i < s->l; i += line_len, rest -= line_len) {
putchar('\n');
@@ -160,10 +183,13 @@ static void stk_printstr(const kstring_t *s, unsigned line_len)
}
}
-void stk_printseq(const kseq_t *s, int line_len)
+static inline void stk_printseq_renamed(const kseq_t *s, int line_len, const char *prefix, int64_t n)
{
putchar(s->qual.l? '@' : '>');
- fputs(s->name.s, stdout);
+ if (n >= 0) {
+ if (prefix) fputs(prefix, stdout);
+ printf("%lld", (long long)n);
+ } else fputs(s->name.s, stdout);
if (s->comment.l) {
putchar(' '); fputs(s->comment.s, stdout);
}
@@ -174,19 +200,94 @@ void stk_printseq(const kseq_t *s, int line_len)
}
}
+static inline void stk_printseq(const kseq_t *s, int line_len)
+{
+ stk_printseq_renamed(s, line_len, 0, -1);
+}
+
+/*
+ 64-bit Mersenne Twister pseudorandom number generator. Adapted from:
+
+ http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/VERSIONS/C-LANG/mt19937-64.c
+
+ which was written by Takuji Nishimura and Makoto Matsumoto and released
+ under the 3-clause BSD license.
+*/
+
+typedef uint64_t krint64_t;
+
+struct _krand_t;
+typedef struct _krand_t krand_t;
+
+#define KR_NN 312
+#define KR_MM 156
+#define KR_UM 0xFFFFFFFF80000000ULL /* Most significant 33 bits */
+#define KR_LM 0x7FFFFFFFULL /* Least significant 31 bits */
+
+struct _krand_t {
+ int mti;
+ krint64_t mt[KR_NN];
+};
+
+static void kr_srand0(krint64_t seed, krand_t *kr)
+{
+ kr->mt[0] = seed;
+ for (kr->mti = 1; kr->mti < KR_NN; ++kr->mti)
+ kr->mt[kr->mti] = 6364136223846793005ULL * (kr->mt[kr->mti - 1] ^ (kr->mt[kr->mti - 1] >> 62)) + kr->mti;
+}
+
+krand_t *kr_srand(krint64_t seed)
+{
+ krand_t *kr;
+ kr = malloc(sizeof(krand_t));
+ kr_srand0(seed, kr);
+ return kr;
+}
+
+krint64_t kr_rand(krand_t *kr)
+{
+ krint64_t x;
+ static const krint64_t mag01[2] = { 0, 0xB5026F5AA96619E9ULL };
+ if (kr->mti >= KR_NN) {
+ int i;
+ if (kr->mti == KR_NN + 1) kr_srand0(5489ULL, kr);
+ for (i = 0; i < KR_NN - KR_MM; ++i) {
+ x = (kr->mt[i] & KR_UM) | (kr->mt[i+1] & KR_LM);
+ kr->mt[i] = kr->mt[i + KR_MM] ^ (x>>1) ^ mag01[(int)(x&1)];
+ }
+ for (; i < KR_NN - 1; ++i) {
+ x = (kr->mt[i] & KR_UM) | (kr->mt[i+1] & KR_LM);
+ kr->mt[i] = kr->mt[i + (KR_MM - KR_NN)] ^ (x>>1) ^ mag01[(int)(x&1)];
+ }
+ x = (kr->mt[KR_NN - 1] & KR_UM) | (kr->mt[0] & KR_LM);
+ kr->mt[KR_NN - 1] = kr->mt[KR_MM - 1] ^ (x>>1) ^ mag01[(int)(x&1)];
+ kr->mti = 0;
+ }
+ x = kr->mt[kr->mti++];
+ x ^= (x >> 29) & 0x5555555555555555ULL;
+ x ^= (x << 17) & 0x71D67FFFEDA60000ULL;
+ x ^= (x << 37) & 0xFFF7EEE000000000ULL;
+ x ^= (x >> 43);
+ return x;
+}
+
+#define kr_drand(_kr) ((kr_rand(_kr) >> 11) * (1.0/9007199254740992.0))
+
+
/* quality based trimming with Mott's algorithm */
int stk_trimfq(int argc, char *argv[])
{ // FIXME: when a record with zero length will always be treated as a fasta record
gzFile fp;
kseq_t *seq;
double param = 0.05, q_int2real[128];
- int i, c, min_len = 30, left = 0, right = 0;
- while ((c = getopt(argc, argv, "l:q:b:e:")) >= 0) {
+ int i, c, min_len = 30, left = 0, right = 0, fixed_len = -1;
+ while ((c = getopt(argc, argv, "l:q:b:e:L:")) >= 0) {
switch (c) {
case 'q': param = atof(optarg); break;
case 'l': min_len = atoi(optarg); break;
case 'b': left = atoi(optarg); break;
case 'e': right = atoi(optarg); break;
+ case 'L': fixed_len = atoi(optarg); break;
}
}
if (optind == argc) {
@@ -196,6 +297,7 @@ int stk_trimfq(int argc, char *argv[])
fprintf(stderr, " -l INT maximally trim down to INT bp (disabled by -b/-e) [%d]\n", min_len);
fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-l) [0]\n");
fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-l) [0]\n");
+ fprintf(stderr, " -L INT retain at most INT bp from the 5'-end (non-zero to disable -q/-l) [0]\n");
fprintf(stderr, "\n");
return 1;
}
@@ -206,9 +308,10 @@ int stk_trimfq(int argc, char *argv[])
while (kseq_read(seq) >= 0) {
int beg, tmp, end;
double s, max;
- if (left || right) {
+ if (left || right || fixed_len > 0) {
beg = left; end = seq->seq.l - right;
if (beg >= end) beg = end = 0;
+ if (fixed_len > 0 && end - beg > fixed_len) end = beg + fixed_len;
} else if (seq->qual.l > min_len) {
for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) {
int q = seq->qual.s[i];
@@ -218,6 +321,10 @@ int stk_trimfq(int argc, char *argv[])
if (s > max) max = s, beg = tmp, end = i + 1;
if (s < 0) s = 0, tmp = i + 1;
}
+
+ /* max never set; all low qual, just give first min_len bp */
+ if (max == 0.) beg = 0, end = min_len;
+
if (end - beg < min_len) { // window-based
int is, imax;
for (i = 0, is = 0; i < min_len; ++i)
@@ -252,18 +359,19 @@ int stk_comp(int argc, char *argv[])
int l, c, upper_only = 0;
reghash_t *h = 0;
reglist_t dummy;
+
while ((c = getopt(argc, argv, "ur:")) >= 0) {
switch (c) {
case 'u': upper_only = 1; break;
case 'r': h = stk_reg_read(optarg); break;
}
}
- if (argc == optind) {
+ if (argc == optind && isatty(fileno(stdin))) {
fprintf(stderr, "Usage: seqtk comp [-u] [-r in.bed] <in.fa>\n\n");
fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n");
return 1;
}
- fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+ fp = optind < argc && strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
seq = kseq_init(fp);
dummy.n= dummy.m = 1; dummy.a = calloc(1, 8);
while ((l = kseq_read(seq)) >= 0) {
@@ -422,7 +530,7 @@ int stk_subseq(int argc, char *argv[])
khash_t(reg) *h = kh_init(reg);
gzFile fp;
kseq_t *seq;
- int l, i, j, c, is_tab = 0, line = 1024;
+ int l, i, j, c, is_tab = 0, line = 0;
khint_t k;
while ((c = getopt(argc, argv, "tl:")) >= 0) {
switch (c) {
@@ -461,17 +569,18 @@ int stk_subseq(int argc, char *argv[])
if (beg) printf(":%d", beg+1);
} else printf(":%d-%d", beg+1, end);
}
+ if (seq->comment.l) printf(" %s", seq->comment.s);
} else printf("%s\t%d\t", seq->name.s, beg + 1);
if (end > seq->seq.l) end = seq->seq.l;
for (j = 0; j < end - beg; ++j) {
- if (is_tab == 0 && j % line == 0) putchar('\n');
+ if (is_tab == 0 && (j == 0 || (line > 0 && j % line == 0))) putchar('\n');
putchar(seq->seq.s[j + beg]);
}
putchar('\n');
if (seq->qual.l != seq->seq.l || is_tab) continue;
printf("+");
for (j = 0; j < end - beg; ++j) {
- if (j % line == 0) putchar('\n');
+ if (j == 0 || (line > 0 && j % line == 0)) putchar('\n');
putchar(seq->qual.s[j + beg]);
}
putchar('\n');
@@ -588,8 +697,9 @@ int stk_famask(int argc, char *argv[])
{
gzFile fp[2];
kseq_t *seq[2];
- int i, l;
- if (argc < 3) {
+ int i, l, c;
+ while ((c = getopt(argc, argv, "")) >= 0);
+ if (argc - optind < 2) {
fprintf(stderr, "Usage: seqtk famask <src.fa> <mask.fa>\n");
return 1;
}
@@ -756,12 +866,19 @@ static void print_seq(FILE *fpout, const kseq_t *ks, int begin, int end)
{
int i;
if (begin >= end) return; // FIXME: why may this happen? Understand it!
- fprintf(fpout, ">%s:%d-%d", ks->name.s, begin+1, end);
+ fprintf(fpout, "%c%s:%d-%d", ks->qual.l? '@' : '>', ks->name.s, begin+1, end);
for (i = begin; i < end && i < ks->seq.l; ++i) {
if ((i - begin)%60 == 0) fputc('\n', fpout);
fputc(ks->seq.s[i], fpout);
}
fputc('\n', fpout);
+ if (ks->qual.l == 0) return;
+ fputs("+\n", fpout);
+ for (i = begin; i < end && i < ks->qual.l; ++i) {
+ if ((i - begin)%60 == 0) fputc('\n', fpout);
+ fputc(ks->qual.s[i], fpout);
+ }
+ fputc('\n', fpout);
}
int stk_cutN(int argc, char *argv[])
{
@@ -848,48 +965,105 @@ static void cpy_kseq(kseq_t *dst, const kseq_t *src)
cpy_kstr(&dst->name, &src->name);
cpy_kstr(&dst->seq, &src->seq);
cpy_kstr(&dst->qual, &src->qual);
+ cpy_kstr(&dst->comment, &src->comment);
}
int stk_sample(int argc, char *argv[])
{
- int c;
+ int c, twopass = 0;
uint64_t i, num = 0, n_seqs = 0;
double frac = 0.;
gzFile fp;
- kseq_t *seq, *buf = 0;
+ kseq_t *seq;
+ krand_t *kr = 0;
+
+ while ((c = getopt(argc, argv, "2s:")) >= 0)
+ if (c == 's') kr = kr_srand(atol(optarg));
+ else if (c == '2') twopass = 1;
- srand48(11);
- while ((c = getopt(argc, argv, "s:")) >= 0)
- switch (c) {
- case 's': srand48(atoi(optarg)); break;
- }
if (optind + 2 > argc) {
- fprintf(stderr, "Usage: seqtk sample [-s seed=11] <in.fa> <frac>|<number>\n\n");
- fprintf(stderr, "Warning: Large memory consumption for large <number>.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: seqtk sample [-2] [-s seed=11] <in.fa> <frac>|<number>\n\n");
+ fprintf(stderr, "Options: -s INT RNG seed [11]\n");
+ fprintf(stderr, " -2 2-pass mode: twice as slow but with much reduced memory\n\n");
return 1;
}
frac = atof(argv[optind+1]);
if (frac > 1.) num = (uint64_t)(frac + .499), frac = 0.;
- if (num > 0) buf = calloc(num, sizeof(kseq_t));
+ else if (twopass) {
+ fprintf(stderr, "[W::%s] when sampling a fraction, option -2 is ignored.", __func__);
+ twopass = 0;
+ }
+ if (kr == 0) kr = kr_srand(11);
- fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
- seq = kseq_init(fp);
- while (kseq_read(seq) >= 0) {
- double r = drand48();
- ++n_seqs;
- if (num) {
- uint64_t y = n_seqs - 1 < num? n_seqs - 1 : (uint64_t)(r * n_seqs);
- if (y < num) cpy_kseq(&buf[y], seq);
- } else if (r < frac) stk_printseq(seq, UINT_MAX);
+ if (!twopass) { // the streaming version
+ kseq_t *buf = 0;
+ if (num > 0) buf = calloc(num, sizeof(kseq_t));
+ if (num > 0 && buf == NULL) {
+ fprintf(stderr, "[E::%s] Could not allocate enough memory for %" PRIu64 " sequences. Exiting...\n", __func__, num);
+ free(kr);
+ return 1;
+ }
+
+ fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+ seq = kseq_init(fp);
+ n_seqs = 0;
+ while (kseq_read(seq) >= 0) {
+ double r = kr_drand(kr);
+ ++n_seqs;
+ if (num) {
+ uint64_t y = n_seqs - 1 < num? n_seqs - 1 : (uint64_t)(r * n_seqs);
+ if (y < num) cpy_kseq(&buf[y], seq);
+ } else if (r < frac) stk_printseq(seq, UINT_MAX);
+ }
+ for (i = 0; i < num; ++i) {
+ kseq_t *p = &buf[i];
+ if (p->seq.l) stk_printseq(p, UINT_MAX);
+ free(p->seq.s); free(p->qual.s); free(p->name.s);
+ }
+ if (buf != NULL) free(buf);
+ } else {
+ uint64_t *buf;
+ khash_t(64) *hash;
+ int absent;
+
+ if (strcmp(argv[optind], "-") == 0) {
+ fprintf(stderr, "[E::%s] in the 2-pass mode, the input cannot be STDIN.\n", __func__);
+ free(kr);
+ return 1;
+ }
+
+ // 1st pass
+ buf = malloc(num * 8);
+ for (i = 0; i < num; ++i) buf[i] = UINT64_MAX;
+ fp = gzopen(argv[optind], "r");
+ seq = kseq_init(fp);
+ n_seqs = 0;
+ while (kseq_read(seq) >= 0) {
+ double r = kr_drand(kr);
+ uint64_t y;
+ ++n_seqs;
+ y = n_seqs - 1 < num? n_seqs - 1 : (uint64_t)(r * n_seqs);
+ if (y < num) buf[y] = n_seqs;
+ }
+ kseq_destroy(seq);
+ gzclose(fp);
+ hash = kh_init(64);
+ for (i = 0; i < num; ++i) kh_put(64, hash, buf[i], &absent);
+ free(buf);
+ // 2nd pass
+ fp = gzopen(argv[optind], "r");
+ seq = kseq_init(fp);
+ n_seqs = 0;
+ while (kseq_read(seq) >= 0)
+ if (kh_get(64, hash, ++n_seqs) != kh_end(hash))
+ stk_printseq(seq, UINT_MAX);
+ kh_destroy(64, hash);
}
+
kseq_destroy(seq);
gzclose(fp);
- for (i = 0; i < num; ++i) {
- kseq_t *p = &buf[i];
- if (p->seq.l) stk_printseq(p, UINT_MAX);
- free(p->seq.s); free(p->qual.s); free(p->name.s);
- }
- free(buf);
+ free(kr);
return 0;
}
@@ -943,33 +1117,43 @@ int stk_seq(int argc, char *argv[])
{
gzFile fp;
kseq_t *seq;
- int c, qual_thres = 0, flag = 0, qual_shift = 33, mask_chr = 0, min_len = 0;
- unsigned line_len = 0;
+ int c, qual_thres = 0, flag = 0, qual_shift = 33, mask_chr = 0, min_len = 0, max_q = 255;
+ unsigned i, line_len = 0;
+ int64_t n_seqs = 0;
double frac = 1.;
khash_t(reg) *h = 0;
+ krand_t *kr = 0;
- srand48(11);
- while ((c = getopt(argc, argv, "q:l:Q:aACrn:s:f:M:L:c")) >= 0) {
+ while ((c = getopt(argc, argv, "N12q:l:Q:aACrn:s:f:M:L:cVUX:S")) >= 0) {
switch (c) {
case 'a':
case 'A': flag |= 1; break;
case 'C': flag |= 2; break;
case 'r': flag |= 4; break;
case 'c': flag |= 8; break;
+ case '1': flag |= 16; break;
+ case '2': flag |= 32; break;
+ case 'V': flag |= 64; break;
+ case 'N': flag |= 128; break;
+ case 'U': flag |= 256; break;
+ case 'S': flag |= 512; break;
case 'M': h = stk_reg_read(optarg); break;
case 'n': mask_chr = *optarg; break;
case 'Q': qual_shift = atoi(optarg); break;
case 'q': qual_thres = atoi(optarg); break;
+ case 'X': max_q = atoi(optarg); break;
case 'l': line_len = atoi(optarg); break;
case 'L': min_len = atoi(optarg); break;
- case 's': srand48(atoi(optarg)); break;
+ case 's': kr = kr_srand(atol(optarg)); break;
case 'f': frac = atof(optarg); break;
}
}
- if (argc == optind) {
+ if (kr == 0) kr = kr_srand(11);
+ if (argc == optind && isatty(fileno(stdin))) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: seqtk seq [options] <in.fq>|<in.fa>\n\n");
fprintf(stderr, "Options: -q INT mask bases with quality lower than INT [0]\n");
+ fprintf(stderr, " -X INT mask bases with quality higher than INT [255]\n");
fprintf(stderr, " -n CHAR masked bases converted to CHAR; 0 for lowercase [0]\n");
fprintf(stderr, " -l INT number of residues per line; 0 for 2^32-1 [%d]\n", line_len);
fprintf(stderr, " -Q INT quality shift: ASCII-INT gives base quality [%d]\n", qual_shift);
@@ -981,34 +1165,60 @@ int stk_seq(int argc, char *argv[])
fprintf(stderr, " -r reverse complement\n");
fprintf(stderr, " -A force FASTA output (discard quality)\n");
fprintf(stderr, " -C drop comments at the header lines\n");
+ fprintf(stderr, " -N drop sequences containing ambiguous bases\n");
+ fprintf(stderr, " -1 output the 2n-1 reads only\n");
+ fprintf(stderr, " -2 output the 2n reads only\n");
+ fprintf(stderr, " -V shift quality by '(-Q) - 33'\n");
+ fprintf(stderr, " -U convert all bases to uppercases\n");
+ fprintf(stderr, " -S strip of white spaces in sequences\n");
fprintf(stderr, "\n");
+ free(kr);
return 1;
}
if (line_len == 0) line_len = UINT_MAX;
- fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+ fp = optind < argc && strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
seq = kseq_init(fp);
qual_thres += qual_shift;
while (kseq_read(seq) >= 0) {
+ ++n_seqs;
if (seq->seq.l < min_len) continue; // NB: length filter before taking random
- if (frac < 1. && drand48() >= frac) continue;
+ if (frac < 1. && kr_drand(kr) >= frac) continue;
+ if (flag & 48) { // then choose odd/even reads only
+ if ((flag&16) && (n_seqs&1) == 0) continue;
+ if ((flag&32) && (n_seqs&1) == 1) continue;
+ }
+ if (flag & 512) { // option -S: squeeze out white spaces
+ int k;
+ if (seq->qual.l) {
+ for (i = k = 0; i < seq->seq.l; ++i)
+ if (!isspace(seq->seq.s[i]))
+ seq->qual.s[k++] = seq->qual.s[i];
+ seq->qual.l = k;
+ }
+ for (i = k = 0; i < seq->seq.l; ++i)
+ if (!isspace(seq->seq.s[i]))
+ seq->seq.s[k++] = seq->seq.s[i];
+ seq->seq.l = k;
+ }
if (seq->qual.l && qual_thres > qual_shift) {
- unsigned i;
if (mask_chr) {
for (i = 0; i < seq->seq.l; ++i)
- if (seq->qual.s[i] < qual_thres)
+ if (seq->qual.s[i] < qual_thres || seq->qual.s[i] > max_q)
seq->seq.s[i] = mask_chr;
} else {
for (i = 0; i < seq->seq.l; ++i)
- if (seq->qual.s[i] < qual_thres)
+ if (seq->qual.s[i] < qual_thres || seq->qual.s[i] > max_q)
seq->seq.s[i] = tolower(seq->seq.s[i]);
}
}
- if (flag & 1) seq->qual.l = 0;
- if (flag & 2) seq->comment.l = 0;
+ if (flag & 256) // option -U: convert to uppercases
+ for (i = 0; i < seq->seq.l; ++i)
+ seq->seq.s[i] = toupper(seq->seq.s[i]);
+ if (flag & 1) seq->qual.l = 0; // option -a: fastq -> fasta
+ if (flag & 2) seq->comment.l = 0; // option -C: drop fasta/q comments
if (h) stk_mask(seq, h, flag&8, mask_chr); // masking
- if (flag & 4) { // reverse complement
+ if (flag & 4) { // option -r: reverse complement
int c0, c1;
- unsigned i;
for (i = 0; i < seq->seq.l>>1; ++i) { // reverse complement sequence
c0 = comp_tab[(int)seq->seq.s[i]];
c1 = comp_tab[(int)seq->seq.s[seq->seq.l - 1 - i]];
@@ -1022,11 +1232,340 @@ int stk_seq(int argc, char *argv[])
c0 = seq->qual.s[i], seq->qual.s[i] = seq->qual.s[seq->qual.l - 1 - i], seq->qual.s[seq->qual.l - 1 - i] = c0;
}
}
+ if ((flag & 64) && seq->qual.l && qual_shift != 33)
+ for (i = 0; i < seq->qual.l; ++i)
+ seq->qual.s[i] -= qual_shift - 33;
+ if (flag & 128) { // option -N: drop sequences containing ambiguous bases - Note: this is the last step!
+ for (i = 0; i < seq->seq.l; ++i)
+ if (seq_nt16to4_table[seq_nt16_table[(int)seq->seq.s[i]]] > 3) break;
+ if (i < seq->seq.l) continue;
+ }
stk_printseq(seq, line_len);
}
kseq_destroy(seq);
gzclose(fp);
stk_reg_destroy(h);
+ free(kr);
+ return 0;
+}
+
+int stk_gc(int argc, char *argv[])
+{
+ int c, is_at = 0, min_l = 20;
+ double frac = 0.6f, xdropoff = 10.0f, q;
+ gzFile fp;
+ kseq_t *seq;
+
+ while ((c = getopt(argc, argv, "wx:f:l:")) >= 0) {
+ if (c == 'x') xdropoff = atof(optarg);
+ else if (c == 'w') is_at = 1;
+ else if (c == 'f') frac = atof(optarg);
+ else if (c == 'l') min_l = atoi(optarg);
+ }
+ if (optind + 1 > argc) {
+ fprintf(stderr, "Usage: seqtk gc [options] <in.fa>\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -w identify high-AT regions\n");
+ fprintf(stderr, " -f FLOAT min GC fraction (or AT fraction for -w) [%.2f]\n", frac);
+ fprintf(stderr, " -l INT min region length to output [%d]\n", min_l);
+ fprintf(stderr, " -x FLOAT X-dropoff [%.1f]\n", xdropoff);
+ return 1;
+ }
+ q = (1.0f - frac) / frac;
+
+ fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+ seq = kseq_init(fp);
+ while (kseq_read(seq) >= 0) {
+ int i, start = 0, max_i = 0, n_hits = 0, start_hits = 0, max_hits = 0;
+ double sc = 0., max = 0.;
+ for (i = 0; i < seq->seq.l; ++i) {
+ int hit;
+ c = seq_nt16_table[(int)seq->seq.s[i]];
+ if (is_at) hit = (c == 1 || c == 8 || c == 9);
+ else hit = (c == 2 || c == 4 || c == 6);
+ n_hits += hit;
+ if (hit) {
+ if (sc == 0) start = i, start_hits = n_hits;
+ sc += q;
+ if (sc > max) max = sc, max_i = i, max_hits = n_hits;
+ } else if (sc > 0) {
+ sc += -1.0f;
+ if (sc < 0 || max - sc > xdropoff) {
+ if (max_i + 1 - start >= min_l)
+ printf("%s\t%d\t%d\t%d\n", seq->name.s, start, max_i + 1, max_hits - start_hits + 1);
+ sc = max = 0;
+ i = max_i;
+ }
+ }
+ }
+ if (max > 0. && max_i + 1 - start >= min_l)
+ printf("%s\t%d\t%d\t%d\n", seq->name.s, start, max_i + 1, max_hits - start_hits + 1);
+ }
+ kseq_destroy(seq);
+ gzclose(fp);
+ return 0;
+}
+
+int stk_mergepe(int argc, char *argv[])
+{
+ gzFile fp1, fp2;
+ kseq_t *seq[2];
+
+ if (argc < 3) {
+ fprintf(stderr, "Usage: seqtk mergepe <in1.fq> <in2.fq>\n");
+ return 1;
+ }
+ fp1 = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
+ fp2 = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
+ seq[0] = kseq_init(fp1);
+ seq[1] = kseq_init(fp2);
+ while (kseq_read(seq[0]) >= 0) {
+ if (kseq_read(seq[1]) < 0) {
+ fprintf(stderr, "[W::%s] the 2nd file has fewer records.\n", __func__);
+ break;
+ }
+ stk_printseq(seq[0], 0);
+ stk_printseq(seq[1], 0);
+ }
+ if (kseq_read(seq[1]) >= 0)
+ fprintf(stderr, "[W::%s] the 1st file has fewer records.\n", __func__);
+ kseq_destroy(seq[0]); gzclose(fp1);
+ kseq_destroy(seq[1]); gzclose(fp2);
+ return 0;
+}
+
+int stk_dropse(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq, last;
+
+ if (argc == 1 && isatty(fileno(stdin))) {
+ fprintf(stderr, "Usage: seqtk dropse <in.fq>\n");
+ return 1;
+ }
+ fp = argc > 1 && strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
+ seq = kseq_init(fp);
+
+ memset(&last, 0, sizeof(kseq_t));
+ while (kseq_read(seq) >= 0) {
+ if (last.name.l) {
+ kstring_t *p = &last.name, *q = &seq->name;
+ int is_diff;
+ if (p->l == q->l) {
+ int l = (p->l > 2 && p->s[p->l-2] == '/' && q->s[q->l-2] == '/' && isdigit(p->s[p->l-1]) && isdigit(q->s[q->l-1]))? p->l - 2 : p->l;
+ is_diff = strncmp(p->s, q->s, l);
+ } else is_diff = 1;
+ if (!is_diff) {
+ stk_printseq(&last, 0);
+ stk_printseq(seq, 0);
+ last.name.l = 0;
+ } else cpy_kseq(&last, seq);
+ } else cpy_kseq(&last, seq);
+ }
+
+ kseq_destroy(seq);
+ gzclose(fp);
+ // free last!
+ return 0;
+}
+
+int stk_rename(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq, last;
+ char *prefix = 0;
+ uint64_t n = 1;
+
+ if (argc == 1 && isatty(fileno(stdin))) {
+ fprintf(stderr, "Usage: seqtk rename <in.fq> [prefix]\n");
+ return 1;
+ }
+ fp = argc > 1 && strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
+ seq = kseq_init(fp);
+ if (argc > 2) prefix = argv[2];
+
+ memset(&last, 0, sizeof(kseq_t));
+ while (kseq_read(seq) >= 0) {
+ if (last.name.l) {
+ kstring_t *p = &last.name, *q = &seq->name;
+ int is_diff;
+ if (p->l == q->l) {
+ int l = (p->l > 2 && p->s[p->l-2] == '/' && q->s[q->l-2] == '/' && isdigit(p->s[p->l-1]) && isdigit(q->s[q->l-1]))? p->l - 2 : p->l;
+ is_diff = strncmp(p->s, q->s, l);
+ } else is_diff = 1;
+ if (!is_diff) {
+ stk_printseq_renamed(&last, 0, prefix, n);
+ stk_printseq_renamed(seq, 0, prefix, n);
+ last.name.l = 0;
+ ++n;
+ } else {
+ stk_printseq_renamed(&last, 0, prefix, n);
+ ++n;
+ cpy_kseq(&last, seq);
+ }
+ } else cpy_kseq(&last, seq);
+ }
+ if (last.name.l) stk_printseq_renamed(&last, 0, prefix, n);
+
+ kseq_destroy(seq);
+ gzclose(fp);
+ // free last!
+ return 0;
+}
+
+int stk_kfreq(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *ks;
+ int kmer, i, l, mask;
+ char *nei;
+
+ if (argc < 2) {
+ fprintf(stderr, "Usage: seqtk kfreq <kmer> <in.fa>\n");
+ return 1;
+ }
+
+ // get the k-mer
+ l = strlen(argv[1]);
+ for (i = kmer = 0; i < l; ++i) {
+ int c = seq_nt6_table[(int)argv[1][i]];
+ assert(c >= 1 && c <= 4);
+ kmer = kmer << 2 | (c - 1);
+ }
+ mask = (1<<2*l) - 1;
+
+ // get the neighbors
+ nei = calloc(1, 1<<2*l);
+ for (i = 0; i < l; ++i) {
+ int j, x;
+ x = kmer & ~(3 << 2*i);
+ for (j = 0; j < 4; ++j)
+ nei[x|j<<2*i] = 1;
+ }
+
+ fp = argc == 2 || strcmp(argv[2], "-") == 0? gzdopen(fileno(stdin), "r") : gzopen(argv[2], "r");
+ ks = kseq_init(fp);
+ while (kseq_read(ks) >= 0) {
+ int k, x[2], cnt[2], cnt_nei[2], which;
+ x[0] = x[1] = k = cnt[0] = cnt[1] = cnt_nei[0] = cnt_nei[1] = 0;
+ for (i = 0; i < ks->seq.l; ++i) {
+ int c = seq_nt6_table[(int)ks->seq.s[i]];
+ if (c >= 1 && c <= 4) {
+ x[0] = (x[0] << 2 | (c - 1)) & mask;
+ x[1] = (x[1] >> 2 | (4 - c) << 2*(l-1));
+ if (k < l) ++k;
+ if (k == l) {
+ if (x[0] == kmer) ++cnt[0];
+ else if (x[1] == kmer) ++cnt[1];
+ if (nei[x[0]]) ++cnt_nei[0];
+ else if (nei[x[1]]) ++cnt_nei[1];
+ }
+ } else k = 0;
+ }
+ which = cnt_nei[0] > cnt_nei[1]? 0 : 1;
+ printf("%s\t%ld\t%c\t%d\t%d\n", ks->name.s, ks->seq.l, "+-"[which], cnt_nei[which], cnt[which]);
+ }
+ kseq_destroy(ks);
+ gzclose(fp);
+ return 0;
+}
+
+/* fqchk */
+
+typedef struct {
+ int64_t q[94], b[5];
+} posstat_t;
+
+static void fqc_aux(posstat_t *p, int pos, int64_t allq[94], double perr[94], int qthres)
+{
+ int k;
+ int64_t sum = 0, qsum = 0, sum_low = 0;
+ double psum = 0;
+ if (pos <= 0) printf("ALL");
+ else printf("%d", pos);
+ for (k = 0; k <= 4; ++k) sum += p->b[k];
+ printf("\t%lld", (long long)sum);
+ for (k = 0; k <= 4; ++k)
+ printf("\t%.1f", 100. * p->b[k] / sum);
+ for (k = 0; k <= 93; ++k) {
+ qsum += p->q[k] * k, psum += p->q[k] * perr[k];
+ if (k < qthres) sum_low += p->q[k];
+ }
+ printf("\t%.1f\t%.1f", (double)qsum/sum, -4.343*log((psum+1e-6)/(sum+1e-6)));
+ if (qthres <= 0) {
+ for (k = 0; k <= 93; ++k)
+ if (allq[k] > 0) printf("\t%.2f", 100. * p->q[k] / sum);
+ } else printf("\t%.1f\t%.1f", 100. * sum_low / sum, 100. * (sum - sum_low) / sum);
+ putchar('\n');
+}
+
+int stk_fqchk(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq;
+ int i, c, k, max_len = 0, min_len = 0x7fffffff, max_alloc = 0, offset = 33, n_diffQ = 0, qthres = 20;
+ int64_t tot_len = 0, n = 0;
+ double perr[94];
+ posstat_t all, *pos = 0;
+
+ while ((c = getopt(argc, argv, "q:")) >= 0)
+ if (c == 'q') qthres = atoi(optarg);
+
+ if (optind == argc) {
+ fprintf(stderr, "Usage: seqtk fqchk [-q %d] <in.fq>\n", qthres);
+ fprintf(stderr, "Note: use -q0 to get the distribution of all quality values\n");
+ return 1;
+ }
+ fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+ seq = kseq_init(fp);
+ for (k = 0; k <= 93; ++k)
+ perr[k] = pow(10., -.1 * k);
+ perr[0] = perr[1] = perr[2] = perr[3] = .5;
+ while (kseq_read(seq) >= 0) {
+ if (seq->qual.l == 0) continue;
+ ++n;
+ tot_len += seq->seq.l;
+ min_len = min_len < seq->seq.l? min_len : seq->seq.l;
+ max_len = max_len > seq->seq.l? max_len : seq->seq.l;
+ if (max_len > max_alloc) {
+ int old_max = max_alloc;
+ max_alloc = max_len;
+ kroundup32(max_alloc);
+ pos = realloc(pos, max_alloc * sizeof(posstat_t));
+ memset(&pos[old_max], 0, (max_alloc - old_max) * sizeof(posstat_t));
+ }
+ for (i = 0; i < seq->qual.l; ++i) {
+ int q = seq->qual.s[i] - offset;
+ int b = seq_nt6_table[(int)seq->seq.s[i]];
+ b = b? b - 1 : 4;
+ q = q < 93? q : 93;
+ ++pos[i].q[q];
+ ++pos[i].b[b];
+ }
+ }
+ kseq_destroy(seq);
+ gzclose(fp);
+
+ memset(&all, 0, sizeof(posstat_t));
+ for (i = 0; i < max_len; ++i) {
+ for (k = 0; k <= 93; ++k)
+ all.q[k] += pos[i].q[k];
+ for (k = 0; k <= 4; ++k)
+ all.b[k] += pos[i].b[k];
+ }
+ for (k = n_diffQ = 0; k <= 93; ++k)
+ if (all.q[k]) ++n_diffQ;
+ printf("min_len: %d; max_len: %d; avg_len: %.2f; %d distinct quality values\n", min_len, max_len, (double)tot_len/n, n_diffQ);
+ printf("POS\t#bases\t%%A\t%%C\t%%G\t%%T\t%%N\tavgQ\terrQ");
+ if (qthres <= 0) {
+ for (k = 0; k <= 93; ++k)
+ if (all.q[k] > 0) printf("\t%%Q%d", k);
+ } else printf("\t%%low\t%%high");
+ putchar('\n');
+ fqc_aux(&all, 0, all.q, perr, qthres);
+ for (i = 0; i < max_len; ++i)
+ fqc_aux(&pos[i], i + 1, all.q, perr, qthres);
+ free(pos);
return 0;
}
@@ -1035,15 +1574,21 @@ static int usage()
{
fprintf(stderr, "\n");
fprintf(stderr, "Usage: seqtk <command> <arguments>\n");
- fprintf(stderr, "Version: 1.0-r31\n\n");
+ fprintf(stderr, "Version: 1.2-r94\n\n");
fprintf(stderr, "Command: seq common transformation of FASTA/Q\n");
fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n");
fprintf(stderr, " sample subsample sequences\n");
fprintf(stderr, " subseq extract subsequences from FASTA/Q\n");
+ fprintf(stderr, " fqchk fastq QC (base/quality summary)\n");
+ fprintf(stderr, " mergepe interleave two PE FASTA/Q files\n");
fprintf(stderr, " trimfq trim FASTQ using the Phred algorithm\n\n");
fprintf(stderr, " hety regional heterozygosity\n");
+ fprintf(stderr, " gc identify high- or low-GC regions\n");
fprintf(stderr, " mutfa point mutate FASTA at specified positions\n");
fprintf(stderr, " mergefa merge two FASTA/Q files\n");
+ fprintf(stderr, " famask apply a X-coded FASTA to a source FASTA\n");
+ fprintf(stderr, " dropse drop unpaired from interleaved PE FASTA/Q\n");
+ fprintf(stderr, " rename rename sequence names\n");
fprintf(stderr, " randbase choose a random base from hets\n");
fprintf(stderr, " cutN cut sequence at long N\n");
fprintf(stderr, " listhet extract the position of each het\n");
@@ -1055,10 +1600,14 @@ int main(int argc, char *argv[])
{
if (argc == 1) return usage();
if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1);
+ else if (strcmp(argv[1], "fqchk") == 0) stk_fqchk(argc-1, argv+1);
else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1);
+ else if (strcmp(argv[1], "gc") == 0) stk_gc(argc-1, argv+1);
else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1);
else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1);
else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1);
+ else if (strcmp(argv[1], "mergepe") == 0) stk_mergepe(argc-1, argv+1);
+ else if (strcmp(argv[1], "dropse") == 0) stk_dropse(argc-1, argv+1);
else if (strcmp(argv[1], "randbase") == 0) stk_randbase(argc-1, argv+1);
else if (strcmp(argv[1], "cutN") == 0) stk_cutN(argc-1, argv+1);
else if (strcmp(argv[1], "listhet") == 0) stk_listhet(argc-1, argv+1);
@@ -1067,8 +1616,10 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "hrun") == 0) stk_hrun(argc-1, argv+1);
else if (strcmp(argv[1], "sample") == 0) stk_sample(argc-1, argv+1);
else if (strcmp(argv[1], "seq") == 0) stk_seq(argc-1, argv+1);
+ else if (strcmp(argv[1], "kfreq") == 0) stk_kfreq(argc-1, argv+1);
+ else if (strcmp(argv[1], "rename") == 0) stk_rename(argc-1, argv+1);
else {
- fprintf(stderr, "[main] unrecognized commad '%s'. Abort!\n", argv[1]);
+ fprintf(stderr, "[main] unrecognized command '%s'. Abort!\n", argv[1]);
return 1;
}
return 0;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/seqtk.git
More information about the debian-med-commit
mailing list