[med-svn] [seqtk] 01/01: Imported Upstream version 1.0
Andreas Tille
tille at debian.org
Sat Feb 1 12:43:07 UTC 2014
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch upstream
in repository seqtk.
commit bf014dd5b20c97213dc7ac794daf032ac5d3833a
Author: Andreas Tille <tille at debian.org>
Date: Sat Feb 1 13:41:49 2014 +0100
Imported Upstream version 1.0
---
.gitignore | 4 +
Makefile | 8 +
README.md | 53 +++
debian/changelog | 5 -
debian/compat | 1 -
debian/control | 24 --
debian/copyright | 32 --
debian/get-orig-source | 3 -
debian/rules | 9 -
debian/source/format | 1 -
debian/watch | 3 -
khash.h | 548 ++++++++++++++++++++++++
kseq.h | 235 +++++++++++
seqtk.c | 1075 ++++++++++++++++++++++++++++++++++++++++++++++++
14 files changed, 1923 insertions(+), 78 deletions(-)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..86f9203
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*.o
+*.a
+*.dSYM
+.*.swp
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..287a36e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,8 @@
+CC=gcc
+CFLAGS=-g -Wall -O2
+
+seqtk:seqtk.c khash.h kseq.h
+ $(CC) $(CFLAGS) seqtk.c -o $@ -lz -lm
+
+clean:
+ rm -fr gmon.out *.o ext/*.o a.out seqtk *~ *.a *.dSYM session*
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0de5349
--- /dev/null
+++ b/README.md
@@ -0,0 +1,53 @@
+Introduction
+------------
+
+Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format. It seamlessly parses both FASTA and FASTQ files which can also be optionally compressed by gzip.
+
+Examples
+--------
+
+* Convert FASTQ to FASTA:
+
+ seqtk seq -a in.fq.gz > out.fa
+
+* Convert ILLUMINA 1.3+ FASTQ to FASTA and mask bases with quality lower than 20 to lowercases (the 1st command line) or to `N` (the 2nd):
+
+ seqtk seq -aQ64 -q20 in.fq > out.fa
+ seqtk seq -aQ64 -q20 -n N in.fq > out.fa
+
+* Fold long FASTA/Q lines and remove FASTA/Q comments:
+
+ seqtk seq -Cl60 in.fa > out.fa
+
+* Convert multi-line FASTQ to 4-line FASTQ:
+
+ seqtk seq -l0 in.fq > out.fq
+
+* Reverse complement FASTA/Q:
+
+ seqtk seq -r in.fq > out.fq
+
+* Extract sequences with names in file `name.lst`, one sequence name per line:
+
+ seqtk subseq in.fq name.lst > out.fq
+
+* Extract sequences in regions contained in file `reg.bed`:
+
+ seqtk subseq in.fa reg.bed > out.fa
+
+* Mask regions in `reg.bed` to lowercases:
+
+ seqtk seq -M reg.bed in.fa > out.fa
+
+* Subsample 10000 read pairs from two large paired FASTQ files (remember to use the same random seed to keep pairing):
+
+ seqtk sample -s100 read1.fq 10000 > sub1.fq
+ seqtk sample -s100 read2.fq 10000 > sub2.fq
+
+* Trim low-quality bases from both ends using the Phred algorithm:
+
+ seqtk trimfq in.fq > out.fq
+
+* Trim 5bp from the left end of each read and 10bp from the right end:
+
+ seqtk trimfq -b 5 -e 10 in.fa > out.fa
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index 9478225..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,5 +0,0 @@
-seqtk (0.0.20121016-1) UNRELEASED; urgency=low
-
- * Initial release (Closes: #<bug>)
-
- -- Andreas Tille <tille at debian.org> Wed, 07 Nov 2012 16:07:35 +0100
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 8c3c586..0000000
--- a/debian/control
+++ /dev/null
@@ -1,24 +0,0 @@
-Source: seqtk
-Section: science
-Priority: optional
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
-Build-Depends: debhelper (>= 9)
-Standards-Version: 3.9.4
-Homepage: http://kevin-gattaca.blogspot.de/2012/05/program-seqtk-for-sampling-trimming.html
-Vcs-Browser: http://svn.debian.org/wsvn/debian-med/trunk/packages/seqtk/trunk/
-Vcs-Svn: svn://svn.debian.org/debian-med/trunk/packages/seqtk/trunk/
-
-Package: seqtk
-Architecture: any
-Depends: ${shlibs:Depends}, ${misc:Depends}
-Description: sampling, trimming, fastq2fasta, subsequence, reverse complement
- Currently, seqtk supports quality based trimming with the phred
- algorithm, converting fastq to fasta, reverse complementing sequences,
- extracting or masking subsequences in regions given in a BED/name list
- file, and more. It contains a subsampling module to sample exactly n
- sequences or a fraction of sequences.
- .
- Seqtk supports both fasta and fastq input files, which can be
- optionally gzip compressed.
-
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index 1d59bcf..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,32 +0,0 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: seqtk
-Source: https://github.com/lh3/seqtk
-
-Files: *
-Copyright: © 2008-2012 Heng Li <lh3 at me.com>
-License: MIT
-
-Files: debian/*
-Copyright: © 2012 Andreas Tille <tille at debian.org>
-License: MIT
-
-License: MIT
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- .
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
- .
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
diff --git a/debian/get-orig-source b/debian/get-orig-source
deleted file mode 100755
index 46e5f86..0000000
--- a/debian/get-orig-source
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-echo "Need to ask upstream for real release. Meanwhile you can do:"
-echo " git clone https://github.com/lh3/seqtk.git"
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index e8f077e..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/make -f
-
-# DH_VERBOSE := 1
-
-%:
- dh $@
-
-#get-orig-source:
-# . debian/get-orig-source
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 4f7ce78..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,3 +0,0 @@
-# version=3
-# Only at
-# https://github.com/lh3/seqtk.git
diff --git a/khash.h b/khash.h
new file mode 100644
index 0000000..1a28e11
--- /dev/null
+++ b/khash.h
@@ -0,0 +1,548 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/*
+ An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+ int ret, is_missing;
+ khiter_t k;
+ khash_t(32) *h = kh_init(32);
+ k = kh_put(32, h, 5, &ret);
+ kh_value(h, k) = 10;
+ k = kh_get(32, h, 10);
+ is_missing = (k == kh_end(h));
+ k = kh_get(32, h, 5);
+ kh_del(32, h, k);
+ for (k = kh_begin(h); k != kh_end(h); ++k)
+ if (kh_exist(h, k)) kh_value(h, k) = 1;
+ kh_destroy(32, h);
+ return 0;
+}
+*/
+
+/*
+ 2011-12-29 (0.2.7):
+
+ * Minor code clean up; no actual effect.
+
+ 2011-09-16 (0.2.6):
+
+ * The capacity is a power of 2. This seems to dramatically improve the
+ speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
+
+ - http://code.google.com/p/ulib/
+ - http://nothings.org/computer/judy/
+
+ * Allow to optionally use linear probing which usually has better
+ performance for random input. Double hashing is still the default as it
+ is more robust to certain non-random input.
+
+ * Added Wang's integer hash function (not used by default). This hash
+ function is more robust to certain non-random input.
+
+ 2011-02-14 (0.2.5):
+
+ * Allow to declare global functions.
+
+ 2009-09-26 (0.2.4):
+
+ * Improve portability
+
+ 2008-09-19 (0.2.3):
+
+ * Corrected the example
+ * Improved interfaces
+
+ 2008-09-11 (0.2.2):
+
+ * Improved speed a little in kh_put()
+
+ 2008-09-10 (0.2.1):
+
+ * Added kh_clear()
+ * Fixed a compiling error
+
+ 2008-09-02 (0.2.0):
+
+ * Changed to token concatenation which increases flexibility.
+
+ 2008-08-31 (0.1.2):
+
+ * Fixed a bug in kh_get(), which has not been tested previously.
+
+ 2008-08-31 (0.1.1):
+
+ * Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+ @header
+
+ Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.6"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+/* compipler specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifdef _MSC_VER
+#define inline __inline
+#endif
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+#ifdef KHASH_LINEAR
+#define __ac_inc(k, m) 1
+#else
+#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m)
+#endif
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+static const double __ac_HASH_UPPER = 0.77;
+
+#define __KHASH_TYPE(name, khkey_t, khval_t) \
+ typedef struct { \
+ khint_t n_buckets, size, n_occupied, upper_bound; \
+ khint32_t *flags; \
+ khkey_t *keys; \
+ khval_t *vals; \
+ } kh_##name##_t;
+
+#define KHASH_DECLARE(name, khkey_t, khval_t) \
+ __KHASH_TYPE(name, khkey_t, khval_t) \
+ extern kh_##name##_t *kh_init_##name(); \
+ extern void kh_destroy_##name(kh_##name##_t *h); \
+ extern void kh_clear_##name(kh_##name##_t *h); \
+ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
+ extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+ extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ __KHASH_TYPE(name, khkey_t, khval_t) \
+ SCOPE kh_##name##_t *kh_init_##name() { \
+ return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \
+ } \
+ SCOPE void kh_destroy_##name(kh_##name##_t *h) \
+ { \
+ if (h) { \
+ free(h->keys); free(h->flags); \
+ free(h->vals); \
+ free(h); \
+ } \
+ } \
+ SCOPE void kh_clear_##name(kh_##name##_t *h) \
+ { \
+ if (h && h->flags) { \
+ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
+ h->size = h->n_occupied = 0; \
+ } \
+ } \
+ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
+ { \
+ if (h->n_buckets) { \
+ khint_t inc, k, i, last, mask; \
+ mask = h->n_buckets - 1; \
+ k = __hash_func(key); i = k & mask; \
+ inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ i = (i + inc) & mask; \
+ if (i == last) return h->n_buckets; \
+ } \
+ return __ac_iseither(h->flags, i)? h->n_buckets : i; \
+ } else return 0; \
+ } \
+ SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+ { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+ khint32_t *new_flags = 0; \
+ khint_t j = 1; \
+ { \
+ kroundup32(new_n_buckets); \
+ if (new_n_buckets < 4) new_n_buckets = 4; \
+ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
+ else { /* hash table size to be changed (shrink or expand); rehash */ \
+ new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+ memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+ if (h->n_buckets < new_n_buckets) { /* expand */ \
+ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+ } /* otherwise shrink */ \
+ } \
+ } \
+ if (j) { /* rehashing is needed */ \
+ for (j = 0; j != h->n_buckets; ++j) { \
+ if (__ac_iseither(h->flags, j) == 0) { \
+ khkey_t key = h->keys[j]; \
+ khval_t val; \
+ khint_t new_mask; \
+ new_mask = new_n_buckets - 1; \
+ if (kh_is_map) val = h->vals[j]; \
+ __ac_set_isdel_true(h->flags, j); \
+ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+ khint_t inc, k, i; \
+ k = __hash_func(key); \
+ i = k & new_mask; \
+ inc = __ac_inc(k, new_mask); \
+ while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
+ __ac_set_isempty_false(new_flags, i); \
+ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+ __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+ } else { /* write the element and jump out of the loop */ \
+ h->keys[i] = key; \
+ if (kh_is_map) h->vals[i] = val; \
+ break; \
+ } \
+ } \
+ } \
+ } \
+ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+ } \
+ free(h->flags); /* free the working space */ \
+ h->flags = new_flags; \
+ h->n_buckets = new_n_buckets; \
+ h->n_occupied = h->size; \
+ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+ } \
+ } \
+ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+ { \
+ khint_t x; \
+ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+ if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \
+ else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \
+ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
+ { \
+ khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
+ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
+ else { \
+ inc = __ac_inc(k, mask); last = i; \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ if (__ac_isdel(h->flags, i)) site = i; \
+ i = (i + inc) & mask; \
+ if (i == last) { x = site; break; } \
+ } \
+ if (x == h->n_buckets) { \
+ if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+ else x = i; \
+ } \
+ } \
+ } \
+ if (__ac_isempty(h->flags, x)) { /* not present at all */ \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; ++h->n_occupied; \
+ *ret = 1; \
+ } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; \
+ *ret = 2; \
+ } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+ return x; \
+ } \
+ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
+ { \
+ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
+ __ac_set_isdel_true(h->flags, x); \
+ --h->size; \
+ } \
+ }
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+ @abstract Integer hash function
+ @param key The integer [khint32_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+ @abstract Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract 64-bit integer hash function
+ @param key The integer [khint64_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+ @abstract 64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract const char* hash function
+ @param s Pointer to a null terminated string
+ @return The hash value
+ */
+static inline khint_t __ac_X31_hash_string(const char *s)
+{
+ khint_t h = *s;
+ if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
+ return h;
+}
+/*! @function
+ @abstract Another interface to const char* hash function
+ @param key Pointer to a null terminated string [const char*]
+ @return The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+ @abstract Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+static inline khint_t __ac_Wang_hash(khint_t key)
+{
+ key += ~(key << 15);
+ key ^= (key >> 10);
+ key += (key << 3);
+ key ^= (key >> 6);
+ key += ~(key << 11);
+ key ^= (key >> 16);
+ return key;
+}
+#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+ @abstract Type of the hash table.
+ @param name Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+ @abstract Initiate a hash table.
+ @param name Name of the hash table [symbol]
+ @return Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+ @abstract Destroy a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+ @abstract Reset a hash table without deallocating memory.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+ @abstract Resize a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param s New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+ @abstract Insert a key to the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @param r Extra return code: 0 if the key is present in the hash table;
+ 1 if the bucket is empty (never used); 2 if the element in
+ the bucket has been deleted [int*]
+ @return Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+ @abstract Retrieve a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+ @abstract Remove a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+ @abstract Test whether a bucket contains data.
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return 1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+ @abstract Get key given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+ @abstract Get value given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Value [type of values]
+ @discussion For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Get the start iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+ @abstract Get the end iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+ @abstract Get the number of elements in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+ @abstract Get the number of buckets in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/* More conenient interfaces */
+
+/*! @function
+ @abstract Instantiate a hash set containing integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name) \
+ KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t) \
+ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name) \
+ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t) \
+ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+ @abstract Instantiate a hash map containing const char* keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name) \
+ KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing const char* keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t) \
+ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/kseq.h b/kseq.h
new file mode 100644
index 0000000..a5cec7c
--- /dev/null
+++ b/kseq.h
@@ -0,0 +1,235 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Last Modified: 05MAR2012 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB 1 // isspace() && !' '
+#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX 2
+
+#define __KS_TYPE(type_t) \
+ typedef struct __kstream_t { \
+ unsigned char *buf; \
+ int begin, end, is_eof; \
+ type_t f; \
+ } kstream_t;
+
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+#define __KS_BASIC(type_t, __bufsize) \
+ static inline kstream_t *ks_init(type_t f) \
+ { \
+ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
+ ks->f = f; \
+ ks->buf = (unsigned char*)malloc(__bufsize); \
+ return ks; \
+ } \
+ static inline void ks_destroy(kstream_t *ks) \
+ { \
+ if (ks) { \
+ free(ks->buf); \
+ free(ks); \
+ } \
+ }
+
+#define __KS_GETC(__read, __bufsize) \
+ static inline int ks_getc(kstream_t *ks) \
+ { \
+ if (ks->is_eof && ks->begin >= ks->end) return -1; \
+ if (ks->begin >= ks->end) { \
+ ks->begin = 0; \
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
+ if (ks->end < __bufsize) ks->is_eof = 1; \
+ if (ks->end == 0) return -1; \
+ } \
+ return (int)ks->buf[ks->begin++]; \
+ }
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#define __KS_GETUNTIL(__read, __bufsize) \
+ static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
+ { \
+ if (dret) *dret = 0; \
+ str->l = append? str->l : 0; \
+ if (ks->begin >= ks->end && ks->is_eof) return -1; \
+ for (;;) { \
+ int i; \
+ if (ks->begin >= ks->end) { \
+ if (!ks->is_eof) { \
+ ks->begin = 0; \
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
+ if (ks->end < __bufsize) ks->is_eof = 1; \
+ if (ks->end == 0) break; \
+ } else break; \
+ } \
+ if (delimiter == KS_SEP_LINE) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (ks->buf[i] == '\n') break; \
+ } else if (delimiter > KS_SEP_MAX) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (ks->buf[i] == delimiter) break; \
+ } else if (delimiter == KS_SEP_SPACE) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i])) break; \
+ } else if (delimiter == KS_SEP_TAB) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+ } else i = 0; /* never come to here! */ \
+ if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
+ str->m = str->l + (i - ks->begin) + 1; \
+ kroundup32(str->m); \
+ str->s = (char*)realloc(str->s, str->m); \
+ } \
+ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+ str->l = str->l + (i - ks->begin); \
+ ks->begin = i + 1; \
+ if (i < ks->end) { \
+ if (dret) *dret = ks->buf[i]; \
+ break; \
+ } \
+ } \
+ if (str->s == 0) { \
+ str->m = 1; \
+ str->s = (char*)calloc(1, 1); \
+ } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
+ str->s[str->l] = '\0'; \
+ return str->l; \
+ } \
+ static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+ { return ks_getuntil2(ks, delimiter, str, dret, 0); }
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) \
+ __KS_TYPE(type_t) \
+ __KS_BASIC(type_t, __bufsize) \
+ __KS_GETC(__read, __bufsize) \
+ __KS_GETUNTIL(__read, __bufsize)
+
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+#define __KSEQ_BASIC(SCOPE, type_t) \
+ SCOPE kseq_t *kseq_init(type_t fd) \
+ { \
+ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
+ s->f = ks_init(fd); \
+ return s; \
+ } \
+ SCOPE void kseq_destroy(kseq_t *ks) \
+ { \
+ if (!ks) return; \
+ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
+ ks_destroy(ks->f); \
+ free(ks); \
+ }
+
+/* Return value:
+ >=0 length of the sequence (normal)
+ -1 end-of-file
+ -2 truncated quality string
+ */
+#define __KSEQ_READ(SCOPE) \
+ SCOPE int kseq_read(kseq_t *seq) \
+ { \
+ int c; \
+ kstream_t *ks = seq->f; \
+ if (seq->last_char == 0) { /* then jump to the next header line */ \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+ if (c == -1) return -1; /* end of file */ \
+ seq->last_char = c; \
+ } /* else: the first header char has been read in the previous call */ \
+ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+ seq->seq.m = 256; \
+ seq->seq.s = (char*)malloc(seq->seq.m); \
+ } \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+ if (c == '\n') continue; /* skip empty lines */ \
+ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+ } \
+ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
+ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+ seq->seq.m = seq->seq.l + 2; \
+ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+ } \
+ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
+ if (c != '+') return seq->seq.l; /* FASTA */ \
+ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
+ seq->qual.m = seq->seq.m; \
+ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+ } \
+ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+ if (c == -1) return -2; /* error: no quality string */ \
+ while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
+ seq->last_char = 0; /* we have not come to the next header line */ \
+ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+ return seq->seq.l; \
+ }
+
+#define __KSEQ_TYPE(type_t) \
+ typedef struct { \
+ kstring_t name, comment, seq, qual; \
+ int last_char; \
+ kstream_t *f; \
+ } kseq_t;
+
+#define KSEQ_INIT2(SCOPE, type_t, __read) \
+ KSTREAM_INIT(type_t, __read, 16384) \
+ __KSEQ_TYPE(type_t) \
+ __KSEQ_BASIC(SCOPE, type_t) \
+ __KSEQ_READ(SCOPE)
+
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+#define KSEQ_DECLARE(type_t) \
+ __KS_TYPE(type_t) \
+ __KSEQ_TYPE(type_t) \
+ extern kseq_t *kseq_init(type_t fd); \
+ void kseq_destroy(kseq_t *ks); \
+ int kseq_read(kseq_t *seq);
+
+#endif
diff --git a/seqtk.c b/seqtk.c
new file mode 100644
index 0000000..9cbcfad
--- /dev/null
+++ b/seqtk.c
@@ -0,0 +1,1075 @@
+/* The MIT License
+
+ Copyright (c) 20082-2012 by Heng Li <lh3 at me.com>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <zlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <limits.h>
+#include <math.h>
+
+#include "kseq.h"
+KSEQ_INIT(gzFile, gzread)
+
+typedef struct {
+ int n, m;
+ uint64_t *a;
+} reglist_t;
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(reg, reglist_t)
+
+typedef kh_reg_t reghash_t;
+
+reghash_t *stk_reg_read(const char *fn)
+{
+ reghash_t *h = kh_init(reg);
+ gzFile fp;
+ kstream_t *ks;
+ int dret;
+ kstring_t *str;
+ // read the list
+ str = calloc(1, sizeof(kstring_t));
+ fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+ int beg = -1, end = -1;
+ reglist_t *p;
+ khint_t k = kh_get(reg, h, str->s);
+ if (k == kh_end(h)) {
+ int ret;
+ char *s = strdup(str->s);
+ k = kh_put(reg, h, s, &ret);
+ memset(&kh_val(h, k), 0, sizeof(reglist_t));
+ }
+ p = &kh_val(h, k);
+ if (dret != '\n') {
+ if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
+ beg = atoi(str->s);
+ if (dret != '\n') {
+ if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
+ end = atoi(str->s);
+ if (end < 0) end = -1;
+ }
+ }
+ }
+ }
+ // skip the rest of the line
+ if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
+ if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
+ if (beg < 0) beg = 0, end = INT_MAX;
+ if (p->n == p->m) {
+ p->m = p->m? p->m<<1 : 4;
+ p->a = realloc(p->a, p->m * 8);
+ }
+ p->a[p->n++] = (uint64_t)beg<<32 | end;
+ }
+ ks_destroy(ks);
+ gzclose(fp);
+ free(str->s); free(str);
+ return h;
+}
+
+void stk_reg_destroy(reghash_t *h)
+{
+ khint_t k;
+ if (h == 0) return;
+ for (k = 0; k < kh_end(h); ++k) {
+ if (kh_exist(h, k)) {
+ free(kh_val(h, k).a);
+ free((char*)kh_key(h, k));
+ }
+ }
+ kh_destroy(reg, h);
+}
+
+/* constant table */
+
+unsigned char seq_nt16_table[256] = {
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15 /*'-'*/,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+ 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15,
+ 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+ 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+ 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
+};
+
+char *seq_nt16_rev_table = "XACMGRSVTWYHKDBN";
+unsigned char seq_nt16to4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+unsigned char seq_nt16comp_table[] = { 0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 };
+int bitcnt_table[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
+char comp_tab[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O',
+ 'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z', 91, 92, 93, 94, 95,
+ 64, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o',
+ 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127
+};
+
+static void stk_printstr(const kstring_t *s, unsigned line_len)
+{
+ if (line_len != UINT_MAX) {
+ int i, rest = s->l;
+ for (i = 0; i < s->l; i += line_len, rest -= line_len) {
+ putchar('\n');
+ if (rest > line_len) fwrite(s->s + i, 1, line_len, stdout);
+ else fwrite(s->s + i, 1, rest, stdout);
+ }
+ putchar('\n');
+ } else {
+ putchar('\n');
+ puts(s->s);
+ }
+}
+
+void stk_printseq(const kseq_t *s, int line_len)
+{
+ putchar(s->qual.l? '@' : '>');
+ fputs(s->name.s, stdout);
+ if (s->comment.l) {
+ putchar(' '); fputs(s->comment.s, stdout);
+ }
+ stk_printstr(&s->seq, line_len);
+ if (s->qual.l) {
+ putchar('+');
+ stk_printstr(&s->qual, line_len);
+ }
+}
+
+/* quality based trimming with Mott's algorithm */
+int stk_trimfq(int argc, char *argv[])
+{ // FIXME: when a record with zero length will always be treated as a fasta record
+ gzFile fp;
+ kseq_t *seq;
+ double param = 0.05, q_int2real[128];
+ int i, c, min_len = 30, left = 0, right = 0;
+ while ((c = getopt(argc, argv, "l:q:b:e:")) >= 0) {
+ switch (c) {
+ case 'q': param = atof(optarg); break;
+ case 'l': min_len = atoi(optarg); break;
+ case 'b': left = atoi(optarg); break;
+ case 'e': right = atoi(optarg); break;
+ }
+ }
+ if (optind == argc) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: seqtk trimfq [options] <in.fq>\n\n");
+ fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e) [%.2f]\n", param);
+ fprintf(stderr, " -l INT maximally trim down to INT bp (disabled by -b/-e) [%d]\n", min_len);
+ fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-l) [0]\n");
+ fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-l) [0]\n");
+ fprintf(stderr, "\n");
+ return 1;
+ }
+ fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+ seq = kseq_init(fp);
+ for (i = 0; i < 128; ++i)
+ q_int2real[i] = pow(10., -(i - 33) / 10.);
+ while (kseq_read(seq) >= 0) {
+ int beg, tmp, end;
+ double s, max;
+ if (left || right) {
+ beg = left; end = seq->seq.l - right;
+ if (beg >= end) beg = end = 0;
+ } else if (seq->qual.l > min_len) {
+ for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) {
+ int q = seq->qual.s[i];
+ if (q < 36) q = 36;
+ if (q > 127) q = 127;
+ s += param - q_int2real[q];
+ if (s > max) max = s, beg = tmp, end = i + 1;
+ if (s < 0) s = 0, tmp = i + 1;
+ }
+ if (end - beg < min_len) { // window-based
+ int is, imax;
+ for (i = 0, is = 0; i < min_len; ++i)
+ is += seq->qual.s[i] - 33;
+ for (imax = is, beg = 0; i < seq->qual.l; ++i) {
+ is += (int)seq->qual.s[i] - seq->qual.s[i - min_len];
+ if (imax < is) imax = is, beg = i - min_len + 1;
+ }
+ end = beg + min_len;
+ }
+ } else beg = 0, end = seq->seq.l;
+ putchar(seq->qual.l? '@' : '>'); fputs(seq->name.s, stdout);
+ if (seq->comment.l) {
+ putchar(' '); puts(seq->comment.s);
+ } else putchar('\n');
+ fwrite(seq->seq.s + beg, 1, end - beg, stdout); putchar('\n');
+ if (seq->qual.l) {
+ puts("+");
+ fwrite(seq->qual.s + beg, 1, end - beg, stdout); putchar('\n');
+ }
+ }
+ kseq_destroy(seq);
+ gzclose(fp);
+ return 0;
+}
+
+/* composition */
+int stk_comp(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq;
+ int l, c, upper_only = 0;
+ reghash_t *h = 0;
+ reglist_t dummy;
+ while ((c = getopt(argc, argv, "ur:")) >= 0) {
+ switch (c) {
+ case 'u': upper_only = 1; break;
+ case 'r': h = stk_reg_read(optarg); break;
+ }
+ }
+ if (argc == optind) {
+ fprintf(stderr, "Usage: seqtk comp [-u] [-r in.bed] <in.fa>\n\n");
+ fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n");
+ return 1;
+ }
+ fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+ seq = kseq_init(fp);
+ dummy.n= dummy.m = 1; dummy.a = calloc(1, 8);
+ while ((l = kseq_read(seq)) >= 0) {
+ int i, k;
+ reglist_t *p = 0;
+ if (h) {
+ khint_t k = kh_get(reg, h, seq->name.s);
+ if (k != kh_end(h)) p = &kh_val(h, k);
+ } else {
+ p = &dummy;
+ dummy.a[0] = l;
+ }
+ for (k = 0; p && k < p->n; ++k) {
+ int beg = p->a[k]>>32, end = p->a[k]&0xffffffff;
+ int la, lb, lc, na, nb, nc, cnt[11];
+ if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb];
+ else la = 'a', lb = -1, lc = 0;
+ na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb];
+ memset(cnt, 0, 11 * sizeof(int));
+ for (i = beg; i < end; ++i) {
+ int is_CpG = 0, a, b, c;
+ a = na; b = nb; c = nc;
+ na = seq->seq.s[i+1]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb];
+ if (b == 2 || b == 10) { // C or Y
+ if (nb == 4 || nb == 5) is_CpG = 1;
+ } else if (b == 4 || b == 5) { // G or R
+ if (lb == 2 || lb == 10) is_CpG = 1;
+ }
+ if (upper_only == 0 || isupper(a)) {
+ if (c > 1) ++cnt[c+2];
+ if (c == 1) ++cnt[seq_nt16to4_table[b]];
+ if (b == 10 || b == 5) ++cnt[9];
+ else if (c == 2) {
+ ++cnt[8];
+ }
+ if (is_CpG) {
+ ++cnt[7];
+ if (b == 10 || b == 5) ++cnt[10];
+ }
+ }
+ la = a; lb = b; lc = c;
+ }
+ if (h) printf("%s\t%d\t%d", seq->name.s, beg, end);
+ else printf("%s\t%d", seq->name.s, l);
+ for (i = 0; i < 11; ++i) printf("\t%d", cnt[i]);
+ putchar('\n');
+ }
+ fflush(stdout);
+ }
+ free(dummy.a);
+ kseq_destroy(seq);
+ gzclose(fp);
+ return 0;
+}
+
+int stk_randbase(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq;
+ int l;
+ if (argc == 1) {
+ fprintf(stderr, "Usage: seqtk randbase <in.fa>\n");
+ return 1;
+ }
+ fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r");
+ seq = kseq_init(fp);
+ while ((l = kseq_read(seq)) >= 0) {
+ int i;
+ printf(">%s", seq->name.s);
+ for (i = 0; i < l; ++i) {
+ int c, b, a, j, k, m;
+ b = seq->seq.s[i];
+ c = seq_nt16_table[b];
+ a = bitcnt_table[c];
+ if (a == 2) {
+ m = (drand48() < 0.5);
+ for (j = k = 0; j < 4; ++j) {
+ if ((1<<j & c) == 0) continue;
+ if (k == m) break;
+ ++k;
+ }
+ seq->seq.s[i] = islower(b)? "acgt"[j] : "ACGT"[j];
+ }
+ if (i%60 == 0) putchar('\n');
+ putchar(seq->seq.s[i]);
+ }
+ putchar('\n');
+ }
+ kseq_destroy(seq);
+ gzclose(fp);
+ return 0;
+}
+
+int stk_hety(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq;
+ int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0;
+ char *buf;
+ uint32_t cnt[3];
+ if (argc == 1) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: seqtk hety [options] <in.fa>\n\n");
+ fprintf(stderr, "Options: -w INT window size [%d]\n", win_size);
+ fprintf(stderr, " -t INT # start positions in a window [%d]\n", n_start);
+ fprintf(stderr, " -m treat lowercases as masked\n");
+ fprintf(stderr, "\n");
+ return 1;
+ }
+ while ((c = getopt(argc, argv, "w:t:m")) >= 0) {
+ switch (c) {
+ case 'w': win_size = atoi(optarg); break;
+ case 't': n_start = atoi(optarg); break;
+ case 'm': is_lower_mask = 1; break;
+ }
+ }
+ fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+ seq = kseq_init(fp);
+ win_step = win_size / n_start;
+ buf = calloc(win_size, 1);
+ while ((l = kseq_read(seq)) >= 0) {
+ int x, i, y, z, next = 0;
+ cnt[0] = cnt[1] = cnt[2] = 0;
+ for (i = 0; i <= l; ++i) {
+ if ((i >= win_size && i % win_step == 0) || i == l) {
+ if (i == l && l >= win_size) {
+ for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]];
+ }
+ if (cnt[1] + cnt[2] > 0)
+ printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i,
+ (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]);
+ next = i;
+ }
+ if (i < l) {
+ y = i % win_size;
+ c = seq->seq.s[i];
+ if (is_lower_mask && islower(c)) c = 'N';
+ c = seq_nt16_table[c];
+ x = bitcnt_table[c];
+ if (i >= win_size) --cnt[(int)buf[y]];
+ buf[y] = z = x > 2? 0 : x == 2? 2 : 1;
+ ++cnt[z];
+ }
+ }
+ }
+ free(buf);
+ kseq_destroy(seq);
+ gzclose(fp);
+ return 0;
+}
+
+/* subseq */
+
+int stk_subseq(int argc, char *argv[])
+{
+ khash_t(reg) *h = kh_init(reg);
+ gzFile fp;
+ kseq_t *seq;
+ int l, i, j, c, is_tab = 0, line = 1024;
+ khint_t k;
+ while ((c = getopt(argc, argv, "tl:")) >= 0) {
+ switch (c) {
+ case 't': is_tab = 1; break;
+ case 'l': line = atoi(optarg); break;
+ }
+ }
+ if (optind + 2 > argc) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: seqtk subseq [options] <in.fa> <in.bed>|<name.list>\n\n");
+ fprintf(stderr, "Options: -t TAB delimited output\n");
+ fprintf(stderr, " -l INT sequence line length [%d]\n\n", line);
+ fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n\n");
+ return 1;
+ }
+ h = stk_reg_read(argv[optind+1]);
+ // subseq
+ fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+ seq = kseq_init(fp);
+ while ((l = kseq_read(seq)) >= 0) {
+ reglist_t *p;
+ k = kh_get(reg, h, seq->name.s);
+ if (k == kh_end(h)) continue;
+ p = &kh_val(h, k);
+ for (i = 0; i < p->n; ++i) {
+ int beg = p->a[i]>>32, end = p->a[i];
+ if (beg >= seq->seq.l) {
+ fprintf(stderr, "[subseq] %s: %d >= %ld\n", seq->name.s, beg, seq->seq.l);
+ continue;
+ }
+ if (end > seq->seq.l) end = seq->seq.l;
+ if (is_tab == 0) {
+ printf("%c%s", seq->qual.l == seq->seq.l? '@' : '>', seq->name.s);
+ if (beg > 0 || (int)p->a[i] != INT_MAX) {
+ if (end == INT_MAX) {
+ if (beg) printf(":%d", beg+1);
+ } else printf(":%d-%d", beg+1, end);
+ }
+ } else printf("%s\t%d\t", seq->name.s, beg + 1);
+ if (end > seq->seq.l) end = seq->seq.l;
+ for (j = 0; j < end - beg; ++j) {
+ if (is_tab == 0 && j % line == 0) putchar('\n');
+ putchar(seq->seq.s[j + beg]);
+ }
+ putchar('\n');
+ if (seq->qual.l != seq->seq.l || is_tab) continue;
+ printf("+");
+ for (j = 0; j < end - beg; ++j) {
+ if (j % line == 0) putchar('\n');
+ putchar(seq->qual.s[j + beg]);
+ }
+ putchar('\n');
+ }
+ }
+ // free
+ kseq_destroy(seq);
+ gzclose(fp);
+ stk_reg_destroy(h);
+ return 0;
+}
+
+/* mergefa */
+int stk_mergefa(int argc, char *argv[])
+{
+ gzFile fp[2];
+ kseq_t *seq[2];
+ int i, l, c, is_intersect = 0, is_haploid = 0, qual = 0, is_mask = 0, is_randhet = 0;
+ uint64_t cnt[5];
+ while ((c = getopt(argc, argv, "himrq:")) >= 0) {
+ switch (c) {
+ case 'i': is_intersect = 1; break;
+ case 'h': is_haploid = 1; break;
+ case 'm': is_mask = 1; break;
+ case 'r': is_randhet = 1; break;
+ case 'q': qual = atoi(optarg); break;
+ }
+ }
+ if (is_mask && is_intersect) {
+ fprintf(stderr, "[%s] `-i' and `-h' cannot be applied at the same time.\n", __func__);
+ return 1;
+ }
+ if (optind + 2 > argc) {
+ fprintf(stderr, "\nUsage: seqtk mergefa [options] <in1.fa> <in2.fa>\n\n");
+ fprintf(stderr, "Options: -q INT quality threshold [0]\n");
+ fprintf(stderr, " -i take intersection\n");
+ fprintf(stderr, " -m convert to lowercase when one of the input base is N\n");
+ fprintf(stderr, " -r pick a random allele from het\n");
+ fprintf(stderr, " -h suppress hets in the input\n\n");
+ return 1;
+ }
+ for (i = 0; i < 2; ++i) {
+ fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r");
+ seq[i] = kseq_init(fp[i]);
+ }
+ cnt[0] = cnt[1] = cnt[2] = cnt[3] = cnt[4] = 0;
+ srand48(11);
+ while (kseq_read(seq[0]) >= 0) {
+ int min_l, c[2], b[2], is_upper;
+ kseq_read(seq[1]);
+ if (strcmp(seq[0]->name.s, seq[1]->name.s))
+ fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s);
+ if (seq[0]->seq.l != seq[1]->seq.l)
+ fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l);
+ min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l;
+ printf(">%s", seq[0]->name.s);
+ for (l = 0; l < min_l; ++l) {
+ c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l];
+ if (seq[0]->qual.l && seq[0]->qual.s[l] - 33 < qual) c[0] = tolower(c[0]);
+ if (seq[1]->qual.l && seq[1]->qual.s[l] - 33 < qual) c[1] = tolower(c[1]);
+ if (is_intersect) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0;
+ else if (is_mask) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0;
+ else is_upper = (isupper(c[0]) && isupper(c[1]))? 1 : 0;
+ c[0] = seq_nt16_table[c[0]]; c[1] = seq_nt16_table[c[1]];
+ if (c[0] == 0) c[0] = 15;
+ if (c[1] == 0) c[1] = 15;
+ b[0] = bitcnt_table[c[0]];
+ b[1] = bitcnt_table[c[1]];
+ if (is_upper) {
+ if (b[0] == 1 && b[1] == 1) {
+ if (c[0] == c[1]) ++cnt[0];
+ else ++cnt[1];
+ } else if (b[0] == 1 && b[1] == 2) ++cnt[2];
+ else if (b[0] == 2 && b[1] == 1) ++cnt[3];
+ else if (b[0] == 2 && b[1] == 2) ++cnt[4];
+ }
+ if (is_haploid && (b[0] > 1 || b[1] > 1)) is_upper = 0;
+ if (is_intersect) {
+ c[0] = c[0] & c[1];
+ if (c[0] == 0) is_upper = 0; // FIXME: is this a bug - c[0] cannot be 0!
+ } else if (is_mask) {
+ if (c[0] == 15 || c[1] == 15) is_upper = 0;
+ c[0] &= c[1];
+ if (c[0] == 0) is_upper = 0;
+ } else if (is_randhet) {
+ if (b[0] == 1 && b[1] == 1) { // two homs
+ c[0] |= c[1];
+ } else if (((b[0] == 1 && b[1] == 2) || (b[0] == 2 && b[1] == 1)) && (c[0]&c[1])) { // one hom, one het
+ c[0] = (lrand48()&1)? (c[0] & c[1]) : (c[0] | c[1]);
+ } else if (b[0] == 2 && b[1] == 2 && c[0] == c[1]) { // double hets
+ if (lrand48()&1) {
+ if (lrand48()&1) {
+ for (i = 8; i >= 1; i >>= 1) // pick the "larger" allele
+ if (c[0]&i) c[0] &= i;
+ } else {
+ for (i = 1; i <= 8; i <<= 1) // pick the "smaller" allele
+ if (c[0]&i) c[0] &= i;
+ }
+ } // else set as het
+ } else is_upper = 0;
+ } else c[0] |= c[1];
+ c[0] = seq_nt16_rev_table[c[0]];
+ if (!is_upper) c[0] = tolower(c[0]);
+ if (l%60 == 0) putchar('\n');
+ putchar(c[0]);
+ }
+ putchar('\n');
+ }
+ fprintf(stderr, "[%s] (same,diff,hom-het,het-hom,het-het)=(%ld,%ld,%ld,%ld,%ld)\n", __func__, (long)cnt[0], (long)cnt[1], (long)cnt[2], (long)cnt[3], (long)cnt[4]);
+ return 0;
+}
+
+int stk_famask(int argc, char *argv[])
+{
+ gzFile fp[2];
+ kseq_t *seq[2];
+ int i, l;
+ if (argc < 3) {
+ fprintf(stderr, "Usage: seqtk famask <src.fa> <mask.fa>\n");
+ return 1;
+ }
+ for (i = 0; i < 2; ++i) {
+ fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r");
+ seq[i] = kseq_init(fp[i]);
+ }
+ while (kseq_read(seq[0]) >= 0) {
+ int min_l, c[2];
+ kseq_read(seq[1]);
+ if (strcmp(seq[0]->name.s, seq[1]->name.s))
+ fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s);
+ if (seq[0]->seq.l != seq[1]->seq.l)
+ fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l);
+ min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l;
+ printf(">%s", seq[0]->name.s);
+ for (l = 0; l < min_l; ++l) {
+ c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l];
+ if (c[1] == 'x') c[0] = tolower(c[0]);
+ else if (c[1] != 'X') c[0] = c[1];
+ if (l%60 == 0) putchar('\n');
+ putchar(c[0]);
+ }
+ putchar('\n');
+ }
+ return 0;
+}
+
+int stk_mutfa(int argc, char *argv[])
+{
+ khash_t(reg) *h = kh_init(reg);
+ gzFile fp;
+ kseq_t *seq;
+ kstream_t *ks;
+ int l, i, dret;
+ kstring_t *str;
+ khint_t k;
+ if (argc < 3) {
+ fprintf(stderr, "Usage: seqtk mutfa <in.fa> <in.snp>\n\n");
+ fprintf(stderr, "Note: <in.snp> contains at least four columns per line which are:\n");
+ fprintf(stderr, " 'chr 1-based-pos any base-changed-to'.\n");
+ return 1;
+ }
+ // read the list
+ str = calloc(1, sizeof(kstring_t));
+ fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+ char *s = strdup(str->s);
+ int beg = 0, ret;
+ reglist_t *p;
+ k = kh_get(reg, h, s);
+ if (k == kh_end(h)) {
+ k = kh_put(reg, h, s, &ret);
+ memset(&kh_val(h, k), 0, sizeof(reglist_t));
+ }
+ p = &kh_val(h, k);
+ if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col
+ ks_getuntil(ks, 0, str, &dret); // 3rd col
+ ks_getuntil(ks, 0, str, &dret); // 4th col
+ // skip the rest of the line
+ if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
+ if (isalpha(str->s[0]) && str->l == 1) {
+ if (p->n == p->m) {
+ p->m = p->m? p->m<<1 : 4;
+ p->a = realloc(p->a, p->m * 8);
+ }
+ p->a[p->n++] = (uint64_t)beg<<32 | str->s[0];
+ }
+ }
+ ks_destroy(ks);
+ gzclose(fp);
+ free(str->s); free(str);
+ // mutfa
+ fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
+ seq = kseq_init(fp);
+ while ((l = kseq_read(seq)) >= 0) {
+ reglist_t *p;
+ k = kh_get(reg, h, seq->name.s);
+ if (k != kh_end(h)) {
+ p = &kh_val(h, k);
+ for (i = 0; i < p->n; ++i) {
+ int beg = p->a[i]>>32;
+ if (beg < seq->seq.l)
+ seq->seq.s[beg] = (int)p->a[i];
+ }
+ }
+ printf(">%s", seq->name.s);
+ for (i = 0; i < l; ++i) {
+ if (i%60 == 0) putchar('\n');
+ putchar(seq->seq.s[i]);
+ }
+ putchar('\n');
+ }
+ // free
+ kseq_destroy(seq);
+ gzclose(fp);
+ for (k = 0; k < kh_end(h); ++k) {
+ if (kh_exist(h, k)) {
+ free(kh_val(h, k).a);
+ free((char*)kh_key(h, k));
+ }
+ }
+ kh_destroy(reg, h);
+ return 0;
+}
+
+int stk_listhet(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq;
+ int i, l;
+ if (argc == 1) {
+ fprintf(stderr, "Usage: seqtk listhet <in.fa>\n");
+ return 1;
+ }
+ fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r");
+ seq = kseq_init(fp);
+ while ((l = kseq_read(seq)) >= 0) {
+ for (i = 0; i < l; ++i) {
+ int b = seq->seq.s[i];
+ if (bitcnt_table[seq_nt16_table[b]] == 2)
+ printf("%s\t%d\t%c\n", seq->name.s, i+1, b);
+ }
+ }
+ kseq_destroy(seq);
+ gzclose(fp);
+ return 0;
+}
+
+/* cutN */
+static int cutN_min_N_tract = 1000;
+static int cutN_nonN_penalty = 10;
+
+static int find_next_cut(const kseq_t *ks, int k, int *begin, int *end)
+{
+ int i, b, e;
+ while (k < ks->seq.l) {
+ if (seq_nt16_table[(int)ks->seq.s[k]] == 15) {
+ int score, max;
+ score = 0; e = max = -1;
+ for (i = k; i < ks->seq.l && score >= 0; ++i) { /* forward */
+ if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score;
+ else score -= cutN_nonN_penalty;
+ if (score > max) max = score, e = i;
+ }
+ score = 0; b = max = -1;
+ for (i = e; i >= 0 && score >= 0; --i) { /* backward */
+ if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score;
+ else score -= cutN_nonN_penalty;
+ if (score > max) max = score, b = i;
+ }
+ if (e + 1 - b >= cutN_min_N_tract) {
+ *begin = b;
+ *end = e + 1;
+ return *end;
+ }
+ k = e + 1;
+ } else ++k;
+ }
+ return -1;
+}
+static void print_seq(FILE *fpout, const kseq_t *ks, int begin, int end)
+{
+ int i;
+ if (begin >= end) return; // FIXME: why may this happen? Understand it!
+ fprintf(fpout, ">%s:%d-%d", ks->name.s, begin+1, end);
+ for (i = begin; i < end && i < ks->seq.l; ++i) {
+ if ((i - begin)%60 == 0) fputc('\n', fpout);
+ fputc(ks->seq.s[i], fpout);
+ }
+ fputc('\n', fpout);
+}
+int stk_cutN(int argc, char *argv[])
+{
+ int c, l, gap_only = 0;
+ gzFile fp;
+ kseq_t *ks;
+ while ((c = getopt(argc, argv, "n:p:g")) >= 0) {
+ switch (c) {
+ case 'n': cutN_min_N_tract = atoi(optarg); break;
+ case 'p': cutN_nonN_penalty = atoi(optarg); break;
+ case 'g': gap_only = 1; break;
+ default: return 1;
+ }
+ }
+ if (argc == optind) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: seqtk cutN [options] <in.fa>\n\n");
+ fprintf(stderr, "Options: -n INT min size of N tract [%d]\n", cutN_min_N_tract);
+ fprintf(stderr, " -p INT penalty for a non-N [%d]\n", cutN_nonN_penalty);
+ fprintf(stderr, " -g print gaps only, no sequence\n\n");
+ return 1;
+ }
+ fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+ ks = kseq_init(fp);
+ while ((l = kseq_read(ks)) >= 0) {
+ int k = 0, begin = 0, end = 0;
+ while (find_next_cut(ks, k, &begin, &end) >= 0) {
+ if (begin != 0) {
+ if (gap_only) printf("%s\t%d\t%d\n", ks->name.s, begin, end);
+ else print_seq(stdout, ks, k, begin);
+ }
+ k = end;
+ }
+ if (!gap_only) print_seq(stdout, ks, k, l);
+ }
+ kseq_destroy(ks);
+ gzclose(fp);
+ return 0;
+}
+
+int stk_hrun(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *ks;
+ int min_len = 7, l = 0, c = 0, beg = 0, i;
+ if (argc == optind) {
+ fprintf(stderr, "Usage: seqtk hrun <in.fa> [minLen=%d]\n", min_len);
+ return 1;
+ }
+ if (argc == optind + 2) min_len = atoi(argv[optind+1]);
+ fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+ ks = kseq_init(fp);
+ while (kseq_read(ks) >= 0) {
+ c = ks->seq.s[0]; l = 1; beg = 0;
+ for (i = 1; i < ks->seq.l; ++i) {
+ if (ks->seq.s[i] != c) {
+ if (l >= min_len) printf("%s\t%d\t%d\t%c\n", ks->name.s, beg, beg + l, c);
+ c = ks->seq.s[i]; l = 1; beg = i;
+ } else ++l;
+ }
+ }
+ if (l >= min_len) printf("%s\t%d\t%d\t%c\n", ks->name.s, beg, beg + l, c);
+ kseq_destroy(ks);
+ gzclose(fp);
+ return 0;
+}
+
+/* sample */
+
+static void cpy_kstr(kstring_t *dst, const kstring_t *src)
+{
+ if (src->l == 0) return;
+ if (src->l + 1 > dst->m) {
+ dst->m = src->l + 1;
+ kroundup32(dst->m);
+ dst->s = realloc(dst->s, dst->m);
+ }
+ dst->l = src->l;
+ memcpy(dst->s, src->s, src->l + 1);
+}
+
+static void cpy_kseq(kseq_t *dst, const kseq_t *src)
+{
+ cpy_kstr(&dst->name, &src->name);
+ cpy_kstr(&dst->seq, &src->seq);
+ cpy_kstr(&dst->qual, &src->qual);
+}
+
+int stk_sample(int argc, char *argv[])
+{
+ int c;
+ uint64_t i, num = 0, n_seqs = 0;
+ double frac = 0.;
+ gzFile fp;
+ kseq_t *seq, *buf = 0;
+
+ srand48(11);
+ while ((c = getopt(argc, argv, "s:")) >= 0)
+ switch (c) {
+ case 's': srand48(atoi(optarg)); break;
+ }
+ if (optind + 2 > argc) {
+ fprintf(stderr, "Usage: seqtk sample [-s seed=11] <in.fa> <frac>|<number>\n\n");
+ fprintf(stderr, "Warning: Large memory consumption for large <number>.\n");
+ return 1;
+ }
+ frac = atof(argv[optind+1]);
+ if (frac > 1.) num = (uint64_t)(frac + .499), frac = 0.;
+ if (num > 0) buf = calloc(num, sizeof(kseq_t));
+
+ fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+ seq = kseq_init(fp);
+ while (kseq_read(seq) >= 0) {
+ double r = drand48();
+ ++n_seqs;
+ if (num) {
+ uint64_t y = n_seqs - 1 < num? n_seqs - 1 : (uint64_t)(r * n_seqs);
+ if (y < num) cpy_kseq(&buf[y], seq);
+ } else if (r < frac) stk_printseq(seq, UINT_MAX);
+ }
+ kseq_destroy(seq);
+ gzclose(fp);
+ for (i = 0; i < num; ++i) {
+ kseq_t *p = &buf[i];
+ if (p->seq.l) stk_printseq(p, UINT_MAX);
+ free(p->seq.s); free(p->qual.s); free(p->name.s);
+ }
+ free(buf);
+ return 0;
+}
+
+/* seq */
+
+void stk_mask(kseq_t *seq, const khash_t(reg) *h, int is_complement, int mask_chr)
+{
+ unsigned i, j;
+ khiter_t k;
+ k = kh_get(reg, h, seq->name.s);
+ if (k == kh_end(h)) { // not found in the hash table
+ if (is_complement) {
+ if (mask_chr) {
+ for (j = 0; j < seq->seq.l; ++j)
+ seq->seq.s[j] = mask_chr;
+ } else {
+ for (j = 0; j < seq->seq.l; ++j)
+ seq->seq.s[j] = tolower(seq->seq.s[j]);
+ }
+ }
+ } else {
+ reglist_t *p = &kh_val(h, k);
+ if (!is_complement) {
+ for (i = 0; i < p->n; ++i) {
+ unsigned beg = p->a[i]>>32, end = p->a[i];
+ if (beg >= seq->seq.l) continue;
+ if (end > seq->seq.l) end = seq->seq.l;
+ if (!mask_chr) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]);
+ else for (j = beg; j < end; ++j) seq->seq.s[j] = mask_chr;
+ }
+ } else {
+ int8_t *mask = calloc(seq->seq.l, 1);
+ for (i = 0; i < p->n; ++i) {
+ unsigned beg = p->a[i]>>32, end = p->a[i];
+ if (end >= seq->seq.l) end = seq->seq.l;
+ for (j = beg; j < end; ++j) mask[j] = 1;
+ }
+ if (mask_chr) {
+ for (j = 0; j < seq->seq.l; ++j)
+ if (mask[j] == 0) seq->seq.s[j] = mask_chr;
+ } else {
+ for (j = 0; j < seq->seq.l; ++j)
+ if (mask[j] == 0) seq->seq.s[j] = tolower(seq->seq.s[j]);
+ }
+ free(mask);
+ }
+ }
+}
+
+int stk_seq(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq;
+ int c, qual_thres = 0, flag = 0, qual_shift = 33, mask_chr = 0, min_len = 0;
+ unsigned line_len = 0;
+ double frac = 1.;
+ khash_t(reg) *h = 0;
+
+ srand48(11);
+ while ((c = getopt(argc, argv, "q:l:Q:aACrn:s:f:M:L:c")) >= 0) {
+ switch (c) {
+ case 'a':
+ case 'A': flag |= 1; break;
+ case 'C': flag |= 2; break;
+ case 'r': flag |= 4; break;
+ case 'c': flag |= 8; break;
+ case 'M': h = stk_reg_read(optarg); break;
+ case 'n': mask_chr = *optarg; break;
+ case 'Q': qual_shift = atoi(optarg); break;
+ case 'q': qual_thres = atoi(optarg); break;
+ case 'l': line_len = atoi(optarg); break;
+ case 'L': min_len = atoi(optarg); break;
+ case 's': srand48(atoi(optarg)); break;
+ case 'f': frac = atof(optarg); break;
+ }
+ }
+ if (argc == optind) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: seqtk seq [options] <in.fq>|<in.fa>\n\n");
+ fprintf(stderr, "Options: -q INT mask bases with quality lower than INT [0]\n");
+ fprintf(stderr, " -n CHAR masked bases converted to CHAR; 0 for lowercase [0]\n");
+ fprintf(stderr, " -l INT number of residues per line; 0 for 2^32-1 [%d]\n", line_len);
+ fprintf(stderr, " -Q INT quality shift: ASCII-INT gives base quality [%d]\n", qual_shift);
+ fprintf(stderr, " -s INT random seed (effective with -f) [11]\n");
+ fprintf(stderr, " -f FLOAT sample FLOAT fraction of sequences [1]\n");
+ fprintf(stderr, " -M FILE mask regions in BED or name list FILE [null]\n");
+ fprintf(stderr, " -L INT drop sequences with length shorter than INT [0]\n");
+ fprintf(stderr, " -c mask complement region (effective with -M)\n");
+ fprintf(stderr, " -r reverse complement\n");
+ fprintf(stderr, " -A force FASTA output (discard quality)\n");
+ fprintf(stderr, " -C drop comments at the header lines\n");
+ fprintf(stderr, "\n");
+ return 1;
+ }
+ if (line_len == 0) line_len = UINT_MAX;
+ fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+ seq = kseq_init(fp);
+ qual_thres += qual_shift;
+ while (kseq_read(seq) >= 0) {
+ if (seq->seq.l < min_len) continue; // NB: length filter before taking random
+ if (frac < 1. && drand48() >= frac) continue;
+ if (seq->qual.l && qual_thres > qual_shift) {
+ unsigned i;
+ if (mask_chr) {
+ for (i = 0; i < seq->seq.l; ++i)
+ if (seq->qual.s[i] < qual_thres)
+ seq->seq.s[i] = mask_chr;
+ } else {
+ for (i = 0; i < seq->seq.l; ++i)
+ if (seq->qual.s[i] < qual_thres)
+ seq->seq.s[i] = tolower(seq->seq.s[i]);
+ }
+ }
+ if (flag & 1) seq->qual.l = 0;
+ if (flag & 2) seq->comment.l = 0;
+ if (h) stk_mask(seq, h, flag&8, mask_chr); // masking
+ if (flag & 4) { // reverse complement
+ int c0, c1;
+ unsigned i;
+ for (i = 0; i < seq->seq.l>>1; ++i) { // reverse complement sequence
+ c0 = comp_tab[(int)seq->seq.s[i]];
+ c1 = comp_tab[(int)seq->seq.s[seq->seq.l - 1 - i]];
+ seq->seq.s[i] = c1;
+ seq->seq.s[seq->seq.l - 1 - i] = c0;
+ }
+ if (seq->seq.l & 1) // complement the remaining base
+ seq->seq.s[seq->seq.l>>1] = comp_tab[(int)seq->seq.s[seq->seq.l>>1]];
+ if (seq->qual.l) {
+ for (i = 0; i < seq->seq.l>>1; ++i) // reverse quality
+ c0 = seq->qual.s[i], seq->qual.s[i] = seq->qual.s[seq->qual.l - 1 - i], seq->qual.s[seq->qual.l - 1 - i] = c0;
+ }
+ }
+ stk_printseq(seq, line_len);
+ }
+ kseq_destroy(seq);
+ gzclose(fp);
+ stk_reg_destroy(h);
+ return 0;
+}
+
+/* main function */
+static int usage()
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: seqtk <command> <arguments>\n");
+ fprintf(stderr, "Version: 1.0-r31\n\n");
+ fprintf(stderr, "Command: seq common transformation of FASTA/Q\n");
+ fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n");
+ fprintf(stderr, " sample subsample sequences\n");
+ fprintf(stderr, " subseq extract subsequences from FASTA/Q\n");
+ fprintf(stderr, " trimfq trim FASTQ using the Phred algorithm\n\n");
+ fprintf(stderr, " hety regional heterozygosity\n");
+ fprintf(stderr, " mutfa point mutate FASTA at specified positions\n");
+ fprintf(stderr, " mergefa merge two FASTA/Q files\n");
+ fprintf(stderr, " randbase choose a random base from hets\n");
+ fprintf(stderr, " cutN cut sequence at long N\n");
+ fprintf(stderr, " listhet extract the position of each het\n");
+ fprintf(stderr, "\n");
+ return 1;
+}
+
+int main(int argc, char *argv[])
+{
+ if (argc == 1) return usage();
+ if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1);
+ else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1);
+ else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1);
+ else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1);
+ else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1);
+ else if (strcmp(argv[1], "randbase") == 0) stk_randbase(argc-1, argv+1);
+ else if (strcmp(argv[1], "cutN") == 0) stk_cutN(argc-1, argv+1);
+ else if (strcmp(argv[1], "listhet") == 0) stk_listhet(argc-1, argv+1);
+ else if (strcmp(argv[1], "famask") == 0) stk_famask(argc-1, argv+1);
+ else if (strcmp(argv[1], "trimfq") == 0) stk_trimfq(argc-1, argv+1);
+ else if (strcmp(argv[1], "hrun") == 0) stk_hrun(argc-1, argv+1);
+ else if (strcmp(argv[1], "sample") == 0) stk_sample(argc-1, argv+1);
+ else if (strcmp(argv[1], "seq") == 0) stk_seq(argc-1, argv+1);
+ else {
+ fprintf(stderr, "[main] unrecognized commad '%s'. Abort!\n", argv[1]);
+ return 1;
+ }
+ return 0;
+}
--
Alioth's /git/debian-med/git-commit-notice on /srv/git.debian.org/git/debian-med/seqtk.git
More information about the debian-med-commit
mailing list