[med-svn] [seqtk] 01/01: Imported Upstream version 1.0

Andreas Tille tille at debian.org
Sat Feb 1 12:43:07 UTC 2014


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch upstream
in repository seqtk.

commit bf014dd5b20c97213dc7ac794daf032ac5d3833a
Author: Andreas Tille <tille at debian.org>
Date:   Sat Feb 1 13:41:49 2014 +0100

    Imported Upstream version 1.0
---
 .gitignore             |    4 +
 Makefile               |    8 +
 README.md              |   53 +++
 debian/changelog       |    5 -
 debian/compat          |    1 -
 debian/control         |   24 --
 debian/copyright       |   32 --
 debian/get-orig-source |    3 -
 debian/rules           |    9 -
 debian/source/format   |    1 -
 debian/watch           |    3 -
 khash.h                |  548 ++++++++++++++++++++++++
 kseq.h                 |  235 +++++++++++
 seqtk.c                | 1075 ++++++++++++++++++++++++++++++++++++++++++++++++
 14 files changed, 1923 insertions(+), 78 deletions(-)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..86f9203
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*.o
+*.a
+*.dSYM
+.*.swp
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..287a36e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,8 @@
+CC=gcc
+CFLAGS=-g -Wall -O2
+
+seqtk:seqtk.c khash.h kseq.h
+		$(CC) $(CFLAGS) seqtk.c -o $@ -lz -lm
+
+clean:
+		rm -fr gmon.out *.o ext/*.o a.out seqtk *~ *.a *.dSYM session*
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0de5349
--- /dev/null
+++ b/README.md
@@ -0,0 +1,53 @@
+Introduction
+------------
+
+Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format. It seamlessly parses both FASTA and FASTQ files which can also be optionally compressed by gzip.
+
+Examples
+--------
+
+* Convert FASTQ to FASTA:
+
+        seqtk seq -a in.fq.gz > out.fa
+
+* Convert ILLUMINA 1.3+ FASTQ to FASTA and mask bases with quality lower than 20 to lowercases (the 1st command line) or to `N` (the 2nd):
+
+        seqtk seq -aQ64 -q20 in.fq > out.fa
+        seqtk seq -aQ64 -q20 -n N in.fq > out.fa
+
+* Fold long FASTA/Q lines and remove FASTA/Q comments:
+
+        seqtk seq -Cl60 in.fa > out.fa
+
+* Convert multi-line FASTQ to 4-line FASTQ:
+
+        seqtk seq -l0 in.fq > out.fq
+
+* Reverse complement FASTA/Q:
+
+        seqtk seq -r in.fq > out.fq
+
+* Extract sequences with names in file `name.lst`, one sequence name per line:
+
+        seqtk subseq in.fq name.lst > out.fq
+
+* Extract sequences in regions contained in file `reg.bed`:
+
+        seqtk subseq in.fa reg.bed > out.fa
+
+* Mask regions in `reg.bed` to lowercases:
+
+        seqtk seq -M reg.bed in.fa > out.fa
+
+* Subsample 10000 read pairs from two large paired FASTQ files (remember to use the same random seed to keep pairing):
+
+        seqtk sample -s100 read1.fq 10000 > sub1.fq
+        seqtk sample -s100 read2.fq 10000 > sub2.fq
+
+* Trim low-quality bases from both ends using the Phred algorithm:
+
+        seqtk trimfq in.fq > out.fq
+
+* Trim 5bp from the left end of each read and 10bp from the right end:
+
+        seqtk trimfq -b 5 -e 10 in.fa > out.fa
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index 9478225..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,5 +0,0 @@
-seqtk (0.0.20121016-1) UNRELEASED; urgency=low
-
-  * Initial release (Closes: #<bug>)
-
- -- Andreas Tille <tille at debian.org>  Wed, 07 Nov 2012 16:07:35 +0100
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 8c3c586..0000000
--- a/debian/control
+++ /dev/null
@@ -1,24 +0,0 @@
-Source: seqtk
-Section: science
-Priority: optional
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
-Build-Depends: debhelper (>= 9)
-Standards-Version: 3.9.4
-Homepage: http://kevin-gattaca.blogspot.de/2012/05/program-seqtk-for-sampling-trimming.html
-Vcs-Browser: http://svn.debian.org/wsvn/debian-med/trunk/packages/seqtk/trunk/
-Vcs-Svn: svn://svn.debian.org/debian-med/trunk/packages/seqtk/trunk/
-
-Package: seqtk
-Architecture: any
-Depends: ${shlibs:Depends}, ${misc:Depends}
-Description: sampling, trimming, fastq2fasta, subsequence, reverse complement
- Currently, seqtk supports quality based trimming with the phred
- algorithm, converting fastq to fasta, reverse complementing sequences,
- extracting or masking subsequences in regions given in a BED/name list
- file, and more.  It contains a subsampling module to sample exactly n
- sequences or a fraction of sequences.
- .
- Seqtk supports both fasta and fastq input files, which can be
- optionally gzip compressed.
-
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index 1d59bcf..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,32 +0,0 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: seqtk
-Source: https://github.com/lh3/seqtk
-
-Files: *
-Copyright: © 2008-2012 Heng Li <lh3 at me.com>
-License: MIT
-
-Files: debian/*
-Copyright: © 2012 Andreas Tille <tille at debian.org>
-License: MIT
-
-License: MIT
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- .
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
- .
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
diff --git a/debian/get-orig-source b/debian/get-orig-source
deleted file mode 100755
index 46e5f86..0000000
--- a/debian/get-orig-source
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-echo "Need to ask upstream for real release.  Meanwhile you can do:"
-echo "  git clone https://github.com/lh3/seqtk.git"
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index e8f077e..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/make -f
-
-# DH_VERBOSE := 1
-
-%:
-	dh $@
-
-#get-orig-source:
-#	. debian/get-orig-source
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 4f7ce78..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,3 +0,0 @@
-# version=3
-# Only at
-#   https://github.com/lh3/seqtk.git
diff --git a/khash.h b/khash.h
new file mode 100644
index 0000000..1a28e11
--- /dev/null
+++ b/khash.h
@@ -0,0 +1,548 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor at live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+	int ret, is_missing;
+	khiter_t k;
+	khash_t(32) *h = kh_init(32);
+	k = kh_put(32, h, 5, &ret);
+	kh_value(h, k) = 10;
+	k = kh_get(32, h, 10);
+	is_missing = (k == kh_end(h));
+	k = kh_get(32, h, 5);
+	kh_del(32, h, k);
+	for (k = kh_begin(h); k != kh_end(h); ++k)
+		if (kh_exist(h, k)) kh_value(h, k) = 1;
+	kh_destroy(32, h);
+	return 0;
+}
+*/
+
+/*
+  2011-12-29 (0.2.7):
+
+    * Minor code clean up; no actual effect.
+
+  2011-09-16 (0.2.6):
+
+	* The capacity is a power of 2. This seems to dramatically improve the
+	  speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
+
+	   - http://code.google.com/p/ulib/
+	   - http://nothings.org/computer/judy/
+
+	* Allow to optionally use linear probing which usually has better
+	  performance for random input. Double hashing is still the default as it
+	  is more robust to certain non-random input.
+
+	* Added Wang's integer hash function (not used by default). This hash
+	  function is more robust to certain non-random input.
+
+  2011-02-14 (0.2.5):
+
+    * Allow to declare global functions.
+
+  2009-09-26 (0.2.4):
+
+    * Improve portability
+
+  2008-09-19 (0.2.3):
+
+	* Corrected the example
+	* Improved interfaces
+
+  2008-09-11 (0.2.2):
+
+	* Improved speed a little in kh_put()
+
+  2008-09-10 (0.2.1):
+
+	* Added kh_clear()
+	* Fixed a compiling error
+
+  2008-09-02 (0.2.0):
+
+	* Changed to token concatenation which increases flexibility.
+
+  2008-08-31 (0.1.2):
+
+	* Fixed a bug in kh_get(), which has not been tested previously.
+
+  2008-08-31 (0.1.1):
+
+	* Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+  @header
+
+  Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.6"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+/* compipler specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifdef _MSC_VER
+#define inline __inline
+#endif
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+#ifdef KHASH_LINEAR
+#define __ac_inc(k, m) 1
+#else
+#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m)
+#endif
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+static const double __ac_HASH_UPPER = 0.77;
+
+#define __KHASH_TYPE(name, khkey_t, khval_t) \
+	typedef struct { \
+		khint_t n_buckets, size, n_occupied, upper_bound; \
+		khint32_t *flags; \
+		khkey_t *keys; \
+		khval_t *vals; \
+	} kh_##name##_t;
+
+#define KHASH_DECLARE(name, khkey_t, khval_t)		 					\
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	extern kh_##name##_t *kh_init_##name();								\
+	extern void kh_destroy_##name(kh_##name##_t *h);					\
+	extern void kh_clear_##name(kh_##name##_t *h);						\
+	extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); 	\
+	extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+	extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+	extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	SCOPE kh_##name##_t *kh_init_##name() {								\
+		return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t));		\
+	}																	\
+	SCOPE void kh_destroy_##name(kh_##name##_t *h)						\
+	{																	\
+		if (h) {														\
+			free(h->keys); free(h->flags);								\
+			free(h->vals);												\
+			free(h);													\
+		}																\
+	}																	\
+	SCOPE void kh_clear_##name(kh_##name##_t *h)						\
+	{																	\
+		if (h && h->flags) {											\
+			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
+			h->size = h->n_occupied = 0;								\
+		}																\
+	}																	\
+	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) 	\
+	{																	\
+		if (h->n_buckets) {												\
+			khint_t inc, k, i, last, mask;								\
+			mask = h->n_buckets - 1;									\
+			k = __hash_func(key); i = k & mask;							\
+			inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
+			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+				i = (i + inc) & mask; 									\
+				if (i == last) return h->n_buckets;						\
+			}															\
+			return __ac_iseither(h->flags, i)? h->n_buckets : i;		\
+		} else return 0;												\
+	}																	\
+	SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+	{ /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+		khint32_t *new_flags = 0;										\
+		khint_t j = 1;													\
+		{																\
+			kroundup32(new_n_buckets); 									\
+			if (new_n_buckets < 4) new_n_buckets = 4;					\
+			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	/* requested size is too small */ \
+			else { /* hash table size to be changed (shrink or expand); rehash */ \
+				new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t));	\
+				memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+				if (h->n_buckets < new_n_buckets) {	/* expand */		\
+					h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
+					if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+				} /* otherwise shrink */								\
+			}															\
+		}																\
+		if (j) { /* rehashing is needed */								\
+			for (j = 0; j != h->n_buckets; ++j) {						\
+				if (__ac_iseither(h->flags, j) == 0) {					\
+					khkey_t key = h->keys[j];							\
+					khval_t val;										\
+					khint_t new_mask;									\
+					new_mask = new_n_buckets - 1; 						\
+					if (kh_is_map) val = h->vals[j];					\
+					__ac_set_isdel_true(h->flags, j);					\
+					while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+						khint_t inc, k, i;								\
+						k = __hash_func(key);							\
+						i = k & new_mask;								\
+						inc = __ac_inc(k, new_mask);					\
+						while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
+						__ac_set_isempty_false(new_flags, i);			\
+						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+							__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+						} else { /* write the element and jump out of the loop */ \
+							h->keys[i] = key;							\
+							if (kh_is_map) h->vals[i] = val;			\
+							break;										\
+						}												\
+					}													\
+				}														\
+			}															\
+			if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+				h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
+				if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+			}															\
+			free(h->flags); /* free the working space */				\
+			h->flags = new_flags;										\
+			h->n_buckets = new_n_buckets;								\
+			h->n_occupied = h->size;									\
+			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+		}																\
+	}																	\
+	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+	{																	\
+		khint_t x;														\
+		if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+			if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \
+			else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \
+		} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
+		{																\
+			khint_t inc, k, i, site, last, mask = h->n_buckets - 1;		\
+			x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+			if (__ac_isempty(h->flags, i)) x = i; /* for speed up */	\
+			else {														\
+				inc = __ac_inc(k, mask); last = i;						\
+				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+					if (__ac_isdel(h->flags, i)) site = i;				\
+					i = (i + inc) & mask; 								\
+					if (i == last) { x = site; break; }					\
+				}														\
+				if (x == h->n_buckets) {								\
+					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+					else x = i;											\
+				}														\
+			}															\
+		}																\
+		if (__ac_isempty(h->flags, x)) { /* not present at all */		\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size; ++h->n_occupied;									\
+			*ret = 1;													\
+		} else if (__ac_isdel(h->flags, x)) { /* deleted */				\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size;													\
+			*ret = 2;													\
+		} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+		return x;														\
+	}																	\
+	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)				\
+	{																	\
+		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {			\
+			__ac_set_isdel_true(h->flags, x);							\
+			--h->size;													\
+		}																\
+	}
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+  @abstract     Integer hash function
+  @param  key   The integer [khint32_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+  @abstract     Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     64-bit integer hash function
+  @param  key   The integer [khint64_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+  @abstract     64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     const char* hash function
+  @param  s     Pointer to a null terminated string
+  @return       The hash value
+ */
+static inline khint_t __ac_X31_hash_string(const char *s)
+{
+	khint_t h = *s;
+	if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
+	return h;
+}
+/*! @function
+  @abstract     Another interface to const char* hash function
+  @param  key   Pointer to a null terminated string [const char*]
+  @return       The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+  @abstract     Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+static inline khint_t __ac_Wang_hash(khint_t key)
+{
+    key += ~(key << 15);
+    key ^=  (key >> 10);
+    key +=  (key << 3);
+    key ^=  (key >> 6);
+    key += ~(key << 11);
+    key ^=  (key >> 16);
+    return key;
+}
+#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+  @abstract Type of the hash table.
+  @param  name  Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+  @abstract     Initiate a hash table.
+  @param  name  Name of the hash table [symbol]
+  @return       Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+  @abstract     Destroy a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+  @abstract     Reset a hash table without deallocating memory.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+  @abstract     Resize a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  s     New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+  @abstract     Insert a key to the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @param  r     Extra return code: 0 if the key is present in the hash table;
+                1 if the bucket is empty (never used); 2 if the element in
+				the bucket has been deleted [int*]
+  @return       Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+  @abstract     Retrieve a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @return       Iterator to the found element, or kh_end(h) is the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+  @abstract     Remove a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+  @abstract     Test whether a bucket contains data.
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+  @abstract     Get key given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+  @abstract     Get value given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Value [type of values]
+  @discussion   For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Get the start iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+  @abstract     Get the end iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Get the number of elements in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+  @abstract     Get the number of buckets in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/* More conenient interfaces */
+
+/*! @function
+  @abstract     Instantiate a hash set containing integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name)										\
+	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t)								\
+	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name)										\
+	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t)								\
+	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+  @abstract     Instantiate a hash map containing const char* keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name)										\
+	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing const char* keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t)								\
+	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/kseq.h b/kseq.h
new file mode 100644
index 0000000..a5cec7c
--- /dev/null
+++ b/kseq.h
@@ -0,0 +1,235 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor at live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Last Modified: 05MAR2012 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB   1 // isspace() && !' '
+#define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX   2
+
+#define __KS_TYPE(type_t)						\
+	typedef struct __kstream_t {				\
+		unsigned char *buf;						\
+		int begin, end, is_eof;					\
+		type_t f;								\
+	} kstream_t;
+
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+#define __KS_BASIC(type_t, __bufsize)								\
+	static inline kstream_t *ks_init(type_t f)						\
+	{																\
+		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));	\
+		ks->f = f;													\
+		ks->buf = (unsigned char*)malloc(__bufsize);				\
+		return ks;													\
+	}																\
+	static inline void ks_destroy(kstream_t *ks)					\
+	{																\
+		if (ks) {													\
+			free(ks->buf);											\
+			free(ks);												\
+		}															\
+	}
+
+#define __KS_GETC(__read, __bufsize)						\
+	static inline int ks_getc(kstream_t *ks)				\
+	{														\
+		if (ks->is_eof && ks->begin >= ks->end) return -1;	\
+		if (ks->begin >= ks->end) {							\
+			ks->begin = 0;									\
+			ks->end = __read(ks->f, ks->buf, __bufsize);	\
+			if (ks->end < __bufsize) ks->is_eof = 1;		\
+			if (ks->end == 0) return -1;					\
+		}													\
+		return (int)ks->buf[ks->begin++];					\
+	}
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+	size_t l, m;
+	char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#define __KS_GETUNTIL(__read, __bufsize)								\
+	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
+	{																	\
+		if (dret) *dret = 0;											\
+		str->l = append? str->l : 0;									\
+		if (ks->begin >= ks->end && ks->is_eof) return -1;				\
+		for (;;) {														\
+			int i;														\
+			if (ks->begin >= ks->end) {									\
+				if (!ks->is_eof) {										\
+					ks->begin = 0;										\
+					ks->end = __read(ks->f, ks->buf, __bufsize);		\
+					if (ks->end < __bufsize) ks->is_eof = 1;			\
+					if (ks->end == 0) break;							\
+				} else break;											\
+			}															\
+			if (delimiter == KS_SEP_LINE) { \
+				for (i = ks->begin; i < ks->end; ++i) \
+					if (ks->buf[i] == '\n') break; \
+			} else if (delimiter > KS_SEP_MAX) {						\
+				for (i = ks->begin; i < ks->end; ++i)					\
+					if (ks->buf[i] == delimiter) break;					\
+			} else if (delimiter == KS_SEP_SPACE) {						\
+				for (i = ks->begin; i < ks->end; ++i)					\
+					if (isspace(ks->buf[i])) break;						\
+			} else if (delimiter == KS_SEP_TAB) {						\
+				for (i = ks->begin; i < ks->end; ++i)					\
+					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+			} else i = 0; /* never come to here! */						\
+			if (str->m - str->l < (size_t)(i - ks->begin + 1)) {		\
+				str->m = str->l + (i - ks->begin) + 1;					\
+				kroundup32(str->m);										\
+				str->s = (char*)realloc(str->s, str->m);				\
+			}															\
+			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+			str->l = str->l + (i - ks->begin);							\
+			ks->begin = i + 1;											\
+			if (i < ks->end) {											\
+				if (dret) *dret = ks->buf[i];							\
+				break;													\
+			}															\
+		}																\
+		if (str->s == 0) {												\
+			str->m = 1;													\
+			str->s = (char*)calloc(1, 1);								\
+		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
+		str->s[str->l] = '\0';											\
+		return str->l;													\
+	} \
+	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) \
+	__KS_TYPE(type_t)							\
+	__KS_BASIC(type_t, __bufsize)				\
+	__KS_GETC(__read, __bufsize)				\
+	__KS_GETUNTIL(__read, __bufsize)
+
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+#define __KSEQ_BASIC(SCOPE, type_t)										\
+	SCOPE kseq_t *kseq_init(type_t fd)									\
+	{																	\
+		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));					\
+		s->f = ks_init(fd);												\
+		return s;														\
+	}																	\
+	SCOPE void kseq_destroy(kseq_t *ks)									\
+	{																	\
+		if (!ks) return;												\
+		free(ks->name.s); free(ks->comment.s); free(ks->seq.s);	free(ks->qual.s); \
+		ks_destroy(ks->f);												\
+		free(ks);														\
+	}
+
+/* Return value:
+   >=0  length of the sequence (normal)
+   -1   end-of-file
+   -2   truncated quality string
+ */
+#define __KSEQ_READ(SCOPE) \
+	SCOPE int kseq_read(kseq_t *seq) \
+	{ \
+		int c; \
+		kstream_t *ks = seq->f; \
+		if (seq->last_char == 0) { /* then jump to the next header line */ \
+			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+			if (c == -1) return -1; /* end of file */ \
+			seq->last_char = c; \
+		} /* else: the first header char has been read in the previous call */ \
+		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+			seq->seq.m = 256; \
+			seq->seq.s = (char*)malloc(seq->seq.m); \
+		} \
+		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+			if (c == '\n') continue; /* skip empty lines */ \
+			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+		} \
+		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */	\
+		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+			seq->seq.m = seq->seq.l + 2; \
+			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+		} \
+		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
+		if (c != '+') return seq->seq.l; /* FASTA */ \
+		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
+			seq->qual.m = seq->seq.m; \
+			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+		} \
+		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+		if (c == -1) return -2; /* error: no quality string */ \
+		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
+		seq->last_char = 0;	/* we have not come to the next header line */ \
+		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+		return seq->seq.l; \
+	}
+
+#define __KSEQ_TYPE(type_t)						\
+	typedef struct {							\
+		kstring_t name, comment, seq, qual;		\
+		int last_char;							\
+		kstream_t *f;							\
+	} kseq_t;
+
+#define KSEQ_INIT2(SCOPE, type_t, __read)		\
+	KSTREAM_INIT(type_t, __read, 16384)			\
+	__KSEQ_TYPE(type_t)							\
+	__KSEQ_BASIC(SCOPE, type_t)					\
+	__KSEQ_READ(SCOPE)
+
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+#define KSEQ_DECLARE(type_t) \
+	__KS_TYPE(type_t) \
+	__KSEQ_TYPE(type_t) \
+	extern kseq_t *kseq_init(type_t fd); \
+	void kseq_destroy(kseq_t *ks); \
+	int kseq_read(kseq_t *seq);
+
+#endif
diff --git a/seqtk.c b/seqtk.c
new file mode 100644
index 0000000..9cbcfad
--- /dev/null
+++ b/seqtk.c
@@ -0,0 +1,1075 @@
+/* The MIT License
+
+   Copyright (c) 20082-2012 by Heng Li <lh3 at me.com>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <zlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <limits.h>
+#include <math.h>
+
+#include "kseq.h"
+KSEQ_INIT(gzFile, gzread)
+
+typedef struct {
+	int n, m;
+	uint64_t *a;
+} reglist_t;
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(reg, reglist_t)
+
+typedef kh_reg_t reghash_t;
+
+reghash_t *stk_reg_read(const char *fn)
+{
+	reghash_t *h = kh_init(reg);
+	gzFile fp;
+	kstream_t *ks;
+	int dret;
+	kstring_t *str;
+	// read the list
+	str = calloc(1, sizeof(kstring_t));
+	fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+	ks = ks_init(fp);
+	while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+		int beg = -1, end = -1;
+		reglist_t *p;
+		khint_t k = kh_get(reg, h, str->s);
+		if (k == kh_end(h)) {
+			int ret;
+			char *s = strdup(str->s);
+			k = kh_put(reg, h, s, &ret);
+			memset(&kh_val(h, k), 0, sizeof(reglist_t));
+		}
+		p = &kh_val(h, k);
+		if (dret != '\n') {
+			if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
+				beg = atoi(str->s);
+				if (dret != '\n') {
+					if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
+						end = atoi(str->s);
+						if (end < 0) end = -1;
+					}
+				}
+			}
+		}
+		// skip the rest of the line
+		if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
+		if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
+		if (beg < 0) beg = 0, end = INT_MAX;
+		if (p->n == p->m) {
+			p->m = p->m? p->m<<1 : 4;
+			p->a = realloc(p->a, p->m * 8);
+		}
+		p->a[p->n++] = (uint64_t)beg<<32 | end;
+	}
+	ks_destroy(ks);
+	gzclose(fp);
+	free(str->s); free(str);
+	return h;
+}
+
+void stk_reg_destroy(reghash_t *h)
+{
+	khint_t k;
+	if (h == 0) return;
+	for (k = 0; k < kh_end(h); ++k) {
+		if (kh_exist(h, k)) {
+			free(kh_val(h, k).a);
+			free((char*)kh_key(h, k));
+		}
+	}
+	kh_destroy(reg, h);
+}
+
+/* constant table */
+
+unsigned char seq_nt16_table[256] = {
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15 /*'-'*/,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+	15,15, 5, 6,  8,15, 7, 9,  0,10,15,15, 15,15,15,15,
+	15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+	15,15, 5, 6,  8,15, 7, 9,  0,10,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
+};
+
+char *seq_nt16_rev_table = "XACMGRSVTWYHKDBN";
+unsigned char seq_nt16to4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+unsigned char seq_nt16comp_table[] = { 0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 };
+int bitcnt_table[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
+char comp_tab[] = {
+	  0,   1,	2,	 3,	  4,   5,	6,	 7,	  8,   9,  10,	11,	 12,  13,  14,	15,
+	 16,  17,  18,	19,	 20,  21,  22,	23,	 24,  25,  26,	27,	 28,  29,  30,	31,
+	 32,  33,  34,	35,	 36,  37,  38,	39,	 40,  41,  42,	43,	 44,  45,  46,	47,
+	 48,  49,  50,	51,	 52,  53,  54,	55,	 56,  57,  58,	59,	 60,  61,  62,	63,
+	 64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O',
+	'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z',	91,	 92,  93,  94,	95,
+	 64, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o',
+	'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127
+};
+
+static void stk_printstr(const kstring_t *s, unsigned line_len)
+{
+	if (line_len != UINT_MAX) {
+		int i, rest = s->l;
+		for (i = 0; i < s->l; i += line_len, rest -= line_len) {
+			putchar('\n');
+			if (rest > line_len) fwrite(s->s + i, 1, line_len, stdout);
+			else fwrite(s->s + i, 1, rest, stdout);
+		}
+		putchar('\n');
+	} else {
+		putchar('\n');
+		puts(s->s);
+	}
+}
+
+void stk_printseq(const kseq_t *s, int line_len)
+{
+	putchar(s->qual.l? '@' : '>');
+	fputs(s->name.s, stdout);
+	if (s->comment.l) {
+		putchar(' '); fputs(s->comment.s, stdout);
+	}
+	stk_printstr(&s->seq, line_len);
+	if (s->qual.l) {
+		putchar('+');
+		stk_printstr(&s->qual, line_len);
+	}
+}
+
+/* quality based trimming with Mott's algorithm */
+int stk_trimfq(int argc, char *argv[])
+{ // FIXME: when a record with zero length will always be treated as a fasta record
+	gzFile fp;
+	kseq_t *seq;
+	double param = 0.05, q_int2real[128];
+	int i, c, min_len = 30, left = 0, right = 0;
+	while ((c = getopt(argc, argv, "l:q:b:e:")) >= 0) {
+		switch (c) {
+			case 'q': param = atof(optarg); break;
+			case 'l': min_len = atoi(optarg); break;
+			case 'b': left = atoi(optarg); break;
+			case 'e': right = atoi(optarg); break;
+		}
+	}
+	if (optind == argc) {
+		fprintf(stderr, "\n");
+		fprintf(stderr, "Usage:   seqtk trimfq [options] <in.fq>\n\n");
+		fprintf(stderr, "Options: -q FLOAT    error rate threshold (disabled by -b/-e) [%.2f]\n", param);
+		fprintf(stderr, "         -l INT      maximally trim down to INT bp (disabled by -b/-e) [%d]\n", min_len);
+		fprintf(stderr, "         -b INT      trim INT bp from left (non-zero to disable -q/-l) [0]\n");
+		fprintf(stderr, "         -e INT      trim INT bp from right (non-zero to disable -q/-l) [0]\n");
+		fprintf(stderr, "\n");
+		return 1;
+	}
+	fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+	seq = kseq_init(fp);
+	for (i = 0; i < 128; ++i)
+		q_int2real[i] = pow(10., -(i - 33) / 10.);
+	while (kseq_read(seq) >= 0) {
+		int beg, tmp, end;
+		double s, max;
+		if (left || right) {
+			beg = left; end = seq->seq.l - right;
+			if (beg >= end) beg = end = 0;
+		} else if (seq->qual.l > min_len) {
+			for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) {
+				int q = seq->qual.s[i];
+				if (q < 36) q = 36;
+				if (q > 127) q = 127;
+				s += param - q_int2real[q];
+				if (s > max) max = s, beg = tmp, end = i + 1;
+				if (s < 0) s = 0, tmp = i + 1;
+			}
+			if (end - beg < min_len) { // window-based 
+				int is, imax;
+				for (i = 0, is = 0; i < min_len; ++i)
+					is += seq->qual.s[i] - 33;
+				for (imax = is, beg = 0; i < seq->qual.l; ++i) {
+					is += (int)seq->qual.s[i] - seq->qual.s[i - min_len];
+					if (imax < is) imax = is, beg = i - min_len + 1;
+				}
+				end = beg + min_len;
+			}
+		} else beg = 0, end = seq->seq.l;
+		putchar(seq->qual.l? '@' : '>'); fputs(seq->name.s, stdout); 
+		if (seq->comment.l) {
+			putchar(' '); puts(seq->comment.s);
+		} else putchar('\n');
+		fwrite(seq->seq.s + beg, 1, end - beg, stdout); putchar('\n');
+		if (seq->qual.l) {
+			puts("+");
+			fwrite(seq->qual.s + beg, 1, end - beg, stdout); putchar('\n');
+		}
+	}
+	kseq_destroy(seq);
+	gzclose(fp);
+	return 0;
+}
+
+/* composition */
+int stk_comp(int argc, char *argv[])
+{
+	gzFile fp;
+	kseq_t *seq;
+	int l, c, upper_only = 0;
+	reghash_t *h = 0;
+	reglist_t dummy;
+	while ((c = getopt(argc, argv, "ur:")) >= 0) {
+		switch (c) {
+			case 'u': upper_only = 1; break;
+			case 'r': h = stk_reg_read(optarg); break;
+		}
+	}
+	if (argc == optind) {
+		fprintf(stderr, "Usage:  seqtk comp [-u] [-r in.bed] <in.fa>\n\n");
+		fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n");
+		return 1;
+	}
+	fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+	seq = kseq_init(fp);
+	dummy.n= dummy.m = 1; dummy.a = calloc(1, 8);
+	while ((l = kseq_read(seq)) >= 0) {
+		int i, k;
+		reglist_t *p = 0;
+		if (h) {
+			khint_t k = kh_get(reg, h, seq->name.s);
+			if (k != kh_end(h)) p = &kh_val(h, k);
+		} else {
+			p = &dummy;
+			dummy.a[0] = l;
+		}
+		for (k = 0; p && k < p->n; ++k) {
+			int beg = p->a[k]>>32, end = p->a[k]&0xffffffff;
+			int la, lb, lc, na, nb, nc, cnt[11];
+			if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb];
+			else la = 'a', lb = -1, lc = 0;
+			na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb];
+			memset(cnt, 0, 11 * sizeof(int));
+			for (i = beg; i < end; ++i) {
+				int is_CpG = 0, a, b, c;
+				a = na; b = nb; c = nc;
+				na = seq->seq.s[i+1]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb];
+				if (b == 2 || b == 10) { // C or Y
+					if (nb == 4 || nb == 5) is_CpG = 1;
+				} else if (b == 4 || b == 5) { // G or R
+					if (lb == 2 || lb == 10) is_CpG = 1;
+				}
+				if (upper_only == 0 || isupper(a)) {
+					if (c > 1) ++cnt[c+2];
+					if (c == 1) ++cnt[seq_nt16to4_table[b]];
+					if (b == 10 || b == 5) ++cnt[9];
+					else if (c == 2) {
+						++cnt[8];
+					}
+					if (is_CpG) {
+						++cnt[7];
+						if (b == 10 || b == 5) ++cnt[10];
+					}
+				}
+				la = a; lb = b; lc = c;
+			}
+			if (h) printf("%s\t%d\t%d", seq->name.s, beg, end);
+			else printf("%s\t%d", seq->name.s, l);
+			for (i = 0; i < 11; ++i) printf("\t%d", cnt[i]);
+			putchar('\n');
+		}
+		fflush(stdout);
+	}
+	free(dummy.a);
+	kseq_destroy(seq);
+	gzclose(fp);
+	return 0;
+}
+
+int stk_randbase(int argc, char *argv[])
+{
+	gzFile fp;
+	kseq_t *seq;
+	int l;
+	if (argc == 1) {
+		fprintf(stderr, "Usage: seqtk randbase <in.fa>\n");
+		return 1;
+	}
+	fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r");
+	seq = kseq_init(fp);
+	while ((l = kseq_read(seq)) >= 0) {
+		int i;
+		printf(">%s", seq->name.s);
+		for (i = 0; i < l; ++i) {
+			int c, b, a, j, k, m;
+			b = seq->seq.s[i];
+			c = seq_nt16_table[b];
+			a = bitcnt_table[c];
+			if (a == 2) {
+				m = (drand48() < 0.5);
+				for (j = k = 0; j < 4; ++j) {
+					if ((1<<j & c) == 0) continue;
+					if (k == m) break;
+					++k;
+				}
+				seq->seq.s[i] = islower(b)? "acgt"[j] : "ACGT"[j];
+			}
+			if (i%60 == 0) putchar('\n');
+			putchar(seq->seq.s[i]);
+		}
+		putchar('\n');
+	}
+	kseq_destroy(seq);
+	gzclose(fp);
+	return 0;
+}
+
+int stk_hety(int argc, char *argv[])
+{
+	gzFile fp;
+	kseq_t *seq;
+	int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0;
+	char *buf;
+	uint32_t cnt[3];
+	if (argc == 1) {
+		fprintf(stderr, "\n");
+		fprintf(stderr, "Usage:   seqtk hety [options] <in.fa>\n\n");
+		fprintf(stderr, "Options: -w INT   window size [%d]\n", win_size);
+		fprintf(stderr, "         -t INT   # start positions in a window [%d]\n", n_start);
+		fprintf(stderr, "         -m       treat lowercases as masked\n");
+		fprintf(stderr, "\n");
+		return 1;
+	}
+	while ((c = getopt(argc, argv, "w:t:m")) >= 0) {
+		switch (c) {
+		case 'w': win_size = atoi(optarg); break;
+		case 't': n_start = atoi(optarg); break;
+		case 'm': is_lower_mask = 1; break;
+		}
+	}
+	fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+	seq = kseq_init(fp);
+	win_step = win_size / n_start;
+	buf = calloc(win_size, 1);
+	while ((l = kseq_read(seq)) >= 0) {
+		int x, i, y, z, next = 0;
+		cnt[0] = cnt[1] = cnt[2] = 0;
+		for (i = 0; i <= l; ++i) {
+			if ((i >= win_size && i % win_step == 0) || i == l) {
+				if (i == l && l >= win_size) {
+					for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]];
+				}
+				if (cnt[1] + cnt[2] > 0)
+					printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i,
+						   (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]);
+				next = i;
+			}
+			if (i < l) {
+				y = i % win_size;
+				c = seq->seq.s[i];
+				if (is_lower_mask && islower(c)) c = 'N';
+				c = seq_nt16_table[c];
+				x = bitcnt_table[c];
+				if (i >= win_size) --cnt[(int)buf[y]];
+				buf[y] = z = x > 2? 0 : x == 2? 2 : 1;
+				++cnt[z];
+			}
+		}
+	}
+	free(buf);
+	kseq_destroy(seq);
+	gzclose(fp);
+	return 0;
+}
+
+/* subseq */
+
+int stk_subseq(int argc, char *argv[])
+{
+	khash_t(reg) *h = kh_init(reg);
+	gzFile fp;
+	kseq_t *seq;
+	int l, i, j, c, is_tab = 0, line = 1024;
+	khint_t k;
+	while ((c = getopt(argc, argv, "tl:")) >= 0) {
+		switch (c) {
+		case 't': is_tab = 1; break;
+		case 'l': line = atoi(optarg); break;
+		}
+	}
+	if (optind + 2 > argc) {
+		fprintf(stderr, "\n");
+		fprintf(stderr, "Usage:   seqtk subseq [options] <in.fa> <in.bed>|<name.list>\n\n");
+		fprintf(stderr, "Options: -t       TAB delimited output\n");
+		fprintf(stderr, "         -l INT   sequence line length [%d]\n\n", line);
+		fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n\n");
+		return 1;
+	}
+	h = stk_reg_read(argv[optind+1]);
+	// subseq
+	fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+	seq = kseq_init(fp);
+	while ((l = kseq_read(seq)) >= 0) {
+		reglist_t *p;
+		k = kh_get(reg, h, seq->name.s);
+		if (k == kh_end(h)) continue;
+		p = &kh_val(h, k);
+		for (i = 0; i < p->n; ++i) {
+			int beg = p->a[i]>>32, end = p->a[i];
+			if (beg >= seq->seq.l) {
+				fprintf(stderr, "[subseq] %s: %d >= %ld\n", seq->name.s, beg, seq->seq.l);
+				continue;
+			}
+			if (end > seq->seq.l) end = seq->seq.l;
+			if (is_tab == 0) {
+				printf("%c%s", seq->qual.l == seq->seq.l? '@' : '>', seq->name.s);
+				if (beg > 0 || (int)p->a[i] != INT_MAX) {
+					if (end == INT_MAX) {
+						if (beg) printf(":%d", beg+1);
+					} else printf(":%d-%d", beg+1, end);
+				}
+			} else printf("%s\t%d\t", seq->name.s, beg + 1);
+			if (end > seq->seq.l) end = seq->seq.l;
+			for (j = 0; j < end - beg; ++j) {
+				if (is_tab == 0 && j % line == 0) putchar('\n');
+				putchar(seq->seq.s[j + beg]);
+			}
+			putchar('\n');
+			if (seq->qual.l != seq->seq.l || is_tab) continue;
+			printf("+");
+			for (j = 0; j < end - beg; ++j) {
+				if (j % line == 0) putchar('\n');
+				putchar(seq->qual.s[j + beg]);
+			}
+			putchar('\n');
+		}
+	}
+	// free
+	kseq_destroy(seq);
+	gzclose(fp);
+	stk_reg_destroy(h);
+	return 0;
+}
+
+/* mergefa */
+int stk_mergefa(int argc, char *argv[])
+{
+	gzFile fp[2];
+	kseq_t *seq[2];
+	int i, l, c, is_intersect = 0, is_haploid = 0, qual = 0, is_mask = 0, is_randhet = 0;
+	uint64_t cnt[5];
+	while ((c = getopt(argc, argv, "himrq:")) >= 0) {
+		switch (c) {
+			case 'i': is_intersect = 1; break;
+			case 'h': is_haploid = 1; break;
+			case 'm': is_mask = 1; break;
+			case 'r': is_randhet = 1; break;
+			case 'q': qual = atoi(optarg); break;
+		}
+	}
+	if (is_mask && is_intersect) {
+		fprintf(stderr, "[%s] `-i' and `-h' cannot be applied at the same time.\n", __func__);
+		return 1;
+	}
+	if (optind + 2 > argc) {
+		fprintf(stderr, "\nUsage: seqtk mergefa [options] <in1.fa> <in2.fa>\n\n");
+		fprintf(stderr, "Options: -q INT   quality threshold [0]\n");
+		fprintf(stderr, "         -i       take intersection\n");
+		fprintf(stderr, "         -m       convert to lowercase when one of the input base is N\n");
+		fprintf(stderr, "         -r       pick a random allele from het\n");
+		fprintf(stderr, "         -h       suppress hets in the input\n\n");
+		return 1;
+	}
+	for (i = 0; i < 2; ++i) {
+		fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r");
+		seq[i] = kseq_init(fp[i]);
+	}
+	cnt[0] = cnt[1] = cnt[2] = cnt[3] = cnt[4] = 0;
+	srand48(11);
+	while (kseq_read(seq[0]) >= 0) {
+		int min_l, c[2], b[2], is_upper;
+		kseq_read(seq[1]);
+		if (strcmp(seq[0]->name.s, seq[1]->name.s))
+			fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s);
+		if (seq[0]->seq.l != seq[1]->seq.l)
+			fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l);
+		min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l;
+		printf(">%s", seq[0]->name.s);
+		for (l = 0; l < min_l; ++l) {
+			c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l];
+			if (seq[0]->qual.l && seq[0]->qual.s[l] - 33 < qual) c[0] = tolower(c[0]);
+			if (seq[1]->qual.l && seq[1]->qual.s[l] - 33 < qual) c[1] = tolower(c[1]);
+			if (is_intersect) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0;
+			else if (is_mask) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0;
+			else is_upper = (isupper(c[0]) && isupper(c[1]))? 1 : 0;
+			c[0] = seq_nt16_table[c[0]]; c[1] = seq_nt16_table[c[1]];
+			if (c[0] == 0) c[0] = 15;
+			if (c[1] == 0) c[1] = 15;
+			b[0] = bitcnt_table[c[0]];
+			b[1] = bitcnt_table[c[1]];
+			if (is_upper) {
+				if (b[0] == 1 && b[1] == 1) {
+					if (c[0] == c[1]) ++cnt[0];
+					else ++cnt[1];
+				} else if (b[0] == 1 && b[1] == 2) ++cnt[2];
+				else if (b[0] == 2 && b[1] == 1) ++cnt[3];
+				else if (b[0] == 2 && b[1] == 2) ++cnt[4];
+			}
+			if (is_haploid && (b[0] > 1 || b[1] > 1)) is_upper = 0;
+			if (is_intersect) {
+				c[0] = c[0] & c[1];
+				if (c[0] == 0) is_upper = 0; // FIXME: is this a bug - c[0] cannot be 0!
+			} else if (is_mask) {
+				if (c[0] == 15 || c[1] == 15) is_upper = 0;
+				c[0] &= c[1];
+				if (c[0] == 0) is_upper = 0;
+			} else if (is_randhet) {
+				if (b[0] == 1 && b[1] == 1) { // two homs
+					c[0] |= c[1];
+				} else if (((b[0] == 1 && b[1] == 2) || (b[0] == 2 && b[1] == 1)) && (c[0]&c[1])) { // one hom, one het
+					c[0] = (lrand48()&1)? (c[0] & c[1]) : (c[0] | c[1]);
+				} else if (b[0] == 2 && b[1] == 2 && c[0] == c[1]) { // double hets
+					if (lrand48()&1) {
+						if (lrand48()&1) {
+							for (i = 8; i >= 1; i >>= 1) // pick the "larger" allele
+								if (c[0]&i) c[0] &= i;
+						} else {
+							for (i = 1; i <= 8; i <<= 1) // pick the "smaller" allele
+								if (c[0]&i) c[0] &= i;
+						}
+					} // else set as het
+				} else is_upper = 0;
+			} else c[0] |= c[1];
+			c[0] = seq_nt16_rev_table[c[0]];
+			if (!is_upper) c[0] = tolower(c[0]);
+			if (l%60 == 0) putchar('\n');
+			putchar(c[0]);
+		}
+		putchar('\n');
+	}
+	fprintf(stderr, "[%s] (same,diff,hom-het,het-hom,het-het)=(%ld,%ld,%ld,%ld,%ld)\n", __func__, (long)cnt[0], (long)cnt[1], (long)cnt[2], (long)cnt[3], (long)cnt[4]);
+	return 0;
+}
+
+int stk_famask(int argc, char *argv[])
+{
+	gzFile fp[2];
+	kseq_t *seq[2];
+	int i, l;
+	if (argc < 3) {
+		fprintf(stderr, "Usage: seqtk famask <src.fa> <mask.fa>\n");
+		return 1;
+	}
+	for (i = 0; i < 2; ++i) {
+		fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r");
+		seq[i] = kseq_init(fp[i]);
+	}
+	while (kseq_read(seq[0]) >= 0) {
+		int min_l, c[2];
+		kseq_read(seq[1]);
+		if (strcmp(seq[0]->name.s, seq[1]->name.s))
+			fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s);
+		if (seq[0]->seq.l != seq[1]->seq.l)
+			fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l);
+		min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l;
+		printf(">%s", seq[0]->name.s);
+		for (l = 0; l < min_l; ++l) {
+			c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l];
+			if (c[1] == 'x') c[0] = tolower(c[0]);
+			else if (c[1] != 'X') c[0] = c[1];
+			if (l%60 == 0) putchar('\n');
+			putchar(c[0]);
+		}
+		putchar('\n');
+	}
+	return 0;
+}
+
+int stk_mutfa(int argc, char *argv[])
+{
+	khash_t(reg) *h = kh_init(reg);
+	gzFile fp;
+	kseq_t *seq;
+	kstream_t *ks;
+	int l, i, dret;
+	kstring_t *str;
+	khint_t k;
+	if (argc < 3) {
+		fprintf(stderr, "Usage: seqtk mutfa <in.fa> <in.snp>\n\n");
+		fprintf(stderr, "Note: <in.snp> contains at least four columns per line which are:\n");
+		fprintf(stderr, "      'chr  1-based-pos  any  base-changed-to'.\n");
+		return 1;
+	}
+	// read the list
+	str = calloc(1, sizeof(kstring_t));
+	fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
+	ks = ks_init(fp);
+	while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+		char *s = strdup(str->s);
+		int beg = 0, ret;
+		reglist_t *p;
+		k = kh_get(reg, h, s);
+		if (k == kh_end(h)) {
+			k = kh_put(reg, h, s, &ret);
+			memset(&kh_val(h, k), 0, sizeof(reglist_t));
+		}
+		p = &kh_val(h, k);
+		if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col
+		ks_getuntil(ks, 0, str, &dret); // 3rd col
+		ks_getuntil(ks, 0, str, &dret); // 4th col
+		// skip the rest of the line
+		if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
+		if (isalpha(str->s[0]) && str->l == 1) {
+			if (p->n == p->m) {
+				p->m = p->m? p->m<<1 : 4;
+				p->a = realloc(p->a, p->m * 8);
+			}
+			p->a[p->n++] = (uint64_t)beg<<32 | str->s[0];
+		}
+	}
+	ks_destroy(ks);
+	gzclose(fp);
+	free(str->s); free(str);
+	// mutfa
+	fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
+	seq = kseq_init(fp);
+	while ((l = kseq_read(seq)) >= 0) {
+		reglist_t *p;
+		k = kh_get(reg, h, seq->name.s);
+		if (k != kh_end(h)) {
+			p = &kh_val(h, k);
+			for (i = 0; i < p->n; ++i) {
+				int beg = p->a[i]>>32;
+				if (beg < seq->seq.l)
+					seq->seq.s[beg] = (int)p->a[i];
+			}
+		}
+		printf(">%s", seq->name.s);
+		for (i = 0; i < l; ++i) {
+			if (i%60 == 0) putchar('\n');
+			putchar(seq->seq.s[i]);
+		}
+		putchar('\n');
+	}
+	// free
+	kseq_destroy(seq);
+	gzclose(fp);
+	for (k = 0; k < kh_end(h); ++k) {
+		if (kh_exist(h, k)) {
+			free(kh_val(h, k).a);
+			free((char*)kh_key(h, k));
+		}
+	}
+	kh_destroy(reg, h);
+	return 0;
+}
+
+int stk_listhet(int argc, char *argv[])
+{
+	gzFile fp;
+	kseq_t *seq;
+	int i, l;
+	if (argc == 1) {
+		fprintf(stderr, "Usage: seqtk listhet <in.fa>\n");
+		return 1;
+	}
+	fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r");
+	seq = kseq_init(fp);
+	while ((l = kseq_read(seq)) >= 0) {
+		for (i = 0; i < l; ++i) {
+			int b = seq->seq.s[i];
+			if (bitcnt_table[seq_nt16_table[b]] == 2)
+				printf("%s\t%d\t%c\n", seq->name.s, i+1, b);
+		}
+	}
+	kseq_destroy(seq);
+	gzclose(fp);
+	return 0;
+}
+
+/* cutN */
+static int cutN_min_N_tract = 1000;
+static int cutN_nonN_penalty = 10;
+
+static int find_next_cut(const kseq_t *ks, int k, int *begin, int *end)
+{
+	int i, b, e;
+	while (k < ks->seq.l) {
+		if (seq_nt16_table[(int)ks->seq.s[k]] == 15) {
+			int score, max;
+			score = 0; e = max = -1;
+			for (i = k; i < ks->seq.l && score >= 0; ++i) { /* forward */
+				if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score;
+				else score -= cutN_nonN_penalty;
+				if (score > max) max = score, e = i;
+			}
+			score = 0; b = max = -1;
+			for (i = e; i >= 0 && score >= 0; --i) { /* backward */
+				if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score;
+				else score -= cutN_nonN_penalty;
+				if (score > max) max = score, b = i;
+			}
+			if (e + 1 - b >= cutN_min_N_tract) {
+				*begin = b;
+				*end = e + 1;
+				return *end;
+			}
+			k = e + 1;
+		} else ++k;
+	}
+	return -1;
+}
+static void print_seq(FILE *fpout, const kseq_t *ks, int begin, int end)
+{
+	int i;
+	if (begin >= end) return; // FIXME: why may this happen? Understand it!
+	fprintf(fpout, ">%s:%d-%d", ks->name.s, begin+1, end);
+	for (i = begin; i < end && i < ks->seq.l; ++i) {
+		if ((i - begin)%60 == 0) fputc('\n', fpout);
+		fputc(ks->seq.s[i], fpout);
+	}
+	fputc('\n', fpout);
+}
+int stk_cutN(int argc, char *argv[])
+{
+	int c, l, gap_only = 0;
+	gzFile fp;
+	kseq_t *ks;
+	while ((c = getopt(argc, argv, "n:p:g")) >= 0) {
+		switch (c) {
+		case 'n': cutN_min_N_tract = atoi(optarg); break;
+		case 'p': cutN_nonN_penalty = atoi(optarg); break;
+		case 'g': gap_only = 1; break;
+		default: return 1;
+		}
+	}
+	if (argc == optind) {
+		fprintf(stderr, "\n");
+		fprintf(stderr, "Usage:   seqtk cutN [options] <in.fa>\n\n");
+		fprintf(stderr, "Options: -n INT    min size of N tract [%d]\n", cutN_min_N_tract);
+		fprintf(stderr, "         -p INT    penalty for a non-N [%d]\n", cutN_nonN_penalty);
+		fprintf(stderr, "         -g        print gaps only, no sequence\n\n");
+		return 1;
+	}
+	fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+	ks = kseq_init(fp);
+	while ((l = kseq_read(ks)) >= 0) {
+		int k = 0, begin = 0, end = 0;
+		while (find_next_cut(ks, k, &begin, &end) >= 0) {
+			if (begin != 0) {
+				if (gap_only) printf("%s\t%d\t%d\n", ks->name.s, begin, end);
+				else print_seq(stdout, ks, k, begin);
+			}
+			k = end;
+		}
+		if (!gap_only) print_seq(stdout, ks, k, l);
+	}
+	kseq_destroy(ks);
+	gzclose(fp);
+	return 0;
+}
+
+int stk_hrun(int argc, char *argv[])
+{
+	gzFile fp;
+	kseq_t *ks;
+	int min_len = 7, l = 0, c = 0, beg = 0, i;
+	if (argc == optind) {
+		fprintf(stderr, "Usage: seqtk hrun <in.fa> [minLen=%d]\n", min_len);
+		return 1;
+	}
+	if (argc == optind + 2) min_len = atoi(argv[optind+1]);
+	fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+	ks = kseq_init(fp);
+	while (kseq_read(ks) >= 0) {
+		c = ks->seq.s[0]; l = 1; beg = 0;
+		for (i = 1; i < ks->seq.l; ++i) {
+			if (ks->seq.s[i] != c) {
+				if (l >= min_len) printf("%s\t%d\t%d\t%c\n", ks->name.s, beg, beg + l, c);
+				c = ks->seq.s[i]; l = 1; beg = i;
+			} else ++l;
+		}
+	}
+	if (l >= min_len) printf("%s\t%d\t%d\t%c\n", ks->name.s, beg, beg + l, c);
+	kseq_destroy(ks);
+	gzclose(fp);
+	return 0;
+}
+
+/* sample */
+
+static void cpy_kstr(kstring_t *dst, const kstring_t *src)
+{
+	if (src->l == 0) return;
+	if (src->l + 1 > dst->m) {
+		dst->m = src->l + 1;
+		kroundup32(dst->m);
+		dst->s = realloc(dst->s, dst->m);
+	}
+	dst->l = src->l;
+	memcpy(dst->s, src->s, src->l + 1);
+}
+
+static void cpy_kseq(kseq_t *dst, const kseq_t *src)
+{
+	cpy_kstr(&dst->name, &src->name);
+	cpy_kstr(&dst->seq,  &src->seq);
+	cpy_kstr(&dst->qual, &src->qual);
+}
+
+int stk_sample(int argc, char *argv[])
+{
+	int c;
+	uint64_t i, num = 0, n_seqs = 0;
+	double frac = 0.;
+	gzFile fp;
+	kseq_t *seq, *buf = 0;
+
+	srand48(11);
+	while ((c = getopt(argc, argv, "s:")) >= 0)
+		switch (c) {
+		case 's': srand48(atoi(optarg)); break;
+		}
+	if (optind + 2 > argc) {
+		fprintf(stderr, "Usage: seqtk sample [-s seed=11] <in.fa> <frac>|<number>\n\n");
+		fprintf(stderr, "Warning: Large memory consumption for large <number>.\n");
+		return 1;
+	}
+	frac = atof(argv[optind+1]);
+	if (frac > 1.) num = (uint64_t)(frac + .499), frac = 0.;
+	if (num > 0) buf = calloc(num, sizeof(kseq_t));
+
+	fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+	seq = kseq_init(fp);
+	while (kseq_read(seq) >= 0) {
+		double r = drand48();
+		++n_seqs;
+		if (num) {
+			uint64_t y = n_seqs - 1 < num? n_seqs - 1 : (uint64_t)(r * n_seqs);
+			if (y < num) cpy_kseq(&buf[y], seq);
+		} else if (r < frac) stk_printseq(seq, UINT_MAX);
+	}
+	kseq_destroy(seq);
+	gzclose(fp);
+	for (i = 0; i < num; ++i) {
+		kseq_t *p = &buf[i];
+		if (p->seq.l) stk_printseq(p, UINT_MAX);
+		free(p->seq.s); free(p->qual.s); free(p->name.s);
+	}
+	free(buf);
+	return 0;
+}
+
+/* seq */
+
+void stk_mask(kseq_t *seq, const khash_t(reg) *h, int is_complement, int mask_chr)
+{
+	unsigned i, j;
+	khiter_t k;
+	k = kh_get(reg, h, seq->name.s);
+	if (k == kh_end(h)) { // not found in the hash table
+		if (is_complement) {
+			if (mask_chr) {
+				for (j = 0; j < seq->seq.l; ++j)
+					seq->seq.s[j] = mask_chr;
+			} else {
+				for (j = 0; j < seq->seq.l; ++j)
+					seq->seq.s[j] = tolower(seq->seq.s[j]);
+			}
+		}
+	} else {
+		reglist_t *p = &kh_val(h, k);
+		if (!is_complement) {
+			for (i = 0; i < p->n; ++i) {
+				unsigned beg = p->a[i]>>32, end = p->a[i];
+				if (beg >= seq->seq.l) continue;
+				if (end > seq->seq.l) end = seq->seq.l;
+				if (!mask_chr) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]);
+				else for (j = beg; j < end; ++j) seq->seq.s[j] = mask_chr;
+			}
+		} else {
+			int8_t *mask = calloc(seq->seq.l, 1);
+			for (i = 0; i < p->n; ++i) {
+				unsigned beg = p->a[i]>>32, end = p->a[i];
+				if (end >= seq->seq.l) end = seq->seq.l;
+				for (j = beg; j < end; ++j) mask[j] = 1;
+			}
+			if (mask_chr) {
+				for (j = 0; j < seq->seq.l; ++j)
+					if (mask[j] == 0) seq->seq.s[j] = mask_chr;
+			} else {
+				for (j = 0; j < seq->seq.l; ++j)
+					if (mask[j] == 0) seq->seq.s[j] = tolower(seq->seq.s[j]);
+			}
+			free(mask);
+		}
+	}
+}
+
+int stk_seq(int argc, char *argv[])
+{
+	gzFile fp;
+	kseq_t *seq;
+	int c, qual_thres = 0, flag = 0, qual_shift = 33, mask_chr = 0, min_len = 0;
+	unsigned line_len = 0;
+	double frac = 1.;
+	khash_t(reg) *h = 0;
+
+	srand48(11);
+	while ((c = getopt(argc, argv, "q:l:Q:aACrn:s:f:M:L:c")) >= 0) {
+		switch (c) {
+			case 'a':
+			case 'A': flag |= 1; break;
+			case 'C': flag |= 2; break;
+			case 'r': flag |= 4; break;
+			case 'c': flag |= 8; break;
+			case 'M': h = stk_reg_read(optarg); break;
+			case 'n': mask_chr = *optarg; break;
+			case 'Q': qual_shift = atoi(optarg); break;
+			case 'q': qual_thres = atoi(optarg); break;
+			case 'l': line_len = atoi(optarg); break;
+			case 'L': min_len = atoi(optarg); break;
+			case 's': srand48(atoi(optarg)); break;
+			case 'f': frac = atof(optarg); break;
+		}
+	}
+	if (argc == optind) {
+		fprintf(stderr, "\n");
+		fprintf(stderr, "Usage:   seqtk seq [options] <in.fq>|<in.fa>\n\n");
+		fprintf(stderr, "Options: -q INT    mask bases with quality lower than INT [0]\n");
+		fprintf(stderr, "         -n CHAR   masked bases converted to CHAR; 0 for lowercase [0]\n");
+		fprintf(stderr, "         -l INT    number of residues per line; 0 for 2^32-1 [%d]\n", line_len);
+		fprintf(stderr, "         -Q INT    quality shift: ASCII-INT gives base quality [%d]\n", qual_shift);
+		fprintf(stderr, "         -s INT    random seed (effective with -f) [11]\n");
+		fprintf(stderr, "         -f FLOAT  sample FLOAT fraction of sequences [1]\n");
+		fprintf(stderr, "         -M FILE   mask regions in BED or name list FILE [null]\n");
+		fprintf(stderr, "         -L INT    drop sequences with length shorter than INT [0]\n");
+		fprintf(stderr, "         -c        mask complement region (effective with -M)\n");
+		fprintf(stderr, "         -r        reverse complement\n");
+		fprintf(stderr, "         -A        force FASTA output (discard quality)\n");
+		fprintf(stderr, "         -C        drop comments at the header lines\n");
+		fprintf(stderr, "\n");
+		return 1;
+	}
+	if (line_len == 0) line_len = UINT_MAX;
+	fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+	seq = kseq_init(fp);
+	qual_thres += qual_shift;
+	while (kseq_read(seq) >= 0) {
+		if (seq->seq.l < min_len) continue; // NB: length filter before taking random
+		if (frac < 1. && drand48() >= frac) continue;
+		if (seq->qual.l && qual_thres > qual_shift) {
+			unsigned i;
+			if (mask_chr) {
+				for (i = 0; i < seq->seq.l; ++i)
+					if (seq->qual.s[i] < qual_thres)
+						seq->seq.s[i] = mask_chr;
+			} else {
+				for (i = 0; i < seq->seq.l; ++i)
+					if (seq->qual.s[i] < qual_thres)
+						seq->seq.s[i] = tolower(seq->seq.s[i]);
+			}
+		}
+		if (flag & 1) seq->qual.l = 0;
+		if (flag & 2) seq->comment.l = 0;
+		if (h) stk_mask(seq, h, flag&8, mask_chr); // masking
+		if (flag & 4) { // reverse complement
+			int c0, c1;
+			unsigned i;
+			for (i = 0; i < seq->seq.l>>1; ++i) { // reverse complement sequence
+				c0 = comp_tab[(int)seq->seq.s[i]];
+				c1 = comp_tab[(int)seq->seq.s[seq->seq.l - 1 - i]];
+				seq->seq.s[i] = c1;
+				seq->seq.s[seq->seq.l - 1 - i] = c0;
+			}
+			if (seq->seq.l & 1) // complement the remaining base
+				seq->seq.s[seq->seq.l>>1] = comp_tab[(int)seq->seq.s[seq->seq.l>>1]];
+			if (seq->qual.l) {
+				for (i = 0; i < seq->seq.l>>1; ++i) // reverse quality
+					c0 = seq->qual.s[i], seq->qual.s[i] = seq->qual.s[seq->qual.l - 1 - i], seq->qual.s[seq->qual.l - 1 - i] = c0;
+			}
+		}
+		stk_printseq(seq, line_len);
+	}
+	kseq_destroy(seq);
+	gzclose(fp);
+	stk_reg_destroy(h);
+	return 0;
+}
+
+/* main function */
+static int usage()
+{
+	fprintf(stderr, "\n");
+	fprintf(stderr, "Usage:   seqtk <command> <arguments>\n");
+	fprintf(stderr, "Version: 1.0-r31\n\n");
+	fprintf(stderr, "Command: seq       common transformation of FASTA/Q\n");
+	fprintf(stderr, "         comp      get the nucleotide composition of FASTA/Q\n");
+	fprintf(stderr, "         sample    subsample sequences\n");
+	fprintf(stderr, "         subseq    extract subsequences from FASTA/Q\n");
+	fprintf(stderr, "         trimfq    trim FASTQ using the Phred algorithm\n\n");
+	fprintf(stderr, "         hety      regional heterozygosity\n");
+	fprintf(stderr, "         mutfa     point mutate FASTA at specified positions\n");
+	fprintf(stderr, "         mergefa   merge two FASTA/Q files\n");
+	fprintf(stderr, "         randbase  choose a random base from hets\n");
+	fprintf(stderr, "         cutN      cut sequence at long N\n");
+	fprintf(stderr, "         listhet   extract the position of each het\n");
+	fprintf(stderr, "\n");
+	return 1;
+}
+
+int main(int argc, char *argv[])
+{
+	if (argc == 1) return usage();
+	if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1);
+	else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1);
+	else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1);
+	else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1);
+	else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1);
+	else if (strcmp(argv[1], "randbase") == 0) stk_randbase(argc-1, argv+1);
+	else if (strcmp(argv[1], "cutN") == 0) stk_cutN(argc-1, argv+1);
+	else if (strcmp(argv[1], "listhet") == 0) stk_listhet(argc-1, argv+1);
+	else if (strcmp(argv[1], "famask") == 0) stk_famask(argc-1, argv+1);
+	else if (strcmp(argv[1], "trimfq") == 0) stk_trimfq(argc-1, argv+1);
+	else if (strcmp(argv[1], "hrun") == 0) stk_hrun(argc-1, argv+1);
+	else if (strcmp(argv[1], "sample") == 0) stk_sample(argc-1, argv+1);
+	else if (strcmp(argv[1], "seq") == 0) stk_seq(argc-1, argv+1);
+	else {
+		fprintf(stderr, "[main] unrecognized commad '%s'. Abort!\n", argv[1]);
+		return 1;
+	}
+	return 0;
+}

-- 
Alioth's /git/debian-med/git-commit-notice on /srv/git.debian.org/git/debian-med/seqtk.git



More information about the debian-med-commit mailing list