[med-svn] [ea-utils] 01/03: Imported Upstream version 1.1.2+dfsg

Sat Jul 25 06:14:41 UTC 2015

This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository ea-utils.

commit 8ad9ba2d886b19085ec277b1120739ca2a7ad8ce
Author: Andreas Tille <tille at debian.org>
Date:   Sat Jul 25 08:11:59 2015 +0200

    Imported Upstream version 1.1.2+dfsg
---
 CHANGES            |   14 +
 Makefile           |  105 ++++
 README             |   63 ++
 alc                |  178 ++++++
 bam.c              |  474 ++++++++++++++
 bam_aux.c          |  213 +++++++
 bam_cat.c          |  185 ++++++
 bam_import.c       |  489 +++++++++++++++
 bam_index.c        |  724 ++++++++++++++++++++++
 bam_lpileup.c      |  198 ++++++
 bam_md.c           |  389 ++++++++++++
 bam_pileup.c       |  437 +++++++++++++
 bam_reheader.c     |   62 ++
 bam_sort.c         |  566 +++++++++++++++++
 bedidx.c           |  162 +++++
 bgzf.c             |  694 +++++++++++++++++++++
 determine-phred    |   86 +++
 ea-utils.spec      |   37 ++
 faidx.c            |  437 +++++++++++++
 fastq-clipper.c    |  279 +++++++++
 fastq-join.c       |  424 +++++++++++++
 fastq-lib.cpp      |  375 +++++++++++
 fastq-lib.h        |  113 ++++
 fastq-mcf.c        | 1697 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fastq-multx.c      | 1087 ++++++++++++++++++++++++++++++++
 fastq-stats.cpp    |  672 ++++++++++++++++++++
 fastx-graph        |  149 +++++
 gcModel.c          |  207 +++++++
 gcModel.h          |    7 +
 gtf2bed            |  116 ++++
 kaln.c             |  486 +++++++++++++++
 knetfile.c         |  632 +++++++++++++++++++
 kprobaln.c         |  280 +++++++++
 kstring.c          |  212 +++++++
 padding.c          |  479 +++++++++++++++
 phase.c            |  687 +++++++++++++++++++++
 randomFQ           |  245 ++++++++
 razf.c             |  853 +++++++++++++++++++++++++
 razip.c            |  141 +++++
 sam-stats.cpp      | 1121 +++++++++++++++++++++++++++++++++
 sam.c              |  186 ++++++
 sam_header.c       |  772 +++++++++++++++++++++++
 tidx/fastq-lib.cpp |    1 +
 tidx/fastq-lib.h   |    1 +
 tidx/tidx-lib.cpp  |  436 +++++++++++++
 tidx/tidx.cpp      |  220 +++++++
 tidx/tidx.h        |   43 ++
 tidx/utils.cpp     |    1 +
 tidx/utils.h       |    1 +
 utils.h            |    5 +
 varcall.cpp        | 1744 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 51 files changed, 19185 insertions(+)

diff --git a/CHANGES b/CHANGES
new file mode 100644
index 0000000..f313903
--- /dev/null
+++ b/CHANGES
@@ -0,0 +1,14 @@
+r181	- CASAVA purity filtering
+r154	- fixed major bug in RMN's that would invalidate reads
+r152	- allowed short adapters to work at the 'begin' of reads
+r171	- paired-ends get trimmed like anything else....other behavior was too conservative
+r258    - gzip support on input and output, append barcode to unmatched id
+r353    - support for dual-indexed nextera reads in fastq-multx, new defaults based on ROC curve analysis
+r401    - added -L, included google dir, included build for sam-stats
+r408    - updated fastq-join docs, changed default mismatch to 8%
+r425    - fastq-mcf filtering options, multx verif char fixed
+r474    - RNAmode & coverage stats output
+r475    - fix paired-end forward/reverse counts
+r534    - -S can be before -R, also ver num increment
+r551    - eventer -l 0 bug fix, debug output improvement
+r558    - sam-stats snp rate change, fastq-lib poorqual N's issue fixed, buffering added (todo: add to lib)
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..36b0b5b
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,105 @@
+#
+# $Id: Makefile 670 2013-12-13 16:47:07Z earonesty $
+#
+# Build, install and packaging rules for ea-utils.
+# CC, PREFIX, BINDIR and CFLAGS may be overridden from the environment or
+# from the make command line.
+
+CC=g++
+PREFIX?=/usr
+BINDIR?=$(PREFIX)/bin
+# Every compile recipe below reads $(CFLAGS) only.  CPPFLAGS is still defined
+# for backward compatibility, but no rule in this file references it.
+CFLAGS?=-O3 -I.
+CPPFLAGS?=-O3 -I.
+# for debugging:
+# CFLAGS?=-g -I. 
+# CPPFLAGS?=-g -I.
+
+PKG=ea-utils
+REL := $(shell svnversion 2>/dev/null | perl -ne 'print $$1 if /:(\d+)/' )
+VER := $(shell grep '%define ver' ${PKG}.spec | perl -ne 'print $$1 if / (\S+) *$$/')
+
+SRC=fastq-clipper.c fastq-mcf.c fastq-multx.c fastq-join.c fastq-stats.cpp gcModel.c
+BIN=fastq-mcf fastq-multx fastq-join fastq-stats fastq-clipper sam-stats varcall
+TOOLS=fastx-graph gtf2bed determine-phred randomFQ alc
+
+all: $(BIN)
+
+# Debug build of varcall.  The flags are passed as CFLAGS because that is the
+# only flag variable the compile recipes use; setting CPPFLAGS had no effect.
+# $(MAKE) already forwards the parent's flags, so $(MFLAGS) is not repeated.
+debug:
+	$(MAKE) CFLAGS="-g -I." varcall
+
+install: $(BIN) $(BINDIR)/fastq-clipper $(BINDIR)/fastq-mcf $(BINDIR)/fastq-multx $(BINDIR)/fastq-join $(BINDIR)/fastq-stats $(BINDIR)/sam-stats $(BINDIR)/varcall $(BINDIR)/fastx-graph $(BINDIR)/determine-phred $(BINDIR)/randomFQ $(BINDIR)/alc
+
+$(BINDIR):
+	mkdir -p $(BINDIR)
+
+# $(BINDIR) is order-only: the directory must exist, but its timestamp (which
+# changes whenever any file is copied into it) must not trigger re-copying.
+$(BINDIR)/%: % | $(BINDIR)
+	cp $< $@
+
+dist: getrel $(PKG).${VER}-${REL}.tar.gz
+
+# these shenanigans are done to ensure than the release in the spec file is the same as the subversion release
+# a less verbose way should be possible
+
+getrel:
+	grep "${REL}" $(PKG).spec || touch $(PKG).spex
+
+# Targets that name actions, not files.
+.PHONY: all debug install dist getrel disttest clean
+
+$(PKG).spec: $(PKG).spex
+	perl -pe 's/%RELEASE%/${REL}/' $(PKG).spex > $(PKG).spec
+
+$(PKG).tar.gz: Makefile $(TOOLS) $(SRC) $(PKG).spec fastq-lib.cpp fastq-lib.h sam-stats.cpp fastq-stats.cpp gcModel.c gcModel.h varcall.cpp utils.h README CHANGES google sparsehash samtools/*.c
+	rm -rf $(PKG).${VER}-${REL}
+	mkdir $(PKG).${VER}-${REL}
+	mkdir $(PKG).${VER}-${REL}/tidx
+	mkdir $(PKG).${VER}-${REL}/samtools
+	cp -nr $^ $(PKG).${VER}-${REL}
+	cp -nr tidx/*.cpp tidx/*.h $(PKG).${VER}-${REL}/tidx
+	cp -nr samtools/*.c samtools/*.h samtools/Makefile $(PKG).${VER}-${REL}/samtools
+	tar --exclude=".svn" -cvzf $(PKG).tar.gz $(PKG).${VER}-${REL}
+	rm -rf $(PKG).${VER}-${REL}
+
+# Unpack the tarball and check that it builds.  $(MAKE) (not a literal
+# "make") keeps -j/-n and the jobserver working in the sub-build.
+disttest: $(PKG).tar.gz
+	tar -xzvf $(PKG).tar.gz
+	$(MAKE) -C $(PKG).${VER}-${REL}
+	rm -rf $(PKG).${VER}-${REL}
+
+$(PKG).${VER}-${REL}.tar.gz: $(PKG).tar.gz
+	cp $< $@
+
+%: %.c fastq-lib.cpp fastq-lib.h
+	$(CC) $(CFLAGS) fastq-lib.cpp -o $@ $<
+
+%: %.cpp fastq-lib.cpp fastq-lib.h
+	$(CC) $(CFLAGS) fastq-lib.cpp -o $@ $<
+
+
+%: %.c gcModel.c gcModel.h
+	$(CC) $(CFLAGS) gcModel.c -o $@ $<
+
+%: %.cpp gcModel.c gcModel.h
+	$(CC) $(CFLAGS) gcModel.c -o $@ $<
+
+# why the libbam.a doesn't work?  not sure... *.o works
+sam-stats: sam-stats.cpp samtools/libbam.a samtools/bam.h fastq-lib.h
+ifeq ($(OS),Windows_NT)
+	$(CC) $(CFLAGS) samtools/*.o -lz -lpthread -lws2_32 fastq-lib.cpp $< -o $@
+else
+	$(CC) $(CFLAGS) samtools/*.o -lz -lpthread fastq-lib.cpp $< -o $@
+endif
+
+samtools/libbam.a: samtools/*.c samtools/*.h
+	$(MAKE) -C samtools libbam.a
+
+varcall: varcall.cpp fastq-lib.cpp tidx/tidx-lib.cpp
+ifeq ($(OS),Windows_NT)
+	echo varcall: not supported yet
+else
+	$(CC) $(CFLAGS) fastq-lib.cpp tidx/tidx-lib.cpp -o $@ $< -lgsl -lgslcblas
+endif
+
+fastq-stats: fastq-stats.cpp fastq-lib.cpp gcModel.c
+	$(CC) $(CFLAGS) fastq-lib.cpp gcModel.c -o $@ $<
+
+bam-filter:  bam-filter.cpp
+	$(CC) $(CFLAGS) fastq-lib.cpp -o $@  $< -lbamtools
+
+clean:
+	rm -f *.o $(BIN)
+	$(MAKE) -C samtools clean
diff --git a/README b/README
new file mode 100644
index 0000000..8700228
--- /dev/null
+++ b/README
@@ -0,0 +1,63 @@
+OVERVIEW:
+
+fastq-mcf
+
+Scans a sequence file for adapters, and, based on a log-scaled threshold, determines a set of clipping parameters and performs clipping. Also does skewing detection and quality filtering.
+
+fastq-multx
+
+Demultiplexes a fastq. Capable of auto-determining barcode id's based on a master set fields. Keeps multiple reads in-sync during demultiplexing. Can verify that the reads are in-sync as well, and fail if they're not.
+
+fastq-join
+Similar to audy's stitch program, but in C, more efficient and supports some automatic benchmarking and tuning. It uses the same "squared distance for anchored alignment" as other tools.
+
+fastq-stats
+Outputs stats for fastqs
+
+sam-stats
+Output stats for sam/bam files
+
+varcall
+Variant caller, takes bam or pileup output and does variant calling with advanced features like PCR duplicate filtering, homopolymer repeat filtering, calculation of error rate and detectability (minimum percentage) thresholds.
+
+REQUIRES:
+
+For building sam-stats, please install this first!
+
+https://github.com/pezmaster31/bamtools/wiki/Building-and-installing
+
+QUICK FAQ:
+
+This is based on feedback/emails, etc.
+
+fastq-mcf does a 300k sub-sampling to determine what to do.   There are lots of parameters to play with, but the "automatic" mode should do the right thing most of the time.  If it doesn't, I really would like to hear why/what it did.  The point in this tool is that the basic quality and adapter filtering should be something that's done automagically 90% of the time - not by manually picking parameters for each run.   The fact that it's making decisions "for the user" means it will probabl [...]
+
+If you want fastq-mcf to be similar to other tools, you need to pass -m XX, and -s 100, so it's a fixed-length.  If you try running with unrealistic, or "test" data, the heuristic won't work.  Instead, try with a subsample of 50000 or so "real" reads.
+
+fastq-mcf doubles as a read-filtering program, it supports a broad range of filtering arguments.
+
+fastq-join produces a "report".  This is just a list of lengths of joined reads.   Also it chooses the "better quality base" when overlapping.  Very stable code at this point.
+
+fastq-multx is intended to keep mates in sync, so you can demultiplex in one-pass.  For single-reads, it's not better than other tools out there, except that you don't need to predefine your sets... which can help logistics in high-volume situations.  Also, notice the output file's "%-sign" substitution... this is instead of lots of prefix and suffix arguments.  Mismatch algorithm is "maximal unique"... ie... if it's possible that 2 barcodes can match, it won't use *either*.   Qualities  [...]
+
+Dual-indexed codes are listed as SEQUENCE-SEQUENCE in the barcode file.  I haven't tried mixing them with others on the autodetect code, I can't imagine there's a reason to do that.
+
+The latest version can ignore bases that have extremely low qualities (<5), and refuses to match a barcode that isn't a minimum distance from another best match.   It's a lot safer, but for some poor-quality runs these features will need to be disabled.
+
+sam-stats takes a lot of options for a variety of reports.   The most important ones to note are -D, which builds a huge hash of probe ids, and -R which produces a coverage matrix.   It could autodetect if reads are sorted by probe ID and save RAM.   It could also reduce RAM by removing common prefixes from the hash after some X reads.   It doesn't do those things now.
+
+INSTALL:
+
+Should be able to run "make install" on most machines that have g++ installed.  On windows, install a copy of the MinGW environment.   You'll need zlib installed for some tools.   fastq-mcf, fastq-stats, etc.  are pretty basic, and work without any external libs.
+
+Example:
+
+PREFIX=/usr make install
+
+OR to a subdir:
+
+BINDIR=/usr/bin/ea-utils make install
+
+Or with other options:
+
+CC=g++ PREFIX=/usr/local make install
diff --git a/alc b/alc
new file mode 100755
index 0000000..54615f2
--- /dev/null
+++ b/alc
@@ -0,0 +1,178 @@
+#!/usr/bin/perl
+
+# Copyright (c) 2011 Erik Aronesty (erik at q32.com)
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+# 
+# NOTE: Please let me know if you use it or like it, AKA: "Thank You Mask Man"
+
+use strict;
+use Getopt::Long;
+our $VERSION = 1.2;
+
+my %nl;			# estimated line count per file, keyed by file name
+my $w=200000;		# window size
+my $x=10;		# segment count
+my $only;
+GetOptions("only"=>\$only, "window=i"=>\$w, "segs=i"=>\$x);
+
+die usage() unless @ARGV;
+die usage() if $only && @ARGV > 1;
+$|=1;
+#$x-=1;
+my $tl=0;		# running total over all files
+for my $f (@ARGV) {
+	my $s = -s $f;			# size on disk, in bytes
+	my $gz = $f =~ /\.gz$/;
+	my $nl = 0;
+	if ($gz) {
+		# Compressed input is only ever read through the "gunzip -c"
+		# pipelines below, so no file handle is opened for it here.
+		if ( $s > $w ) {
+			# todo, this can be done all-in-one stream, by buffering the read and gzipping in a loop
+			# it will be much faster!
+			my (@bz, @lz);
+			# Number of sample points for the fit.  This is a separate
+			# variable on purpose: assigning to $x here would change the
+			# segment count used for every plain file later in @ARGV.
+			my $pts = 5;                # 5 points
+			my $ss = $w/$pts;
+			my $exact = 0;
+			for (my $i=0; $i<$pts; ++$i) {
+				my $o = int($ss * ($i+1));
+				# compressed size of the first $o lines
+				$bz[$i]=`gunzip -c '$f' | head -$o | gzip -c | wc -c`;
+				if (abs($s-$bz[$i]) < ($ss+length($f))) {
+					# the sample is about as large as the whole file: count it exactly
+					$nl=`gunzip -c '$f' | wc -l`;
+					$exact = 1;
+					last;
+				}
+				$lz[$i]=$s*$o/$bz[$i];
+#				warn("o: ", $o, ", lzi:", $lz[$i], ", bzi:", $bz[$i], "\n");
+			}
+
+			if (!$exact) {
+				# simple linear model log(bytes)~lines
+				my ($xxs, $xys, $xs, $ys);
+				for (my $i=0; $i<$pts; ++$i) {
+					$xs+=log($bz[$i]);
+					$ys+=$lz[$i];
+					$xxs+=log($bz[$i])*log($bz[$i]);
+					$xys+=$lz[$i]*log($bz[$i]);
+				}
+				my $slope = ($xys - $xs*$ys/$pts) / ($xxs - $xs*$xs/$pts);
+				my $intercept = ($ys - ($slope*$xs))/$pts;
+#				warn("intercept: $intercept, slope: $slope, s: $s\n");
+				# log size predictive of lines.....
+				$nl = $intercept + $slope * log($s);
+			}
+		} else {
+			# small enough to count exactly
+			$nl = `gunzip -c '$f' | wc -l` + 0;
+		}
+	} else {
+		# Three-argument open: the file name is never parsed as a mode or a pipe.
+		open(IN, '<', $f) || (warn("$f: $!\n"), next);
+		my $rc = 0;			# bytes actually sampled (whole lines only)
+		if ($s > ($w*2)) {
+			# sample $x segments of $w/$x bytes spread across the file
+			my $ss = $x == 1 ? $w : (($s-($w/$x))/($x-1));
+			my $d;
+#			warn("segments: $x, window: $w, ss:$ss\n");
+			for (my $i=0;$i<$x;++$i) {
+				seek(IN, $i*$ss, 0) if $i > 0;
+				read(IN, $d, $w/$x);
+				#whole lines only, prevents overcounting
+				if ($i > 0) {
+					# seek before current block, getting more accurate "long-first-lines"
+					my $z;
+					seek(IN, $i*$ss-1000, 0);
+					read(IN, $z, 1000);
+					$z=substr($z, rindex($z, "\n")+1);
+					$d=$z.$d;
+				}
+				$d=substr($d, 0, rindex($d, "\n")+1);           # remove last line
+				$rc += length($d);
+				$nl += xlc($d);
+			}
+		} else {
+			my $d;
+			read(IN, $d, $w);
+			$d=substr($d, 0, rindex($d, "\n")+1);               # remove last line
+			$rc += length($d);
+			$nl += xlc($d);
+		}
+		close(IN);
+		# scale the sampled line count up to the full file size
+		$nl = int($rc > 0 ? $nl * $s / $rc : 0);
+	}
+	$tl += $nl;
+	print "$nl" if $only;
+	print "\n" if $only && -t STDOUT;
+	$nl{$f} = $nl if !$only;
+}
+
+if (!$only) {
+	my $m=length($tl)+1;		# column width: digits of the total plus one
+	for my $f (@ARGV) {
+		printf "%*d %s\n", $m, $nl{$f}, $f;
+	}
+	printf "%*d %s\n", $m, $tl, "total" if @ARGV > 1;
+}
+
+# Count the lines held in a string: each newline ends one line, and any text
+# left after the last newline counts as one more (unterminated) line.
+sub xlc {
+	my ($text) = @_;
+	my $lines = ($text =~ tr/\n//);
+	# offset just past the final newline (0 when there is none)
+	my $tail_start = rindex($text, "\n") + 1;
+	++$lines if $tail_start < length($text);
+	return $lines;
+}
+
+# Return the command-line help text; the heredoc is the sub's only
+# expression and therefore its return value.  Callers pass it to die().
+sub usage {<<EOF
+Usage: alc [-o] <file1> [<file2>] ...
+
+Approximate line counts for each file.  Attempts to be 
+somewhat compatible with "wc -l" by default.
+
+-o|--only            Output line count only for a single file.
+-w|--window <int>    Read <int> bytes from head, mid, and tail.
+-s|--segs <int>      Divide file & window into <int> segments.
+EOF
+};
+
+__END__
+
+=head1 NAME
+
+alc - Approximate line count
+
+=head1 DESCRIPTION
+
+Approximate line counts for each file.  Attempts to be
+somewhat compatible with "wc -l" by default.
+
+=head1 AUTHOR
+
+Erik Aronesty C<earonesty at cpan.org>
+
+=head1 LICENSE
+
+This program is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+See L<http://www.perl.com/perl/misc/Artistic.html>.
+
+=cut
diff --git a/bam.c b/bam.c
new file mode 100644
index 0000000..b00d6a6
--- /dev/null
+++ b/bam.c
@@ -0,0 +1,474 @@
+#include <stdio.h>
+#include <ctype.h>
+#include <errno.h>
+#include <assert.h>
+#include "bam.h"
+#include "bam_endian.h"
+#include "kstring.h"
+#include "sam_header.h"
+
+int bam_is_be = 0, bam_verbose = 2, bam_no_B = 0;
+char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";
+
+/**************************
+ * CIGAR related routines *
+ **************************/
+
+/*
+ * Walk the CIGAR of an alignment and return the end coordinate reached,
+ * starting from c->pos.
+ *
+ * Operations whose bam_cigar_type() has bit 2 set advance the end by their
+ * length (presumably "consumes reference" -- confirm against bam.h).  A
+ * BAM_CBACK operation moves the end backward: earlier operations are scanned
+ * in reverse until `len` query bases have been covered, and the matching
+ * amount of bit-2 length (v) is subtracted.  A BAM_CBACK in the last CIGAR
+ * slot is ignored.
+ */
+uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar)
+{
+	int k, end = c->pos;
+	for (k = 0; k < c->n_cigar; ++k) {
+		int op  = bam_cigar_op(cigar[k]);
+		int len = bam_cigar_oplen(cigar[k]);
+		if (op == BAM_CBACK) { // move backward
+			// u: query bases covered so far while looking back
+			// v: amount to take off `end`
+			int l, u, v;
+			if (k == c->n_cigar - 1) break; // skip trailing 'B'
+			for (l = k - 1, u = v = 0; l >= 0; --l) {
+				int op1  = bam_cigar_op(cigar[l]);
+				int len1 = bam_cigar_oplen(cigar[l]);
+				if (bam_cigar_type(op1)&1) { // consume query
+					if (u + len1 >= len) { // stop
+						// only the part of this op needed to reach `len` counts
+						if (bam_cigar_type(op1)&2) v += len - u;
+						break;
+					} else u += len1;
+				}
+				if (bam_cigar_type(op1)&2) v += len1;
+			}
+			// the look-back ran off the start of the CIGAR: fall back to c->pos
+			end = l < 0? c->pos : end - v;
+		} else if (bam_cigar_type(op)&2) end += bam_cigar_oplen(cigar[k]);
+	}
+	return end;
+}
+
+/*
+ * Return the sum of the lengths of all CIGAR operations whose
+ * bam_cigar_type() has bit 1 set.
+ */
+int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar)
+{
+	int32_t qlen = 0;
+	uint32_t idx = 0;
+	while (idx < c->n_cigar) {
+		const uint32_t packed = cigar[idx++];
+		if (bam_cigar_type(bam_cigar_op(packed)) & 1)
+			qlen += bam_cigar_oplen(packed);
+	}
+	return qlen;
+}
+
+/********************
+ * BAM I/O routines *
+ ********************/
+
+/*
+ * Allocate a zero-filled header.  As a side effect the global bam_is_be is
+ * refreshed from bam_is_big_endian().  Returns whatever calloc returns.
+ */
+bam_header_t *bam_header_init()
+{
+	bam_header_t *hdr;
+	bam_is_be = bam_is_big_endian();
+	hdr = (bam_header_t*)calloc(1, sizeof(bam_header_t));
+	return hdr;
+}
+
+/*
+ * Release a header and everything it owns: the target name/length arrays,
+ * the text, the parsed dictionary, the RG-to-library table and the name hash.
+ * Passing a null pointer is a no-op.
+ */
+void bam_header_destroy(bam_header_t *header)
+{
+	int32_t i;
+	extern void bam_destroy_header_hash(bam_header_t *header);
+	if (header == 0) return;
+	if (header->target_name) {
+		for (i = 0; i < header->n_targets; ++i)
+			free(header->target_name[i]);
+		free(header->target_name);
+	}
+	// Freed independently of target_name so it cannot leak when only one of
+	// the two arrays was allocated; free() accepts a null pointer.
+	free(header->target_len);
+	free(header->text);
+	if (header->dict) sam_header_free(header->dict);
+	if (header->rg2lib) sam_tbl_destroy(header->rg2lib);
+	bam_destroy_header_hash(header);
+	free(header);
+}
+
+/*
+ * Read the binary BAM header from fp.
+ *
+ * Returns a newly allocated header (release it with bam_header_destroy), or
+ * 0 when the magic is wrong, the stream ends before the header is complete,
+ * a length field is invalid, or memory cannot be allocated.  A diagnostic is
+ * printed to stderr in each failure case.  A missing EOF marker is only
+ * reported; it does not by itself make the read fail.
+ */
+bam_header_t *bam_header_read(bamFile fp)
+{
+	bam_header_t *header;
+	char buf[4];
+	int magic_len;
+	int32_t i, name_len;
+	// check EOF
+	i = bgzf_check_EOF(fp);
+	if (i < 0) {
+		// If the file is a pipe, checking the EOF marker will *always* fail
+		// with ESPIPE.  Suppress the error message in this case.
+		if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF");
+	}
+	else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent. The input is probably truncated.\n");
+	// read "BAM1"
+	magic_len = bam_read(fp, buf, 4);
+	if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) {
+		fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
+		return 0;
+	}
+	header = bam_header_init();
+	if (header == 0) {
+		fprintf(stderr, "[bam_header_read] out of memory.\n");
+		return 0;
+	}
+	// read plain text and the number of reference sequences
+	if (bam_read(fp, &header->l_text, 4) != 4) goto fail;
+	if (bam_is_be) bam_swap_endian_4p(&header->l_text);
+	header->text = (char*)calloc((size_t)header->l_text + 1, 1);
+	if (header->text == 0) goto fail;
+	// compare as long long so a negative error return never equals the length
+	if ((long long)bam_read(fp, header->text, header->l_text) != (long long)header->l_text) goto fail;
+	if (bam_read(fp, &header->n_targets, 4) != 4) goto fail;
+	if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
+	if (header->n_targets < 0) goto fail;
+	// read reference sequence names and lengths
+	header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
+	header->target_len = (uint32_t*)calloc(header->n_targets, 4);
+	// calloc(0, ...) may legitimately return a null pointer
+	if (header->n_targets > 0 && (header->target_name == 0 || header->target_len == 0)) goto fail;
+	for (i = 0; i != header->n_targets; ++i) {
+		if (bam_read(fp, &name_len, 4) != 4) goto fail;
+		if (bam_is_be) bam_swap_endian_4p(&name_len);
+		// the stored length includes the terminating NUL (see bam_header_write)
+		if (name_len <= 0) goto fail;
+		header->target_name[i] = (char*)calloc(name_len, 1);
+		if (header->target_name[i] == 0) goto fail;
+		if (bam_read(fp, header->target_name[i], name_len) != name_len) goto fail;
+		if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail;
+		if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
+	}
+	return header;
+
+fail:
+	// Entries not reached are still zero from calloc, so the partially filled
+	// header can be handed to bam_header_destroy as-is.
+	fprintf(stderr, "[bam_header_read] truncated or malformed BAM header.\n");
+	bam_header_destroy(header);
+	return 0;
+}
+
+/*
+ * Write the binary BAM header to fp: the 4-byte magic, the header text with
+ * its length, the number of targets, then each target's name (with its
+ * length, terminating NUL included) and sequence length.  When bam_is_be is
+ * set, every 32-bit integer is byte-swapped into a temporary before being
+ * written.
+ *
+ * Always returns 0; the results of bam_write and bgzf_flush are not checked.
+ */
+int bam_header_write(bamFile fp, const bam_header_t *header)
+{
+	char buf[4];
+	int32_t i, name_len, x;
+	// write "BAM1"
+	strncpy(buf, "BAM\001", 4);
+	bam_write(fp, buf, 4);
+	// write plain text and the number of reference sequences
+	if (bam_is_be) {
+		x = bam_swap_endian_4(header->l_text);
+		bam_write(fp, &x, 4);
+		if (header->l_text) bam_write(fp, header->text, header->l_text);
+		x = bam_swap_endian_4(header->n_targets);
+		bam_write(fp, &x, 4);
+	} else {
+		bam_write(fp, &header->l_text, 4);
+		if (header->l_text) bam_write(fp, header->text, header->l_text);
+		bam_write(fp, &header->n_targets, 4);
+	}
+	// write sequence names and lengths
+	for (i = 0; i != header->n_targets; ++i) {
+		char *p = header->target_name[i];
+		name_len = strlen(p) + 1; // +1: the NUL terminator is stored too
+		if (bam_is_be) {
+			x = bam_swap_endian_4(name_len);
+			bam_write(fp, &x, 4);
+		} else bam_write(fp, &name_len, 4);
+		bam_write(fp, p, name_len);
+		if (bam_is_be) {
+			x = bam_swap_endian_4(header->target_len[i]);
+			bam_write(fp, &x, 4);
+		} else bam_write(fp, &header->target_len[i], 4);
+	}
+	bgzf_flush(fp);
+	return 0;
+}
+
+/*
+ * Byte-swap, in place, the multi-byte fields of a record's variable-length
+ * data: every CIGAR word, then every multi-byte auxiliary value.  The
+ * auxiliary area is taken to start after qname, CIGAR, packed sequence and
+ * qualities.  The same routine is called after reading and around writing
+ * (see bam_read1 and bam_write1_core), so it is meant to be its own inverse.
+ *
+ * NOTE(review): the 'B' (array) branch looks defective and should be checked
+ * on a big-endian host before being relied on:
+ *   - `s` is never advanced past the array, so the next loop iteration
+ *     re-parses array bytes as a tag;
+ *   - the element loops run `i` up to `n` in steps of the element size,
+ *     whereas bam_format1_core and __skip_tag treat the same field as an
+ *     element count, so only part of the array would be swapped;
+ *   - `n` is copied before the count itself is swapped, which is only the
+ *     host-order value when the data is being prepared for writing.
+ * Fixing this needs a direction flag, i.e. a signature change.
+ */
+static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data)
+{
+	uint8_t *s;
+	uint32_t i, *cigar = (uint32_t*)(data + c->l_qname);
+	// s: start of the auxiliary fields
+	s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
+	for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]);
+	while (s < data + data_len) {
+		uint8_t type;
+		s += 2; // skip key
+		// upper-cased so signed and unsigned variants share a branch
+		type = toupper(*s); ++s; // skip type
+		if (type == 'C' || type == 'A') ++s;
+		else if (type == 'S') { bam_swap_endian_2p(s); s += 2; }
+		else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; }
+		else if (type == 'D') { bam_swap_endian_8p(s); s += 8; }
+		else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; }
+		else if (type == 'B') {
+			// layout: 1 byte element type, 4 byte count, then the elements
+			int32_t n, Bsize = bam_aux_type2size(*s);
+			memcpy(&n, s + 1, 4);
+			if (1 == Bsize) {
+			} else if (2 == Bsize) {
+				for (i = 0; i < n; i += 2)
+					bam_swap_endian_2p(s + 5 + i);
+			} else if (4 == Bsize) {
+				for (i = 0; i < n; i += 4)
+					bam_swap_endian_4p(s + 5 + i);
+			}
+			bam_swap_endian_4p(s+1); 
+		}
+	}
+}
+
+/*
+ * Read one alignment record from fp into b, growing b->data as needed.
+ *
+ * Returns the number of bytes consumed (4 + block length) on success, or a
+ * negative code:
+ *   -1  clean end of file (nothing read)
+ *   -2  the 4-byte block length was cut short
+ *   -3  the fixed-size core was cut short
+ *   -4  the block length or field lengths are inconsistent, memory could not
+ *       be allocated, or the variable-length data was cut short
+ * After a negative return the contents of b must not be used as a record.
+ */
+int bam_read1(bamFile fp, bam1_t *b)
+{
+	bam1_core_t *c = &b->core;
+	int32_t block_len, ret, i;
+	int data_len;
+	uint32_t x[8];
+
+	assert(BAM_CORE_SIZE == 32);
+	if ((ret = bam_read(fp, &block_len, 4)) != 4) {
+		if (ret == 0) return -1; // normal end-of-file
+		else return -2; // truncated
+	}
+	if (bam_read(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE) return -3;
+	if (bam_is_be) {
+		bam_swap_endian_4p(&block_len);
+		for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
+	}
+	// a block shorter than its own fixed core would give a negative data length
+	if (block_len < BAM_CORE_SIZE) return -4;
+	// unpack the eight core words
+	c->tid = x[0]; c->pos = x[1];
+	c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
+	c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
+	c->l_qseq = x[4];
+	c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
+	data_len = block_len - BAM_CORE_SIZE;
+	if (b->m_data < data_len) {
+		// grow through temporaries so b stays consistent if realloc fails
+		int new_m = data_len;
+		uint8_t *new_data;
+		kroundup32(new_m);
+		new_data = (uint8_t*)realloc(b->data, new_m);
+		if (new_data == 0) return -4;
+		b->data = new_data;
+		b->m_data = new_m;
+	}
+	b->data_len = data_len;
+	if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
+	b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
+	// the declared field lengths add up to more than the record holds
+	if (b->l_aux < 0) return -4;
+	if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
+	if (bam_no_B) bam_remove_B(b);
+	return 4 + block_len;
+}
+
+/*
+ * Write one record to fp: the 4-byte block length, the 32-byte packed core
+ * and data_len bytes of variable-length data.  The core fields are packed
+ * into eight 32-bit words, mirroring the unpacking in bam_read1.
+ *
+ * When bam_is_be is set, the words and `data` are byte-swapped before
+ * writing and `data` is swapped back afterwards, so the caller's buffer is
+ * modified temporarily.
+ *
+ * Returns 4 + block length; the results of the bam_write calls are not
+ * checked.
+ */
+inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data)
+{
+	uint32_t x[8], block_len = data_len + BAM_CORE_SIZE, y;
+	int i;
+	assert(BAM_CORE_SIZE == 32);
+	x[0] = c->tid;
+	x[1] = c->pos;
+	x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname;
+	x[3] = (uint32_t)c->flag<<16 | c->n_cigar;
+	x[4] = c->l_qseq;
+	x[5] = c->mtid;
+	x[6] = c->mpos;
+	x[7] = c->isize;
+	bgzf_flush_try(fp, 4 + block_len);
+	if (bam_is_be) {
+		for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
+		// swap a copy of the length so block_len stays usable for the return value
+		y = block_len;
+		bam_write(fp, bam_swap_endian_4p(&y), 4);
+		swap_endian_data(c, data_len, data);
+	} else bam_write(fp, &block_len, 4);
+	bam_write(fp, x, BAM_CORE_SIZE);
+	bam_write(fp, data, data_len);
+	// undo the in-place swap done above
+	if (bam_is_be) swap_endian_data(c, data_len, data);
+	return 4 + block_len;
+}
+
+/* Write record b to fp; a thin wrapper that forwards to bam_write1_core. */
+int bam_write1(bamFile fp, const bam1_t *b)
+{
+	const bam1_core_t *core = &b->core;
+	return bam_write1_core(fp, core, b->data_len, b->data);
+}
+
+/*
+ * Format record b as one tab-separated text line and return it in a newly
+ * allocated string, which the caller must free().
+ *
+ * header  if non-null, target ids are printed as names from
+ *         header->target_name; otherwise they are printed as numbers
+ * of      flag format: BAM_OFDEC (decimal), BAM_OFHEX (hex); any other value
+ *         prints one character per set bit from bam_flag2char_table
+ *
+ * The auxiliary fields are appended as TAG:TYPE:VALUE.
+ */
+char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of)
+{
+	uint8_t *s = bam1_seq(b), *t = bam1_qual(b);
+	int i;
+	const bam1_core_t *c = &b->core;
+	kstring_t str;
+	str.l = str.m = 0; str.s = 0;
+
+	// l_qname counts the terminating NUL, hence the -1
+	kputsn(bam1_qname(b), c->l_qname-1, &str); kputc('\t', &str);
+	if (of == BAM_OFDEC) { kputw(c->flag, &str); kputc('\t', &str); }
+	else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag);
+	else { // BAM_OFSTR
+		for (i = 0; i < 16; ++i)
+			if ((c->flag & 1<<i) && bam_flag2char_table[i])
+				kputc(bam_flag2char_table[i], &str);
+		kputc('\t', &str);
+	}
+	if (c->tid < 0) kputsn("*\t", 2, &str);
+	else {
+		if (header) kputs(header->target_name[c->tid] , &str);
+		else kputw(c->tid, &str);
+		kputc('\t', &str);
+	}
+	// positions are stored 0-based and printed 1-based
+	kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str);
+	if (c->n_cigar == 0) kputc('*', &str);
+	else {
+		uint32_t *cigar = bam1_cigar(b);
+		for (i = 0; i < c->n_cigar; ++i) {
+			kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str);
+			kputc(bam_cigar_opchr(cigar[i]), &str);
+		}
+	}
+	kputc('\t', &str);
+	if (c->mtid < 0) kputsn("*\t", 2, &str);
+	else if (c->mtid == c->tid) kputsn("=\t", 2, &str);
+	else {
+		if (header) kputs(header->target_name[c->mtid], &str);
+		else kputw(c->mtid, &str);
+		kputc('\t', &str);
+	}
+	kputw(c->mpos + 1, &str); kputc('\t', &str); kputw(c->isize, &str); kputc('\t', &str);
+	if (c->l_qseq) {
+		for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str);
+		kputc('\t', &str);
+		// a first quality byte of 0xff marks "no qualities"
+		if (t[0] == 0xff) kputc('*', &str);
+		else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str);
+	} else kputsn("*\t*", 3, &str);
+	s = bam1_aux(b);
+	while (s < b->data + b->data_len) {
+		uint8_t type, key[2];
+		key[0] = s[0]; key[1] = s[1];
+		s += 2; type = *s; ++s;
+		kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str);
+		if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; }
+		else if (type == 'C') { kputsn("i:", 2, &str); kputw(*s, &str); ++s; }
+		else if (type == 'c') { kputsn("i:", 2, &str); kputw(*(int8_t*)s, &str); ++s; }
+		else if (type == 'S') { kputsn("i:", 2, &str); kputw(*(uint16_t*)s, &str); s += 2; }
+		else if (type == 's') { kputsn("i:", 2, &str); kputw(*(int16_t*)s, &str); s += 2; }
+		else if (type == 'I') { kputsn("i:", 2, &str); kputuw(*(uint32_t*)s, &str); s += 4; }
+		else if (type == 'i') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; }
+		else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; }
+		else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; }
+		else if (type == 'Z' || type == 'H') {
+			kputc(type, &str); kputc(':', &str);
+			// stop at the end of the record so an unterminated string cannot
+			// make the scan run past the buffer
+			while (s < b->data + b->data_len && *s) kputc(*s++, &str);
+			++s; // skip the NUL
+		}
+		else if (type == 'B') {
+			uint8_t sub_type = *(s++);
+			int32_t n;
+			memcpy(&n, s, 4);
+			s += 4; // no point to the start of the array
+			kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); // write the typing
+			for (i = 0; i < n; ++i) {
+				kputc(',', &str);
+				if ('c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; }
+				else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; }
+				else if ('s' == sub_type) { kputw(*(int16_t*)s, &str); s += 2; }
+				else if ('S' == sub_type) { kputw(*(uint16_t*)s, &str); s += 2; }
+				else if ('i' == sub_type) { kputw(*(int32_t*)s, &str); s += 4; }
+				else if ('I' == sub_type) { kputuw(*(uint32_t*)s, &str); s += 4; }
+				else if ('f' == sub_type) { ksprintf(&str, "%g", *(float*)s); s += 4; }
+			}
+		}
+	}
+	return str.s;
+}
+
+/* Format record b as text with the flag printed in decimal (BAM_OFDEC). */
+char *bam_format1(const bam_header_t *header, const bam1_t *b)
+{
+	char *line = bam_format1_core(header, b, BAM_OFDEC);
+	return line;
+}
+
+/* Print record b to stdout as one text line, then release the buffer. */
+void bam_view1(const bam_header_t *header, const bam1_t *b)
+{
+	char *line = bam_format1(header, b);
+	puts(line);
+	free(line);
+}
+
+/*
+ * Basic sanity check of a record.  Returns 1 when the target ids are in
+ * range (checked against header->n_targets only when header is non-null) and
+ * the query name is NUL-terminated exactly at its last byte; returns 0
+ * otherwise.
+ */
+int bam_validate1(const bam_header_t *header, const bam1_t *b)
+{
+	const bam1_core_t *core = &b->core;
+	char *nul;
+
+	if (core->tid < -1 || core->mtid < -1) return 0;
+	if (header) {
+		if (core->tid >= header->n_targets) return 0;
+		if (core->mtid >= header->n_targets) return 0;
+	}
+
+	if (b->data_len < core->l_qname) return 0;
+	// the first NUL in the name must be its final byte
+	nul = memchr(bam1_qname(b), '\0', core->l_qname);
+	if (nul != &bam1_qname(b)[core->l_qname-1]) return 0;
+
+	// FIXME: Other fields could also be checked, especially the auxiliary data
+
+	return 1;
+}
+
+// FIXME: we should also check the LB tag associated with each alignment
+/*
+ * Look up the library for record b through its RG tag.  The parsed header
+ * dictionary and the RG ID -> LB table are built on first use and cached in
+ * h.  Returns 0 when the record has no RG tag.
+ */
+const char *bam_get_library(bam_header_t *h, const bam1_t *b)
+{
+	const uint8_t *rg;
+	if (!h->dict) h->dict = sam_header_parse2(h->text);
+	if (!h->rg2lib) h->rg2lib = sam_header2tbl(h->dict, "RG", "ID", "LB");
+	rg = bam_aux_get(b, "RG");
+	if (rg == 0) return 0;
+	// rg points at the type byte; the value starts one byte later
+	return sam_tbl_get(h->rg2lib, (const char*)(rg + 1));
+}
+
+/************
+ * Remove B *
+ ************/
+
+/*
+ * Rewrite record b in place so that its CIGAR no longer contains BAM_CBACK
+ * ('B') operations, merging the overlapping query segments into a consensus
+ * and shrinking SEQ/QUAL accordingly.
+ *
+ * Returns 0 when the record is unmapped, has no 'B', or was rewritten.
+ * Returns -1 and sets BAM_FUNMAP on the record when the CIGAR starts with
+ * 'B' or a 'B' moves back further than the query consumed so far.
+ */
+int bam_remove_B(bam1_t *b)
+{
+	int i, j, end_j, k, l, no_qual;
+	uint32_t *cigar, *new_cigar;
+	uint8_t *seq, *qual, *p;
+	// test if removal is necessary
+	if (b->core.flag & BAM_FUNMAP) return 0; // unmapped; do nothing
+	cigar = bam1_cigar(b);
+	for (k = 0; k < b->core.n_cigar; ++k)
+		if (bam_cigar_op(cigar[k]) == BAM_CBACK) break;
+	if (k == b->core.n_cigar) return 0; // no 'B'
+	if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed
+	// allocate memory for the new CIGAR
+	if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory
+		b->m_data = b->data_len + b->core.n_cigar * 4;
+		kroundup32(b->m_data);
+		b->data = (uint8_t*)realloc(b->data, b->m_data);
+		cigar = bam1_cigar(b); // after realloc, cigar may be changed
+	}
+	// the new CIGAR is built in the unused tail of the buffer
+	new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data
+	// the core loop
+	// i: read offset in the original query; j: write offset in the merged query
+	// end_j: value of j just before the most recent 'B'; l: ops in new_cigar
+	seq = bam1_seq(b); qual = bam1_qual(b);
+	no_qual = (qual[0] == 0xff); // test whether base quality is available
+	i = j = 0; end_j = -1;
+	for (k = l = 0; k < b->core.n_cigar; ++k) {
+		int op  = bam_cigar_op(cigar[k]);
+		int len = bam_cigar_oplen(cigar[k]);
+		if (op == BAM_CBACK) { // the backward operation
+			int t, u;
+			if (k == b->core.n_cigar - 1) break; // ignore 'B' at the end of CIGAR
+			if (len > j) goto rmB_err; // an excessively long backward
+			for (t = l - 1, u = 0; t >= 0; --t) { // look back
+				int op1  = bam_cigar_op(new_cigar[t]);
+				int len1 = bam_cigar_oplen(new_cigar[t]);
+				if (bam_cigar_type(op1)&1) { // consume the query
+					if (u + len1 >= len) { // stop
+						new_cigar[t] -= (len - u) << BAM_CIGAR_SHIFT;
+						break;
+					} else u += len1;
+				}
+			}
+			if (bam_cigar_oplen(new_cigar[t]) == 0) --t; // squeeze out the zero-length operation
+			l = t + 1;
+			end_j = j; j -= len;
+		} else { // other CIGAR operations
+			new_cigar[l++] = cigar[k];
+			if (bam_cigar_type(op)&1) { // consume the query
+				if (i != j) { // no need to copy if i == j
+					int u, c, c0;
+					for (u = 0; u < len; ++u) { // construct the consensus
+						c = bam1_seqi(seq, i+u);
+						if (j + u < end_j) { // in an overlap
+							c0 = bam1_seqi(seq, j+u);
+							if (c != c0) { // a mismatch; choose the better base
+								if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better
+									bam1_seq_seti(seq, j+u, c);
+									qual[j+u] = qual[i+u] - qual[j+u];
+								} else qual[j+u] -= qual[i+u]; // the 1st is better; reduce base quality
+							} else qual[j+u] = qual[j+u] > qual[i+u]? qual[j+u] : qual[i+u];
+						} else { // not in an overlap; copy over
+							bam1_seq_seti(seq, j+u, c);
+							qual[j+u] = qual[i+u];
+						}
+					}
+				}
+				i += len, j += len;
+			}
+		}
+	}
+	if (no_qual) qual[0] = 0xff; // in very rare cases, this may be modified
+	// merge adjacent operations if possible
+	for (k = 1; k < l; ++k)
+		if (bam_cigar_op(new_cigar[k]) == bam_cigar_op(new_cigar[k-1]))
+			new_cigar[k] += new_cigar[k-1] >> BAM_CIGAR_SHIFT << BAM_CIGAR_SHIFT, new_cigar[k-1] &= 0xf;
+	// kill zero length operations
+	for (k = i = 0; k < l; ++k)
+		if (new_cigar[k] >> BAM_CIGAR_SHIFT)
+			new_cigar[i++] = new_cigar[k];
+	l = i;
+	// update b
+	// the pieces are moved toward the front in order; memmove because the
+	// source and destination ranges can overlap
+	memcpy(cigar, new_cigar, l * 4); // set CIGAR
+	p = b->data + b->core.l_qname + l * 4;
+	memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ
+	memmove(p, qual, j); p += j; // set QUAL
+	memmove(p, bam1_aux(b), b->l_aux); p += b->l_aux; // set optional fields
+	b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length
+	b->data_len = p - b->data; // update record length
+	return 0;
+
+rmB_err:
+	b->core.flag |= BAM_FUNMAP;
+	return -1;
+}
diff --git a/bam_aux.c b/bam_aux.c
new file mode 100644
index 0000000..28b22e3
--- /dev/null
+++ b/bam_aux.c
@@ -0,0 +1,213 @@
+#include <ctype.h>
+#include "bam.h"
+#include "khash.h"
+typedef char *str_p;
+KHASH_MAP_INIT_STR(s, int)
+KHASH_MAP_INIT_STR(r2l, str_p)
+
+// Append one auxiliary field to the end of the record's data buffer:
+// the 2-byte tag, the 1-byte type code, then `len' payload bytes copied
+// from `data'. The buffer is enlarged when it is too small.
+void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
+{
+	const int tail = b->data_len; // offset at which the new field starts
+	const int added = 3 + len; // tag (2) + type (1) + payload
+	uint8_t *dst;
+
+	b->data_len += added;
+	b->l_aux += added;
+	if (b->m_data < b->data_len) { // not enough room: let kroundup32() pick the new capacity
+		b->m_data = b->data_len;
+		kroundup32(b->m_data);
+		b->data = (uint8_t*)realloc(b->data, b->m_data);
+	}
+	dst = b->data + tail; // taken after realloc(), which may have moved the buffer
+	dst[0] = tag[0];
+	dst[1] = tag[1];
+	dst[2] = type;
+	memcpy(dst + 3, data, len);
+}
+
+// Thin wrapper that forwards to bam_aux_get() with the same arguments and
+// returns its result unchanged.
+uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2])
+{
+	return bam_aux_get(b, tag);
+}
+
+// Advance `s' past the value of one auxiliary field. On entry `s' points at
+// the field's type byte (i.e. just after the 2-byte tag); on exit it points
+// just past the value. `s' is evaluated several times and modified in place,
+// so it must be a plain pointer lvalue without side effects.
+//   Z/H  : NUL-terminated string; skip through the terminator
+//   B    : 1-byte element type + 4-byte element count + the elements
+//   else : scalar whose size is given by bam_aux_type2size()
+// NOTE(review): the type byte is upper-cased before dispatch, so lower-case
+// 'z', 'h' and 'b' take the same branches as the upper-case codes -- confirm
+// this is intended.
+#define __skip_tag(s) do { \
+		int type = toupper(*(s)); \
+		++(s); \
+		if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \
+		else if (type == 'B') (s) += 5 + bam_aux_type2size(*(s)) * (*(int32_t*)((s)+1)); \
+		else (s) += bam_aux_type2size(type); \
+	} while(0)
+
+// Look up the auxiliary field whose 2-character tag equals `tag'.
+// Returns a pointer to the field's type byte (the byte right after the tag),
+// or 0 when no field with that tag is found before the end of the record.
+uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
+{
+	const int wanted = tag[0]<<8 | tag[1]; // both tag bytes folded into one int
+	const uint8_t *const limit = b->data + b->data_len;
+	uint8_t *cur;
+	for (cur = bam1_aux(b); cur < limit; ) {
+		const int key = (int)cur[0]<<8 | cur[1];
+		cur += 2; // step over the tag
+		if (key == wanted) return cur;
+		__skip_tag(cur); // not ours: jump over the type byte and the value
+	}
+	return 0;
+}
+// Remove one auxiliary field from the record.
+// s MUST BE returned by bam_aux_get(), i.e. it points at the type byte of the
+// field to delete. Always returns 0.
+int bam_aux_del(bam1_t *b, uint8_t *s)
+{
+	uint8_t *const aux_start = bam1_aux(b);
+	uint8_t *const field = s - 2; // start of the field, including its tag
+	__skip_tag(s); // s now points just past the field
+	// close the gap by sliding the remaining aux bytes down
+	memmove(field, s, b->l_aux - (s - aux_start));
+	b->data_len -= s - field;
+	b->l_aux -= s - field;
+	return 0;
+}
+
+// Keep only the auxiliary field `s' points into (a pointer to its type byte,
+// as returned by bam_aux_get()) and drop every other auxiliary field.
+// With s == 0 all auxiliary fields are dropped. Always returns 0.
+int bam_aux_drop_other(bam1_t *b, uint8_t *s)
+{
+	uint8_t *field, *aux_start;
+	if (s == 0) { // nothing to keep
+		b->data_len -= b->l_aux;
+		b->l_aux = 0;
+		return 0;
+	}
+	aux_start = bam1_aux(b);
+	field = s - 2; // start of the kept field, including its tag
+	__skip_tag(s); // s now points just past the kept field
+	memmove(aux_start, field, s - field); // move it to the front of the aux area
+	b->data_len -= b->l_aux - (s - field);
+	b->l_aux = s - field;
+	return 0;
+}
+
+// Build the table mapping each target name to its index in
+// header->target_name and store it in header->hash. Does nothing when
+// header->hash is already set.
+void bam_init_header_hash(bam_header_t *header)
+{
+	khash_t(s) *tbl;
+	int absent, tid;
+	if (header->hash != 0) return; // already built
+	tbl = kh_init(s);
+	header->hash = tbl;
+	for (tid = 0; tid < header->n_targets; ++tid) {
+		khiter_t slot = kh_put(s, tbl, header->target_name[tid], &absent);
+		kh_value(tbl, slot) = tid;
+	}
+}
+
+// Free the name->index table attached to `header', if there is one.
+// The pointer is cleared afterwards: bam_init_header_hash() only rebuilds the
+// table when header->hash is 0, so leaving the stale pointer in place would
+// make later lookups use freed memory.
+void bam_destroy_header_hash(bam_header_t *header)
+{
+	if (header->hash) {
+		kh_destroy(s, (khash_t(s)*)header->hash);
+		header->hash = 0;
+	}
+}
+
+// Return the index stored for `seq_name' in the header's name table, or -1
+// when the name is not in the table. The table must already exist
+// (see bam_init_header_hash()); header->hash is used as is.
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name)
+{
+	khash_t(s) *tbl = (khash_t(s)*)header->hash;
+	khint_t slot = kh_get(s, tbl, seq_name);
+	if (slot == kh_end(tbl)) return -1;
+	return kh_value(tbl, slot);
+}
+
+// Parse a region string of the form "name", "name:beg" or "name:beg-end".
+// White space in `str' is ignored and ',' is accepted inside the numbers.
+// The name is looked up in the header's name table (built on demand).
+//
+// Outputs: *ref_id receives the table index of the name; *beg the start
+// coordinate, decremented by one when positive; *end the end coordinate, or
+// 1<<29 when no end is given. All three are initialised to -1 on entry.
+//
+// Returns 0 on success and -1 when the name cannot be resolved, memory cannot
+// be allocated, or beg > end.
+int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end)
+{
+	char *s;
+	int i, l, k, name_end;
+	khiter_t iter;
+	khash_t(s) *h;
+
+	bam_init_header_hash(header);
+	h = (khash_t(s)*)header->hash;
+
+	*ref_id = *beg = *end = -1;
+	name_end = l = strlen(str);
+	s = (char*)malloc(l+1);
+	if (s == 0) return -1;
+	// remove space (the cast keeps negative char values out of isspace())
+	for (i = k = 0; i < l; ++i)
+		if (!isspace((unsigned char)str[i])) s[k++] = str[i];
+	s[k] = 0; l = k;
+	// determine the sequence name
+	for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
+	if (i >= 0) name_end = i;
+	if (name_end < l) { // check if this is really the end
+		int n_hyphen = 0;
+		for (i = name_end + 1; i < l; ++i) {
+			if (s[i] == '-') ++n_hyphen;
+			else if (!isdigit((unsigned char)s[i]) && s[i] != ',') break;
+		}
+		if (i < l || n_hyphen > 1) name_end = l; // malformed region string; then take str as the name
+		s[name_end] = 0;
+		iter = kh_get(s, h, s);
+		if (iter == kh_end(h)) { // cannot find the sequence name
+			iter = kh_get(s, h, str); // try str as the name
+			if (iter == kh_end(h)) {
+				if (bam_verbose >= 2) fprintf(stderr, "[%s] fail to determine the sequence name.\n", __func__);
+				free(s); return -1;
+			} else s[name_end] = ':', name_end = l;
+		}
+	} else iter = kh_get(s, h, str);
+	if (iter == kh_end(h)) { // unknown name without a colon: kh_val() below would read a non-existent slot
+		if (bam_verbose >= 2) fprintf(stderr, "[%s] fail to determine the sequence name.\n", __func__);
+		free(s); return -1;
+	}
+	*ref_id = kh_val(h, iter);
+	// parse the interval
+	if (name_end < l) {
+		for (i = k = name_end + 1; i < l; ++i) // squeeze out the ',' separators
+			if (s[i] != ',') s[k++] = s[i];
+		s[k] = 0;
+		*beg = atoi(s + name_end + 1);
+		for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
+		*end = i < k? atoi(s + i + 1) : 1<<29;
+		if (*beg > 0) --*beg;
+	} else *beg = 0, *end = 1<<29;
+	free(s);
+	return *beg <= *end? 0 : -1;
+}
+
+// Decode an integer auxiliary value. `s' points at the field's type byte
+// (as returned by bam_aux_get()); the value follows it.
+// Returns the value widened to int32_t for types c, C, s, S, i and I, and 0
+// when `s' is 0 or the type is anything else.
+int32_t bam_aux2i(const uint8_t *s)
+{
+	const uint8_t *val;
+	if (s == 0) return 0;
+	val = s + 1; // the value starts right after the type byte
+	switch (*s) {
+	case 'c': return (int32_t)*(int8_t*)val;
+	case 'C': return (int32_t)*(uint8_t*)val;
+	case 's': return (int32_t)*(int16_t*)val;
+	case 'S': return (int32_t)*(uint16_t*)val;
+	case 'i':
+	case 'I': return *(int32_t*)val;
+	default: return 0;
+	}
+}
+
+// Decode a float auxiliary value. `s' points at the field's type byte
+// (as returned by bam_aux_get()); the value follows it.
+// Returns the value for type 'f', and 0.0 when `s' is 0 or the type differs.
+float bam_aux2f(const uint8_t *s)
+{
+	int type;
+	if (s == 0) return 0.0; // must be tested before the type byte is read
+	type = *s++;
+	if (type == 'f') return *(float*)s;
+	else return 0.0;
+}
+
+// Decode a double auxiliary value. `s' points at the field's type byte
+// (as returned by bam_aux_get()); the value follows it.
+// Returns the value for type 'd', and 0.0 when `s' is 0 or the type differs.
+double bam_aux2d(const uint8_t *s)
+{
+	int type;
+	if (s == 0) return 0.0; // must be tested before the type byte is read
+	type = *s++;
+	if (type == 'd') return *(double*)s;
+	else return 0.0;
+}
+
+// Decode a single-character auxiliary value. `s' points at the field's type
+// byte (as returned by bam_aux_get()); the character follows it.
+// Returns the character for type 'A', and 0 when `s' is 0 or the type differs.
+char bam_aux2A(const uint8_t *s)
+{
+	int type;
+	if (s == 0) return 0; // must be tested before the type byte is read
+	type = *s++;
+	if (type == 'A') return *(char*)s;
+	else return 0;
+}
+
+// Decode a string auxiliary value. `s' points at the field's type byte
+// (as returned by bam_aux_get()); the string follows it.
+// Returns a pointer into the record (not a copy) for types 'Z' and 'H', and
+// 0 when `s' is 0 or the type differs.
+char *bam_aux2Z(const uint8_t *s)
+{
+	int type;
+	if (s == 0) return 0; // must be tested before the type byte is read
+	type = *s++;
+	if (type == 'Z' || type == 'H') return (char*)s;
+	else return 0;
+}
+
+#ifdef _WIN32
+// Stand-in for POSIX drand48() on Windows, built on rand().
+// drand48() returns values in [0.0, 1.0); dividing by RAND_MAX + 1 (in double
+// arithmetic, so the addition cannot overflow) keeps 1.0 out of the range.
+double drand48()
+{
+	return (double)rand() / ((double)RAND_MAX + 1.0);
+}
+#endif
diff --git a/bam_cat.c b/bam_cat.c
new file mode 100644
index 0000000..a7502b9
--- /dev/null
+++ b/bam_cat.c
@@ -0,0 +1,185 @@
+/*
+
+bam_cat -- efficiently concatenates bam files
+
+bam_cat can be used to concatenate BAM files. Under special
+circumstances, it can be used as an alternative to 'samtools merge' to
+concatenate multiple sorted files into a single sorted file. For this
+to work each file must be sorted, and the sorted files must be given
+as command line arguments in order such that the final read in file i
+is less than or equal to the first read in file i+1.
+
+This code is derived from the bam_reheader function in samtools 0.1.8
+and modified to perform concatenation by Chris Saunders on behalf of
+Illumina.
+
+
+########## License:
+
+The MIT License
+
+Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd.
+Modified SAMtools work copyright (c) 2010 Illumina, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+*/
+
+
+/*
+makefile:
+"""
+CC=gcc
+CFLAGS+=-g -Wall -O2 -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -I$(SAMTOOLS_DIR)
+LDFLAGS+=-L$(SAMTOOLS_DIR)
+LDLIBS+=-lbam -lz
+
+all:bam_cat
+"""
+*/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "knetfile.h"
+#include "bgzf.h"
+#include "bam.h"
+
+// size in bytes of the buffer bam_cat() uses to copy data between files
+#define BUF_SIZE 0x10000
+
+// values bam_cat() expects in the first two bytes of the final block of each input
+#define GZIPID1 31
+#define GZIPID2 139
+
+// number of trailing bytes of each input that bam_cat() holds back and
+// checks for an empty block before deciding whether to write them
+#define BGZF_EMPTY_BLOCK_SIZE 28
+
+
+// Concatenate the `nfn' BAM files named in `fn' into `outbam' ("-" selects
+// stdout for the output and stdin for an input).
+//
+// When `h' is non-null it is written as the output header; otherwise the
+// header of the first input is used. For each input, the uncompressed data
+// left in the block that contained its header is re-written through BGZF, and
+// the rest of the file is copied as raw bytes. The last BGZF_EMPTY_BLOCK_SIZE
+// bytes of each input are held back in `ebuf'; they are dropped when they look
+// like an empty block and written out (with a warning) otherwise.
+//
+// Returns 0 on success, 1 when the output cannot be opened and -1 on any
+// other error. Everything opened or allocated here is released on every path.
+int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam)
+{
+    BGZF *fp, *in = 0;
+    FILE* fp_file;
+    bam_header_t *old = 0;
+    uint8_t *buf;
+    uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];
+    const int es=BGZF_EMPTY_BLOCK_SIZE;
+    int i, ret = 0;
+
+    fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w");
+    if (fp == 0) {
+        fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam);
+        return 1;
+    }
+    if (h) bam_header_write(fp, h);
+
+    buf = (uint8_t*) malloc(BUF_SIZE);
+    if (buf == 0) {
+        fprintf(stderr, "[%s] ERROR: fail to allocate the copy buffer.\n", __func__);
+        bgzf_close(fp);
+        return -1;
+    }
+    for(i = 0; i < nfn; ++i){
+        int len,j;
+
+        in = strcmp(fn[i], "-")? bam_open(fn[i], "r") : bam_dopen(fileno(stdin), "r");
+        if (in == 0) {
+            fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
+            ret = -1;
+            goto cleanup;
+        }
+        if (in->is_write) {
+            ret = -1;
+            goto cleanup;
+        }
+
+        old = bam_header_read(in);
+        if (h == 0 && i == 0) bam_header_write(fp, old);
+
+        // re-compress whatever follows the header inside the current block
+        if (in->block_offset < in->block_length) {
+            bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
+            bgzf_flush(fp);
+        }
+
+        j=0; // becomes 1 once ebuf holds held-back bytes
+#ifdef _USE_KNETFILE
+        fp_file = fp->fp;
+        while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0) {
+#else
+        fp_file = fp->fp;
+        while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) {
+#endif
+            if(len<es){
+                int diff=es-len;
+                if(j==0) {
+                    fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]);
+                    ret = -1;
+                    goto cleanup;
+                }
+                // emit the oldest `len' held-back bytes, then shift the rest
+                // down; the two ranges overlap when len < diff, hence memmove
+                fwrite(ebuf, 1, len, fp_file);
+                memmove(ebuf,ebuf+len,diff);
+                memcpy(ebuf+diff,buf,len);
+            } else {
+                if(j!=0) fwrite(ebuf, 1, es, fp_file);
+                len-= es;
+                memcpy(ebuf,buf+len,es);
+                fwrite(buf, 1, len, fp_file);
+            }
+            j=1;
+        }
+
+        /* check final gzip block; skipped when nothing was read, because
+           ebuf then holds no data from this file */
+        if (j != 0) {
+            const uint8_t gzip1=ebuf[0];
+            const uint8_t gzip2=ebuf[1];
+            uint32_t isize;
+            memcpy(&isize, ebuf+es-4, sizeof(isize)); // avoids an unaligned load; only compared with 0
+            if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) {
+                fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]);
+                fprintf(stderr, " Possible output corruption.\n");
+                fwrite(ebuf, 1, es, fp_file);
+            }
+        }
+        bam_header_destroy(old); old = 0;
+        bgzf_close(in); in = 0;
+    }
+cleanup:
+    // old/in are non-null here only when the loop was left through an error
+    if (old) bam_header_destroy(old);
+    if (in) bgzf_close(in);
+    free(buf);
+    bgzf_close(fp);
+    return ret;
+}
+
+
+
+// Command-line entry point for "cat".
+//   -h FILE  read the output header from the SAM file FILE
+//   -o FILE  write to FILE instead of stdout
+// At least two input BAM files must follow the options.
+// Returns 1 on a usage error or when the header file cannot be opened,
+// otherwise the return value of bam_cat().
+int main_cat(int argc, char *argv[])
+{
+	bam_header_t *h = 0;
+	char *outfn = 0;
+	int c, ret;
+	while ((c = getopt(argc, argv, "h:o:")) >= 0) {
+		switch (c) {
+			case 'h': {
+				tamFile fph = sam_open(optarg);
+				if (fph == 0) {
+					// report the file that was actually tried, not argv[1]
+					fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, optarg);
+					free(outfn);
+					if (h) bam_header_destroy(h);
+					return 1;
+				}
+				if (h) bam_header_destroy(h); // a repeated -h replaces the earlier header
+				h = sam_header_read(fph);
+				sam_close(fph);
+				break;
+			}
+			case 'o':
+				free(outfn); // a repeated -o replaces the earlier name
+				outfn = strdup(optarg);
+				break;
+		}
+	}
+	if (argc - optind < 2) {
+		fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n");
+		free(outfn);
+		if (h) bam_header_destroy(h);
+		return 1;
+	}
+	ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
+	free(outfn);
+	if (h) bam_header_destroy(h);
+	return ret;
+}
diff --git a/bam_import.c b/bam_import.c
new file mode 100644
index 0000000..da2bf94
--- /dev/null
+++ b/bam_import.c
@@ -0,0 +1,489 @@
+#include <zlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <assert.h>
+#ifdef _WIN32
+#include <fcntl.h>
+#endif
+#include "kstring.h"
+#include "bam.h"
+#include "sam_header.h"
+#include "kseq.h"
+#include "khash.h"
+
+KSTREAM_INIT(gzFile, gzread, 16384)
+KHASH_MAP_INIT_STR(ref, uint64_t)
+
+void bam_init_header_hash(bam_header_t *header);
+void bam_destroy_header_hash(bam_header_t *header);
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
+
+// Maps a character code (0-255) to a 4-bit base code. The codes are the
+// positions of the letters in bam_nt16_rev_table ("=ACMGRSVTWYHKDBN"); upper-
+// and lower-case letters map to the same code, '0'-'3' map to 1, 2, 4 and 8,
+// '=' maps to 0 and every other character maps to 15.
+unsigned char bam_nt16_table[256] = {
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
+	15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+	15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
+	15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+	15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
+};
+
+// Maps a character code (0-255) to a BAM flag bit; sam_read1() ORs these
+// together when the FLAG column is not a plain number. Characters with a
+// non-zero entry: '1' '2' (read 1/2), 'P' (proper pair), 'R' (mate reverse),
+// 'U' (mate unmapped), 'd' (duplicate), 'f' (QC fail), 'p' (paired),
+// 'r' (reverse), 's' (secondary), 'u' (unmapped). All others contribute 0.
+unsigned short bam_char2flag_table[256] = {
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+	0,BAM_FREAD1,BAM_FREAD2,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+	BAM_FPROPER_PAIR,0,BAM_FMREVERSE,0, 0,BAM_FMUNMAP,0,0, 0,0,0,0, 0,0,0,0,
+	0,0,0,0, BAM_FDUP,0,BAM_FQCFAIL,0, 0,0,0,0, 0,0,0,0,
+	BAM_FPAIRED,0,BAM_FREVERSE,BAM_FSECONDARY, 0,BAM_FUNMAP,0,0, 0,0,0,0, 0,0,0,0,
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
+};
+
+// Letter for each 4-bit base code: the character at index i is the one
+// bam_nt16_table maps to code i (code 15 is 'N').
+char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
+
+// State of an open text SAM input, created by sam_open() and released by
+// sam_close().
+struct __tamFile_t {
+	gzFile fp; // zlib handle of the input (a file, or stdin for "-")
+	kstream_t *ks; // buffered token reader layered on fp
+	kstring_t *str; // scratch string holding the token read most recently
+	uint64_t n_lines; // lines consumed so far; used in parse error messages
+	int is_first; // set by sam_header_read(): str already holds the first field of the next record
+};
+
+// Read every line of `fn' ("-" means stdin) into a newly allocated array of
+// newly allocated strings. A trailing '\r' is stripped from each line.
+// The number of lines is stored in *_n and the array is returned (0 when no
+// line was read); the caller owns the array and the strings.
+char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only
+{
+	char **lines = 0;
+	int count = 0, capacity = 0, dret;
+	gzFile fp;
+	kstream_t *ks;
+	kstring_t *str;
+	if (strcmp(fn, "-") == 0) fp = gzdopen(fileno(stdin), "r");
+	else fp = gzopen(fn, "r");
+	str = (kstring_t*)calloc(1, sizeof(kstring_t));
+	ks = ks_init(fp);
+	while (ks_getuntil(ks, '\n', str, &dret) > 0) {
+		char *copy;
+		if (count == capacity) { // array full: start at 16 slots, then double
+			capacity = capacity? capacity << 1 : 16;
+			lines = (char**)realloc(lines, capacity * sizeof(char*));
+		}
+		if (str->s[str->l-1] == '\r')
+			str->s[--str->l] = '\0';
+		copy = (char*)calloc(str->l + 1, 1);
+		strcpy(copy, str->s);
+		lines[count++] = copy;
+	}
+	ks_destroy(ks);
+	gzclose(fp);
+	free(str->s);
+	free(str);
+	*_n = count;
+	return lines;
+}
+
+// Build a header from a name table whose values pack the sequence length in
+// the upper 32 bits and the target index in the lower 32 bits.
+// The key strings are not copied: the header's target_name entries point at
+// the same strings the table uses as keys.
+static bam_header_t *hash2header(const kh_ref_t *hash)
+{
+	bam_header_t *header = bam_header_init();
+	khiter_t it;
+	header->n_targets = kh_size(hash);
+	header->target_name = (char**)calloc(kh_size(hash), sizeof(char*));
+	header->target_len = (uint32_t*)calloc(kh_size(hash), 4);
+	for (it = kh_begin(hash); it != kh_end(hash); ++it) {
+		uint64_t packed;
+		int idx;
+		if (!kh_exist(hash, it)) continue; // empty or deleted slot
+		packed = kh_value(hash, it);
+		idx = (int)packed; // low half: target index
+		header->target_name[idx] = (char*)kh_key(hash, it);
+		header->target_len[idx] = packed>>32; // high half: length
+	}
+	bam_init_header_hash(header);
+	return header;
+}
+// Build a header from a list file ("-" means stdin) in which every line
+// starts with a sequence name followed by its length; the rest of each line
+// is skipped.
+// Returns the new header, or 0 when `fn' is 0, the file cannot be opened, or a
+// sequence name occurs more than once. On the duplicate-name error the names
+// read so far and the temporary table are released before returning.
+bam_header_t *sam_header_read2(const char *fn)
+{
+	bam_header_t *header;
+	int c, dret, ret, error = 0;
+	gzFile fp;
+	kstream_t *ks;
+	kstring_t *str;
+	kh_ref_t *hash;
+	khiter_t k;
+	if (fn == 0) return 0;
+	fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
+	if (fp == 0) return 0;
+	hash = kh_init(ref);
+	ks = ks_init(fp);
+	str = (kstring_t*)calloc(1, sizeof(kstring_t));
+	while (ks_getuntil(ks, 0, str, &dret) > 0) {
+		char *s = strdup(str->s);
+		int len, i;
+		i = kh_size(hash); // index of this sequence = number of names seen so far
+		ks_getuntil(ks, 0, str, &dret);
+		len = atoi(str->s);
+		k = kh_put(ref, hash, s, &ret);
+		if (ret == 0) { // the name is already in the table
+			fprintf(stderr, "[sam_header_read2] duplicated sequence name: %s\n", s);
+			error = 1;
+			free(s); // the table keeps pointing at the first copy
+		} else kh_value(hash, k) = (uint64_t)len<<32 | i;
+		if (dret != '\n')
+			while ((c = ks_getc(ks)) != '\n' && c != -1);
+	}
+	ks_destroy(ks);
+	gzclose(fp);
+	free(str->s); free(str);
+	fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash));
+	if (error) { // no header will take ownership of the names: free them here
+		for (k = kh_begin(hash); k != kh_end(hash); ++k)
+			if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
+		kh_destroy(ref, hash);
+		return 0;
+	}
+	header = hash2header(hash); // the header now owns the name strings
+	kh_destroy(ref, hash);
+	return header;
+}
+// Make sure the record's data buffer can hold at least `size' bytes,
+// enlarging it when necessary, and return the (possibly moved) buffer.
+static inline uint8_t *alloc_data(bam1_t *b, int size)
+{
+	if (b->m_data >= size) return b->data; // already large enough
+	b->m_data = size;
+	kroundup32(b->m_data);
+	b->data = (uint8_t*)realloc(b->data, b->m_data);
+	return b->data;
+}
+// Report a fatal parse error for input line `n_lines' on stderr and
+// terminate the process with abort(); this function does not return.
+static inline void parse_error(int64_t n_lines, const char * __restrict msg)
+{
+	fprintf(stderr, "Parse error at line %lld: %s\n", (long long)n_lines, msg);
+	abort();
+}
+// Append the first str->l + 1 bytes of str->s (the token plus the delimiter
+// the caller stored in place of its terminator) to header->text, growing the
+// text buffer when needed, and keep header->text NUL-terminated.
+// Aborts the process when the buffer cannot be grown or the size check fails.
+static inline void append_text(bam_header_t *header, kstring_t *str)
+{
+	size_t x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null
+	kroundup32(x); kroundup32(y);
+	if (x < y)
+	{
+		header->n_text = y;
+		header->text = (char*)realloc(header->text, y);
+		if ( !header->text )
+		{
+			// size_t is not long everywhere: cast to match the conversion
+			fprintf(stderr,"realloc failed to alloc %lu bytes\n", (unsigned long)y);
+			abort();
+		}
+	}
+	// Sanity check
+	if ( header->l_text+str->l+1 >= header->n_text )
+	{
+		fprintf(stderr,"append_text FIXME: %lu>=%lu, x=%lu,y=%lu\n", (unsigned long)(header->l_text+str->l+1),(unsigned long)header->n_text,(unsigned long)x,(unsigned long)y);
+		abort();
+	}
+	strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here.
+	header->l_text += str->l + 1;
+	header->text[header->l_text] = 0;
+}
+
+// Rebuild h->n_targets, h->target_name and h->target_len from the header
+// text. The previous arrays are freed first. The text is parsed into h->dict
+// (only when h->dict is not set yet) and the SN and LN values of the SQ lines
+// are extracted from it.
+// Returns the number of targets, or 0 when the text is shorter than 3 bytes
+// or contains no SQ/SN entry.
+// NOTE(review): the strings previously held in target_name[] are not freed
+// here, only the array itself -- confirm who owns them.
+int sam_header_parse(bam_header_t *h)
+{
+	char **tmp;
+	int i;
+	free(h->target_len); free(h->target_name);
+	h->n_targets = 0; h->target_len = 0; h->target_name = 0;
+	if (h->l_text < 3) return 0;
+	if (h->dict == 0) h->dict = sam_header_parse2(h->text);
+	tmp = sam_header2list(h->dict, "SQ", "SN", &h->n_targets);
+	if (h->n_targets == 0) return 0;
+	h->target_name = calloc(h->n_targets, sizeof(void*));
+	for (i = 0; i < h->n_targets; ++i)
+		h->target_name[i] = strdup(tmp[i]); // the header gets its own copy of each name
+	free(tmp);
+	// the second call overwrites h->n_targets with the number of LN values
+	tmp = sam_header2list(h->dict, "SQ", "LN", &h->n_targets);
+	h->target_len = calloc(h->n_targets, 4);
+	for (i = 0; i < h->n_targets; ++i)
+		h->target_len[i] = atoi(tmp[i]);
+	free(tmp);
+	return h->n_targets;
+}
+
+// Read the '@' header lines at the start of a SAM stream into a new header.
+// Tokens are read one at a time; as long as a token starts with '@', the
+// token and the remainder of its line are appended to the header text.
+// The first token that does not start with '@' is left in fp->str and
+// fp->is_first is set, so that sam_read1() uses it as the first field of the
+// first record instead of reading a new token.
+// NOTE(review): is_first is also set when the loop ended because the input
+// was exhausted -- confirm how sam_read1() behaves on a header-only file.
+bam_header_t *sam_header_read(tamFile fp)
+{
+	int ret, dret;
+	bam_header_t *header = bam_header_init();
+	kstring_t *str = fp->str;
+	while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header
+		str->s[str->l] = dret; // note that str->s is NOT null terminated!!
+		append_text(header, str);
+		if (dret != '\n') { // the token ended at a tab: fetch the rest of the line
+			ret = ks_getuntil(fp->ks, '\n', str, &dret);
+			str->s[str->l] = '\n'; // NOT null terminated!!
+			append_text(header, str);
+		}
+		++fp->n_lines;
+	}
+	sam_header_parse(header);
+	bam_init_header_hash(header);
+	fp->is_first = 1;
+	return header;
+}
+
+// Parse one alignment line of a text SAM stream into `b'.
+//
+// The fields are read token by token in column order (name, flag, reference,
+// position, mapping quality, CIGAR, mate reference, mate position, insert
+// size, sequence, quality, then optional TAG:TYPE:VALUE fields) and written
+// into b->data, which is grown as needed.
+//
+// Returns the number of input bytes consumed (token lengths plus one
+// delimiter each) on success, -1 when no further record can be read and
+// -2 .. -6 when the line ends before all mandatory fields were read.
+// Malformed content is reported through parse_error(), which aborts.
+int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
+{
+	int ret, doff, doff0, dret, z = 0;
+	bam1_core_t *c = &b->core;
+	kstring_t *str = fp->str;
+	kstream_t *ks = fp->ks;
+
+	if (fp->is_first) { // the first field was already read by sam_header_read()
+		fp->is_first = 0;
+		ret = str->l;
+	} else {
+		do { // special consideration for empty lines
+			ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret);
+			if (ret >= 0) z += str->l + 1;
+		} while (ret == 0);
+	}
+	if (ret < 0) return -1;
+	++fp->n_lines;
+	doff = 0;
+
+	{ // name
+		c->l_qname = strlen(str->s) + 1;
+		memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname);
+		doff += c->l_qname;
+	}
+	{ // flag
+		long flag;
+		char *s;
+		ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+		flag = strtol((char*)str->s, &s, 0);
+		if (*s) { // not the end of the string
+			flag = 0;
+			for (s = str->s; *s; ++s) // index as unsigned: plain char may be negative
+				flag |= bam_char2flag_table[(unsigned char)*s];
+		}
+		c->flag = flag;
+	}
+	{ // tid, pos, qual
+		ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s);
+		if (c->tid < 0 && strcmp(str->s, "*")) {
+			if (header->n_targets == 0) {
+				fprintf(stderr, "[sam_read1] missing header? Abort!\n");
+				exit(1);
+			} else fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s);
+		}
+		ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
+		ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? atoi(str->s) : 0;
+		if (ret < 0) return -2;
+	}
+	{ // cigar
+		char *s, *t;
+		int i, op;
+		long x;
+		c->n_cigar = 0;
+		if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3;
+		z += str->l + 1;
+		if (str->s[0] != '*') {
+			uint32_t *cigar;
+			for (s = str->s; *s; ++s) { // first pass: count the operations
+				if ((isalpha(*s)) || (*s=='=')) ++c->n_cigar;
+				else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character");
+			}
+			b->data = alloc_data(b, doff + c->n_cigar * 4);
+			cigar = bam1_cigar(b);
+			for (i = 0, s = str->s; i != c->n_cigar; ++i) { // second pass: encode them
+				x = strtol(s, &t, 10);
+				op = toupper(*t);
+				if (op == 'M') op = BAM_CMATCH;
+				else if (op == 'I') op = BAM_CINS;
+				else if (op == 'D') op = BAM_CDEL;
+				else if (op == 'N') op = BAM_CREF_SKIP;
+				else if (op == 'S') op = BAM_CSOFT_CLIP;
+				else if (op == 'H') op = BAM_CHARD_CLIP;
+				else if (op == 'P') op = BAM_CPAD;
+				else if (op == '=') op = BAM_CEQUAL;
+				else if (op == 'X') op = BAM_CDIFF;
+				else if (op == 'B') op = BAM_CBACK;
+				else parse_error(fp->n_lines, "invalid CIGAR operation");
+				s = t + 1;
+				cigar[i] = bam_cigar_gen(x, op);
+			}
+			if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation");
+			c->bin = bam_reg2bin(c->pos, bam_calend(c, cigar));
+			doff += c->n_cigar * 4;
+		} else {
+			if (!(c->flag&BAM_FUNMAP)) {
+				fprintf(stderr, "Parse warning at line %lld: mapped sequence without CIGAR\n", (long long)fp->n_lines);
+				c->flag |= BAM_FUNMAP;
+			}
+			c->bin = bam_reg2bin(c->pos, c->pos + 1);
+		}
+	}
+	{ // mtid, mpos, isize
+		ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+		c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid;
+		ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+		c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
+		ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
+		c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? atoi(str->s) : 0;
+		if (ret < 0) return -4;
+	}
+	{ // seq and qual
+		int i;
+		uint8_t *p = 0;
+		if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq
+		z += str->l + 1;
+		if (strcmp(str->s, "*")) {
+			c->l_qseq = strlen(str->s);
+			if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) {
+				fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n",
+						(long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b)));
+				parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
+			}
+			p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff;
+			memset(p, 0, (c->l_qseq+1)/2);
+			for (i = 0; i < c->l_qseq; ++i) // two bases per byte, first base in the high nibble
+				p[i/2] |= bam_nt16_table[(unsigned char)str->s[i]] << 4*(1-i%2);
+		} else c->l_qseq = 0;
+		if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual
+		z += str->l + 1;
+		if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s))
+			parse_error(fp->n_lines, "sequence and quality are inconsistent");
+		p += (c->l_qseq+1)/2;
+		if (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff;
+		else for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33;
+		doff += c->l_qseq + (c->l_qseq+1)/2;
+	}
+	doff0 = doff; // the auxiliary fields start here
+	if (dret != '\n' && dret != '\r') { // aux
+		while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) {
+			uint8_t *s, type, key[2];
+			z += str->l + 1;
+			if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':')
+				parse_error(fp->n_lines, "missing colon in auxiliary data");
+			key[0] = str->s[0]; key[1] = str->s[1];
+			type = str->s[3];
+			s = alloc_data(b, doff + 3) + doff;
+			s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2;
+			if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility
+				s = alloc_data(b, doff + 2) + doff;
+				*s++ = 'A'; *s = str->s[5];
+				doff += 2;
+			} else if (type == 'I' || type == 'i') { // stored with the smallest integer type that fits
+				long long x;
+				s = alloc_data(b, doff + 5) + doff;
+				x = (long long)atoll(str->s + 5);
+				if (x < 0) {
+					if (x >= -127) {
+						*s++ = 'c'; *(int8_t*)s = (int8_t)x;
+						s += 1; doff += 2;
+					} else if (x >= -32767) {
+						*s++ = 's'; *(int16_t*)s = (int16_t)x;
+						s += 2; doff += 3;
+					} else {
+						*s++ = 'i'; *(int32_t*)s = (int32_t)x;
+						s += 4; doff += 5;
+						if (x < -2147483648ll)
+							fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.",
+									(long long)fp->n_lines, x);
+					}
+				} else {
+					if (x <= 255) {
+						*s++ = 'C'; *s++ = (uint8_t)x;
+						doff += 2;
+					} else if (x <= 65535) {
+						*s++ = 'S'; *(uint16_t*)s = (uint16_t)x;
+						s += 2; doff += 3;
+					} else {
+						*s++ = 'I'; *(uint32_t*)s = (uint32_t)x;
+						s += 4; doff += 5;
+						if (x > 4294967295ll)
+							fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.",
+									(long long)fp->n_lines, x);
+					}
+				}
+			} else if (type == 'f') {
+				s = alloc_data(b, doff + 5) + doff;
+				*s++ = 'f';
+				*(float*)s = (float)atof(str->s + 5);
+				s += 4; doff += 5;
+			} else if (type == 'd') {
+				s = alloc_data(b, doff + 9) + doff;
+				*s++ = 'd';
+				// the value text starts at offset 5, like every other type, and
+				// all 8 reserved bytes must be written: store a double
+				*(double*)s = atof(str->s + 5);
+				s += 8; doff += 9;
+			} else if (type == 'Z' || type == 'H') {
+				int size = 1 + (str->l - 5) + 1; // type byte + text + terminator
+				if (type == 'H') { // check whether the hex string is valid
+					int i;
+					if ((str->l - 5) % 2 == 1) parse_error(fp->n_lines, "length of the hex string not even");
+					for (i = 0; i < str->l - 5; ++i) {
+						int c = toupper(str->s[5 + i]);
+						if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
+							parse_error(fp->n_lines, "invalid hex character");
+					}
+				}
+				s = alloc_data(b, doff + size) + doff;
+				*s++ = type;
+				memcpy(s, str->s + 5, str->l - 5);
+				s[str->l - 5] = 0;
+				doff += size;
+			} else if (type == 'B') {
+				int32_t n = 0, Bsize, k = 0, size;
+				char *p;
+				if (str->l < 8) parse_error(fp->n_lines, "too few values in aux type B");
+				Bsize = bam_aux_type2size(str->s[5]); // the size of each element
+				for (p = (char*)str->s + 6; *p; ++p) // count the number of elements in the array
+					if (*p == ',') ++n;
+				p = str->s + 7; // now p points to the first number in the array
+				size = 6 + Bsize * n; // total number of bytes allocated to this tag
+				s = alloc_data(b, doff + size) + doff; // allocate exactly the bytes accounted for in `size'
+				*s++ = 'B'; *s++ = str->s[5];
+				memcpy(s, &n, 4); s += 4; // write the number of elements
+				if (str->s[5] == 'c')      while (p < str->s + str->l) ((int8_t*)s)[k++]   = (int8_t)strtol(p, &p, 0),   ++p;
+				else if (str->s[5] == 'C') while (p < str->s + str->l) ((uint8_t*)s)[k++]  = (uint8_t)strtol(p, &p, 0),  ++p;
+				else if (str->s[5] == 's') while (p < str->s + str->l) ((int16_t*)s)[k++]  = (int16_t)strtol(p, &p, 0),  ++p; // FIXME: avoid unaligned memory
+				else if (str->s[5] == 'S') while (p < str->s + str->l) ((uint16_t*)s)[k++] = (uint16_t)strtol(p, &p, 0), ++p;
+				else if (str->s[5] == 'i') while (p < str->s + str->l) ((int32_t*)s)[k++]  = (int32_t)strtol(p, &p, 0),  ++p;
+				else if (str->s[5] == 'I') while (p < str->s + str->l) ((uint32_t*)s)[k++] = (uint32_t)strtol(p, &p, 0), ++p;
+				else if (str->s[5] == 'f') while (p < str->s + str->l) ((float*)s)[k++]    = (float)strtod(p, &p),       ++p;
+				else parse_error(fp->n_lines, "unrecognized array type");
+				s += Bsize * n; doff += size;
+			} else parse_error(fp->n_lines, "unrecognized type");
+			if (dret == '\n' || dret == '\r') break;
+		}
+	}
+	b->l_aux = doff - doff0;
+	b->data_len = doff;
+	if (bam_no_B) bam_remove_B(b);
+	return z;
+}
+
+// Open a text SAM input ("-" means stdin) for reading.
+// Returns a newly allocated handle to be released with sam_close(), or 0
+// when the input cannot be opened.
+tamFile sam_open(const char *fn)
+{
+	tamFile fp;
+	gzFile gzfp;
+	if (strcmp(fn, "-") == 0) gzfp = gzdopen(fileno(stdin), "rb");
+	else gzfp = gzopen(fn, "rb");
+	if (gzfp == 0) return 0;
+	fp = (tamFile)calloc(1, sizeof(struct __tamFile_t));
+	fp->str = (kstring_t*)calloc(1, sizeof(kstring_t));
+	fp->fp = gzfp;
+	fp->ks = ks_init(gzfp);
+	return fp;
+}
+
+// Release a handle returned by sam_open(): the token reader, the zlib
+// handle, the scratch string and the handle itself. A null `fp' is ignored.
+void sam_close(tamFile fp)
+{
+	if (fp == 0) return;
+	ks_destroy(fp->ks);
+	gzclose(fp->fp);
+	free(fp->str->s);
+	free(fp->str);
+	free(fp);
+}
diff --git a/bam_index.c b/bam_index.c
new file mode 100644
index 0000000..d6b94e2
--- /dev/null
+++ b/bam_index.c
@@ -0,0 +1,724 @@
+#include <ctype.h>
+#include <assert.h>
+#include "bam.h"
+#include "khash.h"
+#include "ksort.h"
+#include "bam_endian.h"
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+/*!
+  @header
+
+  Alignment indexing. Before indexing, BAM must be sorted based on the
+  leftmost coordinate of alignments. In indexing, BAM uses two indices:
+  a UCSC binning index and a simple linear index. The binning index is
+  efficient for alignments spanning long distance, while the auxiliary
+  linear index helps to reduce unnecessary seek calls especially for
+  short alignments.
+
+  The UCSC binning scheme was suggested by Richard Durbin and Lincoln
+  Stein and is explained by Kent et al. (2002). In this scheme, each bin
+  represents a contiguous genomic region which can be fully contained in
+  another bin; each alignment is associated with a bin which represents
+  the smallest region containing the entire alignment. The binning
+  scheme is essentially another representation of R-tree. A distinct bin
+  uniquely corresponds to a distinct internal node in a R-tree. Bin A is
+  a child of Bin B if region A is contained in B.
+
+  In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
+  0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
+  585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
+  find the alignments overlapping a region [rbeg,rend), we need to
+  calculate the list of bins that may overlap the region and test
+  the alignments in the bins to confirm the overlaps. If the specified
+  region is short, typically only a few alignments in six bins need to
+  be retrieved. The overlapping alignments can be quickly fetched.
+
+ */
+
+#define BAM_MIN_CHUNK_GAP 32768
+// 1<<14 is the size of minimum bin.
+#define BAM_LIDX_SHIFT    14
+
+#define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
+
+// A pair of 64-bit values. insert_offset() stores its `beg` argument in u and
+// its `end` argument in v; bam_index_core() uses this both for file-offset
+// ranges and, in the BAM_MAX_BIN pseudo-bin, for (n_mapped, n_unmapped) counts.
+typedef struct {
+	uint64_t u, v;
+} pair64_t;
+
+#define pair64_lt(a,b) ((a).u < (b).u)
+KSORT_INIT(off, pair64_t, pair64_lt)
+
+// Growable array of pairs attached to one bin of the binning index.
+typedef struct {
+	uint32_t m, n; // m: allocated capacity of list; n: number of entries in use
+	pair64_t *list;
+} bam_binlist_t;
+
+// Linear index of one reference sequence: offset[k] is the file offset
+// recorded for the k-th window of 1<<BAM_LIDX_SHIFT bases; 0 means "none yet".
+typedef struct {
+	int32_t n, m; // n: number of windows in use; m: allocated capacity of offset
+	uint64_t *offset;
+} bam_lidx_t;
+
+KHASH_MAP_INIT_INT(i, bam_binlist_t)
+
+// Complete index: one binning index and one linear index per reference sequence.
+struct __bam_index_t {
+	int32_t n; // number of reference sequences, i.e. length of index[] and index2[]
+	uint64_t n_no_coor; // unmapped reads without coordinate
+	khash_t(i) **index; // index[tid]: hash from bin number to its list of pairs
+	bam_lidx_t *index2; // index2[tid]: linear index of that reference
+};
+
+// requirement: len <= LEN_MASK
+/*
+ * Append the pair (beg, end) to the list stored under `bin` in hash `h`.
+ * The list is created with room for one entry the first time a bin is seen
+ * and its capacity is doubled whenever it is full.
+ */
+static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
+{
+	int absent;
+	khint_t slot = kh_put(i, h, bin, &absent);
+	bam_binlist_t *chunks = &kh_value(h, slot);
+	pair64_t *dst;
+	if (absent) { // first pair for this bin
+		chunks->n = 0;
+		chunks->m = 1;
+		chunks->list = (pair64_t*)calloc(chunks->m, 16);
+	}
+	if (chunks->n == chunks->m) { // full: double the capacity
+		chunks->m <<= 1;
+		chunks->list = (pair64_t*)realloc(chunks->list, chunks->m * 16);
+	}
+	dst = &chunks->list[chunks->n++];
+	dst->u = beg;
+	dst->v = end;
+}
+
+/*
+ * Record `offset` in the linear index for every window of 1<<BAM_LIDX_SHIFT
+ * bases touched by alignment `b`. Windows that already hold a non-zero offset
+ * are left untouched, so each window keeps the first offset recorded for it.
+ *
+ * index2  linear index of the reference sequence the alignment belongs to
+ * b       the alignment; its position and CIGAR determine the windows covered
+ * offset  file offset to record
+ */
+static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
+{
+	int i, beg, end;
+	beg = b->core.pos >> BAM_LIDX_SHIFT;
+	end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
+	// If bam_calend() does not extend past the start (presumably an alignment
+	// that consumes no reference bases), `end` lands one window before `beg`.
+	// Treat such a record as covering its own start window.
+	if (end < beg) end = beg;
+	if (index2->m < end + 1) {
+		int old_m = index2->m;
+		index2->m = end + 1;
+		kroundup32(index2->m);
+		index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
+		memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); // 0 marks "no offset yet"
+	}
+	for (i = beg; i <= end; ++i)
+		if (index2->offset[i] == 0) index2->offset[i] = offset;
+	// Only ever grow n. Records are sorted by start, not by end, so a short
+	// alignment may follow a long one; assigning end + 1 unconditionally would
+	// shrink the index and drop windows that were already filled in.
+	if (index2->n < end + 1) index2->n = end + 1;
+}
+
+/*
+ * Merge neighbouring pairs within each bin of the binning index, in place.
+ * The BAM_MAX_BIN pseudo-bin is skipped. The function does nothing unless
+ * BAM_TRUE_OFFSET or BAM_VIRTUAL_OFFSET16 is defined.
+ */
+static void merge_chunks(bam_index_t *idx)
+{
+#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+	khash_t(i) *index;
+	int i, l, m;
+	khint_t k;
+	for (i = 0; i < idx->n; ++i) {
+		index = idx->index[i];
+		for (k = kh_begin(index); k != kh_end(index); ++k) {
+			bam_binlist_t *p;
+			if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
+			p = &kh_value(index, k);
+			// m is the last entry kept so far; l scans the remaining entries
+			m = 0;
+			for (l = 1; l < p->n; ++l) {
+#ifdef BAM_TRUE_OFFSET
+				// merge when the next pair starts less than BAM_MIN_CHUNK_GAP after the kept one ends
+				if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
+#else
+				// merge when the kept end and the next start agree in everything above the low 16 bits
+				if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
+#endif
+				else p->list[++m] = p->list[l];
+			} // ~for(l)
+			p->n = m + 1;
+		} // ~for(k)
+	} // ~for(i)
+#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+}
+
+/*
+ * Fill the gaps of every linear index: a window without a recorded offset (0)
+ * inherits the offset of the window before it. Window 0 is never changed.
+ */
+static void fill_missing(bam_index_t *idx)
+{
+	int tid;
+	for (tid = 0; tid < idx->n; ++tid) {
+		bam_lidx_t *lidx = &idx->index2[tid];
+		int w;
+		for (w = 1; w < lidx->n; ++w) {
+			if (lidx->offset[w] != 0) continue;
+			lidx->offset[w] = lidx->offset[w-1];
+		}
+	}
+}
+
+/*
+ * Build an in-memory index by reading every record of an open BAM file.
+ *
+ * fp  BAM file positioned at its start; the header is read first
+ *
+ * For each reference sequence the function fills the binning index, the
+ * linear index and a BAM_MAX_BIN pseudo-bin holding two pairs: the file
+ * offset range of the reference and its (mapped, unmapped) read counts.
+ *
+ * Returns the new index (release it with bam_index_destroy()), or NULL when
+ * the header is invalid, the records are not sorted, or the file offset does
+ * not advance between records. A read error (return code below -1) is
+ * reported on stderr but does not discard the index.
+ */
+bam_index_t *bam_index_core(bamFile fp)
+{
+	bam1_t *b;
+	bam_header_t *h;
+	int i, ret;
+	bam_index_t *idx;
+	uint32_t last_bin, save_bin;
+	int32_t last_coor, last_tid, save_tid;
+	bam1_core_t *c;
+	uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
+
+	h = bam_header_read(fp);
+	if(h == 0) {
+	    fprintf(stderr, "[bam_index_core] Invalid BAM header.\n");
+	    return NULL;
+	}
+
+	idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
+	b = (bam1_t*)calloc(1, sizeof(bam1_t));
+	c = &b->core;
+
+	idx->n = h->n_targets;
+	bam_header_destroy(h);
+	idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
+	for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
+	idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
+
+	// 0xffffffffu (-1 in the signed variables) means "nothing seen yet"
+	save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
+	save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
+	n_mapped = n_unmapped = n_no_coor = off_end = 0;
+	off_beg = off_end = bam_tell(fp);
+	while ((ret = bam_read1(fp, b)) >= 0) {
+		if (c->tid < 0) ++n_no_coor;
+		if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes
+			last_tid = c->tid;
+			last_bin = 0xffffffffu;
+		} else if ((uint32_t)last_tid > (uint32_t)c->tid) {
+			fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n",
+					bam1_qname(b), last_tid+1, c->tid+1);
+			goto fail;
+		} else if ((int32_t)c->tid >= 0 && last_coor > c->pos) {
+			fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
+					bam1_qname(b), last_coor, c->pos, c->tid+1);
+			goto fail;
+		}
+		if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
+		if (c->bin != last_bin) { // then possibly write the binning index
+			if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
+				insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
+			if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
+				off_end = last_off;
+				insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
+				insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
+				n_mapped = n_unmapped = 0;
+				off_beg = off_end;
+			}
+			save_off = last_off;
+			save_bin = last_bin = c->bin;
+			save_tid = c->tid;
+			if (save_tid < 0) break; // reached the reads without coordinates
+		}
+		if (bam_tell(fp) <= last_off) {
+			fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
+					(unsigned long long)bam_tell(fp), (unsigned long long)last_off);
+			goto fail;
+		}
+		if (c->flag & BAM_FUNMAP) ++n_unmapped;
+		else ++n_mapped;
+		last_off = bam_tell(fp);
+		last_coor = b->core.pos;
+	}
+	if (save_tid >= 0) { // flush the last bin and the meta element of the last reference
+		insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
+		insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
+		insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
+	}
+	merge_chunks(idx);
+	fill_missing(idx);
+	if (ret >= 0) { // the main loop stopped early: count the remaining coordinate-less reads
+		while ((ret = bam_read1(fp, b)) >= 0) {
+			++n_no_coor;
+			if (c->tid >= 0 && n_no_coor) {
+				fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
+				goto fail;
+			}
+		}
+	}
+	if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
+	free(b->data); free(b);
+	idx->n_no_coor = n_no_coor;
+	return idx;
+
+fail: // release the record buffer and the partially built index instead of leaking them
+	free(b->data); free(b);
+	bam_index_destroy(idx);
+	return NULL;
+}
+
+/* Free an index together with everything it owns; a null index is ignored. */
+void bam_index_destroy(bam_index_t *idx)
+{
+	int tid;
+	if (idx == 0) return;
+	for (tid = 0; tid < idx->n; ++tid) {
+		khash_t(i) *bins = idx->index[tid];
+		khint_t it = kh_begin(bins);
+		while (it != kh_end(bins)) { // free the list of every occupied bucket
+			if (kh_exist(bins, it)) free(kh_value(bins, it).list);
+			++it;
+		}
+		kh_destroy(i, bins);
+		free(idx->index2[tid].offset);
+	}
+	free(idx->index2);
+	free(idx->index);
+	free(idx);
+}
+
+/*
+ * Write an index to `fp`.
+ *
+ * Layout, as produced below: the 4-byte magic "BAI\1", the number of
+ * references, then for each reference its binning index (bin count, and per
+ * bin: key, entry count, entries of 16 bytes each) followed by its linear
+ * index (window count, 8-byte offsets), and finally the 8-byte count of
+ * reads without coordinates. When bam_is_be is set every integer is
+ * byte-swapped before being written.
+ *
+ * The stream is flushed but not closed. Results of fwrite() are not checked.
+ */
+void bam_index_save(const bam_index_t *idx, FILE *fp)
+{
+	int32_t i, size;
+	khint_t k;
+	fwrite("BAI\1", 1, 4, fp);
+	if (bam_is_be) {
+		uint32_t x = idx->n;
+		fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+	} else fwrite(&idx->n, 4, 1, fp);
+	for (i = 0; i < idx->n; ++i) {
+		khash_t(i) *index = idx->index[i];
+		bam_lidx_t *index2 = idx->index2 + i;
+		// write binning index
+		size = kh_size(index);
+		if (bam_is_be) { // big endian
+			uint32_t x = size;
+			fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+		} else fwrite(&size, 4, 1, fp);
+		for (k = kh_begin(index); k != kh_end(index); ++k) {
+			if (kh_exist(index, k)) {
+				bam_binlist_t *p = &kh_value(index, k);
+				if (bam_is_be) { // big endian
+					uint32_t x;
+					x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+					x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+					// swap the list in place, write it, then swap it back so the
+					// in-memory index is unchanged after the call
+					for (x = 0; (int)x < p->n; ++x) {
+						bam_swap_endian_8p(&p->list[x].u);
+						bam_swap_endian_8p(&p->list[x].v);
+					}
+					fwrite(p->list, 16, p->n, fp);
+					for (x = 0; (int)x < p->n; ++x) {
+						bam_swap_endian_8p(&p->list[x].u);
+						bam_swap_endian_8p(&p->list[x].v);
+					}
+				} else {
+					fwrite(&kh_key(index, k), 4, 1, fp);
+					fwrite(&p->n, 4, 1, fp);
+					fwrite(p->list, 16, p->n, fp);
+				}
+			}
+		}
+		// write linear index (index2)
+		if (bam_is_be) {
+			int x = index2->n;
+			fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+		} else fwrite(&index2->n, 4, 1, fp);
+		if (bam_is_be) { // big endian
+			int x;
+			// same swap / write / swap-back scheme as for the bin lists
+			for (x = 0; (int)x < index2->n; ++x)
+				bam_swap_endian_8p(&index2->offset[x]);
+			fwrite(index2->offset, 8, index2->n, fp);
+			for (x = 0; (int)x < index2->n; ++x)
+				bam_swap_endian_8p(&index2->offset[x]);
+		} else fwrite(index2->offset, 8, index2->n, fp);
+	}
+	{ // write the number of reads coor-less records.
+		uint64_t x = idx->n_no_coor;
+		if (bam_is_be) bam_swap_endian_8p(&x);
+		fwrite(&x, 8, 1, fp);
+	}
+	fflush(fp);
+}
+
+/*
+ * Read an index in the layout written by bam_index_save() from `fp`.
+ *
+ * The stream is never closed here; that is the caller's responsibility.
+ *
+ * Returns the loaded index (release it with bam_index_destroy()), or 0 when
+ * fp is null, the file does not start with the "BAI\1" magic, or the number
+ * of references cannot be read. A missing trailing count of coordinate-less
+ * reads is accepted and read as 0.
+ * NOTE(review): reads inside the per-reference loop are still unchecked, so a
+ * file truncated there yields a partially filled index.
+ */
+static bam_index_t *bam_index_load_core(FILE *fp)
+{
+	int i;
+	char magic[4];
+	bam_index_t *idx;
+	if (fp == 0) {
+		fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
+		return 0;
+	}
+	// A short read would leave magic[] uninitialised, so check the count first.
+	if (fread(magic, 1, 4, fp) != 4 || memcmp(magic, "BAI\1", 4) != 0) {
+		fprintf(stderr, "[bam_index_load] wrong magic number.\n");
+		// fp is deliberately left open: bam_index_load_local() closes it after
+		// this function returns, and closing a stream twice is undefined.
+		return 0;
+	}
+	idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
+	if (idx == 0) return 0;
+	if (fread(&idx->n, 4, 1, fp) != 1) {
+		fprintf(stderr, "[bam_index_load] truncated index file.\n");
+		free(idx);
+		return 0;
+	}
+	if (bam_is_be) bam_swap_endian_4p(&idx->n);
+	idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
+	idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
+	for (i = 0; i < idx->n; ++i) {
+		khash_t(i) *index;
+		bam_lidx_t *index2 = idx->index2 + i;
+		uint32_t key, size;
+		khint_t k;
+		int j, ret;
+		bam_binlist_t *p;
+		index = idx->index[i] = kh_init(i);
+		// load binning index
+		fread(&size, 4, 1, fp);
+		if (bam_is_be) bam_swap_endian_4p(&size);
+		for (j = 0; j < (int)size; ++j) {
+			fread(&key, 4, 1, fp);
+			if (bam_is_be) bam_swap_endian_4p(&key);
+			k = kh_put(i, index, key, &ret);
+			p = &kh_value(index, k);
+			fread(&p->n, 4, 1, fp);
+			if (bam_is_be) bam_swap_endian_4p(&p->n);
+			p->m = p->n;
+			p->list = (pair64_t*)malloc(p->m * 16);
+			fread(p->list, 16, p->n, fp);
+			if (bam_is_be) {
+				int x;
+				for (x = 0; x < p->n; ++x) {
+					bam_swap_endian_8p(&p->list[x].u);
+					bam_swap_endian_8p(&p->list[x].v);
+				}
+			}
+		}
+		// load linear index
+		fread(&index2->n, 4, 1, fp);
+		if (bam_is_be) bam_swap_endian_4p(&index2->n);
+		index2->m = index2->n;
+		index2->offset = (uint64_t*)calloc(index2->m, 8);
+		fread(index2->offset, index2->n, 8, fp);
+		if (bam_is_be)
+			for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
+	}
+	// the trailing count is optional: treat a missing one as 0
+	if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
+	if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
+	return idx;
+}
+
+/*
+ * Load the index that belongs to BAM file `_fn` from the local file system.
+ *
+ * For an ftp:// or http:// name only the part after the last '/' is used, so
+ * the index is looked up in the working directory. Two names are tried in
+ * order: "<name>.bai", then, if <name> ends in "bam", the same name with the
+ * final 'm' replaced by 'i' (e.g. "x.bam" -> "x.bai").
+ *
+ * Returns the index, or 0 if no index file can be opened or loaded.
+ */
+bam_index_t *bam_index_load_local(const char *_fn)
+{
+	FILE *fp;
+	char *fnidx, *fn;
+	size_t len;
+
+	if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
+		const char *p = strrchr(_fn, '/'); // never null here: the scheme prefix contains '/'
+		fn = strdup(p + 1);
+	} else fn = strdup(_fn);
+	if (fn == 0) return 0;
+	len = strlen(fn);
+	fnidx = (char*)calloc(len + 5, 1);
+	if (fnidx == 0) { free(fn); return 0; }
+	strcpy(fnidx, fn); strcat(fnidx, ".bai");
+	fp = fopen(fnidx, "rb");
+	// Try "{base}.bai". Compare the real suffix: searching for the first "bam"
+	// with strstr() misses names such as "bam_run1.bam".
+	if (fp == 0 && len >= 3 && strcmp(fn + len - 3, "bam") == 0) {
+		strcpy(fnidx, fn);
+		fnidx[len-1] = 'i';
+		fp = fopen(fnidx, "rb");
+	}
+	free(fnidx); free(fn);
+	if (fp) {
+		bam_index_t *idx = bam_index_load_core(fp);
+		fclose(fp);
+		return idx;
+	} else return 0;
+}
+
+#ifdef _USE_KNETFILE
+/*
+ * Download `url` (ftp:// or http:// only) into the working directory, using
+ * the part of the URL after the last '/' as the local file name. Any other
+ * kind of name is ignored. Failures are reported on stderr; nothing is
+ * returned to the caller.
+ */
+static void download_from_remote(const char *url)
+{
+	const int buf_size = 1 * 1024 * 1024;
+	const char *fn;
+	FILE *fp;
+	uint8_t *buf;
+	knetFile *fp_remote;
+	int l;
+	if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
+	fn = strrchr(url, '/') + 1; // never null: the scheme prefix contains '/'; fn is the file name
+	fp_remote = knet_open(url, "r");
+	if (fp_remote == 0) {
+		fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
+		return;
+	}
+	if ((fp = fopen(fn, "wb")) == 0) {
+		fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
+		knet_close(fp_remote);
+		return;
+	}
+	buf = (uint8_t*)calloc(buf_size, 1);
+	if (buf == 0) {
+		fprintf(stderr, "[download_from_remote] fail to allocate the download buffer.\n");
+		fclose(fp);
+		knet_close(fp_remote);
+		return;
+	}
+	// Stop on 0 and on any negative count: passing a negative length to
+	// fwrite() would be converted to a huge size_t and read past the buffer.
+	// NOTE(review): confirm knet_read()'s exact error convention.
+	while ((l = knet_read(fp_remote, buf, buf_size)) > 0)
+		fwrite(buf, 1, l, fp);
+	free(buf);
+	fclose(fp);
+	knet_close(fp_remote);
+}
+#else
+/* Built without _USE_KNETFILE: remote download is unavailable, do nothing. */
+static void download_from_remote(const char *url)
+{
+	return;
+}
+#endif
+
+/*
+ * Load the index of BAM file `fn`.
+ *
+ * A local index is tried first. If none is found and `fn` is an ftp:// or
+ * http:// URL, "<fn>.bai" is downloaded into the working directory and the
+ * local lookup is repeated.
+ *
+ * Returns the index, or 0 (after a message on stderr) if it cannot be loaded.
+ */
+bam_index_t *bam_index_load(const char *fn)
+{
+	bam_index_t *idx;
+	idx = bam_index_load_local(fn);
+	if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
+		char *fnidx = calloc(strlen(fn) + 5, 1);
+		if (fnidx) {
+			strcat(strcpy(fnidx, fn), ".bai");
+			fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
+			download_from_remote(fnidx);
+			free(fnidx); // the URL buffer used to be leaked
+			idx = bam_index_load_local(fn);
+		}
+	}
+	if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
+	return idx;
+}
+
+/*
+ * Index BAM file `fn` and write the index to disk.
+ *
+ * fn      BAM file to index
+ * _fnidx  name of the index file, or 0 to use "<fn>.bai"
+ *
+ * Returns 0 on success, -1 if the BAM file cannot be opened or indexed or
+ * the index file cannot be created.
+ */
+int bam_index_build2(const char *fn, const char *_fnidx)
+{
+	char *fnidx;
+	FILE *fpidx;
+	bamFile fp;
+	bam_index_t *idx;
+	if ((fp = bam_open(fn, "r")) == 0) {
+		fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
+		return -1;
+	}
+	idx = bam_index_core(fp);
+	bam_close(fp);
+	if(idx == 0) {
+		fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n");
+		return -1;
+	}
+	if (_fnidx == 0) {
+		fnidx = (char*)calloc(strlen(fn) + 5, 1);
+		if (fnidx) { strcpy(fnidx, fn); strcat(fnidx, ".bai"); }
+	} else fnidx = strdup(_fnidx);
+	fpidx = fnidx? fopen(fnidx, "wb") : 0;
+	if (fpidx == 0) {
+		fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
+		free(fnidx);
+		bam_index_destroy(idx); // the freshly built index used to be leaked here
+		return -1;
+	}
+	bam_index_save(idx, fpidx);
+	bam_index_destroy(idx);
+	fclose(fpidx);
+	free(fnidx);
+	return 0;
+}
+
+/* Index BAM file `fn`, writing the index under the default name "<fn>.bai". */
+int bam_index_build(const char *fn)
+{
+	const char *default_name = 0; // 0 asks bam_index_build2() to derive the name
+	return bam_index_build2(fn, default_name);
+}
+
+/*
+ * Command-line entry point of "samtools index".
+ *
+ * argv[1] is the BAM file to index; argv[2], if present, is the name of the
+ * index file. Returns 0 on success and 1 on a usage error or when building
+ * the index fails (the failure used to be reported as success).
+ */
+int bam_index(int argc, char *argv[])
+{
+	int ret;
+	if (argc < 2) {
+		fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
+		return 1;
+	}
+	if (argc >= 3) ret = bam_index_build2(argv[1], argv[2]);
+	else ret = bam_index_build(argv[1]);
+	return ret < 0? 1 : 0;
+}
+
+/*
+ * Command-line entry point of "samtools idxstats".
+ *
+ * Prints one tab-separated line per reference: name, length and the two
+ * counts stored in the second entry of the BAM_MAX_BIN pseudo-bin (written
+ * by bam_index_core() as the mapped and unmapped read counts), or 0 and 0
+ * when the index holds no such entry. A final "*" line reports the number
+ * of reads without coordinates.
+ *
+ * Returns 0 on success, 1 on a usage error or when the BAM file, its header
+ * or its index cannot be read.
+ */
+int bam_idxstats(int argc, char *argv[])
+{
+	bam_index_t *idx;
+	bam_header_t *header;
+	bamFile fp;
+	int i;
+	if (argc < 2) {
+		fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
+		return 1;
+	}
+	fp = bam_open(argv[1], "r");
+	if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
+	header = bam_header_read(fp);
+	bam_close(fp);
+	if (header == 0) { fprintf(stderr, "[%s] fail to read the header.\n", __func__); return 1; }
+	idx = bam_index_load(argv[1]);
+	if (idx == 0) {
+		fprintf(stderr, "[%s] fail to load the index.\n", __func__);
+		bam_header_destroy(header); // used to be leaked on this path
+		return 1;
+	}
+	// never read past the header tables if the index lists more references
+	for (i = 0; i < idx->n && i < header->n_targets; ++i) {
+		khint_t k;
+		khash_t(i) *h = idx->index[i];
+		printf("%s\t%d", header->target_name[i], header->target_len[i]);
+		k = kh_get(i, h, BAM_MAX_BIN);
+		// the counts live in the second entry, so make sure it exists
+		if (k != kh_end(h) && kh_val(h, k).n >= 2)
+			printf("\t%llu\t%llu", (unsigned long long)kh_val(h, k).list[1].u, (unsigned long long)kh_val(h, k).list[1].v);
+		else printf("\t0\t0");
+		putchar('\n');
+	}
+	printf("*\t0\t0\t%llu\n", (unsigned long long)idx->n_no_coor);
+	bam_header_destroy(header);
+	bam_index_destroy(idx);
+	return 0;
+}
+
+/*
+ * List the bins that may hold alignments overlapping [beg,end).
+ *
+ * `end` is capped at 1<<29. Bin 0 always comes first, followed by the bins of
+ * each finer level in order (first bins 1, 9, 73, 585 and 4681, with shifts
+ * of 26, 23, 20, 17 and 14 bits). Returns the number of bins written to
+ * `list`, or 0 when beg >= end.
+ */
+static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
+{
+	static const int first_bin[5] = { 1, 9, 73, 585, 4681 };
+	static const int shift[5] = { 26, 23, 20, 17, 14 };
+	int n = 0, level;
+	if (beg >= end) return 0;
+	if (end >= 1u<<29) end = 1u<<29;
+	--end; // make the end inclusive
+	list[n++] = 0;
+	for (level = 0; level < 5; ++level) {
+		int bin = first_bin[level] + (beg >> shift[level]);
+		int last = first_bin[level] + (end >> shift[level]);
+		while (bin <= last) list[n++] = bin++;
+	}
+	return n;
+}
+
+/*
+ * Return 1 if alignment `b` overlaps [beg,end), 0 otherwise. An alignment
+ * without CIGAR operations is treated as one base long.
+ */
+static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
+{
+	uint32_t aln_beg = b->core.pos, aln_end;
+	if (b->core.n_cigar) aln_end = bam_calend(&b->core, bam1_cigar(b));
+	else aln_end = b->core.pos + 1;
+	if (aln_end <= beg) return 0; // ends before the region starts
+	return aln_beg < end;
+}
+
+// State of an iteration over the alignments of a region (see bam_iter_query()
+// and bam_iter_read()).
+struct __bam_iter_t {
+	int from_first; // read from the first record; no random access
+	// tid/beg/end: the queried region; n_off: number of entries in off;
+	// i: index of the current entry in off (-1 before the first);
+	// finished: set once iteration has ended
+	int tid, beg, end, n_off, i, finished;
+	uint64_t curr_off; // file offset after the last record read; 0 before the first read
+	pair64_t *off; // file-offset ranges to visit, sorted by start
+};
+
+// bam_fetch helper function: retrieves the file-offset ranges of the index
+// that may hold alignments overlapping [beg,end) on reference `tid` and
+// returns them in a new iterator (release it with bam_iter_destroy()).
+// A negative `beg` is treated as 0. Returns 0 when end < beg. The iterator
+// carries no ranges (off == 0) when `tid` is not a reference of the index or
+// when no indexed range qualifies.
+bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
+{
+	uint16_t *bins;
+	int i, n_bins, n_off;
+	pair64_t *off;
+	khint_t k;
+	khash_t(i) *index;
+	uint64_t min_off;
+	bam_iter_t iter = 0;
+
+	if (beg < 0) beg = 0;
+	if (end < beg) return 0;
+	// initialize iter
+	iter = calloc(1, sizeof(struct __bam_iter_t));
+	iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
+	// an unknown reference has nothing to fetch; using it to index
+	// idx->index[] and idx->index2[] would read out of bounds
+	if (tid < 0 || tid >= idx->n) return iter;
+	//
+	bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
+	n_bins = reg2bins(beg, end, bins);
+	index = idx->index[tid];
+	// min_off: ranges ending at or before this offset cannot overlap the region
+	if (idx->index2[tid].n > 0) {
+		min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
+			: idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
+		if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
+			int n = beg>>BAM_LIDX_SHIFT;
+			if (n > idx->index2[tid].n) n = idx->index2[tid].n;
+			for (i = n - 1; i >= 0; --i)
+				if (idx->index2[tid].offset[i] != 0) break;
+			if (i >= 0) min_off = idx->index2[tid].offset[i];
+		}
+	} else min_off = 0; // tabix 0.1.2 may produce such index files
+	// first pass: upper bound on the number of ranges
+	for (i = n_off = 0; i < n_bins; ++i) {
+		if ((k = kh_get(i, index, bins[i])) != kh_end(index))
+			n_off += kh_value(index, k).n;
+	}
+	if (n_off == 0) {
+		free(bins); return iter;
+	}
+	// second pass: collect the ranges that end after min_off
+	off = (pair64_t*)calloc(n_off, 16);
+	for (i = n_off = 0; i < n_bins; ++i) {
+		if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
+			int j;
+			bam_binlist_t *p = &kh_value(index, k);
+			for (j = 0; j < p->n; ++j)
+				if (p->list[j].v > min_off) off[n_off++] = p->list[j];
+		}
+	}
+	free(bins);
+	if (n_off == 0) {
+		free(off); return iter;
+	}
+	{
+		int l;
+		ks_introsort(off, n_off, off);
+		// resolve completely contained adjacent blocks
+		for (i = 1, l = 0; i < n_off; ++i)
+			if (off[l].v < off[i].v)
+				off[++l] = off[i];
+		n_off = l + 1;
+		// resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
+		for (i = 1; i < n_off; ++i)
+			if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
+		{ // merge adjacent blocks
+#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+			for (i = 1, l = 0; i < n_off; ++i) {
+#ifdef BAM_TRUE_OFFSET
+				if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
+#else
+				if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
+#endif
+				else off[++l] = off[i];
+			}
+			n_off = l + 1;
+#endif
+		}
+	}
+	iter->n_off = n_off; iter->off = off;
+	return iter;
+}
+
+/*
+ * Return the file-offset ranges bam_iter_query() selects for the region and
+ * store their number in *cnt_off. The caller owns the returned array and
+ * must free() it. Returns 0 with *cnt_off set to 0 when there is nothing to
+ * fetch or the region is invalid.
+ */
+pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
+{ // for pysam compatibility
+	bam_iter_t iter;
+	pair64_t *off;
+	iter = bam_iter_query(idx, tid, beg, end);
+	if (iter == 0) { // bam_iter_query() returns 0 when end < beg
+		*cnt_off = 0;
+		return 0;
+	}
+	off = iter->off; *cnt_off = iter->n_off;
+	free(iter); // only the iterator shell; `off` now belongs to the caller
+	return off;
+}
+
+/* Free an iterator and its list of ranges; a null iterator is ignored. */
+void bam_iter_destroy(bam_iter_t iter)
+{
+	if (iter == 0) return;
+	free(iter->off);
+	free(iter);
+}
+
+/*
+ * Read the next alignment of an iteration into `b`.
+ *
+ * fp    BAM file the iterator was created for
+ * iter  iterator from bam_iter_query(); with a null iterator, or one whose
+ *       from_first flag is set, records are simply read sequentially
+ * b     receives the alignment
+ *
+ * Returns the non-negative result of bam_read1() for an alignment that
+ * overlaps the region, -1 when the iteration is over, or another negative
+ * value on error (-5 when the record that ends the region fails
+ * bam_validate1(), otherwise whatever bam_read1() returned).
+ */
+int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
+{
+	int ret;
+	if (iter && iter->finished) return -1;
+	if (iter == 0 || iter->from_first) {
+		ret = bam_read1(fp, b);
+		if (ret < 0 && iter) iter->finished = 1;
+		return ret;
+	}
+	if (iter->off == 0) return -1; // the query selected no ranges
+	for (;;) {
+		// curr_off == 0 holds only before the first read; the test short-circuits
+		// there, so off[iter->i] is not evaluated while i is still -1
+		if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
+			if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
+			if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
+			if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
+				bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
+				iter->curr_off = bam_tell(fp);
+			}
+			++iter->i;
+		}
+		if ((ret = bam_read1(fp, b)) >= 0) {
+			iter->curr_off = bam_tell(fp);
+			if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
+				ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
+				break;
+			}
+			else if (is_overlap(iter->beg, iter->end, b)) return ret;
+			// otherwise the record lies before the region: keep reading
+		} else break; // end of file or error
+	}
+	iter->finished = 1;
+	return ret;
+}
+
+/*
+ * Call func(b, data) for every alignment bam_iter_read() yields for the
+ * region [beg,end) of reference `tid`.
+ *
+ * Returns 0 when the iteration ends normally (code -1 from bam_iter_read()),
+ * otherwise the negative code that stopped it.
+ */
+int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
+{
+	int status;
+	bam_iter_t it;
+	bam1_t *rec;
+	rec = bam_init1();
+	it = bam_iter_query(idx, tid, beg, end);
+	for (;;) {
+		status = bam_iter_read(fp, it, rec);
+		if (status < 0) break;
+		func(rec, data);
+	}
+	bam_iter_destroy(it);
+	bam_destroy1(rec);
+	if (status == -1) return 0;
+	return status;
+}
diff --git a/bam_lpileup.c b/bam_lpileup.c
new file mode 100644
index 0000000..d4dd63b
--- /dev/null
+++ b/bam_lpileup.c
@@ -0,0 +1,198 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "bam.h"
+#include "ksort.h"
+
+#define TV_GAP 2
+
+// Node of the singly linked list of released display levels kept by
+// tview_func(). The list always ends with a placeholder node whose next is 0.
+typedef struct __freenode_t {
+	// level: the released level; cnt: countdown, set to TV_GAP by mp_free() and
+	// decremented by tview_func() on each call; a level is reused once it is 0
+	uint32_t level:28, cnt:4;
+	struct __freenode_t *next;
+} freenode_t, *freenode_p;
+
+#define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level))
+KSORT_INIT(node, freenode_p, freenode_lt)
+
+/* Memory pool, similar to the one in bam_pileup.c */
+typedef struct {
+	// cnt: nodes currently handed out by mp_alloc() and not yet returned;
+	// n: recycled nodes held in buf; max: allocated capacity of buf
+	int cnt, n, max;
+	freenode_t **buf; // stack of recycled nodes
+} mempool_t;
+
+/* Create an empty, zero-initialised pool. */
+static mempool_t *mp_init()
+{
+	mempool_t *pool = (mempool_t*)calloc(1, sizeof(mempool_t));
+	return pool;
+}
+/* Free the pool and every recycled node it still holds. */
+static void mp_destroy(mempool_t *mp)
+{
+	int k = mp->n;
+	while (k-- > 0) free(mp->buf[k]);
+	free(mp->buf);
+	free(mp);
+}
+/*
+ * Hand out a node: the most recently recycled one if any is available,
+ * otherwise a freshly zero-allocated one.
+ */
+static inline freenode_t *mp_alloc(mempool_t *mp)
+{
+	freenode_t *node;
+	++mp->cnt;
+	if (mp->n != 0) node = mp->buf[--mp->n];
+	else node = (freenode_t*)calloc(1, sizeof(freenode_t));
+	return node;
+}
+/*
+ * Return node `p` to the pool for later reuse. The node is unlinked and its
+ * countdown is reset to TV_GAP before it is pushed on the recycle stack.
+ */
+static inline void mp_free(mempool_t *mp, freenode_t *p)
+{
+	--mp->cnt;
+	p->next = 0;
+	p->cnt = TV_GAP;
+	if (mp->n == mp->max) { // stack full: start with 256 slots, then double
+		if (mp->max) mp->max <<= 1;
+		else mp->max = 256;
+		mp->buf = (freenode_t**)realloc(mp->buf, sizeof(freenode_t*) * mp->max);
+	}
+	mp->buf[mp->n] = p;
+	++mp->n;
+}
+
+/* core part */
+// Pileup buffer that additionally assigns a display level to every read
+// before passing each column on to the user callback (see tview_func()).
+struct __bam_lplbuf_t {
+	// max: capacity of cur_level/pre_level; n_cur: reads in the current
+	// column; n_pre: reads carried over from the previous column
+	int max, n_cur, n_pre;
+	// max_level: highest level in use; cur_level/pre_level: levels of the
+	// reads in the current column / surviving from the previous one
+	int max_level, *cur_level, *pre_level;
+	mempool_t *mp; // recycler for freenode_t nodes
+	// aux: scratch array used to sort the list; head/tail: list of released levels
+	freenode_t **aux, *head, *tail;
+	int n_nodes, m_aux; // n_nodes: released levels in the list; m_aux: capacity of aux
+	bam_pileup_f func; // user callback
+	void *user_data; // passed through to func
+	bam_plbuf_t *plbuf; // underlying pileup buffer that drives tview_func()
+};
+
+/*
+ * Reset the buffer to its initial state: reset the underlying pileup buffer,
+ * recycle every released-level node except the tail placeholder and clear
+ * all level bookkeeping.
+ */
+void bam_lplbuf_reset(bam_lplbuf_t *buf)
+{
+	freenode_t *node, *following;
+	bam_plbuf_reset(buf->plbuf);
+	node = buf->head;
+	while (node->next) { // stops at the placeholder, which is kept
+		following = node->next;
+		mp_free(buf->mp, node);
+		node = following;
+	}
+	buf->head = buf->tail;
+	buf->n_nodes = 0;
+	buf->n_pre = 0;
+	buf->n_cur = 0;
+	buf->max_level = 0;
+}
+
+/*
+ * Pileup callback installed by bam_lplbuf_init(): assigns a display level to
+ * each of the `n` reads in `pl`, stores it in the reads' `level` field, then
+ * forwards the column to the user callback tv->func.
+ *
+ * A read that starts in this column takes the level at the head of the
+ * released-level list if that level's countdown has reached 0, otherwise a
+ * new level above max_level. A continuing read keeps the level it had in the
+ * previous column. A read that ends here appends its level to the list.
+ *
+ * data is the bam_lplbuf_t the callback belongs to. Always returns 0.
+ */
+static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
+{
+	bam_lplbuf_t *tv = (bam_lplbuf_t*)data;
+	freenode_t *p;
+	int i, l, max_level;
+	// allocate memory if necessary
+	if (tv->max < n) { // enlarge
+		tv->max = n;
+		kroundup32(tv->max);
+		tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max);
+		tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max);
+	}
+	tv->n_cur = n;
+	// update cnt
+	for (p = tv->head; p->next; p = p->next)
+		if (p->cnt > 0) --p->cnt;
+	// calculate cur_level[]
+	max_level = 0;
+	for (i = l = 0; i < n; ++i) {
+		const bam_pileup1_t *p = pl + i;
+		if (p->is_head) {
+			if (tv->head->next && tv->head->cnt == 0) { // then take a free slot
+				freenode_t *p = tv->head->next;
+				tv->cur_level[i] = tv->head->level;
+				mp_free(tv->mp, tv->head);
+				tv->head = p;
+				--tv->n_nodes;
+			} else tv->cur_level[i] = ++tv->max_level;
+		} else {
+			// l counts the continuing reads; they appear in the same order as in pre_level[]
+			tv->cur_level[i] = tv->pre_level[l++];
+			if (p->is_tail) { // then return a free slot
+				// the old placeholder receives the level and a new placeholder is appended
+				tv->tail->level = tv->cur_level[i];
+				tv->tail->next = mp_alloc(tv->mp);
+				tv->tail = tv->tail->next;
+				++tv->n_nodes;
+			}
+		}
+		if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i];
+		((bam_pileup1_t*)p)->level = tv->cur_level[i]; // the cast drops const to store the level
+	}
+	assert(l == tv->n_pre);
+	tv->func(tid, pos, n, pl, tv->user_data);
+	// sort the linked list
+	if (tv->n_nodes) {
+		freenode_t *q;
+		if (tv->n_nodes + 1 > tv->m_aux) { // enlarge
+			tv->m_aux = tv->n_nodes + 1;
+			kroundup32(tv->m_aux);
+			tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux);
+		}
+		for (p = tv->head, i = l = 0; p->next;) {
+			if (p->level > max_level) { // then discard this entry
+				q = p->next;
+				mp_free(tv->mp, p);
+				p = q;
+			} else {
+				tv->aux[i++] = p;
+				p = p->next;
+			}
+		}
+		tv->aux[i] = tv->tail; // add a proper tail for the loop below
+		tv->n_nodes = i;
+		if (tv->n_nodes) {
+			// order by countdown, then by level (see freenode_lt), and relink
+			ks_introsort(node, tv->n_nodes, tv->aux);
+			for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1];
+			tv->head = tv->aux[0];
+		} else tv->head = tv->tail;
+	}
+	// clean up
+	tv->max_level = max_level;
+	memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4);
+	// squeeze out terminated levels
+	for (i = l = 0; i < n; ++i) {
+		const bam_pileup1_t *p = pl + i;
+		if (!p->is_tail)
+			tv->pre_level[l++] = tv->pre_level[i];
+	}
+	tv->n_pre = l;
+/*
+	fprintf(stderr, "%d\t", pos+1);
+	for (i = 0; i < n; ++i) {
+		const bam_pileup1_t *p = pl + i;
+		if (p->is_head) fprintf(stderr, "^");
+		if (p->is_tail) fprintf(stderr, "$");
+		fprintf(stderr, "%d,", p->level);
+	}
+	fprintf(stderr, "\n");
+*/
+	return 0;
+}
+
+/*
+ * Create a level-assigning pileup buffer.
+ *
+ * func  callback invoked for every pileup column once levels are assigned
+ * data  opaque pointer passed through to func
+ *
+ * Returns the new buffer; release it with bam_lplbuf_destroy().
+ */
+bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data)
+{
+	bam_lplbuf_t *lp = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t));
+	lp->mp = mp_init();
+	lp->tail = mp_alloc(lp->mp); // placeholder node that terminates the list
+	lp->head = lp->tail;
+	lp->func = func;
+	lp->user_data = data;
+	lp->plbuf = bam_plbuf_init(tview_func, lp);
+	return lp;
+}
+
+/*
+ * Free a buffer created by bam_lplbuf_init(), including the underlying
+ * pileup buffer, the level arrays and every list node. Asserts that all
+ * nodes handed out by the pool have been returned.
+ */
+void bam_lplbuf_destroy(bam_lplbuf_t *tv)
+{
+	freenode_t *node, *following;
+	free(tv->cur_level);
+	free(tv->pre_level);
+	bam_plbuf_destroy(tv->plbuf);
+	free(tv->aux);
+	node = tv->head;
+	while (node->next) {
+		following = node->next;
+		mp_free(tv->mp, node);
+		node = following;
+	}
+	mp_free(tv->mp, node); // the tail placeholder
+	assert(tv->mp->cnt == 0);
+	mp_destroy(tv->mp);
+	free(tv);
+}
+
+/* Feed alignment `b` to the underlying pileup buffer and return its result. */
+int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv)
+{
+	int status = bam_plbuf_push(b, tv->plbuf);
+	return status;
+}
diff --git a/bam_md.c b/bam_md.c
new file mode 100644
index 0000000..ce40a12
--- /dev/null
+++ b/bam_md.c
@@ -0,0 +1,389 @@
+#include <unistd.h>
+#include <assert.h>
+#include <string.h>
+#include <ctype.h>
+#include <math.h>
+#include "faidx.h"
+#include "sam.h"
+#include "kstring.h"
+#include "kaln.h"
+#include "kprobaln.h"
+
+#define USE_EQUAL 1 // bam_fillmd -e: zero the 4-bit code of read bases that match the reference
+#define DROP_TAG  2 // bam_fillmd -d: drop aux tags via bam_aux_drop_other(), keeping RG
+#define BIN_QUAL  4 // bam_fillmd -q: base qualities >= 3 become q/10*10 + 7
+#define UPDATE_NM 8 // add or correct the NM tag
+#define UPDATE_MD 16 // add or correct the MD tag
+#define HASH_QNM  32 // bam_fillmd -h sets this bit; nothing in this file reads it
+
+char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; // 4-bit base code -> 0..3 for codes 1, 2, 4, 8; 4 for every other code
+
+int bam_aux_drop_other(bam1_t *b, uint8_t *s); // defined outside this file; used for DROP_TAG
+
+void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm) // recompute NM/MD of b against the NUL-terminated reference ref and apply the edits selected by flag (USE_EQUAL, DROP_TAG, BIN_QUAL, UPDATE_NM, UPDATE_MD); max_nm > 0 enables the masking pass below
+{
+	uint8_t *seq = bam1_seq(b);
+	uint32_t *cigar = bam1_cigar(b);
+	bam1_core_t *c = &b->core;
+	int i, x, y, u = 0; // x: reference coordinate, y: query coordinate, u: length of the current run of matches
+	kstring_t *str; // MD string under construction
+	int32_t old_nm_i = -1, nm = 0;
+
+	str = (kstring_t*)calloc(1, sizeof(kstring_t));
+	for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { // pass 1: walk the CIGAR, building MD and counting NM
+		int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+		if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+			for (j = 0; j < l; ++j) {
+				int z = y + j;
+				int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
+				if (ref[x+j] == 0) break; // out of boundary
+				if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
+					if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f; // clear this base's nibble (code 0)
+					++u;
+				} else {
+					kputw(u, str); kputc(ref[x+j], str); // flush the match run, then record the reference base
+					u = 0; ++nm;
+				}
+			}
+			if (j < l) break; // ran off the end of the reference: stop processing the CIGAR
+			x += l; y += l;
+		} else if (op == BAM_CDEL) {
+			kputw(u, str); kputc('^', str); // deletions appear in MD as ^ followed by the deleted reference bases
+			for (j = 0; j < l; ++j) {
+				if (ref[x+j] == 0) break;
+				kputc(ref[x+j], str);
+			}
+			u = 0;
+			if (j < l) break;
+			x += l; nm += l;
+		} else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+			y += l;
+			if (op == BAM_CINS) nm += l; // inserted bases count towards NM, soft clips do not
+		} else if (op == BAM_CREF_SKIP) {
+			x += l;
+		}
+	}
+	kputw(u, str); // trailing match run
+	// apply max_nm
+	if (max_nm > 0 && nm >= max_nm) { // pass 2: matching bases get code 15 and quality 0
+		for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
+			int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+			if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+				for (j = 0; j < l; ++j) {
+					int z = y + j;
+					int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
+					if (ref[x+j] == 0) break; // out of boundary
+					if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
+						seq[z/2] |= (z&1)? 0x0f : 0xf0;
+						bam1_qual(b)[z] = 0;
+					}
+				}
+				if (j < l) break;
+				x += l; y += l;
+			} else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
+			else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+		}
+	}
+	// update NM
+	if (flag & UPDATE_NM) {
+		uint8_t *old_nm = bam_aux_get(b, "NM");
+		if (c->flag & BAM_FUNMAP) { free(str->s); free(str); return; } // unmapped: leave the record alone, but release the MD buffer (previously leaked)
+		if (old_nm) old_nm_i = bam_aux2i(old_nm);
+		if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+		else if (nm != old_nm_i) { // an existing tag is only replaced when its value is wrong
+			fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm);
+			bam_aux_del(b, old_nm);
+			bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+		}
+	}
+	// update MD
+	if (flag & UPDATE_MD) {
+		uint8_t *old_md = bam_aux_get(b, "MD");
+		if (c->flag & BAM_FUNMAP) { free(str->s); free(str); return; } // same early exit as above, without the leak
+		if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+		else {
+			int is_diff = 0;
+			if (strlen((char*)old_md+1) == str->l) { // old_md[0] is skipped; the text starts at old_md+1
+				for (i = 0; i < str->l; ++i)
+					if (toupper(old_md[i+1]) != toupper(str->s[i])) // case-insensitive comparison
+						break;
+				if (i < str->l) is_diff = 1;
+			} else is_diff = 1;
+			if (is_diff) {
+				fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s);
+				bam_aux_del(b, old_md);
+				bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+			}
+		}
+	}
+	// drop all tags but RG
+	if (flag&DROP_TAG) {
+		uint8_t *q = bam_aux_get(b, "RG");
+		bam_aux_drop_other(b, q);
+	}
+	// reduce the resolution of base quality
+	if (flag&BIN_QUAL) {
+		uint8_t *qual = bam1_qual(b);
+		for (i = 0; i < b->core.l_qseq; ++i)
+			if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
+	}
+	free(str->s); free(str);
+}
+
+void bam_fillmd1(bam1_t *b, char *ref, int flag)
+{ // bam_fillmd1_core() with max_nm == 0, which leaves its masking pass inactive
+	const int no_nm_cap = 0; bam_fillmd1_core(b, ref, flag, no_nm_cap);
+}
+
+int bam_cap_mapQ(bam1_t *b, char *ref, int thres) // derive a mapping-quality cap for b from its mismatches against ref and its clipped bases; returns -1 when the penalty exceeds thres
+{
+	uint8_t *seq = bam1_seq(b), *qual = bam1_qual(b);
+	uint32_t *cigar = bam1_cigar(b);
+	bam1_core_t *c = &b->core;
+	int i, x, y, mm, q, len, clip_l, clip_q; // mm: counted mismatches; q: sum of their capped qualities; clip_q: quality sum over clipped bases; clip_l is accumulated but never read
+	double t;
+	if (thres < 0) thres = 40; // set the default
+	mm = q = len = clip_l = clip_q = 0;
+	for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { // x: reference coordinate, y: query coordinate
+		int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+		if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+			for (j = 0; j < l; ++j) {
+				int z = y + j;
+				int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
+				if (ref[x+j] == 0) break; // out of boundary
+				if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
+					++len;
+					if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
+						++mm;
+						q += qual[z] > 33? 33 : qual[z]; // each mismatch contributes at most 33
+					}
+				}
+			}
+			if (j < l) break; // reference ended inside this operation
+			x += l; y += l; len += l; // NOTE(review): len is also incremented per base above, so counted bases add twice - confirm this is intended
+		} else if (op == BAM_CDEL) {
+			for (j = 0; j < l; ++j)
+				if (ref[x+j] == 0) break;
+			if (j < l) break;
+			x += l;
+		} else if (op == BAM_CSOFT_CLIP) {
+			for (j = 0; j < l; ++j) clip_q += qual[y+j];
+			clip_l += l;
+			y += l;
+		} else if (op == BAM_CHARD_CLIP) {
+			clip_q += 13 * l; // hard-clipped bases are charged a fixed 13 each
+			clip_l += l;
+		} else if (op == BAM_CINS) y += l;
+		else if (op == BAM_CREF_SKIP) x += l;
+	}
+	for (i = 0, t = 1; i < mm; ++i) // t = len^mm / mm!
+		t *= (double)len / (i+1);
+	t = q - 4.343 * log(t) + clip_q / 5.; // penalty: mismatch qualities, minus 4.343*ln(t), plus a fifth of the clip quality
+	if (t > thres) return -1;
+	if (t < 0) t = 0;
+	t = sqrt((thres - t) / thres) * thres; // divides by thres: the visible caller only passes thres > 10
+//	fprintf(stderr, "%s %lf %d\n", bam1_qname(b), t, q);
+	return (int)(t + .499);
+}
+
+int bam_prob_realn_core(bam1_t *b, const char *ref, int flag) // compute per-base alignment qualities with kpa_glocal() and store them as a BQ tag, or apply them to qual[] and store a ZQ tag; returns 0 on success, -1 when skipped (unmapped, empty, reference skip), -3 when the existing tag already matches the request
+{
+	int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4; // flag bit 0: modify qual[]; bit 1: extended variant; bit 2: discard an existing BQ tag and recompute
+	uint32_t *cigar = bam1_cigar(b);
+	bam1_core_t *c = &b->core;
+	kpa_par_t conf = kpa_par_def;
+	uint8_t *bq = 0, *zq = 0, *qual = bam1_qual(b);
+	if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0) return -1; // do nothing
+	// test if BQ or ZQ is present
+	if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq; // step past the first byte bam_aux_get() points at
+	if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq;
+	if (bq && redo_baq)
+	{
+	    bam_aux_del(b, bq-1);
+	    bq = 0;
+	}
+	if (bq && zq) { // remove the ZQ tag
+		bam_aux_del(b, zq-1);
+		zq = 0;
+	}
+	if (bq || zq) { // a tag already exists: convert between the two forms instead of recomputing
+		if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing
+		if (bq && apply_baq) { // then convert BQ to ZQ
+			for (i = 0; i < c->l_qseq; ++i)
+				qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64); // tag bytes are offset by 64; clamp at 0
+			*(bq - 3) = 'Z'; // overwrite the first tag-name character: BQ -> ZQ
+		} else if (zq && !apply_baq) { // then convert ZQ to BQ
+			for (i = 0; i < c->l_qseq; ++i)
+				qual[i] += (int)zq[i] - 64;
+			*(zq - 3) = 'B'; // ZQ -> BQ
+		}
+		return 0;
+	}
+	// find the start and end of the alignment	
+	x = c->pos, y = 0, yb = ye = xb = xe = -1; // (xb, xe) / (yb, ye): reference / query span of the aligned operations
+	for (k = 0; k < c->n_cigar; ++k) {
+		int op, l;
+		op = cigar[k]&0xf; l = cigar[k]>>4;
+		if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+			if (yb < 0) yb = y;
+			if (xb < 0) xb = x;
+			ye = y + l; xe = x + l;
+			x += l; y += l;
+		} else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
+		else if (op == BAM_CDEL) x += l;
+		else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip
+	}
+	// set bandwidth and the start and the end
+	bw = 7;
+	if (abs((xe - xb) - (ye - yb)) > bw) // widen the band when reference and query spans differ by more than 7
+		bw = abs((xe - xb) - (ye - yb)) + 3;
+	conf.bw = bw;
+	xb -= yb + bw/2; if (xb < 0) xb = 0; // extend the reference window left over clipped query bases plus half a band
+	xe += c->l_qseq - ye + bw/2; // and right likewise
+	if (xe - xb - c->l_qseq > bw) // trim the window symmetrically if it exceeds the read length by more than bw
+		xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
+	{ // glocal
+		uint8_t *s, *r, *q, *seq = bam1_seq(b), *bq; // this bq shadows the outer one, which is 0 here
+		int *state;
+		bq = calloc(c->l_qseq + 1, 1); // one extra zero byte terminates the 'Z' tag value
+		memcpy(bq, qual, c->l_qseq);
+		s = calloc(c->l_qseq, 1);
+		for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam1_seqi(seq, i)]; // query as 0..4 codes
+		r = calloc(xe - xb, 1);
+		for (i = xb; i < xe; ++i) {
+			if (ref[i] == 0) { xe = i; break; } // reference ends inside the window: shrink it
+			r[i-xb] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[i]]];
+		}
+		state = calloc(c->l_qseq, sizeof(int));
+		q = calloc(c->l_qseq, 1);
+		kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q); // fills state[] and q[] per query base
+		if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
+			for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
+				int op = cigar[k]&0xf, l = cigar[k]>>4;
+				if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+					for (i = y; i < y + l; ++i) {
+						if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0; // low two bits non-zero, or placed at a different window offset than the CIGAR implies
+						else bq[i] = bq[i] < q[i]? bq[i] : q[i];
+					}
+					x += l; y += l;
+				} else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
+				else if (op == BAM_CDEL) x += l;
+			}
+			for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ
+		} else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!)
+			uint8_t *left, *rght; // running maxima of bq[] from the left and right end of each aligned block
+			left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1);
+			for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
+				int op = cigar[k]&0xf, l = cigar[k]>>4;
+				if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+					for (i = y; i < y + l; ++i)
+						bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
+					for (left[y] = bq[y], i = y + 1; i < y + l; ++i)
+						left[i] = bq[i] > left[i-1]? bq[i] : left[i-1];
+					for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i)
+						rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1];
+					for (i = y; i < y + l; ++i)
+						bq[i] = left[i] < rght[i]? left[i] : rght[i]; // the smaller of the two running maxima
+					x += l; y += l;
+				} else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
+				else if (op == BAM_CDEL) x += l;
+			}
+			for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ
+			free(left); free(rght);
+		}
+		if (apply_baq) {
+			for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual
+			bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq);
+		} else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
+		free(bq); free(s); free(r); free(q); free(state);
+	}
+	return 0;
+}
+
+int bam_prob_realn(bam1_t *b, const char *ref)
+{ // flag value 1 sets only bit 0, i.e. apply_baq in bam_prob_realn_core()
+	const int apply_only = 1; return bam_prob_realn_core(b, ref, apply_only);
+}
+
+int bam_fillmd(int argc, char *argv[]) // "fillmd" subcommand: read alignments, optionally realign/cap them, refresh NM/MD against a FASTA reference and write to stdout; returns 0 on success, 1 on usage or open errors
+{
+	int c, flt_flag, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag; // tid = -2: no reference sequence fetched yet
+	samfile_t *fp, *fpout = 0;
+	faidx_t *fai;
+	char *ref = 0, mode_w[8], mode_r[8];
+	bam1_t *b;
+
+	flt_flag = UPDATE_NM | UPDATE_MD;
+	is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
+	strcpy(mode_r, "r"); strcpy(mode_w, "w");
+	while ((c = getopt(argc, argv, "EqreuNhbSC:n:Ad")) >= 0) {
+		switch (c) {
+		case 'r': is_realn = 1; break;
+		case 'e': flt_flag |= USE_EQUAL; break;
+		case 'd': flt_flag |= DROP_TAG; break;
+		case 'q': flt_flag |= BIN_QUAL; break;
+		case 'h': flt_flag |= HASH_QNM; break;
+		case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break;
+		case 'b': is_bam_out = 1; break;
+		case 'u': is_uncompressed = is_bam_out = 1; break;
+		case 'S': is_sam_in = 1; break;
+		case 'n': max_nm = atoi(optarg); break;
+		case 'C': capQ = atoi(optarg); break;
+		case 'A': baq_flag |= 1; break;
+		case 'E': baq_flag |= 2; break;
+		default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1;
+		}
+	}
+	if (!is_sam_in) strcat(mode_r, "b");
+	strcat(mode_w, is_bam_out? "b" : "h"); // BAM output, or text output with "h"
+	if (is_uncompressed) strcat(mode_w, "u");
+	if (optind + 1 >= argc) { // both <aln.bam> and <ref.fasta> are required
+		fprintf(stderr, "\n");
+		fprintf(stderr, "Usage:   samtools fillmd [-eubrS] <aln.bam> <ref.fasta>\n\n");
+		fprintf(stderr, "Options: -e       change identical bases to '='\n");
+		fprintf(stderr, "         -u       uncompressed BAM output (for piping)\n");
+		fprintf(stderr, "         -b       compressed BAM output\n");
+		fprintf(stderr, "         -S       the input is SAM with header\n");
+		fprintf(stderr, "         -A       modify the quality string\n");
+		fprintf(stderr, "         -r       compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n");
+		fprintf(stderr, "         -E       extended BAQ for better sensitivity but lower specificity\n\n");
+		return 1;
+	}
+	fp = samopen(argv[optind], mode_r, 0);
+	if (fp == 0) return 1;
+	if (is_sam_in && (fp->header == 0 || fp->header->n_targets == 0)) {
+		fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
+		return 1;
+	}
+	fpout = samopen("-", mode_w, fp->header);
+	fai = fai_load(argv[optind+1]);
+	if (fpout == 0 || fai == 0) { // both handles are used unconditionally below; stop here instead of crashing
+		fprintf(stderr, "[bam_fillmd] fail to open the output or to load the reference index. Abort!\n");
+		return 1;
+	}
+	b = bam_init1();
+	while ((ret = samread(fp, b)) >= 0) {
+		if (b->core.tid >= 0) { // records without a reference id are written through untouched
+			if (tid != b->core.tid) { // moved to another reference sequence: swap the cached one
+				free(ref);
+				ref = fai_fetch(fai, fp->header->target_name[b->core.tid], &len);
+				tid = b->core.tid;
+				if (ref == 0)
+					fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
+							fp->header->target_name[tid]);
+			}
+			if (is_realn && ref) bam_prob_realn_core(b, ref, baq_flag); // ref is 0 after a failed fetch; the callee indexes it unconditionally
+			if (capQ > 10 && ref) { // likewise bam_cap_mapQ() reads ref[]
+				int q = bam_cap_mapQ(b, ref, capQ);
+				if (b->core.qual > q) b->core.qual = q;
+			}
+			if (ref) bam_fillmd1_core(b, ref, flt_flag, max_nm);
+		}
+		samwrite(fpout, b);
+	}
+	bam_destroy1(b);
+	free(ref);
+	fai_destroy(fai);
+	samclose(fp); samclose(fpout);
+	return 0;
+}
diff --git a/bam_pileup.c b/bam_pileup.c
new file mode 100644
index 0000000..57434e0
--- /dev/null
+++ b/bam_pileup.c
@@ -0,0 +1,437 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <assert.h>
+#include "sam.h"
+
+typedef struct {
+	int k, x, y, end; // k: index of the current CIGAR operation (-1 = never processed); x, y: reference / query coordinate where operation k starts; end: compared with the column position to set is_tail
+} cstate_t;
+
+static cstate_t g_cstate_null = { -1, 0, 0, 0 }; // initial state assigned to a node in bam_plp_push()
+
+typedef struct __linkbuf_t {
+	bam1_t b; // the node's own copy of the alignment (filled by bam_copy1() in bam_plp_push())
+	uint32_t beg, end; // beg = core.pos, end = bam_calend() result, both set in bam_plp_push()
+	cstate_t s; // CIGAR traversal state used by resolve_cigar2()
+	struct __linkbuf_t *next; // singly linked list; 0 on the terminal node
+} lbnode_t;
+
+/* --- BEGIN: Memory pool */
+
+typedef struct {
+	int cnt, n, max; // cnt: nodes currently handed out; n: nodes parked in buf; max: capacity of buf
+	lbnode_t **buf; // stack of recycled nodes, grown in mp_free()
+} mempool_t;
+
+static mempool_t *mp_init()
+{
+	// an empty pool: the whole structure starts zero-filled
+	mempool_t *pool = (mempool_t*)calloc(1, sizeof(mempool_t));
+	return pool;
+}
+static void mp_destroy(mempool_t *mp)
+{
+	int i = 0; // only nodes currently parked in the pool are released here
+	while (i < mp->n) {
+		lbnode_t *node = mp->buf[i++];
+		free(node->b.data); // the record's data buffer
+		free(node);
+	}
+	free(mp->buf); free(mp);
+}
+static inline lbnode_t *mp_alloc(mempool_t *mp)
+{
+	++mp->cnt; // one more node handed out
+	if (mp->n != 0) return mp->buf[--mp->n]; // reuse the most recently parked node
+	return (lbnode_t*)calloc(1, sizeof(lbnode_t)); // nothing parked: fresh zero-filled node
+}
+static inline void mp_free(mempool_t *mp, lbnode_t *p)
+{ // park p in the pool for later reuse; the node itself is not released
+	p->next = 0; --mp->cnt; // clear lbnode_t::next here
+	if (mp->n == mp->max) { // recycle array is full, or was never allocated
+		if (mp->max == 0) mp->max = 256; else mp->max <<= 1;
+		mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
+	}
+	mp->buf[mp->n] = p; ++mp->n;
+}
+
+/* --- END: Memory pool */
+
+/* --- BEGIN: Auxiliary functions */
+
+/* s->k: the index of the CIGAR operator that has just been processed.
+   s->x: the reference coordinate of the start of s->k
+   s->y: the query coordinate of the start of s->k
+ */
+static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s) // advance the CIGAR state s of read p->b to reference position pos and fill p's qpos, indel, is_del, is_refskip, is_head and is_tail; always returns 1
+{
+#define _cop(c) ((c)&BAM_CIGAR_MASK)
+#define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
+
+	bam1_t *b = p->b;
+	bam1_core_t *c = &b->core;
+	uint32_t *cigar = bam1_cigar(b);
+	int k, is_head = 0; // is_head is assigned below but never read; p->is_head is derived from pos instead
+	// determine the current CIGAR operation
+//	fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam1_qname(b), pos, s->end, s->k, s->x, s->y);
+	if (s->k == -1) { // never processed
+		is_head = 1;
+		if (c->n_cigar == 1) { // just one operation, save a loop
+		  if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0; // NOTE(review): any other single operation leaves s->k at -1, which is then used as an index below
+		} else { // find the first match or deletion
+			for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
+				int op = _cop(cigar[k]);
+				int l = _cln(cigar[k]);
+				if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
+				else if (op == BAM_CREF_SKIP) s->x += l;
+				else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
+			}
+			assert(k < c->n_cigar);
+			s->k = k;
+		}
+	} else { // the read has been processed before
+		int op, l = _cln(cigar[s->k]);
+		if (pos - s->x >= l) { // jump to the next operation
+			assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
+			op = _cop(cigar[s->k+1]); // NOTE(review): reads index s->k+1 while the assert above only bounds s->k
+			if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
+			  if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; // only M/=/X consume query bases here
+				s->x += l;
+				++s->k;
+			} else { // find the next M/D/N/=/X
+			  if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
+				s->x += l;
+				for (k = s->k + 1; k < c->n_cigar; ++k) {
+					op = _cop(cigar[k]), l = _cln(cigar[k]);
+					if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
+					else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; // skipped insertions and soft clips advance the query coordinate
+				}
+				s->k = k;
+			}
+			assert(s->k < c->n_cigar); // otherwise a bug
+		} // else, do nothing
+	}
+	{ // collect pileup information
+		int op, l;
+		op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
+		p->is_del = p->indel = p->is_refskip = 0;
+		if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
+			int op2 = _cop(cigar[s->k+1]);
+			int l2 = _cln(cigar[s->k+1]);
+			if (op2 == BAM_CDEL) p->indel = -(int)l2; // a deletion is reported as a negative length
+			else if (op2 == BAM_CINS) p->indel = l2;
+			else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
+				int l3 = 0; // total insertion length found after the pad
+				for (k = s->k + 2; k < c->n_cigar; ++k) {
+					op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
+					if (op2 == BAM_CINS) l3 += l2;
+					else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
+				}
+				if (l3 > 0) p->indel = l3;
+			}
+		}
+		if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+			p->qpos = s->y + (pos - s->x); // offset into the operation, carried over to the query
+		} else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
+			p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
+			p->is_refskip = (op == BAM_CREF_SKIP);
+		} // cannot be other operations; otherwise a bug
+		p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
+	}
+	return 1;
+}
+
+/* --- END: Auxiliary functions */
+
+/*******************
+ * pileup iterator *
+ *******************/
+
+struct __bam_plp_t {
+	mempool_t *mp; // node allocator
+	lbnode_t *head, *tail, *dummy; // head..tail: buffered reads, tail being the node the next push fills; dummy: scratch predecessor used while unlinking in bam_plp_next()
+	int32_t tid, pos, max_tid, max_pos; // tid/pos: column examined next by bam_plp_next(); max_tid/max_pos: tid and start of the last read accepted by bam_plp_push()
+	int is_eof, flag_mask, max_plp, error, maxcnt; // is_eof: set by pushing a null record; flag_mask: reads with any of these flag bits are skipped; max_plp: capacity of plp; error: set on unsorted input; maxcnt: limit checked in bam_plp_push()
+	bam_pileup1_t *plp; // output array, grown by doubling in bam_plp_next()
+	// for the "auto" interface only
+	bam1_t *b; // scratch record handed to func
+	bam_plp_auto_f func; // reader callback; a negative return ends the input in bam_plp_auto()
+	void *data; // opaque first argument for func
+};
+
+bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
+{
+	bam_plp_t plp_iter = calloc(1, sizeof(struct __bam_plp_t));
+	// defaults: nothing pushed yet, default flag filter, maxcnt of 8000
+	plp_iter->max_tid = plp_iter->max_pos = -1;
+	plp_iter->flag_mask = BAM_DEF_MASK;
+	plp_iter->maxcnt = 8000;
+	plp_iter->mp = mp_init();
+	plp_iter->tail = mp_alloc(plp_iter->mp); // empty list: head and tail share one node
+	plp_iter->head = plp_iter->tail;
+	plp_iter->dummy = mp_alloc(plp_iter->mp);
+	if (func == 0) return plp_iter; // push interface only: no reader, no scratch record
+	plp_iter->func = func;
+	plp_iter->data = data;
+	plp_iter->b = bam_init1();
+	return plp_iter;
+}
+
+void bam_plp_destroy(bam_plp_t iter)
+{
+	mempool_t *pool = iter->mp;
+	mp_free(pool, iter->dummy);
+	mp_free(pool, iter->head);
+	if (pool->cnt != 0) // some nodes are still counted as handed out
+		fprintf(stderr, "[bam_plp_destroy] memory leak: %d. Continue anyway.\n", pool->cnt);
+	mp_destroy(pool);
+	if (iter->b) bam_destroy1(iter->b);
+	free(iter->plp); free(iter);
+}
+
+const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) // return the next non-empty pileup column that can be completed from the buffered reads (its tid, position and depth through the out-parameters), or 0 if none; *_n_plp is -1 on error
+{
+	if (iter->error) { *_n_plp = -1; return 0; }
+	*_n_plp = 0;
+	if (iter->is_eof && iter->head->next == 0) return 0; // input finished and nothing buffered
+	while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) { // a column is examined only once the last pushed read starts beyond it, or at end of input
+		int n_plp = 0;
+		lbnode_t *p, *q; // p: current node, q: its predecessor
+		// write iter->plp at iter->pos
+		iter->dummy->next = iter->head; // dummy acts as predecessor of head so the head can be unlinked like any other node
+		for (p = iter->head, q = iter->dummy; p->next; q = p, p = p->next) {
+			if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
+				q->next = p->next; mp_free(iter->mp, p); p = q; // stepping back to q makes the loop increment land on the node after the removed one
+			} else if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
+				if (n_plp == iter->max_plp) { // then double the capacity
+					iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
+					iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
+				}
+				iter->plp[n_plp].b = &p->b;
+				if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
+			}
+		}
+		iter->head = iter->dummy->next; // dummy->next may be changed
+		*_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
+		// update iter->tid and iter->pos
+		if (iter->head->next) {
+			if (iter->tid > iter->head->b.core.tid) {
+				fprintf(stderr, "[%s] unsorted input. Pileup aborts.\n", __func__);
+				iter->error = 1;
+				*_n_plp = -1;
+				return 0;
+			}
+		}
+		if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
+			iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
+		} else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
+			iter->pos = iter->head->beg; // jump to the next position
+		} else ++iter->pos; // scan contiguously
+		// return
+		if (n_plp) return iter->plp; // the array is reused by the next call
+		if (iter->is_eof && iter->head->next == 0) break;
+	}
+	return 0;
+}
+
+int bam_plp_push(bam_plp_t iter, const bam1_t *b) // buffer one alignment, or mark end of input when b is 0; returns 0, or -1 once the input has been found unsorted
+{
+	if (iter->error) return -1;
+	if (b) {
+		if (b->core.tid < 0) return 0; // reads without a reference id are ignored
+		if (b->core.flag & iter->flag_mask) return 0; // filtered by flag
+		if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) return 0; // too many nodes in use and the read starts at the current column: drop it
+		bam_copy1(&iter->tail->b, b); // the tail node is the slot waiting to be filled
+		iter->tail->beg = b->core.pos; iter->tail->end = bam_calend(&b->core, bam1_cigar(b));
+		iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
+		if (b->core.tid < iter->max_tid) {
+			fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
+			iter->error = 1;
+			return -1;
+		}
+		if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
+			fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
+			iter->error = 1;
+			return -1;
+		}
+		iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
+		if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) { // the read can still contribute: link a new empty tail; otherwise this slot is simply overwritten by the next push
+			iter->tail->next = mp_alloc(iter->mp);
+			iter->tail = iter->tail->next;
+		}
+	} else iter->is_eof = 1;
+	return 0;
+}
+
+const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
+{
+	const bam_pileup1_t *col;
+	if (iter->func == 0 || iter->error) { // no reader installed, or an earlier failure
+		*_n_plp = -1;
+		return 0;
+	}
+	col = bam_plp_next(iter, _tid, _pos, _n_plp); // a column may already be available
+	if (col) return col;
+	// no pileup line can be obtained; read alignments
+	*_n_plp = 0;
+	if (iter->is_eof) return 0;
+	for (;;) { // pull alignments until a column completes or the reader stops
+		if (iter->func(iter->data, iter->b) < 0) break;
+		if (bam_plp_push(iter, iter->b) < 0) { *_n_plp = -1; return 0; }
+		col = bam_plp_next(iter, _tid, _pos, _n_plp);
+		if (col) return col;
+	}
+	bam_plp_push(iter, 0); // a null record marks the end of input
+	return bam_plp_next(iter, _tid, _pos, _n_plp);
+}
+
+void bam_plp_reset(bam_plp_t iter)
+{
+	lbnode_t *node = iter->head, *succ;
+	// recycle every buffered read; the terminal node is kept as the new head
+	while ((succ = node->next) != 0) {
+		mp_free(iter->mp, node);
+		node = succ;
+	}
+	iter->head = iter->tail;
+	iter->tid = iter->pos = 0;
+	iter->max_tid = iter->max_pos = -1;
+	iter->is_eof = 0;
+}
+
+void bam_plp_set_mask(bam_plp_t iter, int mask)
+{ // a negative mask selects the default; any other mask always gets BAM_FUNMAP added
+	if (mask < 0) iter->flag_mask = BAM_DEF_MASK; else iter->flag_mask = BAM_FUNMAP | mask;
+}
+
+void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
+{ // the stored limit is compared with the pool's in-use count in bam_plp_push()
+	struct __bam_plp_t *state = iter; state->maxcnt = maxcnt;
+}
+
+/*****************
+ * callback APIs *
+ *****************/
+
+int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)
+{
+	bam1_t *rec = bam_init1();
+	bam_plbuf_t *plbuf = bam_plbuf_init(func, func_data);
+	bam_plbuf_set_mask(plbuf, mask);
+	// stream every record of fp through the buffer; func is invoked per completed column
+	for (;;) {
+		if (bam_read1(fp, rec) < 0) break;
+		bam_plbuf_push(rec, plbuf);
+	}
+	bam_plbuf_push(0, plbuf); // null record: flush what is still buffered
+	bam_plbuf_destroy(plbuf);
+	bam_destroy1(rec);
+	return 0;
+}
+
+void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask)
+{ // the flag filter is kept by the wrapped iterator
+	bam_plp_t inner = buf->iter; bam_plp_set_mask(inner, mask);
+}
+
+void bam_plbuf_reset(bam_plbuf_t *buf)
+{ // resetting the buffer means resetting the wrapped iterator
+	bam_plp_t inner = buf->iter; bam_plp_reset(inner);
+}
+
+bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)
+{
+	bam_plbuf_t *pb = calloc(1, sizeof(bam_plbuf_t));
+	pb->func = func; // invoked for every column produced in bam_plbuf_push()
+	pb->data = data;
+	// the wrapped iterator gets no reader callback: records arrive through bam_plbuf_push()
+	pb->iter = bam_plp_init(0, 0);
+	return pb;
+}
+
+void bam_plbuf_destroy(bam_plbuf_t *buf)
+{
+	bam_plp_t inner = buf->iter; // tear down the wrapped iterator before the wrapper
+	bam_plp_destroy(inner); free(buf);
+}
+
+int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf)
+{
+	int tid, pos, n_plp, status = bam_plp_push(buf->iter, b);
+	const bam_pileup1_t *col;
+	if (status < 0) return status; // the iterator rejected the record
+	for (;;) { // hand every column that is now complete to the callback
+		if ((col = bam_plp_next(buf->iter, &tid, &pos, &n_plp)) == 0) return 0;
+		buf->func(tid, pos, n_plp, col, buf->data);
+	}
+}
+
+/***********
+ * mpileup *
+ ***********/
+
+struct __bam_mplp_t {
+	int n; // number of inputs
+	uint64_t min, *pos; // pos[i]: tid<<32 | pos of input i's pending column; min: smallest such value among inputs with a column, (uint64_t)-1 when there is none
+	bam_plp_t *iter; // one single-input iterator per input
+	int *n_plp; // depth reported for each input's pending column
+	const bam_pileup1_t **plp; // pending column of each input
+};
+
+bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
+{
+	int k = 0;
+	bam_mplp_t mi = calloc(1, sizeof(struct __bam_mplp_t));
+	mi->n = n;
+	mi->min = (uint64_t)-1; // all-ones: also the starting value of every pos[] slot
+	mi->pos = calloc(n, 8);
+	mi->n_plp = calloc(n, sizeof(int));
+	mi->plp = calloc(n, sizeof(void*));
+	mi->iter = calloc(n, sizeof(void*));
+	while (k < n) { // one single-input iterator per input, all sharing func
+		mi->pos[k] = mi->min;
+		mi->iter[k] = bam_plp_init(func, data[k]);
+		++k;
+	}
+	return mi;
+}
+
+void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
+{
+	int k = iter->n;
+	// the same limit is stored in every underlying iterator
+	while (k-- > 0) iter->iter[k]->maxcnt = maxcnt;
+}
+
+void bam_mplp_destroy(bam_mplp_t iter)
+{
+	int k = 0;
+	while (k < iter->n) bam_plp_destroy(iter->iter[k++]); // per-input iterators first
+	free(iter->plp); free(iter->n_plp); free(iter->pos); free(iter->iter);
+	free(iter);
+}
+
+int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) // advance all inputs to the next column position; fills n_plp[i]/plp[i] for each input and returns the number of inputs reported at *_tid/*_pos, or 0 when every input is exhausted
+{
+	int i, ret = 0;
+	uint64_t new_min = (uint64_t)-1;
+	for (i = 0; i < iter->n; ++i) {
+		if (iter->pos[i] == iter->min) { // this input's column was consumed last time (or nothing was fetched yet): fetch the next one
+			int tid, pos;
+			iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
+			iter->pos[i] = iter->plp[i]? (uint64_t)tid<<32 | pos : 0; // tid/pos may be left unset when no column is returned: never read them in that case
+		}
+		if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i]; // only inputs holding a column compete for the minimum
+	}
+	iter->min = new_min;
+	if (new_min == (uint64_t)-1) return 0; // no input has a column left
+	*_tid = new_min>>32; *_pos = (uint32_t)new_min; // unpack tid (high 32 bits) and position (low 32 bits)
+	for (i = 0; i < iter->n; ++i) {
+		if (iter->pos[i] == iter->min) { // pos[i] is always initialised now (see the fetch above)
+			n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
+			++ret;
+		} else n_plp[i] = 0, plp[i] = 0; // this input has nothing at the reported position
+	}
+	return ret;
+}
diff --git a/bam_reheader.c b/bam_reheader.c
new file mode 100644
index 0000000..6619428
--- /dev/null
+++ b/bam_reheader.c
@@ -0,0 +1,62 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "knetfile.h"
+#include "bgzf.h"
+#include "bam.h"
+
+#define BUF_SIZE 0x10000
+
+int bam_reheader(BGZF *in, const bam_header_t *h, int fd) // write header h to descriptor fd followed by everything in `in` after its own header; returns 0 on success, -1 if `in` is opened for writing or memory runs out
+{
+	BGZF *fp; bam_header_t *old;
+	int len; uint8_t *buf;
+	if (in->is_write) return -1;
+	buf = malloc(BUF_SIZE);
+	if (buf == 0) return -1; // nothing has been read or written yet
+	old = bam_header_read(in); // read only to move the input past its existing header
+	bam_header_destroy(old); // the old header is not needed afterwards (it used to be leaked)
+	fp = bgzf_fdopen(fd, "w");
+	bam_header_write(fp, h);
+	if (in->block_offset < in->block_length) { // the input block holding the end of the old header still has unread data: re-emit it
+		bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
+		bgzf_flush(fp);
+	}
+#ifdef _USE_KNETFILE
+	while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0) // copy the remaining bytes of the underlying input as they are
+		fwrite(buf, 1, len, fp->fp);
+#else
+	while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0)
+		fwrite(buf, 1, len, fp->file);
+#endif
+	free(buf);
+	fp->block_offset = in->block_offset = 0; // zero both offsets before closing
+	bgzf_close(fp);
+	return 0;
+}
+
+int main_reheader(int argc, char *argv[])
+{
+	bam_header_t *h;
+	BGZF *in;
+	tamFile fph;
+	if (argc != 3) {
+		fprintf(stderr, "Usage: samtools reheader <in.header.sam> <in.bam>\n");
+		return 1;
+	}
+	fph = sam_open(argv[1]); // read the header
+	if (fph == 0) {
+		fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]);
+		return 1;
+	}
+	h = sam_header_read(fph);
+	sam_close(fph);
+	if (strcmp(argv[2], "-") == 0) in = bam_dopen(fileno(stdin), "r"); // "-" selects standard input
+	else in = bam_open(argv[2], "r");
+	if (in == 0) {
+		fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[2]);
+		return 1;
+	}
+	bam_reheader(in, h, fileno(stdout));
+	bgzf_close(in);
+	return 0;
+}
diff --git a/bam_sort.c b/bam_sort.c
new file mode 100644
index 0000000..7d00cd1
--- /dev/null
+++ b/bam_sort.c
@@ -0,0 +1,566 @@
+#include <stdlib.h>
+#include <ctype.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "bam.h"
+#include "ksort.h"
+
+static int g_is_by_qname = 0;
+
+static int strnum_cmp(const char *_a, const char *_b) // compare two strings, ordering embedded digit runs by numeric value; returns <0, 0 or >0
+{
+	const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b;
+	const unsigned char *pa = a, *pb = b;
+	while (*pa && *pb) {
+		if (isdigit(*pa) && isdigit(*pb)) { // both cursors are inside a digit run
+			while (*pa == '0') ++pa; // leading zeros do not change the value
+			while (*pb == '0') ++pb;
+			while (isdigit(*pa) && isdigit(*pb) && *pa == *pb) ++pa, ++pb; // skip the common digit prefix
+			if (isdigit(*pa) && isdigit(*pb)) { // digits differ here: the run with more remaining digits is the larger number
+				int i = 0;
+				while (isdigit(pa[i]) && isdigit(pb[i])) ++i;
+				return isdigit(pa[i])? 1 : isdigit(pb[i])? -1 : (int)*pa - (int)*pb; // equal length: the differing digit decides
+			} else if (isdigit(*pa)) return 1; // only a still has digits, so its number is larger
+			else if (isdigit(*pb)) return -1;
+			else if (pa - a != pb - b) return pa - a < pb - b? 1 : -1; // both runs ended but the offsets differ: the string that consumed fewer characters compares greater
+		} else {
+			if (*pa != *pb) return (int)*pa - (int)*pb; // plain byte comparison outside digit runs
+			++pa; ++pb;
+		}
+	}
+	return *pa? 1 : *pb? -1 : 0; // one string ran out: the one with characters left compares greater
+}
+
+#define HEAP_EMPTY 0xffffffffffffffffull // pos value marking a heap entry whose input has no more records
+
+typedef struct {
+	int i; // index of the input file this entry reads from
+	uint64_t pos, idx; // pos: packed tid/position/strand sort key; idx: running counter assigned as records are read
+	bam1_t *b; // current record of that input, or 0 once it has been released
+} heap1_t;
+
+#define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx)))) // true when a is greater than b on (pos, i, idx)
+
+static inline int heap_lt(const heap1_t a, const heap1_t b) // ordering predicate handed to KSORT_INIT(heap, ...): nonzero when a ranks after b
+{
+	int cmp;
+	if (!g_is_by_qname) return __pos_cmp(a, b); // coordinate mode: compare on (pos, i, idx)
+	if (a.b == 0) return 1; // an entry without a record always ranks after one that has a record
+	if (b.b == 0) return 0;
+	cmp = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b));
+	return (cmp > 0 || (cmp == 0 && (a.b->core.flag&0xc0) > (b.b->core.flag&0xc0))); // equal names: flag bits 0xc0 break the tie
+}
+
+KSORT_INIT(heap, heap1_t, heap_lt)
+
+static void swap_header_targets(bam_header_t *h1, bam_header_t *h2) // exchange n_targets, target_name and target_len between two headers
+{
+	bam_header_t saved = *h1; // shallow snapshot; only the three target fields are read back from it
+	h1->n_targets = h2->n_targets; h1->target_name = h2->target_name; h1->target_len = h2->target_len;
+	h2->n_targets = saved.n_targets; h2->target_name = saved.target_name;
+	h2->target_len = saved.target_len;
+}
+
+static void swap_header_text(bam_header_t *h1, bam_header_t *h2) // exchange the text pointer and its length between two headers
+{
+	char *text1 = h1->text;
+	int len1 = h1->l_text;
+	h1->text = h2->text; h1->l_text = h2->l_text;
+	h2->text = text1; h2->l_text = len1;
+}
+
+#define MERGE_RG     1
+#define MERGE_UNCOMP 2
+#define MERGE_LEVEL1 4
+#define MERGE_FORCE  8
+
+/*!
+  @abstract    Merge multiple sorted BAM.
+  @param  by_qname whether to sort by query name
+  @param  out  output BAM file name ("-" for stdout)
+  @param  headers  name of SAM file from which to copy '@' header lines,
+                   or NULL to copy them from the first file to be merged
+  @param  n    number of files to be merged
+  @param  fn   names of files to be merged
+  @param  flag,reg,n_threads,level  MERGE_* bits; optional region string; BGZF threads; compression level (<0: default)
+  @return 0 on success, -1 on error
+  @discussion Padding information may NOT be correctly maintained. This function is NOT thread safe.
+ */
+int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads, int level)
+{
+	bamFile fpout, *fp;
+	heap1_t *heap;
+	bam_header_t *hout = 0;
+	bam_header_t *hheaders = NULL;
+	int i, j, *RG_len = 0;
+	uint64_t idx = 0;
+	char **RG = 0, mode[8];
+	bam_iter_t *iter = 0;
+
+	if (headers) {
+		tamFile fpheaders = sam_open(headers);
+		if (fpheaders == 0) {
+			const char *message = strerror(errno);
+			fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
+			return -1;
+		}
+		hheaders = sam_header_read(fpheaders);
+		sam_close(fpheaders);
+	}
+
+	g_is_by_qname = by_qname; // file-level switch read by heap_lt(); the reason this function is not thread safe
+	fp = (bamFile*)calloc(n, sizeof(bamFile));
+	heap = (heap1_t*)calloc(n, sizeof(heap1_t));
+	iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t));
+	// prepare RG tag
+	if (flag & MERGE_RG) {
+		RG = (char**)calloc(n, sizeof(void*));
+		RG_len = (int*)calloc(n, sizeof(int));
+		for (i = 0; i != n; ++i) { // RG value = file name without directory and without a trailing ".bam"
+			int l = strlen(fn[i]);
+			const char *s = fn[i];
+			if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
+			for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
+			++j; l -= j;
+			RG[i] = calloc(l + 1, 1);
+			RG_len[i] = l;
+			strncpy(RG[i], s + j, l);
+		}
+	}
+	// read the first
+	for (i = 0; i != n; ++i) {
+		bam_header_t *hin;
+		fp[i] = bam_open(fn[i], "r");
+		if (fp[i] == 0) {
+			int j;
+			fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
+			for (j = 0; j < i; ++j) bam_close(fp[j]);
+			free(fp); free(heap); free(iter);
+			// FIXME: possible memory leak (RG, RG_len, hout and hheaders are still not released here)
+			return -1;
+		}
+		hin = bam_header_read(fp[i]);
+		if (i == 0) { // the first BAM
+			hout = hin;
+		} else { // validate multiple BAMs
+			int min_n_targets = hout->n_targets;
+			if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets;
+
+			for (j = 0; j < min_n_targets; ++j)
+				if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) {
+					fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n",
+							hout->target_name[j], hin->target_name[j], fn[i]);
+					return -1;
+				}
+
+			// If this input file has additional target reference sequences,
+			// add them to the headers to be output
+			if (hin->n_targets > hout->n_targets) {
+				swap_header_targets(hout, hin);
+				// FIXME Possibly we should also create @SQ text headers
+				// for the newly added reference sequences
+			}
+
+			bam_header_destroy(hin);
+		}
+	}
+
+	if (hheaders) {
+		// If the text headers to be swapped in include any @SQ headers,
+		// check that they are consistent with the existing binary list
+		// of reference information.
+		if (hheaders->n_targets > 0) {
+			if (hout->n_targets != hheaders->n_targets) {
+				fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers);
+				if (!reg) return -1;
+			}
+			for (j = 0; j < hout->n_targets && j < hheaders->n_targets; ++j) // counts may differ when reg is set: stay inside both arrays
+				if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) {
+					fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers);
+					if (!reg) return -1;
+				}
+		}
+
+		swap_header_text(hout, hheaders);
+		bam_header_destroy(hheaders);
+	}
+
+	if (reg) {
+		int tid, beg, end;
+		if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) {
+			fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__);
+			return -1;
+		}
+		for (i = 0; i < n; ++i) {
+			bam_index_t *idx = bam_index_load(fn[i]);
+			if (idx == 0) { fprintf(stderr, "[%s] fail to load the index of '%s'\n", __func__, fn[i]); return -1; } // a region query needs an index for every input
+			iter[i] = bam_iter_query(idx, tid, beg, end);
+			bam_index_destroy(idx);
+		}
+	}
+
+	for (i = 0; i < n; ++i) { // seed the heap with the first record of every input
+		heap1_t *h = heap + i;
+		h->i = i;
+		h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
+		if (bam_iter_read(fp[i], iter[i], h->b) >= 0) {
+			h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b);
+			h->idx = idx++;
+		}
+		else { h->pos = HEAP_EMPTY; free(h->b->data); free(h->b); h->b = 0; } // no record: b must be 0 so that heap_lt() never reads the name of an empty record
+	}
+	if (flag & MERGE_UNCOMP) level = 0;
+	else if (flag & MERGE_LEVEL1) level = 1;
+	strcpy(mode, "w");
+	if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
+	if ((fpout = strcmp(out, "-")? bam_open(out, mode) : bam_dopen(fileno(stdout), mode)) == 0) { // pass `mode`: a literal "w" dropped the requested compression level
+		fprintf(stderr, "[%s] fail to create the output file.\n", __func__);
+		return -1;
+	}
+	bam_header_write(fpout, hout);
+	bam_header_destroy(hout);
+	if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256);
+
+	ks_heapmake(heap, n, heap);
+	while (heap->pos != HEAP_EMPTY) { // heap[0] is the next record to write; stop when every input is exhausted
+		bam1_t *b = heap->b;
+		if (flag & MERGE_RG) { // replace any existing RG tag with the one derived from the file name
+			uint8_t *rg = bam_aux_get(b, "RG");
+			if (rg) bam_aux_del(b, rg);
+			bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
+		}
+		bam_write1_core(fpout, &b->core, b->data_len, b->data);
+		if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) {
+			heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b);
+			heap->idx = idx++;
+		} else { // -1 is a clean end of input; any other negative value is reported as truncation
+			if (j != -1) fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
+			heap->pos = HEAP_EMPTY; // retire this input either way; a truncated file used to stay in the heap with a stale key
+			free(heap->b->data); free(heap->b); heap->b = 0;
+		}
+		ks_heapadjust(heap, 0, n, heap);
+	}
+
+	if (flag & MERGE_RG) {
+		for (i = 0; i != n; ++i) free(RG[i]);
+		free(RG); free(RG_len);
+	}
+	for (i = 0; i != n; ++i) {
+		bam_iter_destroy(iter[i]);
+		bam_close(fp[i]);
+	}
+	bam_close(fpout);
+	free(fp); free(heap); free(iter);
+	return 0;
+}
+
+int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg) // shorter-signature entry point for bam_merge_core2()
+{
+	return bam_merge_core2(by_qname, out, headers, n, fn, flag, reg, 0, -1); // n_threads = 0, level = -1
+}
+
+int bam_merge(int argc, char *argv[]) // command-line front end of `samtools merge`; returns the exit status (0 ok, 1 on usage or merge error)
+{
+	int c, is_by_qname = 0, flag = 0, ret = 0, n_threads = 0, level = -1;
+	char *fn_headers = NULL, *reg = 0;
+
+	while ((c = getopt(argc, argv, "h:nru1R:f@:l:")) >= 0) {
+		switch (c) {
+		case 'r': flag |= MERGE_RG; break;
+		case 'f': flag |= MERGE_FORCE; break;
+		case 'h': fn_headers = strdup(optarg); break;
+		case 'n': is_by_qname = 1; break;
+		case '1': flag |= MERGE_LEVEL1; break;
+		case 'u': flag |= MERGE_UNCOMP; break;
+		case 'R': reg = strdup(optarg); break;
+		case 'l': level = atoi(optarg); break;
+		case '@': n_threads = atoi(optarg); break;
+		}
+	}
+	if (optind + 2 >= argc) { // need <out.bam> plus at least two inputs
+		fprintf(stderr, "\n");
+		fprintf(stderr, "Usage:   samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]\n\n");
+		fprintf(stderr, "Options: -n       sort by read names\n");
+		fprintf(stderr, "         -r       attach RG tag (inferred from file names)\n");
+		fprintf(stderr, "         -u       uncompressed BAM output\n");
+		fprintf(stderr, "         -f       overwrite the output BAM if exist\n");
+		fprintf(stderr, "         -1       compress level 1\n");
+		fprintf(stderr, "         -l INT   compression level, from 0 to 9 [-1]\n");
+		fprintf(stderr, "         -@ INT   number of BAM compression threads [0]\n");
+		fprintf(stderr, "         -R STR   merge file in the specified region STR [all]\n");
+		fprintf(stderr, "         -h FILE  copy the header in FILE to <out.bam> [in1.bam]\n\n");
+		fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n");
+		fprintf(stderr, "      must provide the correct header with -h, or uses Picard which properly maintains\n");
+		fprintf(stderr, "      the header dictionary in merging.\n\n");
+		return 1;
+	}
+	if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) { // without -f, refuse to overwrite an existing output file
+		FILE *fp = fopen(argv[optind], "rb");
+		if (fp != NULL) {
+			fclose(fp);
+			fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]);
+			return 1;
+		}
+	}
+	if (bam_merge_core2(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg, n_threads, level) < 0) ret = 1; // argv[optind] is the output, the rest are inputs
+	free(reg);
+	free(fn_headers);
+	return ret;
+}
+
+/***************
+ * BAM sorting *
+ ***************/
+
+#include <pthread.h>
+
+typedef bam1_t *bam1_p;
+
+static int change_SO(bam_header_t *h, const char *so) // set the SO: field of the @HD line in h->text to `so`; returns 0, or -1 if the @HD line has no newline
+{
+	char *p, *q, *beg = 0, *end = 0, *newtext;
+	if (h->l_text > 3) {
+		if (strncmp(h->text, "@HD", 3) == 0) {
+			if ((p = strchr(h->text, '\n')) == 0) return -1;
+			*p = '\0'; // temporarily end the string at the @HD line so strstr() cannot match in later lines
+			if ((q = strstr(h->text, "\tSO:")) != 0) {
+				*p = '\n'; // change back
+				if (strncmp(q + 4, so, p - q - 4) != 0) { // NOTE(review): length runs to end of line, so a value that is a proper prefix of `so` looks equal -- confirm
+					beg = q;
+					for (q += 4; *q != '\n' && *q != '\t'; ++q);
+					end = q;
+				} else return 0; // no need to change
+			} else beg = end = p, *p = '\n'; // no SO field yet: insert one just before the newline
+		}
+	}
+	if (beg == 0) { // no @HD
+		h->l_text += strlen(so) + 15; // 15 = strlen("@HD\tVN:1.3\tSO:") + 1 for the trailing '\n'
+		newtext = malloc(h->l_text + 1);
+		sprintf(newtext, "@HD\tVN:1.3\tSO:%s\n", so);
+		strcat(newtext, h->text);
+	} else { // has @HD but different or no SO
+		h->l_text = (beg - h->text) + (4 + strlen(so)) + (h->text + h->l_text - end); // text before beg + "\tSO:<so>" + text from end onwards
+		newtext = malloc(h->l_text + 1);
+		strncpy(newtext, h->text, beg - h->text);
+		sprintf(newtext + (beg - h->text), "\tSO:%s", so);
+		strcat(newtext, end);
+	}
+	free(h->text);
+	h->text = newtext;
+	return 0;
+}
+
+static inline int bam1_lt(const bam1_p a, const bam1_p b) // ordering predicate for KSORT_INIT(sort, ...): nonzero when a sorts before b
+{
+	int cmp;
+	if (!g_is_by_qname) return (((uint64_t)a->core.tid<<32|(a->core.pos+1)<<1|bam1_strand(a)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1)<<1|bam1_strand(b))); // coordinate mode: packed tid/pos/strand key
+	cmp = strnum_cmp(bam1_qname(a), bam1_qname(b));
+	return (cmp < 0 || (cmp == 0 && (a->core.flag&0xc0) < (b->core.flag&0xc0))); // equal names: flag bits 0xc0 break the tie
+}
+KSORT_INIT(sort, bam1_p, bam1_lt)
+
+typedef struct { // one unit of work for worker(): a slice of records to sort and write out
+	size_t buf_len; // number of records in buf
+	const char *prefix; // prefix of the file name the slice is written to
+	bam1_p *buf; // first record pointer of the slice
+	const bam_header_t *h; // header written at the top of the file
+	int index; // number used in the "<prefix>.<index>.bam" file name
+} worker_t;
+
+static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_header_t *h, int n_threads) // write header h and l records from buf to fn ("-" = stdout)
+{
+	size_t i;
+	bamFile fp;
+	fp = strcmp(fn, "-")? bam_open(fn, mode) : bam_dopen(fileno(stdout), mode);
+	if (fp == 0) { fprintf(stderr, "[%s] fail to create file %s.\n", __func__, fn); return; } // report the failure instead of silently dropping the output
+	bam_header_write(fp, h);
+	if (n_threads > 1) bgzf_mt(fp, n_threads, 256);
+	for (i = 0; i < l; ++i)
+		bam_write1_core(fp, &buf[i]->core, buf[i]->data_len, buf[i]->data);
+	bam_close(fp);
+}
+
+static void *worker(void *data) // thread entry point: sort one slice and write it to "<prefix>.<index>.bam"; always returns 0
+{
+	worker_t *job = (worker_t*)data;
+	char *tmp_fn;
+	tmp_fn = (char*)calloc(strlen(job->prefix) + 20, 1); // 20 extra bytes hold ".<index>.bam" and the terminator
+	sprintf(tmp_fn, "%s.%.4d.bam", job->prefix, job->index);
+	ks_mergesort(sort, job->buf_len, job->buf, 0);
+	write_buffer(tmp_fn, "w1", job->buf_len, job->buf, job->h, 0);
+	free(tmp_fn);
+	return 0;
+}
+
+static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, const bam_header_t *h, int n_threads) // sort k records into up to n_threads temporary files numbered from n_files; returns the new file count
+{
+	int i;
+	size_t rest;
+	bam1_p *b;
+	pthread_t *tid;
+	pthread_attr_t attr;
+	worker_t *w;
+
+	if (n_threads < 1) n_threads = 1;
+	if (k < n_threads * 64) n_threads = 1; // use a single thread if we only sort a small batch of records
+	pthread_attr_init(&attr);
+	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+	w = calloc(n_threads, sizeof(worker_t));
+	tid = calloc(n_threads, sizeof(pthread_t));
+	b = buf; rest = k;
+	for (i = 0; i < n_threads; ++i) {
+		w[i].buf_len = rest / (n_threads - i); // split what is left evenly over the remaining workers
+		w[i].buf = b;
+		w[i].prefix = prefix;
+		w[i].h = h;
+		w[i].index = n_files + i;
+		b += w[i].buf_len; rest -= w[i].buf_len;
+		if (pthread_create(&tid[i], &attr, worker, &w[i]) != 0) { worker(&w[i]); w[i].index = -1; } // no thread available: sort this slice inline and mark it as not joinable
+	}
+	for (i = 0; i < n_threads; ++i) if (w[i].index >= 0) pthread_join(tid[i], 0); // only join threads that were actually started
+	free(tid); free(w); pthread_attr_destroy(&attr);
+	return n_files + n_threads;
+}
+
+/*!
+  @abstract Sort an unsorted BAM file based on the chromosome order
+  and the leftmost position of an alignment
+
+  @param  is_by_qname whether to sort by query name
+  @param  fn       name of the file to be sorted
+  @param  prefix   prefix of the output and the temporary files; upon
+	                   success, prefix.bam will be written.
+  @param  max_mem  approximate maximum memory (very inaccurate)
+
+  @discussion It may create multiple temporary subalignment files
+  and then merge them by calling bam_merge_core2(). This function is
+  NOT thread safe.
+ */
+void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t _max_mem, int is_stdout, int n_threads, int level)
+{
+	int ret, i, n_files = 0;
+	size_t mem, max_k, k, max_mem;
+	bam_header_t *header;
+	bamFile fp;
+	bam1_t *b, **buf;
+	char *fnout = 0;
+
+	if (n_threads < 2) n_threads = 1;
+	g_is_by_qname = is_by_qname; // file-level switch read by bam1_lt()
+	max_k = k = 0; mem = 0;
+	max_mem = _max_mem * n_threads; // _max_mem is scaled by the thread count to get the total budget
+	buf = 0;
+	fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
+	if (fp == 0) {
+		fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn);
+		return;
+	}
+	header = bam_header_read(fp);
+	if (is_by_qname) change_SO(header, "queryname");
+	else change_SO(header, "coordinate");
+	// write sub files
+	for (;;) {
+		if (k == max_k) { // record pointer array is full: double it (0x10000 entries initially) and zero the new part
+			size_t old_max = max_k;
+			max_k = max_k? max_k<<1 : 0x10000;
+			buf = realloc(buf, max_k * sizeof(void*));
+			memset(buf + old_max, 0, sizeof(void*) * (max_k - old_max));
+		}
+		if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); // record slots are kept and reused after a flush
+		b = buf[k];
+		if ((ret = bam_read1(fp, b)) < 0) break;
+		if (b->data_len < b->m_data>>2) { // shrink
+			b->m_data = b->data_len;
+			kroundup32(b->m_data);
+			b->data = realloc(b->data, b->m_data);
+		}
+		mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays
+		++k;
+		if (mem >= max_mem) { // budget reached: sort what we have into temporary files and start over
+			n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
+			mem = k = 0;
+		}
+	}
+	if (ret != -1) // any read result other than -1 is reported as truncation
+		fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n");
+	// output file name
+	fnout = calloc(strlen(prefix) + 20, 1);
+	if (is_stdout) sprintf(fnout, "-");
+	else sprintf(fnout, "%s.bam", prefix);
+	// write the final output
+	if (n_files == 0) { // a single block
+		char mode[8];
+		strcpy(mode, "w");
+		if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
+		ks_mergesort(sort, k, buf, 0);
+		write_buffer(fnout, mode, k, buf, header, n_threads);
+	} else { // then merge
+		char **fns;
+		n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); // flush the records still held in memory
+		fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files);
+		fns = (char**)calloc(n_files, sizeof(char*));
+		for (i = 0; i < n_files; ++i) { // rebuild the temporary file names with the same format worker() uses
+			fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
+			sprintf(fns[i], "%s.%.4d.bam", prefix, i);
+		}
+		bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, n_threads, level);
+		for (i = 0; i < n_files; ++i) { // temporary files are removed whatever the merge returned
+			unlink(fns[i]);
+			free(fns[i]);
+		}
+		free(fns);
+	}
+	free(fnout);
+	// free
+	for (k = 0; k < max_k; ++k) {
+		if (!buf[k]) continue;
+		free(buf[k]->data);
+		free(buf[k]);
+	}
+	free(buf);
+	bam_header_destroy(header);
+	bam_close(fp);
+}
+
+void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem) // shorter-signature entry point for bam_sort_core_ext()
+{
+	bam_sort_core_ext(is_by_qname, fn, prefix, max_mem, 0, 0, -1); // is_stdout = 0, n_threads = 0, level = -1
+}
+
+int bam_sort(int argc, char *argv[]) // command-line front end of `samtools sort`; returns 1 on usage error, 0 otherwise
+{
+	size_t max_mem = 768<<20; // 768MB, matching the [768M] shown in the usage text
+	int c, is_by_qname = 0, is_stdout = 0, n_threads = 0, level = -1;
+	while ((c = getopt(argc, argv, "nom:@:l:")) >= 0) {
+		switch (c) {
+		case 'o': is_stdout = 1; break;
+		case 'n': is_by_qname = 1; break;
+		case 'm': {
+				char *q;
+				max_mem = strtol(optarg, &q, 0);
+				if (*q == 'k' || *q == 'K') max_mem <<= 10; // q points at the first character after the number: the optional unit suffix
+				else if (*q == 'm' || *q == 'M') max_mem <<= 20;
+				else if (*q == 'g' || *q == 'G') max_mem <<= 30;
+				break;
+			}
+		case '@': n_threads = atoi(optarg); break;
+		case 'l': level = atoi(optarg); break;
+		}
+	}
+	if (optind + 2 > argc) { // both <in.bam> and <out.prefix> are required
+		fprintf(stderr, "\n");
+		fprintf(stderr, "Usage:   samtools sort [options] <in.bam> <out.prefix>\n\n");
+		fprintf(stderr, "Options: -n        sort by read name\n");
+		fprintf(stderr, "         -o        final output to stdout\n");
+		fprintf(stderr, "         -l INT    compression level, from 0 to 9 [-1]\n");
+		fprintf(stderr, "         -@ INT    number of sorting and compression threads [1]\n");
+		fprintf(stderr, "         -m INT    max memory per thread; suffix K/M/G recognized [768M]\n");
+		fprintf(stderr, "\n");
+		return 1;
+	}
+	bam_sort_core_ext(is_by_qname, argv[optind], argv[optind+1], max_mem, is_stdout, n_threads, level);
+	return 0;
+}
diff --git a/bedidx.c b/bedidx.c
new file mode 100644
index 0000000..ec75a10
--- /dev/null
+++ b/bedidx.c
@@ -0,0 +1,162 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <zlib.h>
+
+#ifdef _WIN32
+#define drand48() ((double)rand() / RAND_MAX)
+#endif
+
+#include "ksort.h"
+KSORT_INIT_GENERIC(uint64_t)
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 8192)
+
+typedef struct { // the intervals of one chromosome
+	int n, m; // n: intervals stored in a; m: capacity of a while reading, later overwritten by bed_index() with the count from bed_index_core()
+	uint64_t *a; // each interval packed as (uint64_t)beg<<32 | end
+	int *idx; // per window of 1<<LIDX_SHIFT bases: index into a of the first interval touching it, or -1
+} bed_reglist_t;
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(reg, bed_reglist_t)
+
+#define LIDX_SHIFT 13
+
+typedef kh_reg_t reghash_t;
+
+int *bed_index_core(int n, uint64_t *a, int *n_idx) // build the window index for n packed intervals; returns it (0 when n == 0) and stores the number of windows covered in *n_idx
+{
+	int i, j, m, *idx;
+	m = *n_idx = 0; idx = 0;
+	for (i = 0; i < n; ++i) {
+		int beg, end;
+		beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; // first and last window touched by interval i
+		if (m < end + 1) { // grow the index and mark the new windows as empty
+			int oldm = m;
+			m = end + 1;
+			kroundup32(m);
+			idx = realloc(idx, m * sizeof(int));
+			for (j = oldm; j < m; ++j) idx[j] = -1;
+		}
+		if (beg == end) {
+			if (idx[beg] < 0) idx[beg] = i; // keep the first interval seen for each window
+		} else {
+			for (j = beg; j <= end; ++j)
+				if (idx[j] < 0) idx[j] = i;
+		}
+		if (*n_idx < end + 1) *n_idx = end + 1; // keep the maximum: a later interval may end in an earlier window than a previous one
+	}
+	return idx;
+}
+
+void bed_index(void *_h) // sort the intervals of every chromosome in the table and rebuild its window index
+{
+	reghash_t *hash = (reghash_t*)_h;
+	khint_t slot;
+	for (slot = 0; slot != kh_end(hash); ++slot) {
+		bed_reglist_t *list;
+		if (!kh_exist(hash, slot)) continue; // bucket holds no entry
+		list = &kh_val(hash, slot);
+		free(list->idx); // drop any previous index; free(NULL) does nothing
+		ks_introsort(uint64_t, list->n, list->a);
+		list->idx = bed_index_core(list->n, list->a, &list->m); // note: the window count lands in m
+	}
+}
+
+int bed_overlap_core(const bed_reglist_t *p, int beg, int end) // 1 if [beg,end) overlaps an interval of p, else 0; p must have been indexed by bed_index()
+{
+	int i, min_off;
+	if (p->n == 0) return 0;
+	min_off = (beg>>LIDX_SHIFT >= p->m)? p->idx[p->m-1] : p->idx[beg>>LIDX_SHIFT]; // bound by p->m (index length set by bed_index()), not p->n (interval count), to stay inside idx
+	if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here
+		int n = beg>>LIDX_SHIFT;
+		if (n > p->m) n = p->m;
+		for (i = n - 1; i >= 0; --i) // walk back to the nearest window that has an interval
+			if (p->idx[i] >= 0) break;
+		min_off = i >= 0? p->idx[i] : 0;
+	}
+	for (i = min_off; i < p->n; ++i) {
+		if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed
+		if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end)
+			return 1; // find the overlap; return
+	}
+	return 0;
+}
+
+int bed_overlap(const void *_h, const char *chr, int beg, int end) // 1 if [beg,end) on chr overlaps a stored interval; 0 otherwise, including when _h is NULL or chr is unknown
+{
+	const reghash_t *hash = (const reghash_t*)_h;
+	khint_t slot;
+	if (hash == 0) return 0;
+	slot = kh_get(reg, hash, chr);
+	return slot != kh_end(hash) // is the chromosome present in the table?
+		? bed_overlap_core(&kh_val(hash, slot), beg, end) : 0;
+}
+
+void *bed_read(const char *fn) // load intervals from file fn ("-" = stdin) into a new indexed table; returns it, or 0 if the file cannot be opened
+{
+	reghash_t *h = kh_init(reg);
+	gzFile fp;
+	kstream_t *ks;
+	int dret;
+	kstring_t *str;
+	// read the list
+	fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+	if (fp == 0) { kh_destroy(reg, h); return 0; } // do not leak the empty table when the file cannot be opened
+	str = calloc(1, sizeof(kstring_t));
+	ks = ks_init(fp);
+	while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name
+		int beg = -1, end = -1;
+		bed_reglist_t *p;
+		khint_t k = kh_get(reg, h, str->s);
+		if (k == kh_end(h)) { // absent from the hash table
+			int ret;
+			char *s = strdup(str->s); // the table keeps its own copy of the name; bed_destroy() frees it
+			k = kh_put(reg, h, s, &ret);
+			memset(&kh_val(h, k), 0, sizeof(bed_reglist_t));
+		}
+		p = &kh_val(h, k);
+		if (dret != '\n') { // if the lines has other characters
+			if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit((unsigned char)str->s[0])) { // cast: isdigit() is undefined for negative char values
+				beg = atoi(str->s); // begin
+				if (dret != '\n') {
+					if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit((unsigned char)str->s[0])) {
+						end = atoi(str->s); // end
+						if (end < beg) end = -1;
+					}
+				}
+			}
+		}
+		if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
+		if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
+		if (beg >= 0 && end > beg) { // store only well-formed, non-empty intervals
+			if (p->n == p->m) {
+				p->m = p->m? p->m<<1 : 4;
+				p->a = realloc(p->a, p->m * 8);
+			}
+			p->a[p->n++] = (uint64_t)beg<<32 | end;
+		}
+	}
+	ks_destroy(ks);
+	gzclose(fp);
+	free(str->s); free(str);
+	bed_index(h);
+	return h;
+}
+
+void bed_destroy(void *_h) // free a table returned by bed_read(), including its keys, interval arrays and indices; NULL is accepted
+{
+	reghash_t *h = (reghash_t*)_h;
+	khint_t k;
+	if (h == 0) return; // bed_read() returns 0 on failure and bed_overlap() accepts 0, so do the same here
+	for (k = 0; k < kh_end(h); ++k) {
+		if (kh_exist(h, k)) {
+			free(kh_val(h, k).a); free(kh_val(h, k).idx);
+			free((char*)kh_key(h, k)); // keys were strdup()ed by bed_read()
+		}
+	}
+	kh_destroy(reg, h);
+}
diff --git a/bgzf.c b/bgzf.c
new file mode 100644
index 0000000..880d5af
--- /dev/null
+++ b/bgzf.c
@@ -0,0 +1,694 @@
+/* The MIT License
+
+   Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+                 2011 Attractive Chaos <attractor at live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include "bgzf.h"
+
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+typedef knetFile *_bgzf_file_t;
+#define _bgzf_open(fn, mode) knet_open(fn, mode)
+#define _bgzf_dopen(fp, mode) knet_dopen(fp, mode)
+#define _bgzf_close(fp) knet_close(fp)
+#define _bgzf_fileno(fp) ((fp)->fd)
+#define _bgzf_tell(fp) knet_tell(fp)
+#define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence)
+#define _bgzf_read(fp, buf, len) knet_read(fp, buf, len)
+#define _bgzf_write(fp, buf, len) knet_write(fp, buf, len)
+#else // ~defined(_USE_KNETFILE)
+#if defined(_WIN32) || defined(_MSC_VER)
+#define ftello(fp) ftell(fp)
+#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
+#else // ~defined(_WIN32)
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+#endif // ~defined(_WIN32)
+typedef FILE *_bgzf_file_t;
+#define _bgzf_open(fn, mode) fopen(fn, mode)
+#define _bgzf_dopen(fp, mode) fdopen(fp, mode)
+#define _bgzf_close(fp) fclose(fp)
+#define _bgzf_fileno(fp) fileno(fp)
+#define _bgzf_tell(fp) ftello(fp)
+#define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence)
+#define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp)
+#define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp)
+#endif // ~define(_USE_KNETFILE)
+
+#define BLOCK_HEADER_LENGTH 18
+#define BLOCK_FOOTER_LENGTH 8
+
+
+/* BGZF/GZIP header (speciallized from RFC 1952; little endian):
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ | 31|139|  8|  4|              0|  0|255|      6| 66| 67|      2|BLK_LEN|
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+*/
+static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
+
+#ifdef BGZF_CACHE
+typedef struct { // one cached block, keyed by block address in the cache hash
+	int size; // block_length of the block when it was cached
+	uint8_t *block; // private copy of the uncompressed block (BGZF_MAX_BLOCK_SIZE bytes)
+	int64_t end_offset; // file offset to seek to after a cache hit (block address + size passed to cache_block())
+} cache_t;
+#include "khash.h"
+KHASH_MAP_INIT_INT64(cache, cache_t)
+#endif
+
+static inline void packInt16(uint8_t *buffer, uint16_t value) // store value in buffer[0..1], low byte first
+{
+	buffer[0] = (uint8_t)(value & 0xff);
+	buffer[1] = (uint8_t)(value >> 8);
+}
+
+static inline int unpackInt16(const uint8_t *buffer) // read buffer[0..1] as an unsigned 16-bit value, low byte first
+{
+	return (buffer[1] << 8) | buffer[0];
+}
+
+static inline void packInt32(uint8_t *buffer, uint32_t value) // store value in buffer[0..3], low byte first
+{
+	int i;
+	for (i = 0; i < 4; ++i) {
+		buffer[i] = (uint8_t)(value >> (8 * i)); // byte i holds bits 8i .. 8i+7
+	}
+}
+
+static BGZF *bgzf_read_init() // allocate a zeroed BGZF handle for reading, with both block buffers
+{
+	BGZF *fp;
+	fp = calloc(1, sizeof(BGZF)); // NOTE(review): allocation results are not checked here
+	fp->is_write = 0;
+	fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+	fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+#ifdef BGZF_CACHE
+	fp->cache = kh_init(cache);
+#endif
+	return fp;
+}
+
+static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level
+{
+	BGZF *fp;
+	fp = calloc(1, sizeof(BGZF)); // NOTE(review): allocation results are not checked here
+	fp->is_write = 1;
+	fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+	fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+	fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
+	if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; // levels above 9 fall back to the default
+	return fp;
+}
+// Compression level encoded in a mode string: the first decimal digit, 0 if 'u' is present, otherwise -1.
+static int mode2level(const char *__restrict mode)
+{
+	const char *p = mode;
+	int level = -1; // -1: the mode string does not name a level
+	while (*p && (*p < '0' || *p > '9')) ++p; // advance to the first decimal digit, if any
+	if (*p) level = *p - '0';
+	if (strchr(mode, 'u') != 0) level = 0; // 'u' overrides any digit
+	return level;
+}
+
+BGZF *bgzf_open(const char *path, const char *mode) // open path for reading ('r'/'R' in mode) or writing ('w'/'W'); returns 0 if the file cannot be opened or mode has neither
+{
+	BGZF *fp = 0;
+	assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+	if (strchr(mode, 'r') || strchr(mode, 'R')) {
+		_bgzf_file_t fpr;
+		if ((fpr = _bgzf_open(path, "rb")) == 0) return 0; // 'b': BGZF is binary data; text mode corrupts it on _WIN32
+		fp = bgzf_read_init();
+		fp->fp = fpr;
+	} else if (strchr(mode, 'w') || strchr(mode, 'W')) {
+		FILE *fpw;
+		if ((fpw = fopen(path, "wb")) == 0) return 0; // writing always goes through stdio, even with _USE_KNETFILE
+		fp = bgzf_write_init(mode2level(mode));
+		fp->fp = fpw;
+	}
+	return fp;
+}
+
+BGZF *bgzf_dopen(int fd, const char *mode) // like bgzf_open() but wraps the already open descriptor fd
+{
+	BGZF *fp = 0;
+	assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+	if (strchr(mode, 'r') || strchr(mode, 'R')) {
+		_bgzf_file_t fpr;
+		if ((fpr = _bgzf_dopen(fd, "rb")) == 0) return 0; // 'b': BGZF is binary data; text mode corrupts it on _WIN32
+		fp = bgzf_read_init();
+		fp->fp = fpr;
+	} else if (strchr(mode, 'w') || strchr(mode, 'W')) {
+		FILE *fpw;
+		if ((fpw = fdopen(fd, "wb")) == 0) return 0; // writing always goes through stdio, even with _USE_KNETFILE
+		fp = bgzf_write_init(mode2level(mode));
+		fp->fp = fpw;
+	}
+	return fp;
+}
+
+static int bgzf_compress(void *_dst, int *dlen, void *src, int slen, int level)
+{
+	uint32_t crc;
+	z_stream zs;
+	uint8_t *dst = (uint8_t*)_dst;
+
+	// compress the body
+	zs.zalloc = NULL; zs.zfree = NULL;
+	zs.next_in  = src;
+	zs.avail_in = slen;
+	zs.next_out = dst + BLOCK_HEADER_LENGTH;
+	zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+	if (deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; // -15 to disable zlib header/footer
+	if (deflate(&zs, Z_FINISH) != Z_STREAM_END) return -1;
+	if (deflateEnd(&zs) != Z_OK) return -1;
+	*dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+	// write the header
+	memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
+	packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes
+	// write the footer
+	crc = crc32(crc32(0L, NULL, 0L), src, slen);
+	packInt32((uint8_t*)&dst[*dlen - 8], crc);
+	packInt32((uint8_t*)&dst[*dlen - 4], slen);
+	return 0;
+}
+
+// Compress block_length bytes of fp->uncompressed_block into fp->compressed_block. Returns the compressed block size, or -1 after setting BGZF_ERR_ZLIB.
+static int deflate_block(BGZF *fp, int block_length)
+{
+	int packed_len = BGZF_MAX_BLOCK_SIZE; // in: capacity of compressed_block; out: bytes actually produced
+	if (bgzf_compress(fp->compressed_block, &packed_len, fp->uncompressed_block, block_length, fp->compress_level) == 0) {
+		fp->block_offset = 0; // the uncompressed buffer has been consumed
+		return packed_len;
+	}
+	fp->errcode |= BGZF_ERR_ZLIB;
+	return -1;
+}
+
+// Inflate the block in fp->compressed_block into fp->uncompressed_block. Returns the number of bytes produced, or -1 after setting BGZF_ERR_ZLIB.
+static int inflate_block(BGZF* fp, int block_length)
+{
+	z_stream zs;
+	zs.zalloc = NULL;
+	zs.zfree = NULL; // NOTE(review): zs.opaque is left unset here -- confirm that is acceptable
+	zs.next_in = fp->compressed_block + 18; // 18 == BLOCK_HEADER_LENGTH: skip the block header
+	zs.avail_in = block_length - 16; // NOTE(review): 16, although next_in starts 18 bytes in -- confirm the over-count is intended
+	zs.next_out = fp->uncompressed_block;
+	zs.avail_out = BGZF_MAX_BLOCK_SIZE;
+
+	if (inflateInit2(&zs, -15) != Z_OK) { // -15: negative window bits, same convention as the deflateInit2() call in bgzf_compress()
+		fp->errcode |= BGZF_ERR_ZLIB;
+		return -1;
+	}
+	if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
+		inflateEnd(&zs);
+		fp->errcode |= BGZF_ERR_ZLIB;
+		return -1;
+	}
+	if (inflateEnd(&zs) != Z_OK) {
+		fp->errcode |= BGZF_ERR_ZLIB;
+		return -1;
+	}
+	return zs.total_out;
+}
+
+static int check_header(const uint8_t *header) // 1 if the 18-byte block header matches the fixed BGZF layout, else 0
+{
+	if (header[0] != 31 || header[1] != 139 || header[2] != 8) return 0; // first three bytes of g_magic
+	if ((header[3] & 4) == 0) return 0; // bit 2 of byte 3 must be set
+	if (unpackInt16((uint8_t*)&header[10]) != 6) return 0; // 16-bit field at offset 10 must be 6
+	return header[12] == 'B' && header[13] == 'C' && unpackInt16((uint8_t*)&header[14]) == 2; // 'B','C' tag followed by a 16-bit 2
+}
+
+#ifdef BGZF_CACHE
+static void free_cache(BGZF *fp) // release every cached block and the cache table; does nothing for write handles
+{
+	khint_t k;
+	khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+	if (fp->is_write) return; // only bgzf_read_init() creates a cache
+	for (k = kh_begin(h); k < kh_end(h); ++k)
+		if (kh_exist(h, k)) free(kh_val(h, k).block);
+	kh_destroy(cache, h);
+}
+
+static int load_block_from_cache(BGZF *fp, int64_t block_address) // on a cache hit, restore the block into fp and return its size; return 0 on a miss
+{
+	khint_t k;
+	cache_t *p;
+	khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+	k = kh_get(cache, h, block_address);
+	if (k == kh_end(h)) return 0; // not cached
+	p = &kh_val(h, k);
+	if (fp->block_length != 0) fp->block_offset = 0;
+	fp->block_address = block_address;
+	fp->block_length = p->size;
+	memcpy(fp->uncompressed_block, p->block, BGZF_MAX_BLOCK_SIZE);
+	_bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET); // reposition the file at the offset recorded when the block was cached
+	return p->size;
+}
+
+// Store a copy of the block currently held in fp->uncompressed_block in the
+// cache, keyed by fp->block_address. `size` is the number of bytes the block
+// occupied in the file; it is used to remember where the next block starts.
+// Caching is best-effort: if the cache is too small for a single block, or
+// memory for the copy cannot be obtained, the function returns without
+// modifying the cache.
+static void cache_block(BGZF *fp, int size)
+{
+	int ret;
+	khint_t k;
+	cache_t *p;
+	void *copy;
+	khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+	if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return;
+	// Allocate before touching the table so that an allocation failure neither
+	// evicts a useful entry nor leaves a key with a NULL block behind.
+	copy = malloc(BGZF_MAX_BLOCK_SIZE);
+	if (copy == NULL) return;
+	if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > fp->cache_size) {
+		/* A better way would be to remove the oldest block in the
+		 * cache, but here we remove the first one found for simplicity.
+		 * This should not have a big impact on performance. */
+		for (k = kh_begin(h); k < kh_end(h); ++k)
+			if (kh_exist(h, k)) break;
+		if (k < kh_end(h)) {
+			free(kh_val(h, k).block);
+			kh_del(cache, h, k);
+		}
+	}
+	k = kh_put(cache, h, fp->block_address, &ret);
+	if (ret == 0) { // key already present: if this happens, a bug!
+		free(copy);
+		return;
+	}
+	memcpy(copy, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);
+	p = &kh_val(h, k);
+	p->size = fp->block_length;
+	p->end_offset = fp->block_address + size;
+	p->block = copy;
+}
+#else
+// No-op replacements used when BGZF_CACHE is not defined: nothing is ever
+// cached, and load_block_from_cache() always reports a miss (returns 0).
+static void free_cache(BGZF *fp) {}
+static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;}
+static void cache_block(BGZF *fp, int size) {}
+#endif
+
+// Load the block at the current file position into fp->uncompressed_block,
+// either from the cache or by reading and inflating it from the file.
+// Returns 0 on success and on end of file (fp->block_length is set to 0 when
+// no bytes could be read); returns -1 on error with the reason OR-ed into
+// fp->errcode (BGZF_ERR_HEADER, BGZF_ERR_IO, or BGZF_ERR_ZLIB via
+// inflate_block()).
+int bgzf_read_block(BGZF *fp)
+{
+	uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
+	int count, size = 0, block_length, remaining;
+	int64_t block_address;
+	block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+	if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0;
+	count = _bgzf_read(fp->fp, header, sizeof(header));
+	if (count == 0) { // no data read
+		fp->block_length = 0;
+		return 0;
+	}
+	if (count != sizeof(header) || !check_header(header)) {
+		fp->errcode |= BGZF_ERR_HEADER;
+		return -1;
+	}
+	size = count;
+	block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
+	if (block_length < BLOCK_HEADER_LENGTH) {
+		// A corrupt size field would otherwise make `remaining` negative and
+		// hand a bogus length to the read call below.
+		fp->errcode |= BGZF_ERR_HEADER;
+		return -1;
+	}
+	compressed_block = (uint8_t*)fp->compressed_block;
+	memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
+	remaining = block_length - BLOCK_HEADER_LENGTH;
+	count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
+	if (count != remaining) {
+		fp->errcode |= BGZF_ERR_IO;
+		return -1;
+	}
+	size += count;
+	if ((count = inflate_block(fp, block_length)) < 0) return -1;
+	if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
+	fp->block_address = block_address;
+	fp->block_length = count;
+	cache_block(fp, size);
+	return 0;
+}
+
+// Copy up to `length` uncompressed bytes into `data`, loading further blocks
+// with bgzf_read_block() as the current one is exhausted.
+// Returns the number of bytes copied (which is smaller than `length` when no
+// more data is available), 0 when length <= 0, or -1 if loading a block fails.
+// The handle must have been opened for reading (asserted).
+ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length)
+{
+	ssize_t bytes_read = 0;
+	uint8_t *output = data;
+	if (length <= 0) return 0;
+	assert(fp->is_write == 0);
+	while (bytes_read < length) {
+		int copy_length, available = fp->block_length - fp->block_offset;
+		uint8_t *buffer;
+		if (available <= 0) {
+			// current block used up (or none loaded yet): fetch the next one
+			if (bgzf_read_block(fp) != 0) return -1;
+			available = fp->block_length - fp->block_offset;
+			if (available <= 0) break; // nothing more to read
+		}
+		copy_length = length - bytes_read < available? length - bytes_read : available;
+		buffer = fp->uncompressed_block;
+		memcpy(output, buffer + fp->block_offset, copy_length);
+		fp->block_offset += copy_length;
+		output += copy_length;
+		bytes_read += copy_length;
+	}
+	// When the block has been consumed exactly, record the file position of the
+	// next block and mark the buffer as empty.
+	if (fp->block_offset == fp->block_length) {
+		fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+		fp->block_offset = fp->block_length = 0;
+	}
+	return bytes_read;
+}
+
+/***** BEGIN: multi-threading *****/
+
+// Per-thread state for one compression worker.
+typedef struct {
+	BGZF *fp;            // handle whose compress_level the worker uses
+	struct mtaux_t *mt;  // shared multi-threading state
+	void *buf;           // scratch buffer receiving the compressed block (BGZF_MAX_BLOCK_SIZE bytes)
+	int i, errcode, toproc; // i: worker index and first queue slot handled; errcode: errors of the last round; toproc: set by the master when there is work
+} worker_t;
+
+// State shared between the master thread and all workers.
+typedef struct mtaux_t {
+	int n_threads, n_blks, curr, done; // n_blks: queue capacity; curr: number of queued blocks; done: set to make workers quit
+	volatile int proc_cnt; // number of workers that finished the current round
+	void **blk;            // queued blocks (uncompressed before a round, compressed after it)
+	int *len;              // length of each queued block
+	worker_t *w;           // n_threads worker descriptors
+	pthread_t *tid;        // thread ids; tid[0] is unused
+	pthread_mutex_t lock;  // protects toproc/done together with cv
+	pthread_cond_t cv;     // signalled when work is available or on shutdown
+} mtaux_t;
+
+// Run one round of work for worker `w`: wait until the master marks work as
+// available (w->toproc) or requests shutdown (mt->done), then compress every
+// queued block whose index is w->i, w->i + n_threads, w->i + 2*n_threads, ...
+// in place, and finally bump mt->proc_cnt to tell the master this worker is
+// done. Returns 1 when the worker should quit, 0 otherwise.
+static int worker_aux(worker_t *w)
+{
+	int i, tmp, stop = 0;
+	// wait for condition: to process or all done
+	pthread_mutex_lock(&w->mt->lock);
+	while (!w->toproc && !w->mt->done)
+		pthread_cond_wait(&w->mt->cv, &w->mt->lock);
+	if (w->mt->done) stop = 1;
+	w->toproc = 0;
+	pthread_mutex_unlock(&w->mt->lock);
+	if (stop) return 1; // to quit the thread
+	w->errcode = 0;
+	for (i = w->i; i < w->mt->curr; i += w->mt->n_threads) {
+		int clen = BGZF_MAX_BLOCK_SIZE;
+		if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->fp->compress_level) != 0)
+			w->errcode |= BGZF_ERR_ZLIB;
+		// NOTE(review): the copy below also runs when compression failed; the
+		// error is only reported through w->errcode.
+		memcpy(w->mt->blk[i], w->buf, clen);
+		w->mt->len[i] = clen;
+	}
+	// atomic increment; the previous value stored in tmp is not used
+	tmp = __sync_fetch_and_add(&w->mt->proc_cnt, 1);
+	return 0;
+}
+
+// Thread entry point: keep running compression rounds until worker_aux()
+// reports that the worker should stop. `data` is the worker's worker_t.
+static void *mt_worker(void *data)
+{
+	for (;;) {
+		if (worker_aux(data) != 0) break;
+	}
+	return 0;
+}
+
+// Enable multi-threaded compression on a handle opened for writing.
+//   n_threads   total number of compression workers, including the calling
+//               thread which acts as worker 0; must be greater than 1
+//   n_sub_blks  number of queued blocks per worker; must be positive
+// Returns 0 on success and -1 if the handle is not writable, already has
+// multi-threading enabled, or the arguments are out of range.
+int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
+{
+	int i;
+	mtaux_t *mt;
+	pthread_attr_t attr;
+	// n_sub_blks < 1 would give a queue without slots and trip the assertion
+	// in mt_queue() on the first write.
+	if (!fp->is_write || fp->mt || n_threads <= 1 || n_sub_blks < 1) return -1;
+	mt = calloc(1, sizeof(mtaux_t));
+	mt->n_threads = n_threads;
+	mt->n_blks = n_threads * n_sub_blks;
+	mt->len = calloc(mt->n_blks, sizeof(int));
+	mt->blk = calloc(mt->n_blks, sizeof(void*));
+	for (i = 0; i < mt->n_blks; ++i)
+		mt->blk[i] = malloc(BGZF_MAX_BLOCK_SIZE);
+	mt->tid = calloc(mt->n_threads, sizeof(pthread_t)); // tid[0] is not used, as the worker 0 is launched by the master
+	mt->w = calloc(mt->n_threads, sizeof(worker_t));
+	for (i = 0; i < mt->n_threads; ++i) {
+		mt->w[i].i = i;
+		mt->w[i].mt = mt;
+		mt->w[i].fp = fp;
+		mt->w[i].buf = malloc(BGZF_MAX_BLOCK_SIZE);
+	}
+	pthread_attr_init(&attr);
+	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+	pthread_mutex_init(&mt->lock, 0);
+	pthread_cond_init(&mt->cv, 0);
+	for (i = 1; i < mt->n_threads; ++i) // worker 0 is effectively launched by the master thread
+		pthread_create(&mt->tid[i], &attr, mt_worker, &mt->w[i]);
+	// the attribute object is only needed while creating threads
+	pthread_attr_destroy(&attr);
+	fp->mt = mt;
+	return 0;
+}
+
+// Shut down the worker threads and free everything allocated by bgzf_mt().
+static void mt_destroy(mtaux_t *mt)
+{
+	int i;
+	// signal all workers to quit
+	pthread_mutex_lock(&mt->lock);
+	mt->done = 1; mt->proc_cnt = 0;
+	pthread_cond_broadcast(&mt->cv);
+	pthread_mutex_unlock(&mt->lock);
+	// wait for the workers to leave worker_aux() before freeing their buffers
+	for (i = 1; i < mt->n_threads; ++i) pthread_join(mt->tid[i], 0); // worker 0 is effectively launched by the master thread
+	// free other data allocated on heap
+	for (i = 0; i < mt->n_blks; ++i) free(mt->blk[i]);
+	for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf);
+	free(mt->blk); free(mt->len); free(mt->w); free(mt->tid);
+	pthread_cond_destroy(&mt->cv);
+	pthread_mutex_destroy(&mt->lock);
+	free(mt);
+}
+
+// Append the pending uncompressed data (fp->block_offset bytes of
+// fp->uncompressed_block) to the next free queue slot and mark the write
+// buffer as empty. The caller guarantees that a free slot exists.
+static void mt_queue(BGZF *fp)
+{
+	mtaux_t *mt = (mtaux_t*)fp->mt;
+	int slot, nbytes = fp->block_offset;
+	assert(mt->curr < mt->n_blks); // guaranteed by the caller
+	slot = mt->curr++;
+	memcpy(mt->blk[slot], fp->uncompressed_block, nbytes);
+	mt->len[slot] = nbytes;
+	fp->block_offset = 0;
+}
+
+// Compress all queued blocks using every worker (the calling thread acts as
+// worker 0) and write the results to the file in queue order.
+// Returns 0 when no error is recorded in fp->errcode afterwards and -1
+// otherwise, so that compression or write failures reach the caller instead of
+// being reported as success.
+static int mt_flush(BGZF *fp)
+{
+	int i;
+	mtaux_t *mt = (mtaux_t*)fp->mt;
+	if (fp->block_offset) mt_queue(fp); // guaranteed that assertion does not fail
+	// signal all the workers to compress
+	pthread_mutex_lock(&mt->lock);
+	for (i = 0; i < mt->n_threads; ++i) mt->w[i].toproc = 1;
+	mt->proc_cnt = 0;
+	pthread_cond_broadcast(&mt->cv);
+	pthread_mutex_unlock(&mt->lock);
+	// worker 0 is doing things here
+	worker_aux(&mt->w[0]);
+	// wait for all the threads to complete (busy-wait on the volatile counter)
+	while (mt->proc_cnt < mt->n_threads);
+	// dump data to disk
+	for (i = 0; i < mt->n_threads; ++i) fp->errcode |= mt->w[i].errcode;
+	for (i = 0; i < mt->curr; ++i)
+		if (fwrite(mt->blk[i], 1, mt->len[i], fp->fp) != mt->len[i])
+			fp->errcode |= BGZF_ERR_IO;
+	mt->curr = 0;
+	return fp->errcode == 0? 0 : -1;
+}
+
+// Queue any pending data and flush only once the queue is full.
+// Returns the result of mt_flush() when a flush happened and -1 when the
+// blocks were merely queued.
+static int mt_lazy_flush(BGZF *fp)
+{
+	mtaux_t *mt = (mtaux_t*)fp->mt;
+	if (fp->block_offset) mt_queue(fp);
+	return mt->curr == mt->n_blks? mt_flush(fp) : -1;
+}
+
+// Multi-threaded counterpart of bgzf_write(): copy `length` bytes from `data`
+// into the write buffer, handing each completed BGZF_BLOCK_SIZE chunk to
+// mt_lazy_flush(). The result of mt_lazy_flush() is not inspected here.
+// Returns the number of bytes consumed.
+static ssize_t mt_write(BGZF *fp, const void *data, ssize_t length)
+{
+	const uint8_t *input = data;
+	ssize_t rest = length;
+	while (rest) {
+		// copy as much as fits into the remainder of the current block
+		int copy_length = BGZF_BLOCK_SIZE - fp->block_offset < rest? BGZF_BLOCK_SIZE - fp->block_offset : rest;
+		memcpy(fp->uncompressed_block + fp->block_offset, input, copy_length);
+		fp->block_offset += copy_length; input += copy_length; rest -= copy_length;
+		if (fp->block_offset == BGZF_BLOCK_SIZE) mt_lazy_flush(fp);
+	}
+	return length - rest;
+}
+
+/***** END: multi-threading *****/
+
+// Compress and write out whatever is pending in the write buffer.
+// Does nothing (returns 0) for handles opened for reading; delegates to
+// mt_flush() when multi-threading is enabled. Returns 0 on success, -1 on a
+// compression or write error.
+int bgzf_flush(BGZF *fp)
+{
+	if (!fp->is_write) return 0;
+	if (fp->mt) return mt_flush(fp);
+	// deflate_block() resets fp->block_offset to 0 on success, which ends the loop
+	while (fp->block_offset > 0) {
+		int block_length;
+		block_length = deflate_block(fp, fp->block_offset);
+		if (block_length < 0) return -1;
+		if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) {
+			fp->errcode |= BGZF_ERR_IO; // possibly truncated file
+			return -1;
+		}
+		fp->block_address += block_length;
+	}
+	return 0;
+}
+
+// Flush only if appending `size` more bytes would overflow the current block.
+// Returns -1 when no flush was needed; otherwise the result of
+// mt_lazy_flush() (multi-threaded handles) or bgzf_flush().
+int bgzf_flush_try(BGZF *fp, ssize_t size)
+{
+	if (fp->block_offset + size <= BGZF_BLOCK_SIZE) return -1; // still fits
+	return fp->mt? mt_lazy_flush(fp) : bgzf_flush(fp);
+}
+
+// Append `length` bytes from `data` to a handle opened for writing (asserted),
+// flushing a block to the file each time the write buffer reaches
+// BGZF_BLOCK_SIZE. Multi-threaded handles are served by mt_write().
+// Returns the number of bytes accepted; this is smaller than `length` if a
+// flush failed part-way through.
+ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length)
+{
+	const uint8_t *input = data;
+	const int block_length = BGZF_BLOCK_SIZE;
+	// ssize_t, like `length` and the return type, so that large requests are
+	// not truncated by an int counter
+	ssize_t bytes_written = 0;
+	assert(fp->is_write);
+	if (fp->mt) return mt_write(fp, data, length);
+	while (bytes_written < length) {
+		uint8_t* buffer = fp->uncompressed_block;
+		ssize_t room = block_length - fp->block_offset;
+		ssize_t left = length - bytes_written;
+		int copy_length = room < left? room : left; // bounded by block_length, fits an int
+		memcpy(buffer + fp->block_offset, input, copy_length);
+		fp->block_offset += copy_length;
+		input += copy_length;
+		bytes_written += copy_length;
+		if (fp->block_offset == block_length && bgzf_flush(fp)) break;
+	}
+	return bytes_written;
+}
+
+// Close a BGZF handle. For writers the pending data is flushed and an empty
+// block is appended before the file is closed. Returns 0 on success and -1 on
+// failure (NULL handle, flush/compression/write error, or a failing close).
+// As before, the handle is not freed when -1 is returned.
+int bgzf_close(BGZF* fp)
+{
+	int ret, block_length;
+	if (fp == 0) return -1;
+	if (fp->is_write) {
+		if (bgzf_flush(fp) != 0) return -1;
+		fp->compress_level = -1;
+		block_length = deflate_block(fp, 0); // write an empty block
+		// deflate_block() returns -1 on failure (fp->errcode already set);
+		// passing that to fwrite() would request a huge write.
+		if (block_length < 0) return -1;
+		if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != (size_t)block_length
+			|| fflush(fp->fp) != 0) {
+			fp->errcode |= BGZF_ERR_IO;
+			return -1;
+		}
+		if (fp->mt) mt_destroy(fp->mt);
+	}
+	ret = fp->is_write? fclose(fp->fp) : _bgzf_close(fp->fp);
+	if (ret != 0) return -1;
+	free(fp->uncompressed_block);
+	free(fp->compressed_block);
+	free_cache(fp);
+	free(fp);
+	return 0;
+}
+
+// Set the block cache budget (fp->cache_size) of a handle; a NULL handle is
+// ignored.
+void bgzf_set_cache_size(BGZF *fp, int cache_size)
+{
+	if (fp == 0) return;
+	fp->cache_size = cache_size;
+}
+
+// Check whether the file ends with the 28-byte marker stored in `magic`.
+// Returns 1 if the marker is present and 0 otherwise (including when the file
+// cannot be positioned 28 bytes before its end or fewer than 28 bytes could be
+// read). The file position is restored before returning.
+int bgzf_check_EOF(BGZF *fp)
+{
+	static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
+	uint8_t buf[28];
+	off_t offset;
+	int n;
+	offset = _bgzf_tell((_bgzf_file_t)fp->fp);
+	if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0;
+	n = _bgzf_read(fp->fp, buf, 28);
+	_bgzf_seek(fp->fp, offset, SEEK_SET);
+	// a short or failed read leaves buf (partly) uninitialised: do not compare it
+	if (n != 28) return 0;
+	return (memcmp(magic, buf, 28) == 0)? 1 : 0;
+}
+
+// Position a read handle at the virtual offset `pos`, whose upper bits
+// (pos >> 16) give the file offset of a block and whose low 16 bits give the
+// offset inside the uncompressed block. Only SEEK_SET on read handles is
+// supported; anything else records BGZF_ERR_MISUSE. Returns 0 on success and
+// -1 on error.
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
+{
+	int64_t file_off;
+	int within;
+
+	if (fp->is_write || where != SEEK_SET) {
+		fp->errcode |= BGZF_ERR_MISUSE;
+		return -1;
+	}
+	within = pos & 0xFFFF;
+	file_off = pos >> 16;
+	if (_bgzf_seek(fp->fp, file_off, SEEK_SET) < 0) {
+		fp->errcode |= BGZF_ERR_IO;
+		return -1;
+	}
+	fp->block_offset = within;
+	fp->block_address = file_off;
+	fp->block_length = 0;  // indicates current block has not been loaded
+	return 0;
+}
+
+// Return 1 if the first 16 bytes of file `fn` equal g_magic, 0 otherwise
+// (including when the file cannot be opened or is shorter than 16 bytes).
+int bgzf_is_bgzf(const char *fn)
+{
+	uint8_t head[16];
+	int got;
+	_bgzf_file_t in = _bgzf_open(fn, "r");
+	if (in == 0) return 0;
+	got = _bgzf_read(in, head, 16);
+	_bgzf_close(in);
+	if (got != 16) return 0;
+	if (memcmp(g_magic, head, 16) != 0) return 0;
+	return 1;
+}
+
+// Read one uncompressed byte. Returns the byte value (0-255), -1 at end of
+// file, or -2 if loading the next block failed.
+int bgzf_getc(BGZF *fp)
+{
+	int c;
+	if (fp->block_offset >= fp->block_length) {
+		if (bgzf_read_block(fp) != 0) return -2; /* error */
+		if (fp->block_length == 0) return -1; /* end-of-file */
+	}
+	c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
+    /* block consumed: remember where the next one starts and mark the buffer empty */
+    if (fp->block_offset == fp->block_length) {
+        fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+        fp->block_offset = 0;
+        fp->block_length = 0;
+    }
+	return c;
+}
+
+// Round a 32-bit value up to the next power of two, in place.
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+// Read bytes up to (not including) the next `delim` into `str`, growing
+// str->s as needed; the delimiter itself is consumed. A line may span several
+// blocks. Returns the length of the line, or -1 (end of file) / -2 (read
+// error) when nothing was read before that condition occurred.
+int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
+{
+	int l, state = 0; // state: 0 = keep reading, 1 = delimiter found, -1 = EOF, -2 = error
+	unsigned char *buf = (unsigned char*)fp->uncompressed_block;
+	str->l = 0;
+	do {
+		if (fp->block_offset >= fp->block_length) {
+			if (bgzf_read_block(fp) != 0) { state = -2; break; }
+			if (fp->block_length == 0) { state = -1; break; }
+		}
+		// scan the rest of the block for the delimiter
+		for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
+		if (l < fp->block_length) state = 1;
+		l -= fp->block_offset; // number of bytes to append
+		if (str->l + l + 1 >= str->m) {
+			str->m = str->l + l + 2;
+			kroundup32(str->m);
+			str->s = (char*)realloc(str->s, str->m);
+		}
+		memcpy(str->s + str->l, buf + fp->block_offset, l);
+		str->l += l;
+		fp->block_offset += l + 1; // +1 skips the delimiter
+		if (fp->block_offset >= fp->block_length) {
+			fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+			fp->block_offset = 0;
+			fp->block_length = 0;
+		} 
+	} while (state == 0);
+	if (str->l == 0 && state < 0) return state;
+	str->s[str->l] = 0;
+	return str->l;
+}
diff --git a/determine-phred b/determine-phred
new file mode 100755
index 0000000..ac8d39f
--- /dev/null
+++ b/determine-phred
@@ -0,0 +1,86 @@
+#!/usr/bin/perl
+use strict;
+
+# Upper bound on the number of quality strings examined per run.
+my $ssiz=7000;			# sample size
+
+# -h or -? prints the usage text and exits successfully.
+if ($ARGV[0] =~ /^-[h?]/) {
+	print "Usage: determine-phred FILE
+
+Reads a sam, fastq or pileup, possibly gzipped and returns the phred-scale, 
+  either 64 or 33, based on a quick scan of the data in the file.
+";
+	exit 0;
+}
+my $cnt;			# number of quality strings examined so far
+my $dphred = 64;		# result; stays 64 unless determine() finds evidence for 33
+# Only the first argument is rewritten into a gunzip pipe for <> to open.
+if ($ARGV[0] =~ /\.gz$/) {
+	$ARGV[0] = "gunzip -c '$ARGV[0]'|";
+}
+my $qual;			# current quality string
+my $comm;			# third line of the first fastq record
+my $fmt;			# detected format: 'fq', 'pileup' or 'sam'
+# With several files, report "name<TAB>scale" per file; with one, just the scale.
+if (@ARGV > 1) {
+	my @mult = @ARGV;
+	for my $f (@mult) {
+		@ARGV = ($f);
+		determine();
+		print "$f\t$dphred\n";
+	}
+} else {
+	determine();
+	print "$dphred\n";
+}
+
+# Inspect the input selected through @ARGV (or STDIN) and set $dphred to 33 or
+# 64. The first line decides the format (fastq, pileup or sam); after that up
+# to $ssiz quality strings are scanned. Any character with an ASCII code below
+# 64 in the second half of a quality string means phred+33. Inputs with fewer
+# than roughly ten records are reported as 33 as well.
+# Dies with "Unknown file format" when no quality string can be extracted.
+sub determine {
+	# Start every file from a clean state: without this a second file would
+	# inherit the counter and verdict of the first one, and <> would go on
+	# reading the previous file, which is still open.
+	$cnt = 0;
+	$dphred = 64;
+	close(ARGV);
+	$_ = <>;
+	if (/^\@/ && ! /^\@SQ\t/) {
+		# fastq
+		scalar <>;              # read
+		$comm = scalar <>;      # comment
+		if (!(substr($comm,0,1) eq '+')) {
+			die "Unknown file format\n";
+		}
+		$qual = <>;
+		chomp $qual;
+		$fmt = 'fq';
+	} elsif (/^\S+\t\d+\t[ACTGN]\t\d+\t\S+\t(\S+)$/i) {
+		$qual = $1;
+		$fmt = 'pileup';
+	} else {
+		# sam
+		$fmt = 'sam';
+		# QUAL may be the last column; a trailing newline (ASCII 10) would
+		# otherwise always be taken as evidence for phred+33
+		chomp;
+		$qual = (split(/\t/, $_))[10];
+	}
+	if (!$qual) {
+		die "Unknown file format\n";
+	}
+	my $rc = 1;
+	while($qual) {
+		++$rc;
+		for (my $i =length($qual)/2; $i < length($qual); ++$i) {
+			if (ord(substr($qual,$i,1)) < 64) {
+				$dphred = 33;
+				$cnt=$ssiz;	# last
+				last;
+			}
+		}
+		$qual = '';
+		last if ++$cnt >= $ssiz;	# got enough
+		if ($fmt eq 'fq') {
+			# fastq
+			last if ! scalar <>;		# id
+			last if ! scalar <>;		# read
+			last if ! scalar <>;		# comment
+			$qual = <>;
+			chomp $qual;
+		} else {
+			# pileup and sam: advance to the next record; previously the
+			# first line was re-examined on every iteration
+			$_ = <>;
+			last if !defined $_;
+			chomp;
+			$qual = (split(/\t/, $_))[$fmt eq 'pileup' ? 5 : 10];
+		}
+	}
+	if ($rc < 10) {
+		$dphred = 33;
+	}
+}
diff --git a/ea-utils.spec b/ea-utils.spec
new file mode 100644
index 0000000..633c489
--- /dev/null
+++ b/ea-utils.spec
@@ -0,0 +1,37 @@
+%define name ea-utils
+%define ver 1.1.2
+%define rel 686
+
+Summary: 	fastq-processing utilities
+Name:           %{name}
+Version:        %{ver}
+Release:        %{rel}
+Source:         %{name}.tar.gz
+Prefix:         /usr
+BuildRoot:      /tmp/%{name}-%{ver}-root
+Vendor:         Expression Analysis <earonesty at expressionanalysis.com>
+URL:            https://code.google.com/p/ea-utils/
+License: 	MIT
+Group: 		Applications/Engineering
+Distribution: 	Centos 5
+Packager: 	Erik Aronesty <earonesty at expressionanalysis.com>
+
+%description
+Utilities for processing fastq files, stitching paired-end reads,
+demultiplexing paired-end in-sync, adapter-trimming & skew removal.
+
+%prep
+%setup -c
+
+%install
+make PREFIX=%{buildroot}/%{_prefix} install
+
+%clean
+rm -rf %{buildroot}
+
+%files
+
+%{_bindir}/fastq-join
+%{_bindir}/fastq-clipper
+%{_bindir}/fastq-mcf
+%{_bindir}/fastq-multx
diff --git a/faidx.c b/faidx.c
new file mode 100644
index 0000000..51c82ac
--- /dev/null
+++ b/faidx.c
@@ -0,0 +1,437 @@
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include "faidx.h"
+#include "khash.h"
+
+// One index record, as written to a line of the .fai file by fai_save().
+typedef struct {
+	int32_t line_len, line_blen; // per sequence line: characters including the newline / graphical characters only (as counted by fai_build_core)
+	int64_t len;                 // total number of graphical characters in the sequence
+	uint64_t offset;             // file offset of the first sequence line
+} faidx1_t;
+// hash map: sequence name (string) -> faidx1_t
+KHASH_MAP_INIT_STR(s, faidx1_t)
+
+#ifndef _NO_RAZF
+#include "razf.h"
+#else
+#ifdef _WIN32
+#define ftello(fp) ftell(fp)
+#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
+#else
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+#endif
+#define RAZF FILE
+#define razf_read(fp, buf, size) fread(buf, 1, size, fp)
+#define razf_open(fn, mode) fopen(fn, mode)
+#define razf_close(fp) fclose(fp)
+#define razf_seek(fp, offset, whence) fseeko(fp, offset, whence)
+#define razf_tell(fp) ftello(fp)
+#endif
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+// In-memory FASTA index.
+struct __faidx_t {
+	RAZF *rz;         // open FASTA file; set by fai_load(), NULL otherwise
+	int n, m;         // number of sequences / allocated capacity of `name`
+	char **name;      // sequence names in insertion order; also the hash keys
+	khash_t(s) *hash; // name -> index record
+};
+
+// Round a 32-bit value up to the next power of two, in place.
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+// Append a sequence record to the index: the name is duplicated, stored in
+// idx->name and used as the hash key for the record.
+// NOTE(review): `len` is an int here although faidx1_t.len is int64_t, so
+// lengths passed in as int64_t are narrowed at the call.
+static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset)
+{
+	khint_t k;
+	int ret;
+	faidx1_t t;
+	if (idx->n == idx->m) {
+		// grow the name array geometrically, starting at 16 entries
+		idx->m = idx->m? idx->m<<1 : 16;
+		idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m);
+	}
+	idx->name[idx->n] = strdup(name);
+	k = kh_put(s, idx->hash, idx->name[idx->n], &ret);
+	t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset;
+	kh_value(idx->hash, k) = t;
+	++idx->n;
+}
+
+// Scan a FASTA stream and build an index with one record per sequence.
+// Returns the new index, or 0 after printing a message to stderr when the
+// input is malformed: a header without sequence at the end of the file, an
+// empty line inside a sequence, lines of differing length within a sequence,
+// or sequence data before the first '>' header.
+// Input without any header yields an index with no records.
+//
+// `state` tracks the line structure of the current sequence:
+//   1  header just read, first sequence line expected
+//   0  inside a sequence whose lines so far all have the same length
+//   2  a line of different length was seen (only valid as the last line)
+//   3  more data followed such a line
+faidx_t *fai_build_core(RAZF *rz)
+{
+	char c, *name;
+	int l_name, m_name, ret;
+	int line_len, line_blen, state;
+	int l1, l2;
+	faidx_t *idx;
+	uint64_t offset;
+	int64_t len;
+
+	idx = (faidx_t*)calloc(1, sizeof(faidx_t));
+	idx->hash = kh_init(s);
+	name = 0; l_name = m_name = 0;
+	len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
+	while (razf_read(rz, &c, 1)) {
+		if (c == '\n') { // an empty line
+			if (state == 1) {
+				offset = razf_tell(rz);
+				continue;
+			} else if ((state == 0 && len < 0) || state == 2) continue;
+		}
+		if (c == '>') { // fasta header
+			if (len >= 0)
+				fai_insert_index(idx, name, len, line_len, line_blen, offset);
+			l_name = 0;
+			if (name == 0) {
+				// make sure the terminator below has somewhere to go even
+				// when the header carries an empty name (">\n")
+				m_name = 16;
+				name = (char*)malloc(m_name);
+			}
+			while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) {
+				if (m_name < l_name + 2) {
+					m_name = l_name + 2;
+					kroundup32(m_name);
+					name = (char*)realloc(name, m_name);
+				}
+				name[l_name++] = c;
+			}
+			name[l_name] = '\0';
+			if (ret == 0) {
+				fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
+				free(name); fai_destroy(idx);
+				return 0;
+			}
+			// skip the rest of the header line
+			if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n');
+			state = 1; len = 0;
+			offset = razf_tell(rz);
+		} else {
+			if (state == 3) {
+				fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
+				free(name); fai_destroy(idx);
+				return 0;
+			}
+			if (state == 2) state = 3;
+			// l1 counts every character of the line, l2 only graphical ones
+			l1 = l2 = 0;
+			do {
+				++l1;
+				if (isgraph(c)) ++l2;
+			} while ((ret = razf_read(rz, &c, 1)) && c != '\n');
+			if (name == 0 && l2) {
+				// sequence characters without a preceding header would make
+				// `len` non-negative and later insert a record with a NULL name
+				fprintf(stderr, "[fai_build_core] sequence data found before the first '>' header\n");
+				fai_destroy(idx);
+				return 0;
+			}
+			if (state == 3 && l2) {
+				fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
+				free(name); fai_destroy(idx);
+				return 0;
+			}
+			++l1; len += l2; // ++l1 accounts for the newline
+			if (state == 1) line_len = l1, line_blen = l2, state = 0;
+			else if (state == 0) {
+				if (l1 != line_len || l2 != line_blen) state = 2;
+			}
+		}
+	}
+	// record the last sequence; `name` is NULL when no header was seen at all
+	if (name) fai_insert_index(idx, name, len, line_len, line_blen, offset);
+	free(name);
+	return idx;
+}
+
+// Write the index to `fp`, one tab-separated line per sequence in insertion
+// order: name, length, offset, bases per line, characters per line.
+// NOTE(review): the length is cast to int before printing.
+void fai_save(const faidx_t *fai, FILE *fp)
+{
+	khint_t k;
+	int i;
+	for (i = 0; i < fai->n; ++i) {
+		faidx1_t x;
+		k = kh_get(s, fai->hash, fai->name[i]);
+		x = kh_value(fai->hash, k);
+#ifdef _WIN32
+		fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len);
+#else
+		fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len);
+#endif
+	}
+}
+
+// Parse an index in the format written by fai_save() and return it as a new
+// faidx_t (with rz left NULL). Lines that do not contain a name followed by
+// four numeric fields are skipped with a warning on stderr instead of being
+// inserted with uninitialised values.
+faidx_t *fai_read(FILE *fp)
+{
+	faidx_t *fai;
+	char *buf, *p;
+	int len, line_len, line_blen;
+#ifdef _WIN32
+	long offset;
+#else
+	long long offset;
+#endif
+	fai = (faidx_t*)calloc(1, sizeof(faidx_t));
+	fai->hash = kh_init(s);
+	buf = (char*)calloc(0x10000, 1);
+	while (fgets(buf, 0x10000, fp)) {
+		int n_fields = 0;
+		// the name is the leading run of graphical characters
+		for (p = buf; *p && isgraph((unsigned char)*p); ++p);
+		if (*p) {
+			*p = 0; ++p;
+#ifdef _WIN32
+			n_fields = sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len);
+#else
+			n_fields = sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len);
+#endif
+		}
+		if (n_fields != 4) {
+			fprintf(stderr, "[fai_read] skipping malformed index line for '%s'\n", buf);
+			continue;
+		}
+		fai_insert_index(fai, buf, len, line_len, line_blen, offset);
+	}
+	free(buf);
+	return fai;
+}
+
+// Free an index: all stored names, the name array, the hash table, the open
+// FASTA file (if any) and the index structure itself.
+void fai_destroy(faidx_t *fai)
+{
+	int idx = 0;
+	while (idx < fai->n) {
+		free(fai->name[idx]);
+		++idx;
+	}
+	free(fai->name);
+	kh_destroy(s, fai->hash);
+	if (fai->rz != 0) razf_close(fai->rz);
+	free(fai);
+}
+
+// Build the index of FASTA file `fn` and write it to "<fn>.fai".
+// Returns 0 on success and -1 if the FASTA file cannot be opened, its contents
+// cannot be indexed, or the index file cannot be written.
+int fai_build(const char *fn)
+{
+	char *str;
+	RAZF *rz;
+	FILE *fp;
+	faidx_t *fai;
+	str = (char*)calloc(strlen(fn) + 5, 1); // room for ".fai" and the terminator
+	sprintf(str, "%s.fai", fn);
+	rz = razf_open(fn, "r");
+	if (rz == 0) {
+		fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
+		free(str);
+		return -1;
+	}
+	fai = fai_build_core(rz);
+	razf_close(rz);
+	if (fai == 0) {
+		// fai_build_core() has already reported the problem; without this
+		// check the NULL index would be dereferenced by fai_save()
+		free(str);
+		return -1;
+	}
+	fp = fopen(str, "wb");
+	if (fp == 0) {
+		fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
+		fai_destroy(fai); free(str);
+		return -1;
+	}
+	fai_save(fai, fp);
+	fclose(fp);
+	free(str);
+	fai_destroy(fai);
+	return 0;
+}
+
+#ifdef _USE_KNETFILE
+// Open the file named by the last path component of URL `fn` in the working
+// directory, downloading it from the URL first if no local copy exists.
+// Returns the local file opened for reading, or NULL if the remote file cannot
+// be opened or the local copy cannot be created.
+FILE *download_and_open(const char *fn)
+{
+    const int buf_size = 1 * 1024 * 1024;
+    uint8_t *buf;
+    FILE *fp;
+    knetFile *fp_remote;
+    const char *url = fn;
+    const char *p;
+    int l = strlen(fn);
+    // from here on `fn` is the part of the URL after the last '/'
+    for (p = fn + l - 1; p >= fn; --p)
+        if (*p == '/') break;
+    fn = p + 1;
+
+    // First try to open a local copy
+    fp = fopen(fn, "r");
+    if (fp)
+        return fp;
+
+    // If failed, download from remote and open
+    fp_remote = knet_open(url, "rb");
+    if (fp_remote == 0) {
+        fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url);
+        return NULL;
+    }
+    if ((fp = fopen(fn, "wb")) == 0) {
+        fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn);
+        knet_close(fp_remote);
+        return NULL;
+    }
+    buf = (uint8_t*)calloc(buf_size, 1);
+    // NOTE(review): the loop only stops on a return value of 0 — confirm that
+    // knet_read() cannot return a negative value on error.
+    while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
+        fwrite(buf, 1, l, fp);
+    free(buf);
+    fclose(fp);
+    knet_close(fp_remote);
+
+    return fopen(fn, "r");
+}
+#endif
+
+// Load the index "<fn>.fai" (building it first if it does not exist yet) and
+// open the FASTA file `fn` for subsequent fetches. Returns the index, or 0 if
+// the index or the FASTA file cannot be opened.
+faidx_t *fai_load(const char *fn)
+{
+	char *str;
+	FILE *fp;
+	faidx_t *fai;
+	str = (char*)calloc(strlen(fn) + 5, 1); // room for ".fai" and the terminator
+	sprintf(str, "%s.fai", fn);
+
+#ifdef _USE_KNETFILE
+	if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)
+	{
+		fp = download_and_open(str);
+		if ( !fp )
+		{
+			fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str);
+			free(str);
+			return 0;
+		}
+	}
+	else
+#endif
+		fp = fopen(str, "rb");
+	if (fp == 0) {
+		fprintf(stderr, "[fai_load] build FASTA index.\n");
+		fai_build(fn);
+		fp = fopen(str, "rb");
+		if (fp == 0) {
+			fprintf(stderr, "[fai_load] fail to open FASTA index.\n");
+			free(str);
+			return 0;
+		}
+	}
+
+	fai = fai_read(fp);
+	fclose(fp);
+
+	fai->rz = razf_open(fn, "rb");
+	free(str);
+	if (fai->rz == 0) {
+		fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
+		fai_destroy(fai); // do not leak the index that was just read
+		return 0;
+	}
+	return fai;
+}
+
+// Fetch the sequence described by region string `str` ("name",
+// "name:beg" or "name:beg-end"; positions are 1-based, commas and white space
+// are ignored). If the part after the last colon is not a valid interval, the
+// whole string is tried as a sequence name.
+// Returns a newly allocated, NUL-terminated string that the caller must free
+// and stores its length in *len. Returns 0 with *len set to 0 when the
+// sequence name is not in the index.
+char *fai_fetch(const faidx_t *fai, const char *str, int *len)
+{
+	char *s, c;
+	int i, l, k, name_end;
+	khiter_t iter;
+	faidx1_t val;
+	khash_t(s) *h;
+	int beg, end;
+
+	beg = end = -1;
+	h = fai->hash;
+	name_end = l = strlen(str);
+	s = (char*)malloc(l+1);
+	// remove space
+	for (i = k = 0; i < l; ++i)
+		if (!isspace(str[i])) s[k++] = str[i];
+	s[k] = 0; l = k;
+	// determine the sequence name
+	for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
+	if (i >= 0) name_end = i;
+	if (name_end < l) { // check if this is really the end
+		int n_hyphen = 0;
+		for (i = name_end + 1; i < l; ++i) {
+			if (s[i] == '-') ++n_hyphen;
+			else if (!isdigit(s[i]) && s[i] != ',') break;
+		}
+		if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
+		s[name_end] = 0;
+		iter = kh_get(s, h, s);
+		if (iter == kh_end(h)) { // cannot find the sequence name
+			iter = kh_get(s, h, str); // try str as the name
+			if (iter == kh_end(h)) {
+				*len = 0;
+				free(s); return 0;
+			} else s[name_end] = ':', name_end = l;
+		}
+	} else iter = kh_get(s, h, str);
+	if(iter == kh_end(h)) {
+		fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str);
+		*len = 0; // callers read *len even when no sequence is returned
+		free(s);
+		return 0;
+	};
+	val = kh_value(h, iter);
+	// parse the interval
+	if (name_end < l) {
+		for (i = k = name_end + 1; i < l; ++i)
+			if (s[i] != ',') s[k++] = s[i];
+		s[k] = 0;
+		beg = atoi(s + name_end + 1);
+		for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
+		end = i < k? atoi(s + i + 1) : val.len;
+		if (beg > 0) --beg; // 1-based to 0-based
+	} else beg = 0, end = val.len;
+	// "name:-N" parses as a negative start; without clamping the seek below
+	// would land before the first base of the sequence
+	if (beg < 0) beg = 0;
+	if (beg >= val.len) beg = val.len;
+	if (end >= val.len) end = val.len;
+	if (beg > end) beg = end;
+	free(s);
+
+	// now retrieve the sequence
+	l = 0;
+	s = (char*)malloc(end - beg + 2);
+	razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
+	while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err)
+		if (isgraph(c)) s[l++] = c;
+	s[l] = '\0';
+	*len = l;
+	return s;
+}
+
+// Command-line driver: with only a FASTA file, build its index; with
+// additional region arguments, print each region in FASTA format, 60
+// characters per line. Returns 1 on usage or load errors, 0 otherwise.
+int faidx_main(int argc, char *argv[])
+{
+	if (argc == 1) {
+		fprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\n");
+		return 1;
+	} else {
+		if (argc == 2) fai_build(argv[1]);
+		else {
+			int i, j, k, l;
+			char *s;
+			faidx_t *fai;
+			fai = fai_load(argv[1]);
+			if (fai == 0) return 1;
+			for (i = 2; i != argc; ++i) {
+				printf(">%s\n", argv[i]);
+				// NOTE(review): `l` is only meaningful if fai_fetch() stores a
+				// length on every path, including when it returns NULL.
+				s = fai_fetch(fai, argv[i], &l);
+				for (j = 0; j < l; j += 60) {
+					for (k = 0; k < 60 && k < l - j; ++k)
+						putchar(s[j + k]);
+					putchar('\n');
+				}
+				free(s);
+			}
+			fai_destroy(fai);
+		}
+	}
+	return 0;
+}
+
+// Number of sequences stored in the index.
+int faidx_fetch_nseq(const faidx_t *fai)
+{
+	const int n_seq = fai->n;
+	return n_seq;
+}
+
+// Fetch bases p_beg_i..p_end_i (0-based, inclusive) of sequence `c_name`.
+// Coordinates outside the sequence are clamped. Returns a newly allocated,
+// NUL-terminated string (to be freed by the caller) and stores its length in
+// *len; returns 0 when `c_name` is not in the index.
+// NOTE(review): *len is not written on the not-found path, and for a sequence
+// of length 0 the clamping below yields -1 for both coordinates.
+char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len)
+{
+	int l;
+	char c;
+    khiter_t iter;
+    faidx1_t val;
+	char *seq=NULL;
+
+    // Adjust position
+    iter = kh_get(s, fai->hash, c_name);
+    if(iter == kh_end(fai->hash)) return 0;
+    val = kh_value(fai->hash, iter);
+	if(p_end_i < p_beg_i) p_beg_i = p_end_i;
+    if(p_beg_i < 0) p_beg_i = 0;
+    else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
+    if(p_end_i < 0) p_end_i = 0;
+    else if(val.len <= p_end_i) p_end_i = val.len - 1;
+
+    // Now retrieve the sequence 
+	l = 0;
+	seq = (char*)malloc(p_end_i - p_beg_i + 2);
+	// whole lines before the start, plus the offset within the line
+	razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
+	while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1)
+		if (isgraph(c)) seq[l++] = c;
+	seq[l] = '\0';
+	*len = l;
+	return seq;
+}
+
+#ifdef FAIDX_MAIN
+// Stand-alone entry point, compiled only when FAIDX_MAIN is defined.
+int main(int argc, char *argv[]) { return faidx_main(argc, argv); }
+#endif
diff --git a/fastq-clipper.c b/fastq-clipper.c
new file mode 100644
index 0000000..842c464
--- /dev/null
+++ b/fastq-clipper.c
@@ -0,0 +1,279 @@
+/*
+Copyright (c) 2011 Expression Analysis / Erik Aronesty
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+
+Replaced, largely, by fastq-mcf.
+
+See "void usage" below for usage.
+
+*/
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <math.h>
+
+#include "fastq-lib.h"
+
+#define MAX_ADAPTER_NUM 20
+#define MAX_ADAPTER_LEN 160
+
+void usage(FILE *f);
+int hd(char *a, char *b, int n);
+int debug=0;
+// fastq-clipper entry point: parse the options, then read the fastq input
+// record by record, look for any of the colon-separated adapters at the tail
+// of each read, clip the best match and write the record unless what remains
+// is shorter than the -l limit. Summary counts are printed at the end (to
+// stderr, or to stdout when -o is used). Returns 0 on success and 1 on usage
+// or file errors.
+int main (int argc, char **argv) {
+	int c;				// getopt() returns int; a plain char can never equal -1 where char is unsigned
+	bool eol;			// set by -e / -b, currently not used by the clipping code
+	int nmin = 4, nkeep = 15, xmax=-1, pctdiff = 20;
+	char *outfile = NULL;
+
+	int i;
+
+	char *a = NULL, *f = NULL;
+	// the leading '-' in the option string makes getopt return non-option
+	// arguments as option '\1'
+	while (	(c = getopt (argc, argv, "-hedbp:i:o:l:m:x::")) != -1) {
+		switch (c) {
+		case '\1':
+			if (!f)
+				f=optarg;
+			else if (!a)
+				a=optarg;
+			else {
+				usage(stderr); return 1;
+			}
+			break;
+		case 'm': nmin = atoi(optarg); break;
+		case 'p': pctdiff = atoi(optarg); break;
+		case 'l': nkeep = atoi(optarg); break;
+		case 'e': eol = 1; break;
+		case 'h': usage(stdout); return 1;
+		case 'b': eol = 0; break;
+		case 'd': debug = 1; break;
+		case 'x': xmax = optarg ? atoi(optarg) : -1; break;
+		case 'o': outfile = optarg; break;
+		case 'i': f = optarg; break;
+		case '?':
+			if (strchr("lm", optopt))
+				fprintf (stderr, "Option -%c requires an argument.\n", optopt);
+			else if (isprint(optopt))
+				fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+			else
+				fprintf (stderr,
+					"Unknown option character `\\x%x'.\n",
+					optopt);
+			usage(stderr);
+			return 1;
+		}
+	}
+
+	if (argc < 3 || !a || !f) {
+		usage(stderr);
+		return 1;
+	}
+
+	FILE *fin = strcmp(f,"-") ? fopen(f, "r") : stdin;
+	if (!fin) {
+		fprintf(stderr, "Error opening file '%s': %s\n",f, strerror(errno));
+		return 1;
+	}
+
+	FILE *fout = stdout;
+	FILE *fstat = stderr;
+	if (outfile ) {
+		fout = fopen(outfile, "w");
+		if (!fout) {
+			fprintf(stderr, "Error opening output file '%s': %s\n",outfile, strerror(errno));
+			return 1;
+		}
+		fstat = stdout;
+	}
+
+	// split the adapter argument on ':' (at most MAX_ADAPTER_NUM entries)
+	char *adapters[MAX_ADAPTER_NUM+1];
+	int adapter_len[MAX_ADAPTER_NUM+1];
+	char *p;
+	int adapter_count=0;
+	while ((p = strtok(a, ":")) != NULL) {
+		a = NULL;					// strtok requirement
+		adapters[adapter_count] = p;
+		adapter_len[adapter_count] = strlen(p);		// append to list
+		++adapter_count;
+		if (adapter_count >= MAX_ADAPTER_NUM) {
+			break;
+		}
+	}
+
+	char *s[4] = {0,0,0,0}; 	// id, sequence, comment, quality
+	size_t na[4] = {0,0,0,0};	// allocated sizes of above
+	int ns[4] = {0,0,0,0};	// lengths of above
+	int nrec=0;
+	int nerr=0;
+	int nok=0;
+	int ntooshort=0;
+	int ntrim=0;
+	int nbtrim=0;
+	while (1) {
+		int i;
+		for (i = 0; i < 4; ++i ) {
+			ns[i] = getline(&s[i], &na[i], fin);
+		}
+
+		if (ns[1] <= 0) {
+			break;
+		}
+
+		++nrec;
+
+		// skip malformed records
+		if (ns[1] != ns[3] || s[0][0] != '@' || s[2][0] != '+') {
+			if (nerr < 10) {
+				fprintf(stderr, "Malformed fastq record at line %d\n", nrec*4-3);
+			}
+			++nerr;
+			continue;
+		}
+
+		// chomp
+		s[1][ns[1]-1]='\0';
+		--ns[1];
+		s[3][ns[3]-1]='\0';
+		--ns[3];
+
+		if (debug) fprintf(stderr, "seq: %s %d\n", s[1], ns[1]);
+
+		bool skip = 0;
+		int bestscore = 999, bestoff = 0, bestlen = 0;
+
+		for (i =0; i < adapter_count; ++i) {
+			int nmatch = nmin;
+			if (!nmatch) nmatch = adapter_len[i];			// full match required if nmin == 0
+
+			// how far in to search for a match?
+			int mx = adapter_len[i];
+			if (xmax) {
+				 mx = ns[1];
+				 if (xmax > 0 && (xmax+adapter_len[i]) < mx)
+					mx = xmax+adapter_len[i];		// xmax is added to adapter length
+			}
+			// never look further back than the read is long: with -x0 and a
+			// read shorter than the adapter the tail pointer would otherwise
+			// point before the start of the sequence buffer
+			if (mx > ns[1]) mx = ns[1];
+
+			if (debug)
+				fprintf(stderr, "adapter: %s, adlen: %d, nmatch: %d, mx: %d\n", adapters[i], adapter_len[i], nmatch, mx);
+
+			int off;
+			for (off = nmatch; off <= mx; ++off) {			// off is distance from tail of sequence
+				char *seqtail = s[1]+ns[1]-off; 		// search at tail
+				int ncmp = off<adapter_len[i] ? off : adapter_len[i];
+				int mind = (pctdiff * ncmp) / 100;		// differences tolerated for this length
+				int d = hd(adapters[i],seqtail,ncmp);		// # differences
+				if (debug)
+					fprintf(stderr, "tail: %s, bestoff: %d, off: %d, ncmp: %d, mind: %d, hd %d\n", seqtail, bestoff, off, ncmp, mind, d);
+				// calc squared distance over length score
+				if (d <= mind) {
+					int score = (d*d+1)/ncmp;
+					if (score <= bestscore) {			// better score?
+						bestscore = score;			// save max score
+						bestoff = off;				// offset at max
+						bestlen = ncmp;				// cmp length at max
+					}
+					if (d == 0 && (ncmp == adapter_len[i])) {
+						break;
+					}
+				}
+			}
+
+			// assure time wasn't wasted running a comparison that couldn't matter
+			assert((bestlen == 0) || (bestlen >= nmatch));
+
+			if (bestoff > 0) {
+				if ( (ns[1]-bestoff) < nkeep) {
+					++ntooshort;
+					skip = 1;
+					break;
+				}
+			}
+		}
+
+		if (!skip) {
+			if (bestoff > 0) {
+				++ntrim;
+				s[1][ns[1]-bestoff]='\0';
+				s[3][ns[1]-bestoff]='\0';
+			}
+			fputs(s[0],fout);
+			fputs(s[1],fout);
+			fputc('\n',fout);
+			fputs(s[2],fout);
+			fputs(s[3],fout);
+			fputc('\n',fout);
+		}
+	}
+	fprintf(fstat, "Total: %d\n", nrec);
+	fprintf(fstat, "Too Short: %d\n", ntooshort);
+	fprintf(fstat, "Trimmed: %d\n", ntrim);
+	fprintf(fstat, "Errors: %d\n", nerr);
+	return 0;
+}
+
+// Print the command-line help to `f`. The defaults quoted for -p, -m and -l
+// match the initial values of pctdiff, nmin and nkeep in main().
+void usage(FILE *f) {
+	fprintf(f, 
+"usage: fastq-clipper [options] <fastq-file> <adapters>\n"
+"\n"
+"Removes one or more adapter sequences from the fastq file.\n"
+"Adapter sequences are colon-delimited.\n"
+"Stats go to stderr, unless -o is specified.\n"
+"\n"
+"Options:\n"
+"	-h	This help\n"
+"	-o FIL	Output file (stats to stdout)\n"
+"	-p N	Maximum difference percentage (20)\n"
+"	-m N	Minimum clip length (4)\n"
+"	-l N	Minimum remaining sequence length (15)\n"
+"	-x [N]	Extra match length past adapter length, \n"
+"		 N =-1 : search all\n"
+"		 N = 0 : search only up to adapter length\n"
+"	-e	End-of-line (default)\n"
+"	-b	Beginning-of-line (not supported yet)\n"
+	);
+}
+
+/*
+#!/usr/bin/perl
+
+my ($f, $a) = @ARGV;
+
+my @a = split(/,/, $a);
+
+open (F, $f) || die;
+
+while (my $r = <F>) {
+	for my $a (@a) {
+		for (my $i = 1; $i < length($a); ++$i) {
+			
+		}
+	}
+}
+# http://www.perlmonks.org/?node_id=500235
+sub hd{ length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] ) }
+*/
diff --git a/fastq-join.c b/fastq-join.c
new file mode 100644
index 0000000..30b54a3
--- /dev/null
+++ b/fastq-join.c
@@ -0,0 +1,424 @@
+/*
+Copyright (c) 2011 Expression Analysis / Erik Aronesty
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "fastq-lib.h"
+
+/*
+
+See "void usage" below for usage.
+
+*/
+
+#define VERSION "1.01"
+#define SVNREV atoi(strchr("$LastChangedRevision: 679 $", ':')+1)
+
+void usage(FILE *f);
+int debug=0;
+
+/*
+ * fastq-join entry point.
+ *
+ * Reads records in lockstep from read1, read2 and an optional third "mate"
+ * file.  For each pair, read2 is reverse-complemented (unless -R) and every
+ * overlap length from the -m minimum up to the shorter read length is scored
+ * by Hamming distance; the lowest-scoring overlap within the -p percentage
+ * wins.  Joined pairs are written to the "join" output, the others to
+ * "un1"/"un2"; the mate record follows its pair into "join2" or "un3".
+ * Summary statistics go to stdout.
+ *
+ * Returns 0 on success, 1 on usage errors or I/O failures.
+ */
+int main (int argc, char **argv) {
+	int c;						// int, not char: getopt() returns -1, which an unsigned char never equals
+	char *in[3] = {0,0,0};
+	char *out[5] = {0,0,0,0,0};
+	char *orep=NULL;
+	int out_n = 0;
+	int in_n = 0;
+	int threads = 1;				// -t is still accepted, but the value is not used
+	char verify='\0';
+
+	int i;
+	int mino = 6;
+	int pctdiff = 8;				// this number tested well on exome data... tweak for best results
+	bool norevcomp = false;
+	bool allow_ex = false;
+
+	// leading '-' in the option string: non-option arguments are returned as option '\1'
+	while (	(c = getopt (argc, argv, "-dRnbeo:t:v:m:p:r:xV")) != -1) {
+		switch (c) {
+		case '\1':
+			if (!in[0])
+				in[0]=optarg;
+			else if (!in[1])
+				in[1]=optarg;
+			else if (!in[2])
+				in[2]=optarg;
+			else {
+				usage(stderr); return 1;
+			}
+			++in_n;
+			break;
+		case 'o':
+			// out[] has room for 5 names (un1, un2, join, un3, join2);
+			// whether the count fits the inputs is checked after parsing
+			if (out_n == 5) {
+				usage(stderr); return 1;
+			}
+			out[out_n++] = optarg;
+			break;
+		case 'r': orep = optarg; break;
+		case 't': threads = atoi(optarg); break;
+		case 'V': printf("Version: %s.%d\n", VERSION, SVNREV); return 0; break;
+		case 'm': mino = atoi(optarg); break;
+		case 'x': allow_ex = true; break;
+		case 'p': pctdiff = atoi(optarg); break;
+		case 'R': norevcomp = true; break;
+		case 'd': debug = 1; break;
+		case 'v':
+			if (strlen(optarg)>1) {
+				fprintf(stderr, "Option -v requires a single character argument");
+				exit(1);
+			}
+			verify = *optarg; break;
+		case '?':
+			if (strchr("otvmpr", optopt))
+				fprintf (stderr, "Option -%c requires an argument.\n", optopt);
+			else if (isprint(optopt))
+				fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+			else
+				fprintf (stderr,
+					"Unknown option character `\\x%x'.\n",
+					optopt);
+			usage(stderr);
+			return 1;
+		}
+	}
+	(void) threads;
+
+	if (argc < 3 || !in[1] || (!in[2] && out_n != 1 && out_n != 3) || (in[2] && out_n != 1 && out_n != 5)) {
+		usage(stderr);
+		return 1;
+	}
+
+	// the overlap score divides by the overlap length, so it must be at least 1
+	if (mino < 1)
+		mino = 1;
+
+	FILE *fin[3]; meminit(fin);			// read1, read2 and the optional mate file
+	bool gzin[3]; meminit(gzin);
+	for (i = 0; i < in_n; ++i) {
+		fin[i] = gzopen(in[i], "r",&gzin[i]);
+		if (!fin[i]) {
+			fprintf(stderr, "Error opening file '%s': %s\n",in[i], strerror(errno));
+			return 1;
+		}
+	}
+
+	const char *suffix[5]={"un1", "un2", "join", "un3", "join2"};
+	FILE *fout[5]; meminit(fout);
+	bool gzout[5]; meminit(gzout);
+	char *pre = out[0];
+	for (i = 0; i < (in[2] ? 5 : 3); ++i) {
+		// prefix out
+		if (out_n == 1) {
+			// +10 leaves room for the longest suffix ("join2") and the terminator
+			out[i]=(char *)malloc(strlen(pre)+10);
+			if (!out[i]) {
+				fprintf(stderr, "Out of memory building output file name\n");
+				return 1;
+			}
+			strcpy(out[i], pre);
+			char *p = strchr(out[i], '%');
+			if (p) {
+				// substitute instead of append
+				strcpy(p, suffix[i]);
+				strcpy(p+strlen(suffix[i]), pre+(p-out[i])+1);
+			} else {
+				strcat(out[i], suffix[i]);
+			}
+		} // else explicit
+		fout[i] = gzopen(out[i], "w",&gzout[i]);
+		if (!fout[i]) {
+			fprintf(stderr, "Error opening output file '%s': %s\n",out[i], strerror(errno));
+			return 1;
+		}
+	}
+
+	FILE *frep = NULL;
+	if (orep) {
+		frep = fopen(orep, "w");
+		// test the stream that was just opened (not the name), and report the report file's own name
+		if (!frep) {
+			fprintf(stderr, "Error opening report file '%s': %s\n",orep, strerror(errno));
+			return 1;
+		}
+	}
+
+	// some basic validation of the file formats
+	for (i=0;i<in_n;++i) {
+		int first=getc(fin[i]);			// int so EOF (empty input) is representable
+		if (first != '@')  {
+			fprintf(stderr, "%s doesn't appear to be a fastq file (%c)\n", in[i], first);
+			return 1;
+		}
+		ungetc(first, fin[i]);
+	}
+
+	struct fq fq[3];
+	meminit(fq);
+
+	int nrec=0;
+	int joincnt=0;
+	double tlen=0;
+	double tlensq=0;
+	int read_ok;
+
+	struct fq rc;
+	meminit(rc);
+
+	// read in 1 record from each file
+	while ((read_ok=read_fq(fin[0], nrec, &fq[0]))) {
+		for (i=1;i<in_n;++i) {
+			int mate_ok=read_fq(fin[i], nrec, &fq[i]);
+			if (read_ok != mate_ok) {
+				fprintf(stderr, "# of rows in mate file '%s' doesn't match primary file, quitting!\n", in[i]);
+				return 1;
+			}
+			if (verify) {
+				// verify 1 in 100
+				if (0 == (nrec % 100)) {
+					char *p=strchr(fq[i].id.s,verify);
+					if (!p) {
+						fprintf(stderr, "File %s is missing id verification char %c at line %d", in[i], verify, nrec*4+1);
+						return 1;
+					}
+					int l = p-fq[i].id.s;
+					if (strncmp(fq[0].id.s, fq[i].id.s, l)) {
+						fprintf(stderr, "File %s, id doesn't match file %s at line %d", in[0], in[i], nrec*4+1);
+						return 1;
+					}
+				}
+			}
+		}
+
+		++nrec;
+		if (read_ok < 0) continue;
+
+		if (debug) fprintf(stderr, "seq: %s %d\n", fq[0].seq.s, fq[0].seq.n);
+
+		if (!norevcomp) {
+			revcomp(&rc, &fq[1]);
+		} else {
+			rc=fq[1];			// shallow copy: rc shares fq[1]'s buffers
+		}
+
+		if (debug) fprintf(stderr, "comp: %s %d\n", rc.seq.s, rc.seq.n);
+
+		// try every overlap length: the tail of read1 against the head of rc
+		int maxo = min(fq[0].seq.n, rc.seq.n);
+		int bestscore=INT_MAX;
+		int besto=-1;
+		for (i=mino; i <= maxo; ++i) {
+			int mind = (pctdiff * i) / 100;
+			int d;
+			d=hd(fq[0].seq.s+fq[0].seq.n-i, rc.seq.s, i);
+			if (debug) fprintf(stderr, "hd: %d, %d\n", i, d);
+			if (d <= mind) {
+				// squared-distance over length, probably can be proven better (like pearson's)
+				int score = (1000*(d*d+1))/i;
+				if (score < bestscore) {
+					bestscore=score;
+					besto=i;
+				}
+			}
+		}
+
+		// -x: also slide the shorter read along the longer one at full overlap
+		int hasex=0;
+		if (allow_ex && besto<maxo) {
+			if (fq[0].seq.n > rc.seq.n) {
+				int mind = (pctdiff * maxo) / 100;
+				for (i=0; i < fq[0].seq.n-maxo; ++i ) {
+					int d;
+					d=hd(fq[0].seq.s+fq[0].seq.n-rc.seq.n-i-1, rc.seq.s, maxo);
+					if (debug) fprintf(stderr, "hd: %d, %d\n", -i, d);
+					if (d <= mind) {
+						// squared-distance over length, probably can be proven better (like pearson's)
+						int score = (1000*(d*d+1))/maxo;
+						if (score < bestscore) {
+							bestscore=score;
+							// negative overlap!
+							hasex=-i;
+							besto=maxo;
+						}
+					}
+				}
+			} else if (fq[0].seq.n < rc.seq.n) {
+				int mind = (pctdiff * maxo) / 100;
+				for (i=0; i < rc.seq.n-maxo; ++i ) {
+					int d;
+					d=hd(fq[0].seq.s, rc.seq.s+i, maxo);
+					if (debug) fprintf(stderr, "hd: %d, %d\n", -i, d);
+					if (d <= mind) {
+						// squared-distance over length, probably can be proven better (like pearson's)
+						int score = (1000*(d*d+1))/maxo;
+						if (score < bestscore) {
+							bestscore=score;
+							// negative overlap!
+							hasex=-i;
+							besto=maxo;
+						}
+					}
+				}
+			}
+		}
+
+		if (debug) {
+			fprintf(stderr, "best: %d %d\n", besto-hasex, bestscore);
+		}
+
+		FILE *fmate = NULL;
+		int olen = besto-hasex;
+
+		if (besto > 0) {
+			++joincnt;
+
+			int l=besto/2;                  // discard from left
+			int r=besto-(besto/2);          // discard from right
+
+			tlen+=olen;
+			tlensq+=olen*olen;
+
+			// original buffer pointers, restored after a negative-overlap join
+			char *sav_fqs=NULL, *sav_rcs=NULL;
+			char *sav_fqq=NULL, *sav_rcq=NULL;
+
+			if (hasex) {
+				sav_fqs=fq[0].seq.s;
+				sav_fqq=fq[0].qual.s;
+				sav_rcs=rc.seq.s;
+				sav_rcq=rc.qual.s;
+				if (fq[0].seq.n < rc.seq.n) {
+					rc.seq.s=rc.seq.s-hasex;
+					rc.qual.s=rc.qual.s-hasex;
+					rc.seq.n=maxo;
+					rc.qual.n=maxo;
+				} else {
+					fq[0].seq.s=fq[0].seq.s+fq[0].seq.n-maxo+hasex-1;
+					fq[0].qual.s=fq[0].qual.s+fq[0].seq.n-maxo+hasex-1;
+					fq[0].seq.n=maxo;
+					fq[0].qual.n=maxo;
+				}
+				// ok now pretend everythings normal, 100% overlap
+			}
+
+			FILE *f=fout[2];
+
+			if (verify) {
+				// cut the joined id at the verification character
+				char *p=strchr(fq[0].id.s,verify);
+				if (p) {
+					*p++ = '\n';
+					*p = '\0';
+				}
+			}
+			fputs(fq[0].id.s,f);
+			for (i = 0; i < besto; ++i ) {
+				int li = fq[0].seq.n-besto+i;
+				int ri = i;
+				if (fq[0].seq.s[li] == rc.seq.s[ri]) {
+					fq[0].qual.s[li] = max(fq[0].qual.s[li], rc.qual.s[ri]);
+					rc.qual.s[ri] = max(fq[0].qual.s[li], rc.qual.s[ri]);
+				} else {
+					// use the better-quality read, although the qual should be downgraded due to the difference!
+					if (fq[0].qual.s[li] > rc.qual.s[ri]) {
+						rc.seq.s[ri] = fq[0].seq.s[li];
+					} else {
+						fq[0].seq.s[li] = rc.seq.s[ri];
+					}
+				}
+			}
+
+			fwrite(fq[0].seq.s,1,fq[0].seq.n-l,f);
+			fputs(rc.seq.s+r,f);
+			fputc('\n',f);
+			fputs(fq[0].com.s,f);
+			fwrite(fq[0].qual.s,1,fq[0].qual.n-l,f);
+			fputs(rc.qual.s+r,f);
+			fputc('\n',f);
+			fmate=fout[4];
+
+			if (sav_fqs) {
+				fq[0].seq.s=sav_fqs;
+				fq[0].qual.s=sav_fqq;
+				rc.seq.s=sav_rcs;
+				rc.qual.s=sav_rcq;
+			}
+
+			if (frep) {
+				fprintf(frep, "%d\n", besto);
+			}
+		} else {
+			for (i=0;i<2;++i) {
+				FILE *f=fout[i];
+				fputs(fq[i].id.s,f);
+				fputs(fq[i].seq.s,f);
+				fputc('\n',f);
+				fputs(fq[i].com.s,f);
+				fputs(fq[i].qual.s,f);
+				fputc('\n',f);
+			}
+			fmate=fout[3];
+		}
+
+		// fout[3]/fout[4] stay NULL when no mate file was given
+		if (fmate) {
+			fputs(fq[2].id.s,fmate);
+			fputs(fq[2].seq.s,fmate);
+			fputc('\n',fmate);
+			fputs(fq[2].com.s,fmate);
+			fputs(fq[2].qual.s,fmate);
+			fputc('\n',fmate);
+		}
+	}
+
+	// Close everything explicitly.  For piped (.gz/.zip) outputs gzclose()
+	// pclose()s the compressor, which waits for it to finish writing.
+	int status = 0;
+	for (i = 0; i < 5; ++i) {
+		if (fout[i] && gzclose(fout[i], gzout[i]) != 0) {
+			fprintf(stderr, "Error closing output file '%s'\n", out[i]);
+			status = 1;
+		}
+	}
+	for (i = 0; i < in_n; ++i) {
+		gzclose(fin[i], gzin[i]);
+	}
+	if (frep && fclose(frep) != 0) {
+		fprintf(stderr, "Error closing report file '%s': %s\n", orep, strerror(errno));
+		status = 1;
+	}
+
+	// the mean needs at least one join and the sample deviation at least two;
+	// report 0 instead of dividing by zero
+	double avg = (joincnt > 0) ? tlen / (double) joincnt : 0.0;
+	double dev = (joincnt > 1)
+		? sqrt((((double)joincnt)*tlensq-pow((double)tlen,2)) / ((double)joincnt*((double)joincnt-1)) )
+		: 0.0;
+	printf("Total reads: %d\n", nrec);
+	printf("Total joined: %d\n", joincnt);
+	printf("Average join len: %.2f\n", avg);
+	printf("Stdev join len: %.2f\n", dev);
+	printf("Version: %s.%d\n", VERSION, SVNREV);
+
+	return status;
+}
+
+/* Write the fastq-join command-line help to stream f. */
+void usage(FILE *f) {
+	static const char help_text[] =
+"Usage: fastq-join [options] <read1.fq> <read2.fq> [mate.fq] -o <read.%.fq>\n"
+"\n"
+"Joins two paired-end reads on the overlapping ends.\n"
+"\n"
+"Options:\n"
+"\n"
+"-o FIL     See 'Output' below\n"
+"-v C       Verifies that the 2 files probe id's match up to char C\n"
+"            use ' ' (space) for Illumina reads\n"
+"-p N       N-percent maximum difference (8)\n"
+"-m N       N-minimum overlap (6)\n"
+"-r FIL     Verbose stitch length report\n"
+"-R         No reverse complement\n"
+"-x         Allow insert < read length\n"
+"\n"
+"Output: \n"
+"\n"
+"  You can supply 3 -o arguments, for un1, un2, join files, or one \n"
+"argument as a file name template.  The suffix 'un1, un2, or join' is \n"
+"appended to the file, or they replace a %-character if present.\n"
+"\n"
+"  If a 'mate' input file is present (barcode read), then the files\n"
+"'un3' and 'join2' are also created.\n"
+"\n"
+	;
+
+	/* sizeof counts the terminating NUL, which must not be written */
+	fwrite(help_text, 1, sizeof(help_text) - 1, f);
+}
diff --git a/fastq-lib.cpp b/fastq-lib.cpp
new file mode 100644
index 0000000..17f90ec
--- /dev/null
+++ b/fastq-lib.cpp
@@ -0,0 +1,375 @@
+/*
+Copyright (c) 2011 Expression Analysis / Erik Aronesty
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "fastq-lib.h"
+
+#ifdef __MAIN__
+// Placeholder entry point, compiled only when __MAIN__ is defined.
+// todo... put testing stuff in here, so the lib can be tested independently of the other components
+int main(int argc, char **argv) {
+	(void) argc;
+	(void) argv;
+	return 0;
+}
+#endif
+
+// Read the next line of `in` into l via getline(), which grows l.s / l.a as needed.
+// The getline() result is stored in l.n and returned.
+int read_line(FILE *in, struct line &l) {
+	l.n = getline(&l.s, &l.a, in);
+	return l.n;
+}
+
+// Read one record from `in` into *fq, reusing the buffers already held by *fq.
+//
+// A record whose header starts with '>' is read as FASTA: the header's first
+// character is rewritten to '@', the sequence is collected up to the next '>'
+// (whitespace dropped), and a "+" comment line plus an all-'h' quality string
+// of the same length are synthesized.  Anything else is read as four FASTQ
+// lines (id, seq, comment, qual), and the line ending (\n or \r\n) is stripped
+// from seq and qual.
+//
+// rno is the record number used in error messages; name, if non-NULL, is the
+// file name to mention in them.
+//
+// Returns 1 when a record was read, 0 at end of input, -1 for a malformed
+// record (a message is printed to stderr and the caller may continue).
+int read_fq(FILE *in, int rno, struct fq *fq, const char *name) {
+	bool is_fasta = false;
+
+	read_line(in, fq->id);
+	if (fq->id.n <= 0 || !fq->id.s)
+		return 0;					// nothing left to read
+
+	if (fq->id.s[0] == '>') {
+		is_fasta = true;
+		fq->id.s[0] = '@';
+		// read fasta instead
+		fq->seq.n = 0;					// the buffer is reused: start this record's sequence from scratch
+		int c = fgetc(in);				// int, so EOF cannot be confused with a data byte
+		while (c != '>' && c != EOF) {
+			if (fq->seq.a <= (size_t)(fq->seq.n+1)) {
+				fq->seq.s=(char *)realloc(fq->seq.s, fq->seq.a=(fq->seq.a+16)*2);
+			}
+			if (!isspace(c))
+				fq->seq.s[fq->seq.n++]=c;
+			c = fgetc(in);
+		}
+		if (c != EOF) {
+			ungetc(c, in);
+		}
+		if (!fq->seq.s) {
+			// header immediately followed by EOF: still need room for the terminator
+			fq->seq.s=(char *)malloc(fq->seq.a=16);
+		}
+		// make it look like a fastq
+		fq->qual.s=(char *)realloc(fq->qual.s, fq->qual.a=(fq->seq.n+1));
+		memset(fq->qual.s, 'h', fq->seq.n);
+		fq->qual.s[fq->qual.n=fq->seq.n]=fq->seq.s[fq->seq.n]='\0';
+		// reuse the comment buffer instead of allocating a new one per record
+		if (!fq->com.s || fq->com.a < 2)
+			fq->com.s=(char *)realloc(fq->com.s, fq->com.a=2);
+		fq->com.n=1;
+		strcpy(fq->com.s,"+");
+	} else {
+		read_line(in, fq->seq);
+		read_line(in, fq->com);
+		read_line(in, fq->qual);
+	}
+
+	if (fq->qual.n <= 0)
+		return 0;
+	if (fq->id.s[0] != '@' || fq->com.s[0] != '+' || fq->seq.n != fq->qual.n) {
+		const char *errtyp = (fq->seq.n != fq->qual.n) ?  "length mismatch" : fq->id.s[0] != '@' ? "no '@' for id" : "no '+' for comment";
+		if (name) {
+			fprintf(stderr, "Malformed fastq record (%s) in file '%s', line %d\n", errtyp, name, rno*2+1);
+		} else {
+			fprintf(stderr, "Malformed fastq record (%s) at line %d\n", errtyp, rno*2+1);
+		}
+		return -1;
+	}
+
+	// The fasta path stored no line terminator, so only fastq lines are chomped
+	// (chomping there would drop the last base and quality).
+	if (!is_fasta) {
+		// win32-safe chomp
+		fq->seq.s[--fq->seq.n] = '\0';
+		if (fq->seq.n > 0 && fq->seq.s[fq->seq.n-1] == '\r') {
+			fq->seq.s[--fq->seq.n] = '\0';
+		}
+		fq->qual.s[--fq->qual.n] = '\0';
+		if (fq->qual.n > 0 && fq->qual.s[fq->qual.n-1] == '\r') {
+			fq->qual.s[--fq->qual.n] = '\0';
+		}
+	}
+	return 1;
+}
+
+// Running quality statistics, one slot per "file number" passed to poorqual().
+struct qual_str {
+	long long int cnt;	// bases seen
+	long long int sum;	// sum of quality characters
+	long long int ssq;	// sum of squared quality characters
+	long long int ns;	// 'N' bases seen
+};
+
+// Static storage, so every counter starts out as zero.
+struct qual_str quals[MAX_FILENO_QUALS+1];
+
+// Close a stream obtained from gzopen(): piped streams (isgz set) go through
+// pclose(), ordinary files through fclose().  Returns that call's result.
+int gzclose(FILE *f, bool isgz) {
+	if (isgz) {
+		return pclose(f);
+	}
+	return fclose(f);
+}
+
+// Build the shell command  <prefix>'<f>'<suffix>  with f safely single-quoted:
+// every ' inside f is emitted as '\'' so the name cannot end the quoting.
+// Returns a malloc'd string, or NULL when out of memory.
+static char *gz_shell_cmd(const char *prefix, const char *f, const char *suffix) {
+	// worst case every character of f expands to 4 bytes; +3 for the two quotes and the NUL
+	char *cmd = (char *)malloc(strlen(prefix) + 4*strlen(f) + strlen(suffix) + 3);
+	if (!cmd)
+		return NULL;
+	char *w = cmd;
+	strcpy(w, prefix);
+	w += strlen(prefix);
+	*w++ = '\'';
+	for (; *f; ++f) {
+		if (*f == '\'') {
+			memcpy(w, "'\\''", 4);
+			w += 4;
+		} else {
+			*w++ = *f;
+		}
+	}
+	*w++ = '\'';
+	strcpy(w, suffix);
+	return cmd;
+}
+
+// Open file f with stdio mode m ("r..." or "w...").
+//
+// Names ending in ".gz" or ".zip" are opened as a pipe to/from
+// gzip/gunzip or zip/unzip and *isgz is set to true; everything else is
+// opened with fopen() and *isgz is set to false.  Pass the same flag to
+// gzclose() when done.
+//
+// Never returns NULL: on failure an error is printed to stderr and the
+// process exits with status 1.
+FILE *gzopen(const char *f, const char *m, bool*isgz) {
+	// maybe use zlib some day?
+	FILE *h;
+	const char * ext = fext(f);
+	const bool writing = strchr(m,'w') != NULL;
+	const bool is_gz = !strcmp(ext,".gz");
+	const bool is_zip = !is_gz && !strcmp(ext,".zip");
+
+	if (is_gz || is_zip) {
+		char *cmd;
+		if (is_gz) {
+			cmd = writing ? gz_shell_cmd("gzip --rsyncable > ", f, "")
+			              : gz_shell_cmd("gunzip -c ", f, "");
+		} else {
+			cmd = writing ? gz_shell_cmd("zip -q ", f, " -")
+			              : gz_shell_cmd("unzip -p ", f, "");
+		}
+		if (!cmd) {
+			fprintf(stderr, "Error opening file '%s': %s\n",f, strerror(ENOMEM));
+			exit(1);
+		}
+		h = popen(cmd, m);
+		*isgz=1;
+		free(cmd);
+	} else {
+		h = fopen(f, m);
+		*isgz=0;
+	}
+	if (!h) {
+		fprintf(stderr, "Error opening file '%s': %s\n",f, strerror(errno));
+		exit(1);
+	}
+	return h;
+}
+
+// Return the extension of f: a pointer to the last '.' in f (the dot is
+// included), or an empty string when f contains no '.'.
+const char *fext(const char *f) {
+	const char *dot = strrchr(f, '.');
+	if (dot == NULL) {
+		return "";
+	}
+	return dot;
+}
+
+// Decide whether a read is poor quality, while accumulating statistics for
+// "file number" n in quals[n].
+//
+//   n  index into quals[] (0..MAX_FILENO_QUALS)
+//   l  number of bases in s / quality characters in q
+//   s  sequence, q  raw quality characters
+//
+// Until 20000 bases have been seen for n, a read is poor when its mean raw
+// quality minus 33 is below 18 or it holds more than one 'N'.  After that the
+// threshold is derived from the running mean and deviation, and a read is also
+// poor when it has more than one 'N' above the running average for its length.
+// An empty read (l <= 0) is reported as poor and leaves the statistics alone.
+bool poorqual(int n, int l, const char *s, const char *q) {
+        // nothing to average: avoid dividing by l below
+        if (l <= 0)
+            return 1;
+
+        int i=0, sum=0, ns=0;
+        for (i=0;i<l;++i) {
+                if (s[i] == 'N')
+                    ++ns;
+                quals[n].cnt++;
+                quals[n].ssq += q[i] * q[i];
+                sum+=q[i];
+        }
+        quals[n].sum += sum;
+        quals[n].ns += ns;
+        int xmean = sum/l;
+        if (quals[n].cnt < 20000) {
+            // mean qual < 18 = junk
+            return ((xmean-33) < 18) || (ns > 1);
+        }
+        // enough data? use stdev
+        int pmean = quals[n].sum / quals[n].cnt;                                // mean q
+        double pdev = stdev(quals[n].cnt, quals[n].sum, quals[n].ssq);          // dev q
+        int serr = min(pmean/2,max(1,pdev/sqrt(l)));                                         // stderr for length l
+        // mean qual < min(18,pmean-serr*3) = junk/skip it
+        // cap low qual, because adapters often are low qual
+        // but you still need to calculate something, in case we're doing ion/pacbio
+        int thr = min((33+18), (pmean - serr * 3));
+        if (xmean < thr) {
+            return 1;                                                       // ditch it
+        }
+        if (ns > (1+(l*quals[n].ns / quals[n].cnt))) {                          // 1 more n than average?
+            return 1;                                                       // ditch it
+        }
+        return 0;
+}
+
+#define comp(c) ((c)=='A'?'T':(c)=='a'?'t':(c)=='C'?'G':(c)=='c'?'g':(c)=='G'?'C':(c)=='g'?'c':(c)=='T'?'A':(c)=='t'?'a':(c))
+
+// Store the reverse complement of s in d: the sequence is reversed with each
+// base complemented (other characters are kept), the quality string is reversed.
+// d's seq/qual buffers are allocated or grown as needed; its id and comment
+// are not touched.  Each step reads both ends of a pair before writing them,
+// so d and s may be the same record.
+void revcomp(struct fq *d, struct fq *s) {
+        if (!d->seq.s) {
+                d->seq.s=(char *) malloc(d->seq.a=s->seq.n+1);
+                d->qual.s=(char *) malloc(d->qual.a=s->qual.n+1);
+        } else if (d->seq.a <= s->seq.n) {
+                d->seq.s=(char *) realloc(d->seq.s, d->seq.a=(s->seq.n+1));
+                d->qual.s=(char *) realloc(d->qual.s, d->qual.a=(s->qual.n+1));
+        }
+
+        const int len = s->seq.n;
+        const int half = len/2;
+        int lo;
+        for (lo = 0; lo < half; ++lo) {
+                const int hi = len-lo-1;
+                // remember the front pair before it is overwritten
+                const char front_base = s->seq.s[lo];
+                const char front_qual = s->qual.s[lo];
+                d->seq.s[lo] = comp(s->seq.s[hi]);
+                d->qual.s[lo] = s->qual.s[s->qual.n-lo-1];
+                d->seq.s[hi] = comp(front_base);
+                d->qual.s[hi] = front_qual;
+        }
+        if (len % 2) {
+                // odd length: the middle base stays in place and is only complemented
+                d->seq.s[half] = comp(s->seq.s[half]);
+                d->qual.s[half] = s->qual.s[half];
+        }
+        d->seq.n = len;
+        d->qual.n = s->qual.n;
+        d->seq.s[len] = '\0';
+        d->qual.s[len] = '\0';
+}
+
+// Release the buffer owned by l and reset the pointer; a NULL l is ignored.
+// The n and a fields are left as they were.
+void free_line(struct line *l) {
+   if (l == NULL)
+       return;
+   free(l->s);          // free(NULL) is a no-op
+   l->s = NULL;
+}
+
+// Release the four line buffers (id, seq, com, qual) of f; a NULL f is ignored.
+void free_fq(struct fq *f) {
+    if (f == NULL)
+        return;
+    struct line *parts[] = { &f->id, &f->seq, &f->com, &f->qual };
+    for (size_t k = 0; k < sizeof(parts)/sizeof(parts[0]); ++k) {
+        free_line(parts[k]);
+    }
+}
+
+
+/* getline.c -- Replacement for GNU C library function getline
+
+Copyright (C) 1993 Free Software Foundation, Inc.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/* Written by Jan Brittenson, bson at gnu.ai.mit.edu.  */
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+
+/* Read up to (and including) a TERMINATOR from STREAM into *LINEPTR
+   + OFFSET (and null-terminate it). *LINEPTR is a pointer returned from
+   malloc (or NULL), pointing to *N characters of space.  It is realloc'd
+   as necessary.  Return the number of characters read (not including the
+   null terminator), or -1 on error or EOF.
+
+   If growing the buffer fails, -1 is returned and *LINEPTR / *N still
+   describe the old, valid buffer.  A negative OFFSET is rejected with -1.  */
+
+int getstr (char ** lineptr, size_t *n, FILE * stream, char terminator, int offset)
+{
+  size_t used;			/* Bytes of *LINEPTR in use, OFFSET included.  */
+
+  if (!lineptr || !n || !stream || offset < 0)
+    return -1;
+
+  if (!*lineptr)
+    {
+      *n = 64;
+      *lineptr = (char *) malloc (*n);
+      if (!*lineptr)
+	return -1;
+    }
+
+  used = (size_t) offset;
+
+  for (;;)
+    {
+      int c = getc (stream);
+
+      /* Keep room for two bytes: the character about to be stored and
+	 the NUL written after the loop.  Reserving only one lets the NUL
+	 land past the end when a line exactly fills the buffer.  */
+      if (used + 2 > *n)
+	{
+	  size_t want = *n;
+	  char *grown;
+
+	  do
+	    {
+	      if (want > 64)
+		want *= 2;
+	      else
+		want += 64;
+	    }
+	  while (used + 2 > want);
+
+	  grown = (char *) realloc (*lineptr, want);
+	  if (!grown)
+	    return -1;
+	  *lineptr = grown;
+	  *n = want;
+	}
+
+      if (c == EOF || ferror (stream))
+	{
+	  /* Return partial line, if any.  */
+	  if (used == 0)
+	    return -1;
+	  else
+	    break;
+	}
+
+      (*lineptr)[used++] = c;
+
+      if (c == terminator)
+	/* Return the line.  */
+	break;
+    }
+
+  /* Done - NUL terminate and return the number of chars read.  */
+  (*lineptr)[used] = '\0';
+
+  return (int) (used - (size_t) offset);
+}
+
+#if !defined(__GNUC__) || defined(__APPLE__) || defined(WIN32)
+
+/* getline() replacement: read one '\n'-terminated line via getstr(),
+   storing it at the start of the buffer.  */
+ssize_t getline(char **lineptr, size_t *n, FILE *stream)
+{
+  const char newline = '\n';
+  const int at_start = 0;
+
+  return getstr (lineptr, n, stream, newline, at_start);
+}
+
+/* 
+ * public domain strtok_r() by Charlie Gordon
+ *
+ *   from comp.lang.c  9/14/2007
+ *
+ *      http://groups.google.com/group/comp.lang.c/msg/2ab1ecbb86646684
+ *
+ *     (Declaration that it's public domain):
+ *      http://groups.google.com/group/comp.lang.c/msg/7c7b39328fefab9c
+ */
+
+/*
+ * Reentrant tokenizer: returns the next token of str delimited by any
+ * character in delim, or NULL when none is left.  Pass the string on the
+ * first call and NULL afterwards; *nextp carries the position between calls.
+ * The string is modified: the delimiter ending each token becomes '\0'.
+ */
+char* strtok_r(char *str, const char *delim, char **nextp) 
+{
+    char *ret;
+
+    if (str == NULL) {
+        str = *nextp;
+    }
+
+    str += strspn(str, delim);
+
+    if (*str == '\0'){
+        /* remember the end of the string, so a later call with NULL
+           does not read an unset *nextp */
+        *nextp = str;
+        return NULL;
+    }
+	
+    ret = str;
+    str += strcspn(str, delim);
+
+    if (*str) {
+        *str++ = '\0';
+    }
+
+    *nextp = str;
+    return ret;
+}
+
+#endif
+
+
diff --git a/fastq-lib.h b/fastq-lib.h
new file mode 100644
index 0000000..d602ccd
--- /dev/null
+++ b/fastq-lib.h
@@ -0,0 +1,113 @@
+/*
+Copyright (c) 2011 Expression Analysis / Erik Aronesty
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// 32-bit o/s support
+#if defined(__i386__)
+	#define _FILE_OFFSET_BITS 64
+#endif
+
+// standard libs
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <getopt.h>
+#include <assert.h>
+#include <math.h>
+#include <sys/stat.h>
+#include <search.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#if defined(__APPLE__)
+	#define getopt(a,b,c) getopt_long(a,b,c,NULL,NULL)
+#endif
+
+// misc useful macros
+// (function-like macros: an argument may be evaluated more than once, so pass
+// side-effect-free expressions; every argument is parenthesized in the expansion)
+#define max(a,b) ((a)>(b)?(a):(b))
+#define min(a,b) ((a)<(b)?(a):(b))
+// zero an object in place (pass the object itself, not a pointer to it)
+#define meminit(l) (memset(&(l),0,sizeof(l)))
+// printf-style message to stderr, then exit(1)
+#define fail(s,...) ((fprintf(stderr,s,##__VA_ARGS__), exit(1)))
+// printf-style message to stderr
+#define warn(s,...) ((fprintf(stderr,s,##__VA_ARGS__)))
+// sample standard deviation from a count, a sum and a sum of squares (cnt must be > 1)
+#define stdev(cnt, sum, ssq) sqrt((((double)(cnt))*(ssq)-pow((double)(sum),2)) / ((double)(cnt)*((double)(cnt)-1)))
+
+// maximum number of files that can be tracked by poorquals lib
+#define MAX_FILENO_QUALS 6
+
+// read line, read fq
+
+// A reusable text buffer in the form getline() works with.
+typedef struct line {
+        char *s;        // the text (handed to getline as *lineptr)
+        int n;          // length of the text, or the last getline() result
+        size_t a;       // allocated size of s
+} line;
+
+// One fastq record, one buffer per line of the record.
+struct fq {
+        line id;        // '@' header line
+        line seq;       // bases
+        line com;       // '+' comment line
+        line qual;      // quality characters
+};
+
+
+void free_line(struct line *l);		// frees l->s and resets it to NULL; a NULL l is ignored
+void free_fq(struct fq *fq);		// frees the id, seq, com and qual buffers; a NULL fq is ignored
+
+// not GNU?  probably no getline & strtok_r, so fastq-lib.cpp supplies replacements
+#if !defined( __GNUC__) || defined(WIN32) || defined(__APPLE__)
+	ssize_t getline(char **lineptr, size_t *n, FILE *stream);
+	char* strtok_r(char *str, const char *delim, char **nextp);
+#endif
+    
+// get file extension: from the last '.' to the end of f, or "" when f has no '.'
+const char *fext(const char *f);
+
+// read fq
+int read_line(FILE *in, struct line &l);                // returns the getline() result, which is also stored in l.n
+int read_fq(FILE *in, int rno, struct fq *fq, const char *name=NULL);          // 0=done, 1=ok, -1=err+continue
+void free_fq(struct fq *fq);
+
+// open a file, possibly compressed (.gz or .zip, decided by the extension), exit on failure; *isgz is set when the stream is a pipe
+FILE *gzopen(const char *in, const char *mode, bool *isgz);
+int gzclose(FILE *f, bool isgz);		// isgz selects pclose() over fclose()
+
+// keep track of poor quals (n == "file number", maybe should have persistent stat struct instead?)
+bool poorqual(int n, int l, const char *s, const char *q);
+
+// Count the differences between a and b over at most n leading characters.
+// Comparison stops once n positions are checked or either string ends; every
+// position left unchecked because a string ended early counts as a difference.
+inline int hd(char *a, char *b, int n) {
+        int diffs = 0;
+        int left = n;
+        for (; *a && *b && left > 0; --left, ++a, ++b) {
+                diffs += (*a != *b) ? 1 : 0;
+        }
+        return diffs + left;
+}
+
+// reverse complement an fq entry into a blank (memset 0) one
+void revcomp(struct fq *dest, struct fq* src);
+
+
diff --git a/fastq-mcf.c b/fastq-mcf.c
new file mode 100644
index 0000000..ee25914
--- /dev/null
+++ b/fastq-mcf.c
@@ -0,0 +1,1697 @@
+/*
+Copyright (c) 2011 Expression Analysis / Erik Aronesty
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* 
+
+See "void usage" below for usage.
+
+*/
+
+#include <google/sparse_hash_map> // or sparse_hash_set, dense_hash_map, ...
+#include <string>
+
+#include "fastq-lib.h"
+
+#define VERSION "1.04"
+#define SVNREV atoi(strchr("$LastChangedRevision: 676 $", ':')+1)
+
+#define MAX_ADAPTER_NUM 1000
+#define SCANLEN 15
+#define SCANMIDP ((int) SCANLEN/2)
+#define MAX_FILES 5
+#define MAX_REF 10
+#define B_A     0
+#define B_C     1
+#define B_G     2
+#define B_T     3
+#define B_N     4
+#define B_CNT   5
+#define MAXWARN 10
+#define MAX_PHRED 100
+
+struct ad {					// one adapter plus its per-input-file sampling results
+	char *id;  int nid;  size_t naid; 	// adapter name (presumably filled in by read_fa - TODO confirm)
+	char *seq; int nseq; size_t naseq;	// adapter sequence; nseq is used as its length
+	char escan[SCANLEN+1]; 			// scan sequence: first SCANLEN chars of seq
+	int bcnt[MAX_FILES];			// number of reads whose leading chars were found in seq
+	int bcntz[MAX_FILES];			// ... of those, matches running to the end of seq
+	int ecnt[MAX_FILES];			// number of reads in which escan was found
+	int ecntz[MAX_FILES];			// ... of those, matches at the very end of the read
+
+	char end[MAX_FILES];			// 'b' or 'e'
+	int thr[MAX_FILES];			// min-length for clip
+};
+
+int read_fa(FILE *in, int rno, struct ad *ad);		// 0=done, 1=ok, -1=err+continue
+int meanqwin(const char *q, int qn, int i, int w);     // mean quality within window win, at position i
+bool evalqual(struct fq &fq, int file_num);
+
+int char2bp(char c);
+char bp2char(int b);
+void saveskip(FILE **fout, int fo_n, struct fq *fq);
+
+void valid_arg(char c, const char *a);
+
+void usage(FILE *f, const char *msg=NULL);
+int debug=0;
+int warncount = 0;
+
+// used to filter out other genomes, spike in controls, etc
+
+const char *cmd_align_se = "bowtie -S %i -f %1";
+const char *cmd_align_pe = "bowtie -S %i -1 %1 -2 %2";
+
+// quality filter args
+int qf_mean=0, qf_max_ns=-1, qf_xgt_num=0, qf_xgt_min=0, qf_max_n_pct=-1;	// set by --qual-mean/--max-ns/--qual-gt (which also set the qf2_ values)
+int qf2_mean=0, qf2_max_ns=-1, qf2_xgt_num=0, qf2_xgt_min=0, qf2_max_n_pct=0;	// also set by the --mate-* options; NOTE(review): qf2_max_n_pct starts at 0 but qf_max_n_pct at -1 - confirm intended
+
+// qual adjust
+class adjustment {
+public:
+    int pos;    // cycle to adjust: >0 is 1-based from the read start, <0 counts back from the end
+    int adj;    // amount added to the quality char at that cycle
+    adjustment() : pos(0), adj(0) {}
+};
+std::vector<adjustment> cycle_adjust;
+int phred_adjust[MAX_PHRED];
+int phred_adjust_max=0;
+bool have_phred_adjust=false;
+
+std::string arg2cmdstr(int argc, char** argv);
+
+// phred used
+char phred = 0;
+
+google::sparse_hash_map <std::string, int> dupset;
+int dupmax = 40000000;              // this should be configurable, but right now it isn't
+int max_in_buffer = 2400000;        // line cap copied into each inbuffer at construction; -C raises it to sampcnt*8 if larger
+
+// FILE* line reader that caches what it reads so reset() can replay it.
+class inbuffer {
+    int max_buf;                    // cache limit in lines; 0 once the cache is dropped
+public:
+    inbuffer() {fin=0; gz=0; bp=0; max_buf=max_in_buffer;}
+    ~inbuffer() {close();}
+    FILE *fin;                      // underlying stream, NULL after close()
+    bool gz;                        // true: close() uses pclose() rather than fclose()
+    int bp;                         // index in buf of the next line to replay
+    std::vector<std::string> buf;   // lines read so far, kept for replay
+
+    // getline(3)-like: replay a pending cached line, else read one from fin and
+    // cache it while the limit allows. Returns the line length, or ::getline's -1.
+    ssize_t getline(char **lineptr, size_t *n) {
+        if (bp < buf.size()) {
+            int l=buf[bp].length();         // length without null char
+            if (!*lineptr || *n < (l+1)) {
+                // alloc with room for null
+                *lineptr=(char*)realloc(*lineptr,*n=(l+1));
+            }
+            memcpy(*lineptr,buf[bp].data(),l);
+            (*lineptr)[l]='\0';
+            ++bp;
+            return l;
+        }
+        int l=::getline(lineptr, n, fin);
+        if (max_buf > 0) {
+            if (buf.size() > max_buf) {
+                // over the limit: drop the cache for good (reset() will now assert)
+                if (debug) fprintf(stderr, "Clearing buffer at %d lines\n", (int) buf.size());
+                buf.resize(0);
+                bp=0;
+                max_buf = 0;
+            } else if (l > 0) {
+                // keep all l bytes so a replay matches the read even with an embedded NUL
+                buf.push_back(std::string(*lineptr, l));
+                ++bp;
+            }
+        }
+        return l;
+    }
+
+    // Read one fastq record; parsed here while cached lines are pending, else
+    // handed to ::read_fq(). 0=done, 1=ok, -1=err+continue
+    int read_fq(int rno, struct fq *fq, const char *name=NULL) {
+        if (bp >= buf.size())
+            return ::read_fq(fin, rno, fq, name);
+        fq->id.n=getline(&fq->id.s, &fq->id.a);
+        fq->seq.n=getline(&fq->seq.s, &fq->seq.a);
+        fq->com.n=getline(&fq->com.s, &fq->com.a);
+        fq->qual.n=getline(&fq->qual.s, &fq->qual.a);
+        if (fq->qual.n <= 0)
+            return 0;
+
+        if (fq->id.s[0] != '@' || fq->com.s[0] != '+' || fq->seq.n != fq->qual.n) {
+            const char *errtyp = (fq->seq.n != fq->qual.n) ?  "length mismatch" : fq->id.s[0] != '@' ? "no '@' for id" : "no '+' for comment";
+            if (name) {
+                fprintf(stderr, "Malformed fastq record (%s) in file '%s', line %d\n", errtyp, name, rno*2+1);
+            } else {
+                fprintf(stderr, "Malformed fastq record (%s) at line %d\n", errtyp, rno*2+1);
+            }
+            return -1;
+        }
+        // chomp the newline, then an optional CR; the n > 0 test keeps a line
+        // that held only "\n" from reading (and possibly writing) s[-1]
+        fq->seq.s[--fq->seq.n] = '\0';
+        if (fq->seq.n > 0 && fq->seq.s[fq->seq.n-1] == '\r') {
+            fq->seq.s[--fq->seq.n] = '\0';
+        }
+        fq->qual.s[--fq->qual.n] = '\0';
+        if (fq->qual.n > 0 && fq->qual.s[fq->qual.n-1] == '\r') {
+            fq->qual.s[--fq->qual.n] = '\0';
+        }
+        return fq->qual.n > 0;
+    }
+    // Rewind to the first cached line; asserts that caching is still enabled.
+    void reset() {
+        assert(max_buf > 0);
+        bp=0;
+    }
+    // True when buf holds at least max_buf lines (always true once the cache is dropped).
+    bool full() {
+        return buf.size()>=max_buf;
+    }
+    // Close fin if still open; returns the pclose()/fclose() result, or 1 if nothing was open.
+    int close() {
+        int ret=true;
+        if (fin) {
+            ret = gz ? pclose(fin) : fclose(fin);
+            fin=NULL;
+        }
+        return ret;
+    }
+};
+
+int main (int argc, char **argv) {
+	char c;
+	bool eol;
+	int nmin = 1, nkeep = 19, nmax=0;
+    int qf2_min_len=0;
+	float minpct = 0.25;
+	int pctdiff = 10;
+	int sampcnt = 300000;			// # of reads to sample to determine adapter profile, and base skewing
+	int xmax = -1;
+	float scale = 2.2;
+	int noclip=0;
+	char end[MAX_FILES]; meminit(end);
+	float skewpct = 2; 			// any base at any position is less than skewpct of reads
+	float pctns = 20;			// any base that is more than 20% n's
+	bool rmns = 1;				// remove n's at the end of the read
+	int qthr = 7;				// remove end of-read with quality < qthr
+	int qwin = 1;				// remove end of read with mean quality < qthr
+	int ilv3 = -1;
+	int duplen = 0;
+	int dupskip = 0;
+    bool noexec = 0;
+    bool hompol_filter = 0;
+    bool lowcom_filter = 0;
+    float hompol_pct = .92;
+    float lowcom_pct = .90;
+
+    dupset.set_deleted_key("<>");
+
+	int i;
+
+	char *afil = NULL;
+	char *ifil[MAX_FILES]; meminit(ifil);
+	const char *ofil[MAX_FILES]; meminit(ofil);
+	int i_n = 0;
+	int o_n = 0;
+	int e_n = 0;
+	bool skipb = 0;
+	char *fref[MAX_REF]; meminit(fref); 
+	int fref_n = 0;
+    char *qspec = NULL;
+
+    static struct option long_options[] = {
+       {"qual-mean", 1, 0, 0},
+       {"max-ns", 1, 0, 0},
+       {"qual-gt", 1, 0, 0},
+       {"min-len", 1, 0, 'l'},
+       {"cycle-adjust", 1, 0, 0},
+       {"phred-adjust", 1, 0, 0},
+       {"phred-adjust-max", 1, 0, 0},
+       {"mate-qual-mean", 1, 0, 0},
+       {"mate-max-ns", 1, 0, 0},
+       {"mate-qual-gt", 1, 0, 0},
+       {"mate-min-len", 1, 0, 0},
+       {"homopolymer-pct", 1, 0, 0},
+       {"lowcomplex-pct", 1, 0, 0},
+       {0, 0, 0, 0}
+    };
+
+    meminit(phred_adjust);
+
+    int option_index = 0;
+    while (	(c = getopt_long(argc, argv, "-nf0uXUVHSRdbehp:o:l:s:m:t:k:x:P:q:L:C:w:F:D:",long_options,&option_index)) != -1) {
+		switch (c) {
+			case '\0':
+                { 
+                    const char *oname=long_options[option_index].name;
+                    if(!strcmp(oname,        "qual-mean")) {
+                        qf_mean=qf2_mean=atoi(optarg);
+                    } else if(!strcmp(oname, "mate-qual-mean")) {
+                        qf2_mean=atoi(optarg);
+                    } else if(!strcmp(oname, "homopolymer-pct")) {
+                        hompol_pct=atof(optarg)/100.0;
+                        hompol_filter=1;
+                    } else if(!strcmp(oname, "lowcomplex-pct")) {
+                        lowcom_pct=atof(optarg)/100.0;
+                        lowcom_filter=1;
+                    } else if(!strcmp(oname, "qual-gt")) {
+                        if (!strchr(optarg, ',')) {
+                            fprintf(stderr, "Error, %s requires NUM,THR as argument\n", oname);
+                            exit(1);
+                        }
+                        qf_xgt_num=qf2_xgt_num=atoi(optarg);
+                        qf_xgt_min=qf2_xgt_min=atoi(strchr(optarg, ',')+1);
+                    } else if(!strcmp(oname, "mate-qual-gt")) {
+                        if (!strchr(optarg, ',')) { 
+                            fprintf(stderr, "Error, %s requires NUM,THR as argument\n", oname);
+                            exit(1);
+                        }
+                        qf2_xgt_num=atoi(optarg);
+                        qf2_xgt_min=atoi(strchr(optarg, ',')+1);
+                    } else if(!strcmp(oname, "cycle-adjust")) {
+                        if (!strchr(optarg, ',')) {
+                            fprintf(stderr, "Error, %s requires CYC,ADJ as argument\n", oname);
+                            exit(1);
+                        }
+                        adjustment a;
+                        a.pos=atoi(optarg);
+                        a.adj=atoi(strchr(optarg, ',')+1);
+                        cycle_adjust.push_back(a);
+                    } else if(!strcmp(oname, "phred-adjust-max")) {
+                        phred_adjust_max=atoi(optarg);
+                    } else if(!strcmp(oname, "phred-adjust")) {
+                        if (!strchr(optarg, ',')) {
+                            fprintf(stderr, "Error, %s requires CYC,ADJ as argument\n", oname);
+                            exit(1);
+                        }
+                        int phred=atoi(optarg);
+                        int adj=atoi(strchr(optarg, ',')+1);
+                        assert(phred<MAX_PHRED && phred >= 0);
+                        if (adj)
+                            have_phred_adjust=true;
+                        phred_adjust[phred]=adj;
+                    } else if(!strcmp(oname, "max-ns")) {
+                        if (strchr(optarg,'%')) {
+                            qf_max_n_pct=atoi(optarg);
+                            qf2_max_n_pct=atoi(optarg);
+                        } else {
+                            qf_max_ns=atoi(optarg);
+                            qf2_max_ns=atoi(optarg);
+                        }
+
+                    } else if(!strcmp(oname, "mate-max-ns")) {
+                        if (strchr(optarg,'%')) {
+                            qf2_max_n_pct=atoi(optarg);
+                        } else {
+                            qf2_max_ns=atoi(optarg);
+                        }
+                    } else if(!strcmp(oname, "mate-min-len")) {
+                        qf2_min_len=atoi(optarg);
+                    }
+                    break;
+                }
+			case '\1': 
+				if (!afil) 
+					afil = optarg; 
+				else if (i_n<MAX_FILES) 
+					ifil[i_n++] = optarg; 
+				else {
+					usage(stderr, "Too many input files."); return 1;
+				}
+				break;
+			case 't': minpct = atof(optarg); break;
+			case 'm': nmin = atoi(optarg); break;
+			case 'l': nkeep = atoi(optarg); break;
+			case 'L': nmax = atoi(optarg); break;
+			case '0': nmax=0; skewpct=0; pctns=0; rmns=0; qthr=0; nkeep=0; ilv3=-1;  break;
+			case 'u': ilv3=1; break;
+			case 'U': ilv3=0; break;
+			case 'H': hompol_filter=1; break;
+			case 'X': lowcom_filter=1; break;
+			case 'k': skewpct = atof(optarg); break;
+			case 'q': qthr = atoi(optarg); valid_arg(c,optarg); break;
+			case 'Q': qspec = optarg; break;
+			case 'w': qwin = atoi(optarg); break;
+			case 'C': sampcnt = atoi(optarg); if (sampcnt*8 > max_in_buffer) max_in_buffer = sampcnt * 8; break;
+			case 'F': fref[fref_n++] = optarg; break;
+			case 'x': pctns = atof(optarg); break;
+			case 'R': rmns = false; break;
+			case 'V': printf("Version: %s.%d\n", VERSION, SVNREV); return 0; break;
+			case 'p': pctdiff = atoi(optarg); break;
+			case 'P': phred = (char) atoi(optarg); break;
+			case 'D': duplen = atoi(optarg); break;
+			case 'h': usage(stdout); return 1; 
+			case 'o': if (!o_n < MAX_FILES) 
+						  ofil[o_n++] = optarg;
+					  break;
+			case 's': scale = atof(optarg); break;
+			case 'S': skipb = 1; break;
+			case 'i': if (i_n<MAX_FILES)
+						  ifil[i_n++] = optarg; 
+					  else
+						  return usage(stderr, "Too many input files."), 1;
+					  break;
+			case 'n': noclip = 1; break;
+			case 'd': ++debug; break;
+			case 'b': end[e_n++] = 'b'; break;
+			case 'e': end[e_n++] = 'e'; break;
+			case '?': 
+					  if (strchr("polsmtkx", optopt))
+						  fprintf (stderr, "Option -%c requires an argument.\n", optopt);
+					  else if (isprint(optopt))
+						  fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+					  else
+						  fprintf (stderr,
+								  "Unknown option character `\\x%x'.\n",
+								  optopt);
+					  usage(stderr);
+					  return 1;
+		}
+	}
+
+    if (duplen > 75) {
+		fprintf(stderr, "WARNING: duplen of %d is probably too long, do you really need it?\n", duplen);
+    }
+
+	if (i_n == 1 && o_n == 0) {
+		ofil[o_n++]="-";
+	}
+
+	if (!noclip && o_n != i_n) {
+		fprintf(stderr, "Error: number of input files must match number of '-o' output files.\n");
+		return 1;
+	}
+
+	if (argc < 3 || !afil || !i_n) {
+		usage(stderr);
+		return 1;
+	}
+
+	FILE *ain = NULL;
+	if (strcasecmp(afil, "n/a") && strcasecmp(afil, "/dev/null") && strcasecmp(afil, "NUL")) {
+		ain = fopen(afil, "r");
+		if (!ain) {
+			fprintf(stderr, "Error opening adapter file '%s': %s\n",afil, strerror(errno));
+			return 1;
+		}
+	}
+
+	FILE *fstat = stderr;
+	if (!noclip && strcmp(ofil[0], "-")) {
+		fstat = stdout;
+	}
+	if (noclip) {
+		fstat = stdout;
+	}
+
+	fprintf(fstat, "Command Line: %s\n", arg2cmdstr(argc, argv).c_str());
+
+	FILE *fout[MAX_FILES]; meminit(fout);
+	bool gzout[MAX_FILES]; meminit(gzout);
+    inbuffer fin[MAX_FILES];
+
+   // if (debug) fprintf(stderr,"i_n:%d, ifil[0]:%s\n",i_n, ifil[0]);
+
+	for (i=0;i<i_n;++i) {
+	    if ((i_n==1) && !strcmp(ifil[0], "-")) {
+            fin[i].fin=stdin;
+            fin[i].gz=0;
+        } else {
+            fin[i].fin=gzopen(ifil[i], "r", &fin[i].gz);
+        }
+	}
+
+	struct ad ad[MAX_ADAPTER_NUM+1];
+	memset(ad, 0, sizeof(ad));
+
+	int acnt=0, ok=0, rno=0;	// adapter count, ok flag, record number
+
+	if (ain) {
+		while (acnt < MAX_ADAPTER_NUM && (ok = read_fa(ain, rno, &ad[acnt]))) {
+			++rno;
+			if (ok < 0)
+				break;
+			// copy in truncated to max scan length
+			strncpy(ad[acnt].escan, ad[acnt].seq, SCANLEN);
+			ad[acnt].escan[SCANLEN] = '\0';
+			//fprintf(stderr, "escan: %s, %s\n", ad[acnt].id, ad[acnt].escan);
+			++acnt;
+		}
+
+		if (acnt == 0) {
+			fprintf(stderr, "No adapters in file '%s'\n",afil);
+		}
+	}
+
+	fprintf(fstat, "Scale used: %g\n", scale);
+	int maxns = 0;						// max sequence length
+	int avgns[MAX_FILES]; meminit(avgns);			// average sequence length per file
+	// read length
+	for (i=0;i<i_n;++i) {
+
+		char *s = NULL; size_t na = 0; int nr = 0, ns = 0, totn[MAX_FILES]; meminit(totn);
+		char *q = NULL; size_t naq = 0; int nq =0;
+		int j;
+		int ilv3det=2;
+        int skipped = 0;
+
+        struct stat st;
+        stat(ifil[i], &st);
+
+		while (fin[i].getline(&s, &na) > 0) {
+			if (*s == '@')  {
+				// look for illumina purity filtering flags
+				if (ilv3det==2) {
+					ilv3det=0;
+					const char *p=strchr(s, ':');
+					if (p) {
+						++p;
+						if (isdigit(*p)) {
+							p=strchr(s, ' ');
+							if (p) {
+								++p;
+								if (isdigit(*p)) {
+									++p;
+									if (*p ==':') {
+										++p;
+										if (*p =='Y') {
+											// filtering found
+											ilv3det=1;
+										} else if (*p =='N') {
+											// still illumina
+											ilv3det=2;
+										}
+									}
+								}
+							}
+						}
+					}
+				}
+
+				if ((ns=fin[i].getline(&s, &na)) <=0) {
+					// reached EOF
+					if (debug) fprintf(stderr, "Dropping out of sampling loop\n");
+					break;
+				}
+
+				nq=fin[i].getline(&q, &naq);
+				nq=fin[i].getline(&q, &naq);		// qual is 2 lines down
+
+				// skip poor quals/lots of N's when doing sampling
+				if (st.st_size > (sampcnt * 500) && (skipped < sampcnt) && poorqual(i, ns, s, q)) {
+					if (debug) fprintf(stderr, "Skip poorqual\n");
+                    ++skipped;
+					continue;
+                }
+
+				if (phred == 0) {
+					--nq;
+					for (j=0;j<nq;++j) {
+						if (q[j] < 64) {
+							if (debug) fprintf(stderr, "Using phred 33, because saw: %c\n", q[j]);
+							// default to sanger 33, if you see a qual < 64
+							phred = 33;
+							break;
+						}
+					}
+				}
+				--ns;                                   // don't count newline for read len
+				++nr;
+				avgns[i] += ns;
+				if (ns > maxns) maxns = ns;
+
+				// just 10000 reads for readlength sampling
+				if (nr >= 10000) {	
+					if (debug) fprintf(stderr, "Read 10000\n");
+					break;
+				}
+			} else {
+				fprintf(stderr, "Invalid FASTQ format : %s\n", ifil[i]);
+				break;
+			}
+		}
+		if (ilv3det == 1 && (ilv3 == -1)) {
+			ilv3=1;
+		}
+		if (debug) fprintf(stderr,"Ilv3det: %d\n", ilv3det);
+		if (s) free(s);
+		if (q) free(q);
+		if (nr)
+			avgns[i] = avgns[i]/nr;
+	}
+
+	if (ilv3 == -1) {
+		ilv3 = 0;
+	}
+
+	if (ilv3) {
+		fprintf(fstat, "Filtering Illumina reads on purity field\n");
+	}
+
+	// default to illumina 64 if you never saw a qual < 33
+	if (phred == 0) phred = 64;
+	fprintf(fstat, "Phred: %d\n", phred);
+
+	for (i=0;i<i_n;++i) {
+		if (avgns[i] == 0) {
+			fprintf(stderr, "No records in file %s\n", ifil[i]);
+			exit(1);
+		}
+	}
+
+	for (i=0;i<i_n;++i) {
+        fin[i].reset();
+	}
+
+	if (debug) fprintf(stderr,"Max ns: %d, Avg[0]: %d\n", maxns, avgns[0]);
+
+	// total base count per read position in sample
+	int balloc = maxns;
+	bool dobcnt = 1;
+	if (maxns > 500) {
+		dobcnt = 0;
+		balloc = 1;
+	}
+
+	int bcnt[MAX_FILES][2][balloc][6]; meminit(bcnt);
+	int qcnt[MAX_FILES][2]; meminit(qcnt);
+	char qmin=127, qmax=0;
+	int nsampcnt = 0;
+    double stat_lowcom_total=0, stat_lowcom_ssq=0, stat_lowcom_b4_total=0, stat_lowcom_b4_ssq=0;
+    long stat_lowcom_cnt=0, stat_lowcom_b4_cnt=0;
+
+	for (i=0;i<i_n;++i) {
+
+		struct stat st;
+		stat(ifil[i], &st);
+
+		// todo, use readfq
+		char *s = NULL; size_t na = 0; int ns = 0, nr = 0;
+		char *q = NULL; size_t naq = 0; int nq =0;
+		char *d = NULL; size_t nad = 0; int nd =0;
+
+        int skipped = 0;
+		while ((nd=fin[i].getline(&d, &nad)) > 0) {
+			if (*d == '@')  {
+				if ((ns=fin[i].getline(&s, &na)) <=0) 
+					break;
+				nq=fin[i].getline(&q, &naq);
+				nq=fin[i].getline(&q, &naq);		// qual is 2 lines down
+
+				--nq; --ns;				// don't count newline for read len
+
+				// skip poor quals/lots of N's when doing sampling (otherwise you'll miss some)
+				if ((st.st_size > (sampcnt * 500)) && (skipped < sampcnt) && poorqual(i, ns, s, q)) {
+                    ++skipped;
+					continue;
+                }
+
+				if (nq != ns) {
+					if (warncount < MAXWARN) {
+						fprintf(stderr, "Warning, corrupt quality for sequence: %s", s, q);
+						++warncount;
+					}
+					continue;
+				}
+
+				if (i > 0 && avgns[i] < 11) 			// reads of avg length < 11 ? barcode lane, skip it
+					continue;
+
+				if (ilv3) {					// illumina purity filtering
+					char * p = strchr(d, ' ');
+					if (p) {
+						p+=2;
+						if (*p==':') {
+							++p;
+							if (*p == 'Y') {
+								continue;
+							}
+						}
+					}
+				}
+
+				++nr;
+
+				// to be safe, we don't assume reads are fixed-length, not any slower, just a little more code
+				if (dobcnt) {
+					int b;
+					for (b = 0; b < ns/2 && b < maxns; ++b) {
+						++bcnt[i][0][b][char2bp(s[b])];		// count from begin
+						++bcnt[i][0][b][B_CNT];			// count of samples at position
+						++bcnt[i][1][b][char2bp(s[ns-b-1])];	// count from end
+						++bcnt[i][1][b][B_CNT];			// count of samples at offset-from-end position
+					}
+				}
+				qcnt[i][0]+=((q[0]-phred)<qthr);		// count of q<thr for last (first trimmable) base
+				qcnt[i][1]+=((q[ns-1]-phred)<qthr);	
+				//fprintf(stderr,"qcnt i%d e0=%d, e1=%d\n", i, qcnt[i][0], qcnt[i][1]);
+
+				int a;
+				char buf[SCANLEN+1];
+				strncpy(buf, s, SCANLEN);
+				for(a=0;a<acnt;++a) {
+					char *p;
+					// search whole seq for 15 char "end" of adap string
+					if (p = strstr(s+1, ad[a].escan)) { 
+						if (debug > 1) fprintf(stderr, "  END S: %s A: %s (%s), P: %d, SL: %d, Z:%d\n", s, ad[a].id, ad[a].escan, (int) (p-s), ns, (p-s) == ns-SCANLEN);
+                        // found at the very end
+						if ((p-s) == ns-SCANLEN) 
+							++ad[a].ecntz[i];
+						++ad[a].ecnt[i];
+					}
+					// search 15 char begin of seq in longer adap string
+					int slen;
+					if (SCANLEN <= ad[a].nseq) {
+						slen = SCANLEN;
+						p = strstr(ad[a].seq, buf);
+					} else {
+						slen = ad[a].nseq;
+						if (!strncmp(ad[a].seq,buf,ad[a].nseq)) 
+							p=ad[a].seq;
+						else
+							p=NULL;
+					}
+					if (p) { 
+						if (debug > 1) fprintf(stderr, "BEGIN S: %s A: %s (%s), P: %d, SL: %d, Z:%d\n", buf, ad[a].id, ad[a].seq, (int) (p-ad[a].seq), ns, (p-ad[a].seq )  == ad[a].nseq-slen);
+                        // found the end of the adapter
+						if (p-ad[a].seq == ad[a].nseq-slen) 
+							++ad[a].bcntz[i];
+						++ad[a].bcnt[i];
+					}
+				}
+			}
+			if (fin[i].full() || nr >= sampcnt)		// enough samples 
+				break;
+		}
+		if (s) free(s);
+		if (d) free(d);
+		if (q) free(q);
+		if (i == 0 || avgns[i] >= 11) {
+			if (nsampcnt == 0 || nr < nsampcnt)			// fewer than max, set for thresholds
+				nsampcnt=nr;
+		}
+	}
+
+	if (nsampcnt == 0) {
+		fprintf(stderr, "ERROR: Unable to read file for subsampling\n");
+		exit(1);
+	}
+
+	sampcnt = nsampcnt;
+	int sktrim[i_n][2]; meminit(sktrim);
+
+	// look for severe base skew, and auto-trim ends based on it
+	int needqtrim=0;
+	if (dobcnt) {
+	if (sampcnt > 0 && skewpct > 0) {
+		for (i=0;i<i_n;++i) {
+			if (avgns[i] < 11) 			// reads of avg length < 11 ? barcode lane, skip it
+				continue;
+			int e;
+			for (e = 0; e < 2; ++e) {
+				// 5% qual less than low-threshold?  need qualtrim
+				if (qthr > 0 && (100.0*qcnt[i][e])/sampcnt > 5) {
+					needqtrim = 1;
+				}
+
+				int p;
+				for (p = 0; p < maxns/2; ++p) {
+					int b;
+
+					int skth = (int) ( (float) bcnt[i][e][p][B_CNT] * ( skewpct / 100.0 ) ) ;	// skew threshold
+					int thr_n = (int) ( (float) bcnt[i][e][p][B_CNT] * ( pctns / 100.0 ) );		// n-threshold
+
+					if (debug > 1) 
+						fprintf(stderr,"Sk Prof [%d, %d]: skth=%d, bcnt=%d, ncnt=%d, a=%d, c=%d, g=%d, t=%d\n", e, p, skth, 
+								bcnt[i][e][p][B_CNT], bcnt[i][e][p][B_N], bcnt[i][e][p][B_A], 
+								bcnt[i][e][p][B_C], bcnt[i][e][p][B_G], bcnt[i][e][p][B_T]);
+
+					if (skth < 10)						// too few samples to detect skew
+						continue;
+
+					int tr = 0;
+					for (b = 0; b < 4; ++b) {
+						if (bcnt[i][e][p][b] < skth) {			// too few bases of this type
+							tr=1;
+							if (debug > 1) 
+								fprintf(stderr, "Skew at i:%d e:%d p:%d b:%d\n", i, e, p, b);
+							break;
+						}
+					}
+					if (bcnt[i][e][p][B_N] > thr_n) {			// too many n's
+						if (debug > 1) 
+							fprintf(stderr, "Too many N's at i:%d e:%d p:%d b:%d ( %d > %d )\n", i, e, p, b, bcnt[i][e][p][B_N], thr_n);
+						tr=1;
+					}
+
+					if (tr) {
+						if (p == sktrim[i][e]) {				// adjacent, so increase trim
+							++sktrim[i][e];
+						} else {
+							fprintf(fstat, "Within-read Skew: Position %d from the %s of reads is skewed!\n", p, e==0?"start":"end");
+						}
+					}
+				}
+			}
+		}
+	}
+
+	}
+
+	int e;
+	bool someskew = false;
+	for (i=0;i<i_n;++i) {
+		int totskew = sktrim[i][0] + sktrim[i][1];
+		if ((maxns - totskew) < nkeep) {
+			if (totskew > 0) {
+				fprintf(fstat, "Warning: Too much skewing found (%d), disabling skew clipping\n", totskew);
+			}
+			meminit(sktrim);
+			break;
+		}
+	}
+
+	for (i=0;i<i_n;++i) {
+		for (e=0;e<2;++e) {
+			if (sktrim[i][e] > 0) {
+				fprintf(fstat, "Trim '%s': %d from %s\n",  e==0?"start":"end", sktrim[i][e], ifil[i]);
+				someskew=true;
+			}
+		}
+	}
+
+	int athr = (int) ((float)sampcnt * minpct) / 100;
+	fprintf(fstat, "Threshold used: %d out of %d\n", athr+1, sampcnt);
+
+	int a;
+	int newc=0;
+	for(a=0;a<acnt;++a) {
+		int any=0;
+		for (i=0;i<i_n;++i) {
+			if (debug) fprintf(stderr, "ad:%s, EC:%d, BC:%d, ECZ: %d, BCZ: %d\n", ad[a].id, ad[a].ecnt[i], ad[a].bcnt[i], ad[a].ecntz[i], ad[a].bcntz[i]);
+			if (ad[a].ecnt[i] > athr || ad[a].bcnt[i] > athr) {
+				int cnt;
+				// heavily weighted toward start/end maches
+				if ((ad[a].ecnt[i] + 10*ad[a].ecntz[i]) >= (ad[a].bcnt[i] + 10*ad[a].bcntz[i])) {
+					ad[a].end[i]='e';
+					cnt = ad[a].ecnt[i];
+				} else {
+					ad[a].end[i]='b';
+					cnt = ad[a].bcnt[i];
+				}
+
+                char *p;
+                if (p=strstr(ad[a].id, "_3p")) {
+                    if (p[3] == '\0' || p[3] == '_') {
+                        ad[a].end[i]='e'; 
+					    cnt = ad[a].ecnt[i];
+                    }
+                } else if (p=strstr(ad[a].id, "_5p")) {
+                    if (p[3] == '\0' || p[3] == '_') {
+                        ad[a].end[i]='b';
+                        cnt = ad[a].bcnt[i];
+                    }
+                }
+
+				// user supplied end.... don't clip elsewhere
+				if (end[i] && ad[a].end[i] != end[i])
+					continue;
+
+				if (scale >= 100) 
+					ad[a].thr[i] = ad[a].nseq;
+				else
+					ad[a].thr[i] = min(ad[a].nseq,max(nmin,(int) (-log(cnt / (float) sampcnt)/log(scale))));
+
+				fprintf(fstat, "Adapter %s (%s): counted %d at the '%s' of '%s', clip set to %d", ad[a].id, ad[a].seq, cnt, ad[a].end[i] == 'e' ? "end" : "start", ifil[i], ad[a].thr[i]);
+				if (abs((ad[a].bcnt[i]-ad[a].ecnt[i])) < athr/4) {
+					fprintf(fstat, ", warning end was not reliable\n", ad[a].id, ad[a].seq);
+				} else {
+					fputc('\n', fstat);
+				}
+				++any;
+			}
+		}
+		if (!any) 
+			continue;
+		ad[newc++]=ad[a];
+	}
+
+	acnt=newc;
+
+	if (acnt == 0 && !someskew && !needqtrim && !ilv3) {
+		fprintf(fstat, "No adapters found");
+		if (skewpct > 0) fprintf(fstat, ", no skewing detected"); 
+		if (qthr > 0) fprintf(fstat, ", and no trimming needed");
+		fprintf(fstat, ".\n");
+		if (noclip) exit (1);			// for including in a test
+	} else {
+		if (debug) fprintf(stderr, "acnt: %d, ssk: %d, needq: %d\n", acnt, someskew, needqtrim);
+		if (noclip) {
+			if (acnt == 0) fprintf(fstat, "No adapters found. ");
+			if (someskew) fprintf(fstat, "Skewing detected. "); 
+			if (needqtrim) fprintf(fstat, "Quality trimming is needed. ");
+			fprintf(fstat, "\n");
+		}
+	}
+
+	if (noclip)
+		exit(0);
+
+	for (i=0;i<o_n;++i) {
+		if (!strcmp(ofil[i],"-")) {
+			fout[i]=stdout;
+		} else {
+			fout[i]=gzopen(ofil[i], "w", &gzout[i]);
+		}
+	}
+
+	FILE *fskip[MAX_FILES]; meminit(fskip);
+	bool gzskip[MAX_FILES]; meminit(gzskip);
+
+	if (skipb) {
+		for (i=0;i<o_n;++i) {
+			if (!strcmp(ofil[i],"-")) {
+				fskip[i]=stderr;
+			} else {
+				char *skipfil = (char *) malloc(strlen(ofil[i])+10);
+				if (!strcmp(fext(ofil[i]),".gz")) {
+					char *p=(char *)strrchr(ofil[i],'.');
+					*p='\0';
+					sprintf(skipfil, "%s.skip.gz", ofil[i]);
+					*p='.';
+				} else {
+					sprintf(skipfil, "%s.skip", ofil[i]);
+				}
+				if (!(fskip[i]=gzopen(skipfil, "w", &gzskip[i]))) {
+					fprintf(stderr, "Error opening skip file '%s': %s\n",skipfil, strerror(errno));
+					return 1;
+				}
+				free(skipfil);
+			}
+		}
+	}
+
+	struct fq fq[MAX_FILES];	
+	memset(&fq, 0, sizeof(fq));
+
+	int nrec=0;
+	int nerr=0;
+	int nok=0;
+	int ntooshort=0;
+	int ntoohompol=0;
+	int ntoolowcom=0;
+	int nfiltered=0;
+	// total per read
+	int ntrim[MAX_FILES]; meminit(ntrim);
+
+	// total per end
+	int cnttrim[MAX_FILES][2]; meminit(cnttrim);
+	double tottrim[MAX_FILES][2]; meminit(tottrim);
+	double ssqtrim[MAX_FILES][2]; meminit(ssqtrim);
+	int trimql[MAX_FILES]; meminit(trimql);
+	int trimqb[MAX_FILES]; meminit(trimqb);
+	int nilv3pf=0;	// number of illumina version 3 purity filitered
+	int read_ok;
+
+	if (i_n > 0)
+		fprintf(fstat, "Files: %d\n", i_n);
+
+	for (i=0;i<i_n;++i) {
+        fin[i].reset();
+	}
+
+    google::sparse_hash_map <std::string, int>::const_iterator lookup_it;
+
+    bool io_ok = true;
+
+	while (read_ok=fin[0].read_fq(nrec, &fq[0])) {
+		for (i=1;i<i_n;++i) {
+			int mok=fin[1].read_fq(nrec, &fq[i]);
+			if (mok != read_ok) {
+				fprintf(stderr, "# of rows in mate file '%s' doesn't match, quitting!\n", ifil[i]);
+				return 1;
+			}
+		}
+		++nrec;
+		if (read_ok < 0) {
+			++nerr;
+			continue;
+		}
+
+		if (ilv3) {
+			char * p = strchr(fq[0].id.s, ' ');
+			if (p) {
+				p+=2;
+				if (*p==':') {
+					++p;
+					if (*p == 'Y') {
+						++nilv3pf;
+						if (skipb) saveskip(fskip, i_n, fq);
+						continue;
+					}
+				}
+			}
+		}
+
+		// chomp
+
+		int dotrim[MAX_FILES][2];
+		int skip = 0;							// skip whole record?
+		int hompol_seq=0;
+		int hompol_cnt=0;
+		int lowcom_seq=0;
+		int lowcom_cnt=0;
+		int f;	
+		for (f=0;f<i_n;++f) {
+			dotrim[f][0] = sktrim[f][0];					// default, trim to detected skew levels
+			dotrim[f][1] = sktrim[f][1];
+			if (avgns[f] < 11)  
+				// reads of avg length < 11 ? barcode lane, skip it
+				continue;
+
+
+            if (have_phred_adjust) {
+                for (i=0;i<fq[f].qual.n;++i) {
+                   if (phred_adjust[fq[f].qual.s[i]-phred]) {
+                        fq[f].qual.s[i]+=phred_adjust[fq[f].qual.s[i]-phred];
+                   } 
+                }
+            }
+
+            if (phred_adjust_max) {
+                for (i=0;i<fq[f].qual.n;++i) {
+                   if ((fq[f].qual.s[i]-phred)>phred_adjust_max) {
+                        fq[f].qual.s[i]=phred_adjust_max+phred;
+                   } 
+                }
+            }
+
+
+            for (i=0;i<cycle_adjust.size();++i) {
+                if (abs(cycle_adjust[i].pos) < fq[f].qual.n) {
+                    if (cycle_adjust[i].pos>0) {
+                        fq[f].qual.s[cycle_adjust[i].pos-1]+=cycle_adjust[i].adj;
+                    } else {
+                        fq[f].qual.s[fq[f].qual.n+cycle_adjust[i].pos]+=cycle_adjust[i].adj;
+                    }
+                }
+            }
+
+
+			if (rmns) {
+				for (i=dotrim[f][0];i<(fq[f].seq.n);++i) {
+					// trim N's from the front
+					if (fq[f].seq.s[i] == 'N') 
+						dotrim[f][0] = i + 1;
+					else
+						break;
+				}
+				for (i=dotrim[f][1];i<(fq[f].seq.n);++i) {
+					// trim N's from the end
+					if (fq[f].seq.s[fq[f].seq.n-i-1] == 'N')
+						dotrim[f][1] = i + 1;
+					else 
+						break;
+				}
+			}
+
+            if (hompol_filter) {
+                char p; int h = 0;
+                for (i = dotrim[f][0]+1;i<fq[f].seq.n;++i) {
+                    // N's always match everything
+                    if (fq[f].seq.s[i] == 'N' || (fq[f].seq.s[i] == fq[f].seq.s[i-1])) {
+                        ++hompol_seq;
+                    }
+                    ++hompol_cnt;
+                }
+            }
+
+            if (lowcom_filter) {
+                char p; int h = 0;
+                for (i = dotrim[f][0]+1;i<fq[f].seq.n;++i) {
+                    // N's always match everything
+                    if (fq[f].seq.s[i] == 'N' || (fq[f].seq.s[i] == fq[f].seq.s[i-1])) {
+                        ++lowcom_seq;
+                    } else if (i >= dotrim[f][0]+3) {
+                        if (fq[f].seq.s[i] == fq[f].seq.s[i-2] && fq[f].seq.s[i-1] == fq[f].seq.s[i-3]) {
+                            ++lowcom_seq;
+                        }
+                    } else if (i >= dotrim[f][0]+3) {
+                        if (fq[f].seq.s[i] == fq[f].seq.s[i-3] && fq[f].seq.s[i-1] == fq[f].seq.s[i-4] && fq[f].seq.s[i-3] == fq[f].seq.s[i-5]) {
+                            ++lowcom_seq;
+                        }
+                    } 
+                    ++lowcom_cnt;
+                }
+            }
+
+			if (qthr > 0) {
+				bool istrimq = false;
+
+				// trim qual from the begin
+				for (i=dotrim[f][0];i<(fq[f].seq.n);++i) {
+					if (qwin > 1 && (meanqwin(fq[f].qual.s,fq[f].seq.n,i,qwin)-phred) < qthr) {
+						++trimqb[f];
+						istrimq = true;
+						dotrim[f][0] = i + 1;
+					} else if ((fq[f].qual.s[i]-phred) < qthr) {
+						++trimqb[f];
+						istrimq = true;
+						dotrim[f][0] = i + 1;
+					} else
+						break;
+				}
+
+				for (i=dotrim[f][1];i<(fq[f].seq.n);++i) {
+					if (qwin > 1 && (meanqwin(fq[f].qual.s,fq[f].seq.n,fq[f].seq.n-i-1,qwin)-phred) < qthr) {
+						++trimqb[f];
+						istrimq = true;
+						dotrim[f][1] = i + 1;
+					} else if ((fq[f].qual.s[fq[f].seq.n-i-1]-phred) < qthr) {
+						++trimqb[f];
+						istrimq = true;
+						dotrim[f][1] = i + 1;
+					} else 
+						break;
+				}
+				if (istrimq) ++trimql[f];
+			}
+
+			int bestscore_e = INT_MAX, bestoff_e = 0, bestlen_e = 0; 
+			int bestscore_b = INT_MAX, bestoff_b = 0, bestlen_b = 0; 
+
+			for (i =0; i < acnt; ++i) {
+				if (debug) fprintf(stderr, "seq[%d]: %s %d\n", f, fq[f].seq.s, fq[f].seq.n);
+
+				if (!ad[i].end[f])
+					continue;
+
+				int nmatch = ad[i].thr[f];
+				if (!nmatch) nmatch = ad[i].nseq;			// full match required if nmin == 0
+
+				// how far in to search for a match?
+				int mx = ad[i].nseq;
+				if (xmax) {
+					mx = fq[f].seq.n;
+					if (xmax > 0 && (xmax+ad[i].nseq) < mx)
+						mx = xmax+ad[i].nseq;			// xmax is added to adapter length
+				}
+
+				if (debug)
+					fprintf(stderr, "adapter: %s, adlen: %d, nmatch: %d, mx: %d\n", ad[i].seq, ad[i].nseq, nmatch, mx);
+
+				if (ad[i].end[f] == 'e') {
+					int off;
+					for (off = nmatch; off <= mx; ++off) {		// off is distance from tail of sequence
+						char *seqtail = fq[f].seq.s+fq[f].seq.n-off; 	// search at tail
+						int ncmp = off<ad[i].nseq ? off : ad[i].nseq;
+						int mind = (pctdiff * ncmp) / 100;
+						int d = hd(ad[i].seq,seqtail,ncmp);		// # differences
+						if (debug>1)
+							fprintf(stderr, "tail: %s, bestoff: %d, off: %d, ncmp: %d, mind: %d, hd %d\n", seqtail, bestoff_e, off, ncmp, mind, d);
+						if (d <= mind) {
+							// squared-distance over length
+							int score = (1000*(d*d+1))/ncmp;
+							if (score <= bestscore_e) {			// better score?
+								bestscore_e = score;			// save max score
+								bestoff_e = off;			// offset at max
+								bestlen_e = ncmp;			// cmp length at max
+							}
+							if (d == 0 && (ncmp == ad[i].nseq)) {
+								break;
+							}
+						}
+					}
+				} else {
+					int off;
+					for (off = nmatch; off <= mx; ++off) {              // off is distance from start of sequence
+						int ncmp = off<ad[i].nseq ? off : ad[i].nseq;	// number we are comparing
+						char *matchtail = ad[i].seq+ad[i].nseq-ncmp;    // tail of adapter
+						char *seqstart = fq[f].seq.s+off-ncmp;		// offset into sequence (if any)
+						int mind = (pctdiff * ncmp) / 100;
+						int d = hd(matchtail,seqstart,ncmp);            // # differences
+						if (debug>1)
+							fprintf(stderr, "bestoff: %d, off: %d, ncmp: %d, mind: %d, hd %d\n", bestoff_e, off, ncmp, mind, d);
+
+						if (d <= mind) {
+							int score = (1000*(d*d+1))/ncmp;
+							if (score <= bestscore_b) {                       // better score?
+								bestscore_b = score;                      // save max score
+								bestoff_b = off;                          // offset at max
+								bestlen_b = ncmp;                         // cmp length at max
+							}
+							if (d == 0 && (ncmp == ad[i].nseq)) {
+								break;
+							}
+						}
+					}
+				}
+			}
+			// lengthen trim based on best level
+			if (bestoff_b > dotrim[f][0])
+				dotrim[f][0]=bestoff_b;
+
+			if (bestoff_e > dotrim[f][1])
+				dotrim[f][1]=bestoff_e;
+
+			int totclip = min(fq[f].seq.n,dotrim[f][0] + dotrim[f][1]);
+
+//			if (debug > 1) fprintf(stderr,"totclip %d\n", totclip);
+
+			if (totclip > 0) {
+                // keep length > X, X based on mate
+                int tkeep = f == 0 ? nkeep : qf2_min_len > 0 ? qf2_min_len : nkeep;
+
+				if ( (fq[f].seq.n-totclip) < tkeep) {
+					// skip all reads if one is severely truncated ??
+					// maybe not... ?
+					skip = 1;
+					break;
+				}
+
+				// count number of adapters clipped, not the number of rows trimmed
+				if (bestoff_b > 0 || bestoff_e > 0) 
+					++ntrim[f];
+
+				// save some stats
+				if (bestoff_b > 0) {
+					cnttrim[f][0]++;
+					tottrim[f][0]+=bestoff_b;
+					ssqtrim[f][0]+=bestoff_b*bestoff_b;
+				}
+				if (bestoff_e > 0) {
+					cnttrim[f][1]++;
+					tottrim[f][1]+=bestoff_e;
+					ssqtrim[f][1]+=bestoff_e*bestoff_e;
+				}
+
+			} else {
+				// skip even if the original was too short
+				if (fq[f].seq.n < nkeep) 
+					skip = 1;
+			}
+		}
+
+        int hompol_skip=0;
+        if (hompol_filter) {
+            int hompol_max = hompol_pct * hompol_cnt;
+            if (debug>0) printf("%s: hompol cnt:%d, max:%d, seq:%d\n", fq[0].id.s, hompol_cnt, hompol_max, hompol_seq);
+            if (hompol_seq>=hompol_max) hompol_skip = skip = true;
+        }
+
+        int lowcom_skip=0;
+        if (!hompol_skip && lowcom_filter) {
+            int lowcom_max = lowcom_pct * lowcom_cnt;
+            if (debug>0) printf("%s: lowcom cnt:%d, max:%d, seq:%d\n", fq[0].id.s, lowcom_cnt, lowcom_max, lowcom_seq);
+            if (lowcom_seq>=lowcom_max) lowcom_skip = skip = true;
+            if (!lowcom_skip) { 
+                stat_lowcom_total+=((double)lowcom_seq/(double)lowcom_cnt);
+                stat_lowcom_ssq+=pow(((double)lowcom_seq/(double)lowcom_cnt),2);
+                stat_lowcom_cnt+=1;
+            }
+            stat_lowcom_b4_total+=((double)lowcom_seq/(double)lowcom_cnt);
+            stat_lowcom_b4_ssq+=pow(((double)lowcom_seq/(double)lowcom_cnt),2);
+            stat_lowcom_b4_cnt+=1;
+        }
+
+
+		if (!skip) {
+			int f;
+			for (f=0;f<o_n;++f) {
+                if (dotrim[f][1] >= strlen(fq[f].seq.s)) {
+					if (debug) fprintf(stderr,"trimmming full sequence from end (%d), %s", dotrim[f][1], fq[f].id.s);
+                    skip=1;
+                    continue;
+                }
+				if (dotrim[f][1] > 0) {
+					if (debug) fprintf(stderr,"trimming %d from end, %s", dotrim[f][1], fq[f].id.s);
+					fq[f].seq.s[fq[f].seq.n -=dotrim[f][1]]='\0';
+					fq[f].qual.s[fq[f].qual.n-=dotrim[f][1]]='\0';
+				}
+				if (dotrim[f][0] > 0) {
+					if (debug) fprintf(stderr,"trimming %d from begin, %s", dotrim[f][0], fq[f].id.s);
+					fq[f].seq.n -= dotrim[f][0];
+					fq[f].qual.n -= dotrim[f][0];
+                    if (fq[f].seq.n < 0) {
+                        fq[f].seq.n = 0;
+                        fq[f].qual.n = 0;
+                    }
+					memmove(fq[f].seq.s ,fq[f].seq.s +dotrim[f][0],fq[f].seq.n );
+					memmove(fq[f].qual.s,fq[f].qual.s+dotrim[f][0],fq[f].qual.n);
+					fq[f].seq.s[fq[f].seq.n]='\0';
+					fq[f].qual.s[fq[f].qual.n]='\0';
+				}
+				if (nmax > 0) {
+					if (fq[f].seq.n >= nmax ) {
+						fq[f].seq.s[nmax]='\0';
+						fq[f].qual.s[nmax]='\0';
+					}
+				}
+                if (avgns[f]>=11 && !evalqual(fq[f],f)) {
+                    skip = 2;                       // 2==qual
+                }
+			}
+
+            if (duplen > 0 && !skip) {
+                // lookup dupset
+                for (f=0;!skip&&f<o_n;++f) {
+                    if (avgns[f]>=11) {
+                        char t;
+                        if (fq[f].seq.a > duplen) {
+                            // truncate if needed
+                            t = fq[f].seq.s[duplen];
+                            fq[f].seq.s[duplen] = '\0';
+                        }
+                        lookup_it = dupset.find(fq[f].seq.s);
+                        if (lookup_it != dupset.end()) {
+                            skip=1;                 // 1==dup
+                        } else {
+                            if (dupset.size() < dupmax) {
+                                dupset[fq[f].seq.s]=1;
+                            }
+                        }
+                        if (fq[f].seq.a > duplen) {
+                            // restore full length
+                            fq[f].seq.s[duplen] = t;
+                        }
+                    }
+                }
+            }
+            if (!skip) {
+               for (f=0;f<o_n;++f) {
+                    io_ok=io_ok&&(fputs(fq[f].id.s,fout[f])>=0);
+                    io_ok=io_ok&&(fputs(fq[f].seq.s,fout[f])>=0);
+                    io_ok=io_ok&&(fputc('\n',fout[f])>=0);
+                    io_ok=io_ok&&(fputs(fq[f].com.s,fout[f])>=0);
+                    io_ok=io_ok&&(fputs(fq[f].qual.s,fout[f])>=0);
+                    io_ok=io_ok&&(fputc('\n',fout[f])>=0);
+                }
+            } else {
+                if (skipb) saveskip(fskip, i_n, fq);
+                if (skip==2) ++nfiltered;
+                if (skip==1) ++dupskip;
+            }
+		} else {
+			if (skipb) saveskip(fskip, i_n, fq);
+            if (hompol_skip) {
+    			++ntoohompol;
+            } else if (lowcom_skip) {
+    			++ntoolowcom;
+            } else {
+    			++ntooshort;
+            }
+		}
+	}
+
+	for (i=0;i<i_n;++i) {
+		if (fout[i])  { io_ok = io_ok && ( gzout[i] ? !pclose(fout[i]) : !fclose(fout[i]) ); }
+        fin[i].close();
+		if (fskip[i]) { if (gzskip[i]) pclose(fskip[i]); else fclose(fskip[i]); }
+	}
+
+    if (!io_ok) {
+	    fprintf(fstat, "Error during file close, possible partial write, failing\n");
+    }
+
+	fprintf(fstat, "Total reads: %d\n", nrec);
+	fprintf(fstat, "Too short after clip: %d\n", ntooshort);
+    if (nfiltered)
+	fprintf(fstat, "Filtered on quality: %d\n", nfiltered);
+    if (dupskip)
+	fprintf(fstat, "Filtered on duplicates: %d\n", dupskip);
+    if (ntoohompol)
+	fprintf(fstat, "Filtered on hompolymer: %d\n", ntoohompol);
+    if (ntoolowcom)
+	fprintf(fstat, "Filtered on low complexity: %d\n", ntoolowcom);
+    if (stat_lowcom_b4_total > 0) {
+    	fprintf(fstat, "Mean lowcom score: %2.2f(%2.2f), %2.2f(%2.2f) after\n", 
+                100*(stat_lowcom_b4_total/(double)stat_lowcom_b4_cnt), 100*stdev(stat_lowcom_b4_cnt,stat_lowcom_b4_total,stat_lowcom_b4_ssq), 
+                100*(stat_lowcom_total/(double)stat_lowcom_cnt), 100*stdev(stat_lowcom_cnt,stat_lowcom_total,stat_lowcom_ssq)
+        );
+    }
+
+	int f;
+	if (i_n == 1) {
+		f=0;
+		for (e=0;e<2;++e) {
+			if (cnttrim[f][e]>0) {
+				fprintf(fstat, "Clipped '%s' reads: Count: %d, Mean: %.2f, Sd: %.2f\n", e==0?"start":"end", cnttrim[f][e], (double) tottrim[f][e] / cnttrim[f][e], stdev(cnttrim[f][e], tottrim[f][e], ssqtrim[f][e]));
+			}
+		}
+		if (trimql[f] > 0) {
+			fprintf(fstat, "Trimmed %d reads by an average of %.2f bases on quality < %d\n", trimql[f], (float) trimqb[f]/trimql[f], qthr);
+		}
+	} else
+		for (f=0;f<i_n;++f) {
+			for (e=0;e<2;++e) {
+				if (cnttrim[f][e]>0) {
+					fprintf(fstat, "Clipped '%s' reads (%s): Count %d, Mean: %.2f, Sd: %.2f\n", e==0?"start":"end", ifil[f], cnttrim[f][e], (double) tottrim[f][e] / cnttrim[f][e], stdev(cnttrim[f][e], tottrim[f][e], ssqtrim[f][e]));
+				}
+			}
+			if (trimql[f] > 0) {
+				fprintf(fstat, "Trimmed %d reads (%s) by an average of %.2f bases on quality < %d\n", trimql[f], ifil[f], (float) trimqb[f]/trimql[f], qthr);
+			}
+		}
+	if (nilv3pf > 0) {
+		fprintf(fstat, "Filtered %d reads on purity flag\n", nilv3pf);
+	}
+	if (nerr > 0) {
+		fprintf(fstat, "Errors (%s): %d\n", ifil[f], nerr);
+		return 2;
+	}
+    if (!io_ok) {
+        return 3;
+    }
+	return 0;
+}
+
+/*
+ * Read one adapter record from 'in' into 'fa': a '>' header line followed by
+ * exactly one sequence line (multi-line sequences are NOT supported).
+ *
+ * rno is the zero-based record number, used only to report the line number
+ * of a malformed record.
+ *
+ * Returns 1 on success, 0 when no further record could be read, and -1 when
+ * the header line does not start with '>'.
+ *
+ * On success fa->id holds the label without the leading '>' and any spaces
+ * that follow it, fa->seq holds the sequence with 'U' replaced by 'T', and
+ * fa->nid / fa->nseq hold their lengths.
+ */
+int read_fa(FILE *in, int rno, struct ad *fa) {
+	// note: this only reads one line of sequence!
+	fa->nid = getline(&fa->id, &fa->naid, in);
+	fa->nseq = getline(&fa->seq, &fa->naseq, in);
+	if (fa->nid <= 0 || fa->nseq <= 0)
+		return 0;
+	if (fa->id[0] != '>') {
+		fprintf(stderr, "Malformed adapter fasta record at line %d\n", rno*2+1);
+		return -1;
+	}
+	// chomp: strip only real line terminators, so that a final line without
+	// a trailing newline keeps its last base and CRLF files lose the '\r'
+	while (fa->nseq > 0 && (fa->seq[fa->nseq-1] == '\n' || fa->seq[fa->nseq-1] == '\r'))
+		fa->seq[--fa->nseq] = '\0';
+	while (fa->nid > 0 && (fa->id[fa->nid-1] == '\n' || fa->id[fa->nid-1] == '\r'))
+		fa->id[--fa->nid] = '\0';
+
+	// drop the '>' and any spaces between it and the label
+	char *p = fa->id+1;
+	while (*p == ' ') {
+		++p;
+	}
+	memmove(fa->id, p, strlen(p)+1);
+	fa->nid=strlen(fa->id);
+
+	// rna 2 dna
+	int i;
+	for (i=0;i<fa->nseq;++i) {
+		if (fa->seq[i]=='U') fa->seq[i] = 'T';
+	}
+	return 1;
+}
+
+// Print the fastq-mcf help text to 'f'.  When 'msg' is non-NULL it is
+// printed first on its own line (used to report what was wrong with the
+// command line).  The version line is filled in from VERSION and SVNREV.
+void usage(FILE *f, const char *msg) {
+	if(msg)
+		fprintf(f, "%s\n", msg);
+
+	fprintf(f, 
+"Usage: fastq-mcf [options] <adapters.fa> <reads.fq> [mates1.fq ...] \n"
+"Version: %s.%d\n"
+"\n"
+"Detects levels of adapter presence, computes likelihoods and\n"
+"locations (start, end) of the adapters.   Removes the adapter\n"
+"sequences from the fastq file(s).\n"
+"\n"
+"Stats go to stderr, unless -o is specified.\n"
+"\n"
+"Specify -0 to turn off all default settings\n"
+"\n"
+"If you specify multiple 'paired-end' inputs, then a -o option is\n" 
+"required for each.  IE: -o read1.clip.fq -o read2.clip.fq\n"
+"\n"
+"Options:\n"
+"    -h       This help\n"
+"    -o FIL   Output file (stats to stdout)\n"
+"    -s N.N   Log scale for adapter minimum-length-match (2.2)\n"
+"    -t N     %% occurrence threshold before adapter clipping (0.25)\n"
+"    -m N     Minimum clip length, overrides scaled auto (1)\n"
+"    -p N     Maximum adapter difference percentage (10)\n"
+"    -l N     Minimum remaining sequence length (19)\n"
+"    -L N     Maximum remaining sequence length (none)\n"
+"    -D N     Remove duplicate reads : Read_1 has an identical N bases (0)\n"
+"    -k N     sKew percentage-less-than causing cycle removal (2)\n"
+"    -x N     'N' (Bad read) percentage causing cycle removal (20)\n"
+"    -q N     quality threshold causing base removal (10)\n"
+"    -w N     window-size for quality trimming (1)\n"
+"    -H       remove >95%% homopolymer reads (no)\n"
+"    -X       remove low complexity reads (no)\n"
+//"    -F FIL  remove sequences that align to FIL\n"
+"    -0       Set all default parameters to zero/do nothing\n"
+"    -U|u     Force disable/enable Illumina PF filtering (auto)\n"
+"    -P N     Phred-scale (auto)\n"
+"    -R       Don't remove N's from the fronts/ends of reads\n"
+"    -n       Don't clip, just output what would be done\n"
+"    -C N     Number of reads to use for subsampling (300k)\n"
+"    -S       Save all discarded reads to '.skip' files\n"
+"    -d       Output lots of random debugging stuff\n"
+"\n"
+"Quality adjustment options:\n"
+"    --cycle-adjust      CYC,AMT   Adjust cycle CYC (negative = offset from end) by amount AMT\n"
+"    --phred-adjust      SCORE,AMT Adjust score SCORE by amount AMT\n"
+"    --phred-adjust-max  SCORE     Adjust scores > SCORE to SCORE\n"
+"\n"
+"Filtering options*:\n"
+"    --[mate-]qual-mean  NUM       Minimum mean quality score\n"
+"    --[mate-]qual-gt    NUM,THR   At least NUM quals > THR\n" 
+"    --[mate-]max-ns     NUM       Maximum N-calls in a read (can be a %%)\n"
+"    --[mate-]min-len    NUM       Minimum remaining length (same as -l)\n"
+"    --homopolymer-pct   PCT       Homopolymer filter percent (95)\n"
+"    --lowcomplex-pct    PCT       Complexity filter percent (95)\n"
+"\n"
+"If mate- prefix is used, then applies to second non-barcode read only\n"
+/*
+"Config:\n"
+"\n"
+"Some options are best set globally, such as the aligner to use\n"
+"for filtering, these can be ENV vars, or in /etc/ea-utils.conf:\n"
+"\n"
+"Command line options, if specified, always override config vars.\n"
+"\n"
+"When used as environment vars, they are all caps, and with\n"
+"EAUTILS_ as the prefix (IE: EAUTILS_PHRED=33)\n"
+"\n"
+"    phred           (auto)\n"
+"    sample_reads    100000\n"
+"    scale_clip_len  2.2\n"
+"    trim_skew       2\n"
+"    trim_quality    10\n"
+"    min_clip_len    0\n"
+"    min_seq_remain  15\n"
+"    max_adap_diff   20\n"
+//"    cmd_align_se    bowtie -S %%i -f %%1\n"
+//"    cmd_align_pe    bowtie -S %%i -1 %%1 -2 %%2\n"
+//"\n"
+//"Command lines must return SAM formatted lines, %%i is the filter FIL,\n"
+//"%%1 and %%2 are the first and second fastq's\n"
+*/
+"\n"
+"Adapter files are 'fasta' formatted:\n"
+"\n"
+"Specify n/a to turn off adapter clipping, and just use filters\n"
+"\n"
+"Increasing the scale makes recognition-lengths longer, a scale\n"
+"of 100 will force full-length recognition of adapters.\n"
+"\n"
+"Adapter sequences with _5p in their label will match 'end's,\n"
+"and sequences with _3p in their label will match 'start's,\n"
+"otherwise the 'end' is auto-determined.\n"
+"\n"
+"Skew is when one cycle is poor, 'skewed' toward a particular base.\n"
+"If any nucleotide is less than the skew percentage, then the\n"
+"whole cycle is removed.  Disable for methyl-seq, etc.\n"
+"\n"
+"Set the skew (-k) or N-pct (-x) to 0 to turn it off (should be done\n"
+"for miRNA, amplicon and other low-complexity situations!)\n"
+"\n"
+"Duplicate read filtering is appropriate for assembly tasks, and\n"
+"never when read length < expected coverage.  -D 50 will use\n"
+"4.5GB RAM on 100m DNA reads - be careful. Great for RNA assembly.\n"
+"\n"
+"*Quality filters are evaluated after clipping/trimming\n"
+"\n"
+"Homopolymer filtering is a subset of low-complexity, but will not\n"
+"be separately tracked unless both are turned on.\n"
+	,VERSION, SVNREV);
+}
+
+// Translate a nucleotide letter (upper or lower case) into its B_* base
+// code; every other character maps to B_N.
+inline int char2bp(char c) {
+        switch (c) {
+        case 'A': case 'a':
+                return B_A;
+        case 'C': case 'c':
+                return B_C;
+        case 'G': case 'g':
+                return B_G;
+        case 'T': case 't':
+                return B_T;
+        default:
+                return B_N;
+        }
+}
+
+// Translate a nucleotide letter (upper or lower case) into the B_* code of
+// its complement (A<->T, C<->G); every other character maps to B_N.
+inline int char2bp_rc(char c) {
+        switch (c) {
+        case 'A': case 'a':
+                return B_T;
+        case 'C': case 'c':
+                return B_G;
+        case 'G': case 'g':
+                return B_C;
+        case 'T': case 't':
+                return B_A;
+        default:
+                return B_N;
+        }
+}
+
+
+// Translate a B_* base code back into its upper-case letter; any code other
+// than B_A, B_C, B_G or B_T yields 'N'.
+inline char bp2char(int b) {
+        return b == B_A ? 'A'
+             : b == B_C ? 'C'
+             : b == B_G ? 'G'
+             : b == B_T ? 'T'
+             : 'N';
+}
+
+// Write one record per file to the "skip" outputs: record k of 'fq' goes to
+// stream k of 'fout', for the first fo_n entries.  Each record is emitted as
+// id, sequence + newline, comment, quality + newline.  Write errors are not
+// checked.
+void saveskip(FILE **fout, int fo_n, struct fq *fq)  {
+	for (int idx = 0; idx < fo_n; ++idx) {
+		FILE *dst = fout[idx];
+		struct fq *rec = fq + idx;
+
+		fputs(rec->id.s, dst);
+		fputs(rec->seq.s, dst);
+		fputc('\n', dst);
+		fputs(rec->com.s, dst);
+		fputs(rec->qual.s, dst);
+		fputc('\n', dst);
+	}
+}
+
+// Mean of the raw quality characters in a window of roughly 'w' positions
+// centred on index 'i' of q[0..qn-1] (integer division, phred offset NOT
+// removed).  A window that would run off either end is shifted back inside
+// the read rather than shortened.  Returns 0 for an empty read.
+int meanqwin(const char *q, int qn, int i, int w) {
+	if (qn <= 0) return 0;                      // nothing to average
+	if (w > qn) w=qn/4;                         // maximum window is length/4
+	int s = i-w/2;                              // start/end window
+	int e = i+w/2;
+	if (s < 0) {e-=s;s=0;}                      // shift window over if you're past the start
+	if (e >= qn) {s-=((e-qn)+1);e=qn-1;}        // shift window over if you're past the end
+	if (s < 0) s=0;                             // window as wide as the read: never index before q[0]
+	int t = 0;
+	for (i=s;i<=e;++i) {
+		t+=q[i];	
+	}
+	return t / (e-s+1);                         // mean quality within the window at that position
+}
+
+// Apply the post-trim quality filters to one read.
+//
+// file_num selects the threshold set: file 0 uses the qf_* settings; later
+// files (mates) use a qf2_* setting when it has been set and fall back to
+// the matching qf_* setting otherwise.
+//
+// Returns false when the read fails any enabled filter:
+//   - mean quality (phred offset removed) below t_mean
+//   - more 'N' calls than t_max_ns
+//   - fewer than t_xgt_num quality values >= t_xgt_min
+// and true otherwise.
+bool evalqual(struct fq &fq, int file_num) {
+    int t_mean, t_max_ns, t_xgt_num, t_xgt_min, t_max_n_pct;
+
+
+    if (file_num <= 0) {
+        // applies to file 1
+        t_mean=qf_mean;
+        t_max_ns=qf_max_ns;
+        t_max_n_pct=qf_max_n_pct;
+        t_xgt_num=qf_xgt_num;
+        t_xgt_min=qf_xgt_min;
+    } else {
+        // applies to file 2 or greater, only if they are set
+        t_mean=qf2_mean > 0 ? qf2_mean : qf_mean;
+        t_max_ns=qf2_max_ns > -1 ? qf2_max_ns : qf_max_ns;
+        t_max_n_pct=qf2_max_n_pct > -1 ? qf2_max_n_pct : qf_max_n_pct;
+        // NUM and THR are given together, so take both from the same source
+        if (qf2_xgt_num > 0) {
+            t_xgt_num=qf2_xgt_num;
+            t_xgt_min=qf2_xgt_min;
+        } else {
+            t_xgt_num=qf_xgt_num;
+            t_xgt_min=qf_xgt_min;
+        }
+    }
+
+    if (t_max_n_pct>=0) {
+        // NOTE(review): assumes t_max_n_pct is an integer percentage (0-100)
+        // of the read length -- confirm against the option parser
+        t_max_ns=max(t_max_ns,(fq.qual.n*t_max_n_pct)/100);
+    }
+
+    if (t_mean > 0) {
+        if (fq.qual.n <= 0) {
+            // an empty read cannot reach a positive mean (and must not divide by 0)
+            return false;
+        }
+        int t = 0;
+        int i;
+        for (i=0;i<fq.qual.n;++i) {
+            t+=fq.qual.s[i];
+        }
+        if ((t/fq.qual.n-phred) < t_mean) {
+            return false;
+        }
+    }
+    if (t_max_ns >= 0) {
+        int t = 0;
+        int i;
+        for (i=0;i<fq.seq.n;++i) {
+            t+=(fq.seq.s[i]=='N');
+        }
+
+//        if (debug > 2) fprintf(stderr,"maxn: max:%d,t:%d,i:%d,id:%s", t_max_ns, t, i, fq.id.s);
+
+        if (t > t_max_ns) {
+            return false;
+        }
+    }
+
+    if (t_xgt_num > 0) {
+        int t = 0;
+        int i;
+        int h = t_xgt_min+phred;
+        for (i=0;i<fq.qual.n;++i) {
+            t+=(fq.qual.s[i]>=h);
+        }
+        if (t < t_xgt_num) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Ensure option 'opt' was given a usable argument.  An argument that is
+// NULL, empty, or starts with '-' is rejected: an error and the usage text
+// are printed to stderr and the process exits with status 1.
+void valid_arg(char opt, const char *arg) {
+    const bool usable = arg && arg[0] != '\0' && arg[0] != '-';
+    if (usable)
+        return;
+
+    fprintf(stderr,"Option '%c' requires an argument.\n\n", opt);
+    usage(stderr); 
+    exit(1);
+}
+
+// Parse an "A,B" option argument into two integers.
+// Returns false (leaving a and b untouched) when optarg contains no comma;
+// otherwise stores atoi() of the text before and after the first comma in
+// a and b and returns true.
+bool  arg_int_pair(const char *optarg, int &a, int&b) {
+    const char *comma = strchr(optarg, ',');
+    if (!comma) {
+        return false;
+    }
+    a=atoi(optarg);
+    b=atoi(comma+1);
+    return true;                // was missing: falling off the end is undefined behaviour
+}
+
+// Rebuild a printable command line from argv[1..argc-1], separated by single
+// spaces.  Arguments containing a space are wrapped in single quotes, or in
+// double quotes when they already contain a single quote.
+//
+// Returns a malloc'd, NUL-terminated string the caller must free(); it is an
+// empty string when there are no arguments.  Returns NULL only when memory
+// allocation fails.
+char *arg2cmd(int argc, char** argv) {
+    char *buf=(char *)malloc(1);
+    if (!buf) return NULL;
+    buf[0]='\0';
+    size_t n = 0;
+    int i;
+    for (i=1; i <argc;++i) {
+        size_t k=strlen(argv[i]);
+        // room for: opening quote + argument + closing quote + space + NUL
+        char *grown=(char *)realloc(buf,n+k+4);
+        if (!grown) {
+            free(buf);
+            return NULL;
+        }
+        buf=grown;
+        char *p=buf+n;
+        char endq=0;
+        // this is a poor man's quoting, which is good enough for anything that's not ridiculous
+        if (strchr(argv[i], ' ')) {
+            endq = strchr(argv[i], '\'') ? '\"' : '\'';
+            *p++=endq;
+        }
+        memcpy(p, argv[i], k);
+        p+=k;
+        if (endq) *p++=endq;            // close the quote before the separator
+        if (i < (argc-1)) *p++=' ';
+        *p='\0';
+        n = p-buf;
+    }
+    return buf;
+}
+
+// std::string wrapper around arg2cmd(): returns the rebuilt command line and
+// releases the temporary C buffer.  Yields an empty string when arg2cmd()
+// returns NULL, since constructing std::string from NULL is undefined.
+std::string arg2cmdstr(int argc, char **argv) {
+    char *tmp=arg2cmd(argc, argv);
+    if (!tmp) {
+        return std::string();
+    }
+    std::string ret=tmp;
+    free(tmp);
+    return ret;
+}
+
+
+/* vim: set noai ts=4 sw=4: */
diff --git a/fastq-multx.c b/fastq-multx.c
new file mode 100644
index 0000000..ddb3f72
--- /dev/null
+++ b/fastq-multx.c
@@ -0,0 +1,1087 @@
+/*
+Copyright (c) 2011 Expression Analysis / Erik Aronesty
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+
+See "void usage" below for usage.
+
+*/
+
+#include "fastq-lib.h"
+
+#define MAX_BARCODE_NUM 6000	// sizes the global bc[] table (MAX_BARCODE_NUM+1 entries)
+#define MAX_GROUP_NUM 500	// sizes the global grs[] table; also a factor in the barcode-group allocation in main
+// factor to divide max by
+#define THFIXFACTOR 20
+#define endstr(e) (e=='e'?"end":e=='b'?"start":"n/a")	// printable name for an end code: 'e' -> "end", 'b' -> "start", anything else -> "n/a"
+
+const char * VERSION = "1.02";	// version string; printed by usage(), presumably -- confirm
+#define SVNREV atoi(strchr("$LastChangedRevision: 684 $", ':')+1)	// integer parsed from the text after the ':' of the SVN keyword string
+
+// barcode: one barcode sequence plus its per-file output slots and match statistics
+struct bc {
+	line id;			// barcode label (text in .s, length in .n)
+	line seq;			// barcode sequence (text in .s, length in .n)
+	char *out[6];			// one output per input
+	FILE *fout[6];			// stream for each out[] entry -- presumably opened later; confirm
+	bool gzout[6];			// presumably: whether fout[k] is a compressed stream -- confirm
+	int cnt;			// count found
+	bool shifted;			// count found in 1-shifted position
+	char * dual;			// is this a dual-indexed barcode?  if so, this points to the second index.
+	int dual_n;			// length of dual
+};
+
+// group of barcodes: a named set of barcodes; instances are stored in the global grs[] table
+struct group {
+	char *id;			// group name
+	int tcnt;			// number of codes past thresh
+	int i;				// my index
+};
+
+// barcode group: one candidate barcode from a master list, with per-input-file match counters (indexed by file number)
+struct bcg {
+	struct bc b;			// barcode
+        line group;			// group (fluidigm, truseq, etc)
+        int bcnt[6];			// matched begin of file n
+        int ecnt[6];			// matched end of file n
+        int bscnt[6];			// matched begin of file n, shifted by 1
+        int escnt[6];			// matched end of file n, shifted by 1
+        int dbcnt[6];                   // dual matched begin of file n
+        int decnt[6];                   // dual matched end of file n
+	struct group *gptr;		// group this barcode belongs to (set from getgroup() in main)
+};
+
+struct group* getgroup(char *s);	// returns the group record for name s; defined elsewhere in this file
+
+void usage(FILE *f);
+static int debug=0;	// debug verbosity; incremented once per -D option
+// it's times like this when i think a class might be handy, but nah, not worth it
+typedef struct bnode {
+	char *seq;	// sequence text; the comparison key used by bnodecomp()
+	int cnt;	// presumably the number of times seq was seen -- confirm
+} bnode;
+
+struct group grs[MAX_GROUP_NUM];	// table of barcode groups
+static int grcnt=0;	// presumably the number of grs[] entries in use -- confirm
+
+struct bc bc[MAX_BARCODE_NUM+1];	// table of barcodes; zeroed with meminit() in main
+static int bcnt=0;	// presumably the number of bc[] entries in use -- confirm
+
+static int pickmax=0;
+static void *picktab=NULL;
+void pickbest(const void *nodep, const VISIT which, const int depth);	// NOTE(review): signature matches a twalk() action callback -- confirm where it is used
+int bnodecomp(const void *a, const void *b) {return strcmp(((bnode*)a)->seq,((bnode*)b)->seq);};	// orders two bnode records by strcmp() of their seq strings
+static float pickmaxpct=0.10;
+
+int main (int argc, char **argv) {
+	char c;
+	bool trim = true;
+	int mismatch = 1;
+	int distance = 2;
+	int poor_distance = 0;       // count of skipped reads on distance only
+	int quality = 0;
+	char end = '\0';
+	char dend = '\0';
+	bool dual = false;
+	char *in[6];
+	const char *out[6];
+	int f_n=0;
+	int f_oarg=0;
+	const char* guide=NULL;		// use an indexed-read
+	const char* list=NULL;		// use a barcode master list
+	char verify='\0';
+	bool noexec = false;
+	const char *group = NULL;
+    bool usefile1 = false;
+    int phred = 33;
+    double threshfactor = 1;
+
+	int i;
+	bool omode = false;	
+	char *bfil = NULL;
+	while (	(c = getopt (argc, argv, "-Dzxnbeov:m:B:g:L:l:G:q:d:t:")) != -1) {
+		switch (c) t:{
+		case '\1': 
+                       	if (omode) {
+				if (f_oarg<5)
+					out[f_oarg++] = optarg;
+				else {
+					usage(stderr); return 1;
+				}
+			} else if (!bfil && !guide && !list) 
+				bfil = optarg; 
+			else if (f_n<5) {
+				in[f_n++] = optarg; 
+			} else {
+				usage(stderr); return 1;
+			}
+			break;
+                case 'o': omode=true; break;
+                case 'v': 
+			if (strlen(optarg)>1) {
+				fprintf(stderr, "Option -v requires a single character argument");
+				exit(1);
+			}
+			verify = *optarg; break;
+		case 'b': end = 'b'; break;
+		case 'e': end = 'e'; break;
+		case 'G': group = optarg; break;
+		case 'g': 
+			guide = optarg;
+			in[f_n++] = optarg;
+			out[f_oarg++] = "n/a";
+			break;
+		case 'l': list = optarg; usefile1=0; break;
+		case 'L': list = optarg; usefile1=1; break;
+		case 'B': bfil = optarg; list = NULL; break;
+		case 'x': trim = false; break;
+		case 'n': noexec = true; break;
+		case 't': threshfactor = atof(optarg); break;
+		case 'm': mismatch = atoi(optarg); break;
+		case 'd': distance = atoi(optarg); break;
+		case 'q': quality = atoi(optarg); break;
+		case 'D': ++debug; break;
+		case '?': 
+		     if (strchr("vmBglG", optopt))
+		       fprintf (stderr, "Option -%c requires an argument.\n", optopt);
+		     else if (isprint(optopt))
+		       fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+		     else
+		       fprintf (stderr,
+				"Unknown option character `\\x%x'.\n",
+				optopt);
+		     usage(stderr);
+             	     return 1;
+		}
+	}
+
+	if (group && !list) {
+		fprintf(stderr, "Error: -G only works with -l\n");
+		return 1;
+	}
+
+    if ((list && guide) || (list && bfil) || (guide && bfil)) {
+            fprintf(stderr, "Error: Only one of -B -g or -l\n");
+            return 1;
+    }
+
+	if (f_n != f_oarg) {
+		fprintf(stderr, "Error: number of input files (%d) must match number of output files following '-o'.\n", f_n);
+		return 1;
+	}
+
+	if (argc < 3 || !f_n || (!bfil && !guide && !list)) {
+		usage(stderr);
+		return 1;
+	}
+
+    quality+=phred;
+
+	FILE *fin[6];
+	bool gzin[6]; meminit(gzin);
+	for (i = 0; i < f_n; ++i) {
+		fin[i]=gzopen(in[i],"r",&gzin[i]);
+	}
+
+	// set all to null, zero
+	meminit(bc);
+
+
+	// 3 ways to get barcodes
+	if (list) {
+		// use a list of barcode groups... determine the best set, then use the determined set 
+		struct bcg *bcg = (struct bcg *) malloc(sizeof(*bcg) * MAX_GROUP_NUM * MAX_BARCODE_NUM);
+		if (!bcg) {
+                        fprintf(stderr, "Out of memory\n");
+                        return 1;
+		}
+		memset(bcg, 0, sizeof(*bcg) * MAX_GROUP_NUM * MAX_BARCODE_NUM);
+		int bgcnt=0;
+		int b;
+        FILE *lin = fopen(list, "r");
+        if (!lin) {
+                fprintf(stderr, "Error opening file '%s': %s\n",list, strerror(errno));
+                return 1;
+        }
+        // read barcode groups
+        int ok;
+
+        while (bgcnt < (MAX_GROUP_NUM * MAX_BARCODE_NUM) && (ok = read_line(lin, bcg[bgcnt].b.id))) {
+            if (ok <= 0) break;
+            if (bcg[bgcnt].b.id.s[0]=='#') continue;
+            bcg[bgcnt].b.id.s=strtok(bcg[bgcnt].b.id.s, "\t\n\r ");
+            bcg[bgcnt].b.seq.s=strtok(NULL, "\t\n\r ");
+            char *g=strtok(NULL, "\n\r");
+			if (!g) {
+				if (bgcnt==0){
+					fprintf(stderr,"Barcode guide list needs to be ID<whitespace>SEQUENCE<whitespace>GROUP");
+					return 1;
+				} else {
+					continue;
+				}
+			}
+			if (group) {
+				if (strcasecmp(group, g)) {
+					continue;
+				}
+			}
+            if (!strcmp(bcg[bgcnt].b.seq.s,"seq")) continue;
+
+            // dual indexed indicated by a dash in the sequence...
+			if (bcg[bgcnt].b.dual=strchr(bcg[bgcnt].b.seq.s,'-')) {
+				*bcg[bgcnt].b.dual = '\0';
+				++bcg[bgcnt].b.dual;
+				bcg[bgcnt].b.dual_n = strlen(bcg[bgcnt].b.dual);
+			}
+            // group pointer for this group
+			bcg[bgcnt].gptr = getgroup(g);
+            bcg[bgcnt].b.id.n=strlen(bcg[bgcnt].b.id.s);
+            bcg[bgcnt].b.seq.n=strlen(bcg[bgcnt].b.seq.s);
+
+            if (debug) fprintf(stderr, "BCG: %d bc:%s n:%d\n", bgcnt, bcg[bgcnt].b.seq.s, bcg[bgcnt].b.seq.n);
+            ++bgcnt;
+        }
+
+		if (!bgcnt) {
+			fprintf(stderr,"No barcodes %s from guide list %s.\n", group ? "matched" : "read", list);
+			return 1;
+		}
+
+        int sampcnt = 200000;
+        struct stat st;
+		int fsum[f_n], fmax[f_n]; int bestcnt=0, besti=-1, bestdual=0;
+		int dfsum[f_n], dfmax[f_n]; int dbestcnt=0, dbesti=-1;
+		meminit(fsum); meminit(fmax); meminit(dfsum); meminit(dfmax);
+
+        // subsample to determine group to use
+		for (i=0;i<(usefile1?1:f_n);++i) {
+            char *s = NULL; size_t na = 0; int nr = 0, ns = 0;
+            char *q = NULL; size_t nq = 0;
+			double tots=0, totsq=0;
+			
+			stat(in[i], &st);
+
+			while (getline(&s, &na, fin[i]) > 0) {
+				if (*s != '@')  {
+					fprintf(stderr,"Invalid fastq file: %s.\n", in[i]);
+					exit(1);
+				}
+
+				if ((ns=getline(&s, &na, fin[i])) <=0)
+					break;
+
+				getline(&q, &nq, fin[i]);
+				getline(&q, &nq, fin[i]);
+
+				s[--ns]='\0'; q[ns]='\0';
+
+// skip if quality is below average
+				if (st.st_size > (sampcnt * 500) && poorqual(i, ns, s, q)) 
+					continue;
+	
+				for (b=0;b<bgcnt;++b) {
+                    // matches front of read?
+					if (!strncasecmp(s, bcg[b].b.seq.s, bcg[b].b.seq.n)) {
+						++bcg[b].bcnt[i];
+					} else if (!strncasecmp(s+1, bcg[b].b.seq.s, bcg[b].b.seq.n)) {
+                        // shifted read?
+						++bcg[b].bscnt[i];
+                    }
+
+					if (ns >= bcg[b].b.seq.n && !strcasecmp(s+ns-bcg[b].b.seq.n, bcg[b].b.seq.s)) {
+						++bcg[b].ecnt[i]; 
+					} else if (ns > bcg[b].b.seq.n && !strncasecmp(s+ns-bcg[b].b.seq.n-1, bcg[b].b.seq.s, bcg[b].b.seq.n)) {
+						++bcg[b].escnt[i]; 
+					}
+
+					if (bcg[b].b.dual) {
+						if (!strncasecmp(s, bcg[b].b.dual, bcg[b].b.dual_n)) {
+							++bcg[b].dbcnt[i];
+                        }
+
+						if (ns >= bcg[b].b.dual_n && !strcasecmp(s+ns-bcg[b].b.dual_n, bcg[b].b.dual)) {
+							++bcg[b].decnt[i];
+                        }
+					}
+				}	
+				
+				++nr;
+                // got enough reads?
+				if (nr >= sampcnt) 
+                    break;
+			}
+
+			for (b=0;b<bgcnt;++b) {
+				// highest count
+				int hcnt = (int) (max(bcg[b].bcnt[i],bcg[b].ecnt[i]) * log(bcg[b].b.seq.n));
+				fsum[i]+=hcnt;
+				if (hcnt > fmax[i])
+					fmax[i]=hcnt;
+
+				if (fsum[i] > bestcnt)  {
+                    if (debug > 1) 
+                        fprintf(stderr,"file %d(%s), bcg: %s, file-sum: %d, bestsum: %d\n", i, in[i], bcg[b].gptr->id, fsum[i], bestcnt);
+
+					bestcnt=fsum[i];
+					besti=i;
+					bestdual=(bcg[b].b.dual!=NULL);
+				}
+
+                if (debug > 1) 
+                    fprintf(stderr,"dual %d(%s), bcg: %s, file-sum: %d, bestsum: %d\n", i, in[i], bcg[b].gptr->id, dfsum[i], dbestcnt);
+
+				if (bcg[b].b.dual) {
+					// highest count
+					int dcnt = (int) (max(bcg[b].dbcnt[i],bcg[b].decnt[i]) * log(bcg[b].b.dual_n));
+					dfsum[i]+=dcnt;
+					if (dcnt > dfmax[i])
+						dfmax[i]=dcnt;
+					if (dfsum[i] > dbestcnt)  {
+                        if (debug > 1) 
+                            fprintf(stderr,"dual %d(%s), bcg: %s, file-sum: %d, bestsum: %d\n", i, in[i], bcg[b].gptr->id, dfsum[i], dbestcnt);
+						dbestcnt=dfsum[i];
+						dbesti=i;
+					}
+				}
+                        }
+			if (debug > 0) fprintf(stderr,"file-best %d sum:%d, max:%d\n", besti, fsum[besti], fmax[besti]);
+			if (debug > 0 && bestdual) fprintf(stderr,"dual file-best %d sum:%d, max:%d\n", dbesti, dfsum[dbesti], dfmax[dbesti]);
+		}
+
+        // chosen file is "besti"
+		i=usefile1?0:besti;
+
+		int gmax=0, gindex=-1, scnt = 0, ecnt=0, dscnt = 0, decnt = 0;
+		int thresh = (int) (pickmaxpct*fmax[i]); 
+
+		if (debug > 0) fprintf(stderr,"besti: %d thresh: %d, dual: %d\n", besti, thresh, bestdual);
+		for (b=0;b<bgcnt;++b) {
+			int hcnt = (int) (max(bcg[b].bcnt[i],bcg[b].ecnt[i]) * log(bcg[b].b.seq.n));
+			if (debug > 1) fprintf(stderr,"cnt: %s %s hc:%d bc:%d ec: %d\n", bcg[b].b.id.s, bcg[b].b.seq.s, hcnt, bcg[b].bcnt[i], bcg[b].ecnt[i]);
+			if (hcnt >= thresh) {
+				// increase group count	
+				bcg[b].gptr->tcnt += hcnt;
+				if (bcg[b].gptr->tcnt > gmax) {
+					gindex=bcg[b].gptr->i;
+					gmax=bcg[b].gptr->tcnt;
+				}
+			}
+		}
+		if (gindex == -1) {
+            fprintf(stderr, "Unable to determine barcode group\n");
+			exit(1);
+		}
+//		printf("gmax: %d, gindex %d, %s, thresh: %d\n", gmax, gindex, grs[gindex].id, thresh);
+
+        for (b=0;b<bgcnt;++b) {
+			if (bcg[b].gptr->i == gindex) {
+				if (bcg[b].bcnt[i] > bcg[b].ecnt[i]) {
+					scnt+=bcg[b].dbcnt[i];
+				} else if (bcg[b].bcnt[i] < bcg[b].ecnt[i]) {
+					ecnt+=bcg[b].decnt[i];
+				}
+				if (bcg[b].dbcnt[dbesti] > bcg[b].decnt[dbesti]) {
+					dscnt+=bcg[b].dbcnt[dbesti];
+				} else if (bcg[b].dbcnt[dbesti] < bcg[b].decnt[dbesti]) {
+					decnt+=bcg[b].decnt[dbesti];
+				}
+			}
+		};
+		end = scnt >= ecnt ? 'b' : 'e';
+
+		if (debug) fprintf(stderr,"scnt: %d, ecnt, %d, end: %c\n", scnt, ecnt, end);
+
+		thresh/=threshfactor;
+		if (bestdual) 
+		    thresh/=5;
+
+		// since this is a known good set, use a very low threshold, just to catch them all
+        fprintf(stderr, "Using Barcode Group: %s on File: %s (%s), Threshold %2.2f%%\n", 
+        grs[gindex].id, in[i], endstr(end), 100.0 * (float) ((float)thresh/THFIXFACTOR)/sampcnt);
+
+		if (bestdual) {
+			dend = dscnt >= decnt ? 'b' : 'e';
+			fprintf(stderr, "Dual index on File: %s (%s)\n", in[dbesti], endstr(dend));
+			dual=true;
+			for (b=0;b<bgcnt;++b) {
+				// trim down a bit, but later should trim down to "both-match"
+				if (bcg[b].gptr->i == gindex) {
+					if (bcg[b].decnt[dbesti] < bcg[b].ecnt[i]) 
+						bcg[b].ecnt[i] = bcg[b].decnt[dbesti];
+					if (bcg[b].dbcnt[dbesti] < bcg[b].bcnt[i]) 
+						bcg[b].bcnt[i] = bcg[b].dbcnt[dbesti];
+				}
+			}
+		}
+
+        for (b=0;b<bgcnt;++b) {
+			if (bcg[b].gptr->i == gindex) {
+				int cnt = (end == 'e' ? (bcg[b].ecnt[i]+bcg[b].escnt[i]) : ( bcg[b].bcnt[i] + bcg[b].bscnt[i] ));
+				if (cnt > thresh/THFIXFACTOR) {
+					// count exceeds threshold... use it
+					bc[bcnt]=bcg[b].b;
+					if ((end == 'e' && (bcg[b].escnt[i] < 1.2*bcg[b].ecnt[i])) ||
+					    (end == 'b' && (bcg[b].bscnt[i] < 1.2*bcg[b].bcnt[i]))
+					  ) {
+						if (!dual)
+							fprintf(stderr, "Using Barcode %s (%s)\n", bcg[b].b.id.s, bcg[b].b.seq.s);
+
+						if (debug) fprintf(stderr, "Debug Barcode %s (%s-%s) ... ecnt:%d, escnt:%d,bcnt:%d, bscnt:%d\n", bcg[b].b.id.s, bcg[b].b.seq.s, bcg[b].b.dual, bcg[b].ecnt[i], bcg[b].escnt[i], bcg[b].bcnt[i], bcg[b].bscnt[i]);
+
+					} else {
+						bc[bcnt].shifted=1;
+
+						if (!dual)
+							fprintf(stderr, "Using Barcode %s (%s) shifted\n", bcg[b].b.id.s, bcg[b].b.seq.s);
+
+						if (debug) printf("Debug Barcode %s (%s-%s) shifted ... ecnt:%d, escnt:%d,bcnt:%d, bscnt:%d\n", bcg[b].b.id.s, bcg[b].b.seq.s, bcg[b].b.dual, bcg[b].ecnt[i], bcg[b].escnt[i], bcg[b].bcnt[i], bcg[b].bscnt[i]);
+					}
+					++bcnt;
+				}
+			}
+		}
+
+		if (i != 0) {
+			// in[0] needs to be the guide file
+			FILE *f = fin[0];
+			char *n = in[0];
+			const char *o = out[0];
+			bool gzi = gzin[0];
+			fin[0]=fin[i];
+			in[0]=in[i];
+			out[0]=out[i];
+			gzin[0]=gzin[i];
+			fin[i]=f;
+			in[i]=n;
+			out[i]=o;
+			gzin[i]=gzi;
+			// swap file in to position 1 if dual
+			if (dual && dbesti != 1) {
+				FILE *f = fin[1];
+				char *n = in[1];
+				const char *o = out[1];
+				bool gzi = gzin[1];
+				fin[1]=fin[dbesti];
+				in[1]=in[dbesti];
+				out[1]=out[dbesti];
+				gzin[1]=gzin[dbesti];
+				fin[dbesti]=f;
+				in[dbesti]=n;
+				out[dbesti]=o;
+				gzin[dbesti]=gzi;
+			}
+		}
+		if (bcg) free(bcg);
+	} else if (guide) {
+		// use the first file as a "guide file" ... and select a set of codes from that
+		FILE *gin = fin[0];
+
+		int blen = 0;
+	
+		int sampcnt = 100000;
+		struct stat st;
+		stat(guide, &st);
+
+		char *s = NULL; size_t na = 0; int nr = 0, ns = 0;
+		char *q = NULL; size_t nq = 0;
+
+// small sample to get lengths
+		double tots=0, totsq=0;
+		while (getline(&s, &na, gin) > 0) {
+			if (*s != '@')  {
+				fprintf(stderr,"Invalid fastq file: %s.\n", in[0]);
+				exit(1);
+			}
+			if ((ns=getline(&s, &na, gin)) <=0)
+				break;
+			getline(&q, &nq, gin);
+			getline(&q, &nq, gin);
+			--ns;
+			tots+=ns;
+			totsq+=ns*ns;
+			++nr;
+			if (nr >= 200) break;
+		}
+		double dev = stdev(nr, tots, totsq);
+
+		// short, and nonvarying (by much, depends on the tech used)
+		if (dev < .25 && roundl(tots/nr) < 12) {
+			// most probably a barcode-only read
+			blen = (int) round(tots/nr);
+			end = 'b';
+		} else if (round(tots/nr) < 12) {
+			fprintf(stderr, "File %s looks to be barcode-only, but it's length deviation is too high (%.4g)\n", in[0], dev);
+			return 1;
+		} else {
+			fprintf(stderr, "File %s isn't a barcode-only file, try using -l instead\n", in[0]);
+			return 1;
+		}
+
+		fprintf(stderr, "Barcode length used: %d (%s)\n", blen, endstr(end));
+
+		// load a table of possble codes
+		pickmax=0;
+		picktab=NULL;
+		bnode * ent = NULL;
+        while (getline(&s, &na, gin) > 0) {
+			if (*s != '@')  {
+				fprintf(stderr,"Invalid fastq file: %s.\n", in[i]);
+				exit(1);
+			}
+
+			if ((ns=getline(&s, &na, gin)) <=0)
+				break;
+
+			getline(&q, &nq, gin);
+			if (getline(&q, &nq, gin) != ns)
+				break;
+
+			s[--ns]='\0'; q[ns]='\0';
+
+			if (st.st_size > (sampcnt * 500) && poorqual(i, ns, s, q)) 
+				continue;
+
+            ++nr;
+
+			char *p;
+			if (end == 'b') {
+				p=s;
+			} else {
+				p=s+nr-blen;
+			}
+			p[blen]='\0';
+			if (!ent)		// make a new ent 
+				ent = (bnode *) malloc(sizeof(*ent));
+
+			if (strchr(p, 'N')||strchr(p, 'n'))
+				continue;
+
+			ent->cnt=0;
+			strcpy(ent->seq=(char*)malloc(strlen(p)+1), p);
+
+			bnode *fent = * (bnode**)  tsearch(ent, &picktab, bnodecomp);
+
+			if (fent == ent)	// used the ent, added to tree
+				ent = NULL;	// need a new one
+
+			++fent->cnt;
+
+			if (fent->cnt > pickmax) pickmax=fent->cnt;
+			
+			if (nr > sampcnt)
+				break;
+		}
+		pickmax=max(1,(int)(pickmaxpct*pickmax));
+		fprintf(stderr, "Threshold used: %d\n", pickmax);
+		twalk(picktab, pickbest);
+	} else {
+		// user specifies a list of barcodes, indexed read is f[0] and f[1] if dual
+		FILE *bin = fopen(bfil, "r");
+		if (!bin) {
+			fprintf(stderr, "Error opening file '%s': %s\n",bfil, strerror(errno));
+			return 1;
+		}
+
+		bcnt = 0;
+		int ok;
+		while (bcnt < MAX_BARCODE_NUM && (ok = read_line(bin, bc[bcnt].id))) {
+			if (ok <= 0) break;
+			if (bc[bcnt].id.s[0]=='#') continue;
+			bc[bcnt].id.s=strtok(bc[bcnt].id.s, "\t\n\r ");
+			bc[bcnt].seq.s=strtok(NULL, "\t\n\r ");
+			if (!bc[bcnt].seq.s) {
+				fprintf(stderr, "Barcode file '%s' required format is 'ID SEQ'\n",bfil);
+				return 1;
+			}
+            if (bc[bcnt].dual=strchr(bc[bcnt].seq.s,'-')) {
+                *bc[bcnt].dual = '\0';
+                ++bc[bcnt].dual;
+				bc[bcnt].dual_n = strlen(bc[bcnt].dual);
+				dual=true;
+            }
+			bc[bcnt].id.n=strlen(bc[bcnt].id.s);
+			bc[bcnt].seq.n=strlen(bc[bcnt].seq.s);
+			if (debug) fprintf(stderr, "BC: %d bc:%s n:%d\n", bcnt, bc[bcnt].seq.s, bc[bcnt].seq.n);
+			++bcnt; 
+		}
+
+        fprintf(stderr, "Using Barcode File: %s\n", bfil);
+	}
+
+	if (noexec) {
+		int b;
+        	for (b=0;b<bcnt;++b) {
+			fprintf(stdout, "%s %s\n", bc[b].id.s, bc[b].seq.s);
+		}
+		exit(0);
+	}
+
+	// for whatever reason, the end is not supplied... easy enough to determine accurately
+	// or it's dual... which means we need to resample stuff
+    if (end == '\0' || dual) {
+        for (i=0;i<f_n;++i) {
+            if (!gzin[i])
+                fseek(fin[i],0,0);
+            else {
+                pclose(fin[i]);
+                fin[i]=gzopen(in[i],"r",&gzin[i]);
+            }
+        }
+
+        int sampcnt = dual ? 200000 : 10000;
+        struct stat st;
+        stat(in[0], &st);
+        char *s = NULL; size_t na = 0; int nr = 0, ns = 0;
+        char *q = NULL; size_t nq = 0;
+        int ne=0, nb=0, dne=0, dnb=0, tcount=0, read_ok=0;
+
+        int *recount = dual ? ((int *) malloc(sizeof(int)*bcnt)) : NULL;
+        if (dual) memset(recount, 0, sizeof(int)*bcnt);
+
+        struct fq fq[2]; meminit(fq);
+
+        while (read_ok=read_fq(fin[0], nr, &fq[0])) {
+            fq[0].id.s[--fq[0].id.n]='\0';
+
+            if (dual)
+                read_fq(fin[1], nr, &fq[1]);
+            ++nr;
+
+            if (st.st_size > (sampcnt * 500) && poorqual(0, fq[0].seq.n, fq[0].seq.s, fq[0].qual.s)) 
+                continue;
+
+            if (dual)
+                if (st.st_size > (sampcnt * 500) && poorqual(1, fq[1].seq.n, fq[1].seq.s, fq[1].qual.s))
+                    continue;
+
+            for (i=0;i<bcnt;++i) {
+                int dok = 0;
+                if (debug > 1) fprintf(stderr, "check %s vs %s: %s vs %s", fq[0].id.s, bc[i].id.s, fq[0].seq.s, bc[i].seq.s);
+                if (!strncmp(fq[0].seq.s, bc[i].seq.s, bc[i].seq.n)) {
+                    ++nb;
+                    ++dok;
+                } else if (!strncmp(fq[0].seq.s+fq[0].seq.n-bc[i].seq.n, bc[i].seq.s, bc[i].seq.n)) {
+                    ++ne;
+                    ++dok;
+                }
+                if (dual) {
+                    if (debug > 1) fprintf(stderr, ", dual: %s vs %s, ", fq[1].seq.s, bc[i].dual);
+                    if (!strncmp(fq[1].seq.s, bc[i].dual, bc[i].dual_n)) {
+                        ++dnb;
+                        ++dok;
+                    } else if (!strncmp(fq[1].seq.s+fq[1].seq.n-bc[i].dual_n, bc[i].dual, bc[i].dual_n)) {
+                        ++dne;
+                        ++dok;
+                    }
+                }
+                if (debug > 1) fprintf(stderr, ", dok:%d, i:%d\n", dok, i);
+                if (dok == 2) {
+                    ++recount[i];
+                    ++tcount;
+                    break;
+                }
+            }
+
+            if (nr >= sampcnt) 
+                break;
+        }
+
+        end = (ne > nb) ? 'e' : 'b';
+        fprintf(stderr, "End used: %s\n", endstr(end));
+
+        if (dual && list) {
+            // trim down possiblities to reduce number of open files, and small stub files
+            dend = (dne > dnb) ? 'e' : 'b';
+            fprintf(stderr, "Dual-end used: %s\n", endstr(dend));
+            int ocnt = bcnt;
+            // this should allow up to a 300 plex
+            int thresh = max(1,tcount/1000);
+            thresh /= threshfactor;
+            bcnt=0;
+            if (debug)
+                fprintf(stderr, "dual resample threshold: %d out of %d\n", thresh, tcount);
+            for (i=0;i<ocnt;++i) {
+                if (recount[i] >= thresh) {
+                    fprintf(stderr, "Using Barcode %s (%s-%s)\n", bc[i].id.s, bc[i].seq.s, bc[i].dual);
+                    if (debug)
+                        fprintf(stderr, "%d >= %d\n", recount[i], thresh);
+                    bc[bcnt].seq=bc[i].seq;
+                    bc[bcnt].id=bc[i].id;
+                    bc[bcnt].dual=bc[i].dual;
+                    bc[bcnt].dual_n=bc[i].dual_n;
+                    ++bcnt;
+                } else {
+                    if (debug)
+                        fprintf(stderr, "skipping barcode %s (%s-%s), %d < %d\n", bc[i].id.s, bc[i].seq.s, bc[i].dual, recount[i], thresh);
+                }
+            }
+        }
+    }
+
+	if (bcnt == 0) { 
+		fprintf(stderr, "No barcodes defined, quitting.\n");
+		exit(1);
+	}
+
+	// one beyond barcode count is unmatched
+	bc[bcnt].id.s=(char *)"unmatched";
+
+	// TODO: output barcode read ...but only for unmatched?
+	int b;
+    for (b=0;b<=bcnt;++b) {
+		for (i=0;i<f_n;++i) {
+			if (!strcasecmp(out[i],"n/a") || !strcasecmp(out[i],"/dev/null")) {
+				bc[b].out[i] = NULL;
+				bc[b].fout[i] = NULL;
+				continue;
+			}
+			const char *p=strchr(out[i],'%');
+			if (!p) fail("Each output file name must contain a '%%' sign, which is replaced by the barcode id\n");
+			bc[b].out[i]=(char *) malloc(strlen(out[i])+strlen(bc[b].id.s)+100);
+			strncpy(bc[b].out[i], out[i], p-out[i]);
+			bc[b].out[i][p-out[i]]='\0';
+			strcat(bc[b].out[i], bc[b].id.s);
+			strcat(bc[b].out[i], p+1);
+			if (!(bc[b].fout[i]=gzopen(bc[b].out[i], "w", &bc[b].gzout[i]))) {
+				fprintf(stderr, "Error opening output file '%s': %s\n",bc[b].out[i], strerror(errno));
+				return 1;
+			}
+		}
+	}
+
+	// seek back to beginning of fastq
+	for (i=0;i<f_n;++i) {
+		if (!gzin[i])
+			fseek(fin[i],0,0);
+		else {
+			pclose(fin[i]);
+			fin[i]=gzopen(in[i],"r",&gzin[i]);
+		}
+	}
+
+    // don't trim if you're not outputting the read
+
+	struct fq fq[6];	
+        meminit(fq);
+
+	int nrec=0;
+	int nerr=0;
+	int nok=0;
+	int ntooshort=0;
+	int ntrim=0;
+	int nbtrim=0;
+	int read_ok;
+
+    // ACTUAL DEMUX HAPPENS HERE
+	// read in 1 record from EACH file supplied
+	while (read_ok=read_fq(fin[0], nrec, &fq[0])) {
+		for (i=1;i<f_n;++i) {
+			int mate_ok=read_fq(fin[i], nrec, &fq[i]);
+			if (read_ok != mate_ok) {
+				fprintf(stderr, "# of rows in mate file '%s' doesn't match primary file, quitting!\n", in[i]);
+				return 1;
+			}
+			if (verify) {
+				// verify 1 in 100
+				if (0 == (nrec % 100)) {
+					char *p=strchr(fq[i].id.s,verify);
+					if (!p) {
+						fprintf(stderr, "File %s is missing id verification char %c at line %d", in[i], verify, nrec*4+1);
+						return 1;
+					}
+					int l = p-fq[i].id.s;
+					if (strncmp(fq[0].id.s, fq[i].id.s, l)) {
+						fprintf(stderr, "File %s, id doesn't match file %s at line %d", in[0], in[i], nrec*4+1);
+						return 1;
+					}
+				}
+			}
+		}
+		++nrec;
+		if (read_ok < 0) continue;
+
+		int i, best=-1, bestmm=mismatch+distance+1, bestd=mismatch+distance+1, next_best=mismatch+distance*2+1;
+
+		if (debug) {
+			fq[0].id.s[fq[0].id.n-1] = '\0';
+			fprintf(stderr, "id: %s, seq: %s %d", fq[0].id.s, fq[0].seq.s, fq[0].seq.n);
+			if (dual) fprintf(stderr, ", sdual: %s %d", fq[1].seq.s, fq[1].seq.n);
+			fq[0].id.s[fq[0].id.n] = '\n';
+			if (debug > 1) printf("\n");
+            if (!memcmp(fq[0].id.s, "HWI-ST1000:199:C0KG2ACXX:6:1101:1497:1878",41)) {
+                printf("HERE %d\n", debug);
+                exit(0);
+            }	
+		}
+
+        if (quality > 0) {
+            for (i=0;i<fq[0].seq.n;++i) {
+                if (fq[0].qual.s[i]<quality) {
+                    fq[0].seq.s[i]='N';
+                }
+            }
+        }
+
+        // for each barcode
+        for (i =0; i < bcnt; ++i) {
+            int d;
+            if (end == 'e') {
+                if (bc[i].shifted) {
+                    if (fq[0].seq.n > bc[i].seq.n) {
+                        d=hd(fq[0].seq.s+fq[0].seq.n-bc[i].seq.n-1, bc[i].seq.s, bc[i].seq.n);
+                    } else {
+                        d=bc[i].seq.n;
+                    }
+                } else {
+                    if (fq[0].seq.n >= bc[i].seq.n) {
+                        d=hd(fq[0].seq.s+fq[0].seq.n-bc[i].seq.n, bc[i].seq.s, bc[i].seq.n);
+                    } else {
+                        d=bc[i].seq.n;
+                    }
+                }
+
+                if (dual) {
+                    // distance is added in for duals
+                    if (fq[1].seq.n >= bc[i].dual_n) {
+                        d+=hd(fq[1].seq.s+fq[1].seq.n-bc[i].dual_n, bc[i].dual, bc[i].dual_n);
+                    } else {
+                        d+=bc[i].dual_n;
+                    }
+                }
+            } else {
+                if (bc[i].shifted) 
+                    d=hd(fq[0].seq.s+1,bc[i].seq.s, bc[i].seq.n);
+                else
+                    d=hd(fq[0].seq.s,bc[i].seq.s, bc[i].seq.n);
+
+                // distance is added in for duals
+                if (dual) 
+                    d+=hd(fq[1].seq.s,bc[i].dual, bc[i].dual_n);
+
+                //				if (debug > 1) {
+                //					fprintf(stderr, "index: %d dist: %d bc:%s n:%d", i, d, bc[i].seq.s, bc[i].seq.n);
+                //					if (dual) fprintf(stderr, ", idual: %s %d", bc[i].dual, bc[i].dual_n);
+                //					fprintf(stderr, "\n");
+                //				}
+            }
+            // simple... 
+            if (d < bestd) {
+                next_best=bestd;
+                bestd=d;
+                if (debug > 1) fprintf(stderr,"next_dist: %d, best_seq: %s:%d\n", next_best, bc[i].seq.s, bestd);
+            }
+            // if exact match
+            if (d==0) { 
+                if (debug) fprintf(stderr, ", found bc: %d bc:%s n:%d, bestd: %d, next_best: %d", i, bc[i].seq.s, bc[i].seq.n, bestd, next_best);
+                best=i; 
+                break; 
+            } else if (d <= mismatch) {
+                // if ok match
+                if (d == bestmm) {
+                    best=-1;		// more than 1 match... bad
+                } else if (d < bestmm) {
+                    bestmm=d;		// best match...ok
+                    best=i;
+                }
+            }
+        }
+
+        if ((best >= 0) && distance && (next_best-bestd) < distance) {
+            if (debug) fprintf(stderr, "%d<%d, skipping", next_best-bestd, distance);
+            // match is ok, but distance is poor
+            ++poor_distance;
+            best=-1;
+        }
+
+        bool trimmed = false;
+        // only trim if you're outputting the sequence
+		if (trim && best >= 0 && bc[best].fout[0]) {
+			// todo: save trimmed
+            trimmed = true;
+			int len=bc[best].seq.n;
+			if (end =='b') {
+				memmove(fq[0].seq.s, fq[0].seq.s+len, fq[0].seq.n-len);
+				memmove(fq[0].qual.s, fq[0].qual.s+len, fq[0].seq.n-len);
+			}
+			fq[0].seq.s[fq[0].seq.n-len]='\0';
+			fq[0].qual.s[fq[0].qual.n-len]='\0';
+		}
+
+		if (best < 0) {
+            // shuttle to unmatched file
+			best=bcnt;
+		}
+
+		if (debug) fprintf(stderr, ", best: %d %s\n", best, bc[best].id.s);
+
+		++bc[best].cnt;
+
+		for (i=0;i<f_n;++i) {
+			FILE *f=bc[best].fout[i];
+			if (!f) continue;
+            if (!trimmed) {
+			    // todo: capture always, not just when trim is off
+                *strrchr(fq[i].id.s, '\n') = '\0';
+                fputs(fq[i].id.s,f);
+                fputc(' ', f);
+                fputs(fq[0].seq.s,f);
+                if (dual) {
+                    fputc('-', f);
+                    fputs(fq[1].seq.s,f);
+                }
+                fputc('\n', f);
+            } else {
+                // id still has chr
+                fputs(fq[i].id.s,f);
+            }
+            fputs(fq[i].seq.s,f);
+            fputc('\n',f);
+            fputs(fq[i].com.s,f);
+            fputs(fq[i].qual.s,f);
+            fputc('\n',f);
+		}
+	}
+
+    bool io_ok=1;
+    for (b=0;b<=bcnt;++b) {
+        for (i=0;i<f_n;++i) {
+            if (bc[b].fout[i]) {
+                if (bc[b].gzout[i]) {
+                    io_ok = io_ok && !pclose(bc[b].fout[i]);
+                } else {
+                    io_ok = io_ok && !fclose(bc[b].fout[i]);
+                }
+            }
+        }
+    }
+
+
+    if (poor_distance > 0)
+        fprintf(stderr, "Skipped because of distance < %d : %d\n", distance, poor_distance);
+
+    if (!io_ok)
+        fprintf(stderr, "Returning error because of i/o error during file close\n");
+
+	int j;
+	printf("Id\tCount\tFile(s)\n");
+	int tot=0;
+	for (i=0;i<=bcnt;++i) {
+		printf("%s\t%d", bc[i].id.s, bc[i].cnt);
+		tot+=bc[i].cnt;
+		for (j=0;j<f_n;++j) {
+			if (bc[i].out[j])
+				printf("\t%s", bc[i].out[j]);
+		}
+		printf("\n");
+	}
+	printf("total\t%d\n", tot);
+
+    if (!io_ok)
+        return 3;
+
+	return 0;
+}
+
+/*
+ * Look up the barcode group named s (case-insensitive) in the grs[] table,
+ * registering it as a new group if it has not been seen before.
+ *
+ * s: NUL-terminated group name; it is copied, the caller keeps ownership.
+ *
+ * Returns a pointer to the matching grs[] entry.  A newly created entry has
+ * tcnt=0 and i set to its own index in grs[].  Exits with status 1 when the
+ * table already holds MAX_GROUP_NUM groups, or when the copy of the name
+ * cannot be allocated.
+ */
+struct group* getgroup(char *s) {
+	int i;
+	for (i=0;i<grcnt;++i) {
+		if (!strcasecmp(s,grs[i].id)) {
+			return &grs[i];
+		}
+	}
+	if (grcnt >= MAX_GROUP_NUM) {
+		fprintf(stderr,"Too many barcode groups, quitting\n");
+		exit(1);
+	}
+	grs[grcnt].id=(char *)malloc(strlen(s)+1);
+	if (!grs[grcnt].id) {
+		// without this check an allocation failure went straight into strcpy(NULL, ...)
+		fprintf(stderr,"Out of memory allocating barcode group '%s', quitting\n", s);
+		exit(1);
+	}
+	strcpy(grs[grcnt].id,s);
+	grs[grcnt].tcnt=0;
+	grs[grcnt].i=grcnt;
+	return &grs[grcnt++];
+}
+
+/*
+ * twalk() callback used on the picktab tree of candidate barcode sequences:
+ * every node whose count exceeds pickmax is appended to the bc[] table, with
+ * the sequence text doubling as the barcode id.
+ *
+ * nodep: pointer to the tree's stored bnode pointer
+ * which: visit kind supplied by twalk(); only endorder and leaf are acted on,
+ *        so each node is processed once
+ * depth: unused
+ */
+void pickbest(const void *nodep, const VISIT which, const int depth)
+{
+	if (which != endorder && which != leaf)
+		return;
+
+	bnode *node = *(bnode **) nodep;
+
+	// not frequent enough, or the barcode table is already full: leave it out
+	if (node->cnt <= pickmax || bcnt >= MAX_BARCODE_NUM)
+		return;
+
+	// id and seq both point at the node's own string, so the node and its
+	// string are not freed here
+	char *code = node->seq;
+	bc[bcnt].seq.s = code;
+	bc[bcnt].id.s = code;
+	bc[bcnt].id.n = strlen(code);
+	bc[bcnt].seq.n = bc[bcnt].id.n;
+	++bcnt;
+}
+
+/*
+ * Print the fastq-multx command-line help to stream f.
+ *
+ * f: destination stream
+ *
+ * The "Version: %s.%d" line is filled in from VERSION and SVNREV; every
+ * literal percent sign in the text is written as %% because the whole help
+ * is a single fprintf format string.
+ */
+void usage(FILE *f) {
+	fprintf(f,
+"Usage: fastq-multx [-g|-l|-B] <barcodes.fil> <read1.fq> -o r1.%%.fq [mate.fq -o r2.%%.fq] ...\n"
+"Version: %s.%d\n"
+"\n"
+"Output files must contain a '%%' sign which is replaced with the barcode id in the barcodes file.\n"
+"Output file can be n/a to discard the corresponding data (use this for the barcode read)\n"
+"\n"
+"Barcodes file (-B) looks like this:\n"
+"\n"
+"<id1> <sequence1>\n"
+"<id2> <sequence2> ...\n"
+"\n"
+"Default is to guess the -bol or -eol based on clear stats.\n"
+"\n"
+"If -g is used, then its parameter is an index lane, and frequently occurring sequences are used.\n"
+"\n"
+"If -l is used then all barcodes in the file are tried, and the *group* with the *most* matches is chosen.\n"
+"\n"
+"Grouped barcodes file (-l or -L) looks like this:\n"
+"\n"
+"<id1> <sequence1> <group1>\n"
+"<id1> <sequence1> <group1>\n"
+"<id2> <sequence2> <group2>...\n"
+"\n"
+"Mated reads, if supplied, are kept in-sync\n"
+"\n"
+"Options:\n"
+"\n"
+"-o FIL1     Output files (one per input, required)\n"
+"-g SEQFIL   Determine barcodes from indexed read SEQFIL\n"
+"-l BCFIL    Determine barcodes from any read, using BCFIL as a master list\n"
+"-L BCFIL    Determine barcodes from <read1.fq>, using BCFIL as a master list\n"
+"-B BCFIL    Use barcodes from the specified file, don't run a determination step\n"
+"-b          Force beginning of line (5') for barcode matching\n"
+"-e          Force end of line (3') for barcode matching\n"
+"-t NUM      Divide threshold for auto-determine by factor NUM (1), > 1 = more sensitive\n"
+"-G NAME     Use group(s) matching NAME only\n"
+"-x          Don't trim barcodes off before writing out destination\n"
+"-n          Don't execute, just print likely barcode list\n"
+"-v C        Verify that mated id's match up to character C (Use ' ' for illumina)\n"
+"-m N        Allow up to N mismatches, as long as they are unique (1)\n"
+"-d N        Require a minimum distance of N between the best and next best (2)\n"
+"-q N        Require a minimum phred quality of N to accept a barcode base (0)\n"
+	,VERSION,SVNREV);
+}
diff --git a/fastq-stats.cpp b/fastq-stats.cpp
new file mode 100644
index 0000000..0a2d092
--- /dev/null
+++ b/fastq-stats.cpp
@@ -0,0 +1,672 @@
+/*
+Copyright (c) 2011 Expression Analysis / Gunjan Hariani, Erik Aronesty
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+$Id: fastq-stats.cpp 652 2013-09-17 17:40:32Z earonesty $
+*/
+// Version string substituted into the "Version: %s" line printed by usage().
+const char * VERSION = "1.01 $Id: fastq-stats.cpp 652 2013-09-17 17:40:32Z earonesty $";
+
+#include <ctype.h>
+#include <stdio.h>
+
+// Print the fastq-stats command-line help to stream f.
+//
+// f: destination stream
+//
+// The "Version: %s" line is filled in from VERSION; literal percent signs in
+// the text are written as %% because the whole help is one fprintf format.
+//
+// NOTE(review): main() also accepts "-g FIL" (stored in gc_outfile), but what
+// it writes is not visible from here, so it is left undocumented -- confirm
+// and add a line for it.
+void usage( FILE * f ) {
+  fprintf( f,
+	   "\nUsage: fastq-stats [options] <fastq-file>\n\n"
+	   "Version: %s\n"
+	   "\n"
+	   "Produces lots of easily digested statistics for the files listed\n"
+	   "\n"
+	   "Options\n"
+	   "\n"
+	   "-h     print this help and exit\n"
+	   "-c INT cyclemax: max cycles for which following quality stats are produced [35]\n"
+	   "-w INT window: max window size for generating duplicate read statistics [2000000]\n"
+	   "-d     debug: prints out debug statements\n"
+	   "-D     don't do duplicate read statistics\n"
+	   "-s INT number of top duplicate reads to display\n"
+	   "-x FIL output fastx statistics (requires an output filename)\n"
+	   "-b FIL output base breakdown by per phred quality at every cycle.\n"
+	   "       It sets cyclemax to longest read length\n"
+	   "-L FIL Output length counts \n\n"
+
+	   "\n"
+	   "The following data are printed to stdout:\n" "\n"
+	   "  reads			: #reads in the fastq file\n"
+	   "  len 	                : read length. mean and stdev are provided for variable read lengths\n"
+	   "  phred			: phred scale used\n"
+	   "  window-size		: Number of reads used to generate duplicate read statistics\n"
+	   "  cycle-max		: Number of bases to assess for duplicity\n"
+	   "  dups			: Number of reads that are duplicates\n"
+	   "  %%dup			: Pct reads that are duplicate\n"
+	   "  unique-dup seq	: Number sequences that are duplicated\n"
+	   "  min dup count		: Smallest duplicate tally for any duplicate sequence\n"
+	   "  dup seq <rank> <count> <sequence> \n"
+	   "  			: Lists top 10 most frequent duplicate reads along with count mean and stdev\n"
+	   "  qual			: Base Quality min, max and mean\n"
+	   "  %%A,%%T,%%C,%%G		: base percentages\n"
+	   "  total bases		: total number of bases\n"
+	   "\n"
+	   ,VERSION);
+
+} //end usage function
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <sys/stat.h>
+#include <string>
+#include <sparsehash/sparse_hash_map> // or sparse_hash_set, dense_hash_map, ...
+#include <iostream>
+#include "fastq-lib.h"
+#include "gcModel.h"
+
+using namespace std;
+
+// Offsets of the nucleotide letters from 'A' ('C'-'A' == 2, 'G'-'A' == 6,
+// 'T'-'A' == 19); presumably used to index the 26-slot per-letter count
+// arrays -- confirm at the use sites.
+#define T_A 0
+#define T_C 2
+#define T_G 6
+#define T_T 19
+// Round x to the nearest integer, halves rounding up.  Only correct for
+// x >= 0, because the cast truncates toward zero.  The argument and the whole
+// expansion are parenthesised so that an argument such as `c ? a : b` is
+// rounded as a whole instead of having 0.5 added to its last operand only.
+#define roundgt0(x) ((long)((x)+0.5))
+
+// A sequence string together with an integer count.
+class ent {
+public:
+    std::string seq;   // the sequence text
+    int cnt;           // count stored alongside seq
+
+    ent(const std::string &s, int c) : seq(s), cnt(c) {}
+
+    // Ordering predicate: true when a carries the larger count, so a sort
+    // using it puts the highest counts first.
+    static bool comp_cnt (const ent &a, const ent &b) {
+        return b.cnt < a.cnt;
+    }
+};
+
+// Per-cycle tallies: a count for each of the 26 letters, a count for each of
+// 127 quality values, plus running quality aggregates (qc, qsum, qmin, qmax).
+class countPerCycle {
+	public:
+	int basecount[26];		// one slot per letter A-Z
+	int qc;
+	double qsum;
+	int counts_by_qual[127];
+	int qmin;
+	int qmax;
+
+	// Everything starts at zero except qmin, which starts at 10000 so that
+	// any realistic quality value is smaller than it.
+	countPerCycle()
+		: basecount(), qc(0), qsum(0), counts_by_qual(), qmin(10000), qmax(0)
+	{
+	}
+};
+
+// Per-cycle histogram with one counter for each of 127 quality values.
+class count_perCycle_perQual {
+	public:
+	int counts_by_qual[127];
+
+	// all counters start at zero
+	count_perCycle_perQual() : counts_by_qual() {}
+};
+
+
+// Forward declarations of helpers used by main().
+void usage( FILE * f );
+double std_dev( double count , double total, double sqsum );
+double quantile( const std::vector <int> & vec, double p );
+double quantiles_with_counts(int* v, int start, int end, double p, bool dbug);
+std::string string_format( const std::string &fmt, ... );
+
+// getopt() cursor; main() resets it to 0 before parsing
+extern int optind;
+// -D: don't do duplicate read statistics
+bool nodup = 0;
+// string -> int map; main() registers "<>" as its deleted key.
+// presumably the per-sequence duplicate tallies -- confirm in main()
+google::sparse_hash_map <std::string, int> dups;
+
+vector <std::string> dup_reads; // original author's note: "do i need this"
+
+// -w: max window size for generating duplicate read statistics
+int window = 2000000;
+// -c: max cycles for which quality stats are produced
+int cyclemax = 35; 
+// passed to gcInit() when -g is given
+int gcCyclemax = 100; // to compare with fastqc, seq is rounded to nearest 100 to reduce # of gc models; for < 200 length, this is the same as max=100
+// NOTE(review): gcSum/gcTotal are not used in the visible part of the file; presumably GC accumulators -- confirm
+float gcSum;
+int gcTotal;
+
+// -s: number of top duplicate reads to display
+int show_max = 10;
+// -d: print debug statements
+bool debug = 0;
+// -x FIL: output fastx statistics to fastx_outfile
+bool fastx = 0;
+char *fastx_outfile = NULL;
+// -b FIL: output the per-cycle base breakdown by phred quality to brkdown_outfile
+bool brkdown = 0;
+char *brkdown_outfile = NULL;
+// -L FIL: output length counts to lenhist_outfile
+bool len_hist = 0;
+vector<int> vlen; // read-length histogram: vlen[n] is incremented for each read of length n
+char *lenhist_outfile = NULL;
+// -g FIL: enables the gc model (gcInit) and records the output filename
+bool gc = 0;
+char *gc_outfile = NULL;
+
+/* Open an output report for writing; on failure print the reason to stderr and return NULL. */
+static FILE *open_stats_output( const char *path ) {
+	FILE *fp = fopen(path, "w");
+	if(!fp) {
+		perror(path);
+	}
+	return fp;
+}
+
+/* Map a base character to its 0..25 (A..Z) counter slot; anything that is not a letter is counted as N. */
+static int base_slot( char base ) {
+	int slot = toupper((unsigned char) base) - 'A';
+	return (slot < 0 || slot > 25) ? ('N' - 'A') : slot;
+}
+
+/*
+ * Read FASTQ records from the file named on the command line (or stdin when
+ * no file is given) and print summary statistics to stdout: read count,
+ * length and quality ranges, duplicate counts, base composition and total
+ * bases.  Optional reports (-x per-cycle stats, -b per-cycle quality
+ * breakdown, -L length histogram, -g GC distribution) go to the files named
+ * with those options.
+ *
+ * Returns 0 on success or when help was requested, 1 on a usage error or
+ * when a file cannot be opened, otherwise the status returned by gzclose().
+ */
+int main( int argc, char**argv ) {
+
+	// getopt() returns int; a char would never compare equal to -1 where char is unsigned
+	int c;
+	optind = 0;
+	char *filename = NULL;
+
+// bad change to working syntax... breaks things!
+//	if(argc < 2) {usage(stdout); return 0;}
+
+	while ( (c = getopt (argc, argv, "?DdL:g:x:b:c:w:s:h")) != -1) {
+		switch (c) {
+			case 'c': cyclemax = atoi(optarg); break;
+			case 'D': nodup = 1; break;
+			case 'd': debug = 1; break;
+			case 'w': window = atoi(optarg); break;
+			case 's': show_max = atoi(optarg); break;
+			case 'x': fastx_outfile = optarg; fastx = 1; break;
+			case 'b': brkdown_outfile = optarg; brkdown = 1; break;
+			case 'L': len_hist = 1; lenhist_outfile = optarg; break;
+			case 'g': gc_outfile = optarg; gc = 1; break;
+			case 'h': usage(stdout); return 0;
+			case '?':
+				// optopt == 0 is treated as a plain help request; any other
+				// value means an unknown option or a missing argument
+				if (!optopt) {
+					usage(stdout); return 0;
+				}
+				usage(stderr);
+				return 1;
+		}
+	}
+
+	// argv[argc] is NULL, so filename stays NULL (read stdin) when no file was named
+	filename = argv[optind];
+
+	int lenmax = 0;
+	int lenmin = 100000000;
+	double lensum = 0;
+	double lenssq = 0;
+	double nbase = 0;
+	int qualmax = 0;
+	int qualmin = 100000;
+	double qualsum = 0;
+	double qualssq = 0;
+	int errs = 0;
+	long long nreads = 0;
+	int ndups = 0;
+	double dupss = 0;
+	bool fixlen = 0; //is fixed length
+	FILE *file;
+	struct fq newFq; meminit(newFq);
+	bool isgz = 0; // only gzopen() sets this; it must be defined when reading stdin
+	vector<countPerCycle> qcStats (1);
+	vector<count_perCycle_perQual> qcStats_by_qual (1);
+	int phred = 64;
+	double ACGTN_count[26];
+	double total_bases = 0;
+
+
+	for(int i=0; i<26; i++) {
+		ACGTN_count[i] = 0;
+	}
+	dups.set_deleted_key("<>");
+
+	if(debug) {
+		cout << endl;
+		cout << "Parameters: " << endl;
+		printf("cyclemax: %d, window: %d, nodup: %d, debug: %d, showmax: %d, fastx: %d, outfile: %s, breakdown: %s, gc: %s\n",
+		       cyclemax, window, nodup, debug, show_max, fastx, fastx_outfile, brkdown_outfile, gc_outfile);
+		cout << endl;
+	}
+
+	if(gc) {
+		gcInit(gcCyclemax);
+	}
+
+	//read file
+	file = filename ? gzopen(filename,"r",&isgz) : stdin;
+	if(!file) {
+		fprintf(stderr, "Can't open input file '%s'\n", filename);
+		return 1;
+	}
+
+	// nreads is incremented before the body runs, so it is 1-based inside the loop
+	while(read_fq(file,nreads++,&newFq)) {
+
+		if(newFq.seq.n != newFq.qual.n) {
+			errs++;
+		}
+
+		// after 10000 reads with zero length deviation, assume fixed-length input
+		// and stop tracking length statistics
+		if(nreads == 10000) {
+			if(!std_dev((double)nreads,lensum,lenssq)) {
+				fixlen = 1;
+			}
+		}
+
+		total_bases += newFq.seq.n;
+		if(len_hist) {
+			// vlen is indexed by read length, so it needs seq.n+1 slots
+			if((size_t) newFq.seq.n >= vlen.size())
+				vlen.resize(newFq.seq.n+1);
+			++vlen[newFq.seq.n];
+		}
+
+		if(!fixlen) {
+			if(newFq.seq.n > lenmax) {
+				lenmax = newFq.seq.n;
+			}
+			if(newFq.seq.n < lenmin) {
+				lenmin = newFq.seq.n;
+			}
+			lensum += newFq.seq.n;
+			// square in double so very long reads cannot overflow int
+			lenssq += (double) newFq.seq.n * newFq.seq.n;
+		}
+
+
+		if((newFq.seq.n > qcStats.size()) && (fastx)) {
+			qcStats.resize(newFq.seq.n,countPerCycle());
+		}
+
+		if((newFq.seq.n > qcStats_by_qual.size()) && (brkdown) && (!fastx)) {
+			qcStats_by_qual.resize(newFq.seq.n,count_perCycle_perQual());
+		}
+
+		// per-cycle tables use every read inside the window and every 10th read after it
+		bool sampled = (nreads < window) || (nreads%10 == 0);
+
+		int gcTally = 0;
+		//compute quality stats for the first cyclemax bases
+		for(int i=0; i < newFq.seq.n; i++) {
+			int ascii_val = (int) newFq.qual.s[i];
+			int base = base_slot(newFq.seq.s[i]);
+			if(fastx && sampled) {
+				qcStats[i].qc++;
+				qcStats[i].counts_by_qual[ascii_val]++;
+				qcStats[i].qsum += ascii_val;
+				qcStats[i].basecount[base]++;
+				if(ascii_val < qcStats[i].qmin) {
+					qcStats[i].qmin = ascii_val;
+				}
+				if(ascii_val > qcStats[i].qmax) {
+					qcStats[i].qmax = ascii_val;
+				}
+			}
+			if(brkdown && (!fastx) && sampled) {
+				qcStats_by_qual[i].counts_by_qual[ascii_val]++;
+			}
+
+			if (i < cyclemax) {
+				nbase++;
+
+				if(ascii_val > qualmax) {
+					qualmax = ascii_val;
+				}
+				if(ascii_val < qualmin) {
+					qualmin = ascii_val;
+				}
+				qualsum += ascii_val;
+				qualssq += ascii_val*ascii_val;
+
+				ACGTN_count[base]++;
+			}
+
+			if (gc && i < gcCyclemax) {
+				int upper = toupper(newFq.seq.s[i]);
+				if(upper == 'G' || upper == 'C') {
+					gcTally++;
+				}
+			}
+		}
+		if(gc) {
+			int gcReadLength = newFq.seq.n > gcCyclemax? gcCyclemax : newFq.seq.n;
+			gcProcessSequence(gcReadLength, gcTally);
+			// an empty read has no GC fraction; skipping it keeps 0/0 (NaN) out of the running mean
+			if(gcReadLength > 0) {
+				gcSum += (float)( gcTally )/gcReadLength;
+				gcTotal++;
+			}
+		}
+
+		if(!nodup) {//if you want to look at duplicate counts
+			// the duplicate key is the first cyclemax bases of the read
+			if(newFq.seq.n > cyclemax) {
+				newFq.seq.s[cyclemax] = '\0';
+				newFq.seq.n = cyclemax;
+			}
+
+			if(nreads < window) {
+				dups[newFq.seq.s]++;
+			} else {
+				if(dups.find(newFq.seq.s) != dups.end()) {
+					dups[newFq.seq.s]++;
+				}//make sure the element already exists in the key
+
+				// at the window boundary drop every sequence seen only once
+				if(nreads==window) {
+					google::sparse_hash_map<string,int>::iterator it = dups.begin();
+					while(it != dups.end()) {
+						if((*it).second <= 1) {
+							dups.erase(it);
+						}
+						it++;
+					} //end while loop
+				}
+			}//if nreads > window
+		} //end if you want to look for dups
+
+	} //end reading all fastq reads
+
+	// undo the increment made by the final, failed read_fq() call
+	nreads--;
+
+	int inputReadError = gzclose(file, isgz);
+
+
+	if(gc) {
+		FILE *gcfile = fopen(gc_outfile, "w");
+		if(!gcfile) {
+			perror(gc_outfile);
+		}
+		// gcPrintDistribution() falls back to stdout when handed NULL
+		gcPrintDistribution(gcfile);
+		if(gcfile) {
+			fclose(gcfile);
+		}
+		gcClose();
+	}
+
+	// collect every sequence seen more than once, most frequent first
+	std::vector<ent> dup_sort;
+	google::sparse_hash_map<string,int>::iterator it = dups.begin();
+	while(it != dups.end()) {
+		if((*it).second > 1) {
+			ent e((*it).first,(*it).second);
+			//printf("seq: %s dups:%d\n", e.seq.c_str(), e.cnt);
+			dup_sort.push_back(e);
+			ndups += (*it).second;
+			dupss += (double)(*it).second*(*it).second;
+		}
+		it++;
+	} //end while loop
+	dups.clear();
+
+	std::sort(dup_sort.begin(),dup_sort.end(),ent::comp_cnt);
+
+	if(nreads < window) {
+		window = nreads;
+	}
+
+	if(nreads < 1) {
+		cout << "No reads in " << (filename ? filename : "stdin") << ", not generating output" << endl;
+		return 0;
+	}
+	//autodetect phred
+	if(qualmin < 64) {
+		phred = 33;
+	}
+	printf("reads\t%lld\n",nreads);
+
+	if(!fixlen) {
+		printf("len\t%d\n", lenmax);
+		printf("len mean\t%.4f\n", (double)lensum/nreads);
+		if(nreads > 1) {
+			printf("len stdev\t%.4f\n", std_dev((double)nreads,lensum,lenssq));
+		}
+		printf("len min\t%d\n", lenmin);
+	} else {
+		printf("len\t%d\n",lenmax);
+	}
+
+	printf("phred\t%d\n", phred);
+	if(errs > 0) {
+		printf("errors\t%d\n", errs);
+	}
+
+
+	printf("window-size\t%d\n", window);
+	printf("cycle-max\t%d\n", cyclemax);
+
+	if(fastx) {
+
+		if(brkdown) {
+			FILE *bfile = open_stats_output(brkdown_outfile);
+			if(!bfile) {
+				return 1;
+			}
+			fprintf(bfile,"Cycle\tQuality\tCount\n");
+
+			for(int i=0; i<qcStats.size(); i++) {
+				for(int j=qcStats[i].qmin; j<=qcStats[i].qmax; j++) {
+					fprintf(bfile,"%d\t%d\t%d\n",(i+1),(j-phred),qcStats[i].counts_by_qual[j]);
+				}
+			}
+			fclose(bfile);
+		}
+
+
+		FILE *myfile = open_stats_output(fastx_outfile);
+		if(!myfile) {
+			return 1;
+		}
+		fprintf(myfile,"column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\tA_count\tC_count\tG_count\tT_count\tN_count\tMax_count\n");
+		for(int i=0; i<qcStats.size(); i++) {
+			int A_tot = 0;
+			int C_tot = 0;
+			int G_tot = 0;
+			int T_tot = 0;
+			int N_tot = 0;
+			// everything that is not A, C, G or T is reported as N
+			for(int j=0; j<26; j++) {
+				if(j==T_A) {
+					A_tot += qcStats[i].basecount[j];
+				} else if(j==T_C) {
+					C_tot += qcStats[i].basecount[j];
+				} else if(j==T_G) {
+					G_tot += qcStats[i].basecount[j];
+				} else if(j==T_T) {
+					T_tot += qcStats[i].basecount[j];
+				} else {
+					N_tot += qcStats[i].basecount[j];
+				}
+			}
+
+			double q1 = quantiles_with_counts(qcStats[i].counts_by_qual,qcStats[i].qmin,qcStats[i].qmax,.25,0)-phred;
+			double med = quantiles_with_counts(qcStats[i].counts_by_qual,qcStats[i].qmin,qcStats[i].qmax,.5,0)-phred;
+			double q3 = quantiles_with_counts(qcStats[i].counts_by_qual,qcStats[i].qmin,qcStats[i].qmax,.75,0)-phred;
+
+			double iqr = q3-q1;
+			int lW = 0;
+			int rW = 0;
+
+			// left whisker: lowest observed quality at or above Q1 - 1.5*IQR
+			int low_bound = round(q1-iqr*1.5);
+			if(low_bound <= (qcStats[i].qmin-phred)) {
+				lW = qcStats[i].qmin-phred;
+			} else {
+				for(int low=(low_bound+phred);low<=qcStats[i].qmax;low++) {
+					if(qcStats[i].counts_by_qual[low] > 0) {
+						lW = low-phred;
+						low = qcStats[i].qmax+1;
+					}
+				}
+			}
+
+			// right whisker: highest observed quality at or below Q3 + 1.5*IQR
+			int up_bound = round(q3+iqr*1.5);
+			if(up_bound >= (qcStats[i].qmax-phred)) {
+				rW = qcStats[i].qmax-phred;
+			} else {
+				for(int up=(up_bound+phred);up>=qualmin;up--) {
+					if(qcStats[i].counts_by_qual[up] > 0) {
+						rW = up-phred;
+						up = qcStats[i].qmin-1;
+					}
+				}
+			}
+
+			fprintf(myfile,"%d\t%d\t%d\t%d\t%.0f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t", (i+1), qcStats[i].qc, (qcStats[i].qmin-phred),
+			                        (qcStats[i].qmax-phred), (qcStats[i].qsum-qcStats[i].qc*phred),
+									(qcStats[i].qsum/qcStats[i].qc-phred),
+									 q1, med, q3,iqr, lW, rW);
+			fprintf(myfile,"%d\t%d\t%d\t%d\t%d\t%lld\n", A_tot, C_tot, G_tot, T_tot, N_tot,nreads);
+
+		}
+		fclose(myfile);
+	}
+
+	if(brkdown && (!fastx)) {
+		FILE *myfile = open_stats_output(brkdown_outfile);
+		if(!myfile) {
+			return 1;
+		}
+		fprintf(myfile,"Cycle\tQuality\tCount\n");
+		for(int i=0; i<qcStats_by_qual.size(); i++) {
+			for(int j=qualmin; j<=qualmax; j++) {
+				fprintf(myfile,"%d\t%d\t%d\n",(i+1),(j-phred),qcStats_by_qual[i].counts_by_qual[j]);
+			}
+		}
+		fclose(myfile);
+	}
+
+	if(len_hist) {
+		FILE *myfile = open_stats_output(lenhist_outfile);
+		if(!myfile) {
+			return 1;
+		}
+		fprintf(myfile,"Length\tCount\n");
+		// valid indexes are 0 .. size()-1
+		for(size_t len_i=0; len_i<vlen.size(); len_i++) {
+			if(vlen[len_i]) {
+				fprintf(myfile,"%d\t%d\n", (int) len_i,vlen[len_i]);
+			}
+		}
+		fclose(myfile);
+	}
+
+	int uniq_dup = (int)dup_sort.size();
+	if(debug) {
+		cout << endl;
+		cout << "unique duplicates\t" << uniq_dup << endl;
+		cout << "total duplicates\t" << ndups << endl;
+		cout << endl;
+	}
+	if (uniq_dup && !nodup) {
+		// ndups counts every occurrence; subtracting uniq_dup leaves only the extra copies
+		printf("dups\t%d\n",ndups-uniq_dup);
+		printf("%%dup\t%.4f\n", ((double)(ndups-uniq_dup)/nreads)*100);
+		printf("unique-dup seq\t%d\n", uniq_dup);
+		printf("min dup count\t%d\n", dup_sort.back().cnt);
+
+
+		for(int i=0; i<show_max; i++) {
+			if(i < dup_sort.size()) {
+				if(dup_sort.at(i).cnt != 0) {
+					cout << "dup seq \t" << (i+1) << "\t" <<  (dup_sort.at(i).cnt-1) << "\t" << dup_sort.at(i).seq << endl;
+				}
+			} else { i = show_max; }
+		}
+
+		if(uniq_dup > 1) {
+			printf("dup mean\t%.4f\n", (double)ndups/uniq_dup);
+			printf("dup stddev\t%.4f\n", (std_dev((double)uniq_dup, ndups, dupss)));
+		}
+	}
+	printf("qual min\t%d\n", qualmin-phred);
+	printf("qual max\t%d\n", qualmax-phred);
+	printf("qual mean\t%.4f\n", ((double)qualsum/nbase)-phred);
+	printf("qual stdev\t%.4f\n", std_dev((double)nbase,qualsum,qualssq));
+
+
+	if(gc) {
+		// put these where they belong
+		printf("pct-gc cycle-max\t%d\n", gcCyclemax);
+		printf("pct-gc mean\t%.2f\n", 100.0 * gcSum / gcTotal);
+	}
+
+	printf("%%A\t%.4f\n", ((double)ACGTN_count[T_A]/nbase*100));
+	printf("%%C\t%.4f\n", ((double)ACGTN_count[T_C]/nbase*100));
+	printf("%%G\t%.4f\n", ((double)ACGTN_count[T_G]/nbase*100));
+	printf("%%T\t%.4f\n", ((double)ACGTN_count[T_T]/nbase*100));
+	double ACGT_total  = ACGTN_count[T_A] + ACGTN_count[T_C] + ACGTN_count[T_G] + ACGTN_count[T_T];
+	printf("%%N\t%.4f\n", ((double)(nbase-ACGT_total)/nbase*100));
+	printf("total bases\t%.0f\n",total_bases);
+
+	if (inputReadError) {
+		printf("error\t%s\n", "error during close, output may be invalid");
+	}
+
+	// fail if input read failed....  even if we don't know why and reported all the stats
+	return inputReadError;
+
+} //end main method
+
+/*
+ * Linearly interpolated quantile of an already sorted vector.
+ *
+ * vec: values in ascending order
+ * p:   requested quantile; values outside [0,1] are clamped to the first/last element
+ *
+ * Returns 0 for an empty vector, otherwise the value at fractional
+ * position p*(size-1), interpolating between the two neighbouring elements.
+ */
+double quantile( const std::vector <int> & vec, double p ) {
+	if ( vec . empty() ) {
+		return 0;
+	}
+	const double last = (double) vec . size() - 1;
+	double t = last * p;
+	// keep the index inside the vector for p < 0 or p > 1
+	if ( t < 0 ) t = 0;
+	if ( t > last ) t = last;
+	int it = (int) t;
+	int v = vec [it];
+	if ( t > (double) it ) {
+		return ( v + (t-it) * ( vec [ it + 1 ] - v ) );
+	}
+	else {
+		return v;
+	}
+} //end quantile function
+
+/*
+ * printf-style formatting into a std::string.
+ *
+ * fmt: printf format string; the remaining arguments are its values.
+ * Returns the formatted text, sized to exactly the formatted length.
+ *
+ * NOTE(review): va_start on a reference parameter is formally undefined
+ * behaviour; the signature is kept because the prototype is shared with callers.
+ */
+std::string string_format( const std::string &fmt, ... ) {
+	int size = 100;
+	std::string str;
+	va_list ap;
+	while (1) {
+		str . resize(size);
+		va_start( ap, fmt );
+		int n = vsnprintf( &str[0], size, fmt . c_str(), ap );
+		va_end(ap);
+		if ( n > -1 && n < size ) {
+			// drop the unused tail so the result carries no embedded NUL padding
+			str . resize(n);
+			return str;
+		}
+		// n >= size: the exact length is known; n < 0: unknown, so just grow
+		if ( n > -1 ) size = n + 1;
+		else size *= 2;
+	}
+} //end string_format function
+
+/*
+ * Sample standard deviation from running totals.
+ *
+ * count: number of observations
+ * total: sum of the observations
+ * sqsum: sum of the squared observations
+ *
+ * Returns sqrt(sqsum/(count-1) - total^2/(count*(count-1))).  The result is
+ * not meaningful for count <= 1 (division by zero).
+ */
+double std_dev(double count, double total, double sqsum) {
+	if(debug) {
+		cout << endl;
+		cout << "count " << count << " total " << total << " sqsum " << sqsum << endl;
+		cout << endl;
+	}
+	double variance = sqsum/(count-1)-(total/count *total/(count-1));
+	// rounding can push a mathematically zero variance slightly below zero,
+	// which would make sqrt() return NaN instead of 0
+	if(variance < 0) {
+		variance = 0;
+	}
+	return sqrt(variance);
+}
+
+/*
+ * Interpolated quantile of a histogram.
+ *
+ * v:     counts, where v[i] is the number of observations with value i
+ * start: first value (index) to consider, inclusive
+ * end:   last value (index) to consider, inclusive
+ * p:     requested quantile
+ * dbug:  when true, trace the computation on stdout
+ *
+ * Returns the value at fractional rank p*(N-1), N being the sum of the
+ * counts, interpolating between the values found at the two neighbouring ranks.
+ */
+double quantiles_with_counts(int *v, int start, int end, double p, bool dbug) {
+	// total number of observations in [start, end]
+	int total = 0;
+	for(int idx = start; idx <= end; ++idx) {
+		if(dbug) {
+			std::cout << "i: " << idx << " v[i]: " << v[idx] << std::endl;
+		}
+		total += v[idx];
+	}
+
+	const double q = p*(total-1);
+	const int count_skip = (int) q;
+	double val = -1;
+	int v_next = -1;
+	bool have_val = false;
+
+	if(dbug) {
+		std::cout << "p : " << p << std::endl;
+		std::cout << "v-size: " << total << std::endl;
+		std::cout << "q : " << q << std::endl;
+		std::cout << "count-skip: " << count_skip << std::endl;
+	}
+
+	// walk the cumulative counts to find the values at rank count_skip and count_skip+1
+	int running = 0;
+	for(int idx = start; idx <= end; ++idx) {
+		running += v[idx];
+		if(!have_val && running > count_skip) {
+			val = idx;
+			if(dbug) {
+				std::cout << "val : " << val << " val-count: " << v[idx] << std::endl;
+			}
+			have_val = true;
+		}
+		if(running > count_skip + 1) {
+			v_next = idx;
+			if(dbug) {
+				std::cout << "val_next : " << v_next << " val-count: " << v[idx] << std::endl;
+			}
+			break;
+		}
+	}
+
+	// an integral rank needs no interpolation
+	if(!(q > count_skip)) {
+		return val;
+	}
+	if(dbug) {
+		std::cout << "v_next - val " << (v_next-val) << std::endl;
+	}
+	return (val + (q-count_skip)*(v_next-val));
+}
+
diff --git a/fastx-graph b/fastx-graph
new file mode 100755
index 0000000..a06a5ab
--- /dev/null
+++ b/fastx-graph
@@ -0,0 +1,149 @@
+#!/usr/bin/Rscript --vanilla
+
+# Copyright (c) 2011 Expression Analysis / Gunjan Hariani, Erik Aronesty
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+# 
+# $Id: fastx-graph 525 2012-12-25 19:41:22Z earonesty $
+ 
+# Draws a per-cycle quality box plot with base-composition overlays from the
+# table written by "fastq-stats -x", optionally adding a %GC inset.
+
+# getopt parses the command line; it is installed on first use if missing.
+if (!require(getopt)) {
+    write(c("Installing package on:",system("hostname",intern=TRUE)),file=stderr())
+    install.packages('getopt',repos='http://R-Forge.R-project.org')
+    # will die if it fails at this point
+    library(getopt)
+}
+
+# Hmisc supplies minor.tick(), used for the axis decoration below.
+if (!require("Hmisc")) {
+    write(c("Installing package on:",system("hostname",intern=TRUE)),file=stderr())
+    install.packages('Hmisc',repos='http://R-Forge.R-project.org')
+    # will die if it fails at this point
+    library("Hmisc")
+}
+
+# columns: long name, short flag, argument mode (0 = none, 1 = required), type, description
+spec <- matrix(c(
+        'input'  , 'i', 1, "character", "file from fastq-stats -x (required)",
+        'gc'     , 'G', 1, "character", "input gc content file (optional)",
+        'out'    , 'o', 1, "character", "output filename (optional)",
+        'help'   , 'h', 0, "logical",   "this help"
+),ncol=5,byrow=TRUE)
+
+opt = getopt(spec);
+
+# print usage and stop when help is requested or the required input is missing
+if (!is.null(opt$help) || is.null(opt$input)) {
+    cat(paste(getopt(spec, usage=TRUE, command="fastx-graph"),"\n"));
+    q();
+}
+
+in.file <- opt$input
+gc.file <- opt$gc
+out.file <- opt$out
+
+fastx <- read.table(in.file,sep="\t",header=TRUE,as.is=TRUE)
+
+# output is in, minus txt, plus png
+if (is.null(out.file)) {
+    # the dot is escaped so that only a literal ".txt" suffix is stripped
+    in.file <- sub("\\.txt$","",in.file)
+    out.file <- paste(in.file,".png",sep="")
+}
+
+# correct for bug in R if the file has a % sign in it
+out.file <- gsub("%","%%",out.file)
+
+png(out.file,width=1000, height=500)
+
+# empty frame: y spans the whisker range (at least 0..41), x spans the cycles
+par(mar=c(4,3,5,5),xaxs="i",yaxs="i",xpd=TRUE)
+plot(c(0,0),pch="",ylim=c(min(c(0,fastx$lW)),max(41,fastx$rW)),xlim=c(0,(nrow(fastx)+1)),
+	xlab="",ylab="",las=2,cex.axis=.85,cex.lab=.85,xaxt="n")
+
+lims <- par("xaxp")[1:2]
+major.ticks <- pretty(lims,n=par("xaxp")[3])
+minor.tick(nx=(major.ticks[2]-major.ticks[1]),ny=10)
+
+# one labelled minor tick per cycle along the top axis
+minor.ticks <- 1:nrow(fastx)
+mtext("Cycle",side=3,line=3,cex=.85)
+axis(1,at=major.ticks,cex.axis=.85,labels=FALSE)
+axis(3,at=major.ticks,cex.axis=.85,labels=FALSE)
+axis(3,at=minor.ticks,tcl=par("tcl")*0.5,labels=minor.ticks,cex.axis=.80,las=3)
+
+colnames(fastx) <- c("column","count","min","max","sum","mean","Q1","med",
+			"Q3","IQR","lW","rW","A_count","C_count","G_count",
+			"T_count","N_count","Max_count")
+
+# one box per cycle: grey Q1..Q3 box, red median, dashed whiskers with end caps
+for(i in 1:nrow(fastx)) {
+	par(new=TRUE)
+	rect(fastx$column[i]-.35,fastx$Q1[i],fastx$column[i]+0.35,fastx$Q3[i],col="gray")
+	segments(fastx$column[i]-.35,fastx$med[i],fastx$column[i]+0.35,fastx$med[i],col="red",lwd=1.5)
+	segments(fastx$column[i],fastx$lW[i],fastx$column[i],fastx$Q1[i],lty="dashed")
+	segments(fastx$column[i],fastx$Q3[i],fastx$column[i],fastx$rW[i],lty="dashed")
+	segments(fastx$column[i]-.35,fastx$lW[i],fastx$column[i]+0.35,fastx$lW[i])
+	segments(fastx$column[i]-.35,fastx$rW[i],fastx$column[i]+0.35,fastx$rW[i])
+}
+
+# if theres a significant difference
+# (bases counted per cycle; overlaid only when the smallest total is more than 2% below the largest)
+tots<-fastx$N_count+fastx$T_count+fastx$G_count+fastx$C_count+fastx$A_count
+
+if (min(tots) < (max(tots)*.98)) {
+    par(new=TRUE)
+    plot(tots,col="gray",xaxt="n",yaxt="n",xlab="",ylab="",pch="+",ylim=c(min(tots),max(tots)))
+    lines(tots,col="gray",xaxt="n",yaxt="n",xlab="",ylab="", lty=2)
+}
+
+# percentage of each base per cycle, drawn against the right-hand 0..100 axis
+par(new=TRUE)
+plot(fastx$column,fastx$A_count*100/fastx$count,col="red",type="l",xaxt="n",yaxt="n",xlab="",ylab="",
+			ylim=c(0,100),lwd=2)
+par(new=TRUE)
+plot(fastx$column,fastx$C_count*100/fastx$count,col="blue",type="l",xaxt="n",yaxt="n",xlab="",ylab="",
+			ylim=c(0,100),lwd=2)
+par(new=TRUE)
+plot(fastx$column,fastx$G_count*100/fastx$count,col="green",type="l",xaxt="n",yaxt="n",xlab="",ylab="",
+			ylim=c(0,100),lwd=2)
+par(new=TRUE)
+plot(fastx$column,fastx$T_count*100/fastx$count,col="black",type="l",xaxt="n",yaxt="n",xlab="",ylab="",
+			ylim=c(0,100),lwd=2)
+par(new=TRUE)
+barplot(fastx$N_count*100/fastx$count,col="orange",xaxt="n",yaxt="n",xlab="",ylab="",
+			ylim=c(0,100))
+
+axis(4,at=seq(0,100,25),cex.axis=.85)
+mtext("Pct Base Distribution",side=4,line=3,cex=.85)
+mtext("Base Quality",side=2,line=2,cex=.85)
+legend(0,-4,col=c("red","blue","green","black","orange"),lty=1,
+	legend=c("A","C","G","T","N"),cex=.70,horiz=TRUE,lwd=2)
+
+# optional inset: distribution of per-read GC percentage
+if(!is.null(gc.file)) {
+	# peek at the first cell to tell the known file layouts apart
+	tmp <- read.table(gc.file,sep="\t",header=FALSE,as.is=TRUE)
+    if (tmp[1,1] == 'pct-GC') {
+        # silly legacy format
+	    GC <- read.table(gc.file,sep="\t",header=TRUE,as.is=TRUE,skip=3)
+    } else if (tmp[1,1] == 'pct_GC') {
+	    GC <- read.table(gc.file,sep="\t",header=TRUE,as.is=TRUE,skip=1)
+    } else {
+	    GC <- read.table(gc.file,sep="\t",header=TRUE,as.is=TRUE)
+    }
+    colnames(GC)=c("pct_gc", "count")
+	par(new=TRUE)
+	par(fig=c(0.1,0.2,0.45,0.60))
+	par(mar=c(0,0,1,0))
+	plot(GC$pct_gc,GC$count,type="l",xaxt="n",yaxt="n",
+			main="%GC per read",cex.main=.90)
+	axis(1,seq(0,100,20),labels=seq(0,100,20),las=2,tck=-0.1,cex.axis=.75)
+}
+
+
+graphics.off()
diff --git a/gcModel.c b/gcModel.c
new file mode 100644
index 0000000..9627dc2
--- /dev/null
+++ b/gcModel.c
@@ -0,0 +1,207 @@
+/*
+$Id: gcModel.c 564 2013-03-08 17:16:42Z earonesty $
+*/
+#include <ctype.h>
+#include <stdio.h>
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <sys/stat.h>
+#include <string>
+#include <iostream>
+
+#include "fastq-lib.h"
+#include "gcModel.h"
+
+// #define UNIT_TEST
+ 
+void gcInit(int maxReadLength);
+void gcProcessSequence(int l,int c);
+void gcPrintDistribution(FILE *fp);
+void gcClose();
+
+using namespace std;
+
+// Round to the nearest long; any value below 0.5 (negatives included) yields 0.
+// The argument is parenthesized so compound expressions expand correctly; it is evaluated twice.
+#define roundgt0(x) (long)((x)<0.5?0:(x)+0.5)
+
+// One contribution of a read to the GC histogram: add `increment`
+// to the bin for `percentage` (0..100).
+typedef struct GCModelValue {
+  int percentage;
+  double increment;
+} GC_MODEL_VALUE; // trailing typedef name used throughout this file
+
+// All histogram contributions made by one (read length, GC count) pair.
+typedef struct GCModelValues {
+  GC_MODEL_VALUE * values;
+  int valuesLength;
+} GC_MODEL_VALUES;
+
+// The models for one read length: an array indexed by GC count (0..readLength).
+typedef GC_MODEL_VALUES *GC_MODELS;
+
+
+static int claimingCounts[101]; // scratch for calcModels(): number of GC counts claiming each percentage bin
+static double gcDistribution[101]; // accumulated histogram, one bin per GC percentage 0..100
+// cachedModels[readLength][gcCount], built by gcInit()
+static GC_MODELS *cachedModels;
+// largest read length passed to gcInit(); -1 until gcInit() has been called
+static int gMaxReadLength = -1;
+
+// Compute the inclusive range of percentage bins [*low, *high] covered by a
+// GC count of `pos` (widened by half a base either side) in a read of
+// `readLength` bases.  A zero-length read maps to bin 0.
+static void gcPercentRange(int pos, int readLength, int *low, int *high) {
+  if (readLength <= 0) {
+    // avoids 0/0 and the undefined NaN-to-integer conversion that follows it
+    *low = 0;
+    *high = 0;
+    return;
+  }
+
+  double lowCount = pos-0.5;
+  double highCount = pos+0.5;
+
+  if (lowCount < 0.0) lowCount = 0.0;
+  if (highCount < 0.0) highCount = 0.0;
+  if (highCount > readLength) highCount = readLength;
+  if (lowCount > readLength) lowCount = readLength;
+
+  *low = (int)roundgt0(((lowCount*100) / readLength));
+  *high = (int)roundgt0(((highCount*100) / readLength));
+}
+
+/*
+ * Build the GC models for reads of exactly `readLength` bases.
+ *
+ * Returns a malloc'ed array of readLength+1 entries, indexed by GC count.
+ * Entry `pos` lists the percentage bins that GC count contributes to; each
+ * bin's increment is 1 divided by the number of GC counts sharing that bin.
+ * The caller owns the array and every entry's `values` buffer.
+ * Exits the program if memory cannot be allocated.
+ */
+GC_MODEL_VALUES *calcModels(int readLength) {
+
+  memset(claimingCounts,0,sizeof(claimingCounts));
+
+  GC_MODEL_VALUES *models = (GC_MODEL_VALUES *) calloc(readLength+1, sizeof(GC_MODEL_VALUES ));
+  if (models == NULL) {
+    perror("calcModels");
+    exit(EXIT_FAILURE);
+  }
+
+  // First pass: count how many GC counts claim each percentage bin.
+  for (int pos=0;pos<=readLength;pos++) {
+    int lowPercentage, highPercentage;
+    gcPercentRange(pos, readLength, &lowPercentage, &highPercentage);
+
+    for (int p=lowPercentage;p<=highPercentage;p++) {
+      claimingCounts[p]++;
+    }
+  }
+
+  // We now do a second pass to make up the model using the weightings
+  // we calculated previously.
+
+  for (int pos=0;pos<=readLength;pos++) {
+    int lowPercentage, highPercentage;
+    gcPercentRange(pos, readLength, &lowPercentage, &highPercentage);
+
+    int span = (highPercentage-lowPercentage)+1;
+    models[pos].values = (GC_MODEL_VALUE *) calloc(span, sizeof(GC_MODEL_VALUE) );
+    if (models[pos].values == NULL) {
+      perror("calcModels");
+      exit(EXIT_FAILURE);
+    }
+    models[pos].valuesLength = span;
+
+    for (int p=lowPercentage;p<=highPercentage;p++) {
+      models[pos].values[p-lowPercentage].percentage = p;
+      models[pos].values[p-lowPercentage].increment = 1.0/claimingCounts[p];
+    }
+  }
+
+  return (models);
+}
+
+
+/*
+ * Add one read to the GC distribution.
+ *
+ * l: read length, 0 .. the maximum given to gcInit()
+ * c: number of G/C bases in the read, 0 .. l
+ *
+ * Out-of-range input is reported on stdout and the read is skipped, because
+ * indexing the model table with it would read outside the allocated models.
+ */
+void gcProcessSequence(int l,int c) {
+
+  if(l > gMaxReadLength) {
+    printf("Error: read length (%d) exceeds specified maximum length(%d)\n", l, gMaxReadLength);
+    return;
+  }
+  if(c > l) {
+    printf("Error: GC-count (%d) exceeds actual read length(%d)\n", c, l);
+    return;
+  }
+  // negative values, or a call before gcInit(), have no model to look up
+  if(l < 0 || c < 0 || cachedModels == NULL) {
+    return;
+  }
+
+  GC_MODEL_VALUE *values = cachedModels[l][c].values;
+
+  for(int i=0; i < cachedModels[l][c].valuesLength; i++) {
+    gcDistribution[values[i].percentage] += values[i].increment;
+  }
+
+}
+
+// Debug helper: dump, for read length `rl`, the percentage/increment pairs
+// that each GC count 0..rl contributes, one GC count per line on stdout.
+void printModels(int rl) {
+  GC_MODEL_VALUES *models = cachedModels[rl];
+
+  printf("## Model values for read length=%d\n",rl);
+
+  for(int gcCount = 0; gcCount <= rl; gcCount++) {
+    const GC_MODEL_VALUES &entry = models[gcCount];
+    printf("%d: ",gcCount);
+    for(int k = 0; k < entry.valuesLength; k++) {
+      const GC_MODEL_VALUE &mv = entry.values[k];
+      printf("%d,%.2f ",mv.percentage, mv.increment);
+    }
+    printf("\n");
+  }
+}
+
+// Write the accumulated GC histogram as a two-column, tab-separated table
+// (header line, then one row per percentage 0..100).
+// fp: destination stream; NULL means stdout.
+void gcPrintDistribution(FILE *fp) {
+  FILE *out = (fp != NULL) ? fp : stdout;
+
+  fprintf(out, "pct_GC\tCount\n");
+  int pct = 0;
+  while(pct <= 100) {
+    fprintf(out, "%d\t%.2f\n",pct,gcDistribution[pct]);
+    ++pct;
+  }
+}
+
+// Release every model built by gcInit().  Safe to call when gcInit() was
+// never called, and safe to call more than once.
+void gcClose() {
+  if(gMaxReadLength < 0)return; // never initialized
+
+  // gcInit() builds models for read lengths 0..gMaxReadLength inclusive,
+  // so the last one must be freed as well
+  for(int rl = 0; rl <= gMaxReadLength; rl++) {
+    GC_MODEL_VALUES * m =  cachedModels[rl];
+    for(int i = 0; i <= rl; i++) {
+      free(m[i].values);
+    }
+    free(m);
+  }
+
+  free(cachedModels);
+
+  // back to the "never initialized" state so a second call is a no-op
+  cachedModels = NULL;
+  gMaxReadLength = -1;
+}
+
+// Reset the GC histogram and precompute the models for every read length
+// from 0 to maxReadLength inclusive.
+void gcInit(int maxReadLength) {
+  const int modelCount = maxReadLength+1;
+
+  gMaxReadLength = maxReadLength;
+  memset(gcDistribution,0,sizeof(gcDistribution));
+
+  // original code fills this in,caching, as necessary
+  // here, we just build all models at outset:
+  cachedModels = (GC_MODELS*)malloc(modelCount * sizeof(GC_MODELS));
+  for(int len = 0; len < modelCount; ++len) {
+    cachedModels[len] = calcModels(len);
+  }
+}
+
+#ifdef UNIT_TEST
+// Stand-alone smoke test (compiled only with -DUNIT_TEST): build the models
+// for read lengths up to 5 and print those for length 4.
+int main() {
+
+  //  int maxReadLength = 35;
+  int maxReadLength = 5;
+
+  gcInit(maxReadLength);
+
+  // ***
+  // simulate processing A sequence:
+  //  int seqLength = 3; // this sequence's length
+  //  int gcCount = 2; // total G's & C's -- count 'em
+  /*
+  for(int i = 0; i < 10000000; i++) {
+    gcProcessSequence(35,15);
+  }
+  for(int i = 0; i < 5000000; i++) {
+    gcProcessSequence(35,10);
+  }
+  */
+
+    printModels(4);
+  //  exit(0);
+
+  //  for(int pos=0; pos <= maxReadLength; pos++) {
+  //    gcProcessSequence(maxReadLength,pos);
+    //  }
+
+    //  gcProcessSequence(3,0);
+    //  gcProcessSequence(4,2);
+    //  gcProcessSequence(5,4);
+
+    //  gcPrintDistribution(NULL);
+
+
+  gcClose();
+
+  return 0;
+}
+#endif
diff --git a/gcModel.h b/gcModel.h
new file mode 100644
index 0000000..e75fcf5
--- /dev/null
+++ b/gcModel.h
@@ -0,0 +1,7 @@
+/*
+$Id: gcModel.h 556 2013-03-01 15:32:36Z earonesty $
+*/
+/* Reset the GC histogram and build the models for read lengths 0..maxReadLength. */
+extern void gcInit(int maxReadLength);
+/* Add one read of length l containing c G/C bases to the histogram. */
+extern void gcProcessSequence(int l,int c);
+/* Write the histogram as a tab-separated table to fp (stdout when fp is NULL). */
+extern void gcPrintDistribution(FILE *fp);
+/* Free the models built by gcInit(); does nothing if gcInit() was never called. */
+void gcClose();
diff --git a/gtf2bed b/gtf2bed
new file mode 100755
index 0000000..c624e81
--- /dev/null
+++ b/gtf2bed
@@ -0,0 +1,116 @@
+#!/usr/bin/perl
+
+# Copyright (c) 2011 Erik Aronesty (erik at q32.com)
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+# 
+# ALSO, IT WOULD BE NICE IF YOU LET ME KNOW YOU USED IT.
+
+use Data::Dumper;
+
+# Input file: GTF, or GFF version 2/3; may be .gz or .zip compressed.
+$in = shift @ARGV;
+die "usage: gtf2bed <annotation.gtf|.gff>[.gz|.zip]\n" unless defined $in;
+
+# Compressed input is read through a decompressor pipe.  The list form of
+# open runs the command without a shell, so odd file names are safe.
+if ($in =~ /\.gz$/) {
+	open(IN, "-|", "gunzip", "-c", $in) or die "can't run gunzip on $in: $!\n";
+} elsif ($in =~ /\.zip$/) {
+	open(IN, "-|", "unzip", "-p", $in) or die "can't run unzip on $in: $!\n";
+} elsif ($in eq '-') {
+	open(IN, "<&", \*STDIN) or die "can't read stdin: $!\n";
+} else {
+	open(IN, "<", $in) or die "can't open $in: $!\n";
+}
+
+# Collect the features of every transcript, keyed by transcript id:
+#   %exons - exon (and miRNA) lines         %trans - first exon/miRNA line seen
+#   %cds   - CDS lines                      %cdx   - first CDS line seen
+#   %sc    - [start_codon start, stop_codon end]
+while (<IN>) {
+	$gff = 2 if /^##gff-version 2/;
+	$gff = 3 if /^##gff-version 3/;
+	next if /^#/ && $gff;
+
+	s/\s+$//;
+	# 0-chr 1-src 2-feat 3-beg 4-end 5-scor 6-dir 7-fram 8-attr
+	my @f = split /\t/;
+	if ($gff) {
+		# most ver 2's stick gene names in the id field
+		($id) = $f[8]=~ /\bID="([^"]+)"/;
+		# most ver 3's stick unquoted names in the name field
+		($id) = $f[8]=~ /\bName=([^";]+)/ if !$id && $gff == 3;
+	} else {
+		($id) = $f[8]=~ /transcript_id "([^"]+)"/;
+	}
+
+	next unless $id && $f[0];
+
+	if ($f[2] eq 'exon') {
+		die "no position at exon on line $." if ! $f[3];
+		# gff3 puts :\d in exons sometimes
+		$id =~ s/:\d+$// if $gff == 3;
+		push @{$exons{$id}}, \@f;
+		# remember the first exon line seen for this transcript
+		$trans{$id} = \@f if !$trans{$id};
+	} elsif ($f[2] eq 'start_codon') {
+		#optional, output codon start/stop as "thick" region in bed
+		$sc{$id}->[0] = $f[3];
+	} elsif ($f[2] eq 'CDS') {
+		#optional, output codon start/stop as "thick" region in bed
+		push @{$cds{$id}}, \@f;
+		# remember the first CDS line seen for this transcript
+		$cdx{$id} = \@f if !$cdx{$id};
+	} elsif ($f[2] eq 'stop_codon') {
+		$sc{$id}->[1] = $f[4];
+	} elsif ($f[2] eq 'miRNA' ) {
+		$trans{$id} = \@f if !$trans{$id};
+		push @{$exons{$id}}, \@f;
+	}
+}
+
+# Emit one 12-column BED line per transcript.
+for $id (
+	# sort by chr then pos
+	sort {
+		$trans{$a}->[0] eq $trans{$b}->[0] ?
+		$trans{$a}->[3] <=> $trans{$b}->[3] :
+		$trans{$a}->[0] cmp $trans{$b}->[0]
+	} (keys(%trans)) ) {
+		# chromosome and strand come from the transcript's first recorded line
+		my ($chr, $dir) = @{$trans{$id}}[0, 6];
+
+		# thick region: start_codon start / stop_codon end, when present
+		my ($cds, $cde);
+		($cds, $cde) = @{$sc{$id}} if $sc{$id};
+
+		# sort by pos
+		my @ex = sort {
+			$a->[3] <=> $b->[3]
+		} @{$exons{$id}};
+
+		my $beg = $ex[0][3];
+		my $end = $ex[-1][4];
+
+		if ($dir eq '-') {
+			# swap
+			($cds, $cde) = ($cde, $cds);
+			$cds -= 2 if $cds;
+			$cde += 2 if $cde;
+		}
+
+		# not specified, just use exons
+		$cds = $beg if !$cds;
+		$cde = $end if !$cde;
+
+		# adjust start for bed
+		--$beg; --$cds;
+
+		my $exn = @ex;										# exon count
+		my $exst = join ",", map {$_->[3]-$beg-1} @ex;		# exon start
+		my $exsz = join ",", map {$_->[4]-$_->[3]+1} @ex;	# exon size
+
+		# added an extra comma to make it look exactly like ucsc's beds
+		print "$chr\t$beg\t$end\t$id\t0\t$dir\t$cds\t$cde\t0\t$exn\t$exsz,\t$exst,\n";
+}
+
+
+close IN;
diff --git a/kaln.c b/kaln.c
new file mode 100644
index 0000000..9c0bbaa
--- /dev/null
+++ b/kaln.c
@@ -0,0 +1,486 @@
+/* The MIT License
+
+   Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3 at gmail.com>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+#include "kaln.h"
+
+#define FROM_M 0
+#define FROM_I 1
+#define FROM_D 2
+
+/* One step of an alignment traceback: the DP cell (i, j) that was visited
+ * and the state (FROM_M, FROM_I or FROM_D) it was visited in. */
+typedef struct {
+	int i;
+	int j;
+	unsigned char ctype;
+} path_t;
+
+/* 22x22 substitution matrix stored row by row as a flat array; the header
+ * comment below gives the residue order of the columns (rows use the same
+ * order). It is paired with a row width of 22 in ka_param_aa2aa. */
+int aln_sm_blosum62[] = {
+/*	 A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  *  X */
+	 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
+	-1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
+	-2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
+	-2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
+	 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
+	-1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
+	-1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
+	 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
+	-2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
+	-1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
+	-1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
+	-1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
+	-1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
+	-2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
+	-1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
+	 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
+	 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
+	-3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
+	-2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
+	 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
+	-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
+	 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
+};
+
+/* 5x5 score matrix stored row by row: +1 on the diagonal of the first four
+ * symbols, -3 between them, and -2 for anything involving the fifth symbol.
+ * NOTE(review): the fifth symbol is presumably the ambiguous base -- confirm. */
+int aln_sm_blast[] = {
+	1, -3, -3, -3, -2,
+	-3, 1, -3, -3, -2,
+	-3, -3, 1, -3, -2,
+	-3, -3, -3, 1, -2,
+	-2, -2, -2, -2, -2
+};
+
+/* 5x5 score matrix stored row by row: 0 on the diagonal and for the fifth
+ * symbol, -23 for any mismatch among the first four symbols. */
+int aln_sm_qual[] = {
+	  0, -23, -23, -23, 0,
+	-23,   0, -23, -23, 0,
+	-23, -23,   0, -23, 0,
+	-23, -23, -23,   0, 0,
+	  0,   0,   0,   0, 0
+};
+
+/* Ready-made parameter sets. The pointer in each initializer is the score
+ * matrix and the number after it matches that matrix's row width (5 or 22).
+ * NOTE(review): the meaning and order of the remaining numbers follow the
+ * ka_param_t / ka_param2_t declarations in kaln.h, which are not visible
+ * here -- presumably gap penalties first and band width last; confirm. */
+ka_param_t ka_param_blast = {  5,  2,   5, 2, aln_sm_blast, 5, 50 };
+ka_param_t ka_param_aa2aa = { 10,  2,  10, 2, aln_sm_blosum62, 22, 50 };
+
+ka_param2_t ka_param2_qual  = { 37, 11, 37, 11, 37, 11, 0, 0, aln_sm_qual, 5, 50 };
+
+/* Collapse a traceback path into a packed CIGAR array.
+ *
+ * path holds path_len steps in reverse order (the last alignment column
+ * comes first), so the array is walked from its end towards index 0. Each
+ * output word is (run_length << 4) | ctype. On success *n_cigar receives
+ * the number of words and the caller owns the returned calloc()ed buffer.
+ * Returns 0 with *n_cigar = 0 for an empty path or if allocation fails. */
+static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
+{
+	int i, n;
+	uint32_t *cigar;
+	unsigned char last_type;
+
+	/* a non-positive length would index path[] out of bounds below */
+	if (path_len <= 0 || path == 0) {
+		*n_cigar = 0;
+		return 0;
+	}
+
+	/* first pass: count the runs of identical ctype to size the output */
+	last_type = path->ctype;
+	for (i = n = 1; i < path_len; ++i) {
+		if (last_type != path[i].ctype) ++n;
+		last_type = path[i].ctype;
+	}
+	cigar = (uint32_t*)calloc(n, sizeof(uint32_t));
+	if (cigar == 0) {
+		*n_cigar = 0;
+		return 0;
+	}
+	*n_cigar = n;
+
+	/* second pass: emit the runs, starting from the end of the path */
+	cigar[0] = 1u << 4 | path[path_len-1].ctype;
+	last_type = path[path_len-1].ctype;
+	for (i = path_len - 2, n = 0; i >= 0; --i) {
+		if (path[i].ctype == last_type) cigar[n] += 1u << 4;
+		else {
+			cigar[++n] = 1u << 4 | path[i].ctype;
+			last_type = path[i].ctype;
+		}
+	}
+
+	return cigar;
+}
+
+/***************************/
+/* START OF common_align.c */
+/***************************/
+
+/* Set all three scores (M, I, D) of DP cell s to MINOR_INF. Note that the
+ * expansion already ends in ';'. */
+#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
+
+/* Match-state update: MM = max(p->M, p->I, p->D) + sc, and record in
+ * cur->Mt which of the three predecessor states supplied the maximum.
+ * Ties between M and the others go to M; a tie between I and D goes to D. */
+#define set_M(MM, cur, p, sc)							\
+{														\
+	if ((p)->M >= (p)->I) {								\
+		if ((p)->M >= (p)->D) {							\
+			(MM) = (p)->M + (sc); (cur)->Mt = FROM_M;	\
+		} else {										\
+			(MM) = (p)->D + (sc); (cur)->Mt = FROM_D;	\
+		}												\
+	} else {											\
+		if ((p)->I > (p)->D) {							\
+			(MM) = (p)->I + (sc); (cur)->Mt = FROM_I;	\
+		} else {										\
+			(MM) = (p)->D + (sc); (cur)->Mt = FROM_D;	\
+		}												\
+	}													\
+}
+/* Insertion-state update with interior gap penalties: either open a gap
+ * from p->M (cost gap_open + gap_ext) or extend p->I (cost gap_ext); the
+ * choice is recorded in cur->It. Relies on gap_open and gap_ext being
+ * variables in the expanding scope. */
+#define set_I(II, cur, p)								\
+{														\
+	if ((p)->M - gap_open > (p)->I) {					\
+		(cur)->It = FROM_M;								\
+		(II) = (p)->M - gap_open - gap_ext;				\
+	} else {											\
+		(cur)->It = FROM_I;								\
+		(II) = (p)->I - gap_ext;						\
+	}													\
+}
+/* Same as set_I() but with the end-gap penalties gap_end_open and
+ * gap_end_ext. A negative gap_end_ext disables the special case and falls
+ * back to set_I(). */
+#define set_end_I(II, cur, p)							\
+{														\
+	if (gap_end_ext >= 0) {								\
+		if ((p)->M - gap_end_open > (p)->I) {			\
+			(cur)->It = FROM_M;							\
+			(II) = (p)->M - gap_end_open - gap_end_ext;	\
+		} else {										\
+			(cur)->It = FROM_I;							\
+			(II) = (p)->I - gap_end_ext;				\
+		}												\
+	} else set_I(II, cur, p);							\
+}
+/* Deletion-state update with interior gap penalties: either open a gap
+ * from p->M (cost gap_open + gap_ext) or extend p->D (cost gap_ext); the
+ * choice is recorded in cur->Dt. Relies on gap_open and gap_ext being
+ * variables in the expanding scope. */
+#define set_D(DD, cur, p)								\
+{														\
+	if ((p)->M - gap_open > (p)->D) {					\
+		(cur)->Dt = FROM_M;								\
+		(DD) = (p)->M - gap_open - gap_ext;				\
+	} else {											\
+		(cur)->Dt = FROM_D;								\
+		(DD) = (p)->D - gap_ext;						\
+	}													\
+}
+/* Same as set_D() but with the end-gap penalties gap_end_open and
+ * gap_end_ext. A negative gap_end_ext disables the special case and falls
+ * back to set_D(). */
+#define set_end_D(DD, cur, p)							\
+{														\
+	if (gap_end_ext >= 0) {								\
+		if ((p)->M - gap_end_open > (p)->D) {			\
+			(cur)->Dt = FROM_M;							\
+			(DD) = (p)->M - gap_end_open - gap_end_ext;	\
+		} else {										\
+			(cur)->Dt = FROM_D;							\
+			(DD) = (p)->D - gap_end_ext;				\
+		}												\
+	} else set_D(DD, cur, p);							\
+}
+
+/* Traceback record of one DP cell: for each of the three states, which
+ * predecessor state (FROM_M / FROM_I / FROM_D) produced its score. The
+ * three bit-fields share a single byte-sized storage unit. */
+typedef struct {
+	uint8_t Mt:3;
+	uint8_t It:2;
+	uint8_t Dt:3;
+} dpcell_t;
+
+/* Scores of one DP cell in the match (M), insertion (I) and deletion (D)
+ * states. */
+typedef struct {
+	int M;
+	int I;
+	int D;
+} dpscore_t;
+
+/***************************
+ * banded global alignment *
+ ***************************/
+/* Align seq1 (length len1) against seq2 (length len2) with the affine-gap
+ * parameters in ap, using the set_*() macros above for the recursions.
+ *
+ * The score of the match state in the final cell is stored in *_score.
+ * When n_cigar is non-NULL a traceback is also run: the function returns
+ * the packed CIGAR array built by ka_path2cigar32() (allocated with
+ * calloc(); nothing here frees it, so the caller must) and stores its
+ * length in *n_cigar. When n_cigar is NULL the return value is 0. If
+ * either length is 0 the function returns 0 without writing *_score.
+ *
+ * Sequence bytes are used directly as row (seq2) and column (seq1) indices
+ * into ap->matrix, with ap->row as the row stride.
+ * NOTE(review): none of the malloc() results are checked. */
+uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
+{
+	int i, j;
+	dpcell_t **dpcell, *q;
+	dpscore_t *curr, *last, *s;
+	int b1, b2, tmp_end;
+	int *mat, end, max = 0;
+	uint8_t type, ctype;
+	uint32_t *cigar = 0;
+
+	int gap_open, gap_ext, gap_end_open, gap_end_ext, b;
+	int *score_matrix, N_MATRIX_ROW;
+
+	/* initialize some align-related parameters. just for compatibility */
+	gap_open = ap->gap_open;
+	gap_ext = ap->gap_ext;
+	gap_end_open = ap->gap_end_open;
+	gap_end_ext = ap->gap_end_ext;
+	b = ap->band_width;
+	score_matrix = ap->matrix;
+	N_MATRIX_ROW = ap->row;
+
+	if (n_cigar) *n_cigar = 0;
+	if (len1 == 0 || len2 == 0) return 0;
+
+	/* calculate b1 and b2 */
+	/* the band is widened on the side of the longer sequence by the length
+	 * difference, then clamped to the sequence lengths */
+	if (len1 > len2) {
+		b1 = len1 - len2 + b;
+		b2 = b;
+	} else {
+		b1 = b;
+		b2 = len2 - len1 + b;
+	}
+	if (b1 > len1) b1 = len1;
+	if (b2 > len2) b2 = len2;
+	/* shift both pointers so that index 1 addresses the first residue */
+	--seq1; --seq2;
+
+	/* allocate memory */
+	end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
+	dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
+	for (j = 0; j <= len2; ++j)
+		dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
+	/* rows below the band start are offset so that dpcell[j] + i still
+	 * lands inside the row's allocation; undone again before free() */
+	for (j = b2 + 1; j <= len2; ++j)
+		dpcell[j] -= j - b2;
+	curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
+	last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
+	
+	/* set first row */
+	SET_INF(*curr); curr->M = 0;
+	for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
+		SET_INF(*s);
+		set_end_D(s->D, dpcell[0] + i, s - 1);
+	}
+	/* swap the two score rows: the row just filled becomes 'last' */
+	s = curr; curr = last; last = s;
+
+	/* core dynamic programming, part 1 */
+	tmp_end = (b2 < len2)? b2 : len2 - 1;
+	for (j = 1; j <= tmp_end; ++j) {
+		q = dpcell[j]; s = curr; SET_INF(*s);
+		set_end_I(s->I, q, last);
+		end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
+		mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+		++s; ++q;
+		for (i = 1; i != end; ++i, ++s, ++q) {
+			set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
+			set_I(s->I, q, last + i);
+			set_D(s->D, q, s - 1);
+		}
+		set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+		set_D(s->D, q, s - 1);
+		if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
+			set_end_I(s->I, q, last + i);
+		} else s->I = MINOR_INF;
+		s = curr; curr = last; last = s;
+	}
+	/* last row for part 1, use set_end_D() instead of set_D() */
+	if (j == len2 && b2 != len2 - 1) {
+		q = dpcell[j]; s = curr; SET_INF(*s);
+		set_end_I(s->I, q, last);
+		end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
+		mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+		++s; ++q;
+		for (i = 1; i != end; ++i, ++s, ++q) {
+			set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
+			set_I(s->I, q, last + i);
+			set_end_D(s->D, q, s - 1);
+		}
+		set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+		set_end_D(s->D, q, s - 1);
+		if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
+			set_end_I(s->I, q, last + i);
+		} else s->I = MINOR_INF;
+		s = curr; curr = last; last = s;
+		++j;
+	}
+
+	/* core dynamic programming, part 2 */
+	/* rows whose band starts after column 0 and ends before column len1 */
+	for (; j <= len2 - b2 + 1; ++j) {
+		SET_INF(curr[j - b2]);
+		mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+		end = j + b1 - 1;
+		for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
+			set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+			set_I(s->I, q, last + i);
+			set_D(s->D, q, s - 1);
+		}
+		set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+		set_D(s->D, q, s - 1);
+		s->I = MINOR_INF;
+		s = curr; curr = last; last = s;
+	}
+
+	/* core dynamic programming, part 3 */
+	/* rows whose band runs up to the last column; that column uses the
+	 * end-gap variant for insertions */
+	for (; j < len2; ++j) {
+		SET_INF(curr[j - b2]);
+		mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+		for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
+			set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+			set_I(s->I, q, last + i);
+			set_D(s->D, q, s - 1);
+		}
+		set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
+		set_end_I(s->I, q, last + i);
+		set_D(s->D, q, s - 1);
+		s = curr; curr = last; last = s;
+	}
+	/* last row */
+	if (j == len2) {
+		SET_INF(curr[j - b2]);
+		mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+		for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
+			set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+			set_I(s->I, q, last + i);
+			set_end_D(s->D, q, s - 1);
+		}
+		set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
+		set_end_I(s->I, q, last + i);
+		set_end_D(s->D, q, s - 1);
+		s = curr; curr = last; last = s;
+	}
+
+	/* after the final swap 'last' holds the row for j == len2 */
+	*_score = last[len1].M;
+	if (n_cigar) { /* backtrace */
+		path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
+		i = len1; j = len2;
+		q = dpcell[j] + i;
+		s = last + len1;
+		/* start from whichever state scores best in the final cell */
+		max = s->M; type = q->Mt; ctype = FROM_M;
+		if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
+		if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
+
+		p = path;
+		p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
+		++p;
+		/* ctype is the state of the step being taken, type the state of
+		 * the cell that step leads to */
+		do {
+			switch (ctype) {
+			case FROM_M: --i; --j; break;
+			case FROM_I: --j; break;
+			case FROM_D: --i; break;
+			}
+			q = dpcell[j] + i;
+			ctype = type;
+			switch (type) {
+			case FROM_M: type = q->Mt; break;
+			case FROM_I: type = q->It; break;
+			case FROM_D: type = q->Dt; break;
+			}
+			p->ctype = ctype; p->i = i; p->j = j;
+			++p;
+		} while (i || j);
+		/* the final entry written is the (0, 0) origin, hence the -1 */
+		cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
+		free(path);
+	}
+
+	/* free memory */
+	for (j = b2 + 1; j <= len2; ++j)
+		dpcell[j] += j - b2;
+	for (j = 0; j <= len2; ++j)
+		free(dpcell[j]);
+	free(dpcell);
+	free(curr); free(last);
+
+	return cigar;
+}
+
+/* Per-cell scores used by ka_global_score(): match (M), insertion (I) and
+ * deletion (D) state. */
+typedef struct {
+	int M;
+	int I;
+	int D;
+} score_aux_t;
+
+/* Score used for unreachable cells in ka_global_score(): -2^30, which
+ * leaves room to subtract penalties without wrapping a 32-bit int. */
+#define MINUS_INF -0x40000000
+
+// matrix: len2 rows and len1 columns
+// Score-only banded global alignment: returns the largest of the M/I/D
+// scores in the final cell (column len1 of row len2); no traceback is kept.
+// Two rows of len1 + 2 cells are allocated and swapped after every row.
+// ap supplies separate open/extend penalties: iio/iie and ido/ide are used
+// for interior cells, eio/eie for insertions in the first and last column,
+// edo/ede for deletions in the first and last row.
+// NOTE(review): there is no guard for len1 == 0 or len2 == 0 (the last-row
+// block would then read seq2[0], i.e. _seq2[-1]) and the calloc() results
+// are not checked -- confirm callers never pass empty sequences.
+int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap)
+{
+	
+// Fill cell _p from the previous row: _q0 points at the diagonal
+// predecessor, _q0 + 1 is the cell directly above, and _p - 1 the cell to
+// the left. _sc is the substitution score; _io/_ie and _do/_de are the
+// open/extend penalties for insertions and deletions.
+#define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) {					\
+		int t1, t2;														\
+		score_aux_t *_q;												\
+		_q = _q0;														\
+		_p->M = _q->M >= _q->I? _q->M : _q->I;							\
+		_p->M = _p->M >= _q->D? _p->M : _q->D;							\
+		_p->M += (_sc);													\
+		++_q;      t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; \
+		_q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; \
+	}
+
+	int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret;
+	const uint8_t *seq1, *seq2;
+	score_aux_t *curr, *last, *swap;
+	// effective band half-width: length difference plus the configured
+	// band, capped at max(len1, len2) + 1
+	bw = abs(len1 - len2) + ap->band_width;
+	i = len1 > len2? len1 : len2;
+	if (bw > i + 1) bw = i + 1;
+	// 1-based views of the two sequences
+	seq1 = _seq1 - 1; seq2 = _seq2 - 1;
+	curr = calloc(len1 + 2, sizeof(score_aux_t));
+	last = calloc(len1 + 2, sizeof(score_aux_t));
+	{ // the zero-th row
+		int x, end = len1;
+		score_aux_t *p;
+		j = 0;
+		x = j + bw; end = len1 < x? len1 : x; // band end
+		p = curr;
+		p->M = 0; p->I = p->D = MINUS_INF;
+		for (i = 1, p = &curr[1]; i <= end; ++i, ++p)
+			p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i);
+		// sentinel just past the band end
+		p->M = p->I = p->D = MINUS_INF;
+		swap = curr; curr = last; last = swap;
+	}
+	for (j = 1; j < len2; ++j) {
+		int x, beg = 0, end = len1, *scrow, col_end;
+		score_aux_t *p;
+		x = j - bw; beg =    0 > x?    0 : x; // band start
+		x = j + bw; end = len1 < x? len1 : x; // band end
+		if (beg == 0) { // from zero-th column
+			p = curr;
+			p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
+			++beg; // then beg = 1
+		}
+		scrow = scmat + seq2[j] * scmat_size;
+		// when the band reaches the last column, that column is handled
+		// separately below with the end-gap insertion penalties
+		if (end == len1) col_end = 1, --end;
+		else col_end = 0;
+		for (i = beg, p = &curr[beg]; i <= end; ++i, ++p)
+			__score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide);
+		if (col_end) {
+			__score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide);
+			++p;
+		}
+		// sentinel just past the last cell written in this row
+		p->M = p->I = p->D = MINUS_INF;
+//		for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
+		swap = curr; curr = last; last = swap;
+	}
+	{ // the last row
+		int x, beg = 0, *scrow;
+		score_aux_t *p;
+		j = len2;
+		x = j - bw; beg = 0 > x?    0 : x; // band start
+		if (beg == 0) { // from zero-th column
+			p = curr;
+			p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
+			++beg; // then beg = 1
+		}
+		scrow = scmat + seq2[j] * scmat_size;
+		for (i = beg, p = &curr[beg]; i < len1; ++i, ++p)
+			__score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede);
+		__score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede);
+//		for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
+	}
+	// no swap after the last row, so the result is still in curr
+	ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I;
+	ret = ret >= curr[len1].D? ret : curr[len1].D;
+	free(curr); free(last);
+	return ret;
+}
+
+#ifdef _MAIN
+/* Smoke test: score two 4-symbol sequences with ka_param2_qual and print
+ * the result. Alternative inputs kept for reference:
+ *   len1 = len2 = 35
+ *   seq1 = "\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1"
+ *   seq2 = "\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0"
+ *   ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0);
+ */
+int main(int argc, char *argv[])
+{
+	uint8_t *first = (uint8_t*)"\1\0\0\1";
+	uint8_t *second = (uint8_t*)"\1\0\1\0";
+	int first_len = 4;
+	int second_len = 4;
+	int score = ka_global_score(first, first_len, second, second_len, &ka_param2_qual);
+	printf("%d\n", score);
+	return 0;
+}
+#endif
diff --git a/knetfile.c b/knetfile.c
new file mode 100644
index 0000000..af09146
--- /dev/null
+++ b/knetfile.c
@@ -0,0 +1,632 @@
+/* The MIT License
+
+   Copyright (c) 2008 by Genome Research Ltd (GRL).
+                 2010 by Attractive Chaos <attractor at live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Probably I will not do socket programming in the next few years and
+   therefore I decide to heavily annotate this file, for Linux and
+   Windows as well.  -ac */
+
+#include <time.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+
+#ifndef _WIN32
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#endif
+
+#include "knetfile.h"
+
+/* In winsock.h, the type of a socket is SOCKET, which is: "typedef
+ * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
+ * integer -1. In knetfile.c, I use "int" for socket type
+ * throughout. This should be improved to avoid confusion.
+ *
+ * In Linux/Mac, recv() and read() do almost the same thing. You can see
+ * in the header file that netread() is simply an alias of read(). In
+ * Windows, however, they are different and using recv() is mandatory.
+ */
+
+/* Wait up to five seconds for descriptor fd to become readable (is_read
+ * non-zero) or writable (is_read zero). Returns select()'s result: a
+ * positive value when ready, 0 on time-out, and a negative value on error
+ * (after printing a diagnostic). */
+static int socket_wait(int fd, int is_read)
+{
+	fd_set ready;
+	struct timeval timeout;
+	int status;
+	timeout.tv_sec = 5; // 5 seconds time out
+	timeout.tv_usec = 0;
+	FD_ZERO(&ready);
+	FD_SET(fd, &ready);
+	// the same set is passed either as the read set or as the write set
+	status = is_read? select(fd+1, &ready, 0, 0, &timeout)
+					: select(fd+1, 0, &ready, 0, &timeout);
+#ifndef _WIN32
+	if (status == -1) perror("select");
+#else
+	if (status == SOCKET_ERROR)
+		fprintf(stderr, "select: %d\n", WSAGetLastError());
+	else if (status == 0)
+		fprintf(stderr, "select time-out\n");
+#endif
+	return status;
+}
+
+#ifndef _WIN32
+/* This function does not work with Windows due to the lack of
+ * getaddrinfo() in winsock. It is adapted from an example in "Beej's
+ * Guide to Network Programming" (http://beej.us/guide/bgnet/).
+ *
+ * Resolve host/port and open a TCP connection to the first address
+ * returned. Returns the connected descriptor, or -1 after printing a
+ * diagnostic; no descriptor is left open on failure. */
+static int socket_connect(const char *host, const char *port)
+{
+	int on = 1, fd = -1, err;
+	const char *failed = 0; /* name of the call that failed, if any */
+	struct linger lng = { 0, 0 };
+	struct addrinfo hints, *res = 0;
+	memset(&hints, 0, sizeof(struct addrinfo));
+	hints.ai_family = AF_UNSPEC;
+	hints.ai_socktype = SOCK_STREAM;
+	/* In Unix/Mac, getaddrinfo() is the most convenient way to get
+	 * server information. It reports failure through its return value
+	 * rather than errno, and leaves nothing to free in that case. */
+	if ((err = getaddrinfo(host, port, &hints, &res)) != 0) {
+		fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(err));
+		return -1;
+	}
+	/* The two setsockopt() calls are used by ftplib
+	 * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they are
+	 * necessary. */
+	if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) failed = "socket";
+	else if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) failed = "setsockopt";
+	else if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) failed = "setsockopt";
+	else if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) failed = "connect";
+	if (failed) {
+		perror(failed); /* report before close() can disturb errno */
+		if (fd != -1) close(fd);
+		fd = -1;
+	}
+	freeaddrinfo(res);
+	return fd;
+}
+#else
+/* MinGW's printf has problem with "%lld" */
+/* Write the decimal representation of x into buf and return buf. buf must
+ * have room for up to 21 characters plus the terminating NUL. */
+char *int64tostr(char *buf, int64_t x)
+{
+	/* work on the magnitude so that negative values, including the most
+	 * negative one, still yield valid digits */
+	unsigned long long mag = x < 0? 0ULL - (unsigned long long)x : (unsigned long long)x;
+	int first = 0, last, len = 0;
+	if (x < 0) {
+		buf[len++] = '-';
+		first = 1; /* the sign stays in place when the digits are reversed */
+	}
+	do {
+		buf[len++] = (char)('0' + (int)(mag % 10));
+		mag /= 10;
+	} while (mag);
+	buf[len] = 0;
+	/* digits were produced least-significant first; reverse them in place */
+	for (last = len - 1; first < last; ++first, --last) {
+		char c = buf[first]; buf[first] = buf[last]; buf[last] = c;
+	}
+	return buf;
+}
+
+/* Parse the run of decimal digits at the start of buf and return its
+ * value. Parsing stops at the first non-digit, so trailing text such as
+ * "\r\n" is ignored; an input with no leading digit yields 0. */
+int64_t strtoint64(const char *buf)
+{
+	int64_t x;
+	for (x = 0; *buf >= '0' && *buf <= '9'; ++buf)
+		x = x * 10 + (*buf - '0');
+	return x;
+}
+/* Windows only: start Winsock, requesting version 2.2. Returns the result
+ * of WSAStartup(). */
+int knet_win32_init()
+{
+	WSADATA info;
+	int status = WSAStartup(MAKEWORD(2, 2), &info);
+	return status;
+}
+/* Windows only: counterpart of knet_win32_init(); calls WSACleanup() and
+ * discards its result. */
+void knet_win32_destroy()
+{
+	(void)WSACleanup();
+}
+/* A slightly modified version of the following function also works on
+ * Mac (and presumably Linux). However, this function is not stable on
+ * my Mac. It sometimes works fine but sometimes does not. Therefore for
+ * non-Windows OS, I do not use this one.
+ *
+ * Open a TCP connection to host/port. Returns the connected socket, or -1
+ * after printing a diagnostic; the socket is closed again on failure. */
+static SOCKET socket_connect(const char *host, const char *port)
+{
+	/* print the failing call with the Winsock error code, release the
+	 * socket if one was already created, and bail out */
+#define __err_connect(func)										\
+	do {														\
+		fprintf(stderr, "%s: %d\n", func, WSAGetLastError());	\
+		if (fd != INVALID_SOCKET) closesocket(fd);				\
+		return -1;												\
+	} while (0)
+
+	int on = 1;
+	SOCKET fd = INVALID_SOCKET;
+	struct linger lng = { 0, 0 };
+	struct sockaddr_in server;
+	struct hostent *hp = 0;
+	// open socket
+	if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
+	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
+	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+	// get host info
+	if (isalpha(host[0])) hp = gethostbyname(host);
+	else {
+		struct in_addr addr;
+		addr.s_addr = inet_addr(host);
+		hp = gethostbyaddr((char*)&addr, 4, AF_INET);
+	}
+	if (hp == 0) __err_connect("gethost");
+	// connect
+	memset(&server, 0, sizeof(server)); // do not pass uninitialised padding to connect()
+	server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
+	server.sin_family= AF_INET;
+	server.sin_port = htons(atoi(port));
+	if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
+	// freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
+	return fd;
+}
+#endif
+
+/* Read up to len bytes from socket fd into buf, calling netread()
+ * repeatedly because a single call may deliver less than requested.
+ * Returns the number of bytes actually stored, which is smaller than len
+ * when the socket times out, reports an error, or reaches end-of-file. */
+static off_t my_netread(int fd, void *buf, off_t len)
+{
+	off_t rest = len, curr, l = 0;
+	char *dst = (char*)buf; /* pointer arithmetic on void* is not standard C */
+	while (rest > 0) {
+		if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
+		curr = netread(fd, dst + l, rest);
+		/* According to the glibc manual, section 13.2, a zero returned
+		 * value indicates end-of-file (EOF), which should mean that
+		 * read() will not return zero if EOF has not been met but data
+		 * are not immediately available. A negative value is an error
+		 * and must not be added to the byte count. */
+		if (curr <= 0) break;
+		l += curr; rest -= curr;
+	}
+	return l;
+}
+
+/*************************
+ * FTP specific routines *
+ *************************/
+
+/* Read one complete reply from the FTP control connection into
+ * ftp->response (grown as needed) and return its numeric reply code.
+ * Lines are read until one starts with three digits not followed by '-';
+ * earlier lines are discarded. The trailing "\r\n" is stripped. Returns 0
+ * if the socket does not become readable and -1 if the reply is too short
+ * or memory runs out. */
+static int kftp_get_response(knetFile *ftp)
+{
+#ifndef _WIN32
+	unsigned char c;
+#else
+	char c;
+#endif
+	int n = 0;
+	char *p;
+	if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
+	/* stop on end-of-file (0) as well as on a read error (negative) */
+	while (netread(ftp->ctrl_fd, &c, 1) == 1) { // FIXME: this is *VERY BAD* for unbuffered I/O
+		//fputc(c, stderr);
+		if (n >= ftp->max_response) {
+			int new_max = ftp->max_response? ftp->max_response<<1 : 256;
+			void *grown = realloc(ftp->response, new_max);
+			if (grown == 0) return -1; // the old buffer stays valid and owned by ftp
+			ftp->response = grown;
+			ftp->max_response = new_max;
+		}
+		ftp->response[n++] = c;
+		if (c == '\n') {
+			/* isdigit() requires a value representable as unsigned char */
+			if (n >= 4 && isdigit((unsigned char)ftp->response[0]) && isdigit((unsigned char)ftp->response[1])
+				&& isdigit((unsigned char)ftp->response[2]) && ftp->response[3] != '-') break;
+			n = 0;
+			continue;
+		}
+	}
+	if (n < 2) return -1;
+	ftp->response[n-2] = 0;
+	return strtol(ftp->response, &p, 0);
+}
+
+/* Write cmd to the FTP control connection. Returns -1 if the socket does
+ * not become writable; otherwise the reply code from kftp_get_response()
+ * when is_get is non-zero, or 0 when no reply is requested. */
+static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
+{
+	if (socket_wait(ftp->ctrl_fd, 0) <= 0)
+		return -1; // socket is not ready for writing
+	netwrite(ftp->ctrl_fd, cmd, strlen(cmd));
+	if (!is_get)
+		return 0;
+	return kftp_get_response(ftp);
+}
+
+/* Send PASV and parse the "(h1,h2,h3,h4,p1,p2)" part of the reply into
+ * ftp->pasv_ip and ftp->pasv_port. Returns 0 on success and -1 if the
+ * server does not answer with reply code 227 or the address cannot be
+ * parsed; the stored address is left untouched in that case. */
+static int kftp_pasv_prep(knetFile *ftp)
+{
+	char *p;
+	int v[6];
+	/* 227 is the only positive reply to PASV; anything else (including a
+	 * failed send or read) leaves no address to parse */
+	if (kftp_send_cmd(ftp, "PASV\r\n", 1) != 227) return -1;
+	for (p = ftp->response; *p && *p != '('; ++p);
+	if (*p != '(') return -1;
+	++p;
+	/* all six numbers must be present, otherwise v[] is partly uninitialised */
+	if (sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]) != 6) return -1;
+	memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
+	ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
+	return 0;
+}
+
+
+/* Open the FTP data connection to the address stored by kftp_pasv_prep()
+ * and keep it in ftp->fd. Returns 0 on success, -1 on failure. */
+static int kftp_pasv_connect(knetFile *ftp)
+{
+	/* port is sized for any int: the value comes from the server's reply,
+	 * so it is not guaranteed to be a 16-bit number */
+	char host[80], port[16];
+	if (ftp->pasv_port == 0) {
+		fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
+		return -1;
+	}
+	sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
+	sprintf(port, "%d", ftp->pasv_port);
+	ftp->fd = socket_connect(host, port);
+	if (ftp->fd == -1) return -1;
+	return 0;
+}
+
+/* Open the FTP control connection to ftp->host:ftp->port, read the reply
+ * the server sends on connect, then log in anonymously and select binary
+ * transfer type. Returns -1 if the connection cannot be opened, else 0;
+ * the replies to the individual commands are not inspected. */
+int kftp_connect(knetFile *ftp)
+{
+	static const char *const login[] = {
+		"USER anonymous\r\n",
+		"PASS kftp@\r\n",
+		"TYPE I\r\n"
+	};
+	size_t k;
+	ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
+	if (ftp->ctrl_fd == -1)
+		return -1;
+	kftp_get_response(ftp);
+	for (k = 0; k < sizeof(login) / sizeof(login[0]); ++k)
+		kftp_send_cmd(ftp, login[k], 1);
+	return 0;
+}
+
+/* Close whatever control and data connections are open and establish a
+ * fresh control connection. Returns the result of kftp_connect(). */
+int kftp_reconnect(knetFile *ftp)
+{
+	if (ftp->ctrl_fd != -1) {
+		netclose(ftp->ctrl_fd);
+		ftp->ctrl_fd = -1;
+	}
+	/* same guard as for ctrl_fd: never hand -1 to netclose() */
+	if (ftp->fd != -1) {
+		netclose(ftp->fd);
+		ftp->fd = -1;
+	}
+	return kftp_connect(ftp);
+}
+
+// Parse "ftp://host/path" into a new knetFile: initializes ->type, ->host,
+// ->port, ->retr and ->size_cmd, and marks both descriptors as closed (-1).
+// A 'c' in mode sets ->no_reconnect. Returns 0 if fn is not an ftp:// URL
+// with a path, or if memory cannot be allocated. No connection is made.
+knetFile *kftp_parse_url(const char *fn, const char *mode)
+{
+	knetFile *fp;
+	char *p;
+	int l;
+	if (strstr(fn, "ftp://") != fn) return 0;
+	for (p = (char*)fn + 6; *p && *p != '/'; ++p);
+	if (*p != '/') return 0;
+	l = p - fn - 6;
+	fp = calloc(1, sizeof(knetFile));
+	if (fp == 0) return 0;
+	fp->type = KNF_TYPE_FTP;
+	// calloc() leaves ctrl_fd at 0, which knet_close() would treat as an
+	// open descriptor (stdin); mark both descriptors as closed
+	fp->ctrl_fd = fp->fd = -1;
+	/* the Linux/Mac version of socket_connect() also recognizes a port
+	 * like "ftp", but the Windows version does not. */
+	fp->port = strdup("21");
+	fp->host = calloc(l + 1, 1);
+	// "RETR " / "SIZE " (5) + path + "\r\n" (2) + NUL (1) = strlen(p) + 8
+	fp->retr = calloc(strlen(p) + 8, 1);
+	fp->size_cmd = calloc(strlen(p) + 8, 1);
+	if (fp->port == 0 || fp->host == 0 || fp->retr == 0 || fp->size_cmd == 0) {
+		free(fp->port); free(fp->host); free(fp->retr); free(fp->size_cmd);
+		free(fp);
+		return 0;
+	}
+	if (strchr(mode, 'c')) fp->no_reconnect = 1;
+	strncpy(fp->host, fn + 6, l); // host was zero-filled, so it stays terminated
+	sprintf(fp->retr, "RETR %s\r\n", p);
+	sprintf(fp->size_cmd, "SIZE %s\r\n", p);
+	fp->seek_offset = 0;
+	return fp;
+}
+// Open the FTP data connection positioned at fp->offset: enters passive
+// mode, queries the file size (stored in fp->file_size), sends REST and
+// RETR, and connects the data socket into fp->fd. Returns 0 and sets
+// fp->is_ready on success; returns -1 with fp->fd == -1 on failure.
+int kftp_connect_file(knetFile *fp)
+{
+	int ret;
+	long long file_size;
+	if (fp->fd != -1) {
+		netclose(fp->fd);
+		fp->fd = -1; // do not keep a closed descriptor around on an early return
+		if (fp->no_reconnect) kftp_get_response(fp);
+	}
+	if (kftp_pasv_prep(fp) != 0) {
+		// without a passive address there is nothing to connect to
+		fprintf(stderr, "[kftp_connect_file] fail to enter passive mode.\n");
+		return -1;
+	}
+	kftp_send_cmd(fp, fp->size_cmd, 1);
+#ifndef _WIN32
+	if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 )
+	{
+		fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
+		return -1;
+	}
+#else
+	{
+		// skip the reply code, then everything up to the first digit; stop
+		// at the end of the string if the reply carries no size
+		const char *p = fp->response;
+		while (*p && *p != ' ') ++p;
+		while (*p && (*p < '0' || *p > '9')) ++p;
+		if (*p == 0) {
+			fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
+			return -1;
+		}
+		file_size = strtoint64(p);
+	}
+#endif
+	fp->file_size = file_size;
+	if (fp->offset>=0) {
+		char tmp[32];
+#ifndef _WIN32
+		sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
+#else
+		strcpy(tmp, "REST ");
+		int64tostr(tmp + 5, fp->offset);
+		strcat(tmp, "\r\n");
+#endif
+		kftp_send_cmd(fp, tmp, 1);
+	}
+	kftp_send_cmd(fp, fp->retr, 0);
+	kftp_pasv_connect(fp);
+	// the reply to RETR is read even if the data connection failed, so that
+	// the control connection stays in step with the server
+	ret = kftp_get_response(fp);
+	if (ret != 150 || fp->fd == -1) {
+		fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
+		if (fp->fd != -1) netclose(fp->fd);
+		fp->fd = -1;
+		return -1;
+	}
+	fp->is_ready = 1;
+	return 0;
+}
+
+
+/**************************
+ * HTTP specific routines *
+ **************************/
+
+// Parse "http://host[:port][/path]" into a new knetFile. ->http_host is the
+// host name from the URL; ->host/->port name the machine to connect to,
+// which is the http_proxy environment variable's host when that is set
+// (->path is then the full URL). No connection is made. Returns 0 if fn is
+// not an http:// URL or memory cannot be allocated.
+knetFile *khttp_parse_url(const char *fn, const char *mode)
+{
+	knetFile *fp;
+	char *p, *proxy, *q;
+	int l;
+	if (strstr(fn, "http://") != fn) return 0;
+	// set ->http_host
+	for (p = (char*)fn + 7; *p && *p != '/'; ++p);
+	l = p - fn - 7;
+	fp = calloc(1, sizeof(knetFile));
+	if (fp == 0) return 0;
+	fp->http_host = calloc(l + 1, 1);
+	if (fp->http_host == 0) {
+		free(fp);
+		return 0;
+	}
+	strncpy(fp->http_host, fn + 7, l);
+	fp->http_host[l] = 0;
+	for (q = fp->http_host; *q && *q != ':'; ++q);
+	if (*q == ':') *q++ = 0;
+	// get http_proxy
+	proxy = getenv("http_proxy");
+	// set ->host, ->port and ->path
+	if (proxy == 0) {
+		fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
+		fp->port = strdup(*q? q : "80");
+		fp->path = strdup(*p? p : "/");
+	} else {
+		fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
+		if (fp->host) {
+			// proxies are commonly given as "http://host:port/"; drop the
+			// trailing '/' (and any path) so it does not end up in the port
+			for (q = fp->host; *q && *q != '/'; ++q);
+			*q = 0;
+			for (q = fp->host; *q && *q != ':'; ++q);
+			if (*q == ':') *q++ = 0;
+			fp->port = strdup(*q? q : "80");
+		}
+		fp->path = strdup(fn);
+	}
+	if (fp->host == 0 || fp->port == 0 || fp->path == 0) {
+		free(fp->host); free(fp->port); free(fp->path);
+		free(fp->http_host);
+		free(fp);
+		return 0;
+	}
+	fp->type = KNF_TYPE_HTTP;
+	fp->ctrl_fd = fp->fd = -1;
+	fp->seek_offset = 0;
+	return fp;
+}
+
+// (Re)open the HTTP connection and request the file from fp->offset
+// onwards. Reads and discards the response header; if the server ignores
+// the Range header (code 200) the first fp->offset bytes of the body are
+// skipped. Returns 0 and sets fp->is_ready on success; returns -1 with
+// fp->fd == -1 on failure.
+int khttp_connect_file(knetFile *fp)
+{
+	int ret, l = 0;
+	char *buf, *p;
+	if (fp->fd != -1) netclose(fp->fd);
+	fp->fd = socket_connect(fp->host, fp->port);
+	if (fp->fd == -1) return -1;
+	// the request is path + host + well under 128 bytes of fixed text and
+	// digits; refuse anything that would not fit the 64KB buffer
+	if (strlen(fp->path) + strlen(fp->http_host) + 128 > 0x10000) {
+		fprintf(stderr, "[khttp_connect_file] URL is too long.\n");
+		netclose(fp->fd);
+		fp->fd = -1;
+		return -1;
+	}
+	buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
+	if (buf == 0) {
+		netclose(fp->fd);
+		fp->fd = -1;
+		return -1;
+	}
+	l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
+	l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
+	l += sprintf(buf + l, "\r\n");
+	netwrite(fp->fd, buf, l);
+	l = 0;
+	// read HTTP header; FIXME: bad efficiency
+	// stops on EOF or a read error, and before the header can overrun buf
+	while (l < 0x10000 - 1 && netread(fp->fd, buf + l, 1) == 1) {
+		if (buf[l] == '\n' && l >= 3)
+			if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
+		++l;
+	}
+	buf[l] = 0;
+	if (l < 14) { // premature end of header
+		free(buf);
+		netclose(fp->fd);
+		fp->fd = -1;
+		return -1;
+	}
+	ret = strtol(buf + 8, &p, 0); // HTTP return code
+	if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
+		off_t rest = fp->offset;
+		while (rest > 0) {
+			off_t want = rest < 0x10000? rest : 0x10000;
+			off_t got = my_netread(fp->fd, buf, want);
+			if (got <= 0) break; // EOF or time-out: nothing more to skip
+			rest -= got;
+		}
+	} else if (ret != 206 && ret != 200) {
+		free(buf);
+		fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
+		netclose(fp->fd);
+		fp->fd = -1;
+		return -1;
+	}
+	free(buf);
+	fp->is_ready = 1;
+	return 0;
+}
+
+/********************
+ * Generic routines *
+ ********************/
+
+/* Open fn for reading. "ftp://" and "http://" URLs are opened over the
+ * network; anything else is treated as a local file name. Only modes that
+ * start with 'r' are accepted. Returns a handle to be released with
+ * knet_close(), or 0 on failure. */
+knetFile *knet_open(const char *fn, const char *mode)
+{
+	knetFile *fp = 0;
+	if (mode[0] != 'r') {
+		fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
+		return 0;
+	}
+	if (strstr(fn, "ftp://") == fn) {
+		fp = kftp_parse_url(fn, mode);
+		if (fp == 0) return 0;
+		if (kftp_connect(fp) == -1) {
+			knet_close(fp);
+			return 0;
+		}
+		kftp_connect_file(fp);
+	} else if (strstr(fn, "http://") == fn) {
+		fp = khttp_parse_url(fn, mode);
+		if (fp == 0) return 0;
+		khttp_connect_file(fp);
+	} else { // local file
+#ifdef _WIN32
+		/* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
+		 * be undefined on some systems, although it is defined on my
+		 * Mac and the Linux I have tested on. */
+		int fd = open(fn, O_RDONLY | O_BINARY);
+#else		
+		int fd = open(fn, O_RDONLY);
+#endif
+		if (fd == -1) {
+			perror("open");
+			return 0;
+		}
+		fp = (knetFile*)calloc(1, sizeof(knetFile));
+		if (fp == 0) { // do not leak the descriptor that was just opened
+			close(fd);
+			return 0;
+		}
+		fp->type = KNF_TYPE_LOCAL;
+		fp->fd = fd;
+		fp->ctrl_fd = -1;
+	}
+	/* a failed *_connect_file() leaves fd at -1; report that as failure */
+	if (fp && fp->fd == -1) {
+		knet_close(fp);
+		return 0;
+	}
+	return fp;
+}
+
+/* Wrap an already-open local descriptor in a knetFile. mode is not used.
+ * Returns 0 if memory cannot be allocated. */
+knetFile *knet_dopen(int fd, const char *mode)
+{
+	knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
+	if (fp == 0) return 0;
+	fp->type = KNF_TYPE_LOCAL;
+	fp->fd = fd;
+	/* calloc() leaves ctrl_fd at 0; without this knet_close() would close
+	 * descriptor 0 (stdin) */
+	fp->ctrl_fd = -1;
+	return fp;
+}
+
+/* Read up to len bytes from fp into buf and advance fp->offset by the
+ * number of bytes read. Network handles that are not ready (for example
+ * after knet_seek()) are reconnected first. Returns the number of bytes
+ * read, 0 at end-of-file or when no connection is available, and -1 on a
+ * local read error. */
+off_t knet_read(knetFile *fp, void *buf, off_t len)
+{
+	off_t l = 0;
+	char *dst = (char*)buf; /* pointer arithmetic on void* is not standard C */
+	if (fp->fd == -1) return 0;
+	if (fp->type == KNF_TYPE_FTP) {
+		if (fp->is_ready == 0) {
+			if (!fp->no_reconnect) kftp_reconnect(fp);
+			kftp_connect_file(fp);
+		}
+	} else if (fp->type == KNF_TYPE_HTTP) {
+		if (fp->is_ready == 0)
+			khttp_connect_file(fp);
+	}
+	/* a failed reconnect leaves fd at -1, which must not reach select() */
+	if (fp->fd == -1) return 0;
+	if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
+		off_t rest = len, curr;
+		while (rest) {
+			do {
+				curr = read(fp->fd, dst + l, rest);
+			} while (curr < 0 && EINTR == errno);
+			if (curr < 0) return -1;
+			if (curr == 0) break;
+			l += curr; rest -= curr;
+		}
+	} else l = my_netread(fp->fd, dst, len);
+	fp->offset += l;
+	return l;
+}
+
+/* Move the read position of fp. Local files are repositioned at once with
+ * lseek(); for FTP and HTTP only fp->offset is updated and the connection
+ * is marked not ready, so the next read reconnects at the new offset.
+ * Returns 0 on success and -1 on failure (note: not the new offset). */
+off_t knet_seek(knetFile *fp, int64_t off, int whence)
+{
+	/* already there: nothing to do */
+	if (whence == SEEK_SET && off == fp->offset)
+		return 0;
+
+	if (fp->type == KNF_TYPE_LOCAL) {
+		/* Be aware that lseek() returns the offset after seeking,
+		 * while fseek() returns zero on success. */
+		off_t pos = lseek(fp->fd, off, whence);
+		/* Stay silent on failure: seeking legitimately fails when the
+		 * file is streamed. */
+		if (pos == -1)
+			return -1;
+		fp->offset = pos;
+		return 0;
+	}
+
+	if (fp->type == KNF_TYPE_FTP) {
+		switch (whence) {
+		case SEEK_CUR: fp->offset += off; break;
+		case SEEK_SET: fp->offset = off; break;
+		case SEEK_END: fp->offset = fp->file_size+off; break;
+		}
+		fp->is_ready = 0;
+		return 0;
+	}
+
+	if (fp->type == KNF_TYPE_HTTP) {
+		if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
+			fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
+			errno = ESPIPE;
+			return -1;
+		}
+		switch (whence) {
+		case SEEK_CUR: fp->offset += off; break;
+		case SEEK_SET: fp->offset = off; break;
+		}
+		fp->is_ready = 0;
+		return 0;
+	}
+
+	/* unknown handle type */
+	errno = EINVAL;
+	fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
+	return -1;
+}
+
+/* Close every descriptor still open on fp and release the handle together
+ * with all strings it owns. A null fp is accepted. Always returns 0. */
+int knet_close(knetFile *fp)
+{
+	if (fp == 0) return 0;
+	if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
+	if (fp->fd != -1) {
+		/* On Linux/Mac, netclose() is an alias of close(), but on
+		 * Windows, it is an alias of closesocket(). */
+		if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
+		else netclose(fp->fd);
+	}
+	free(fp->host); free(fp->port);
+	free(fp->response); free(fp->retr); // FTP specific
+	free(fp->size_cmd); // FTP specific; allocated in kftp_parse_url()
+	free(fp->path); free(fp->http_host); // HTTP specific
+	free(fp);
+	return 0;
+}
+
+#ifdef KNETFILE_MAIN
+/* Manual test driver: 'type' selects which source is opened. The data read
+ * is written to stdout. Returns 1 if the buffer cannot be allocated, the
+ * type is unknown, or the file cannot be opened. */
+int main(void)
+{
+	const int buf_size = 0x100000;
+	const char *url = 0;
+	char *buf;
+	knetFile *fp;
+	int type = 4, l = 0;
+#ifdef _WIN32
+	knet_win32_init();
+#endif
+	if (type == 0) url = "knetfile.c";
+	else if (type == 1) // NCBI FTP, large file
+		url = "ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam";
+	else if (type == 2) url = "ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml";
+	else if (type == 3) url = "http://www.sanger.ac.uk/Users/lh3/index.shtml";
+	else if (type == 4) url = "http://www.sanger.ac.uk/Users/lh3/ex1.bam";
+	if (url == 0) return 1;
+	buf = calloc(buf_size, 1);
+	if (buf == 0) return 1;
+	fp = knet_open(url, "r");
+	if (fp == 0) { // every call below dereferences fp
+		free(buf);
+		return 1;
+	}
+	if (type == 1) {
+		knet_seek(fp, 2500000000ll, SEEK_SET);
+		l = knet_read(fp, buf, 255);
+	} else if (type == 4) {
+		knet_read(fp, buf, 10000);
+		knet_seek(fp, 20000, SEEK_SET);
+		knet_seek(fp, 10000, SEEK_SET);
+		// never ask for more than what is left of the buffer
+		l = knet_read(fp, buf+10000, buf_size - 10000) + 10000;
+	} else knet_seek(fp, 1000, SEEK_SET);
+	if (type != 4 && type != 1) {
+		knet_read(fp, buf, 255);
+		buf[255] = 0;
+		printf("%s\n", buf);
+	} else write(fileno(stdout), buf, l);
+	knet_close(fp);
+	free(buf);
+	return 0;
+}
+#endif
diff --git a/kprobaln.c b/kprobaln.c
new file mode 100644
index 0000000..04e526a
--- /dev/null
+++ b/kprobaln.c
@@ -0,0 +1,280 @@
+/* The MIT License
+
+   Copyright (c) 2003-2006, 2008-2010, by Heng Li <lh3lh3 at live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+#include "kprobaln.h"
+
+/*****************************************
+ * Probabilistic banded glocal alignment *
+ *****************************************/
+
+/* Emission factor applied to the insertion state in kpa_glocal(). */
+#define EI .25
+/* Factor multiplied with the base error probability when ref and query bases differ. */
+#define EM .33333333333
+
+/* Lookup table filled lazily by kpa_glocal(): entry i holds pow(10, -i/10.). */
+static float g_qual2prob[256];
+
+/* Map query row i and reference column k to the offset u of the first of the
+ * three per-cell values in a banded row, where b is the band width. */
+#define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; }
+
+/* Parameter presets. NOTE(review): kpa_par_t is declared in kprobaln.h; the
+ * fields appear to be {d, e, bw} as read by kpa_glocal() -- confirm there. */
+kpa_par_t kpa_par_def = { 0.001, 0.1, 10 };
+kpa_par_t kpa_par_alt = { 0.0001, 0.01, 10 };
+
+/*
+  The topology of the profile HMM:
+
+           /\             /\        /\             /\
+           I[1]           I[k-1]    I[k]           I[L]
+            ^   \      \    ^    \   ^   \      \   ^
+            |    \      \   |     \  |    \      \  |
+    M[0]   M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L]   M[L+1]
+                \      \/        \/      \/      /
+                 \     /\        /\      /\     /
+                       -> D[k-1] -> D[k] ->
+
+   M[0] points to every {M,I}[k] and every {M,I}[k] points to M[L+1].
+
+   On input, _ref is the reference sequence and _query is the query
+   sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an
+   ambiguous residue. iqual is the base quality. c sets the gap open
+   probability, gap extension probability and band width.
+
+   On output, state and q are arrays of length l_query. The higher 30
+   bits give the reference position the query base is matched to and the
+   lower two bits can be 0 (an alignment match) or 1 (an
+   insertion). q[i] gives the phred scaled posterior probability of
+   state[i] being wrong.
+ */
+/*
+ * Run the forward pass (and optionally the backward pass plus per-base MAP
+ * decoding) of the banded profile HMM.
+ *
+ * Returns Pr, the rounded value of -4.343*log() accumulated over the scaling
+ * factors s[] (including a l_ref*l_query term), or 0 when either sequence
+ * length is not positive. The backward pass and the state[]/q[] outputs are
+ * only produced when both `state` and `q` are non-NULL.
+ *
+ * NOTE(review): none of the calloc() results below are checked.
+ */
+int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual,
+			   const kpa_par_t *c, int *state, uint8_t *q)
+{
+	double **f, **b = 0, *s, m[9], sI, sM, bI, bM, pb;
+	float *qual, *_qual;
+	const uint8_t *ref, *query;
+	int bw, bw2, i, k, is_diff = 0, is_backward = 1, Pr;
+
+    if ( l_ref<=0 || l_query<=0 ) return 0; // FIXME: this may not be an ideal fix, just prevents segfault
+
+	/*** initialization ***/
+	is_backward = state && q? 1 : 0; // backward pass needs both output arrays
+	ref = _ref - 1; query = _query - 1; // change to 1-based coordinate
+	// band width: c->bw capped by the longer sequence, but never less than the length difference
+	bw = l_ref > l_query? l_ref : l_query;
+	if (bw > c->bw) bw = c->bw;
+	if (bw < abs(l_ref - l_query)) bw = abs(l_ref - l_query);
+	bw2 = bw * 2 + 1;
+	// allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
+	f = calloc(l_query+1, sizeof(void*));
+	if (is_backward) b = calloc(l_query+1, sizeof(void*));
+	for (i = 0; i <= l_query; ++i) {    // FIXME: this will lead in segfault for l_query==0
+		f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs
+		if (is_backward) b[i] = calloc(bw2 * 3 + 6, sizeof(double));
+	}
+	s = calloc(l_query+2, sizeof(double)); // s[] is the scaling factor to avoid underflow
+	// initialize qual
+	_qual = calloc(l_query, sizeof(float));
+	if (g_qual2prob[0] == 0) // table not filled yet (entry 0 becomes pow(10, 0) == 1)
+		for (i = 0; i < 256; ++i)
+			g_qual2prob[i] = pow(10, -i/10.);
+	for (i = 0; i < l_query; ++i) _qual[i] = g_qual2prob[iqual? iqual[i] : 30]; // quality 30 when iqual is NULL
+	qual = _qual - 1; // 1-based, like ref and query
+	// initialize transition probability
+	// m[] is a row-major 3x3 matrix; rows/columns 0,1,2 are the three per-cell values at u+0, u+1, u+2
+	sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof
+	m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM);
+	m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.;
+	m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e;
+	bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1
+	/*** forward ***/
+	// f[0]
+	set_u(k, bw, 0, 0);
+	f[0][k] = s[0] = 1.;
+	{ // f[1]
+		double *fi = f[1], sum;
+		int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end;
+		for (k = beg, sum = 0.; k <= end; ++k) {
+			int u;
+			// emission: 1 for an ambiguous base (>3), 1-err on a match, err*EM on a mismatch
+			double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
+			set_u(u, bw, 1, k);
+			fi[u+0] = e * bM; fi[u+1] = EI * bI;
+			sum += fi[u] + fi[u+1];
+		}
+		// rescale
+		s[1] = sum;
+		set_u(_beg, bw, 1, beg); set_u(_end, bw, 1, end); _end += 2;
+		for (k = _beg; k <= _end; ++k) fi[k] /= sum;
+	}
+	// f[2..l_query]
+	for (i = 2; i <= l_query; ++i) {
+		double *fi = f[i], *fi1 = f[i-1], sum, qli = qual[i];
+		int beg = 1, end = l_ref, x, _beg, _end;
+		uint8_t qyi = query[i];
+		x = i - bw; beg = beg > x? beg : x; // band start
+		x = i + bw; end = end < x? end : x; // band end
+		for (k = beg, sum = 0.; k <= end; ++k) {
+			int u, v11, v01, v10;
+			double e;
+			e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli * EM;
+			// v11: cell (i-1,k-1); v10: cell (i-1,k); v01: cell (i,k-1)
+			set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1);
+			fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]);
+			fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
+			fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
+			sum += fi[u] + fi[u+1] + fi[u+2];
+//			fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG
+		}
+		// rescale
+		s[i] = sum;
+		set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
+		for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum;
+	}
+	{ // f[l_query+1]
+		double sum;
+		for (k = 1, sum = 0.; k <= l_ref; ++k) {
+			int u;
+			set_u(u, bw, l_query, k);
+			if (u < 3 || u >= bw2*3+3) continue; // column k lies outside the band of the last row
+		    sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI;
+		}
+		s[l_query+1] = sum; // the last scaling factor
+	}
+	{ // compute likelihood
+		double p = 1., Pr1 = 0.;
+		for (i = 0; i <= l_query + 1; ++i) {
+			p *= s[i];
+			if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.; // flush the running product before it underflows
+		}
+		Pr1 += -4.343 * log(p * l_ref * l_query);
+		Pr = (int)(Pr1 + .499);
+		if (!is_backward) { // skip backward and MAP
+			for (i = 0; i <= l_query; ++i) free(f[i]);
+			free(f); free(s); free(_qual);
+			return Pr;
+		}
+	}
+	/*** backward ***/
+	// b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
+	for (k = 1; k <= l_ref; ++k) {
+		int u;
+		double *bi = b[l_query];
+		set_u(u, bw, l_query, k);
+		if (u < 3 || u >= bw2*3+3) continue;
+		bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1];
+	}
+	// b[l_query-1..1]
+	for (i = l_query - 1; i >= 1; --i) {
+		int beg = 1, end = l_ref, x, _beg, _end;
+		double *bi = b[i], *bi1 = b[i+1], y = (i > 1), qli1 = qual[i+1]; // y zeroes the u+2 value in row 1
+		uint8_t qyi1 = query[i+1];
+		x = i - bw; beg = beg > x? beg : x;
+		x = i + bw; end = end < x? end : x;
+		for (k = end; k >= beg; --k) {
+			int u, v11, v01, v10;
+			double e;
+			set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1);
+			e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 * EM) * bi1[v11];
+			bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been folded into e.
+			bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1];
+			bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y;
+//			fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG
+		}
+		// rescale
+		set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
+		for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y;
+	}
+	{ // b[0]
+		int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
+		double sum = 0.;
+		for (k = end; k >= beg; --k) {
+			int u;
+			double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
+			set_u(u, bw, 1, k);
+			if (u < 3 || u >= bw2*3+3) continue;
+		    sum += e * b[1][u+0] * bM + EI * b[1][u+1] * bI;
+		}
+		set_u(k, bw, 0, 0);
+		pb = b[0][k] = sum / s[0]; // if everything works as is expected, pb == 1.0
+	}
+	is_diff = fabs(pb - 1.) > 1e-7? 1 : 0; // NOTE(review): is_diff is computed but not used afterwards
+	/*** MAP ***/
+	for (i = 1; i <= l_query; ++i) {
+		double sum = 0., *fi = f[i], *bi = b[i], max = 0.;
+		int beg = 1, end = l_ref, x, max_k = -1;
+		x = i - bw; beg = beg > x? beg : x;
+		x = i + bw; end = end < x? end : x;
+		for (k = beg; k <= end; ++k) {
+			int u;
+			double z;
+			set_u(u, bw, i, k);
+			// max_k packs the 0-based reference position in the high bits and 0 (match) / 1 (insertion) in the low bits
+			z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z;
+			z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z;
+		}
+		max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
+		if (state) state[i-1] = max_k;
+		// NOTE(review): values above 100 are stored as 99, but exactly 100 passes through unchanged
+		if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k;
+#ifdef _MAIN
+		fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2,
+				"ACGT"[query[i]], "ACGT"[ref[(max_k>>2)+1]], max_k&3, max); // DEBUG
+#endif
+	}
+	/*** free ***/
+	for (i = 0; i <= l_query; ++i) {
+		free(f[i]); free(b[i]);
+	}
+	free(f); free(b); free(s); free(_qual);
+	return Pr;
+}
+
+#ifdef _MAIN
+#include <unistd.h>
+/*
+ * Command-line driver: encodes <ref> and <query> (ACGT -> 0..3, anything
+ * else -> 4) in place, runs kpa_glocal() in forward-only mode and prints the
+ * returned score to stderr.
+ *   -q INT  base quality assigned to every query base (default 30)
+ *   -b INT  band width (default 10)
+ * Returns 0 on success, 1 on a usage or allocation error.
+ */
+int main(int argc, char *argv[])
+{
+	uint8_t conv[256], *iqual, *ref, *query;
+	int c, l_ref, l_query, i, q = 30, b = 10, P;
+	while ((c = getopt(argc, argv, "b:q:")) >= 0) {
+		switch (c) {
+		case 'b': b = atoi(optarg); break;
+		case 'q': q = atoi(optarg); break;
+		}
+	}
+	if (optind + 2 > argc) {
+		fprintf(stderr, "Usage: %s [-q %d] [-b %d] <ref> <query>\n", argv[0], q, b); // example: acttc attc
+		return 1;
+	}
+	memset(conv, 4, 256);
+	conv['a'] = conv['A'] = 0; conv['c'] = conv['C'] = 1;
+	conv['g'] = conv['G'] = 2; conv['t'] = conv['T'] = 3;
+	ref = (uint8_t*)argv[optind]; query = (uint8_t*)argv[optind+1];
+	l_ref = strlen((char*)ref); l_query = strlen((char*)query);
+	for (i = 0; i < l_ref; ++i) ref[i] = conv[ref[i]];
+	for (i = 0; i < l_query; ++i) query[i] = conv[query[i]];
+	iqual = malloc(l_query > 0? l_query : 1); // never request zero bytes
+	if (iqual == 0) {
+		fprintf(stderr, "[main] out of memory\n");
+		return 1;
+	}
+	memset(iqual, q, l_query);
+	// apply -b to the parameter set that is actually handed to kpa_glocal();
+	// previously it was stored in kpa_par_def and therefore had no effect
+	kpa_par_alt.bw = b;
+	P = kpa_glocal(ref, l_ref, query, l_query, iqual, &kpa_par_alt, 0, 0);
+	fprintf(stderr, "%d\n", P);
+	free(iqual);
+	return 0;
+}
+#endif
diff --git a/kstring.c b/kstring.c
new file mode 100644
index 0000000..b8ff45c
--- /dev/null
+++ b/kstring.c
@@ -0,0 +1,212 @@
+#include <stdarg.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include "kstring.h"
+
+/*
+ * Append printf-style formatted text to the kstring `s`, growing its buffer
+ * when the text does not fit.
+ *
+ * Returns the number of characters appended (excluding the terminating NUL),
+ * or -1 when formatting or memory allocation fails; in that case s->l is left
+ * unchanged.
+ */
+int ksprintf(kstring_t *s, const char *fmt, ...)
+{
+	va_list ap;
+	int l;
+	va_start(ap, fmt);
+	l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'.
+	va_end(ap);
+	if (l < 0) return -1; // output error reported by vsnprintf
+	if ((size_t)l + 1 > s->m - s->l) {
+		// first attempt was truncated: grow to fit, then format again
+		size_t m = s->l + l + 2;
+		char *tmp;
+		kroundup32(m);
+		tmp = (char*)realloc(s->s, m);
+		if (tmp == 0) {
+			// undo the truncated write so the existing content stays terminated
+			if (s->s && s->m > s->l) s->s[s->l] = 0;
+			return -1;
+		}
+		s->s = tmp; s->m = m;
+		// each va_start is paired with exactly one va_end
+		va_start(ap, fmt);
+		l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap);
+		va_end(ap);
+		if (l < 0) return -1;
+	}
+	s->l += l;
+	return l;
+}
+
+/*
+ * Re-entrant tokenizer that does not modify the input string.
+ *
+ * str: string to start tokenizing, or NULL to continue from the state in aux.
+ * sep: separator set; when non-NULL the separator state in aux is rebuilt,
+ *      when NULL the state from the previous call is reused.
+ * aux: caller-owned state; after the call aux->p points at the end of the
+ *      returned token.
+ *
+ * Returns the start of the next token, or NULL once the string is exhausted.
+ * Bytes are handled as unsigned char so that separators and input bytes
+ * >= 0x80 never produce a negative table index or a negative aux->sep.
+ */
+char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux)
+{
+	const unsigned char *p, *start;
+	if (sep) { // set up the table
+		if (str == 0 && aux->finished) return 0; // no need to set up if we have finished
+		aux->finished = 0;
+		if (sep[0] && sep[1]) { // two or more separators: use the 256-bit membership table
+			aux->sep = -1;
+			aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
+			for (p = (const unsigned char*)sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f);
+		} else aux->sep = (unsigned char)sep[0]; // single separator: always non-negative
+	}
+	if (aux->finished) return 0;
+	else if (str) aux->p = str - 1, aux->finished = 0;
+	start = (const unsigned char*)aux->p + 1;
+	if (aux->sep < 0) {
+		for (p = start; *p; ++p)
+			if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
+	} else {
+		for (p = start; *p; ++p)
+			if (*p == aux->sep) break;
+	}
+	aux->p = (const char*)p; // end of token
+	if (*p == 0) aux->finished = 1; // no more tokens
+	return (char*)start;
+}
+
+// s MUST BE a null terminated string; l = strlen(s)
+/*
+ * Split `s` in place into fields.
+ *
+ * delimiter: the separating character, or 0 to split on runs of whitespace.
+ * _max:      in/out capacity (in ints) of the *_offsets array.
+ * _offsets:  in/out array receiving the start offset of every field; it is
+ *            grown with realloc() as needed. When _offsets is NULL the
+ *            fields are only counted: `s` is not modified and _max is not
+ *            touched.
+ *
+ * Returns the number of fields. If the offsets array cannot be grown it is
+ * freed, *_offsets is set to NULL, *_max to 0, and 0 is returned.
+ */
+int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
+{
+	int i, n, max, last_char, last_start, *offsets, l;
+	n = 0;
+	if (_offsets) { max = *_max; offsets = *_offsets; }
+	else { max = 0; offsets = 0; } // count-only mode
+	l = strlen(s);
+	
+#define __ksplit_aux do {												\
+		if (_offsets) {													\
+			s[i] = 0;													\
+			if (n == max) {												\
+				int *tmp;												\
+				max = max? max<<1 : 2;									\
+				tmp = (int*)realloc(offsets, sizeof(int) * max);		\
+				if (tmp == 0) {											\
+					free(offsets);										\
+					*_offsets = 0; *_max = 0;							\
+					return 0;											\
+				}														\
+				offsets = tmp;											\
+			}															\
+			offsets[n++] = last_start;									\
+		} else ++n;														\
+	} while (0)
+
+	for (i = 0, last_char = last_start = 0; i <= l; ++i) {
+		if (delimiter == 0) {
+			// the <ctype.h> classifiers need an unsigned char value (or EOF)
+			if (isspace((unsigned char)s[i]) || s[i] == 0) {
+				if (isgraph((unsigned char)last_char)) __ksplit_aux; // the end of a field
+			} else {
+				if (isspace((unsigned char)last_char) || last_char == 0) last_start = i;
+			}
+		} else {
+			if (s[i] == delimiter || s[i] == 0) {
+				if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
+			} else {
+				if (last_char == delimiter || last_char == 0) last_start = i;
+			}
+		}
+		last_char = s[i];
+	}
+	if (_offsets) { *_max = max; *_offsets = offsets; }
+	return n;
+}
+
+/**********************
+ * Boyer-Moore search *
+ **********************/
+
+typedef unsigned char ubyte_t;
+
+// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
+/*
+ * Build the two Boyer-Moore shift tables for the m-byte pattern `pat` in a
+ * single allocation of m + 256 ints: entries [0, m) are the good-suffix table
+ * (bmGs) and entries [m, m + 256) the bad-character table (bmBc). The caller
+ * owns the returned array and releases it with free().
+ *
+ * NOTE(review): suff[m - 1] is written unconditionally, so m must be >= 1;
+ * neither calloc() result is checked.
+ */
+static int *ksBM_prep(const ubyte_t *pat, int m)
+{
+	int i, *suff, *prep, *bmGs, *bmBc;
+	prep = (int*)calloc(m + 256, sizeof(int));
+	bmGs = prep; bmBc = prep + m;
+	{ // preBmBc()
+		for (i = 0; i < 256; ++i) bmBc[i] = m; // bytes absent from the pattern shift by its full length
+		for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; // later occurrences overwrite earlier ones
+	}
+	suff = (int*)calloc(m, sizeof(int)); // scratch table, freed before returning
+	{ // suffixes()
+		int f = 0, g;
+		suff[m - 1] = m;
+		g = m - 1;
+		for (i = m - 2; i >= 0; --i) {
+			if (i > g && suff[i + m - 1 - f] < i - g)
+				suff[i] = suff[i + m - 1 - f];
+			else {
+				if (i < g) g = i;
+				f = i;
+				while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
+				suff[i] = f - g;
+			}
+		}
+	}
+	{ // preBmGs()
+		int j = 0;
+		for (i = 0; i < m; ++i) bmGs[i] = m;
+		for (i = m - 1; i >= 0; --i)
+			if (suff[i] == i + 1)
+				for (; j < m - 1 - i; ++j)
+					if (bmGs[j] == m)
+						bmGs[j] = m - 1 - i;
+		for (i = 0; i <= m - 2; ++i)
+			bmGs[m - 1 - suff[i]] = m - 1 - i;
+	}
+	free(suff);
+	return prep;
+}
+
+/*
+ * Boyer-Moore search for the m-byte pattern `_pat` in the n-byte buffer `_str`.
+ *
+ * _prep: optional cache for the shift tables. If _prep is NULL the tables
+ *        are built for this call and freed before returning. If *_prep is
+ *        NULL the tables are built and stored there for reuse (the caller
+ *        frees them). Otherwise *_prep is used as is.
+ *
+ * Returns a pointer to the first match, or NULL if there is none.
+ * NOTE(review): the table builder writes entry m - 1, so m must be >= 1.
+ */
+void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep)
+{
+	int i, j, *prep = 0, *bmGs, *bmBc;
+	const ubyte_t *str, *pat;
+	void *hit = 0;
+	str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat;
+	prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep;
+	if (_prep && *_prep == 0) *_prep = prep;
+	bmGs = prep; bmBc = prep + m;
+	j = 0;
+	while (j <= n - m) {
+		for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
+		if (i < 0) { // whole pattern matched at offset j
+			hit = (void*)(str + j);
+			break;
+		} else {
+			// shift by the larger of the bad-character and good-suffix rules
+			int max = bmBc[str[i+j]] - m + 1 + i;
+			if (max < bmGs[i]) max = bmGs[i];
+			j += max;
+		}
+	}
+	// release the private tables on every path, including a successful match
+	if (_prep == 0) free(prep);
+	return hit;
+}
+
+/* Boyer-Moore substring search over two NUL-terminated strings; see kmemmem() for _prep. */
+char *kstrstr(const char *str, const char *pat, int **_prep)
+{
+	int str_len = strlen(str);
+	int pat_len = strlen(pat);
+	void *hit = kmemmem(str, str_len, pat, pat_len, _prep);
+	return (char*)hit;
+}
+
+/* Like kstrstr(), but only the first n bytes of str are searched. */
+char *kstrnstr(const char *str, const char *pat, int n, int **_prep)
+{
+	int pat_len = strlen(pat);
+	void *hit = kmemmem(str, n, pat, pat_len, _prep);
+	return (char*)hit;
+}
+
+/***********************
+ * The main() function *
+ ***********************/
+
+#ifdef KSTRING_MAIN
+#include <stdio.h>
+/* Self-test driver: exercises ksprintf(), ksplit(), kstrtok() and kstrstr() and prints the results. */
+int main()
+{
+	kstring_t *ks = (kstring_t*)calloc(1, sizeof(kstring_t));
+	ks_tokaux_t tok_state;
+	int *offsets, n_fields, idx;
+	char *tok;
+
+	// test ksprintf()
+	ksprintf(ks, " abcdefg:    %d ", 100);
+	printf("'%s'\n", ks->s);
+
+	// test ksplit()
+	offsets = ksplit(ks, 0, &n_fields);
+	idx = 0;
+	while (idx < n_fields) {
+		printf("field[%d] = '%s'\n", idx, ks->s + offsets[idx]);
+		++idx;
+	}
+
+	// test kstrtok(): collect one token per line in the (reset) string
+	ks->l = 0;
+	tok = kstrtok("ab:cde:fg/hij::k", ":/", &tok_state);
+	while (tok) {
+		kputsn(tok, tok_state.p - tok, ks);
+		kputc('\n', ks);
+		tok = kstrtok(0, 0, &tok_state);
+	}
+	printf("%s", ks->s);
+
+	// free
+	free(ks->s);
+	free(ks);
+	free(offsets);
+
+	{
+		// test kstrstr(): report every occurrence, reusing the cached tables
+		static char *str = "abcdefgcdgcagtcakcdcd";
+		static char *pat = "cd";
+		char *cursor = str, *hit;
+		int *prep = 0;
+		for (;;) {
+			hit = kstrstr(cursor, pat, &prep);
+			if (hit == 0) break;
+			printf("match: %s\n", hit);
+			cursor = hit + prep[0];
+		}
+		free(prep);
+	}
+	return 0;
+}
+#endif
diff --git a/padding.c b/padding.c
new file mode 100644
index 0000000..a8da562
--- /dev/null
+++ b/padding.c
@@ -0,0 +1,479 @@
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include "kstring.h"
+#include "sam_header.h"
+#include "sam.h"
+#include "bam.h"
+#include "faidx.h"
+
+bam_header_t *bam_header_dup(const bam_header_t *h0); /*in sam.c*/
+
+/*
+ * Overwrite the CIGAR stored in b->data (right after the first l_qname
+ * bytes) with the n operations in `cigar`. When the operation count changes,
+ * the bytes that follow the CIGAR are moved and data_len / n_cigar updated,
+ * growing the buffer first if required.
+ */
+static void replace_cigar(bam1_t *b, int n, uint32_t *cigar)
+{
+	if (n != b->core.n_cigar) {
+		int old_end = b->core.l_qname + b->core.n_cigar * 4; // first byte after the old CIGAR
+		int new_len = b->data_len + (n - b->core.n_cigar) * 4;
+		if (new_len > b->m_data) {
+			b->m_data = new_len;
+			kroundup32(b->m_data);
+			b->data = (uint8_t*)realloc(b->data, b->m_data);
+		}
+		// shift everything behind the CIGAR to its new position
+		memmove(b->data + b->core.l_qname + n * 4, b->data + old_end, b->data_len - old_end);
+		b->data_len = new_len;
+		b->core.n_cigar = n;
+	}
+	memcpy(b->data + b->core.l_qname, cigar, n * 4);
+}
+
+/*
+ * Append the value _v to the uint32_t array _c, where _n is the number of
+ * used entries and _m the capacity; the array is doubled (starting at 4
+ * entries) when it is full.
+ * NOTE(review): _c, _n and _m are expanded several times and must be plain
+ * lvalues; the realloc() result is not checked.
+ */
+#define write_cigar(_c, _n, _m, _v) do { \
+		if (_n == _m) { \
+			_m = _m? _m<<1 : 4; \
+			_c = (uint32_t*)realloc(_c, _m * 4); \
+		} \
+		_c[_n++] = (_v); \
+	} while (0)
+
+/*
+ * Expand the aligned part of read `b` into `s`, one byte per padded
+ * reference column covered by the read: M/=/X operations copy the base code
+ * returned by bam1_seqi(), D operations emit 0, soft clips are skipped in
+ * the read and hard clips are ignored. On return s->l equals the summed
+ * length of the M/=/X/D operations.
+ *
+ * Any other CIGAR operation is reported on stderr and trips an assertion.
+ */
+static void unpad_seq(bam1_t *b, kstring_t *s)
+{
+	int k, j, i;
+	int length;
+	uint32_t *cigar = bam1_cigar(b);
+	uint8_t *seq = bam1_seq(b);
+	// b->core.l_qseq gives length of the SEQ entry (including soft clips, S)
+	// We need the padded length after alignment from the CIGAR (excluding
+	// soft clips S, but including pads from CIGAR D operations)
+	length = 0;
+	for (k = 0; k < b->core.n_cigar; ++k) {
+		int op, ol;
+		op= bam_cigar_op(cigar[k]);
+		ol = bam_cigar_oplen(cigar[k]);
+		if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF || op == BAM_CDEL)
+			length += ol;
+	}
+	ks_resize(s, length);
+	for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
+		int op, ol;
+		op = bam_cigar_op(cigar[k]);
+		ol = bam_cigar_oplen(cigar[k]);
+		if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+			for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam1_seqi(seq, j);
+		} else if (op == BAM_CSOFT_CLIP) {
+			j += ol;
+		} else if (op == BAM_CHARD_CLIP) {
+			/* do nothing */
+		} else if (op == BAM_CDEL) {
+			for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
+		} else {
+			fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam1_qname(b));
+			// assert(-1) is always true and never fired; assert(0) is the intended hard stop
+			assert(0);
+		}
+	}
+	assert(length == s->l);
+}
+
+/*
+ * Fetch sequence `ref_name` from the FASTA index and store it in `seq` using
+ * the same encoding as unpad_seq(): gap characters ('-' or '*') become 0 and
+ * every other character its bam_nt16_table code.
+ *
+ * Returns 0 on success. Returns -1 (after printing a message) when the
+ * fetched length differs from ref_len or when a character maps to 0 or 16
+ * in bam_nt16_table.
+ */
+int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq)
+{
+	char base;
+	char *fai_ref = 0;
+	int fai_ref_len = 0, k;
+
+	fai_ref = fai_fetch(fai, ref_name, &fai_ref_len);
+	if (fai_ref_len != ref_len) {
+		fprintf(stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len);
+		free(fai_ref);
+		return -1;
+	}
+	ks_resize(seq, ref_len);
+	seq->l = 0;
+	for (k = 0; k < ref_len; ++k) {
+		base = fai_ref[k];
+		if (base == '-' || base == '*') {
+			// Map gaps to null to match unpad_seq function
+			seq->s[seq->l++] = 0;
+		} else {
+			// index as unsigned char: a plain char >= 0x80 may be negative
+			int i = bam_nt16_table[(unsigned char)base];
+			if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16
+				fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name);
+				free(fai_ref);
+				return -1;
+			}
+			seq->s[seq->l++] = i;
+		}
+	}
+	assert(ref_len == seq->l);
+	free(fai_ref);
+	return 0;
+}
+
+/*
+ * Count the non-gap characters of FASTA sequence `ref_name`, i.e. its length
+ * once the pad characters '-' and '*' are removed.
+ *
+ * Returns that count, or -1 (after printing a message) when the fetched
+ * length differs from padded_len or when a character maps to 0 or 16 in
+ * bam_nt16_table.
+ */
+int get_unpadded_len(faidx_t *fai, char *ref_name, int padded_len)
+{
+	char base;
+	char *fai_ref = 0;
+	int fai_ref_len = 0, k;
+	int bases=0, gaps=0;
+
+	fai_ref = fai_fetch(fai, ref_name, &fai_ref_len);
+	if (fai_ref_len != padded_len) {
+		fprintf(stderr, "[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len);
+		free(fai_ref);
+		return -1;
+	}
+	for (k = 0; k < padded_len; ++k) {
+		//fprintf(stderr, "[depad] checking base %i of %i or %i\n", k+1, ref_len, strlen(fai_ref));
+		base = fai_ref[k];
+		if (base == '-' || base == '*') {
+			gaps += 1;
+		} else {
+			// index as unsigned char: a plain char >= 0x80 may be negative
+			int i = bam_nt16_table[(unsigned char)base];
+			if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16
+				fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence '%s'\n", base, (int)base, ref_name);
+				free(fai_ref);
+				return -1;
+			}
+			bases += 1;
+		}
+	}
+	free(fai_ref);
+	assert (padded_len == bases + gaps);
+	return bases;
+}
+
+/*
+ * Rebuild the padded -> unpadded coordinate map for reference `ref`:
+ * posmap[i] is the number of non-zero (non-gap) entries of ref.s before
+ * index i. The array is resized to ref.m ints and returned.
+ *
+ * Declared `static inline`: a plain `inline` definition with external
+ * linkage provides no out-of-line definition under C99 inline semantics,
+ * which can leave the calls in this file unresolved at link time.
+ * NOTE(review): the realloc() result is not checked.
+ */
+static inline int * update_posmap(int *posmap, kstring_t ref)
+{
+	int i, k;
+	posmap = realloc(posmap, ref.m * sizeof(int));
+	for (i = k = 0; i < ref.l; ++i) {
+		posmap[i] = k;
+		if (ref.s[i]) ++k;
+	}
+	return posmap;
+}
+
+/*
+ * Convert every alignment read from `in` from padded to unpadded reference
+ * coordinates and write it to `out`.
+ *
+ * A record at position 0 whose name equals its reference name is treated as
+ * the embedded (padded) reference; when `fai` is given it is checked against
+ * the FASTA sequence. Otherwise the reference is loaded from `fai` when
+ * needed. For each read with a CIGAR the CIGAR is rebuilt against the
+ * reference and POS/MPOS are translated through the position map.
+ *
+ * Returns 0 on success, 1 if the input is truncated, -1 on any other error.
+ * All working buffers are released on every exit path.
+ */
+int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai)
+{
+	bam_header_t *h = 0;
+	bam1_t *b = 0;
+	kstring_t r, q;
+	int r_tid = -1;
+	uint32_t *cigar2 = 0;
+	int ret = 0, n2 = 0, m2 = 0, *posmap = 0;
+	int read_ret;
+
+	b = bam_init1();
+	r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
+	h = in->header;
+	while ((read_ret = samread(in, b)) >= 0) { // read one alignment from `in'
+		uint32_t *cigar = bam1_cigar(b);
+		n2 = 0;
+		if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) {
+			// fprintf(stderr, "[depad] Found embedded reference '%s'\n", bam1_qname(b));
+			r_tid = b->core.tid;
+			unpad_seq(b, &r);
+			if (h->target_len[r_tid] != r.l) {
+				fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %d in BAM header, but %ld in embedded reference\n", bam1_qname(b), h->target_len[r_tid], (long)r.l);
+				ret = -1;
+				goto depad_cleanup;
+			}
+			if (fai) {
+				// Check the embedded reference matches the FASTA file
+				if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) {
+					fprintf(stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]);
+					ret = -1;
+					goto depad_cleanup;
+				}
+				assert(r.l == q.l);
+				int i;
+				for (i = 0; i < r.l; ++i) {
+					if (r.s[i] != q.s[i]) {
+						// Show gaps as ASCII 45
+						fprintf(stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n",
+							h->target_name[b->core.tid], i+1,
+							r.s[i] ? bam_nt16_rev_table[(int)r.s[i]] : 45,
+							q.s[i] ? bam_nt16_rev_table[(int)q.s[i]] : 45);
+						ret = -1;
+						goto depad_cleanup;
+					}
+				}
+			}
+			// the embedded reference itself becomes a single match over its whole sequence
+			write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
+			replace_cigar(b, n2, cigar2);
+			posmap = update_posmap(posmap, r);
+		} else if (b->core.n_cigar > 0) {
+			int i, k, op;
+			if (b->core.tid < 0) {
+				fprintf(stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam1_qname(b));
+				ret = -1;
+				goto depad_cleanup;
+			} else if (b->core.tid == r_tid) {
+				; // good case, reference available
+				//fprintf(stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam1_qname(b));
+			} else if (fai) {
+				if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
+					fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
+					ret = -1;
+					goto depad_cleanup;
+				}
+				posmap = update_posmap(posmap, r);
+				r_tid = b->core.tid;
+				// fprintf(stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]);
+			} else {
+				fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]);
+				ret = -1;
+				goto depad_cleanup;
+			}
+			unpad_seq(b, &q);
+			// carry leading clip operations over unchanged
+			if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) {
+				write_cigar(cigar2, n2, m2, cigar[0]);
+			} else if (bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) {
+				write_cigar(cigar2, n2, m2, cigar[0]);
+				if (b->core.n_cigar > 2 && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) {
+					write_cigar(cigar2, n2, m2, cigar[1]);
+				}
+			}
+			/* Determine CIGAR operator for each base in the aligned read */
+			for (i = 0, k = b->core.pos; i < q.l; ++i, ++k)
+				q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD);
+			/* Include any pads if starts with an insert */
+			if (q.s[0] == BAM_CINS) {
+				for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k);
+				if (k) write_cigar(cigar2, n2, m2, bam_cigar_gen(k, BAM_CPAD));
+			}
+			/* Count consecutive CIGAR operators to turn into a CIGAR string */
+			for (i = k = 1, op = q.s[0]; i < q.l; ++i) {
+				if (op != q.s[i]) {
+					write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
+					op = q.s[i]; k = 1;
+				} else ++k;
+			}
+			write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
+			// carry trailing clip operations over unchanged
+			if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) {
+				write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
+			} else if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CHARD_CLIP) {
+				if (b->core.n_cigar > 2 && bam_cigar_op(cigar[b->core.n_cigar-2]) == BAM_CSOFT_CLIP) {
+					write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-2]);
+				}
+				write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
+			}
+			/* Remove redundant P operators between M/X/=/D operators, e.g. 5M2P10M -> 15M */
+			int pre_op, post_op;
+			for (i = 2; i < n2; ++i)
+				if (bam_cigar_op(cigar2[i-1]) == BAM_CPAD) {
+					pre_op = bam_cigar_op(cigar2[i-2]);
+					post_op = bam_cigar_op(cigar2[i]);
+					/* Note don't need to check for X/= as code above will use M only */
+					if ((pre_op == BAM_CMATCH || pre_op == BAM_CDEL) && (post_op == BAM_CMATCH || post_op == BAM_CDEL)) {
+						/* This is a redundant P operator */
+						cigar2[i-1] = 0; // i.e. 0M
+						/* If had same operator either side, combine them in post_op */
+						if (pre_op == post_op) {
+							/* If CIGAR M, could treat as simple integers since BAM_CMATCH is zero*/
+							cigar2[i] = bam_cigar_gen(bam_cigar_oplen(cigar2[i-2]) + bam_cigar_oplen(cigar2[i]), post_op);
+							cigar2[i-2] = 0; // i.e. 0M
+						}
+					}
+				}
+			/* Remove the zero'd operators (0M) */
+			for (i = k = 0; i < n2; ++i)
+				if (cigar2[i]) cigar2[k++] = cigar2[i];
+			n2 = k;
+			replace_cigar(b, n2, cigar2);
+			b->core.pos = posmap[b->core.pos];
+			if (b->core.mtid < 0 || b->core.mpos < 0) {
+				/* Nice case, no mate to worry about*/
+				// fprintf(stderr, "[depad] Read '%s' mate not mapped\n", bam1_qname(b));
+				/* TODO - Warning if FLAG says mate should be mapped? */
+				/* Clean up funny input where mate position is given but mate reference is missing: */
+				b->core.mtid = -1;
+				b->core.mpos = -1;
+			} else if (b->core.mtid == b->core.tid) {
+				/* Nice case, same reference */
+				// fprintf(stderr, "[depad] Read '%s' mate mapped to same ref\n", bam1_qname(b));
+				b->core.mpos = posmap[b->core.mpos];
+			} else {
+				/* Nasty case, Must load alternative posmap */
+				// fprintf(stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]);
+				if (!fai) {
+					fprintf(stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]);
+					ret = -1;
+					goto depad_cleanup;
+				}
+				/* Temporarily load the other reference sequence */
+				if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) {
+					fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]);
+					ret = -1;
+					goto depad_cleanup;
+				}
+				posmap = update_posmap(posmap, r);
+				b->core.mpos = posmap[b->core.mpos];
+				/* Restore the reference and posmap*/
+				if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
+					fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
+					ret = -1;
+					goto depad_cleanup;
+				}
+				posmap = update_posmap(posmap, r);
+			}
+		}
+		samwrite(out, b);
+	}
+	if (read_ret < -1) {
+		fprintf(stderr, "[depad] truncated file.\n");
+		ret = 1;
+	}
+depad_cleanup:
+	// single exit: the CIGAR scratch buffer is released here too
+	free(r.s); free(q.s); free(posmap); free(cigar2);
+	bam_destroy1(b);
+	return ret;
+}
+
+/*
+ * Return a copy of header `old` adjusted for unpadded output:
+ *   - each target length is replaced by its unpadded length from `fai`
+ *     (targets whose length cannot be determined keep the padded length and
+ *     an error is printed);
+ *   - every @SQ line is dropped from the header text, all other '@' lines
+ *     are kept in order.
+ * The caller owns the returned header. If there is no header text, only the
+ * target lengths are adjusted.
+ */
+bam_header_t * fix_header(bam_header_t *old, faidx_t *fai)
+{
+	int i = 0, unpadded_len = 0;
+	bam_header_t *header = 0 ;
+	const char *text;
+	char *out;
+	size_t new_len;
+
+	header = bam_header_dup(old);
+	for (i = 0; i < old->n_targets; ++i) {
+		unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]);
+		if (unpadded_len < 0) {
+			fprintf(stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]);
+		} else {
+			header->target_len[i] = unpadded_len;
+			//fprintf(stderr, "[depad] Recalculating '%s' length %i -> %i\n", old->target_name[i], old->target_len[i], header->target_len[i]);
+		}
+	}
+	// nothing to rewrite when the input carried no header text
+	if (old->text == 0 || header->text == 0) return header;
+	/* Duplicating the header allocated new buffer for header string */
+	/* After modifying the @SQ lines it will only get smaller, since */
+	/* the LN entries will be the same or shorter, and we'll remove */
+	/* any MD entries (MD5 checksums). */
+	assert(strlen(old->text) == strlen(header->text));
+	assert (0==strcmp(old->text, header->text));
+	text = old->text;
+	out = header->text; /* Reuse the allocated buffer; `out` is the write cursor */
+	while (text[0]=='@') {
+		const char *end = strchr(text, '\n');
+		// a final line without '\n' is taken up to the end of the text
+		size_t line_len = end? (size_t)(end - text) + 1 : strlen(text);
+		if (text[1]=='S' && text[2]=='Q' && text[3]=='\t') {
+			/* TODO - edit the @SQ line here to remove MD and fix LN. */
+			/* For now just remove the @SQ line, and samtools will */
+			/* automatically generate a minimal replacement with LN. */
+			/* However, that discards any other tags like AS, SP, UR. */
+			//fprintf(stderr, "[depad] Removing @SQ line\n");
+		} else {
+			/* Copy this line to the new header */
+			memcpy(out, text, line_len);
+			out += line_len;
+		}
+		text += line_len;
+	}
+	*out = '\0';
+	assert (text[0]=='\0');
+	new_len = out - header->text;
+	/* Check we didn't overflow the buffer */
+	assert (new_len <= strlen(old->text));
+	if (new_len < header->l_text) {
+		//fprintf(stderr, "[depad] Reallocating header buffer\n");
+		char *newtext = malloc(new_len + 1);
+		if (newtext) { // on failure keep the larger, already terminated buffer
+			memcpy(newtext, header->text, new_len + 1);
+			free(header->text);
+			header->text = newtext;
+		}
+		header->l_text = new_len;
+	}
+	//fprintf(stderr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text);
+	return header;
+}
+
+static int usage(int is_long_help);
+
+/*
+ * Entry point of the "depad" command: parses the options, opens the input
+ * and output, optionally loads a FASTA reference (-T) to correct the header,
+ * and runs bam_pad2unpad().
+ *
+ * Returns the result of bam_pad2unpad(), or 1 on a usage error or when a
+ * file, its header, or the reference cannot be opened.
+ */
+int main_pad2unpad(int argc, char *argv[])
+{
+	samfile_t *in = 0, *out = 0;
+	bam_header_t *h = 0;
+	faidx_t *fai = 0;
+	int c, is_bamin = 1, compress_level = -1, is_bamout = 1, is_long_help = 0;
+	char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0;
+	int ret=0;
+
+	/* parse command-line options */
+	strcpy(in_mode, "r"); strcpy(out_mode, "w");
+	while ((c = getopt(argc, argv, "Sso:u1T:?")) >= 0) {
+		switch (c) {
+		case 'S': is_bamin = 0; break;
+		case 's': assert(compress_level == -1); is_bamout = 0; break;
+		case 'o': free(fn_out); fn_out = strdup(optarg); break; // a repeated -o replaces the earlier name
+		case 'u': assert(is_bamout == 1); compress_level = 0; break;
+		case '1': assert(is_bamout == 1); compress_level = 1; break;
+		case 'T': free(fn_ref); fn_ref = strdup(optarg); break;
+		case '?': is_long_help = 1; break;
+		default:
+			free(fn_out); free(fn_ref);
+			return usage(is_long_help);
+		}
+	}
+	if (argc == optind) {
+		free(fn_out); free(fn_ref);
+		return usage(is_long_help);
+	}
+
+	if (is_bamin) strcat(in_mode, "b");
+	if (is_bamout) strcat(out_mode, "b");
+	strcat(out_mode, "h");
+	if (compress_level >= 0) {
+		char tmp[2];
+		tmp[0] = compress_level + '0'; tmp[1] = '\0';
+		strcat(out_mode, tmp);
+	}
+
+	// Load FASTA reference (also needed for SAM -> BAM if missing header)
+	if (fn_ref) {
+		fn_list = samfaipath(fn_ref);
+		fai = fai_load(fn_ref);
+		if (fai == 0) {
+			// without the index the header cannot be corrected
+			fprintf(stderr, "[depad] failed to load reference \"%s\".\n", fn_ref);
+			ret = 1;
+			goto depad_end;
+		}
+	}
+	// open file handlers
+	if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) {
+		fprintf(stderr, "[depad] failed to open \"%s\" for reading.\n", argv[optind]);
+		ret = 1;
+		goto depad_end;
+	}
+	if (in->header == 0) {
+		fprintf(stderr, "[depad] failed to read the header from \"%s\".\n", argv[optind]);
+		ret = 1;
+		goto depad_end;
+	}
+	if (in->header->text == 0 || in->header->l_text == 0) {
+		fprintf(stderr, "[depad] Warning - failed to read any header text from \"%s\".\n", argv[optind]);
+		assert (0 == in->header->l_text);
+		assert (0 == in->header->text);
+	}
+	if (fn_ref) {
+		h = fix_header(in->header, fai);
+	} else {
+		fprintf(stderr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n");
+		h = in->header;
+	}
+	if ((out = samopen(fn_out? fn_out : "-", out_mode, h)) == 0) {
+		fprintf(stderr, "[depad] failed to open \"%s\" for writing.\n", fn_out? fn_out : "standard output");
+		ret = 1;
+		goto depad_end;
+	}
+
+	// Do the depad
+	ret = bam_pad2unpad(in, out, fai);
+
+depad_end:
+	// close files, free and return
+	if (fai) fai_destroy(fai);
+	// `in` is NULL when the input could not be opened; h is only ours when it is a corrected copy
+	if (in && h && h != in->header) bam_header_destroy(h);
+	if (in) samclose(in);
+	if (out) samclose(out);
+	free(fn_list); free(fn_out); free(fn_ref);
+	return ret;
+}
+
+/*
+ * Print the depad help text to stderr; the extra notes are only printed
+ * when is_long_help is non-zero. Always returns 1 so that callers can
+ * `return usage(...)` directly.
+ */
+static int usage(int is_long_help)
+{
+	static const char *const short_help[] = {
+		"\n",
+		"Usage:   samtools depad <in.bam>\n\n",
+		"Options: -s       output is SAM (default is BAM)\n",
+		"         -S       input is SAM (default is BAM)\n",
+		"         -u       uncompressed BAM output (can't use with -s)\n",
+		"         -1       fast compression BAM output (can't use with -s)\n",
+		"         -T FILE  reference sequence file [null]\n",
+		"         -o FILE  output file name [stdout]\n",
+		"         -?       longer help\n",
+		"\n"
+	};
+	unsigned line;
+	for (line = 0; line < sizeof(short_help) / sizeof(short_help[0]); ++line)
+		fputs(short_help[line], stderr);
+	if (is_long_help) {
+		fputs("Notes:\n"
+		      "\n"
+		      "  1. Requires embedded reference sequences (before the reads for that reference),\n"
+		      "     with the future aim to also support a FASTA padded reference sequence file.\n"
+		      "\n"
+		      "  2. The input padded alignment read's CIGAR strings must not use P or I operators.\n"
+		      "\n", stderr);
+	}
+	return 1;
+}
diff --git a/phase.c b/phase.c
new file mode 100644
index 0000000..ef4eff9
--- /dev/null
+++ b/phase.c
@@ -0,0 +1,687 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <math.h>
+#include <zlib.h>
+#include "bam.h"
+#include "errmod.h"
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 16384)
+
+#define MAX_VARS 256
+#define FLIP_PENALTY 2
+#define FLIP_THRES 4
+#define MASK_THRES 3
+
+#define FLAG_FIX_CHIMERA 0x1
+#define FLAG_LIST_EXCL   0x4
+#define FLAG_DROP_AMBI   0x8
+
+typedef struct {
+	// configurations, initialized in the main function
+	int flag, k, min_baseQ, min_varLOD, max_depth; // FLAG_* bits, block length (-k), min base quality (-Q), min het LOD (-q), max read depth (-D)
+	// other global variables
+	int vpos_shift; // number of variant sites already reported on the current chromosome; offsets the printed het index
+	bamFile fp; // input BAM that readaln() reads from
+	char *pre; // prefix of the output BAMs (-b); alignments are queued in b[] only when this is set
+	bamFile out[3]; // <pre>.0.bam, <pre>.1.bam and <pre>.chimera.bam, in that order
+	// alignment queue
+	int n, m; // entries in use / entries allocated in b[]
+	bam1_t **b; // copies of the alignments read so far and not yet written out by dump_aln()
+} phaseg_t;
+
+typedef struct {
+	int8_t seq[MAX_VARS]; // TODO: change to dynamic memory allocation! One code per covered site: 0 = neither called allele, 1 = first allele of cns[], 2 = second allele
+	int vpos, beg, end; // vpos: index in cns[] of the first covered site; beg/end: alignment start (core.pos) and end (bam_calend)
+	uint32_t vlen:16, single:1, flip:1, phase:1, phased:1, ambig:1; // vlen: sites spanned; single: spans one site; flip: chimera repaired; phase: assigned haplotype; phased: in != out; ambig: weak evidence
+	uint32_t in:16, out:16; // in-phase and out-phase: number of sites supporting / contradicting the assigned phase
+} frag_t, *frag_p;
+
+#define rseq_lt(a,b) ((a)->vpos < (b)->vpos)
+
+#include "khash.h"
+KHASH_SET_INIT_INT64(set64)
+KHASH_MAP_INIT_INT64(64, frag_t)
+
+typedef khash_t(64) nseq_t;
+
+#include "ksort.h"
+KSORT_INIT(rseq, frag_p, rseq_lt)
+
+static char nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+
+static inline uint64_t X31_hash_string(const char *s)
+{ // multiply-by-31 string hash; the empty string hashes to 0
+	uint64_t hash = *s; // seeded with the first character
+	if (hash != 0) { const char *p; for (p = s + 1; *p != 0; ++p) hash = hash * 31 + *p; }
+	return hash;
+}
+
+static void count1(int l, const uint8_t *seq, int *cnt)
+{ // add one to cnt[] for every 0/1 pattern of length l compatible with seq (0 = unknown, 1/2 = observed allele)
+	int pos, unknown = 0;
+	uint32_t fill;
+	if (seq[l-1] == 0) return; // the window must end on an observed allele
+	for (pos = 0; pos < l; ++pos)
+		unknown += (seq[pos] == 0);
+	if (l - unknown <= 1) return; // a single observed allele carries no linkage information
+	for (fill = 0; fill < 1u<<unknown; ++fill) { // every way of filling in the unknown positions
+		uint32_t pattern = 0, bits = fill; // unknown positions consume bits of fill, lowest bit first
+		for (pos = 0; pos < l; ++pos) {
+			if (seq[pos] != 0) {
+				pattern = pattern<<1 | (seq[pos] - 1);
+			} else {
+				pattern = pattern<<1 | (bits & 1);
+				bits >>= 1;
+			}
+		}
+		++cnt[pattern];
+	}
+}
+
+static int **count_all(int l, int vpos, nseq_t *hash)
+{ // for each of the vpos sites, tally the length-l local haplotypes seen in the fragments; caller frees every row and the array
+	khint_t it;
+	int site, col, **tab;
+	uint8_t *win; // the l allele codes ending at the current site, zero-padded on the left
+	tab = calloc(vpos, sizeof(void*));
+	win = calloc(l, 1);
+	for (site = 0; site < vpos; ++site) tab[site] = calloc(1<<l, sizeof(int));
+	for (it = 0; it != kh_end(hash); ++it) {
+		frag_t *frag;
+		if (!kh_exist(hash, it)) continue;
+		frag = &kh_val(hash, it);
+		if (frag->single || frag->vpos >= vpos) continue; // singleton; or out of region
+		if (frag->vlen == 1) { frag->single = 1; continue; } // such reads should have been flagged previously if everything is right
+		// slide an l-wide window along the fragment, one site at a time
+		for (site = 1; site < frag->vlen; ++site) {
+			for (col = 0; col < l; ++col) {
+				const int src = site - (l - 1 - col); // fragment index that lands in window column col
+				win[col] = src < 0? 0 : frag->seq[src];
+			}
+			count1(l, win, tab[frag->vpos + site]);
+		}
+	}
+	free(win);
+	return tab;
+}
+
+// phasing: dynamic programming over the vpos sites, where w[i] is the table of 1<<l local-haplotype counts for site i; returns a calloc'ed array of vpos 0/1 calls that the caller frees
+static int8_t *dynaprog(int l, int vpos, int **w)
+{
+	int *f[2], *curr, *prev, max, i;
+	int8_t **b, *h = 0;
+	uint32_t x, z = 1u<<(l-1), mask = (1u<<l) - 1; // z: number of states (l-bit haplotypes whose top bit is clear); mask: the low l bits
+	f[0] = calloc(z, sizeof(int));
+	f[1] = calloc(z, sizeof(int));
+	b = calloc(vpos, sizeof(void*));
+	prev = f[0]; curr = f[1]; // score rows for the previous and the current site
+	// fill the backtrack matrix
+	for (i = 0; i < vpos; ++i) {
+		int *wi = w[i], *tmp;
+		int8_t *bi;
+		bi = b[i] = calloc(z, 1); // bi[x]: 0 if predecessor y0 won for state x, 1 if y1 won
+		/* In the following, x is the current state, which is the
+		 * lexicographically smaller local haplotype. xc is the complement of
+		 * x, or the larger local haplotype; y0 and y1 are the two predecessors
+		 * of x. */
+		for (x = 0; x < z; ++x) { // x is the smaller haplotype of the pair (x, xc)
+			uint32_t y0, y1, xc;
+			int c0, c1;
+			xc = ~x&mask; y0 = x>>1; y1 = xc>>1;
+			c0 = prev[y0] + wi[x] + wi[xc];
+			c1 = prev[y1] + wi[x] + wi[xc];
+			if (c0 > c1) bi[x] = 0, curr[x] = c0;
+			else bi[x] = 1, curr[x] = c1; // ties go to y1
+		}
+		tmp = prev; prev = curr; curr = tmp; // swap
+	}
+	{ // backtrack
+		uint32_t max_x = 0;
+		int which = 0; // toggled each time the trace steps through a complemented predecessor
+		h = calloc(vpos, 1);
+		for (x = 0, max = 0, max_x = 0; x < z; ++x) // best final state; after the last swap prev holds the last site's scores
+			if (prev[x] > max) max = prev[x], max_x = x;
+		for (i = vpos - 1, x = max_x; i >= 0; --i) {
+			h[i] = which? (~x&1) : (x&1); // lowest bit of the state is the allele at site i
+			which = b[i][x]? !which : which;
+			x = b[i][x]? (~x&mask)>>1 : x>>1;
+		}
+	}
+	// free
+	for (i = 0; i < vpos; ++i) free(b[i]);
+	free(f[0]); free(f[1]); free(b);
+	return h;
+}
+
+// phase each fragment: compare every fragment starting before vpos with the calls in path[], store its phase and support counts, optionally (flip != 0) repair chimeras, and return per-site counters that the caller frees
+static uint64_t *fragphase(int vpos, const int8_t *path, nseq_t *hash, int flip)
+{
+	khint_t k;
+	uint64_t *pcnt; // per site, four 16-bit counters from low to high bits: phase-0 agree, phase-0 disagree, phase-1 agree, phase-1 disagree
+	uint32_t *left, *rght, max; // scratch prefix/suffix tallies for the chimera search; max is their current capacity
+	left = rght = 0; max = 0;
+	pcnt = calloc(vpos, 8);
+	for (k = 0; k < kh_end(hash); ++k) {
+		if (kh_exist(hash, k)) {
+			int i, c[2]; // c[0]: sites where the fragment carries the path allele; c[1]: sites where it carries the other one
+			frag_t *f = &kh_val(hash, k);
+			if (f->vpos >= vpos) continue;
+			// get the phase
+			c[0] = c[1] = 0;
+			for (i = 0; i < f->vlen; ++i) {
+				if (f->seq[i] == 0) continue;
+				++c[f->seq[i] == path[f->vpos + i] + 1? 0 : 1];
+			}
+			f->phase = c[0] > c[1]? 0 : 1;
+			f->in = c[f->phase]; f->out = c[1 - f->phase];
+			f->phased = f->in == f->out? 0 : 1;
+			f->ambig = (f->in && f->out && f->out < 3 && f->in <= f->out + 1)? 1 : 0;
+			// fix chimera
+			f->flip = 0;
+			if (flip && c[0] >= 3 && c[1] >= 3) { // only fragments with at least 3 sites on each side are candidates
+				int sum[2], m, mi, md; // m: best score; mi: last site of the head; md: 0 = flip the tail, 1 = flip the head
+				if (f->vlen > max) { // enlarge the array
+					max = f->vlen;
+					kroundup32(max);
+					left = realloc(left, max * 4);
+					rght = realloc(rght, max * 4);
+				}
+				for (i = 0, sum[0] = sum[1] = 0; i < f->vlen; ++i) { // get left counts
+					if (f->seq[i]) {
+						int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1; // allele re-expressed relative to the fragment's phase
+						++sum[c == path[f->vpos + i]? 0 : 1];
+					}
+					left[i] = sum[1]<<16 | sum[0]; // mismatches in the high half, matches in the low half, over sites 0..i
+				}
+				for (i = f->vlen - 1, sum[0] = sum[1] = 0; i >= 0; --i) { // get right counts
+					if (f->seq[i]) {
+						int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
+						++sum[c == path[f->vpos + i]? 0 : 1];
+					}
+					rght[i] = sum[1]<<16 | sum[0]; // same packing, over sites i..vlen-1
+				}
+				// find the best flip point
+				for (i = m = 0, mi = -1, md = -1; i < f->vlen - 1; ++i) {
+					int a[2]; // a[0]: score of flipping the tail after site i; a[1]: score of flipping the head up to site i
+					a[0] = (left[i]&0xffff) + (rght[i+1]>>16&0xffff) - (rght[i+1]&0xffff) * FLIP_PENALTY;
+					a[1] = (left[i]>>16&0xffff) + (rght[i+1]&0xffff) - (rght[i+1]>>16&0xffff) * FLIP_PENALTY;
+					if (a[0] > a[1]) {
+						if (a[0] > m) m = a[0], md = 0, mi = i;
+					} else {
+						if (a[1] > m) m = a[1], md = 1, mi = i;
+					}
+				}
+				if (m - c[0] >= FLIP_THRES && m - c[1] >= FLIP_THRES) { // then flip
+					f->flip = 1;
+					if (md == 0) { // flip the tail
+						for (i = mi + 1; i < f->vlen; ++i)
+							if (f->seq[i] == 1) f->seq[i] = 2;
+							else if (f->seq[i] == 2) f->seq[i] = 1;
+					} else { // flip the head
+						for (i = 0; i <= mi; ++i)
+							if (f->seq[i] == 1) f->seq[i] = 2;
+							else if (f->seq[i] == 2) f->seq[i] = 1;
+					}
+				}
+			}
+			// update pcnt[]
+			if (!f->single) {
+				for (i = 0; i < f->vlen; ++i) {
+					int c;
+					if (f->seq[i] == 0) continue;
+					c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
+					if (c == path[f->vpos + i]) {
+						if (f->phase == 0) ++pcnt[f->vpos + i];
+						else pcnt[f->vpos + i] += 1ull<<32;
+					} else {
+						if (f->phase == 0) pcnt[f->vpos + i] += 1<<16;
+						else pcnt[f->vpos + i] += 1ull<<48;
+					}
+				}
+			}
+		}
+	}
+	free(left); free(rght);
+	return pcnt;
+}
+
+static uint64_t *genmask(int vpos, const uint64_t *pcnt, int *_n)
+{ // scan the per-site counters from fragphase() for runs of error-rich sites; returns *_n regions packed as beg<<32|end (0 when none), caller frees
+	int i, max = 0, max_i = -1, m = 0, n = 0, beg = 0, score = 0; // max/max_i: peak of the running score and its site; n/m: regions stored/allocated
+	uint64_t *list = 0;
+	for (i = 0; i < vpos; ++i) {
+		uint64_t x = pcnt[i];
+		int c[4], pre = score, s; // c[0]/c[2]: supports for phase 0/1; c[1]/c[3]: errors for phase 0/1
+		c[0] = x&0xffff; c[1] = x>>16&0xffff; c[2] = x>>32&0xffff; c[3] = x>>48&0xffff;
+		s = (c[1] + c[3] == 0)? -(c[0] + c[2]) : (c[1] + c[3] - 1); // an error-free site pulls the score down by its support
+		if (c[3] > c[2]) s += c[3] - c[2]; // extra penalty when errors outnumber supports within a phase
+		if (c[1] > c[0]) s += c[1] - c[0];
+		score += s;
+		if (score < 0) score = 0; // the running score is clamped at zero
+		if (pre == 0 && score > 0) beg = i; // change from zero to non-zero
+		if ((i == vpos - 1 || score == 0) && max >= MASK_THRES) { // a run ended (or the data did) and its peak is high enough
+			if (n == m) {
+				m = m? m<<1 : 4;
+				list = realloc(list, m * 8);
+			}
+			list[n++] = (uint64_t)beg<<32 | max_i; // the region stops at the peak, not at the current site
+			i = max_i; // reset i to max_i
+			score = 0;
+		} else if (score > max) max = score, max_i = i;
+		if (score == 0) max = 0;
+	}
+	*_n = n;
+	return list;
+}
+
+// strip leading and trailing 0 (ambiguous) codes from every fragment that starts before vpos; fragments left empty are removed from the table
+static int clean_seqs(int vpos, nseq_t *hash)
+{
+	khint_t it;
+	int beyond = 0; // becomes 1 once a fragment starting at or after vpos is seen; this is the return value
+	for (it = 0; it != kh_end(hash); ++it) {
+		frag_t *frag;
+		int first, last, len; // first/last: indices of the outermost non-zero codes
+		if (!kh_exist(hash, it)) continue;
+		frag = &kh_val(hash, it);
+		if (frag->vpos >= vpos) {
+			beyond = 1;
+			continue;
+		}
+		first = 0;
+		while (first < frag->vlen && frag->seq[first] == 0) ++first;
+		last = frag->vlen - 1;
+		while (last >= 0 && frag->seq[last] == 0) --last;
+		len = last + 1 - first; // not positive when every code is 0
+		if (len <= 0) {
+			kh_del(64, hash, it);
+			continue;
+		}
+		if (first != 0) memmove(frag->seq, frag->seq + first, len);
+		frag->vpos += first; frag->vlen = len;
+		frag->single = (frag->vlen == 1);
+	}
+	return beyond;
+}
+
+static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash)
+{ // write out, then free, the leading queued alignments that end at or before min_pos, choosing the output file from each read's phase
+	int i, is_flip, drop_ambi;
+	drop_ambi = g->flag & FLAG_DROP_AMBI;
+	is_flip = (drand48() < 0.5); // decided once per call: whether phase 0/1 swap output files
+	for (i = 0; i < g->n; ++i) {
+		int end, which; // which: 0/1 = phased outputs, 2 = chimera output, 3 = undecided (drawn at random below)
+		uint64_t key;
+		khint_t k;
+		bam1_t *b = g->b[i];
+		key = X31_hash_string(bam1_qname(b));
+		end = bam_calend(&b->core, bam1_cigar(b));
+		if (end > min_pos) break; // stop at the first alignment reaching past min_pos; it stays queued
+		k = kh_get(64, hash, key);
+		if (k == kh_end(hash)) which = 3; // read name not present in the fragment table
+		else {
+			frag_t *f = &kh_val(hash, k);
+			if (f->ambig) which = drop_ambi? 2 : 3;
+			else if (f->phased && f->flip) which = 2;
+			else if (f->phased == 0) which = 3;
+			else { // phased and not flipped
+				char c = 'Y';
+				which = f->phase;
+				bam_aux_append(b, "ZP", 'A', 1, (uint8_t*)&c);
+			}
+			if (which < 2 && is_flip) which = 1 - which; // increase the randomness
+		}
+		if (which == 3) which = (drand48() < 0.5);
+		bam_write1(g->out[which], b);
+		bam_destroy1(b);
+		g->b[i] = 0;
+	}
+	if (i > 0) memmove(g->b, g->b + i, (g->n - i) * sizeof(void*)); // compact the queue; skipped when nothing was written, as g->b may still be NULL then
+	g->n -= i;
+}
+
+static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash)
+{ // phase the first vpos sites of cns[] on chr, print the PS/FL/M?/EV records on stdout and flush finished alignments; returns the number of sites handled
+	int i, j, n_seqs = kh_size(hash), n_masked = 0, min_pos;
+	khint_t k;
+	frag_t **seqs;
+	int8_t *path, *sitemask; // path: haplotype call per site; sitemask: 1 for sites inside a filtered region
+	uint64_t *pcnt, *regmask; // pcnt: per-site counters from fragphase(); regmask: filtered regions as startPos<<32|endPos
+
+	if (vpos == 0) return 0;
+	i = clean_seqs(vpos, hash); // i is true if hash has an element with its vpos >= vpos
+	min_pos = i? cns[vpos]>>32 : 0x7fffffff; // alignments ending at or before this position are written by dump_aln()
+	if (vpos == 1) { // a lone site: nothing to link, report it as an M0 marker
+		printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1);
+		printf("M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1,
+			"ACGTX"[cns[0]&3], "ACGTX"[cns[0]>>16&3], g->vpos_shift + 1);
+		for (k = 0; k < kh_end(hash); ++k) {
+			if (kh_exist(hash, k)) {
+				frag_t *f = &kh_val(hash, k);
+				if (f->vpos) continue; // only fragments starting at this site
+				f->flip = 0;
+				if (f->seq[0] == 0) f->phased = 0;
+				else f->phased = 1, f->phase = f->seq[0] - 1;
+			}
+		}
+		dump_aln(g, min_pos, hash);
+		++g->vpos_shift;
+		return 1;
+	}
+	{ // phase
+		int **cnt;
+		uint64_t *mask; // regions from genmask(), as site indices
+		printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1);
+		sitemask = calloc(vpos, 1);
+		cnt = count_all(g->k, vpos, hash);
+		path = dynaprog(g->k, vpos, cnt);
+		for (i = 0; i < vpos; ++i) free(cnt[i]);
+		free(cnt);
+		pcnt = fragphase(vpos, path, hash, 0); // do not fix chimeras when masking
+		mask = genmask(vpos, pcnt, &n_masked);
+		regmask = calloc(n_masked, 8);
+		for (i = 0; i < n_masked; ++i) {
+			regmask[i] = cns[mask[i]>>32]>>32<<32 | cns[(uint32_t)mask[i]]>>32; // translate site indices into positions
+			for (j = mask[i]>>32; j <= (int32_t)mask[i]; ++j)
+				sitemask[j] = 1;
+		}
+		free(mask);
+		if (g->flag & FLAG_FIX_CHIMERA) { // redo the fragment pass, this time repairing chimeras
+			free(pcnt);
+			pcnt = fragphase(vpos, path, hash, 1);
+		}
+	}
+	for (i = 0; i < n_masked; ++i)
+		printf("FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1);
+	for (i = 0; i < vpos; ++i) {
+		uint64_t x = pcnt[i];
+		int8_t c[2]; // the two alleles of the site; 4 (printed as X) when the stored 14-bit LOD field is zero
+		c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3);
+		c[1] = (cns[i]>>16&0xffff)>>2 == 0? 4 : (cns[i]>>16&3);
+		printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]],
+			i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff));
+	}
+	free(path); free(pcnt); free(regmask); free(sitemask);
+	seqs = calloc(n_seqs, sizeof(void*));
+	for (k = 0, i = 0; k < kh_end(hash); ++k) // collect the multi-site fragments of this block
+		if (kh_exist(hash, k) && kh_val(hash, k).vpos < vpos && !kh_val(hash, k).single)
+			seqs[i++] = &kh_val(hash, k);
+	n_seqs = i;
+	ks_introsort_rseq(n_seqs, seqs); // order by first covered site (rseq_lt)
+	for (i = 0; i < n_seqs; ++i) {
+		frag_t *f = seqs[i];
+		printf("EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen);
+		for (j = 0; j < f->vlen; ++j) {
+			uint32_t c = cns[f->vpos + j];
+			if (f->seq[j] == 0) putchar('N');
+			else putchar("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)]);
+		}
+		printf("\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1);
+	}
+	free(seqs);
+	printf("//\n");
+	fflush(stdout);
+	g->vpos_shift += vpos;
+	dump_aln(g, min_pos, hash);
+	return vpos;
+}
+
+static void update_vpos(int vpos, nseq_t *hash)
+{ // remove fragments that start before vpos and rebase the remaining ones so that site vpos becomes site 0
+	khint_t it;
+	for (it = 0; it != kh_end(hash); ++it) {
+		frag_t *frag;
+		if (!kh_exist(hash, it)) continue;
+		frag = &kh_val(hash, it);
+		if (frag->vpos >= vpos) frag->vpos -= vpos;
+		else kh_del(64, hash, it); // TODO: if frag_t::seq is allocated dynamically, free it
+	}
+}
+
+static nseq_t *shrink_hash(nseq_t *hash) // TODO: to implement
+{ // placeholder: currently returns the table it was given, untouched
+	return hash;
+}
+
+static int readaln(void *data, bam1_t *b)
+{ // read callback handed to bam_plp_init(): fetch the next record into b and, when an output prefix is set, queue a copy; returns bam_read1()'s result
+	phaseg_t *g = (phaseg_t*)data;
+	const int skip_mask = BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP; // records carrying any of these flags are not queued
+	int ret = bam_read1(g->fp, b);
+	if (ret < 0) return ret;
+	if ((b->core.flag & skip_mask) != 0 || g->pre == 0) return ret;
+	if (g->n == g->m) { // queue full: start with 16 slots, then double
+		if (g->m == 0) g->m = 16;
+		else g->m <<= 1;
+		g->b = realloc(g->b, g->m * sizeof(void*));
+	}
+	g->b[g->n++] = bam_dup1(b);
+	return ret;
+}
+
+static khash_t(set64) *loadpos(const char *fn, bam_header_t *h)
+{ // load a site list from fn ("-" = stdin): per line a reference name then a 1-based position; returns a set of tid<<32|(pos-1) keys that the caller destroys
+	gzFile fp;
+	kstream_t *ks;
+	int ret, dret;
+	kstring_t *str;
+	khash_t(set64) *hash;
+
+	hash = kh_init(set64);
+	fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+	if (fp == 0) { // report the failure and return an empty set rather than handing a NULL stream to the reader
+		fprintf(stderr, "[loadpos] failed to open \"%s\" for reading; the site list is empty\n", fn);
+		return hash;
+	}
+	str = calloc(1, sizeof(kstring_t)); ks = ks_init(fp);
+	while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+		int tid = bam_get_tid(h, str->s);
+		if (tid >= 0 && dret != '\n') { // known reference and the line has a second field
+			if (ks_getuntil(ks, 0, str, &dret) < 0) break;
+			kh_put(set64, hash, (uint64_t)tid<<32 | (uint32_t)(atoi(str->s) - 1), &ret); // the cast keeps a position below 1 from sign-extending into the tid half
+		}
+		if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
+		if (dret < 0) break;
+	}
+	ks_destroy(ks); gzclose(fp);
+	free(str->s); free(str);
+	return hash;
+}
+
+static int gl2cns(float q[16])
+{ // find the smallest entry of the 4x4 table q over allele pairs a <= b; returns 0 if that pair is homozygous, else a packed het call
+	int a, b, best = -1;
+	float lo = 1e30, lo2 = 1e30; // lowest and second-lowest values seen
+	for (a = 0; a < 4; ++a) {
+		for (b = a; b < 4; ++b) {
+			const float v = q[a<<2|b];
+			if (v < lo) { lo2 = lo; lo = v; best = a<<2|b; }
+			else if (v < lo2) lo2 = v;
+		}
+	}
+	return (best>>2&3) == (best&3)? 0 : 1<<18 | (best>>2&3)<<16 | (best&3) | (int)(lo2 - lo + .499) << 2; // het: bit 18 set, alleles in bits 16-17 and 0-1, rounded gap to the runner-up from bit 2 up
+}
+
+int main_phase(int argc, char *argv[])
+{ // "samtools phase" entry point: call het sites from the pileup, phase them block by block, print the result on stdout; returns 0, or 1 after the usage text or an open failure
+	extern void bam_init_header_hash(bam_header_t *header);
+	int c, tid = -1, pos, vpos = 0, n, lasttid = -1, max_vpos = 0; // tid starts at -1 so the flush after the loop is skipped should bam_plp_auto() never assign it
+	const bam_pileup1_t *plp;
+	bam_plp_t iter;
+	bam_header_t *h;
+	nseq_t *seqs; // fragments keyed by the hash of the read name
+	uint64_t *cns = 0; // one entry per het site of the current block: pos<<32 | packed call from gl2cns()
+	phaseg_t g;
+	char *fn_list = 0;
+	khash_t(set64) *set = 0;
+	errmod_t *em;
+	uint16_t *bases;
+
+	memset(&g, 0, sizeof(phaseg_t));
+	g.flag = FLAG_FIX_CHIMERA;
+	g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256;
+	while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:A")) >= 0) { // -A is a plain flag: with a trailing ':' it would swallow the next argument (e.g. the input file)
+		switch (c) {
+			case 'D': g.max_depth = atoi(optarg); break;
+			case 'q': g.min_varLOD = atoi(optarg); break;
+			case 'Q': g.min_baseQ = atoi(optarg); break;
+			case 'k': g.k = atoi(optarg); break;
+			case 'F': g.flag &= ~FLAG_FIX_CHIMERA; break;
+			case 'e': g.flag |= FLAG_LIST_EXCL; break;
+			case 'A': g.flag |= FLAG_DROP_AMBI; break;
+			case 'b': g.pre = strdup(optarg); break;
+			case 'l': fn_list = strdup(optarg); break;
+		}
+	}
+	if (argc == optind) {
+		fprintf(stderr, "\n");
+		fprintf(stderr, "Usage:   samtools phase [options] <in.bam>\n\n");
+		fprintf(stderr, "Options: -k INT    block length [%d]\n", g.k);
+		fprintf(stderr, "         -b STR    prefix of BAMs to output [null]\n");
+		fprintf(stderr, "         -q INT    min het phred-LOD [%d]\n", g.min_varLOD);
+		fprintf(stderr, "         -Q INT    min base quality in het calling [%d]\n", g.min_baseQ);
+		fprintf(stderr, "         -D INT    max read depth [%d]\n", g.max_depth);
+//		fprintf(stderr, "         -l FILE   list of sites to phase [null]\n");
+		fprintf(stderr, "         -F        do not attempt to fix chimeras\n");
+		fprintf(stderr, "         -A        drop reads with ambiguous phase\n");
+//		fprintf(stderr, "         -e        do not discover SNPs (effective with -l)\n");
+		fprintf(stderr, "\n");
+		return 1;
+	}
+	g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+	if (g.fp == 0) { fprintf(stderr, "[main_phase] failed to open \"%s\" for reading\n", argv[optind]); free(g.pre); free(fn_list); return 1; }
+	h = bam_header_read(g.fp);
+	if (fn_list) { // read the list of sites to phase
+		bam_init_header_hash(h);
+		set = loadpos(fn_list, h);
+		free(fn_list);
+	} else g.flag &= ~FLAG_LIST_EXCL; // -e only makes sense together with -l
+	if (g.pre) { // open BAMs to write
+		char *s = malloc(strlen(g.pre) + 20);
+		strcpy(s, g.pre); strcat(s, ".0.bam"); g.out[0] = bam_open(s, "w");
+		strcpy(s, g.pre); strcat(s, ".1.bam"); g.out[1] = bam_open(s, "w");
+		strcpy(s, g.pre); strcat(s, ".chimera.bam"); g.out[2] = bam_open(s, "w");
+		for (c = 0; c <= 2; ++c) bam_header_write(g.out[c], h);
+		free(s);
+	}
+
+	iter = bam_plp_init(readaln, &g); g.vpos_shift = 0;
+	seqs = kh_init(64);
+	em = errmod_init(1. - 0.83);
+	bases = calloc(g.max_depth, 2);
+	printf("CC\n");
+	printf("CC\tDescriptions:\nCC\n");
+	printf("CC\t  CC      comments\n");
+	printf("CC\t  PS      start of a phase set\n");
+	printf("CC\t  FL      filtered region\n");
+	printf("CC\t  M[012]  markers; 0 for singletons, 1 for phased and 2 for filtered\n");
+	printf("CC\t  EV      supporting reads; SAM format\n");
+	printf("CC\t  //      end of a phase set\nCC\n");
+	printf("CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n");
+	printf("CC\t  PS  chr  phaseSetStart  phaseSetEnd\n");
+	printf("CC\t  FL  chr  filterStart    filterEnd\n");
+	printf("CC\t  M?  chr  PS  pos  allele0  allele1  hetIndex  #supports0  #errors0  #supp1  #err1\n");
+	printf("CC\nCC\n");
+	fflush(stdout);
+	while ((plp = bam_plp_auto(iter, &tid, &pos, &n)) != 0) {
+		int i, k, c, tmp, dophase = 1, in_set = 0; // dophase stays 1 only if no read at this site is already in seqs, i.e. the block can be closed
+		float q[16];
+		if (tid < 0) break;
+		if (tid != lasttid) { // change of chromosome
+			g.vpos_shift = 0;
+			if (lasttid >= 0) {
+				seqs = shrink_hash(seqs);
+				phase(&g, h->target_name[lasttid], vpos, cns, seqs);
+				update_vpos(0x7fffffff, seqs); // empties the table
+			}
+			lasttid = tid;
+			vpos = 0;
+		}
+		if (set && kh_get(set64, set, (uint64_t)tid<<32 | pos) != kh_end(set)) in_set = 1;
+		if (n > g.max_depth) continue; // do not proceed if the depth is too high
+		// fill the bases array and check if there is a variant
+		for (i = k = 0; i < n; ++i) {
+			const bam_pileup1_t *p = plp + i;
+			uint8_t *seq;
+			int q, baseQ, b; // note: this q (capped quality) shadows the likelihood array q[16] inside this loop
+			if (p->is_del || p->is_refskip) continue;
+			baseQ = bam1_qual(p->b)[p->qpos];
+			if (baseQ < g.min_baseQ) continue;
+			seq = bam1_seq(p->b);
+			b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)];
+			if (b > 3) continue;
+			q = baseQ < p->b->core.qual? baseQ : p->b->core.qual; // the smaller of base and mapping quality
+			if (q < 4) q = 4;
+			if (q > 63) q = 63;
+			bases[k++] = q<<5 | (int)bam1_strand(p->b)<<4 | b;
+		}
+		if (k == 0) continue;
+		errmod_cal(em, k, 4, bases, q); // compute genotype likelihood
+		c = gl2cns(q); // get the consensus
+		// tell if to proceed
+		if (set && (g.flag&FLAG_LIST_EXCL) && !in_set) continue; // not in the list
+		if (!in_set && (c&0xffff)>>2 < g.min_varLOD) continue; // not a variant
+		// add the variant
+		if (vpos == max_vpos) {
+			max_vpos = max_vpos? max_vpos<<1 : 128;
+			cns = realloc(cns, max_vpos * 8);
+		}
+		cns[vpos] = (uint64_t)pos<<32 | c;
+		for (i = 0; i < n; ++i) {
+			const bam_pileup1_t *p = plp + i;
+			uint64_t key;
+			khint_t k; // shadows the outer int k
+			uint8_t *seq = bam1_seq(p->b);
+			frag_t *f;
+			if (p->is_del || p->is_refskip) continue;
+			if (p->b->core.qual == 0) continue;
+			// get the base code
+			c = nt16_nt4_table[(int)bam1_seqi(seq, p->qpos)];
+			if (c == (cns[vpos]&3)) c = 1;
+			else if (c == (cns[vpos]>>16&3)) c = 2;
+			else c = 0;
+			// write to seqs
+			key = X31_hash_string(bam1_qname(p->b));
+			k = kh_put(64, seqs, key, &tmp);
+			f = &kh_val(seqs, k);
+			if (tmp == 0) { // present in the hash table
+				if (vpos - f->vpos + 1 < MAX_VARS) { // extend only while the fixed-size seq[] has room
+					f->vlen = vpos - f->vpos + 1;
+					f->seq[f->vlen-1] = c;
+					f->end = bam_calend(&p->b->core, bam1_cigar(p->b));
+				}
+				dophase = 0;
+			} else { // absent
+				memset(f->seq, 0, MAX_VARS);
+				f->beg = p->b->core.pos;
+				f->end = bam_calend(&p->b->core, bam1_cigar(p->b));
+				f->vpos = vpos, f->vlen = 1, f->seq[0] = c, f->single = f->phased = f->flip = f->ambig = 0;
+			}
+		}
+		if (dophase) { // no fragment links this site to the earlier ones: phase the block and start a new one here
+			seqs = shrink_hash(seqs);
+			phase(&g, h->target_name[tid], vpos, cns, seqs);
+			update_vpos(vpos, seqs);
+			cns[0] = cns[vpos];
+			vpos = 0;
+		}
+		++vpos;
+	}
+	if (tid >= 0) phase(&g, h->target_name[tid], vpos, cns, seqs); // flush the last block
+	bam_header_destroy(h);
+	bam_plp_destroy(iter);
+	bam_close(g.fp);
+	kh_destroy(64, seqs);
+	kh_destroy(set64, set);
+	free(cns);
+	errmod_destroy(em);
+	free(bases);
+	if (g.pre) {
+		for (c = 0; c <= 2; ++c) bam_close(g.out[c]);
+		free(g.pre); free(g.b);
+	}
+	return 0;
+}
diff --git a/randomFQ b/randomFQ
new file mode 100755
index 0000000..272cd73
--- /dev/null
+++ b/randomFQ
@@ -0,0 +1,245 @@
+#!/usr/bin/perl
+use strict;
+use Getopt::Long;
+
+# Author: Erik Aronesty (earonesty at xpressionanalysis.com) 
+# Outputs a random sampled fastq or fasta file, from an input fastq
+# Copyright (c) 2011 Expression Analysis
+
+
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+my $cnt;
+my $fasta;
+my $out;
+my $seed=1;
+my $window;
+my $always;
+my $append;
+my $pct;
+GetOptions("count=i"=>\$cnt, "pct=f"=>\$pct, "fasta"=>\$fasta, "out=s"=>\$out, "seed=i"=>\$seed, "window=i"=>\$window, "always|a"=>\$always, "append|A"=>\$append) || die usage();
+if ($window < $cnt) {
+	$window = $cnt * 5;
+	$window = 100000 if $window < 100000;
+}
+
+if ($seed) {
+	srand($seed);
+}
+
+die usage() unless $cnt>0||$pct>0;
+
+my $in = shift;
+my $mate = shift;
+my $mate2 = shift;
+
+die "Can't see $in\n" unless -e $in;
+die "Can't see $mate\n" unless !$mate || -e $mate;
+die "Can't see $mate2\n" unless !$mate2 || -e $mate2;
+
+my $s = -s $in;
+my $sm = -s $mate;
+my $sm2 = -s $mate2;
+
+die "Need -out <prefix> for paired-end" if $mate && !$out;
+
+my $suff;
+if ($out =~ s/\%(.*)//) {	# "prefix_%.suffix": everything after the % becomes the per-file suffix
+	$suff = $1;
+}
+$out =~ s/_$//;
+
+my $gzmeth = $suff =~ /\.gz/ ? "|gzip -c " : "";
+
+open(IN, $in=~/\.gz$/?"gunzip -c $in|":$in) || die;
+
+$append = $append ? ">>" : ">";
+
+if ($mate) {
+	open(MI, $mate=~/\.gz$/?"gunzip -c $mate|":$mate) || die;
+	open(MI2, $mate2=~/\.gz$/?"gunzip -c $mate2|":$mate2) if $mate2;
+	open(O1, "$gzmeth$append${out}_1$suff") || die;
+	open(O2, "$gzmeth$append${out}_2$suff") || die;
+	if ($mate2) {
+		open(O3, "$gzmeth$append${out}_3$suff") || die;
+	}
+} else {
+	my $gzmeth = $out =~ /\.gz/ ? "|gzip -c " : "";
+	if ($out) {
+		open(O1, "$gzmeth$append$out");
+	} else {
+		open(O1, ">&STDOUT");
+	}
+}
+
+my $lc = 0+`alc -o $in`;	# estimated number of reads, from the external alc tool
+my $stats = $in; $stats =~ s/\.gz$//;  $stats .= ".stats";
+if (-e $stats) {
+	my $rlc = `grep ^reads $stats | cut -f 2`+0;
+	$lc = $rlc if $rlc;
+} else {
+    if ($in =~ /gz$/) {
+        $lc *= .90;         # lower guess
+    }
+}
+
+if (!$always && ($cnt > $lc/2)) {
+	$cnt *= 4;	# four lines per fastq record
+	warn "Source is too small relative to count requested, just returning tail -$cnt\n";
+	if ($mate) {
+		if ($in=~/gz$/) {
+			system("gunzip -c $in | head -$cnt $gzmeth $append ${out}_1$suff");
+		} else {
+			system("tail -$cnt $in $gzmeth $append ${out}_1$suff");
+		}
+		if ($mate=~/gz$/) {
+			system("gunzip -c $mate | head -$cnt $gzmeth $append ${out}_2$suff");
+		} else {
+			system("tail -$cnt $mate $gzmeth $append ${out}_2$suff");
+		}
+		if ($mate2=~/gz$/) {
+			system("gunzip -c $mate2 | head -$cnt $gzmeth $append ${out}_3$suff");
+		} else {
+			system("tail -$cnt $mate2 $gzmeth $append ${out}_3$suff") if ($mate2);
+		}
+		exit 0;
+	} else {
+		if ($out) {
+			$gzmeth = $out =~ /\.gz/ ? "|gzip -c " : "";
+			$out = "$gzmeth $append $out";
+		}
+		if ($in=~/gz$/) {
+			exec("gunzip -c $in | head -$cnt $out");
+			die("Exec failed : $!\n");
+		} else {
+			exec("tail -$cnt $in $out");
+			die("Exec failed : $!\n");
+		}
+	}
+}
+
+# reads to sample from....whole file or a smaller part?
+$window = $lc if $window > $lc;
+
+my $fudge = 1.5;                # top weighted
+$fudge = 1.2 if $always;        # less top-weighted
+
+# larger = more chance to keep read
+my $prob;
+
+if ($pct) {
+    $prob = $pct/100;
+    $cnt = 10000000000;
+} else {
+    $prob = ($cnt*$fudge)/$window;
+}
+
+# we used to seek ... but this broke too much ... if you want to fix... fix randomFQ-broken ...
+if (!$always && !$pct) {
+    # skip some reads at the beginning
+    my $skip = ($lc-$window) / 10;
+    $skip = 200000 if $skip > 200000;
+    for (my $i=0;$i<$skip;++$i)	{
+        scalar <IN>; scalar <IN>; scalar <IN>; scalar <IN>;
+        scalar <MI>; scalar <MI>; scalar <MI>; scalar <MI>;
+        scalar <MI2>; scalar <MI2>; scalar <MI2>; scalar <MI2>;
+    }
+}
+
+while ($cnt > 0) {
+	if (rand() > $prob) {
+		# discard
+		<IN>; <IN>; <IN>; <IN>;
+		<MI>; <MI>; <MI>; <MI>;
+		<MI2>; <MI2>; <MI2>; <MI2>;
+		next;
+	}
+
+	# id
+	my $i = <IN>;
+    if (!$i) {
+        $cnt = 0;
+        last;
+    }
+	my $i2 = $mate ? scalar(<MI>) : undef;		# plain conditional: "my ... if COND" has undefined behaviour
+	my $i3 = $mate2 ? scalar(<MI2>) : undef;
+
+	# read
+	my $r = <IN>;
+	my $r2 = $mate ? scalar(<MI>) : undef;
+	my $r3 = $mate2 ? scalar(<MI2>) : undef;
+
+	if ($fasta) {
+		# only need id and read
+		$i=~ s/^\@/>/;
+		$i2=~ s/^\@/>/;
+		$i3=~ s/^\@/>/;
+		print O1 "$i$r";
+		print O2 "$i2$r2" if $mate;
+		print O3 "$i3$r3" if $mate2;
+		<IN>;<IN>;
+		<MI>;<MI>; <MI2>;<MI2>;	# skip comment+quality in every input, the third one included, so the files stay in step
+	} else {
+		# print id and read
+		print O1 "$i$r";
+		print O2 "$i2$r2" if $mate;
+		print O3 "$i3$r3" if $mate2;
+		# copy comment and quality
+		print O1 scalar <IN>;
+		print O2 scalar <MI> if $mate;
+		print O3 scalar <MI2> if $mate2;
+		print O1 scalar <IN>;
+		print O2 scalar <MI> if $mate;
+		print O3 scalar <MI2> if $mate2;
+	}
+	--$cnt;
+}
+
+close IN;
+
+sub usage() { # returns (does not print) the help text; callers pass it to die
+	return <<EOF
+usage: $0 (-c <count> | -p <pct>) [-fasta] [-out <prefix>] [-seed <int>] <input-fastq> [<input-2> [<index-3>] ]
+
+Returns <count> number of random entries from the input fastq.
+
+Output is fastq, unless you specify -fasta
+
+If the -out parameter ends in .gz, the result is gzipped in-place.
+
+-p returns a % of total reads, -c returns a fixed count.
+
+SINGLE END:
+
+Outputs to standard output, unless -out <file> is specified.   
+
+PAIRED END:
+
+Pass 2 (or 3) files as input, -out is required.
+
+If the paired-end output contains a "%" sign, it is replaced with the 1 & 2 for paired-end.
+
+IE: -o output_%.fastq.gz
+
+Otherwise it's jsut output_1 and output_2
+
+*** If one file is an indexed read, it has to be the 3rd file (for now).
+EOF
+}
+
+sub max {
+	my ($x, $y) = @_; return $x > $y ? $x : $y; # larger of two numbers; the second argument is returned on a tie
+}
+
diff --git a/razf.c b/razf.c
new file mode 100644
index 0000000..e7499f9
--- /dev/null
+++ b/razf.c
@@ -0,0 +1,853 @@
+/*
+ * RAZF : Random Access compressed(Z) File
+ * Version: 1.0
+ * Release Date: 2008-10-27
+ *
+ * Copyright 2008, Jue Ruan <ruanjue at gmail.com>, Heng Li <lh3 at sanger.ac.uk>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NO_RAZF
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "razf.h"
+
+
+#if ZLIB_VERNUM < 0x1221 /* only when building against zlib older than 1.2.2.1 */
+struct _gz_header_s { /* local fallback declaration of the gzip header record */
+    int     text;
+    uLong   time;
+    int     xflags;
+    int     os;
+    Bytef   *extra; /* razf_open_w stores the "RAZF" tag and block size here */
+    uInt    extra_len;
+    uInt    extra_max;
+    Bytef   *name;
+    uInt    name_max;
+    Bytef   *comment;
+    uInt    comm_max;
+    int     hcrc;
+    int     done;
+};
+#warning "zlib < 1.2.2.1; RAZF writing is disabled."
+#endif
+
+#define DEF_MEM_LEVEL 8
+
+/* Reverse the byte order of a 32-bit value (b3 b2 b1 b0 -> b0 b1 b2 b3). */
+static inline uint32_t byte_swap_4(uint32_t v){
+	return  (v >> 24)
+	     | ((v >>  8) & 0x0000FF00U)
+	     | ((v <<  8) & 0x00FF0000U)
+	     |  (v << 24);
+}
+
+/* Reverse the byte order of a 64-bit value, one byte per iteration. */
+static inline uint64_t byte_swap_8(uint64_t v){
+	uint64_t reversed = 0;
+	int k;
+	for(k = 0; k < 8; k++){
+		reversed = (reversed << 8) | (v & 0xFFU);
+		v >>= 8;
+	}
+	return reversed;
+}
+
+/* Return non-zero when the first byte in memory of the int value 1 is not 1,
+ * i.e. when the host stores the most significant byte first. */
+static inline int is_big_endian(){
+	const int probe = 0x01;
+	const char *first_byte = (const char*)&probe;
+	return *first_byte != 0x01;
+}
+
+#ifndef _RZ_READONLY
+/* Append one block entry to the in-memory index.
+ * `out` is the compressed offset of the block start; it is stored as a
+ * 64-bit base per bin plus a per-cell delta relative to that base.
+ * `in` is accepted for symmetry with the caller but is not used. */
+static void add_zindex(RAZF *rz, int64_t in, int64_t out){
+	ZBlockIndex *zi = rz->index;
+	int64_t bin;
+	if(zi->size == zi->cap){
+		/* grow both arrays by ~1.5x (+2 so an empty index can grow) */
+		zi->cap = zi->cap * 1.5 + 2;
+		zi->cell_offsets = realloc(zi->cell_offsets, sizeof(int) * zi->cap);
+		zi->bin_offsets  = realloc(zi->bin_offsets, sizeof(int64_t) * (zi->cap/RZ_BIN_SIZE + 1));
+	}
+	bin = zi->size / RZ_BIN_SIZE;
+	/* the first cell of a bin defines the bin's base offset */
+	if(zi->size % RZ_BIN_SIZE == 0) zi->bin_offsets[bin] = out;
+	zi->cell_offsets[zi->size] = out - zi->bin_offsets[bin];
+	zi->size ++;
+}
+
+/* Write the block index to `fd` in big-endian byte order: the entry count,
+ * then the 64-bit bin offsets, then the 32-bit cell offsets.
+ * On little-endian hosts the in-memory arrays are byte-swapped in place
+ * before being written, so the index is not usable afterwards. */
+static void save_zindex(RAZF *rz, int fd){
+	ZBlockIndex *zi = rz->index;
+	const int host_is_be = is_big_endian();
+	int32_t k, n_bins;
+	if(host_is_be){
+		write(fd, &zi->size, sizeof(int));
+	} else {
+		n_bins = byte_swap_4((uint32_t)zi->size);
+		write(fd, &n_bins, sizeof(uint32_t));
+	}
+	n_bins = zi->size / RZ_BIN_SIZE + 1;
+	if(!host_is_be){
+		for(k = 0; k < n_bins; k++)
+			zi->bin_offsets[k] = byte_swap_8((uint64_t)zi->bin_offsets[k]);
+		for(k = 0; k < zi->size; k++)
+			zi->cell_offsets[k] = byte_swap_4((uint32_t)zi->cell_offsets[k]);
+	}
+	write(fd, zi->bin_offsets, sizeof(int64_t) * n_bins);
+	write(fd, zi->cell_offsets, sizeof(int32_t) * zi->size);
+}
+#endif
+
+/* Read the block index from the current file position into rz->index.
+ * Does nothing unless rz->load_index is set.  The on-disk layout is the one
+ * produced by save_zindex (big-endian count, bin offsets, cell offsets);
+ * values are byte-swapped after loading on little-endian hosts. */
+#ifdef _USE_KNETFILE
+static void load_zindex(RAZF *rz, knetFile *fp){
+#define RZ_INDEX_READ(buf, len) knet_read(fp, (buf), (len))
+#else
+static void load_zindex(RAZF *rz, int fd){
+#define RZ_INDEX_READ(buf, len) read(fd, (buf), (len))
+#endif
+	ZBlockIndex *zi;
+	int32_t k, n_bins;
+	int host_is_be;
+	if(!rz->load_index) return;
+	if(rz->index == NULL) rz->index = malloc(sizeof(ZBlockIndex));
+	zi = rz->index;
+	host_is_be = is_big_endian();
+
+	/* entry count */
+	RZ_INDEX_READ(&zi->size, sizeof(int));
+	if(!host_is_be) zi->size = byte_swap_4((uint32_t)zi->size);
+	zi->cap = zi->size;
+
+	/* 64-bit base offset per bin */
+	n_bins = zi->size / RZ_BIN_SIZE + 1;
+	zi->bin_offsets  = malloc(sizeof(int64_t) * n_bins);
+	RZ_INDEX_READ(zi->bin_offsets, sizeof(int64_t) * n_bins);
+
+	/* 32-bit delta per cell */
+	zi->cell_offsets = malloc(sizeof(int) * zi->size);
+	RZ_INDEX_READ(zi->cell_offsets, sizeof(int) * zi->size);
+
+	if(!host_is_be){
+		for(k = 0; k < n_bins; k++)
+			zi->bin_offsets[k] = byte_swap_8((uint64_t)zi->bin_offsets[k]);
+		for(k = 0; k < zi->size; k++)
+			zi->cell_offsets[k] = byte_swap_4((uint32_t)zi->cell_offsets[k]);
+	}
+#undef RZ_INDEX_READ
+}
+
+#ifdef _RZ_READONLY
+/* Read-only build: opening for writing always fails with a diagnostic. */
+static RAZF* razf_open_w(int fd)
+{
+	fputs("[razf_open_w] Writing is not available with zlib ver < 1.2.2.1\n", stderr);
+	return NULL;
+}
+#else
+/* Create a writer on an already-open descriptor.
+ * Allocates the RAZF handle, its deflate stream, both I/O buffers and an
+ * empty block index, then registers a gzip header whose 7-byte extra field
+ * is "RAZF", one obsolete byte, and RZ_BLOCK_SIZE as a big-endian 16-bit
+ * value. */
+static RAZF* razf_open_w(int fd){
+	static const char magic[4] = {'R', 'A', 'Z', 'F'};
+	RAZF *rz;
+	z_stream *zs;
+#ifdef _WIN32
+	setmode(fd, O_BINARY);
+#endif
+	rz = calloc(1, sizeof(RAZF));
+	rz->mode = 'w';
+#ifdef _USE_KNETFILE
+	rz->x.fpw = fd;
+#else
+	rz->filedes = fd;
+#endif
+	/* deflate state, buffers and index */
+	zs = calloc(sizeof(z_stream), 1);
+	rz->stream = zs;
+	rz->inbuf  = malloc(RZ_BUFFER_SIZE);
+	rz->outbuf = malloc(RZ_BUFFER_SIZE);
+	rz->index = calloc(sizeof(ZBlockIndex), 1);
+	/* WINDOW_BITS + 16 asks zlib for a gzip wrapper */
+	deflateInit2(zs, RZ_COMPRESS_LEVEL, Z_DEFLATED, WINDOW_BITS + 16, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+	zs->avail_out = RZ_BUFFER_SIZE;
+	zs->next_out  = rz->outbuf;
+
+	/* gzip header with the RAZF extra field */
+	rz->header = calloc(sizeof(gz_header), 1);
+	rz->header->os    = 0x03; //Unix
+	rz->header->text  = 0;
+	rz->header->time  = 0;
+	rz->header->extra = malloc(7);
+	memcpy(rz->header->extra, magic, 4);
+	rz->header->extra[4] = 1; // obsolete field
+	// block size = RZ_BLOCK_SIZE, Big-Endian
+	rz->header->extra[5] = RZ_BLOCK_SIZE >> 8;
+	rz->header->extra[6] = RZ_BLOCK_SIZE & 0xFF;
+	rz->header->extra_len = 7;
+	rz->header->comment = 0;
+	rz->header->name = 0;
+	rz->header->hcrc = 0;
+	deflateSetHeader(zs, rz->header);
+
+	rz->block_off = 0;
+	rz->block_pos = 0;
+	return rz;
+}
+
+/* Feed `size` bytes to deflate (Z_NO_FLUSH).  Each time the output buffer
+ * fills completely it is written to the file and reused.  rz->out is
+ * advanced by the compressed bytes produced, rz->in and rz->block_off by
+ * the input bytes consumed. */
+static void _razf_write(RAZF* rz, const void *data, int size){
+#ifdef _USE_KNETFILE
+	const int out_fd = rz->x.fpw;
+#else
+	const int out_fd = rz->filedes;
+#endif
+	z_stream *zs = rz->stream;
+	int before;
+	zs->avail_in = size;
+	zs->next_in  = (void*)data;
+	do {
+		before = zs->avail_out;
+		deflate(zs, Z_NO_FLUSH);
+		rz->out += before - zs->avail_out;
+		/* space left => deflate has taken all the input it can for now */
+		if(zs->avail_out) break;
+		write(out_fd, rz->outbuf, RZ_BUFFER_SIZE - zs->avail_out);
+		zs->avail_out = RZ_BUFFER_SIZE;
+		zs->next_out  = rz->outbuf;
+	} while(zs->avail_in != 0);
+	rz->in += size - zs->avail_in;
+	rz->block_off += size - zs->avail_in;
+}
+
+/* Close the current block: compress any buffered input, write out what is
+ * pending in the output buffer, then run deflate with Z_FULL_FLUSH until it
+ * no longer fills the output buffer.  Afterwards block_pos is the current
+ * compressed offset and block_off is 0. */
+static void razf_flush(RAZF *rz){
+#ifdef _USE_KNETFILE
+	const int out_fd = rz->x.fpw;
+#else
+	const int out_fd = rz->filedes;
+#endif
+	z_stream *zs = rz->stream;
+	uint32_t before;
+	int filled;
+	if(rz->buf_len){
+		_razf_write(rz, rz->inbuf, rz->buf_len);
+		rz->buf_off = rz->buf_len = 0;
+	}
+	if(zs->avail_out){
+		write(out_fd, rz->outbuf, RZ_BUFFER_SIZE - zs->avail_out);
+		zs->avail_out = RZ_BUFFER_SIZE;
+		zs->next_out  = rz->outbuf;
+	}
+	do {
+		before = zs->avail_out;
+		deflate(zs, Z_FULL_FLUSH);
+		rz->out += before - zs->avail_out;
+		filled = (zs->avail_out == 0);
+		if(filled){
+			write(out_fd, rz->outbuf, RZ_BUFFER_SIZE - zs->avail_out);
+			zs->avail_out = RZ_BUFFER_SIZE;
+			zs->next_out  = rz->outbuf;
+		}
+	} while(filled);
+	rz->block_pos = rz->out;
+	rz->block_off = 0;
+}
+
+/* Finish the deflate stream: compress any buffered input, then call
+ * deflate(Z_FINISH) repeatedly, writing the output buffer after every call
+ * that produced data, until a call produces nothing. */
+static void razf_end_flush(RAZF *rz){
+#ifdef _USE_KNETFILE
+	const int out_fd = rz->x.fpw;
+#else
+	const int out_fd = rz->filedes;
+#endif
+	z_stream *zs = rz->stream;
+	uint32_t before;
+	int produced;
+	if(rz->buf_len){
+		_razf_write(rz, rz->inbuf, rz->buf_len);
+		rz->buf_off = rz->buf_len = 0;
+	}
+	do {
+		before = zs->avail_out;
+		deflate(zs, Z_FINISH);
+		rz->out += before - zs->avail_out;
+		produced = (zs->avail_out < RZ_BUFFER_SIZE);
+		if(produced){
+			write(out_fd, rz->outbuf, RZ_BUFFER_SIZE - zs->avail_out);
+			zs->avail_out = RZ_BUFFER_SIZE;
+			zs->next_out  = rz->outbuf;
+		}
+	} while(produced);
+}
+
+/* Append `size` bytes to the input staging buffer (rz->inbuf), handing a
+ * full buffer to _razf_write whenever RZ_BUFFER_SIZE bytes have piled up.
+ * The source pointer is walked as `const char*`: arithmetic on `void*` is a
+ * compiler extension, not standard C.  A non-positive size copies nothing. */
+static void _razf_buffered_write(RAZF *rz, const void *data, int size){
+	const char *src = (const char*)data;
+	int n;
+	while(1){
+		if(rz->buf_len == RZ_BUFFER_SIZE){
+			_razf_write(rz, rz->inbuf, rz->buf_len);
+			rz->buf_len = 0;
+		}
+		if(size + rz->buf_len < RZ_BUFFER_SIZE){
+			/* the remainder fits: stage it and stop */
+			if(size > 0) memcpy((char*)rz->inbuf + rz->buf_len, src, size);
+			rz->buf_len += size;
+			return;
+		}
+		/* fill the buffer to the brim; it is flushed on the next pass */
+		n = RZ_BUFFER_SIZE - rz->buf_len;
+		if(n > 0) memcpy((char*)rz->inbuf + rz->buf_len, src, n);
+		size -= n;
+		src += n;
+		rz->buf_len += n;
+	}
+}
+
+/* Write `size` bytes of uncompressed data.
+ * The input is cut at every RZ_BLOCK_SIZE boundary of the uncompressed
+ * stream; at each boundary the block is flushed and an index entry added.
+ * Always returns the original `size`.
+ * The source pointer is advanced as `const char*` because arithmetic on
+ * `void*` is not standard C. */
+int razf_write(RAZF* rz, const void *data, int size){
+	const char *src = (const char*)data;
+	int ori_size, n;
+	int64_t next_block;
+	ori_size = size;
+	next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE;
+	while(rz->in + rz->buf_len + size >= next_block){
+		/* bytes still missing to complete the current block */
+		n = next_block - rz->in - rz->buf_len;
+		_razf_buffered_write(rz, src, n);
+		src += n;
+		size -= n;
+		razf_flush(rz);
+		add_zindex(rz, rz->in, rz->out);
+		next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE;
+	}
+	_razf_buffered_write(rz, src, size);
+	return ori_size;
+}
+#endif
+
+/* gzip flag byte */
+#define ASCII_FLAG   0x01 /* bit 0 set: file probably ascii text */
+#define HEAD_CRC     0x02 /* bit 1 set: header CRC present */
+#define EXTRA_FIELD  0x04 /* bit 2 set: extra field present */
+#define ORIG_NAME    0x08 /* bit 3 set: original file name present */
+#define COMMENT      0x10 /* bit 4 set: file comment present */
+#define RESERVED     0xE0 /* bits 5..7: reserved */
+
+/* Parse a gzip member header held in data[0..size).
+ * Returns the header length in bytes (offset of the first deflate byte), or
+ * 0 when the buffer does not start with a usable gzip header.
+ * On success *extra_off / *extra_len describe the optional "extra" field
+ * (*extra_len is 0 when the field is absent).
+ * The fixed part of the header is 10 bytes (magic, method, flags and 6
+ * further bytes); shorter buffers are rejected so the returned offset never
+ * lies beyond that fixed part when nothing else follows. */
+static int _read_gz_header(unsigned char *data, int size, int *extra_off, int *extra_len){
+	int method, flags, n, len;
+	if(size < 10) return 0;
+	if(data[0] != 0x1f || data[1] != 0x8b) return 0;
+	method = data[2];
+	flags  = data[3];
+	if(method != Z_DEFLATED || (flags & RESERVED)) return 0;
+	n = 4 + 6; // Skip 6 bytes
+	*extra_off = n + 2;
+	*extra_len = 0;
+	if(flags & EXTRA_FIELD){
+		if(size < n + 2) return 0;
+		/* little-endian 16-bit length followed by the payload */
+		len = ((int)data[n + 1] << 8) | data[n];
+		n += 2;
+		*extra_off = n;
+		if(len > size - n) return 0;
+		n += len;
+		*extra_len = len;
+	}
+	/* NUL-terminated name and comment; scanning stops at the buffer end */
+	if(flags & ORIG_NAME) while(n < size && data[n++]);
+	if(flags & COMMENT) while(n < size && data[n++]);
+	if(flags & HEAD_CRC){
+		if(n + 2 > size) return 0;
+		n += 2;
+	}
+	return n;
+}
+
+/* Open a reader on an already-open file.
+ * The first buffer is read and inspected:
+ *   - no gzip header (or inflate cannot be initialised): FILE_TYPE_PLAIN,
+ *     the bytes already read are served from rz->outbuf;
+ *   - gzip header without a matching RAZF extra field: FILE_TYPE_GZ;
+ *   - RAZF extra field with the expected block size: FILE_TYPE_RZ, and if
+ *     the file is seekable the two trailing 64-bit sizes and (when
+ *     _load_index is set) the block index are loaded.
+ * A failed initial read is treated as an empty plain file instead of
+ * passing a negative length on to memcpy. */
+#ifdef _USE_KNETFILE
+static RAZF* razf_open_r(knetFile *fp, int _load_index){
+#else
+static RAZF* razf_open_r(int fd, int _load_index){
+#endif
+	RAZF *rz;
+	int ext_off, ext_len;
+	int n, is_be, ret;
+	int64_t end;
+	unsigned char c[] = "RAZF";
+	rz = calloc(1, sizeof(RAZF));
+	rz->mode = 'r';
+#ifdef _USE_KNETFILE
+    rz->x.fpr = fp;
+#else
+#ifdef _WIN32
+	setmode(fd, O_BINARY);
+#endif
+	rz->filedes = fd;
+#endif
+	rz->stream = calloc(sizeof(z_stream), 1);
+	rz->inbuf  = malloc(RZ_BUFFER_SIZE);
+	rz->outbuf = malloc(RZ_BUFFER_SIZE);
+	rz->end = rz->src_end = 0x7FFFFFFFFFFFFFFFLL;
+#ifdef _USE_KNETFILE
+    n = knet_read(rz->x.fpr, rz->inbuf, RZ_BUFFER_SIZE);
+#else
+	n = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE);
+#endif
+	if(n < 0){
+		fprintf(stderr, "[razf_open_r] fail to read the file header\n");
+		n = 0;
+	}
+	ret = _read_gz_header(rz->inbuf, n, &ext_off, &ext_len);
+	if(ret == 0){
+		PLAIN_FILE:
+		rz->in = n;
+		rz->file_type = FILE_TYPE_PLAIN;
+		memcpy(rz->outbuf, rz->inbuf, n);
+		rz->buf_len = n;
+		free(rz->stream);
+		rz->stream = NULL;
+		return rz;
+	}
+	rz->header_size = ret;
+	/* negative window bits: raw deflate, the gzip header was parsed above */
+	ret = inflateInit2(rz->stream, -WINDOW_BITS);
+	if(ret != Z_OK){ inflateEnd(rz->stream); goto PLAIN_FILE;}
+	rz->stream->avail_in = n - rz->header_size;
+	rz->stream->next_in  = (Bytef*)rz->inbuf + rz->header_size;
+	rz->stream->avail_out = RZ_BUFFER_SIZE;
+	rz->stream->next_out  = rz->outbuf;
+	rz->file_type = FILE_TYPE_GZ;
+	rz->in = rz->header_size;
+	rz->block_pos = rz->header_size;
+	rz->next_block_pos = rz->header_size;
+	rz->block_off = 0;
+	if(ext_len < 7 || memcmp((unsigned char*)rz->inbuf + ext_off, c, 4) != 0) return rz;
+	if(((((unsigned char*)rz->inbuf)[ext_off + 5] << 8) | ((unsigned char*)rz->inbuf)[ext_off + 6]) != RZ_BLOCK_SIZE){
+		fprintf(stderr, " -- WARNING: RZ_BLOCK_SIZE is not %d, treat source as gz file.  in %s -- %s:%d --\n", RZ_BLOCK_SIZE, __FUNCTION__, __FILE__, __LINE__);
+		return rz;
+	}
+	rz->load_index = _load_index;
+	rz->file_type = FILE_TYPE_RZ;
+	/* the last 16 bytes hold the uncompressed and compressed sizes */
+#ifdef _USE_KNETFILE
+	if(knet_seek(fp, -16, SEEK_END) == -1){
+#else
+	if(lseek(fd, -16, SEEK_END) == -1){
+#endif
+		UNSEEKABLE:
+		rz->seekable = 0;
+		rz->index = NULL;
+		rz->src_end = rz->end = 0x7FFFFFFFFFFFFFFFLL;
+	} else {
+		is_be = is_big_endian();
+		rz->seekable = 1;
+#ifdef _USE_KNETFILE
+        knet_read(fp, &end, sizeof(int64_t));
+#else
+		read(fd, &end, sizeof(int64_t));
+#endif        
+		if(!is_be) rz->src_end = (int64_t)byte_swap_8((uint64_t)end);
+		else rz->src_end = end;
+
+#ifdef _USE_KNETFILE
+		knet_read(fp, &end, sizeof(int64_t));
+#else
+		read(fd, &end, sizeof(int64_t));
+#endif        
+		if(!is_be) rz->end = (int64_t)byte_swap_8((uint64_t)end);
+		else rz->end = end;
+		/* do not let inflate consume bytes past the compressed data */
+		if(n > rz->end){
+			rz->stream->avail_in -= n - rz->end;
+			n = rz->end;
+		}
+		if(rz->end > rz->src_end){
+#ifdef _USE_KNETFILE
+            knet_seek(fp, rz->in, SEEK_SET);
+#else
+			lseek(fd, rz->in, SEEK_SET);
+#endif
+			goto UNSEEKABLE;
+		}
+#ifdef _USE_KNETFILE
+        knet_seek(fp, rz->end, SEEK_SET);
+		if(knet_tell(fp) != rz->end){
+			knet_seek(fp, rz->in, SEEK_SET);
+#else
+		if(lseek(fd, rz->end, SEEK_SET) != rz->end){
+			lseek(fd, rz->in, SEEK_SET);
+#endif
+			goto UNSEEKABLE;
+		}
+		/* the index follows the compressed data; afterwards return to
+		 * the position just past the first buffer read */
+#ifdef _USE_KNETFILE
+		load_zindex(rz, fp);
+		knet_seek(fp, n, SEEK_SET);
+#else
+		load_zindex(rz, fd);
+		lseek(fd, n, SEEK_SET);
+#endif
+	}
+	return rz;
+}
+
+/* razf_dopen / razf_dopen2: wrap an already-open descriptor.
+ * A mode containing 'r' selects reading (with and without index loading
+ * respectively), otherwise a mode containing 'w' selects writing; anything
+ * else yields NULL.  With knetfile support the read side is not
+ * implemented and razf_dopen2 is a stub. */
+#ifdef _USE_KNETFILE
+RAZF* razf_dopen(int fd, const char *mode){
+	if(strchr(mode, 'r') != NULL){
+		fprintf(stderr,"[razf_dopen] implement me\n");
+		return NULL;
+	}
+	return (strchr(mode, 'w') != NULL) ? razf_open_w(fd) : NULL;
+}
+
+RAZF* razf_dopen2(int fd, const char *mode)
+{
+	fprintf(stderr,"[razf_dopen2] implement me\n");
+	return NULL;
+}
+#else
+RAZF* razf_dopen(int fd, const char *mode){
+	if(strchr(mode, 'r') != NULL) return razf_open_r(fd, 1);
+	if(strchr(mode, 'w') != NULL) return razf_open_w(fd);
+	return NULL;
+}
+
+RAZF* razf_dopen2(int fd, const char *mode)
+{
+	if(strchr(mode, 'r') != NULL) return razf_open_r(fd, 0);
+	if(strchr(mode, 'w') != NULL) return razf_open_w(fd);
+	return NULL;
+}
+#endif
+
+/* Open `filename` for reading (mode contains "r") or writing (mode contains
+ * "w") and wrap it in a RAZF handle.  Returns NULL when the mode is not
+ * recognised or the file cannot be opened.
+ * The knetfile handle has its own variable so that a pointer is no longer
+ * compared with `< 0` through a shadowed `fd`. */
+static inline RAZF* _razf_open(const char *filename, const char *mode, int _load_index){
+	int fd;
+	RAZF *rz;
+	if(strstr(mode, "r")){
+#ifdef _USE_KNETFILE
+		knetFile *fp = knet_open(filename, "r");
+		if (fp == 0) {
+			fprintf(stderr, "[_razf_open] fail to open %s\n", filename);
+			return NULL;
+		}
+		rz = razf_open_r(fp, _load_index);
+#else
+#ifdef _WIN32
+		fd = open(filename, O_RDONLY | O_BINARY);
+#else
+		fd = open(filename, O_RDONLY);
+#endif
+		if(fd < 0) return NULL;
+		rz = razf_open_r(fd, _load_index);
+#endif
+	} else if(strstr(mode, "w")){
+#ifdef _WIN32
+		fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0666);
+#else
+		fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0666);
+#endif
+		if(fd < 0) return NULL;
+		rz = razf_open_w(fd);
+	} else return NULL;
+	return rz;
+}
+
+/* Open a file by name; for RAZF input the block index is loaded. */
+RAZF* razf_open(const char *filename, const char *mode){
+	const int with_index = 1;
+	return _razf_open(filename, mode, with_index);
+}
+
+/* Open a file by name without loading the block index. */
+RAZF* razf_open2(const char *filename, const char *mode){
+	const int with_index = 0;
+	return _razf_open(filename, mode, with_index);
+}
+
+/* Report the uncompressed (*u_size) and compressed (*c_size) sizes of a
+ * file opened for reading.  Returns 1 when the sizes are known, 0 otherwise
+ * (not a reader, plain gzip input, RAZF input whose two sizes are equal, or
+ * a plain file whose current position cannot be queried).
+ * For a plain file both sizes are the file length, which is determined by
+ * seeking to the end on first use and cached in rz->end. */
+int razf_get_data_size(RAZF *rz, int64_t *u_size, int64_t *c_size){
+	int64_t n;
+	if(rz->mode != 'r' && rz->mode != 'R') return 0;
+
+	if(rz->file_type == FILE_TYPE_RZ){
+		if(rz->src_end == rz->end) return 0;
+		*u_size = rz->src_end;
+		*c_size = rz->end;
+		return 1;
+	}
+	if(rz->file_type != FILE_TYPE_PLAIN) return 0;
+
+	if(rz->end == 0x7fffffffffffffffLL){
+		/* length not measured yet: remember position, seek to end, restore */
+#ifdef _USE_KNETFILE
+		if(knet_seek(rz->x.fpr, 0, SEEK_CUR) == -1) return 0;
+		n = knet_tell(rz->x.fpr);
+		knet_seek(rz->x.fpr, 0, SEEK_END);
+		rz->end = knet_tell(rz->x.fpr);
+		knet_seek(rz->x.fpr, n, SEEK_SET);
+#else
+		if((n = lseek(rz->filedes, 0, SEEK_CUR)) == -1) return 0;
+		rz->end = lseek(rz->filedes, 0, SEEK_END);
+		lseek(rz->filedes, n, SEEK_SET);
+#endif
+	}
+	*u_size = *c_size = rz->end;
+	return 1;
+}
+
+/* Produce up to `size` bytes of uncompressed data into `data`.
+ * Plain files are read directly.  Compressed input is inflated with Z_BLOCK
+ * so that the call stops at the end of every non-final deflate block; in
+ * that case buf_flush is set and next_block_pos records the compressed
+ * offset of the following block.
+ * Returns the number of bytes produced (never negative).  A failing read()
+ * is reported on stderr and ends the stream: its result is no longer stored
+ * in the unsigned avail_in nor returned as -1 to the caller. */
+static int _razf_read(RAZF* rz, void *data, int size){
+	int ret, tin, got;
+	int64_t want;
+	if(rz->z_eof || rz->z_err) return 0;
+	if (rz->file_type == FILE_TYPE_PLAIN) {
+#ifdef _USE_KNETFILE
+		ret = knet_read(rz->x.fpr, data, size);
+#else
+		ret = read(rz->filedes, data, size);
+#endif        
+		if (ret < 0) {
+			fprintf(stderr, "[_razf_read] read error (at %s:%d)\n", __FILE__, __LINE__);
+			/* z_eof (not z_err) so that a later seek can clear the state */
+			rz->z_eof = 1;
+			return 0;
+		}
+		if (ret == 0) rz->z_eof = 1;
+		return ret;
+	}
+	rz->stream->avail_out = size;
+	rz->stream->next_out  = data;
+	while(rz->stream->avail_out){
+		if(rz->stream->avail_in == 0){
+			/* refill the input buffer, never reading past rz->end */
+			if(rz->in >= rz->end){ rz->z_eof = 1; break; }
+			want = (rz->end - rz->in < RZ_BUFFER_SIZE) ? (rz->end - rz->in) : RZ_BUFFER_SIZE;
+#ifdef _USE_KNETFILE
+			got = knet_read(rz->x.fpr, rz->inbuf, want);
+#else
+			got = read(rz->filedes, rz->inbuf, want);
+#endif        
+			if(got < 0){
+				fprintf(stderr, "[_razf_read] read error (at %s:%d)\n", __FILE__, __LINE__);
+				rz->z_err = 1;
+				rz->z_eof = 1;
+				break;
+			}
+			if(got == 0){
+				rz->z_eof = 1;
+				break;
+			}
+			rz->stream->avail_in = got;
+			rz->stream->next_in = rz->inbuf;
+		}
+		tin = rz->stream->avail_in;
+		ret = inflate(rz->stream, Z_BLOCK);
+		rz->in += tin - rz->stream->avail_in;
+		if(ret == Z_NEED_DICT || ret == Z_MEM_ERROR || ret == Z_DATA_ERROR){
+			fprintf(stderr, "[_razf_read] inflate error: %d %s (at %s:%d)\n", ret, rz->stream->msg ? rz->stream->msg : "", __FILE__, __LINE__);
+			rz->z_err = 1;
+			break;
+		}
+		if(ret == Z_STREAM_END){
+			rz->z_eof = 1;
+			break;
+		}
+		/* data_type bit 128: stopped at a block boundary; bit 64: inside
+		 * the final block.  Stop at the end of every non-final block. */
+		if ((rz->stream->data_type&128) && !(rz->stream->data_type&64)){
+			rz->buf_flush = 1;
+			rz->next_block_pos = rz->in;
+			break;
+		}
+	}
+	return size - rz->stream->avail_out;
+}
+
+/* Read up to `size` uncompressed bytes into `data`; returns the number of
+ * bytes delivered and advances rz->out by the same amount.
+ * Data is served from rz->outbuf, which is refilled through _razf_read.
+ * When a refill stopped at a block boundary (buf_flush), block_pos and
+ * block_off are moved to the new block once the buffer has been drained.
+ * The loop also ends when the stream is in error with nothing buffered
+ * (this used to spin forever) and when a refill reports a negative count. */
+int razf_read(RAZF *rz, void *data, int size){
+	char *dst = (char*)data;
+	int ori_size, n;
+	ori_size = size;
+	while(size > 0){
+		if(rz->buf_len){
+			if(size < rz->buf_len){
+				/* request satisfied from the buffer, data left over */
+				memcpy(dst, (char*)rz->outbuf + rz->buf_off, size);
+				rz->buf_off += size;
+				rz->buf_len -= size;
+				dst += size;
+				rz->block_off += size;
+				size = 0;
+				break;
+			} else {
+				/* drain the whole buffer */
+				memcpy(dst, (char*)rz->outbuf + rz->buf_off, rz->buf_len);
+				dst += rz->buf_len;
+				size -= rz->buf_len;
+				rz->block_off += rz->buf_len;
+				rz->buf_off = 0;
+				rz->buf_len = 0;
+				if(rz->buf_flush){
+					rz->block_pos = rz->next_block_pos;
+					rz->block_off = 0;
+					rz->buf_flush = 0;
+				}
+			}
+		} else if(rz->buf_flush){
+			rz->block_pos = rz->next_block_pos;
+			rz->block_off = 0;
+			rz->buf_flush = 0;
+		}
+		if(rz->buf_flush) continue;
+		n = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE);
+		if(n < 0){
+			/* low-level read failure: nothing usable was buffered */
+			rz->buf_len = 0;
+			break;
+		}
+		rz->buf_len = n;
+		if((rz->z_eof || rz->z_err) && rz->buf_len == 0) break;
+	}
+	rz->out += ori_size - size;
+	return ori_size - size;
+}
+
+/* Discard up to `size` uncompressed bytes; returns the number actually
+ * skipped and advances rz->out by the same amount.
+ * Mirrors razf_read without copying.  block_off is advanced BEFORE buf_len
+ * is cleared: the previous order added zero, leaving block_off (and hence
+ * razf_tell2 / razf_jump) short after a skip that drained the buffer.
+ * NOTE(review): the loop stops as soon as z_eof/z_err is set even if the
+ * last refill returned data, so bytes produced by the final refill are left
+ * in the buffer rather than skipped - confirm whether that is intended. */
+int razf_skip(RAZF* rz, int size){
+	int ori_size;
+	ori_size = size;
+	while(size > 0){
+		if(rz->buf_len){
+			if(size < rz->buf_len){
+				rz->buf_off += size;
+				rz->buf_len -= size;
+				rz->block_off += size;
+				size = 0;
+				break;
+			} else {
+				size -= rz->buf_len;
+				rz->block_off += rz->buf_len;
+				rz->buf_off = 0;
+				rz->buf_len = 0;
+				if(rz->buf_flush){
+					rz->block_pos = rz->next_block_pos;
+					rz->block_off = 0;
+					rz->buf_flush = 0;
+				}
+			}
+		} else if(rz->buf_flush){
+			rz->block_pos = rz->next_block_pos;
+			rz->block_off = 0;
+			rz->buf_flush = 0;
+		}
+		if(rz->buf_flush) continue;
+		rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE);
+		if(rz->buf_len < 0){
+			/* low-level read failure: nothing usable was buffered */
+			rz->buf_len = 0;
+			break;
+		}
+		if(rz->z_eof || rz->z_err) break;
+	}
+	rz->out += ori_size - size;
+	return ori_size - size;
+}
+
+/* Restart decompression at compressed offset `in`, which corresponds to
+ * uncompressed offset `out`: reposition the file, reset the inflate state
+ * and drop everything that was buffered. */
+static void _razf_reset_read(RAZF *rz, int64_t in, int64_t out){
+#ifdef _USE_KNETFILE
+	knet_seek(rz->x.fpr, in, SEEK_SET);
+#else
+	lseek(rz->filedes, in, SEEK_SET);
+#endif
+	/* stream positions */
+	rz->in  = in;
+	rz->out = out;
+	rz->block_pos = in;
+	rz->next_block_pos = in;
+	rz->block_off = 0;
+	/* status flags */
+	rz->buf_flush = 0;
+	rz->z_err = 0;
+	rz->z_eof = 0;
+	/* decoder and buffers */
+	inflateReset(rz->stream);
+	rz->stream->avail_in = 0;
+	rz->buf_len = 0;
+	rz->buf_off = 0;
+}
+
+/* Move to the position `block_offset` uncompressed bytes into the block
+ * that starts at compressed offset `block_start`.
+ * Plain files: seek to block_start + block_offset and return the resulting
+ * file position.  Otherwise: if the target lies in the current block at or
+ * after the current offset, only the difference is skipped; else the
+ * decoder is restarted at block_start (0 is mapped to the end of the gzip
+ * header) and block_offset bytes are skipped.  Returns rz->block_off. */
+int64_t razf_jump(RAZF *rz, int64_t block_start, int block_offset){
+	int64_t pos;
+	rz->z_eof = 0;
+	if(rz->file_type == FILE_TYPE_PLAIN){
+		rz->buf_off = rz->buf_len = 0;
+		pos = block_start + block_offset;
+#ifdef _USE_KNETFILE
+		knet_seek(rz->x.fpr, pos, SEEK_SET);
+		pos = knet_tell(rz->x.fpr);
+#else
+		pos = lseek(rz->filedes, pos, SEEK_SET);
+#endif
+		rz->out = rz->in = pos;
+		return pos;
+	}
+	if(block_start == rz->block_pos && block_offset >= rz->block_off){
+		/* forward move inside the current block: no inflate reset needed */
+		block_offset -= rz->block_off;
+	} else {
+		/* a block_start of 0 cannot be right; use the first block instead */
+		if(block_start == 0) block_start = rz->header_size;
+		_razf_reset_read(rz, block_start, 0);
+	}
+	if(block_offset) razf_skip(rz, block_offset);
+	return rz->block_off;
+}
+
+/* Seek to uncompressed position `pos` (interpreted via `where` like lseek)
+ * and return the resulting uncompressed position.
+ *   - plain file: a direct seek on the underlying file;
+ *   - gzip file: forward only, implemented by skipping; a backward request
+ *     leaves the position unchanged;
+ *   - RAZF file: the block index gives the compressed offset of the block
+ *     containing `pos`; decoding restarts there and the rest is skipped.
+ * When no index is available (unseekable input, or opened without index)
+ * rz->index is NULL, so a backward seek must not reach the index lookup:
+ * a seekable file is rewound to its first block and skipped forward, an
+ * unseekable one keeps its position (same as the gzip case).
+ * NOTE(review): the skip distance is narrowed to int, so a single forward
+ * skip of 2 GiB or more is not handled. */
+int64_t razf_seek(RAZF* rz, int64_t pos, int where){
+	int64_t idx;
+	int64_t seek_pos, new_out;
+	rz->z_eof = 0;
+	if (where == SEEK_CUR) pos += rz->out;
+	else if (where == SEEK_END) pos += rz->src_end;
+	if(rz->file_type == FILE_TYPE_PLAIN){
+#ifdef _USE_KNETFILE
+		knet_seek(rz->x.fpr, pos, SEEK_SET);
+        seek_pos = knet_tell(rz->x.fpr);
+#else
+		seek_pos = lseek(rz->filedes, pos, SEEK_SET);
+#endif
+		rz->buf_off = rz->buf_len = 0;
+		rz->out = rz->in = seek_pos;
+		return seek_pos;
+	} else if(rz->file_type == FILE_TYPE_GZ){
+		if(pos >= rz->out) goto SKIP;
+		return rz->out;
+	}
+	if(pos == rz->out) return pos;
+	if(pos > rz->src_end) return rz->out;
+	if(!rz->seekable || !rz->load_index){
+		if(pos < rz->out){
+			if(!rz->seekable) return rz->out;
+			/* no index: restart from the first block */
+			_razf_reset_read(rz, rz->header_size, 0);
+		}
+		goto SKIP;
+	}
+	/* index entry k describes the block that starts at (k+1)*RZ_BLOCK_SIZE */
+	idx = pos / RZ_BLOCK_SIZE - 1;
+	seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]);
+	new_out  = (idx + 1) * RZ_BLOCK_SIZE;
+	/* already inside the target block and before pos: just skip forward */
+	if(pos > rz->out && new_out <= rz->out) goto SKIP;
+	_razf_reset_read(rz, seek_pos, new_out);
+	SKIP:
+	razf_skip(rz, (int)(pos - rz->out));
+	return rz->out;
+}
+
+/* Return the current position as a virtual offset: the compressed offset
+ * of the current block in the upper bits and the low 16 bits of the offset
+ * inside that block in the lower 16 bits. */
+uint64_t razf_tell2(RAZF *rz)
+{
+	/*
+	if (rz->load_index) {
+		int64_t idx, seek_pos;
+		idx = rz->out / RZ_BLOCK_SIZE - 1;
+		seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]);
+		if (seek_pos != rz->block_pos || rz->out%RZ_BLOCK_SIZE != rz->block_off)
+			fprintf(stderr, "[razf_tell2] inconsistent block offset: (%lld, %lld) != (%lld, %lld)\n",
+					(long long)seek_pos, (long long)rz->out%RZ_BLOCK_SIZE, (long long)rz->block_pos, (long long) rz->block_off);
+	}
+	*/
+	const uint64_t block_part  = (uint64_t)rz->block_pos << 16;
+	const uint64_t offset_part = rz->block_off & 0xffff;
+	return block_part | offset_part;
+}
+
+/* Seek to a virtual offset as produced by razf_tell2.  Only SEEK_SET is
+ * supported; any other `where` returns -1. */
+int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where)
+{
+	if (where == SEEK_SET)
+		return razf_jump(rz, voffset>>16, voffset&0xffff);
+	return -1;
+}
+
+/* Finish and release a RAZF handle, closing the underlying file.
+ * Writer: the deflate stream is finished, then the block index and two
+ * big-endian 64-bit totals (uncompressed size, compressed size) are
+ * appended.  Reader: the inflate state is released if one exists. */
+void razf_close(RAZF *rz){
+	if(rz->mode == 'w'){
+#ifndef _RZ_READONLY
+#ifdef _USE_KNETFILE
+		const int out_fd = rz->x.fpw;
+#else
+		const int out_fd = rz->filedes;
+#endif
+		razf_end_flush(rz);
+		deflateEnd(rz->stream);
+		save_zindex(rz, out_fd);
+		if(is_big_endian()){
+			write(out_fd, &rz->in, sizeof(int64_t));
+			write(out_fd, &rz->out, sizeof(int64_t));
+		} else {
+			uint64_t swapped;
+			swapped = byte_swap_8((uint64_t)rz->in);
+			write(out_fd, &swapped, sizeof(int64_t));
+			swapped = byte_swap_8((uint64_t)rz->out);
+			write(out_fd, &swapped, sizeof(int64_t));
+		}
+#endif
+	} else if(rz->mode == 'r'){
+		if(rz->stream) inflateEnd(rz->stream);
+	}
+
+	/* buffers */
+	if(rz->inbuf) free(rz->inbuf);
+	if(rz->outbuf) free(rz->outbuf);
+	/* gzip header (writer only) */
+	if(rz->header){
+		free(rz->header->extra);
+		free(rz->header->name);
+		free(rz->header->comment);
+		free(rz->header);
+	}
+	/* block index */
+	if(rz->index){
+		free(rz->index->bin_offsets);
+		free(rz->index->cell_offsets);
+		free(rz->index);
+	}
+	free(rz->stream);
+
+	/* underlying file */
+#ifdef _USE_KNETFILE
+	if (rz->mode == 'r')
+		knet_close(rz->x.fpr);
+	if (rz->mode == 'w')
+		close(rz->x.fpw);
+#else
+	close(rz->filedes);
+#endif
+	free(rz);
+}
+
+#endif
diff --git a/razip.c b/razip.c
new file mode 100644
index 0000000..825e732
--- /dev/null
+++ b/razip.c
@@ -0,0 +1,141 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include "razf.h"
+
+#define WINDOW_SIZE 4096
+
+/* Print the command-line help to standard output.  Always returns 0 so
+ * callers can write `return razf_main_usage();`. */
+static int razf_main_usage()
+{
+	fputs("\n"
+	      "Usage:   razip [options] [file] ...\n\n"
+	      "Options: -c      write on standard output, keep original files unchanged\n"
+	      "         -d      decompress\n"
+	      "         -l      list compressed file contents\n"
+	      "         -b INT  decompress at INT position in the uncompressed file\n"
+	      "         -s INT  decompress INT bytes in the uncompressed file\n"
+	      "         -h      give this help\n"
+	      "\n", stdout);
+	return 0;
+}
+
+/* Open `fn` for writing and return the descriptor; exits with status 1 on
+ * failure.
+ * Unless `is_forced` is set, an existing file is only overwritten after
+ * the user answers 'y' or 'Y' on standard input.  A failed read of the
+ * answer (e.g. end of input) counts as "no" instead of testing an
+ * uninitialised character. */
+static int write_open(const char *fn, int is_forced)
+{
+	int fd = -1;
+	char c = 'n';
+	if (!is_forced) {
+		/* O_EXCL makes the open fail with EEXIST if the file is already there */
+		if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) {
+			printf("razip: %s already exists; do you wish to overwrite (y or n)? ", fn);
+			fflush(stdout);
+			if (scanf("%c", &c) != 1 || (c != 'Y' && c != 'y')) {
+				printf("razip: not overwritten\n");
+				exit(1);
+			}
+		}
+	}
+	if (fd < 0) {
+		/* forced, confirmed, or the exclusive open failed for another reason */
+		if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) {
+			fprintf(stderr, "razip: %s: Fail to write\n", fn);
+			exit(1);
+		}
+	}
+	return fd;
+}
+
+/* razip entry point.
+ *   default  compress FILE to FILE.rz (or stdin to stdout with -c)
+ *   -d       decompress FILE.rz, optionally only the region given by -b/-s
+ *   -l       list compressed/uncompressed sizes
+ * Unless -c is given the source file is removed after a successful run; it
+ * is kept when a read, write or decompression error was detected, and the
+ * exit status is then 1. */
+int main(int argc, char **argv)
+{
+	int c, compress, pstdout, is_forced;
+	RAZF *rz;
+	void *buffer;
+	long start, end, size;
+
+	compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
+	while((c  = getopt(argc, argv, "cdlhfb:s:")) >= 0){
+		switch(c){
+		case 'h': return razf_main_usage();
+		case 'd': compress = 0; break;
+		case 'c': pstdout = 1; break;
+		case 'l': compress = 2; break;
+		case 'b': start = atol(optarg); break;
+		case 's': size = atol(optarg); break;
+		case 'f': is_forced = 1; break;
+		}
+	}
+	if (size >= 0) end = start + size;
+	if(end >= 0 && end < start){
+		fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end);
+		return 1;
+	}
+	if(compress == 1){
+		int f_src, f_dst = -1;
+		if(argc > optind){
+			if((f_src = open(argv[optind], O_RDONLY)) < 0){
+				fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+				return 1;
+			}
+			if(pstdout){
+				f_dst = fileno(stdout);
+			} else {
+				/* room for the name, ".rz" and the terminating NUL */
+				char *name = malloc(strlen(argv[optind]) + 4);
+				if (name == NULL) {
+					fprintf(stderr, "razip: out of memory\n");
+					return 1;
+				}
+				strcpy(name, argv[optind]);
+				strcat(name, ".rz");
+				f_dst = write_open(name, is_forced);
+				free(name);
+				if (f_dst < 0) return 1;
+			}
+		} else if(pstdout){ 
+			f_src = fileno(stdin);
+			f_dst = fileno(stdout);
+		} else return razf_main_usage();
+		rz = razf_dopen(f_dst, "w");
+		if (rz == NULL) {
+			fprintf(stderr, "razip: Fail to open the output stream\n");
+			return 1;
+		}
+		buffer = malloc(WINDOW_SIZE);
+		if (buffer == NULL) {
+			fprintf(stderr, "razip: out of memory\n");
+			return 1;
+		}
+		while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) razf_write(rz, buffer, c);
+		razf_close(rz); // f_dst will be closed here
+		free(buffer);
+		close(f_src);
+		if (c < 0) {
+			/* incomplete output: never delete the source in this case */
+			fprintf(stderr, "razip: read error; source file is kept\n");
+			return 1;
+		}
+		if (argc > optind && !pstdout) unlink(argv[optind]);
+		return 0;
+	} else {
+		int failed = 0;
+		if(argc <= optind) return razf_main_usage();
+		if(compress == 2){
+			rz = razf_open(argv[optind], "r");
+			if(rz == NULL){
+				fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+				return 1;
+			}
+			if(rz->file_type == FILE_TYPE_RZ) {
+				printf("%20s%20s%7s %s\n", "compressed", "uncompressed", "ratio", "name");
+				printf("%20lld%20lld%6.1f%% %s\n", (long long)rz->end, (long long)rz->src_end, rz->end * 100.0f / rz->src_end,
+					   argv[optind]);
+			} else fprintf(stdout, "%s is not a regular rz file\n", argv[optind]);
+		} else {
+			int f_dst, off, w;
+			char *name = NULL;
+			if (!pstdout) {
+				/* the input name must end in ".rz"; compare the tail directly */
+				size_t len = strlen(argv[optind]);
+				if (len < 3 || strcmp(argv[optind] + len - 3, ".rz") != 0) {
+					printf("razip: %s: unknown suffix -- ignored\n", argv[optind]);
+					return 1;
+				}
+				name = strdup(argv[optind]);
+				if (name == NULL) {
+					fprintf(stderr, "razip: out of memory\n");
+					return 1;
+				}
+				name[len - 3] = '\0';
+			}
+			/* open the input before creating or truncating the output */
+			rz = razf_open(argv[optind], "r");
+			if(rz == NULL){
+				fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+				free(name);
+				return 1;
+			}
+			if (name != NULL) {
+				f_dst = write_open(name, is_forced);
+				free(name);
+			} else f_dst = fileno(stdout);
+			buffer = malloc(WINDOW_SIZE);
+			if (buffer == NULL) {
+				fprintf(stderr, "razip: out of memory\n");
+				return 1;
+			}
+			razf_seek(rz, start, SEEK_SET);
+			while(1){
+				if(end < 0) c = razf_read(rz, buffer, WINDOW_SIZE);
+				else c = razf_read(rz, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
+				if(c <= 0) break;
+				start += c;
+				/* write() may be partial; loop until the chunk is out */
+				for(off = 0; off < c; off += w){
+					w = write(f_dst, (char*)buffer + off, c - off);
+					if(w <= 0){ failed = 1; break; }
+				}
+				if(failed) break;
+				if(end >= 0 && start >= end) break;
+			}
+			free(buffer);
+			if (rz->z_err) failed = 1;
+			if (!pstdout) {
+				close(f_dst);
+				if (!failed) unlink(argv[optind]);
+			}
+			if (failed) fprintf(stderr, "razip: %s: decompression failed; source file is kept\n", argv[optind]);
+		}
+		razf_close(rz);
+		return failed ? 1 : 0;
+	}
+}
+
diff --git a/sam-stats.cpp b/sam-stats.cpp
new file mode 100644
index 0000000..9f0cd7d
--- /dev/null
+++ b/sam-stats.cpp
@@ -0,0 +1,1121 @@
+/*
+# Copyright (c) 2011 Erik Aronesty
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# ALSO, IT WOULD BE NICE IF YOU LET ME KNOW YOU USED IT.
+*/
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <sys/stat.h>
+
+#include <string>
+#include <google/sparse_hash_map> // or sparse_hash_set, dense_hash_map, ...
+#include <google/dense_hash_map>  // or sparse_hash_set, dense_hash_map, ...
+
+#include <samtools/sam.h>         // samtools api
+
+#include "fastq-lib.h"
+
+const char * VERSION = "1.38";
+
+#define SVNREV atoi(strchr("$LastChangedRevision: 681 $", ':')+1)
+
+using namespace std;
+
+void usage(FILE *f);
+
+#define MAX_MAPQ 300
+// this factor is based on a quick empirical look at a few bam files....
+#define VFACTOR 1.5
+
+//#define max(a,b) (a>b?a:b)
+//#define min(a,b) (a<b?a:b)
+#define meminit(l) (memset(&l,0,sizeof(l)))
+#define debugout(s,...) if (debug) fprintf(stderr,s,##__VA_ARGS__)
+#undef warn
+#define warn(s,...) ((++errs), fprintf(stderr,s,##__VA_ARGS__))
+#define stdev(cnt, sum, ssq) sqrt((((double)cnt)*ssq-pow((double)sum,2)) / ((double)cnt*((double)cnt-1)))
+
+template <class vtype> 
+    double quantile(const vtype &vec, double p);
+
+template <class itype> 
+    double quantile(const vector<itype> &vec, double p);
+
+std::string string_format(const std::string &fmt, ...);
+
+int debug=0;
+int errs=0;
+extern int optind;
+int histnum=30;
+bool isbwa=false;
+int rnamode = 0;
+bool allow_no_reads = false;
+
+// from http://programerror.com/2009/10/iterative-calculation-of-lies-er-stats/
+//
+// Single-pass accumulator: feed values with Push(), then read the mean and the
+// higher-moment statistics at any time without storing the samples.
+class cRunningStats
+{
+private:
+  double m_n;  // count
+  double m_m1; // mean
+  double m_m2; // second moment
+  double m_m3; // third moment
+  double m_m4; // fourth moment
+public:
+  cRunningStats() : m_n(0.0), m_m1(0.0), m_m2(0.0), m_m3(0.0), m_m4(0.0)
+    { ; }
+
+  // Fold one more sample into the running moments.
+  // The update order matters: m4 needs the old m2 and m3, m3 needs the old m2.
+  void Push(double x)
+  {
+    m_n++;
+    double d = (x - m_m1);
+    double d_n = d / m_n;
+    double d_n2 = d_n * d_n;
+    m_m4 += d * d_n2 * d_n * ((m_n - 1) * ((m_n * m_n) - 3 * m_n + 3)) +
+            6 * d_n2 * m_m2 - 4 * d_n * m_m3;
+    m_m3 += d * d_n2 * ((m_n - 1) * (m_n - 2)) - 3 * d_n * m_m2;
+    m_m2 += d * d_n * (m_n - 1);
+    m_m1 += d_n;
+  }
+
+  double Mean() const { return m_m1; }
+  double StdDeviation() const { return sqrt(Variance()); }
+  // the following two return 0.0 until at least two samples were pushed
+  double StdError() const { return (m_n > 1.0) ? sqrt(Variance() / m_n) : 0.0; }
+  double Variance() const { return (m_n > 1.0) ? (m_m2 / (m_n - 1.0)) : 0.0; }
+  // With no spread (second moment 0: nothing pushed, or all samples equal) the
+  // ratios below are 0/0; report 0.0 instead of NaN, matching the guards above.
+  double Skewness() const { return (m_m2 > 0.0) ? sqrt(m_n) * m_m3 / pow(m_m2, 1.5) : 0.0; }
+  double Kurtosis() const { return (m_m2 > 0.0) ? m_n * m_m4 / (m_m2 * m_m2) : 0.0; }
+};
+
+/// if we use this a lot may want to make it variable size
+/// Per-reference-sequence coverage record, one per entry in sstats::covr.
+class scoverage {
+public:
+	// everything starts at zero; the histogram gets histnum+2 zeroed slots,
+	// sized from the global histnum at the moment the record is created
+	scoverage() : mapb(0), mapr(0), reflen(0), dist(histnum+2) {}
+
+	long long int mapb;     // mapped bases totalled for this reference
+	long int mapr;          // read counter, bumped by sstats::dostats in rna mode
+	cRunningStats spos;     // running statistics over mapped positions
+	int reflen;             // reference length taken from the file header (0 = unknown)
+	vector <int> dist;      // position histogram buckets
+};
+
+// sorted integer bucket ... good for ram with small max size, slow to access
+//
+// Counting-sort style container for small non-negative integers: push() only
+// bumps a per-value counter, and operator[] walks the counters to find the
+// n-th smallest value that was pushed.
+class ibucket {
+public:
+	int tot;                    // number of values pushed so far
+	std::vector<int> dat;       // dat[v] = how many times v was pushed
+
+	// max is the largest value that may be pushed
+	ibucket(int max) {dat.resize(max+1);tot=0;}
+	int size() const {return tot;}
+
+	// Return the n-th smallest stored value (0-based).
+	// n must be in [0, size()); when asserts are compiled out and n is too
+	// large, the highest bucket index is returned instead of running off the
+	// end of the function without a return value.
+	int operator[] (int n) const {
+		assert(n >= 0 && n < size());
+		for (size_t i=0;i<dat.size();++i) {
+			if (n < dat[i]) {
+				return (int) i;
+			}
+			n-=dat[i];
+		}
+		return dat.empty() ? 0 : (int) dat.size()-1;
+	}
+
+	// Record one occurrence of v; v must be in [0, max].
+	// Out-of-range values trip the assert, and are dropped (rather than
+	// written out of bounds) when asserts are compiled out.
+	void push(int v) {
+		assert(v >= 0 && (size_t) v < dat.size());
+		if (v < 0 || (size_t) v >= dat.size())
+			return;
+		++dat[v];
+		++tot;
+	}
+};
+
+// One read parked in sstats::petab until its mate is seen
+// (filled in by sstats::dostats when paired fastq output is on).
+class fqent {
+    public:
+    // bits starts at 0 so a default-constructed entry (as created by the hash
+    // map's operator[]) never carries an indeterminate value
+    fqent() : bits(0) {}
+    int bits;           // masked alignment flag of the parked read
+    std::string r;      // read sequence
+    std::string q;      // quality string
+};
+
+// All statistics gathered for one input file, plus the parsers that feed them.
+class sstats {
+public:
+	ibucket vmapq;			// all map qualities
+
+	sstats() : vmapq(MAX_MAPQ) {
+		memset(&dat, 0, sizeof(dat));
+		// the google hash maps need their sentinel keys registered before use
+		covr.set_empty_key("-");
+		petab.set_deleted_key("-");
+	}
+
+	~sstats() {
+		covr.clear();
+	}
+
+	// plain counters; zero-filled as a block by the constructor
+	struct {
+		int n, mapn, mapzero;		// # of entries, # of mapped entries, # skipped for pos <= 0
+		int lenmin, lenmax;		// read length min/max
+		double lensum, lenssq;		// read length sum / sum of squares
+		double mapsum, mapssq;		// map quality sum/ssq
+		double nmnz, nmsum;		// # of mismatched reads, sum of mismatch lengths
+		long long int nbase;		// num bases sampled
+		int qualmax, qualmin;		// min/max qual
+		double qualsum, qualssq;	// sum quals, sum-squared qual
+		int nrev, nfor;			// rev reads, for reads
+		double tmapb;			// number of mapped bases
+		long long int basecnt[5];	// per-base counts, indexed by T_A..T_N
+		int del, ins;			// length total dels/ins found
+		bool pe;			// paired-end ? 0 or 1
+		int disc;			// mates on a different reference
+		int disc_pos;			// mates on the same reference but far apart
+		int dupmax;			// max dups found
+	} dat;
+
+	vector<int> visize;		// all insert sizes
+	google::dense_hash_map<std::string, scoverage> covr;	// # mapped per ref seq
+	google::sparse_hash_map<std::string, int> dups;		// alignments by read-id (not necessary for some pipes)
+	google::sparse_hash_map<std::string, fqent> petab;		// peread table
+
+	// file-format neutral ... called per read... warning seq/qual are not necessarily null-terminated
+	void dostats(string name, int rlen, int bits, const string &ref, int pos, int mapq, const string &materef, int nmate, const string &seq, const char *qual, int nm, int del, int ins);
+
+	// read a bam/sam file and call dostats over and over
+	bool parse_bam(const char *in);
+	bool parse_sam(FILE *f);
+};
+
+// slots of sstats::dat.basecnt, one per base letter
+#define T_A 0
+#define T_C 1
+#define T_G 2
+#define T_T 3
+#define T_N 4
+
+// fills basemap[] (defined elsewhere in this file)
+void build_basemap();
+
+int dupreads = 1000000;
+int max_chr = 1000;             // per-reference lines are skipped when more references than this have coverage
+bool trackdup = false;          // count alignments per read id
+
+// fastq outputs, opened by main when fastq output is requested
+FILE *sefq = NULL;              // single-end reads
+FILE *pefq1 = NULL;             // paired-end, file 1
+FILE *pefq2 = NULL;             // paired-end, file 2
+
+int basemap[256];               // base character -> basecnt slot lookup
+// Wrap a file name in single quotes for /bin/sh, escaping embedded single
+// quotes, so a name such as  it's.sam.gz  cannot break out of the command.
+static string quote_for_shell(const char *s) {
+	string q = "'";
+	for (; *s; ++s) {
+		if (*s == '\'')
+			q += "'\\''";
+		else
+			q += *s;
+	}
+	q += "'";
+	return q;
+}
+
+// Close an input stream with the call that matches how it was opened.
+static int close_input(FILE *f, bool piped) {
+	return piped ? pclose(f) : fclose(f);
+}
+
+// Close whichever fastq outputs are still open and forget them.
+static void close_fastq_outputs() {
+	if (sefq) fclose(sefq);
+	if (pefq1) fclose(pefq1);
+	if (pefq2) fclose(pefq2);
+	sefq = pefq1 = pefq2 = NULL;
+}
+
+// Close the per-file report outputs; stdout is left alone, and rnao is only
+// closed when it is a separate file rather than an alias of o.
+static void close_report_outputs(FILE *o, FILE *rnao) {
+	if (rnao && rnao != o) fclose(rnao);
+	if (o && o != stdout) fclose(o);
+}
+
+// Print the statistics gathered in s for input "in".
+// Summary lines go to o; the per-reference rna-mode detail lines go to rnao.
+// Sorts s.visize and pushes two anchor positions into each reported
+// reference's position stats as a side effect.
+static void emit_stats_report(FILE *o, FILE *rnao, sstats &s, const char *in) {
+	// sort sstats
+	sort(s.visize.begin(), s.visize.end());
+
+	int phred = s.dat.qualmin < 64 ? 33 : 64;
+	if (!s.dat.n && ! allow_no_reads) {
+		warn("No reads in %s\n", in);
+		return;
+	}
+	fprintf(o, "reads\t%d\n", s.dat.n);
+	fprintf(o, "version\t%s.%d\n", VERSION, SVNREV);
+
+	// mapped reads is the number of reads that mapped at least once (either mated or not)
+	if (s.dat.mapn > 0) {
+		if (trackdup && s.dat.dupmax > (s.dat.pe+1)) {
+			google::sparse_hash_map<string,int>::iterator it = s.dups.begin();
+			int amb = 0;
+			int sing = 0;
+			while(it!=s.dups.end()) {
+				// *not* making the distinction between 2 singleton mappings and 1 paired here
+				if (it->second > (s.dat.pe+1)) {
+					++amb;
+				}
+				if (it->second == 1 && s.dat.pe) {
+					++sing;
+				}
+				++it;
+			}
+			int mapped = (int) s.dups.size()*(s.dat.pe+1)-sing;
+
+			fprintf(o,"mapped reads\t%d\n", mapped);
+			if (amb > 0) {
+				int unmapped=s.dat.n-s.dat.mapn;
+				fprintf(o,"pct align\t%.6f\n", 100.0*((double)mapped/(double)(mapped+unmapped)));
+				fprintf(o,"ambiguous\t%d\n", amb*(s.dat.pe+1));
+				fprintf(o,"pct ambiguous\t%.6f\n", 100.0*((double)amb/(double)s.dups.size()));
+				fprintf(o,"max dup align\t%.d\n", s.dat.dupmax-s.dat.pe);
+			} else {
+				// no ambiguous mappings... simple
+				fprintf(o, "pct align\t%.6f\n", 100.0*(double)s.dat.mapn/(double)s.dat.n);
+			}
+			if (sing)
+				fprintf(o,"singleton mappings\t%.d\n", sing);
+			// number of total mappings
+			fprintf(o, "total mappings\t%d\n", s.dat.mapn);
+		} else {
+			// dup-id's not tracked
+			fprintf(o, "mapped reads\t%d\n", s.dat.mapn);
+			fprintf(o, "pct align\t%.6f\n", 100.0*(double)s.dat.mapn/(double)s.dat.n);
+			// todo: add support for bwa's multiple alignment tag
+			// fprintf(o, "total mappings\t%d\n", s.dat.mapn);
+		}
+	} else {
+		fprintf(o, "mapped reads\t%d\n", s.dat.mapn);
+	}
+
+	if (s.dat.mapzero > 0) {
+		fprintf(o, "skipped mappings\t%d\n", s.dat.mapzero);
+	}
+
+	fprintf(o, "mapped bases\t%.0f\n", s.dat.tmapb);
+	if (s.dat.pe) {
+		fprintf(o, "library\tpaired-end\n");
+	}
+	if (s.dat.disc > 0) {
+		fprintf(o, "discordant mates\t%d\n", s.dat.disc);
+	}
+	if (s.dat.disc_pos > 0) {
+		fprintf(o, "distant mates\t%d\n", s.dat.disc_pos);
+	}
+
+	if (s.dat.mapn <= 0) {
+		if (s.covr.size() > 1) {
+			fprintf(o,"num ref seqs\t%d\n", (int) s.covr.size());
+		}
+		return;
+	}
+
+	if (s.dat.mapn > 100) {
+		// at least 100 mappings to call a meaningful "percentage"
+		fprintf(o, "pct forward\t%.3f\n", 100*(s.dat.nfor/(double)(s.dat.nfor+s.dat.nrev)));
+	}
+
+	fprintf(o, "phred\t%d\n", phred);
+	fprintf(o, "forward\t%d\n", s.dat.nfor);
+	fprintf(o, "reverse\t%d\n", s.dat.nrev);
+	if (s.dat.lenmax != s.dat.lenmin) {
+		fprintf(o, "len max\t%d\n", s.dat.lenmax);
+		fprintf(o, "len mean\t%.4f\n", s.dat.lensum/s.dat.mapn);
+		fprintf(o, "len stdev\t%.4f\n", stdev(s.dat.mapn, s.dat.lensum, s.dat.lenssq));
+	} else {
+		fprintf(o, "len max\t%d\n", s.dat.lenmax);
+	}
+	fprintf(o, "mapq mean\t%.4f\n", s.dat.mapsum/s.dat.mapn);
+	fprintf(o, "mapq stdev\t%.4f\n", stdev(s.dat.mapn, s.dat.mapsum, s.dat.mapssq));
+
+	fprintf(o, "mapq Q1\t%.2f\n", quantile(s.vmapq,.25));
+	fprintf(o, "mapq median\t%.2f\n", quantile(s.vmapq,.50));
+	fprintf(o, "mapq Q3\t%.2f\n", quantile(s.vmapq,.75));
+
+	if (s.dat.lensum > 0) {
+		fprintf(o, "snp rate\t%.6f\n", s.dat.nmsum/s.dat.lensum);
+		if (s.dat.ins >0 ) fprintf(o, "ins rate\t%.6f\n", s.dat.ins/s.dat.lensum);
+		if (s.dat.del >0 ) fprintf(o, "del rate\t%.6f\n", s.dat.del/s.dat.lensum);
+		fprintf(o, "pct mismatch\t%.4f\n", 100.0*((double)s.dat.nmnz/s.dat.mapn));
+	}
+
+	if (s.visize.size() > 0) {
+		// mean/stdev are taken over the 10th..90th percentile only
+		double p10 = quantile(s.visize, .10);
+		double p90 = quantile(s.visize, .90);
+		double matsum=0, matssq=0;
+		int matc = 0;
+		for(size_t i=0;i<s.visize.size();++i) {
+			int v = s.visize[i];
+			if (v >= p10 && v <= p90) {
+				++matc;
+				matsum+=v;
+				matssq+=(double)v*v;        // square as double: v*v overflows int above 46340
+			}
+		}
+		fprintf(o, "insert mean\t%.4f\n", matsum/matc);
+		if (matc > 1) {
+			fprintf(o, "insert stdev\t%.4f\n", stdev(matc, matsum, matssq));
+			fprintf(o, "insert Q1\t%.2f\n", quantile(s.visize, .25));
+			fprintf(o, "insert median\t%.2f\n", quantile(s.visize, .50));
+			fprintf(o, "insert Q3\t%.2f\n", quantile(s.visize, .75));
+		}
+	}
+
+	if (s.dat.nbase >0) {
+		fprintf(o,"base qual mean\t%.4f\n", (s.dat.qualsum/s.dat.nbase)-phred);
+		fprintf(o,"base qual stdev\t%.4f\n", stdev(s.dat.nbase, s.dat.qualsum, s.dat.qualssq));
+		fprintf(o,"%%A\t%.4f\n", 100.0*((double)s.dat.basecnt[T_A]/(double)s.dat.nbase));
+		fprintf(o,"%%C\t%.4f\n", 100.0*((double)s.dat.basecnt[T_C]/(double)s.dat.nbase));
+		fprintf(o,"%%G\t%.4f\n", 100.0*((double)s.dat.basecnt[T_G]/(double)s.dat.nbase));
+		fprintf(o,"%%T\t%.4f\n", 100.0*((double)s.dat.basecnt[T_T]/(double)s.dat.nbase));
+		if (s.dat.basecnt[T_N] > 0) {
+			fprintf(o,"%%N\t%.4f\n", 100.0*((double)s.dat.basecnt[T_N]/(double)s.dat.nbase));
+		}
+	}
+
+	// how many ref seqs have mapped bases?
+	int mseq=0;
+	google::dense_hash_map<string,scoverage>::iterator it = s.covr.begin();
+	vector<string> vtmp;
+	bool haverlen = 0;
+	while (it != s.covr.end()) {
+		if (it->second.mapb > 0) {
+			++mseq;								// number of mapped refseqs
+			if (mseq <= max_chr) vtmp.push_back(it->first);		// don't bother if too many chrs
+			if (it->second.reflen > 0) haverlen = 1;
+		}
+		++it;
+	}
+	// don't print per-seq percentages if size is huge, or is 1
+	if ((haverlen || mseq > 1) && mseq <= max_chr) {			// worth reporting
+		// sort the id's
+		sort(vtmp.begin(),vtmp.end());
+		vector<string>::iterator vit=vtmp.begin();
+		double logb=log(2);
+		vector<double> vcovrvar;
+		vector<double> vcovr;
+		vector<double> vskew;
+		// for each chromosome or reference sequence...
+		while (vit != vtmp.end()) {
+			scoverage &v = s.covr[*vit];                    // coverage vector
+			if (v.reflen && histnum > 0) {                  // user asked for histogram
+				string sig;
+				int d; double logd, lsum=0, lssq=0;
+
+				for (d=0;d<histnum;++d) {                   // log counts for each portion of the histogram
+					logd = log(1+v.dist[d])/logb;
+					lsum+=logd;
+					lssq+=logd*logd;
+					sig += ('0' + (int) logd);
+				}
+				if (rnamode) {
+					// variability of coverage
+					double cv = stdev(histnum, lsum, lssq)/(lsum/histnum);
+					// percent coverage estimated using histogram... maybe track real coverage some day, for now this is fine
+					double covr = 0;
+					for (d=0;d<histnum;++d) {
+						// VFAC = % greater than 1 that a bin must be to be considered 100%
+						if (v.dist[d] > VFACTOR*v.reflen/histnum) {
+							++covr;     // 100% covered this bin
+						} else {
+							// calc bases/(factor * size of bin)
+							covr += ((double)v.dist[d] / ((double)VFACTOR*v.reflen/histnum));
+						}
+					}
+					covr /= (double) histnum;
+					covr = min(100.0*((double)v.mapb/v.reflen),100.0*covr);
+					// when dealing with "position skewness", you need to anchor things
+					v.spos.Push(v.reflen);
+					v.spos.Push(1);
+					double skew = -v.spos.Skewness();
+					// if there's some coverage
+					if (v.mapr > 0) {
+						if (v.mapr > 10) {
+							// summary stats
+							vcovr.push_back(covr);              // look at variation
+							vcovrvar.push_back(cv);             // look at variation
+							vskew.push_back(skew);              // and skew
+						}
+						if (rnao) {                         // "rna mode"  = more detailed output of coverage and skewness of coverage
+							fprintf(rnao,"%s\t%d\t%ld\t%.2f\t%.4f\t%.4f\t%s\n", vit->c_str(), v.reflen, v.mapr, covr, skew, cv, sig.c_str());
+						}
+					}
+				} else if (max_chr < 100) {                 // normal dna mode, just print percent alignment to each
+					fprintf(o,"%%%s\t%.2f\t%s\n", vit->c_str(), 100.0*((double)v.mapb/s.dat.lensum), sig.c_str());
+				} else {
+					fprintf(o,"%%%s\t%.6f\t%s\n", vit->c_str(), 100.0*((double)v.mapb/s.dat.lensum), sig.c_str());
+				}
+			} else {
+				if (max_chr < 100) {
+					fprintf(o,"%%%s\t%.2f\n", vit->c_str(), 100.0*((double)v.mapb/s.dat.lensum));
+				} else {
+					fprintf(o,"%%%s\t%.6f\n", vit->c_str(), 100.0*((double)v.mapb/s.dat.lensum));
+				}
+			}
+			++vit;
+		}
+		if (rnamode) {
+			// NOTE(review): these vectors are empty when no reference had more than
+			// 10 reads; confirm quantile() copes with an empty input
+			sort(vcovr.begin(), vcovr.end());
+			sort(vcovrvar.begin(), vcovrvar.end());
+			sort(vskew.begin(), vskew.end());
+			double medcovrvar = quantile(vcovrvar,.5);
+			double medcovr = quantile(vcovr,.5);
+			double medskew = quantile(vskew,.5);
+			fprintf(o,"median skew\t%.2f\n", medskew);
+			fprintf(o,"median coverage cv\t%.2f\n", medcovrvar);
+			fprintf(o,"median coverage\t%.2f\n", medcovr);
+		}
+	}
+	if (s.covr.size() > 1) {
+		fprintf(o,"num ref seqs\t%d\n", (int) s.covr.size());
+		fprintf(o,"num ref aligned\t%d\n", (int) mseq);
+	}
+}
+
+// Entry point: parse the options, then for every input (sam, bam, gzipped
+// sam/bam, or "-" for stdin) gather statistics and print a report.
+// Returns 1 if any warning was issued, 0 otherwise.
+int main(int argc, char **argv) {
+	const char *ext = NULL;
+	bool multi=0, newonly=0, inbam=0;
+	int fq_out=0;
+	const char *rnafile = NULL;
+	int c;                          // int, not char: getopt_long returns int, and a char never equals -1 where char is unsigned
+	optind = 0;
+	struct option long_options[] = {
+		{"fastq", no_argument, NULL, 'o'},
+		{0,0,0,0},
+	};
+	int long_index=0;
+	const char *prefix = NULL;
+
+	while ( (c = getopt_long(argc, argv, "?BzArR:Ddx:MhS:", long_options, &long_index)) != -1) {
+		switch (c) {
+		case 'd': ++debug; break;                                       // increment debug level
+		case 'D': trackdup = true; break;
+		case 'B': inbam=1; break;
+		case 'A': max_chr=1000000; break;                               // max chrom
+		case 'R': rnafile=optarg;                                       // pass through
+		case 'r': max_chr=1000000; rnamode=1; if (histnum < 60) histnum=60; break;
+		case 'O': prefix=optarg; break;                                 // not in the option string, so currently unreachable
+		case 'S': histnum=atoi(optarg); break;
+		case 'x': ext=optarg; break;
+		case 'M': newonly=1; break;
+		case 'z': allow_no_reads = true; break;
+		case 'o': fq_out=1; trackdup=1; break;                          // output suff
+		case 'h': usage(stdout); return 0;
+		case '?':
+			if (!optopt) {
+				usage(stdout); return 0;
+			} else if (optopt && strchr("RxS", optopt))                 // the options that take an argument
+				fprintf (stderr, "Option -%c requires an argument.\n", optopt);
+			else if (isprint(optopt))
+				fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+			else
+				fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt);
+			usage(stderr);
+			return 1;
+		}
+	}
+	(void) prefix; (void) newonly;              // parsed but not used yet
+
+	// recompute argc owing to getopt (is this necessary? i don't think so)
+	const char *stdv[3] = {argv[0],"-",NULL};
+	if (!argv[optind]) {
+		argc=2;
+		argv = (char **) stdv;
+		optind=1;
+	}
+
+	multi = (argc-optind-1) > 0;                // more than 1 input?
+	if (multi && !ext)
+		ext = "stats";                          // force serial processed extension-mode
+
+	build_basemap();                            // precompute matrices for rapid base->integer (A->0, C->1,. ...etc) lookups
+
+	debugout("argc:%d, argv[1]:%s, multi:%d, ext:%s\n", argc,argv[optind],multi,ext ? ext : "(null)");
+
+	const char *p;
+	// for each input file
+	for (;optind < argc;++optind) {
+		sstats s;
+		const char *in = argv[optind];
+		FILE *f;
+		FILE *o=NULL;
+		FILE *rnao=NULL;
+		bool needpclose = 0;
+		bool isbam = inbam;                     // per file, so one auto-detected bam doesn't force bam mode on later inputs
+
+		// decide input format
+		string out;
+		string fqbase;                          // output name before the stats extension is appended
+
+		if (!strcmp(in,"-")) {
+			// read sam/bam from stdin
+			if (ext||fq_out) {
+				warn("Can't use file extension with stdin\n");
+				continue;
+			}
+			f = stdin;
+			o = stdout;
+		} else {
+			if ((p = strrchr(in,'.')) && !strcmp(p, ".gz")) {
+				// extension mode... output to file minus .gz
+				// (taken now, while p still points into in; in may become "-" below)
+				if (ext||fq_out)
+					out=string(in, p-in);
+
+				// maybe this is a gzipped sam file...
+				string cmd = "gunzip -c " + quote_for_shell(in);
+				f = popen(cmd.c_str(), "r");
+				needpclose=1;
+				if (!f) {
+					warn("Can't unzip %s: %s\n", in, strerror(errno));
+					continue;
+				}
+				if (!isbam) {
+					// guess file format with 1 char
+					int first=getc(f); ungetc(first,f);
+					if (first==EOF) {
+						warn("Can't unzip %s\n", in);
+						pclose(f);
+						continue;
+					}
+					if (first==31) {
+						// bam file... reopen to reset stream... can't pass directly
+						pclose(f);
+						f = popen(cmd.c_str(), "r");
+						if (!f) {
+							warn("Can't unzip %s: %s\n", in, strerror(errno));
+							continue;
+						}
+						isbam=1;
+					}
+				}
+				if (isbam) {
+					// why did you gzip a bam... weird?
+					if (dup2(fileno(f),0) == -1) {
+						warn("Can't dup2 STDIN\n");
+						pclose(f);
+						continue;
+					}
+					in = "-";
+				}
+			} else {
+				f = fopen(in, "r");
+				if (!f) {
+					warn("Can't open %s: %s\n", in, strerror(errno));
+					continue;
+				}
+				// extension mode... output to file
+				if (ext||fq_out)
+					out=in;
+			}
+			fqbase = out;
+			if (fq_out) {
+				sefq=fopen((fqbase+".fq").c_str(),"w");
+				pefq1=fopen((fqbase+".fq1").c_str(),"w");
+				pefq2=fopen((fqbase+".fq2").c_str(),"w");
+			}
+			if (ext) {
+				( out += '.') += ext;
+				o=fopen(out.c_str(), "w");
+				if (!o) {
+					warn("Can't write %s: %s\n", out.c_str(), strerror(errno));
+					close_input(f, needpclose);
+					close_fastq_outputs();
+					continue;
+				}
+			} else
+				o=stdout;
+		}
+
+		// more guessing
+		debugout("file:%s, f: %lx\n", in, (long int) f);
+		int magic;
+		if (!isbam) {
+			// guess file format
+			magic=getc(f); ungetc(magic,f);
+			if (magic==31 && !strcmp(in,"-")) {
+				// if bamtools api allowed me to pass a stream, this wouldn't be an issue....
+				warn("Specify -B to read a bam file from standard input\n");
+				continue;
+			}
+		} else
+			magic = 31;		// 31 == bam
+
+		if (rnafile) {
+			rnao=fopen(rnafile,"w");
+			if (!rnao) {
+				warn("Can't write %s: %s\n", rnafile, strerror(errno));
+				return 1;
+			}
+		} else {
+			rnao=o;
+		}
+
+		// parse sam or bam as needed
+		if (magic != 31) {
+			// (could be an uncompressed bam... but can't magic in 1 char)
+			if (!s.parse_sam(f)) {
+				close_input(f, needpclose);
+				warn("Invalid or corrupt sam file %s\n", in);
+				close_fastq_outputs();
+				close_report_outputs(o, rnao);
+				continue;
+			}
+		} else {
+			if (!s.parse_bam(in)) {
+				close_input(f, needpclose);
+				warn("Invalid or corrupt bam file %s\n", in);
+				close_fastq_outputs();
+				close_report_outputs(o, rnao);
+				continue;
+			}
+		}
+		int ret = close_input(f, needpclose);
+		if (ret!=0) {
+			warn("Error closing '%s': %s\n", in, strerror(errno));
+			close_fastq_outputs();
+			close_report_outputs(o, rnao);
+			continue;
+		}
+
+		if (fq_out) {
+			// drop whichever flavour of fastq turned out not to apply
+			if(sefq && s.dat.pe) {
+				fclose(sefq);
+				sefq = NULL;
+				unlink((fqbase+".fq").c_str());
+			}
+			if (pefq1 && !s.dat.pe) {
+				fclose(pefq1);
+				if (pefq2) fclose(pefq2);
+				pefq1 = pefq2 = NULL;
+				unlink((fqbase+".fq1").c_str());
+				unlink((fqbase+".fq2").c_str());
+			}
+			// and flush/close the one that is kept
+			close_fastq_outputs();
+		}
+
+		emit_stats_report(o, rnao, s, in);
+		close_report_outputs(o, rnao);
+	}
+	return errs ? 1 : 0;
+}
+
+// Zero-based column indexes into a tab-split sam alignment line, as consumed
+// by sstats::parse_sam.  Index 7 has no name here because it is not read.
+#define S_ID 0
+#define S_BITS 1
+#define S_NMO 2
+#define S_POS 3
+#define S_MAPQ 4
+#define S_CIG 5
+#define S_MATEREF 6
+#define S_MATE 8
+#define S_READ 9
+#define S_QUAL 10
+// first column scanned for optional tags such as NM:i:
+#define S_TAG 11
+
+// Fold one alignment record into the running statistics.
+//
+//   name     read id (by value: it is trimmed at the first space when tracking dups)
+//   rlen     read length
+//   bits     sam FLAG field
+//   ref      reference name ("" if unknown)
+//   pos      1-based mapping position; <= 0 is counted as a skipped mapping
+//   mapq     mapping quality
+//   materef  mate reference name ("=", "*" or "" mean "same / none")
+//   nmate    signed insert size
+//   seq      read bases
+//   qual     quality characters; may be shorter than seq (sam "*") and is not
+//            necessarily NUL-terminated, so it is never read past seq.length()
+//   nm       NM tag value, 0 if absent
+//   del/ins  deleted / inserted base totals from the cigar
+void sstats::dostats(string name, int rlen, int bits, const string &ref, int pos, int mapq, const string &materef, int nmate, const string &seq, const char *qual, int nm, int del, int ins) {
+
+	++dat.n;
+
+	if (bits & 0x04) return;       // bits say ... query was not mapped
+
+	if (pos<=0) {
+		++dat.mapzero;             // quantify weird errors
+		return;                    // not mapped well enough to count
+	}
+
+	++dat.mapn;                    // mapped query
+
+	// TODO: build a histogram of read lengths using the integer bucket
+
+	// read length min/max
+	if (rlen > dat.lenmax) dat.lenmax = rlen;
+	if ((rlen < dat.lenmin) || dat.lenmin==0) dat.lenmin = rlen;
+
+	// read length sum/ssq (squared as double: rlen*rlen overflows int for long reads)
+	dat.lensum += rlen;
+	dat.lenssq += (double)rlen*rlen;
+
+	// TODO: allow for alternate paired-end layouts besides Illumina's
+
+	// Strand of the fragment: the second read of a pair points the other way,
+	// so its strand bit is flipped.  A read only counts as "second" when the
+	// paired flag (0x1) is set and the first-in-pair flag (0x40) is not;
+	// unpaired reads therefore count on the strand they actually aligned to.
+	const bool reversed = (bits & 0x10) != 0;
+	const bool second = (bits & 0x01) && !(bits & 0x40);
+	if (reversed != second)
+		++dat.nrev;
+	else
+		++dat.nfor;
+
+	// mapping quality mean/stdev
+	dat.mapsum += mapq;
+	dat.mapssq += mapq*mapq;
+
+	// mapping quality histogram
+	vmapq.push(mapq);
+
+	// TODO: NM histogram maybe?
+
+	// number of mismatches
+	if (nm > 0) {
+		// nm is snp+ins+del... which is silly
+		dat.nmnz += 1;                          // how many read are not perfect matches?
+		dat.nmsum += nm-del-ins;                // mismatch sum
+	}
+	dat.del+=del;                               // deletion sum
+	dat.ins+=ins;                               // insert sum
+
+	// if we know about the reference sequence
+	if (ref.length()) {
+		scoverage *sc = &(covr[ref]);
+		sc->mapb+=rlen;                         // total up mapped bases in that ref
+		if (rnamode) {                          // more detailed
+			int i;
+			sc->mapr+=1;
+			for (i=0;i<rlen;++i) {              // walk along read
+				sc->spos.Push(pos+i);           // per-position stats
+			}
+			if (histnum > 0 && sc->reflen > 0) {                            // if we're making a histogram
+				for (i=0;i<rlen;++i) {                                      // walk along read
+					int x = histnum * ((double)(pos+i) / sc->reflen);       // find the bucket this base is in
+					if (x < histnum) {
+						sc->dist[x]+=1;                                     // add 1 to that bucket
+					} else {
+						// out of bounds bases (fall off the edge) = extra bucket
+						sc->dist[histnum] += 1;
+					}
+				}
+			}
+		} else if (histnum > 0 && sc->reflen > 0) {                         // lightweight... don't deal with each base, ok because CHRs are big
+			int x = histnum * ((double)pos / sc->reflen);
+			if (debug > 1) {
+				// plain fprintf: a debug trace must not count as an error
+				fprintf(stderr, "chr: %s, hn: %d, pos: %d, rl: %d, x: %x\n", ref.c_str(), histnum, pos, sc->reflen, x);
+			}
+			if (x < histnum) {
+				sc->dist[x]+=rlen;
+			} else {
+				// out of bounds.... goes to the extra bucket
+				sc->dist[histnum] +=rlen;
+			}
+		}
+	}
+	// total mapped bases += read length
+	dat.tmapb+=rlen;
+	if (nmate>0) {
+		// insert size histogram
+		visize.push_back(nmate);
+		dat.pe=1;
+	}
+
+	// mate reference chromosome is not the same as my own?
+	if (materef.size() && (materef != "=" && materef != "*" && materef != ref)) {
+		// this is a discordant read
+		dat.disc++;
+	} else if (abs(nmate) > 50000) {
+		// mate is far (>50kb) from my own position: discordant-by position
+		dat.disc_pos++;
+	}
+
+	// How many quality characters can safely be read: stop at seq.length(), or
+	// earlier at a NUL (sam stores a missing quality string as "*").
+	size_t qlen = 0;
+	if (qual) {
+		while (qlen < seq.length() && qual[qlen] != '\0')
+			++qlen;
+	}
+
+	// walk along sequence, add qualities to overall min/max/mean/stdev
+	// NOTE(review): dat.qualmin starts at 0 and quality characters are positive,
+	// so it looks like it can never change and phred detection always lands on
+	// 33 -- confirm whether that is intended before touching it.
+	for (size_t i=0;i<qlen;++i) {
+		if (qual[i]>dat.qualmax) dat.qualmax=qual[i];
+		if (qual[i]<dat.qualmin) dat.qualmin=qual[i];
+		dat.qualsum+=qual[i];
+		dat.qualssq+=qual[i]*qual[i];
+		// also count bases (index as unsigned char: a negative char would underrun basemap)
+		++dat.basecnt[basemap[(unsigned char)seq[i]]];
+		// total number of bases counted (this should be the same as tmapb???   get rid of it???)
+		++dat.nbase;
+	}
+
+	// TODO: we should be able to use the "non primary" bit field
+	//       need to test to see if this works for all aligners
+	//       then have a mode that only report stats for primary alignments... for example, and no need for this
+	//       expensive, giant hash table
+
+	// duplicate tracking turned on?
+	if (!trackdup)
+		return;
+
+	size_t p;
+	// illumina mode... check for a space in the name, and ignore stuff after it
+	if ((p = name.find_first_of(' '))!=string::npos)
+		name.resize(p);
+
+	// count dups for that id
+	int x=++dups[name];
+
+	// keep track of max dups
+	if (x>dat.dupmax)
+		dat.dupmax=x;
+
+	// bounded copy of the quality string for printing
+	const string qstr(qual ? qual : "", qlen);
+
+	// fastq-output mode...
+	if (sefq) {
+		// if the data isn't paired end or if we're not sure yet
+		if (!dat.pe || dat.mapn < 1000) {
+			// output a single end fq
+			fprintf(sefq,"@%s\n%s\n+\n%s\n",name.c_str(), seq.c_str(), qstr.c_str());
+		}
+	}
+
+	// if we're outputting paired-end fastq's and if there's not a lot of dups
+	if (pefq1 && pefq2 && x < 4 && (dat.pe || dat.mapn < 1000)) {
+		const int mate = bits & 0x40;           // mate flag: non-zero for the first read of a pair
+		google::sparse_hash_map<string,fqent>::iterator it=petab.find(name);
+		if (it == petab.end()) {
+			// mate not seen yet, park this read
+			fqent fq;
+			fq.r=seq;
+			fq.q=qstr;
+			fq.bits=mate;
+			petab[name]=fq;
+		} else if (it->second.bits != mate) {
+			// the parked read is the other end: write the pair, first read to file 1
+			const fqent &held = it->second;
+			if (mate) {
+				fprintf(pefq1,"@%s 1\n%s\n+\n%s\n",name.c_str(), seq.c_str(), qstr.c_str());
+				fprintf(pefq2,"@%s 2\n%s\n+\n%s\n",name.c_str(), held.r.c_str(), held.q.c_str());
+			} else {
+				fprintf(pefq1,"@%s 1\n%s\n+\n%s\n",name.c_str(), held.r.c_str(), held.q.c_str());
+				fprintf(pefq2,"@%s 2\n%s\n+\n%s\n",name.c_str(), seq.c_str(), qstr.c_str());
+			}
+			petab.erase(it);
+		}
+		// same end seen again (another alignment of the parked read): nothing to pair
+	}
+}
+
+// parse a sam file... maybe let samtools do this, and then handle stats in "bam mode"... faster for sure
+//
+// Reads f line by line: @SQ header lines record reference lengths in covr,
+// other header lines are skipped, and every alignment line is split on tabs
+// and handed to dostats().  Returns false (after a warning) on the first line
+// that is not a usable alignment record, true at end of input.
+// NOTE(review): the buffer owned by l is not released here -- confirm how
+// fastq-lib expects a line to be freed.
+bool sstats::parse_sam(FILE *f) {
+	enum { MAX_FIELDS = 100 };              // capacity of the column table below
+	line l; meminit(l);
+	int lineno=0;
+	while (read_line(f, l)>0)  {
+		++lineno;
+		char *sp;
+		if (l.s[0]=='@') {
+			if (!strncmp(l.s,"@SQ\t",4)) {
+				char *t=strtok_r(l.s, "\t", &sp);
+				string sname; int slen=0;
+				while(t) {
+					if (!strncmp(t,"SN:",3)) {
+						sname=&(t[3]);
+						if (slen)
+							break;
+					} else if (!strncmp(t,"LN:",3)) {
+						slen=atoi(&t[3]);
+						if (sname.length())
+							break;
+					}
+					t=strtok_r(NULL, "\t", &sp);
+				}
+				// an @SQ line without a name has nothing to attach the length to
+				if (sname.length())
+					covr[sname].reflen=slen;
+			}
+			continue;
+		}
+
+		// split into columns; anything past MAX_FIELDS is ignored instead of overrunning d
+		char *d[MAX_FIELDS]; meminit(d);
+		int n =0;
+		char *t=strtok_r(l.s, "\t", &sp);
+		while(t && n < MAX_FIELDS) {
+			d[n++]=t;
+			t=strtok_r(NULL, "\t", &sp);
+		}
+
+		// get # mismatches
+		int nm=0;
+		for (int i=S_TAG;i<n;++i){
+			if (d[i] && !strncasecmp(d[i],"NM:i:",5)) {
+				nm=atoi(&d[i][5]);
+			}
+		}
+
+		if (!d[S_BITS] || !isdigit((unsigned char)d[S_BITS][0])
+		 || !d[S_POS]  || !isdigit((unsigned char)d[S_POS][0])
+		   ) {
+			warn("Line %d, missing bits/position information\n", lineno);
+			// invalid sam
+			return false;
+		}
+		if (n <= S_QUAL) {
+			// every column up to the quality string is dereferenced below
+			warn("Line %d, too few fields\n", lineno);
+			return false;
+		}
+
+		// sum the cig
+		int ins = 0, del = 0;
+		char *p=d[S_CIG];
+		while (*p) {
+			char *op;
+			long cnt=strtol(p, &op, 10);
+			if (op==p || !*op) {
+				// no number, or a number with no operator after it: stop
+				// rather than step past the end of the string
+				break;
+			}
+			if (*op == 'I')
+				ins+=cnt;
+			else if (*op == 'D')
+				del+=cnt;
+			p=op+1;
+		}
+
+		// force unmapped to position negative one
+		if (d[S_CIG][0] == '*') d[S_POS] = (char *) "-1";
+
+		// as-if it were a bam...  (dostats takes nm, del, ins -- in that order)
+		dostats(d[S_ID],strlen(d[S_READ]),atoi(d[S_BITS]),d[S_NMO],atoi(d[S_POS]),atoi(d[S_MAPQ]),d[S_MATEREF],atoi(d[S_MATE]),d[S_READ],d[S_QUAL],nm, del, ins);
+	}
+	return true;
+}
+
+// let samtools parse the bam
+//
+// Opens "in" (a path, or "-" for standard input) with the samtools api,
+// records the reference lengths from the header, and feeds every record to
+// dostats().  Returns false if the file can't be opened or the read loop ends
+// with a status below -2; a status of exactly -2 bumps errs but still returns
+// true so the stats gathered so far are printed.
+bool sstats::parse_bam(const char *in) {
+	samfile_t *fp;
+	if (!(fp=samopen(in, "rb", NULL))) {
+		warn("Error reading '%s': %s\n", in, strerror(errno));
+		return false;
+	}
+	const int ntargets = fp->header ? fp->header->n_targets : 0;
+	for (int t = 0; t < ntargets; ++t) {
+		covr[fp->header->target_name[t]].reflen=fp->header->target_len[t];
+	}
+
+	bam1_t *al=bam_init1();
+	int ret=0;
+	while ( (ret=samread(fp, al)) > 0 ) {
+		uint32_t *cig = bam1_cigar(al);
+		char *name = bam1_qname(al);
+		int len = al->core.l_qseq;
+		uint8_t *tag=bam_aux_get(al, "NM");     // NM tag
+		int nm = tag ? bam_aux2i(tag) : 0;
+		int ins=0, del=0;
+		int i;
+
+		// count inserts and deletions
+		for (i=0;i<(int)al->core.n_cigar;++i) {
+			int op = cig[i] & BAM_CIGAR_MASK;
+			if (op == BAM_CINS) {
+				ins+=(cig[i] >> BAM_CIGAR_SHIFT);
+			} else if (op == BAM_CDEL) {
+				del+=(cig[i] >> BAM_CIGAR_SHIFT);
+			}
+		}
+
+		// crappy cigar?
+		if (al->core.n_cigar == 0)
+			al->core.pos=-1;                    // not really a match if there's no cigar string... this deals with bwa's issue
+
+		char *qual = (char *) bam1_qual(al);    // qual string
+		uint8_t * bamseq = bam1_seq(al);        // sequence string
+		string seq; seq.resize(len);            // ok... really make it a string
+		for (i=0;i<len;++i) {
+			seq[i] = bam_nt16_rev_table[bam1_seqi(bamseq, i)];
+			qual[i] += 33;                      // shift to printable characters, in place
+		}
+
+		// reference names, "" when the id is unset or outside the header table
+		const char *rname = (al->core.tid>=0 && al->core.tid<ntargets) ? fp->header->target_name[al->core.tid] : "";
+		const char *mname = (al->core.mtid>=0 && al->core.mtid<ntargets) ? fp->header->target_name[al->core.mtid] : "";
+
+		// now do stats  (dostats takes nm, del, ins -- in that order)
+		dostats(name,len,al->core.flag,rname,al->core.pos+1,al->core.qual, mname, al->core.isize, seq, qual, nm, del, ins);
+	}
+
+	bam_destroy1(al);
+	// The caller closes standard input itself, so only release handles opened
+	// by name here.
+	// NOTE(review): presumably samclose() would also close fd 0 for "-" -- confirm.
+	if (strcmp(in, "-"))
+		samclose(fp);
+
+	if (ret < -2) {
+		// no stats .. corrupt file
+		return false;
+	}
+	if (ret < -1) {
+		// truncated file, output stats, but return error code
+		++errs;
+	}
+	return true;
+}
+
+// Write the command-line help for sam-stats to `f`. The two format
+// conversions in the text are filled with VERSION and SVNREV.
+void usage(FILE *f) {
+        static const char help_text[] =
+"Usage: sam-stats [options] [file1] [file2...filen]\n"
+"Version: %s.%d\n"
+"\n"
+"Produces lots of easily digested statistics for the files listed\n"
+"\n"
+"Options (default in parens):\n"
+"\n"
+"-D             Keep track of multiple alignments\n"
+"-O PREFIX      Output prefix enabling extended output (see below)\n"
+"-R FIL         Coverage/RNA output (coverage, 3' bias, etc, implies -A)\n"
+"-A             Report all chr sigs, even if there are more than 1000\n"
+"-b INT         Number of reads to sample for per-base stats (1M)\n"
+"-S INT         Size of ascii-signature (30)\n"
+"-x FIL         File extension for handling multiple files (stats)\n"
+"-M             Only overwrite if newer (requires -x, or multiple files)\n"
+"-B             Input is bam, don't bother looking at magic\n"
+"-z             Don't fail when zero entries in sam\n"
+"\n"
+"OUTPUT:\n"
+"\n"
+"If one file is specified, then the output is to standard out.  If\n"
+"multiple files are specified, or if the -x option is supplied,\n"
+"the output file is <filename>.<ext>.  Default extension is 'stats'.\n"
+"\n"
+"Complete Stats:\n"
+"\n"
+"  <STATS>           : mean, max, stdev, median, Q1 (25 percentile), Q3\n"
+"  reads             : # of entries in the sam file, might not be # reads\n"
+"  phred             : phred scale used\n"
+"  bsize             : # reads used for qual stats\n"
+"  mapped reads      : number of aligned reads (unique probe id sequences)\n"
+"  mapped bases      : total of the lengths of the aligned reads\n"
+"  forward           : number of forward-aligned reads\n"
+"  reverse           : number of reverse-aligned reads\n"
+"  snp rate          : mismatched bases / total bases (snv rate)\n"
+"  ins rate          : insert bases / total bases\n"
+"  del rate          : deleted bases / total bases\n"
+"  pct mismatch      : percent of reads that have mismatches\n"
+"  pct align         : percent of reads that aligned\n"
+"  len <STATS>       : read length stats, ignored if fixed-length\n"
+"  mapq <STATS>      : stats for mapping qualities\n"
+"  insert <STATS>    : stats for insert sizes\n"
+"  %%<CHR>           : percentage of mapped bases per chr, followed by a signature\n"
+"\n"
+"Subsampled stats (1M reads max):\n"
+"  base qual <STATS> : stats for base qualities\n"
+"  %%A,%%T,%%C,%%G       : base percentages\n"
+"\n"
+"Meaning of the per-chromosome signature:\n"
+"  A ascii-histogram of mapped reads by chromosome position.\n"
+"  It is only output if the original SAM/BAM has a header. The values\n"
+"  are the log2 of the # of mapped reads at each position + ascii '0'.\n"
+"\n"
+"Extended output mode produces a set of files:\n"
+"  .stats           : primary output\n"
+"  .fastx           : fastx-toolkit compatible output\n"
+"  .rcov            : per-reference counts & coverage\n"
+"  .xdist           : mismatch distribution\n"
+"  .ldist           : length distribution (if applicable)\n"
+"  .mqdist          : mapping quality distribution\n"
+"\n";
+
+        fprintf(f, help_text, VERSION, SVNREV);
+}
+
+// printf-style formatting into a std::string.
+//
+// Formats into a scratch buffer that starts at 100 bytes and grows until the
+// output fits, then returns a string holding exactly the formatted text
+// (previously the result kept the scratch size and carried trailing NULs).
+//
+// NOTE(review): calling va_start() with a reference-typed last named
+// parameter is undefined behaviour in standard C++; the signature is kept
+// because callers depend on it.
+std::string string_format(const std::string &fmt, ...) {
+       int size=100;
+       std::string str;
+       va_list ap;
+       while (1) {
+           str.resize(size);
+           va_start(ap, fmt);
+           int n = vsnprintf(&str[0], size, fmt.c_str(), ap);
+           va_end(ap);
+           if (n > -1 && n < size) {
+               str.resize(n);      // drop the unused tail of the scratch buffer
+               return str;
+           }
+           if (n > -1)
+               size=n+1;           // C99: n is the length that was needed
+           else
+               size*=2;            // older libc: only reports "too small"
+       }
+}
+
+// R-compatible quantile code : TODO convert to template
+//
+// Generic version for any container offering size() and operator[].
+// The fractional rank is t = (size-1)*p; the result is vec[floor(t)] linearly
+// interpolated towards the next element. Returns 0 for an empty container.
+// Presumably `vec` is sorted ascending and 0 <= p <= 1 - verify at call sites.
+// The anchor element is held as a double so that non-integer element types
+// are no longer truncated.
+template <class vtype>
+double quantile(const vtype &vec, double p) {
+        int l = vec.size();
+        if (!l) return 0;
+        double t = ((double)l-1)*p;
+        int it = (int) t;
+        double v=vec[it];
+        if (t > (double)it) {
+                return (v + (t-it) * (vec[it+1] - v));
+        } else {
+                return v;
+        }
+}
+
+// Quantile of a vector: element at fractional rank (size-1)*p, linearly
+// interpolated between its two neighbours. An empty vector yields 0.
+template <class itype>
+double quantile(const vector<itype> &vec, double p) {
+        const int count = vec.size();
+        if (count == 0)
+                return 0;
+        const double rank = ((double)count-1)*p;
+        const int lower = (int) rank;
+        const itype base = vec[lower];
+        // an exact hit on an element needs no interpolation
+        if (!(rank > (double)lower))
+                return base;
+        return (base + (rank-lower) * (vec[lower+1] - base));
+}
+
+// Populate basemap[] for all 256 byte values: upper- and lower-case A, C, G
+// and T get their T_* code, every other byte gets T_N.
+void build_basemap() {
+	int ch;
+	// default every byte value to T_N
+	for (ch=0;ch<256;++ch)
+		basemap[ch]=T_N;
+	// then overwrite the eight recognised letters
+	basemap['A']=T_A;
+	basemap['a']=T_A;
+	basemap['C']=T_C;
+	basemap['c']=T_C;
+	basemap['G']=T_G;
+	basemap['g']=T_G;
+	basemap['T']=T_T;
+	basemap['t']=T_T;
+}
+
+
diff --git a/sam.c b/sam.c
new file mode 100644
index 0000000..fa11df6
--- /dev/null
+++ b/sam.c
@@ -0,0 +1,186 @@
+#include <string.h>
+#include <unistd.h>
+#include "faidx.h"
+#include "sam.h"
+
+#define TYPE_BAM  1
+#define TYPE_READ 2
+
+/* Deep-copy a BAM header.
+ *
+ * The scalar fields are copied, the text and the target name/length arrays
+ * are duplicated, and the hash/dict/rg2lib members are reset to 0 in the copy.
+ * Returns 0 when h0 is 0. */
+bam_header_t *bam_header_dup(const bam_header_t *h0)
+{
+	bam_header_t *h;
+	int i;
+	if (h0 == 0) return 0;
+	h = bam_header_init();
+	*h = *h0;
+	h->hash = h->dict = h->rg2lib = 0;
+	h->text = (char*)calloc(h->l_text + 1, 1);
+	// a header may carry no text at all; never memcpy from a NULL source
+	if (h0->text) memcpy(h->text, h0->text, h->l_text);
+	h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(*h->target_len));
+	h->target_name = (char**)calloc(h->n_targets, sizeof(*h->target_name));
+	for (i = 0; i < h->n_targets; ++i) {
+		h->target_len[i] = h0->target_len[i];
+		h->target_name[i] = strdup(h0->target_name[i]);
+	}
+	return h;
+}
+/* Append `len` bytes of `text` to header->text and keep it NUL-terminated.
+ *
+ * The buffer is always resized to the exact size needed. The previous
+ * rounded-capacity comparison could skip the realloc when the existing
+ * buffer was smaller than its rounded size, or NULL.
+ * Does nothing when text is 0, len is negative or memory runs out. */
+static void append_header_text(bam_header_t *header, char* text, int len)
+{
+	char *grown;
+	int need;
+	if (text == 0 || len < 0) return;
+	need = header->l_text + len + 1; // 1 byte null
+	grown = (char*)realloc(header->text, need);
+	if (grown == 0) return; // out of memory: leave the header untouched
+	header->text = grown;
+	strncpy(header->text + header->l_text, text, len); // we cannot use strcpy() here.
+	header->l_text += len;
+	header->text[header->l_text] = 0;
+}
+
+/* Enable multi-threaded BGZF on a handle whose type has the BAM bit set and
+ * the READ bit clear. Returns 0 on success, -1 for any other handle type. */
+int samthreads(samfile_t *fp, int n_threads, int n_sub_blks)
+{
+	const int is_bam  = (fp->type & TYPE_BAM) != 0;
+	const int is_read = (fp->type & TYPE_READ) != 0;
+	if (is_bam && !is_read) {
+		bgzf_mt(fp->x.bam, n_threads, n_sub_blks);
+		return 0;
+	}
+	return -1;
+}
+
+/* Open a SAM/BAM file for reading or writing.
+ *
+ * fn    file name, "-" selects stdin (read) or stdout (write)
+ * mode  'r' read or 'w' write; 'b' selects BAM; for BAM output a digit sets
+ *       the compression level and 'u' forces level 0; for text output 'h'
+ *       writes the header and 'x'/'X' select the hex/string flag format
+ * aux   read, text: passed to sam_header_read2() when the file has no @SQ
+ *       lines; write: the bam_header_t to duplicate and emit (required)
+ *
+ * Returns the new handle, or 0 on failure. Error paths release everything
+ * acquired so far (header, text header, open SAM stream). */
+samfile_t *samopen(const char *fn, const char *mode, const void *aux)
+{
+	samfile_t *fp;
+	fp = (samfile_t*)calloc(1, sizeof(samfile_t));
+	if (fp == 0) return 0;
+	if (strchr(mode, 'r')) { // read
+		fp->type |= TYPE_READ;
+		if (strchr(mode, 'b')) { // binary
+			fp->type |= TYPE_BAM;
+			fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
+			if (fp->x.bam == 0) goto open_err_ret;
+			fp->header = bam_header_read(fp->x.bam);
+		} else { // text
+			fp->x.tamr = sam_open(fn);
+			if (fp->x.tamr == 0) goto open_err_ret;
+			fp->header = sam_header_read(fp->x.tamr);
+			if (fp->header->n_targets == 0) { // no @SQ fields
+				if (aux) { // check if aux is present
+					bam_header_t *textheader = fp->header;
+					fp->header = sam_header_read2((const char*)aux);
+					if (fp->header == 0) {
+						// do not leak the text header or the open SAM stream
+						bam_header_destroy(textheader);
+						sam_close(fp->x.tamr);
+						goto open_err_ret;
+					}
+					append_header_text(fp->header, textheader->text, textheader->l_text);
+					bam_header_destroy(textheader);
+				}
+				if (fp->header->n_targets == 0 && bam_verbose >= 1)
+					fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
+			} else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
+		}
+	} else if (strchr(mode, 'w')) { // write
+		if (aux == 0) goto open_err_ret; // writing needs a header to copy
+		fp->header = bam_header_dup((const bam_header_t*)aux);
+		if (strchr(mode, 'b')) { // binary
+			char bmode[3];
+			int i, compress_level = -1;
+			for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break;
+			if (mode[i]) compress_level = mode[i] - '0';
+			if (strchr(mode, 'u')) compress_level = 0;
+			bmode[0] = 'w'; bmode[1] = compress_level < 0? 0 : compress_level + '0'; bmode[2] = 0;
+			fp->type |= TYPE_BAM;
+			fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode);
+			if (fp->x.bam == 0) goto open_err_ret;
+			bam_header_write(fp->x.bam, fp->header);
+		} else { // text
+			// open file
+			fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout;
+			if (fp->x.tamw == 0) goto open_err_ret;
+			if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2;
+			else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2;
+			else fp->type |= BAM_OFDEC<<2;
+			// write header
+			if (strchr(mode, 'h')) {
+				int i;
+				bam_header_t *alt;
+				// parse the header text
+				alt = bam_header_init();
+				alt->l_text = fp->header->l_text; alt->text = fp->header->text;
+				sam_header_parse(alt);
+				alt->l_text = 0; alt->text = 0; // text is borrowed: detach before destroying alt
+				// check if there are @SQ lines in the header
+				fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL
+				if (alt->n_targets) { // then write the header text without dumping ->target_{name,len}
+					if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1)
+						fprintf(stderr, "[samopen] inconsistent number of target sequences. Output the text header.\n");
+				} else { // then dump ->target_{name,len}
+					for (i = 0; i < fp->header->n_targets; ++i)
+						fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]);
+				}
+				bam_header_destroy(alt);
+			}
+		}
+	}
+	return fp;
+
+open_err_ret:
+	// a header may already be attached (write mode duplicates it before opening)
+	if (fp->header) bam_header_destroy(fp->header);
+	free(fp);
+	return 0;
+}
+
+/* Release a handle: destroy its header, close the underlying BAM stream,
+ * SAM reader or text output file, then free the handle. A 0 handle is a
+ * no-op. */
+void samclose(samfile_t *fp)
+{
+	if (fp != 0) {
+		if (fp->header != 0) bam_header_destroy(fp->header);
+		if ((fp->type & TYPE_BAM) != 0) {
+			bam_close(fp->x.bam);
+		} else if ((fp->type & TYPE_READ) != 0) {
+			sam_close(fp->x.tamr);
+		} else {
+			fclose(fp->x.tamw);
+		}
+		free(fp);
+	}
+}
+
+/* Read the next alignment into b, using the BAM or the text SAM reader
+ * depending on the handle type. Returns -1 when fp is 0 or was not opened
+ * for reading; otherwise the value of the underlying reader. */
+int samread(samfile_t *fp, bam1_t *b)
+{
+	if (fp != 0 && (fp->type & TYPE_READ)) {
+		if (fp->type & TYPE_BAM)
+			return bam_read1(fp->x.bam, b);
+		return sam_read1(fp->x.tamr, fp->header, b);
+	}
+	return -1; // not open for reading
+}
+
+/* Write one alignment. BAM handles delegate to bam_write1(); text handles
+ * format the record, emit it followed by a newline and return the number of
+ * characters written including that newline. Returns -1 when fp is 0 or was
+ * opened for reading. */
+int samwrite(samfile_t *fp, const bam1_t *b)
+{
+	char *text;
+	int written;
+	if (fp == 0 || (fp->type & TYPE_READ)) return -1; // not open for writing
+	if (fp->type & TYPE_BAM) return bam_write1(fp->x.bam, b);
+	// bits 2-3 of type hold the flag output format chosen in samopen()
+	text = bam_format1_core(fp->header, b, (fp->type >> 2) & 3);
+	written = strlen(text) + 1;
+	fputs(text, fp->x.tamw);
+	fputc('\n', fp->x.tamw);
+	free(text);
+	return written;
+}
+
+/* Push every alignment readable from fp through a pileup buffer that calls
+ * func(..., func_data); `mask` is handed to bam_plbuf_set_mask(). A final
+ * push of 0 follows the last read. Always returns 0. */
+int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data)
+{
+	bam1_t *rec = bam_init1();
+	bam_plbuf_t *plbuf = bam_plbuf_init(func, func_data);
+
+	bam_plbuf_set_mask(plbuf, mask);
+	for (;;) {
+		if (samread(fp, rec) < 0) break;
+		bam_plbuf_push(rec, plbuf);
+	}
+	bam_plbuf_push(0, plbuf);
+
+	bam_plbuf_destroy(plbuf);
+	bam_destroy1(rec);
+	return 0;
+}
+
+/* Return a newly allocated "<fn_ref>.fai" path.
+ *
+ * If the index is not readable but the reference is, the index is built with
+ * fai_build(); a failed build frees the path and returns 0. If the reference
+ * itself is unreadable a message is printed and the path is still returned.
+ * Returns 0 when fn_ref is 0. */
+char *samfaipath(const char *fn_ref)
+{
+	char *fai_path;
+	if (fn_ref == 0) return 0;
+
+	fai_path = calloc(strlen(fn_ref) + 5, 1);
+	strcpy(fai_path, fn_ref);
+	strcat(fai_path, ".fai");
+
+	if (access(fai_path, R_OK) != -1)
+		return fai_path; // index already present and readable
+
+	if (access(fn_ref, R_OK) == -1) {
+		fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref);
+		return fai_path;
+	}
+
+	if (bam_verbose >= 3) fprintf(stderr, "[samfaipath] build FASTA index...\n");
+	if (fai_build(fn_ref) == -1) {
+		fprintf(stderr, "[samfaipath] fail to build FASTA index.\n");
+		free(fai_path);
+		fai_path = 0;
+	}
+	return fai_path;
+}
diff --git a/sam_header.c b/sam_header.c
new file mode 100644
index 0000000..a1b5181
--- /dev/null
+++ b/sam_header.c
@@ -0,0 +1,772 @@
+#include "sam_header.h"
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(str, const char *)
+
+// Singly linked list node. The same node type is used for the list of header
+// lines (data is a HeaderLine*) and for the tags of one line (data is a
+// HeaderTag*).
+struct _HeaderList
+{
+    struct _HeaderList *last;   // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only.
+    struct _HeaderList *next;   // following node, NULL at the end of the list
+    void *data;                 // payload; the list does not own or free it
+};
+typedef struct _HeaderList list_t;
+// A parsed header is simply the root node of a list of HeaderLine entries.
+typedef list_t HeaderDict;
+
+// One "XY:value" tag of a header line.
+typedef struct
+{
+    char key[2];    // two-character tag name, not NUL-terminated
+    char *value;    // heap-allocated, NUL-terminated tag value
+}
+HeaderTag;
+
+// One header line: its two-character record type plus its tags.
+typedef struct
+{
+    char type[2];   // record type characters, not NUL-terminated
+    list_t *tags;   // list of HeaderTag*, NULL when the line has no tags
+}
+HeaderLine;
+
+// Per-record-type tag tables, each NULL-terminated. Prefix o_ = optional
+// tags, r_ = required tags, u_ = tags whose value identifies the line.
+const char *o_hd_tags[] = {"SO","GO",NULL};
+const char *r_hd_tags[] = {"VN",NULL};
+
+const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL};
+const char *r_sq_tags[] = {"SN","LN",NULL};
+const char *u_sq_tags[] = {"SN",NULL};
+
+const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL};
+const char *r_rg_tags[] = {"ID",NULL};
+const char *u_rg_tags[] = {"ID",NULL};
+
+const char *o_pg_tags[] = {"VN","CL",NULL};
+const char *r_pg_tags[] = {"ID",NULL};
+
+// types[] lists the known record types; the three tables below are indexed by
+// a type's position in types[] (as returned by tag_exists(type, types)).
+// The CO entry has no tables at all, which the parser uses to recognise it as
+// free-form text.
+const char *types[]          = {"HD","SQ","RG","PG","CO",NULL};
+const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL};
+const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL};
+const char **unique_tags[]   = {NULL,     u_sq_tags,u_rg_tags,NULL,NULL,NULL};
+
+
+// Emit a printf-style diagnostic on stderr.
+static void debug(const char *format, ...)
+{
+    va_list args;
+
+    va_start(args, format);
+    (void) vfprintf(stderr, format, args);
+    va_end(args);
+}
+
+#if 0
+// Disabled (compiled out): pushes a new node in front of root and returns it
+// as the new root. Note that it never sets the node's `last` member.
+// Replaced by list_append_to_end
+static list_t *list_prepend(list_t *root, void *data)
+{
+    list_t *l = malloc(sizeof(list_t));
+    l->next = root;
+    l->data = data;
+    return l;
+}
+#endif
+
+// Constant-time append using the tail pointer cached in root->last.
+// Only valid for lists built exclusively with this function: the other
+// list_* routines do not maintain root->last.
+// Returns the root of the list (the new node when root was NULL).
+static list_t *list_append_to_end(list_t *root, void *data)
+{
+    list_t *node = malloc(sizeof(list_t));
+    node->data = data;
+    node->next = NULL;
+    node->last = node;
+
+    if ( root )
+    {
+        // hook the node behind the cached tail and advance the cache
+        root->last->next = node;
+        root->last = node;
+        return root;
+    }
+    return node;
+}
+
+// Append data at the end of the list by walking it from root. Returns the
+// root (the new node when root was NULL). The node's `last` member is left
+// unset.
+static list_t *list_append(list_t *root, void *data)
+{
+    list_t *tail;
+    list_t *node = malloc(sizeof(list_t));
+    node->data = data;
+    node->next = NULL;
+
+    if ( !root )
+        return node;
+
+    for (tail = root; tail->next; tail = tail->next)
+        ;
+    tail->next = node;
+    return root;
+}
+
+// Free every node of the list; the data pointers are not touched.
+static void list_free(list_t *root)
+{
+    while (root)
+    {
+        list_t *next = root->next;
+        free(root);
+        root = next;
+    }
+}
+
+
+
+// Find the two-character tag in a NULL-terminated array of tag names.
+// Only the first two characters are compared. Returns the index of the
+// match, or -1 when there is none or when tags is NULL.
+static int tag_exists(const char *tag, const char **tags)
+{
+    int i;
+    if ( tags == NULL ) return -1;
+    for (i = 0; tags[i] != NULL; i++)
+    {
+        if ( tags[i][0]==tag[0] && tags[i][1]==tag[1] )
+            return i;
+    }
+    return -1;
+}
+
+
+
+// Mimics the behaviour of getline, except it returns pointer to the next chunk of the text
+//  or NULL if everything has been read. The lineptr should be freed by the caller. The
+//  newline character is stripped.
+//
+//  lineptr/n  caller-owned buffer and its capacity; grown when too small
+//  text       current position in the NUL-terminated input
+//
+// Line terminators "\n", "\r\n" and a lone "\r" are all consumed, so the
+//  returned pointer always advances. NULL is also returned when memory runs
+//  out; in that case the caller's buffer is left intact.
+static const char *nextline(char **lineptr, size_t *n, const char *text)
+{
+    size_t len;
+    const char *to = text;
+
+    if ( !*to ) return NULL;
+
+    while ( *to && *to!='\n' && *to!='\r' ) to++;
+    len = to - text + 1;    // line length plus the terminating NUL
+
+    // Advance the pointer for the next call
+    if ( *to=='\r' ) to++;
+    if ( *to=='\n' ) to++;
+
+    if ( !*lineptr || *n<len )
+    {
+        // keep the old buffer reachable if realloc fails
+        char *grown = realloc(*lineptr, len);
+        if ( !grown ) {
+            debug("[nextline] Insufficient memory!\n");
+            return 0;
+        }
+        *lineptr = grown;
+        *n = len;
+    }
+
+    memcpy(*lineptr,text,len-1);
+    (*lineptr)[len-1] = 0;
+
+    return to;
+}
+
+// name points to "XY", value_from points to the first character of the value string and
+//  value_to points to the last character of the value string.
+// Returns a newly allocated tag holding a NUL-terminated copy of the value.
+//  If value_to lies before value_from the value is stored as an empty string
+//  (a negative length used to write before the start of the buffer).
+static HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
+{
+    HeaderTag *tag = malloc(sizeof(HeaderTag));
+    int len = value_to-value_from+1;
+    if ( len < 0 ) len = 0;
+
+    tag->key[0] = name[0];
+    tag->key[1] = name[1];
+    tag->value = malloc(len+1);
+    memcpy(tag->value,value_from,len);   // copy the value only; the terminator is set below
+    tag->value[len] = 0;
+    return tag;
+}
+
+// Return the first tag of hline whose two-character key equals key, or NULL.
+static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
+{
+    list_t *node;
+    for (node = hline->tags; node; node = node->next)
+    {
+        HeaderTag *candidate = node->data;
+        if ( candidate->key[0]==key[0] && candidate->key[1]==key[1] )
+            return candidate;
+    }
+    return NULL;
+}
+
+
+// Decide how two header lines relate for merging. Return codes:
+//  -1 .. both lines share a type that is not listed in types[]
+//   0 .. different types or unique tags differ or conflicting tags, cannot be merged
+//   1 .. all tags identical -> no need to merge, drop one
+//   2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated
+//   3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line
+static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2)
+{
+    HeaderTag *a, *b;
+    const char **req, **opt;
+    int itype, i, has_unique, incomplete = 0;
+
+    if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] )
+        return 0;
+
+    itype = tag_exists(hline1->type,types);
+    if ( itype==-1 ) {
+        debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]);
+        return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code
+    }
+
+    has_unique = unique_tags[itype] != NULL;
+    req = required_tags[itype];
+    opt = optional_tags[itype];
+
+    if ( has_unique )
+    {
+        a = header_line_has_tag(hline1,unique_tags[itype][0]);
+        b = header_line_has_tag(hline2,unique_tags[itype][0]);
+        if ( !a || !b ) // this should never happen, the unique tags are required
+            return 2;
+        if ( strcmp(a->value,b->value) )
+            return 0;   // the unique tags differ, cannot be merged
+    }
+
+    if ( !req && !opt )
+    {
+        // type without tag tables: compare the first tag's text only
+        a = hline1->tags->data;
+        b = hline2->tags->data;
+        return strcmp(a->value,b->value) ? 0 : 1;
+    }
+
+    for (i = 0; req && req[i]; i++)
+    {
+        a = header_line_has_tag(hline1,req[i]);
+        b = header_line_has_tag(hline2,req[i]);
+        if ( !a && !b )
+            return 2;       // this should never happen
+        if ( !a || !b )
+        {
+            incomplete = 1; // present on one side only
+            continue;
+        }
+        if ( strcmp(a->value,b->value) )
+            return has_unique ? 2 : 0;  // same tag, different value
+    }
+
+    for (i = 0; opt && opt[i]; i++)
+    {
+        a = header_line_has_tag(hline1,opt[i]);
+        b = header_line_has_tag(hline2,opt[i]);
+        if ( !a && !b )
+            continue;
+        if ( !a || !b )
+        {
+            incomplete = 1; // present on one side only
+            continue;
+        }
+        if ( strcmp(a->value,b->value) )
+            return has_unique ? 2 : 0;  // same tag, different value
+    }
+
+    return incomplete ? 3 : 1;
+}
+
+
+// Deep-copy a header line: same type, and a fresh copy of every tag (with a
+// duplicated value string) in the original order.
+static HeaderLine *sam_header_line_clone(const HeaderLine *hline)
+{
+    const list_t *node;
+    HeaderLine *copy = malloc(sizeof(HeaderLine));
+
+    copy->type[0] = hline->type[0];
+    copy->type[1] = hline->type[1];
+    copy->tags = NULL;
+
+    for (node = hline->tags; node; node = node->next)
+    {
+        const HeaderTag *src = node->data;
+        HeaderTag *dup = malloc(sizeof(HeaderTag));
+
+        dup->key[0] = src->key[0];
+        dup->key[1] = src->key[1];
+        dup->value  = strdup(src->value);
+        copy->tags = list_append(copy->tags, dup);
+    }
+    return copy;
+}
+
+// Copy into out_hline every tag of tmpl_hline that out_hline lacks; tags that
+// already exist are left unchanged. Returns 0 when the line types differ,
+// 1 otherwise.
+static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline)
+{
+    const list_t *node;
+
+    if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] )
+        return 0;
+
+    for (node = tmpl_hline->tags; node; node = node->next)
+    {
+        HeaderTag *src = node->data;
+        HeaderTag *added;
+
+        if ( header_line_has_tag(out_hline, src->key) )
+            continue;   // already present in the output line
+
+        added = malloc(sizeof(HeaderTag));
+        added->key[0] = src->key[0];
+        added->key[1] = src->key[1];
+        added->value  = strdup(src->value);
+        out_hline->tags = list_append(out_hline->tags,added);
+    }
+    return 1;
+}
+
+
+// Release a header line that is being abandoned half-way through parsing,
+// together with the tags collected so far.
+static void discard_partial_header_line(HeaderLine *hline)
+{
+    list_t *node = hline->tags;
+    while (node)
+    {
+        list_t *next = node->next;
+        HeaderTag *tag = node->data;
+        free(tag->value);
+        free(tag);
+        free(node);
+        node = next;
+    }
+    free(hline);
+}
+
+// Parse one header line of the form "@XY<TAB>KK:value<TAB>KK:value...".
+// Lines of a known type that has no tag tables (CO) are stored as a single
+// tag with a blank key holding the whole remaining text, tabs included.
+// Returns a newly allocated HeaderLine, or 0 (after printing a diagnostic)
+// when the line is malformed; nothing is leaked on failure.
+static HeaderLine *sam_header_line_parse(const char *headerLine)
+{
+    HeaderLine *hline;
+    HeaderTag *tag;
+    const char *from, *to;
+    int itype, freeform;
+    from = headerLine;
+
+    if ( *from != '@' ) {
+        debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
+        return 0;
+    }
+    to = ++from;
+
+    while (*to && *to!='\t') to++;
+    if ( to-from != 2 ) {
+        debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine);
+        return 0;
+    }
+
+    hline = malloc(sizeof(HeaderLine));
+    hline->type[0] = from[0];
+    hline->type[1] = from[1];
+    hline->tags = NULL;
+
+    itype = tag_exists(hline->type, types);
+    // An unknown type (itype==-1) must not be used to index the tag tables;
+    // it is parsed like an ordinary tagged line.
+    freeform = itype!=-1 && !required_tags[itype] && !optional_tags[itype];
+
+    from = to;
+    while (*to && *to=='\t') to++;
+    if ( to-from != 1 ) {
+        debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
+        discard_partial_header_line(hline);
+        return 0;
+    }
+    from = to;
+    while (*from)
+    {
+        while (*to && *to!='\t') to++;
+
+        if ( freeform )
+        {
+            // CO is a special case, it can contain anything, including tabs
+            if ( *to ) { to++; continue; }
+            tag = new_tag("  ",from,to-1);
+        }
+        else
+        {
+            // A tag needs at least its "XY:" prefix; anything shorter would
+            // hand new_tag() a value that starts past the end of the field.
+            if ( to-from < 3 ) {
+                debug("[sam_header_line_parse] malformed tag on line [%s]\n", headerLine);
+                discard_partial_header_line(hline);
+                return 0;
+            }
+            tag = new_tag(from,from+3,to-1);
+        }
+
+        if ( header_line_has_tag(hline,tag->key) )
+                debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine);
+        hline->tags = list_append(hline->tags, tag);
+
+        from = to;
+        while (*to && *to=='\t') to++;
+        if ( *to && to-from != 1 ) {
+            debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
+            discard_partial_header_line(hline);
+            return 0;
+        }
+
+        from = to;
+    }
+    return hline;
+}
+
+
+// Must be of an existing type, all tags must be recognised and all required tags must be present
+// Returns 1 when the line is acceptable, 0 otherwise. Unrecognised tags only
+// produce a diagnostic (they are allowed for forward compatibility), and
+// tags containing a lower-case letter are treated as user-defined.
+static int sam_header_line_validate(HeaderLine *hline)
+{
+    list_t *tags;
+    HeaderTag *tag;
+    int itype, itag;
+
+    // Is the type correct?
+    itype = tag_exists(hline->type, types);
+    if ( itype==-1 )
+    {
+        debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]);
+        return 0;
+    }
+
+    // Has all required tags?
+    itag = 0;
+    while ( required_tags[itype] && required_tags[itype][itag] )
+    {
+        if ( !header_line_has_tag(hline,required_tags[itype][itag]) )
+        {
+            debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1],
+                hline->type[0],hline->type[1]);
+            return 0;
+        }
+        itag++;
+    }
+
+    // A type without any tag table holds free-form text; there is nothing to recognise.
+    if ( !required_tags[itype] && !optional_tags[itype] )
+        return 1;
+
+    // Are all tags recognised?
+    tags = hline->tags;
+    while ( tags )
+    {
+        tag = tags->data;
+        // tag_exists() returns -1 for "not found" and a 0-based index otherwise,
+        // so the result must be compared against -1 rather than negated.
+        if ( tag_exists(tag->key,required_tags[itype])==-1 && tag_exists(tag->key,optional_tags[itype])==-1 )
+        {
+            // Lower case tags are user-defined values.
+            if( !(islower((unsigned char)tag->key[0]) || islower((unsigned char)tag->key[1])) )
+            {
+                // Neither is lower case, but tag was not recognized.
+                debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
+                // return 0; // Even unknown tags are allowed - for forward compatibility with new attributes
+            }
+            // else - allow user defined tag
+        }
+        tags = tags->next;
+    }
+
+    return 1;
+}
+
+
+// Write one header line to fp as "@XY" followed by tab-separated tags and a
+// newline. A tag whose key is two blanks is printed without the "KK:" prefix.
+static void print_header_line(FILE *fp, HeaderLine *hline)
+{
+    const list_t *node;
+
+    fprintf(fp, "@%c%c", hline->type[0],hline->type[1]);
+    for (node = hline->tags; node; node = node->next)
+    {
+        const HeaderTag *tag = node->data;
+        const int has_key = tag->key[0]!=' ' || tag->key[1]!=' ';
+
+        fputc('\t', fp);
+        if ( has_key )
+            fprintf(fp, "%c%c:", tag->key[0],tag->key[1]);
+        fputs(tag->value, fp);
+    }
+    fputc('\n', fp);
+}
+
+
+// Free a header line: every tag value, tag and list node, then the line.
+static void sam_header_line_free(HeaderLine *hline)
+{
+    list_t *node = hline->tags;
+    while (node)
+    {
+        list_t *next = node->next;
+        HeaderTag *tag = node->data;
+        free(tag->value);
+        free(tag);
+        free(node);
+        node = next;
+    }
+    free(hline);
+}
+
+// Free a parsed header: every header line it holds, then the list itself.
+// A NULL header is accepted.
+void sam_header_free(void *_header)
+{
+    HeaderDict *header = (HeaderDict*)_header;
+    list_t *node;
+
+    for (node = header; node; node = node->next)
+        sam_header_line_free(node->data);
+    list_free(header);
+}
+
+// Deep-copy a header dictionary, line by line and in order. Returns NULL for
+// an empty dictionary.
+HeaderDict *sam_header_clone(const HeaderDict *dict)
+{
+    HeaderDict *copy = NULL;
+    const list_t *node;
+
+    for (node = dict; node; node = node->next)
+    {
+        HeaderLine *line = node->data;
+        copy = list_append(copy, sam_header_line_clone(line));
+    }
+    return copy;
+}
+
+// Returns a newly allocated string
+// holding the header in text form: one "@XY<TAB>KK:value..." line per
+// header line, each ending in '\n'. The caller frees the result. Returns
+// NULL if the buffer cannot be allocated.
+char *sam_header_write(const void *_header)
+{
+    const HeaderDict *header = (const HeaderDict*)_header;
+    char *out = NULL;
+    int len=0, nout=0;
+    const list_t *hlines;
+
+    // Calculate the length of the string to allocate
+    hlines = header;
+    while (hlines)
+    {
+        len += 4;   // @XY and \n
+
+        HeaderLine *hline = hlines->data;
+        list_t *tags = hline->tags;
+        while (tags)
+        {
+            HeaderTag *tag = tags->data;
+            len += strlen(tag->value) + 1;                  // \t
+            if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
+                len += 3;                                   // XY: (the value is already counted above)
+            tags = tags->next;
+        }
+        hlines = hlines->next;
+    }
+
+    nout = 0;
+    out  = malloc(len+1);
+    if ( !out ) return NULL;
+    hlines = header;
+    while (hlines)
+    {
+        HeaderLine *hline = hlines->data;
+
+        nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]);
+
+        list_t *tags = hline->tags;
+        while (tags)
+        {
+            HeaderTag *tag = tags->data;
+            nout += sprintf(out+nout,"\t");
+            if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
+                nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]);
+            nout += sprintf(out+nout,"%s", tag->value);
+            tags = tags->next;
+        }
+        hlines = hlines->next;
+        nout += sprintf(out+nout,"\n");
+    }
+    out[nout] = 0;  // terminate at the end of the text actually written
+    return out;
+}
+
+// Parse a complete header text into a list of header lines.
+// Returns the list, or NULL when headerText is NULL/empty or any line fails
+// to parse (in which case everything parsed so far is freed). Validation is
+// switched off by the local tovalidate flag.
+void *sam_header_parse2(const char *headerText)
+{
+    list_t *parsed = NULL;
+    HeaderLine *hline;
+    const char *cursor;
+    char *linebuf = NULL;
+    size_t linecap = 0;
+    int tovalidate = 0;
+
+    if ( !headerText )
+        return 0;
+
+    for (cursor = nextline(&linebuf, &linecap, headerText);
+         cursor;
+         cursor = nextline(&linebuf, &linecap, cursor))
+    {
+        hline = sam_header_line_parse(linebuf);
+        if ( !hline || (tovalidate && !sam_header_line_validate(hline)) )
+        {
+            if (hline) sam_header_line_free(hline);
+            sam_header_free(parsed);
+            if ( linebuf ) free(linebuf);
+            return NULL;
+        }
+        // With too many (~250,000) reference sequences the header parsing was too slow with list_append.
+        parsed = list_append_to_end(parsed, hline);
+    }
+    if ( linebuf ) free(linebuf);
+
+    return parsed;
+}
+
+// Build a string hash table from the lines of the given type: for each such
+// line that has both tags, key_tag's value maps to value_tag's value. The
+// table stores pointers into the header, not copies. A later duplicate key
+// is reported and overwrites the earlier value. A NULL header yields an
+// empty (not NULL) table.
+void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2])
+{
+    const HeaderDict *dict = (const HeaderDict*)_dict;
+    const list_t *node;
+    khash_t(str) *tbl = kh_init(str);
+    khiter_t slot;
+    int absent;
+
+    if (_dict == 0) return tbl; // return an empty (not null) hash table
+    for (node = dict; node; node = node->next)
+    {
+        HeaderLine *hline = node->data;
+        HeaderTag *key, *value;
+
+        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
+            continue;
+
+        key   = header_line_has_tag(hline,key_tag);
+        value = header_line_has_tag(hline,value_tag);
+        if ( !key || !value )
+            continue;
+
+        slot = kh_get(str, tbl, key->value);
+        if ( slot != kh_end(tbl) )
+            debug("[sam_header_lookup_table] They key %s not unique.\n", key->value);
+        slot = kh_put(str, tbl, key->value, &absent);
+        kh_value(tbl, slot) = value->value;
+    }
+    return tbl;
+}
+
+// Collect the key_tag value of every line of the given type into a newly
+// allocated array (capacity starts at 4 and doubles). The entries point into
+// the header. *_n receives the count; the result is 0 when nothing matched.
+char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n)
+{
+    const HeaderDict *dict = (const HeaderDict*)_dict;
+    const list_t *node;
+    char **values = 0;
+    int capacity = 0, count = 0;
+
+    *_n = 0;
+    for (node = dict; node; node = node->next)
+    {
+        HeaderLine *hline = node->data;
+        HeaderTag *key;
+
+        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
+            continue;
+
+        key = header_line_has_tag(hline,key_tag);
+        if ( !key )
+            continue;
+
+        if (count == capacity) {
+            capacity = capacity? capacity<<1 : 4;
+            values = realloc(values, capacity * sizeof(void*));
+        }
+        values[count++] = key->value;
+    }
+    *_n = count;
+    return values;
+}
+
+// Iterate over key/value pairs of the lines of the given type.
+// Starting at iter, finds the next line of that type carrying both key_tag
+// and value_tag, stores pointers to their values in *_key / *_value and
+// returns the node following that line (to be passed as iter next time).
+// Returns NULL when no further pair exists; *_key / *_value are then left
+// untouched. Lines missing either tag are skipped (previously a line with
+// only one of the two tags caused a NULL dereference).
+void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value)
+{
+    list_t *l = iter;
+    if ( !l ) return NULL;
+
+    while (l)
+    {
+        HeaderLine *hline = l->data;
+        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
+        {
+            l = l->next;
+            continue;
+        }
+
+        HeaderTag *key, *value;
+        key   = header_line_has_tag(hline,key_tag);
+        value = header_line_has_tag(hline,value_tag);
+        if ( !key || !value )
+        {
+            l = l->next;
+            continue;
+        }
+
+        *_key = key->value;
+        *_value = value->value;
+        return l->next;
+    }
+    return l;
+}
+
+// Look up key in a table built by sam_header2tbl(); returns the stored value
+// or 0 when the key is absent.
+const char *sam_tbl_get(void *h, const char *key)
+{
+	khash_t(str) *tbl = (khash_t(str)*)h;
+	khint_t slot = kh_get(str, tbl, key);
+	if (slot == kh_end(tbl))
+		return 0;
+	return kh_val(tbl, slot);
+}
+
+// Number of entries in the table; 0 for a NULL table.
+int sam_tbl_size(void *h)
+{
+	khash_t(str) *tbl = (khash_t(str)*)h;
+	if (h == 0)
+		return 0;
+	return kh_size(tbl);
+}
+
+// Destroy a table created by sam_header2tbl().
+void sam_tbl_destroy(void *h)
+{
+	khash_t(str) *table = (khash_t(str)*)h;
+
+	kh_destroy(str, table);
+}
+
+// Merge n parsed headers into a newly allocated one.
+//
+// The result starts as a clone of dicts[0]. Every line of the remaining
+// headers is compared against the lines collected so far: an identical line
+// is dropped, a compatible line contributes its missing tags, and a line
+// that matches nothing is appended.
+// Returns NULL when n < 2 or when two lines conflict; on conflict the
+// partially merged header is freed instead of leaked.
+void *sam_header_merge(int n, const void **_dicts)
+{
+    const HeaderDict **dicts = (const HeaderDict**)_dicts;
+    HeaderDict *out_dict;
+    int idict, status;
+
+    if ( n<2 ) return NULL;
+
+    out_dict = sam_header_clone(dicts[0]);
+
+    for (idict=1; idict<n; idict++)
+    {
+        const list_t *tmpl_hlines = dicts[idict];
+
+        while ( tmpl_hlines )
+        {
+            list_t *out_hlines = out_dict;
+            int inserted = 0;
+            while ( out_hlines )
+            {
+                status = sam_header_compare_lines(tmpl_hlines->data, out_hlines->data);
+                if ( status==0 )
+                {
+                    out_hlines = out_hlines->next;
+                    continue;
+                }
+
+                if ( status==2 )
+                {
+                    print_header_line(stderr,tmpl_hlines->data);
+                    print_header_line(stderr,out_hlines->data);
+                    debug("Conflicting lines, cannot merge the headers.\n");
+                    sam_header_free(out_dict);
+                    return 0;
+                }
+                if ( status==3 )
+                    sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data);
+
+                // NOTE(review): status -1 (unknown type) also ends up here, so
+                // such a line is treated as already present - confirm intent.
+                inserted = 1;
+                break;
+            }
+            if ( !inserted )
+                out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data));
+
+            tmpl_hlines = tmpl_hlines->next;
+        }
+    }
+
+    return out_dict;
+}
+
+
diff --git a/tidx/fastq-lib.cpp b/tidx/fastq-lib.cpp
new file mode 120000
index 0000000..0551761
--- /dev/null
+++ b/tidx/fastq-lib.cpp
@@ -0,0 +1 @@
+../fastq-lib.cpp
\ No newline at end of file
diff --git a/tidx/fastq-lib.h b/tidx/fastq-lib.h
new file mode 120000
index 0000000..de4dedc
--- /dev/null
+++ b/tidx/fastq-lib.h
@@ -0,0 +1 @@
+../fastq-lib.h
\ No newline at end of file
diff --git a/tidx/tidx-lib.cpp b/tidx/tidx-lib.cpp
new file mode 100644
index 0000000..3199516
--- /dev/null
+++ b/tidx/tidx-lib.cpp
@@ -0,0 +1,436 @@
+#include <string.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string>
+#include <vector>
+
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <sparsehash/dense_hash_map>
+
+#include "fastq-lib.h"
+#include "utils.h"
+#include "tidx.h"
+
+void usage(FILE *f);
+
+using namespace std;
+using namespace google;
+
+double xtime();
+
+// Ordering predicate for annot records: ascending by start coordinate.
+bool annot_comp (const annot &a, const annot &b) {
+    return b.beg > a.beg;
+}
+
+// Copy every element of rhs onto the end of lhs.
+template <typename L, typename R>
+void append(L& lhs, R const& rhs) {
+    lhs.insert(lhs.end(), rhs.begin(), rhs.end());
+}
+
+// Copy every element of rhs in front of the current contents of lhs.
+template <typename L, typename R>
+void prepend(L& lhs, R const& rhs) {
+    lhs.insert(lhs.begin(), rhs.begin(), rhs.end());
+}
+
+// Serializer functor passed to dense_hash_map::serialize()/unserialize().
+//
+// Record layout, written with the native byte order and native type widths:
+//   unsigned char   key length (keys are limited to UCHAR_MAX bytes)
+//   char[]          key bytes
+//   unsigned long   number of intervals, then for each interval:
+//     beg, end      (as declared in annot)
+//     unsigned short number of offsets (limited to USHRT_MAX)
+//     offsets       (one element of annot::pos each)
+struct string_annot_serializer {
+  // Write one (key, intervals) entry. Returns false on any short write.
+  // NOTE(review): the pair type differs from the map's value_type, so the
+  // entry is presumably converted (copied) on every call - confirm.
+  bool operator()(FILE* fp, const std::pair<const string&, const vector<annot> >& value) const {
+
+    {
+    assert(value.first.length() <= UCHAR_MAX);
+    const unsigned char size = value.first.length();
+    if (fwrite(&size, sizeof(size), 1, fp) != 1)
+      return false;
+    // fwrite() reports 0 items for a zero-sized item, so an empty key must not be written
+    if (size && fwrite(value.first.data(), size, 1, fp) != 1)
+      return false;
+    }
+
+    {
+    const vector<annot>&van=value.second;
+    const unsigned long size = van.size();
+    if (fwrite(&size, sizeof(size), 1, fp) != 1)
+      return false;
+    for (unsigned long i=0;i<size;++i) {
+        if (fwrite(&van[i].beg, sizeof(van[i].beg), 1, fp) != 1)
+          return false;
+        if (fwrite(&van[i].end, sizeof(van[i].end), 1, fp) != 1)
+          return false;
+        assert(van[i].pos.size() <= USHRT_MAX);
+        const unsigned short npos = van[i].pos.size();
+        if (fwrite(&npos, sizeof(npos), 1, fp) != 1)
+          return false;
+        for (unsigned int j=0;j<npos;++j) {
+            if (fwrite(&van[i].pos[j], sizeof(van[i].pos[j]), 1, fp) != 1)
+              return false;
+        }
+    }
+    }
+
+    return true;
+  }
+
+  // Read one entry back into *value. Returns false on any short read.
+  bool operator()(FILE* fp, std::pair<const string, vector<annot> >* value) const {
+
+    {
+
+    unsigned char size;    // all strings are <= 255 chars long
+    if (fread(&size, sizeof(size), 1, fp) != 1)
+      return false;
+
+    string buf(size, '\0');
+
+    // fread() reports 0 items for a zero-sized item, so only read non-empty keys;
+    // &buf[0] is the writable buffer (data() is read-only)
+    if (size && fread(&buf[0], size, 1, fp) != 1) {
+      return false;
+    }
+    // necessary to placement-"new" the key, which is const except during unserialization
+    // (the api shouldn't foist this on the user ... should be behind the scenes)
+    string * ncs = const_cast<string *>(&value->first);
+    new(ncs) string(buf);
+
+    }
+
+    {
+
+    vector<annot> &van=value->second;
+    unsigned long size;
+    if (fread(&size, sizeof(size), 1, fp) != 1)
+      return false;
+    van.resize(size);
+    for (unsigned long i=0;i<size;++i) {
+        if (fread(&van[i].beg, sizeof(van[i].beg), 1, fp) != 1)
+          return false;
+        if (fread(&van[i].end, sizeof(van[i].end), 1, fp) != 1)
+          return false;
+        unsigned short npos;
+        if (fread(&npos, sizeof(npos), 1, fp) != 1)
+          return false;
+        van[i].pos.resize(npos);
+        for (unsigned int j=0;j<npos;++j) {
+            if (fread(&van[i].pos[j], sizeof(van[i].pos[j]), 1, fp) != 1)
+              return false;
+        }
+    }
+
+    }
+
+    return true;
+  }
+};
+
+
+// Strip one trailing '\n' and then one trailing '\r' from the line buffer,
+// shrinking l.n accordingly. An empty line (l.n == 0) is left untouched
+// instead of reading the byte in front of the buffer.
+void chomp_line(struct line &l) {
+    if (l.n > 0 && l.s[l.n-1] == '\n') l.s[--l.n]='\0';       // chomp
+    if (l.n > 0 && l.s[l.n-1] == '\r') l.s[--l.n]='\0';       // chomp
+}
+
+// Shared "no hits" result, so the point lookup can always hand back a reference.
+vector <long int> empty_vector;
+
+// Point query: return the file offsets stored for the interval of `chr`
+// that contains `pos` (beg <= pos <= end), or an empty vector when the
+// chromosome is unknown or no interval contains the position.
+//
+// The per-chromosome intervals are binary-searched, which assumes they are
+// sorted by `beg` and do not overlap (presumably guaranteed by build()).
+// The search window is half-open, so neither an empty interval list nor a
+// position beyond the last interval can index past the end of the vector.
+const vector<long int> &tidx::lookup(const char *chr, int pos) {
+    dense_hash_map<string,vector<annot> >::iterator it=map.find(chr);
+    if (it == map.end()) return empty_vector;
+    vector<annot> &va = it->second;
+    if (debug) fprintf(stderr,"lookup: %s:%d -> %d\n", chr, pos, (int) va.size());
+    size_t lo = 0, hi = va.size();              // candidates live in [lo, hi)
+    while (lo < hi) {
+        const size_t mid = lo + (hi - lo) / 2;
+        if (pos < va[mid].beg)
+            hi = mid;
+        else if (pos > va[mid].end)
+            lo = mid + 1;
+        else
+            return va[mid].pos;
+    }
+    return empty_vector;
+}
+
+// Range query: collect the file offsets of every interval of `chr` that
+// overlaps [beg, end] (both inclusive). Returns an empty vector when the
+// chromosome is unknown or nothing overlaps.
+//
+// The first candidate is found with a lower-bound search on the interval
+// ends; this assumes the intervals are sorted by `beg` and do not overlap
+// (presumably guaranteed by build()), which makes the ends sorted as well.
+// Starting from the first interval whose end reaches `beg` also covers the
+// case where `beg` falls into a gap between two intervals.
+vector<long int> tidx::lookup_r(const char *chr, int beg, int end) {
+    dense_hash_map<string,vector<annot> >::iterator it=map.find(chr);
+    if (it == map.end()) return empty_vector;
+    vector<annot> &va = it->second;
+    if (debug) fprintf(stderr,"lookup_r: %s:%d.%d -> %d\n", chr, beg, end, (int) va.size());
+    size_t lo = 0, hi = va.size();
+    while (lo < hi) {
+        const size_t mid = lo + (hi - lo) / 2;
+        if (va[mid].end < beg)
+            lo = mid + 1;                       // entirely left of the query
+        else
+            hi = mid;
+    }
+    vector<long int> res;
+    for (size_t c = lo; c < va.size() && va[c].beg <= end; ++c) {
+        if (va[c].end >= beg)
+            append(res,va[c].pos);
+    }
+    return res;
+}
+
+// Shared by the string-returning lookups: fetch the text line stored at each
+// byte offset of the indexed file and join them, each line preceded by `msep`.
+// The data file is opened lazily on first use and kept open in `fh`.
+// An offset that cannot be seeked to or read is reported on stderr and skipped.
+static string tidx_fetch_lines(FILE *&fh, const string &path, const vector<long int> &offsets, const char *msep) {
+    string res;
+    if (!fh) {
+        fh=fopen(path.c_str(),"rb");
+        if (!fh)
+            fail("%s:%s\n",path.c_str(),strerror(errno));
+    }
+    struct line l; meminit(l);
+    for (size_t i=0;i<offsets.size();++i) {
+        if (fseek(fh,offsets[i],SEEK_SET) != 0 || read_line(fh, l) <= 0) {
+            fprintf(stderr, "tidx: cannot read %s at offset %ld\n", path.c_str(), offsets[i]);
+            continue;
+        }
+        chomp_line(l);
+        res += msep;
+        res += string(l.s, l.n);
+    }
+    free_line(&l);
+    return res;
+}
+
+// Point query returning the matching source lines as one string; every line
+// is preceded by `msep`. Empty when nothing matches.
+string tidx::lookup(const char *chr, int pos, const char *msep) {
+    const vector<long int> &v = lookup(chr, pos);
+    return tidx_fetch_lines(fh, path, v, msep);
+}
+
+// Range query returning the matching source lines as one string; every line
+// is preceded by `msep`. Empty when nothing matches.
+string tidx::lookup_r(const char *chr, int beg, int end, const char *msep) {
+    const vector<long int> &v = lookup_r(chr, beg, end);
+    return tidx_fetch_lines(fh, path, v, msep);
+}
+
+// Backing storage for the const char * API; overwritten by every
+// lookup_c()/lookup_cr() call, so the returned pointer is only valid until the next one.
+string api_ret = "";
+
+// Point lookup for API callers: same result as lookup(chr, pos, msep), as a C string.
+const char *tidx::lookup_c(const char *chr, int pos, const char *msep) {
+    const string joined = lookup(chr, pos, msep);
+    api_ret = joined;
+    return api_ret.c_str();
+}
+
+// Range lookup for API callers: same result as lookup_r(chr, beg, end, msep),
+// as a C string held in the shared api_ret buffer.
+const char *tidx::lookup_cr(const char *chr, int beg, int end, const char *msep) {
+    const string joined = lookup_r(chr, beg, end, msep);
+    api_ret = joined;
+    return api_ret.c_str();
+}
+
+// Load the index "<in>.tidx" (decompressed through a `gunzip -c` pipe) into
+// the in-memory map and remember `in` as the path of the indexed data file.
+//
+// Returns false when the pipe cannot be started or the index cannot be
+// unserialized; true otherwise. The pipe is always closed again.
+// NOTE(review): `in` is pasted into a shell command unquoted, so paths with
+// spaces or shell metacharacters will not work as intended.
+bool tidx::read(const char *in) {
+    string uin = string_format("gunzip -c %s.tidx", in);
+
+    if (debug) fprintf(stderr, "read %s\n", in);
+    FILE *fun=popen(uin.c_str(),"r");
+    if (!fun) {
+        return false;
+    }
+    const bool loaded = map.unserialize(string_annot_serializer(), fun);
+    pclose(fun);                // release the stream and reap the gunzip child
+    path=in;
+    return loaded;
+}
+
+// Common constructor work: no data file open yet, debugging off, and "-"
+// registered as the hash map's empty key.
+void tidx::init() {
+    fh = NULL;
+    debug = false;
+    map.set_empty_key("-");
+}
+
+// Write a tab-separated listing of the index to `fh`: a "#file" header
+// line, then one line per interval with key, beg, end, number of offsets
+// and the first offset (-1 when the interval carries no offsets).
+// Note that the parameter shadows the member `fh`.
+void tidx::dump(FILE *fh) {
+    fprintf(fh,"#file\t%s\n",path.c_str());
+    dense_hash_map<string,vector<annot> >::iterator it;
+    for (it = map.begin(); it != map.end(); ++it) {
+        const vector<annot> &van = it->second;
+        for (size_t i=0;i<van.size();++i) {
+            // size_t is cast so that it matches the %ld conversion
+            const long npos = (long) van[i].pos.size();
+            const long first = van[i].pos.empty() ? -1L : van[i].pos[0];
+            fprintf(fh, "%s\t%d\t%d\t%ld\t%ld\n", it->first.c_str(), van[i].beg, van[i].end, npos, first);
+        }
+    }
+}
+
+// fun part
+//
+// Build the index for the delimited text file `in` and write it, gzip
+// compressed, to "<in>.tidx".
+//
+//   sep              field separator(s) handed to split()
+//   nchr, nbeg, nend zero-based columns of the grouping key, region start and
+//                    region end; nend == -1 means "same column as nbeg"
+//   skip_i, skip_c   lines are skipped while skip_i > 0, and whenever they
+//                    start with skip_c
+//   sub_e            when set, the end coordinate is decremented by one
+//
+// Each data line is recorded under its key together with the byte offset of
+// the line. Per key the intervals are then sorted by start, exact duplicates
+// are merged, and overlapping intervals are split into fragments carrying
+// the offsets of every line that covers them.
+// Errors (unreadable input, missing columns, beg > end) go through fail().
+void tidx::build(const char *in, const char *sep, int nchr, int nbeg, int nend, int skip_i, char skip_c, bool sub_e) {
+    FILE *fin=fopen(in,"r");
+
+    if (!fin)
+        fail("%s:%s\n",in,strerror(errno));
+
+    if (nend == -1)
+        nend = nbeg;
+
+    string out = string_format("gzip -c > %s.tidx", in);
+    FILE *fout=popen(out.c_str(),"w");
+    if (!fout)
+        fail("%s:%s\n", out.c_str(),strerror(errno));
+
+    double xst = xtime();
+
+    struct line l; meminit(l);
+    int nlast = max(max(nbeg,nend),nchr);
+    int nl = 0;
+
+    string p_chr = "%";
+    path=in;
+    vector<annot> *pvan = NULL;                 // interval list of the key seen last
+    long tpos = ftell(fin);
+    // read in the annotation file
+    if (debug) fprintf(stderr, "reading %s (%d, %d, %d)\n", in, nchr, nbeg, nend);
+    while (read_line(fin, l)>0) {
+        ++nl;
+        if (skip_i > 0 || *l.s==skip_c) {
+            --skip_i;
+        } else {
+            chomp_line(l);
+            vector<char *> v = split(l.s, sep);
+            if (nlast >= v.size()) {
+                fail("error, file %s, line %d: missing info\n", in, nl);
+            }
+            annot a;
+            a.beg=atoi(v[nbeg]);
+            a.end=atoi(v[nend]);
+            if (sub_e) --a.end;
+            if (a.beg > a.end) {
+                fail("error, file %s, line %d: beg > end : %d > %d\n", in, nl, a.beg, a.end);
+            }
+            a.pos.push_back(tpos);
+            // speed up: only hit the hash map when the key changes; the !pvan
+            // test covers a first key that happens to equal the "%" sentinel
+            if (!pvan || strcmp(v[nchr], p_chr.c_str())) {
+                pvan = &map[v[nchr]];
+                p_chr=v[nchr];
+            }
+            pvan->push_back(a);
+        }
+        tpos = ftell(fin);
+    }
+    fclose(fin);                                // input is not needed any more
+    dense_hash_map<string,vector<annot> >::iterator it;
+
+    free_line(&l);
+
+    // for each chromosome
+    it = map.begin();
+    while (it != map.end()) {
+        vector<annot> &van = it->second;
+        // sort the annotation file by beginning of region
+        sort(van.begin(), van.end(), annot_comp);
+        int i;
+        if (debug) fprintf(stderr, "frag %s : %ld ->", it->first.c_str(), (long) van.size());
+        // "i+1 < size" instead of "i < size-1": the latter underflows on an empty list
+        for (i=0;(size_t)i+1<van.size();++i) {
+            if (van[i].beg >= van[i+1].beg && van[i].end == van[i+1].end) {
+                // exact match
+                if (debug) fprintf(stderr, " [dup %d]", van[i].beg);
+                // merge annotations
+                prepend(van[i+1].pos,van[i].pos);
+                // skip next... (entries with an empty pos are dropped by the compaction below)
+                assert(van[i].beg == van[i+1].beg);
+                van[i].pos.clear();
+            } else if (van[i].end >= van[i+1].beg) {
+                if (debug) fprintf(stderr, " [ovr %d-%d:%ld ]", van[i].beg, van[i].end, van[i].pos[0]);
+                // overlap next
+                int new_st;
+                int new_en;
+
+                // offsets of the fragment that may be created behind the overlap
+                // (a copy; replaced below when the fragment belongs to the next interval only)
+                vector<long> new_ro = van[i].pos;
+
+                if (van[i].end < van[i+1].end) {
+                    // contained within next, so new frag starting after i stop
+                    new_st = van[i].end + 1;
+                    new_en = van[i+1].end;
+                    new_ro = van[i+1].pos;                  // that only contains the other
+                    van[i+1].end=van[i].end;                // shorten next to my end
+                    append(van[i+1].pos,van[i].pos);        // and now the other contains me
+                } else {
+                    // passes next, so next contains all of me
+                    new_st = van[i+1].end+1;                // new frag is after the end of next
+                    new_en = van[i].end;
+                    append(van[i+1].pos,van[i].pos);
+                }
+
+                van[i].end=van[i+1].beg-1;                  // shorten my end to less than the next's start
+
+                if (debug) fprintf(stderr, " [i:%d:%d:%ld]", van[i].beg, van[i].end, van[i].pos[0]);
+                if (debug) fprintf(stderr, " [i+1:%d:%d:%ld]", van[i+1].beg, van[i+1].end, van[i+1].pos[0]);
+
+                if (new_en >= new_st) {                     // is this a real one?
+                    if (debug) fprintf(stderr, " [n:%d:%d:%ld]", new_st, new_en, new_ro[0]);
+
+                    int j = i+2;                            // figure out where it goes (shouldn't be far)
+                    // logical && (was bitwise &): van[j] must not be read once j reaches the end
+                    while ((size_t)j < van.size() && new_st > van[j].beg) {
+                        ++j;
+                    }
+
+                    annot a;
+                    a.beg=new_st;
+                    a.end=new_en;
+                    a.pos=new_ro;
+                                                            // (slow... use linked list, turn to array later for storage/bin search?)
+                    van.insert(van.begin()+j, a);           // insert into the annot array
+                }
+            }
+        }
+        // compact: keep only intervals that still carry offsets and a valid range
+        long j = 0;
+        for (i=0;(size_t)i<van.size();++i) {
+            if (van[i].pos.size() != 0 && van[i].beg <= van[i].end) {
+                if (i != j) {
+                    van[j++]=van[i];
+                } else {
+                    ++j;
+                }
+            }
+        }
+        if ((size_t)j != van.size()) {
+            if (debug) fprintf(stderr, "(rm %ld) ", (long) (van.size()-j));
+            van.resize(j);
+        }
+        if (debug) fprintf(stderr, " %ld\n", (long) van.size());
+        ++it;
+    }
+
+    double xen = xtime();
+    double speed = xen-xst;
+
+    if (debug) fprintf(stderr, "compiled in %g secs\n", speed);
+
+    map.serialize(string_annot_serializer(), fout);
+    pclose(fout);
+
+    // read the freshly written index back (timed for the debug output)
+    xst = xtime();
+    tidx tmap(in);
+    xen = xtime();
+    speed = xen-xst;
+    if (debug) fprintf(stderr, "read in %g secs\n", speed);
+    path=in;
+}
+
+// Current wall-clock time from gettimeofday(), in seconds with microsecond fraction.
+double xtime() {
+    struct timeval now;
+    gettimeofday(&now, NULL);
+    const double usec_part = ((double)now.tv_usec)/1000000.0;
+    return (double) now.tv_sec + usec_part;
+}
+
+// for the api: build an index with a throw-away tidx object, nothing is returned
+void tidx_build(const char *file, const char *sep, int chr, int beg, int end, int skip_i, char skip_c, bool sub_e) {
+    tidx builder;
+
+    builder.build(file, sep, chr, beg, end, skip_i, skip_c, sub_e);
+}
diff --git a/tidx/tidx.cpp b/tidx/tidx.cpp
new file mode 100644
index 0000000..2de896f
--- /dev/null
+++ b/tidx/tidx.cpp
@@ -0,0 +1,220 @@
+#include <string.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string>
+#include <vector>
+
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <sparsehash/dense_hash_map>
+
+#include "fastq-lib.h"
+#include "utils.h"
+#include "tidx.h"
+
+void usage(FILE *f);
+
+using namespace std;
+using namespace google;
+
+// Command line front end for tidx.
+//
+// Modes (exactly one is expected):
+//   -B          build "<IFILE>.tidx" for every -i file
+//   -D          dump every loaded index to stdout
+//   -p CHR:POS  look up a single point; exit status 0 when something was found
+//   -a AFILE    annotate every line of AFILE ("-" = stdin) with the matches
+//               found through the key and position columns
+// Column options (-c/-b/-e) are 1-based on the command line and converted to
+// 0-based before use.
+int main (int argc, char **argv) {
+    bool debug = false;
+    bool echo = true;               // -n switches the echo of input lines off
+    bool build = false;
+    vector<const char *> vin;
+    const char *ain= NULL;
+    const char *sep = "\t";
+    const char *msep = "^";
+    const char *trim = "chr";
+    char *point = NULL;
+    int nchr = 1, nbeg = 2, nend = 3;
+    char skip_c = '#';
+    char sub_e = 0;
+    int skip_i = 0;
+    bool dump = 0;
+
+    // getopt() returns int; storing it in a char never compares equal to -1
+    // on platforms where plain char is unsigned
+    int c;
+    while ( (c = getopt (argc, argv, "Dlhdt:r:c:b:T:e:p:i:s:a:nB")) != -1) {
+        switch (c) {
+            case 'd':
+                debug=true; break;
+            case 'D':
+                dump=true; break;
+            case 'n':
+                echo=false; break;
+            case 'l':
+                sub_e=true; break;
+            case 'B':
+                build = true; break;
+            case 'h':
+                usage(stdout); exit(0);
+            case 't':
+                sep = optarg; break;
+            case 'p':
+                point = optarg; break;
+            case 's':
+                if (isdigit((unsigned char)*optarg))
+                    skip_i = atoi(optarg);
+                else
+                    skip_c = *optarg;
+                break;
+            case 'r':
+                msep = optarg; break;
+            case 'T':
+                trim = optarg; break;
+            case 'c':
+                nchr = atoi(optarg); break;
+            case 'i':
+                vin.push_back(optarg); break;
+            case 'a':
+                ain = optarg; break;
+            case 'b':
+                nbeg = atoi(optarg); break;
+            case 'e':
+                nend = atoi(optarg); break;
+            case '?':
+                // the list names exactly the options declared with ':' above
+                if (optopt && strchr("trcbTepisa", optopt))
+                    fprintf (stderr, "Option -%c requires an argument.\n", optopt);
+                else if (isprint(optopt))
+                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+                else
+                    fprintf (stderr,
+                            "Unknown option character `\\x%x'.\n",
+                            optopt);
+                usage(stderr);
+                return 1;
+        }
+    }
+
+    if (!vin.size())  {
+        if (argc>1) warn("Error: at least one -i index file is required\n");
+        usage(stderr);
+        exit(1);
+    }
+
+    if (! build && ! ain && !point && !dump) {
+        fail("Error: one of -D -B, -p or -a is required\n");
+    }
+
+    // NOTE(review): this only warns; processing continues with the first mode that applies
+    if ((!!build + !!ain + !!point + !!dump) > 1) {
+        warn("Error: only one of -B, -p or -a is allowed\n");
+    }
+
+    --nchr; --nbeg; --nend;
+
+    if ( build ) {
+        int f_i;
+        for (f_i=0;f_i<vin.size();++f_i) {
+            tidx x;
+            if (debug)
+                x.debug=true;
+            x.build(vin[f_i], sep, nchr, nbeg, nend, skip_i, skip_c, sub_e);
+        }
+    } else {
+        struct line l; meminit(l);
+        int nl = 0;
+        int f_i;
+        vector<tidx *>vmap; vmap.resize(vin.size());
+        for (f_i=0;f_i<vin.size();++f_i) {
+            vmap[f_i]=new tidx(vin[f_i]);
+            if (debug)
+                vmap[f_i]->debug=true;
+            if (dump) {
+                vmap[f_i]->dump(stdout);
+            }
+        }
+        if (dump) {
+            exit(0);
+        }
+
+        if ( point ) {
+            char * p = strchr(point, ':');
+            if (!p) {
+                fail("Error: -p requires chr:pos argument\n");
+            }
+            *p++ = '\0';
+            long pos = atol(p);
+            int found = 0;
+            for (f_i=0;f_i<vin.size();++f_i) {
+                string tmp = vmap[f_i]->lookup(point, pos, msep);
+                if (tmp.size()) {
+                    ++found;
+                    // the very first hit is printed without its leading separator character
+                    fputs(tmp.c_str()+(found==1),stdout);                             // echo
+                }
+            }
+            if (found) fputc('\n',stdout);
+            return !found;
+        } else {
+            FILE *fin = !strcmp(ain,"-") ? stdin : fopen(ain, "r");
+            if (!fin)
+                fail("error '%s':%s", ain,strerror(errno));
+
+            while (read_line(fin, l)>0) {
+                ++nl;
+
+                chomp_line(l);
+
+                if (echo)
+                    fputs(l.s,stdout);                          // echo
+
+                vector<char *> v = split(l.s, sep);     // todo, only get the keys desired, don't destroy
+
+                string res;
+                if (v.size() > nchr && v.size() > nbeg) {
+                    for (f_i=0;f_i<vin.size();++f_i) {
+                        string tmp = vmap[f_i]->lookup(v[nchr], atol(v[nbeg]), msep);
+                        if (tmp.size()) {
+                            res = res + tmp;
+                        }
+                    }
+                }
+                fputs(res.c_str(),stdout);                             // echo
+                fputc('\n',stdout);
+            }
+            free_line(&l);
+            if (fin != stdin)
+                fclose(fin);
+        }
+
+        for (f_i=0;f_i<vin.size();++f_i)
+            delete vmap[f_i];
+    }
+    return 0;
+}
+
+// Print the command line help text to the given stream.
+void usage(FILE *f) {
+    static const char help_text[] =
+"Usage: tidx [options] -i IFILE [-i IFILE2...] -a AFILE\n"
+"   or: tidx [options] -B -i IFILE\n"
+"\n"
+"Fragments and merges overlapping regions in an file with start-stop values.\n"
+"Creating a simple, fast, compressed index\n"
+"\n"
+"Also can load that index, and search AFILE for intersecting lines\n"
+"\n"
+"If the 'group' column is zero, no grouping will be used\n"
+"\n"
+"If just -b is present during a search, then only that column\n"
+"is searched.\n"
+"\n"
+"If both -b and -e are present during a search, then all regions\n"
+"that overlap will be returned.\n"
+"\n"
+"Options and (defaults):\n"
+"\n"
+"-i IFILE       Text file to index (can specify more than one)\n"
+"-B             Build index, don't annotate\n"
+"-a FILE        Read text file and annotate\n"
+"-p CHR:POS     Lookup a single point (slow!)\n"
+"-r STRING      Annotation response separator (^)\n"
+"-t CHAR(s)     Field separator (TAB)\n"
+"-c INT         Group by (chromosome) column (1)\n"
+"-b INT         Begin region column (2) (or position for annot)\n"
+"-e INT         End region column (3)\n"
+"-s INT or CHAR Skip rows starting with CHAR (#), or skip INT rows\n"
+"-l             Less than end, not less than or equal-to\n"
+"-n             Don't echo input lines\n"
+//"-d             Verbose debug output\n"
+//"-D             Dump input table (debug)\n"
+//"-p CHR:POS     Single point lookup (debug, slow)\n"
+"\n";
+
+    fputs(help_text, f);
+}
+
diff --git a/tidx/tidx.h b/tidx/tidx.h
new file mode 100644
index 0000000..15a24ee
--- /dev/null
+++ b/tidx/tidx.h
@@ -0,0 +1,43 @@
+#ifndef TIDX_H
+#define TIDX_H
+
+// Guarded against multiple inclusion; <stdio.h> is needed for FILE.
+#include <stdio.h>
+#include <string>
+#include <vector>
+#include <sparsehash/dense_hash_map>
+
+// One indexed interval [beg, end] together with the byte offsets (in the
+// indexed text file) of the lines that cover it.
+class annot {
+public:
+    int beg;
+    int end;
+    std::vector<long> pos;
+};
+
+// Index over a delimited text file: maps a grouping key (e.g. a chromosome
+// name) to its list of intervals.
+// NOTE(review): the data file handle `fh` is opened lazily by the string
+// lookups and is not closed by this class.
+class tidx {
+    FILE *fh;               // indexed data file, opened on first string lookup
+    void init();
+public:
+    bool debug;             // when set, progress details go to stderr
+    tidx() {init();};
+    // Load "<path>.tidx"; the result of read() is not reported to the caller.
+    tidx(const char *path)  {init(); read(path);};
+
+    std::string path;       // path of the indexed data file
+    google::dense_hash_map<std::string,std::vector<annot> > map;
+
+    void dump(FILE *stream);
+    bool read(const char *path);
+    void build(const char *path, const char *sep, int nchr, int nbeg, int nend, int skip_i, char skip_c, bool sub_e);
+
+// point lookup: offsets, or the matching lines joined with msep
+    const std::vector <long int> & lookup(const char *chr, int pos);
+    std::string lookup(const char *chr, int pos, const char *msep);
+
+// range lookup
+    std::vector <long int> lookup_r(const char *chr, int beg, int end);
+    std::string lookup_r(const char *chr, int beg, int end, const char *msep);
+
+// const char * return value
+    const char * lookup_c(const char *chr, int pos, const char *msep);
+    const char * lookup_cr(const char *chr, int beg, int end, const char *msep);
+};
+
+void chomp_line(struct line &l);
+
+// build, with no return value, for API use
+void tidx_build(const char *path, const char *sep, int nchr, int nbeg, int nend, int skip_i, char skip_c, bool sub_e);
+
+#endif // TIDX_H
+
diff --git a/tidx/utils.cpp b/tidx/utils.cpp
new file mode 120000
index 0000000..451eef0
--- /dev/null
+++ b/tidx/utils.cpp
@@ -0,0 +1 @@
+../utils.cpp
\ No newline at end of file
diff --git a/tidx/utils.h b/tidx/utils.h
new file mode 120000
index 0000000..6cd5d4f
--- /dev/null
+++ b/tidx/utils.h
@@ -0,0 +1 @@
+../utils.h
\ No newline at end of file
diff --git a/utils.h b/utils.h
new file mode 100644
index 0000000..ceac58e
--- /dev/null
+++ b/utils.h
@@ -0,0 +1,5 @@
+#include <string>
+#include <vector>
+
+std::string string_format(const std::string &fmt, ...);
+std::vector<char *> split(char* str, const char* delim);
diff --git a/varcall.cpp b/varcall.cpp
new file mode 100644
index 0000000..72ec202
--- /dev/null
+++ b/varcall.cpp
@@ -0,0 +1,1744 @@
+/*
+Copyright (c) 2012 Erik Aronesty
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <math.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <string.h>
+#include <errno.h>
+#include <stdarg.h>
+
+#include <gsl/gsl_randist.h>
+
+#include <sys/stat.h>
+
+#include <string>
+#include <queue>
+#include <list>
+
+#include <google/sparse_hash_map> // or sparse_hash_set, dense_hash_map, ...
+#include <google/dense_hash_map> // or sparse_hash_set, dense_hash_map, ...
+#include "tidx/tidx.h"
+
+#include "fastq-lib.h"
+
+// Numeric revision parsed out of the keyword string (the text after the first ':').
+#define SVNREV atoi(strchr("$Revision: 632 $", ':')+1)
+// Version string; printed together with SVNREV by the -V option.
+const char * VERSION = "0.9";
+
+// Value returned by PileupReads::MeanReadLen() when no reads have been collected.
+#define MIN_READ_LEN 20
+// NOTE(review): not referenced in the visible part of the file - presumably a default locus count.
+#define DEFAULT_LOCII 1000000
+
+using namespace std;
+using namespace google;
+
+void usage(FILE *f);
+
+// #define DEBUG 1
+
+// Zero-fill an object in place.
+#define meminit(l) (memset(&l,0,sizeof(l)))
+// debug(): printf-style trace to stderr, compiled out unless DEBUG is defined.
+#ifdef DEBUG
+    #define debug(s,...) fprintf(stderr,s,##__VA_ARGS__)
+#else
+    #define debug(s,...)
+#endif
+#undef warn
+// warn(): count the problem in `errs` and print the message to stderr.
+// Written as a single comma expression so that `if (x) warn(...);` guards
+// both the counter and the message (and may be followed by `else`).
+#define warn(s,...) (++errs, fprintf(stderr,s,##__VA_ARGS__))
+// die(): print the message to stderr and exit with status 1.
+#define die(s,...) (fprintf(stderr,s,##__VA_ARGS__), exit(1))
+// stat_out(): printf-style output to the stream named stat_fout.
+#define stat_out(s,...) fprintf(stat_fout,s,##__VA_ARGS__)
+// Sample standard deviation from a count, a sum and a sum of squares.
+#define stdev(cnt, sum, ssq) sqrt((((double)cnt)*ssq-pow((double)sum,2)) / ((double)cnt*((double)cnt-1)))
+// NOTE(review): replaces the library log10 for every later use in this file.
+#define log10(x) (log(x)/log(10))
+
+double quantile(const std::vector<int> &vec, double p);
+double quantile(const std::vector<double> &vec, double p);
+double pnorm(double x);
+double qnorm(double x);
+int rand_round(double x);
+
+// basic utils
+std::vector<char *> split(char* str, const char* delim);
+std::string string_format(const std::string &fmt, ...);
+void to_upper(const std::string str);
+void rename_tmp(std::string f);
+
+int errs=0;
+extern int optind;
+
+// Noise statistics recorded together with a depth.
+// Both constructors now initialize every member; the default constructor
+// used to leave qnoise and mnqual indeterminate.
+class Noise {
+public:
+	Noise() : noise(0), qnoise(0), depth(0), mnqual(0) {};
+	// d = depth, n = noise, q = qnoise, mq = mnqual
+	Noise(int d, double n, double q, double mq) : noise(n), qnoise(q), depth(d), mnqual(mq) {};
+	double noise;
+	double qnoise;
+	int depth;
+	double mnqual;
+};
+
+double quantile_depth(const vector<Noise> &vec, double p);
+
+// Ordering predicate for Noise records: larger depth first.
+bool noisebydepth (const Noise &a, const Noise &b) {
+	return b.depth < a.depth;
+}
+
+// Plain record describing one pileup entry; all members are public and
+// none is initialized by the class itself.
+class PileupEnt {
+public:
+	bool is_rev, is_start, f;
+	const char *b;
+	int q, m, p;
+};
+
+// Per-base call counters; every numeric member starts at zero.
+class vcall {
+public:
+    vcall() : base('\0'), is_ref(false), qual(0), fwd(0), rev(0), mq0(0), mn_qual(0),
+              qual_ssq(0), mq_sum(0), mq_ssq(0), tail_rev(0), tail_fwd(0) {}
+    char base;
+	bool is_ref;
+    int qual, fwd, rev, mq0, mn_qual, qual_ssq, mq_sum, mq_ssq, tail_rev, tail_fwd;
+	std::vector <std::string> seqs;
+    // number of observations: forward plus reverse
+    int depth() const {return fwd+rev;}
+    // root of (mq_ssq / depth), truncated to int; 0 when depth is 0
+    // (the unguarded integer division used to fault on an empty call)
+    int mq_rms() const {return depth() ? (int)sqrt((double)(mq_ssq/depth())) : 0;}
+    // root of (qual_ssq / depth), truncated to int; 0 when depth is 0
+    int qual_rms() const {return depth() ? (int)sqrt((double)(qual_ssq/depth())) : 0;}
+};
+
+// A call selected for output: points at its vcall (not owned) and carries
+// the indel sequence/count and padj, which starts at 1.
+class vfinal {
+public:
+    vfinal(vcall &c) {max_idl_cnt=0; padj=1; pcall = &c;};
+    // Member-wise assignment; returns *this (the return statement was missing,
+    // which left the result of an assignment expression undefined).
+    vfinal & operator=(vfinal const&x) {
+        max_idl_seq=x.max_idl_seq;
+        max_idl_cnt=x.max_idl_cnt;
+        padj=x.padj;
+        pcall=x.pcall;
+        return *this;
+    }
+    vcall *pcall;
+    std::string max_idl_seq;
+    int max_idl_cnt;
+    double padj;
+    // true when an indel count has been recorded
+    bool is_indel() const {return max_idl_cnt > 0;};
+};
+
+// Ordering predicate for calls: deeper call first.
+bool hitolocall (const vcall &i,const vcall &j) {
+    return j.depth() < i.depth();
+}
+// Ordering predicate for final calls: reference calls before non-reference
+// calls; within the same kind, deeper call first.
+bool sortreffirst (const vfinal &i,const vfinal &j) {
+    const bool i_ref = i.pcall->is_ref;
+    const bool j_ref = j.pcall->is_ref;
+    if (i_ref && !j_ref)
+        return true;
+    return (i_ref == j_ref) && (i.pcall->depth() > j.pcall->depth());
+}
+
+// One read: its mapping quality (0 until set) and its sequence.
+class Read {
+public:
+    Read() : MapQ(0) {};
+    int MapQ;
+    std::string Seq;
+};
+
+// Collection of reads plus their accumulated length.
+class PileupReads {
+public:
+    PileupReads() : TotReadLen(0) {}
+    // TotReadLen divided by the number of reads in ReadBin, or MIN_READ_LEN
+    // when ReadBin is empty.
+    // NOTE(review): the division is carried out on integers before the
+    // conversion to double, so the fraction is dropped - confirm intended.
+    double MeanReadLen() {
+        if (!ReadBin.size())
+            return MIN_READ_LEN;
+        return TotReadLen/ReadBin.size();
+    }
+    int TotReadLen;
+    deque<Read> ReadBin;
+    list<Read> ReadList;
+};
+
+// Summary of one pileup position.
+class PileupSummary {
+public:
+    string Chr;
+    int Pos;
+    char Base;
+    int Depth;
+    int TotQual;
+    int NumReads;
+    vector<vcall> Calls;
+
+    int SkipN;
+    int SkipDupReads;
+    int SkipMinMapq;
+    int SkipMinQual;
+    int MaxDepthByPos;
+    int RepeatCount;
+    char RepeatBase;
+
+	// Parsing constructor; defined outside this class body.
+	PileupSummary(char *line, PileupReads &reads);
+    // Empty summary: Base '\0', Pos -1 and every counter zero (the counters
+    // used to be left uninitialized).
+    PileupSummary() : Pos(-1), Base('\0'), Depth(0), TotQual(0), NumReads(0),
+                      SkipN(0), SkipDupReads(0), SkipMinMapq(0), SkipMinQual(0),
+                      MaxDepthByPos(0), RepeatCount(0), RepeatBase('\0') {};
+};
+
+// Abstract base for pileup consumers: Parse() turns one input line into a
+// PileupSummary and hands it to Visit(); Finish() is the second hook
+// implementations must provide.
+class PileupVisitor {
+    public:
+        char InputType;
+
+        string AnnotFile;       // path to file
+        tidx AnnotDex;          // start/stop index file
+        char AnnotType;         // b (bed) or g (gtf - preferred)
+        
+        PileupReads Reads;
+        // InputType and AnnotType both start as '\0' (AnnotType used to be left uninitialized)
+        PileupVisitor() : InputType('\0'), AnnotType('\0') {}
+        PileupVisitor(const char *a) : InputType('\0'), AnnotType('\0') {LoadIndex(a);}
+        // virtual, so that destroying a visitor through a base pointer is well defined
+        virtual ~PileupVisitor() {}
+		void Parse(char *dat) {PileupSummary p(dat, Reads); Visit(p);};
+        void LoadIndex(const char *a);
+		virtual void Visit(PileupSummary &dat)=0;
+		virtual void Finish()=0;
+};
+
+// Visitor that accumulates statistics; all totals start at zero and
+// Finish() does nothing.
+class VarStatVisitor : public PileupVisitor {
+    public:
+    VarStatVisitor() : PileupVisitor(), tot_depth(0), tot_locii(0), num_reads(0) {};
+
+    void Visit(PileupSummary &dat);
+    void Finish() {};
+
+    public:
+	double tot_depth;
+	int tot_locii;
+	int num_reads;
+	vector<Noise> stats;
+	vector<Noise> ins_stats;
+	vector<Noise> del_stats;
+};
+
+// Visitor that performs variant calling; keeps a private window (Win) of
+// summaries and public counters that all start at zero.
+class VarCallVisitor : public PileupVisitor {
+
+    deque<PileupSummary> Win;
+    void VisitX(PileupSummary &dat);
+
+    public:
+    int WinMax;
+    VarCallVisitor() : PileupVisitor(), WinMax(0), SkippedDepth(0), Locii(0), Hets(0), Homs(0) {};
+
+    void Visit(PileupSummary &dat);
+    void Finish();
+
+	int SkippedDepth;
+	int Locii;
+	int Hets;
+	int Homs;
+};
+
+// True when `file` can be stat()ed and its size is greater than zero.
+bool hasdata(const std::string &file) {
+	struct stat info;
+	const bool found = (stat(file.c_str(), &info) == 0);
+	return found && info.st_size > 0;
+}
+
+
+// File-level tuning parameters. Some are overwritten directly by option
+// parsing in main() (e.g. min_qual by -q, min_mapq by -Q, repeat_filter by -R,
+// artifact_filter by -D, pct_balance by -b, global_error_rate by -g,
+// total_locii by -L, no_baq by -B, debug_xchr/debug_xpos by -x).
+int minsampdepth=20;
+double pct_depth=0;
+double pct_qdepth=0;
+double global_error_rate=0;
+double max_phred;
+int total_locii=-1;
+double pct_balance=0;
+char *debug_xchr=NULL;
+int debug_xpos=0;
+int min_depth=1;
+int min_mapq=0;
+int min_qual=3;
+int repeat_filter=7;
+double artifact_filter=1;
+int min_adepth=2;
+// NOTE(review): declared int, so the .6 initializer is truncated to 0 -
+// presumably meant to be a double; confirm against the code that uses it.
+int read_tail_pct=.6;
+int read_tail_len=4;
+int min_idepth=3;
+int no_baq=0;
+double zygosity=.5;		        // set to .1 for 1 10% admixture, or even .05 for het/admix
+
+void parse_bams(PileupVisitor &v, int in_n, char **in, const char *ref);
+
+FILE *noise_f=NULL, *var_f = NULL, *varsum_f = NULL, *tgt_f = NULL, *tgtsum_f = NULL, *vcf_f = NULL, *eav_f=NULL;
+
+double alpha=.05;
+int phred=33;
+double phi(double x);
+
+// fopen() wrapper: returns the open stream, or reports the failure through
+// warn() and terminates the program with exit status 1.
+FILE *openordie(const char *path, const char *mode) {
+    FILE *handle = fopen(path, mode);
+    if (handle) {
+        return handle;
+    }
+    warn("Can't open-%s %s: %s\n", mode, path, strerror(errno));
+    exit(1);
+}
+
+// Entry point: parse options, then either collect noise/depth statistics (-s)
+// or call variants (-v) over the input BAMs / pileup.
+//
+// Exactly one of -s / -v must be given.  With -o PREFIX (only valid with -v) every
+// output is written to PREFIX.<kind>.tmp and renamed once the run completes;
+// otherwise variants go to stdout and the summary to stderr.
+// Returns 0 on success, 1 on a usage error; fatal I/O errors exit(1) directly.
+int main(int argc, char **argv) {
+	// getopt_long() returns int; storing it in a plain char makes the "!= -1" test
+	// never succeed on platforms where char is unsigned.
+	int c;
+	const char *noiseout=NULL;
+	const char *ref=NULL;
+	optind = 0;
+	// user-supplied overrides; 0 means "not given"
+	int umindepth=0;
+	int uminadepth=0;
+	int uminidepth=0;
+	double upctqdepth=0;
+	int do_stats=0;
+	int do_varcall=0;
+
+    char *out_prefix = NULL;
+    char *target_annot = NULL;
+    char *read_stats = NULL;
+
+	// NOTE(review): the option string has no "A:", so the 'A' case below can never be
+	// reached and target_annot always stays NULL -- confirm whether -A should be enabled.
+	while ( (c = getopt_long(argc, argv, "?svVBhe:m:N:x:f:p:a:g:q:Q:i:o:D:R:b:L:S:",NULL,NULL)) != -1) {
+		switch (c) {
+			case 'h': usage(stdout); return 0;
+			case 'm': umindepth=atoi(optarg); break;
+			case 'q': min_qual=atoi(optarg); break;
+			case 'o': out_prefix=optarg; break;
+			case 'Q': min_mapq=atoi(optarg); break;
+			case 'V': printf("Version: %s.%d\n", VERSION, SVNREV); exit(0); break;
+			case 'R': repeat_filter=atoi(optarg); break;
+			case 'A': target_annot=optarg; break;
+			case 'a': uminadepth=atoi(optarg);break;
+			case 'D': artifact_filter=atof(optarg);break;
+			case 'i': uminidepth=atoi(optarg);break;
+			case 'x': {
+					// -x chr:pos  -- split in place at the last ':'
+					debug_xchr=optarg;
+					char *p=strrchr(debug_xchr, ':');
+					if (!p) die("Invalid param for -x");
+					*p='\0';
+					++p;
+					// the pointer itself can never be NULL here; test the text after the colon
+					if (!*p) die("Invalid param for -x, need pos");
+					debug_xpos=atoi(p);
+					break;
+				}
+			case 'b': pct_balance=atof(optarg)/100.0; break;
+			case 'B': no_baq=1; break;
+			case 'p': upctqdepth=atof(optarg); break;
+			case 'e': alpha=atof(optarg); break;
+			case 'g': global_error_rate=atof(optarg); break;
+			case 'L': total_locii=atoi(optarg); break;
+			case 'f': ref=optarg; break;
+			case 'N': noiseout=optarg; break;
+			case 's': do_stats=1; break;
+			case 'S': read_stats=optarg; break;
+			case 'v': do_varcall=1; break;
+			case '?':
+					  if (!optopt) {
+						  usage(stdout); return 0;
+					  } else if (optopt && strchr("ox", optopt))
+						  fprintf (stderr, "Option -%c requires an argument.\n", optopt);
+					  else if (isprint(optopt))
+						  fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+					  else
+						  fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt);
+					  usage(stderr);
+					  return 1;
+		}
+	}
+
+
+	// exactly one of -s / -v
+	if ((!do_stats && !do_varcall) || (do_stats && do_varcall)) {
+		warn("Specify -s for stats only, or -v to do variant calling\n\n");
+		usage(stderr);
+		return 1;
+	}
+
+    if (out_prefix) {
+        if (!do_varcall) {
+            warn("Specify -o with -v only\n\n");
+            usage(stderr);
+            return 1;
+        }
+
+        // write to *.tmp first; the files are renamed at the very end of the run
+        var_f = openordie(string_format("%s.var.tmp", out_prefix).c_str(), "w");
+        vcf_f = openordie(string_format("%s.vcf.tmp", out_prefix).c_str(), "w");
+        eav_f = openordie(string_format("%s.eav.tmp", out_prefix).c_str(), "w");
+        noise_f = openordie(string_format("%s.noise.tmp", out_prefix).c_str(), "w");
+        varsum_f = openordie(string_format("%s.varsum.tmp", out_prefix).c_str(), "w");
+        if (target_annot) {
+            tgt_f = openordie(string_format("%s.tgt.tmp", out_prefix).c_str(), "w");
+            tgtsum_f = openordie(string_format("%s.tgtsum.tmp", out_prefix).c_str(), "w");
+        }
+    } else {
+        var_f = stdout;
+        varsum_f = stderr;
+    } 
+
+	if (umindepth > minsampdepth) {
+		minsampdepth=umindepth;
+	}
+
+	if (noiseout) {
+		noise_f = fopen(noiseout, "w");
+		if (!noise_f) {
+			warn("Can't write %s: %s\n", noiseout, strerror(errno));
+			exit(1);
+		}
+	}
+
+	// set argv to '-' if stdin
+	const char *stdv[3] = {argv[0],"-",NULL};
+	if (!argv[optind]) {
+		argc=2;
+		argv = (char **) stdv;
+		optind=1;
+	}
+
+	char **in=&argv[optind];
+	int in_n = argc-optind;
+
+    // not really random
+    srand(1);
+
+    max_phred = -log10(global_error_rate)*10;
+
+	if (do_stats) {
+		FILE *stat_fout=stdout;			// stats to stdout
+
+		if (do_varcall) 				// unless varcalling at the same time
+			stat_fout=stderr;
+
+		VarStatVisitor vstat;
+
+		parse_bams(vstat, in_n, in, ref);
+
+		stat_out("version\tvarcall-%s.%d\n", VERSION, SVNREV);
+        stat_out("min depth\t%d\n", minsampdepth);
+        stat_out("alpha\t%f\n", alpha);
+
+        if (vstat.stats.size()) {
+            // sort by depth descending
+            sort(vstat.stats.begin(), vstat.stats.end(), noisebydepth);
+
+            // flip 3 and 1 because sorted in descending order for sampling (above)
+            double depth_q3=quantile_depth(vstat.stats, .25);
+            double depth_q2=quantile_depth(vstat.stats, .50);
+            double depth_q1=quantile_depth(vstat.stats, .75);
+            double depth_qx=quantile_depth(vstat.stats, .95);
+
+            // number of locii to compute error rate
+            int ncnt=min(100000,vstat.stats.size());
+
+            int i;
+            double nsum=0, nssq=0, dsum=0, dmin=vstat.stats[0].depth, qnsum=0, qnssq=0, qualsum=0;
+
+            double ins_nsum=0, ins_nssq=0, del_nsum=0, del_nssq=0;
+            // accumulate sums / sums of squares over the first ncnt loci, ignoring those below depth_q1
+            // NOTE(review): ins_stats / del_stats are indexed with the same i as stats -- confirm they are the same length
+            for (i=0;i<ncnt;++i) {
+                if (vstat.stats[i].depth < depth_q1) {
+                    continue;
+                }
+                nsum+=vstat.stats[i].noise;
+                nssq+=vstat.stats[i].noise*vstat.stats[i].noise;
+                dsum+=vstat.stats[i].depth;
+                qnsum+=vstat.stats[i].qnoise;
+                qnssq+=vstat.stats[i].qnoise*vstat.stats[i].qnoise;
+                qualsum+=vstat.stats[i].mnqual;
+                if (vstat.stats[i].depth < dmin) dmin = vstat.stats[i].depth;
+                ins_nsum+=vstat.ins_stats[i].noise;
+                ins_nssq+=vstat.ins_stats[i].noise*vstat.ins_stats[i].noise;
+                del_nsum+=vstat.del_stats[i].noise;
+                del_nssq+=vstat.del_stats[i].noise*vstat.del_stats[i].noise;
+            }
+
+            double noise_mean =nsum/ncnt;
+            double noise_dev = stdev(ncnt, nsum, nssq);
+            double qnoise_mean =qnsum/ncnt;
+            double qnoise_dev = stdev(ncnt, qnsum, qnssq);
+            double qual_mean = qualsum/ncnt;
+            double ins_noise_mean =ins_nsum/ncnt;
+            double ins_noise_dev = stdev(ncnt, ins_nsum, ins_nssq);
+            double del_noise_mean =del_nsum/ncnt;
+            double del_noise_dev = stdev(ncnt, del_nsum, del_nssq);
+
+            stat_out("qual mean\t%.4f\n", qual_mean);
+            stat_out("noise mean\t%.6f\n", noise_mean);
+            stat_out("noise dev\t%.6f\n", noise_dev);
+            stat_out("qnoise mean\t%.6f\n", qnoise_mean);
+            stat_out("qnoise dev\t%.6f\n", qnoise_dev);
+            stat_out("ins freq\t%.6f\n", ins_noise_mean);
+            stat_out("ins freq dev\t%.6f\n", ins_noise_dev);
+            stat_out("del freq\t%.6f\n", del_noise_mean);
+            stat_out("del freq dev\t%.6f\n", del_noise_dev);
+
+            if (qnoise_mean >= noise_mean ) {
+                stat_out("error\tpoor quality estimates\n");
+            }
+
+            stat_out("noise depth mean\t%.4f\n", dsum/ncnt);
+            stat_out("noise depth min\t%.4f\n", dmin);
+            stat_out("noise cnt\t%d\n", ncnt);
+
+            stat_out("depth q1\t%.4f\n", depth_q1);
+            stat_out("depth median\t%.4f\n", depth_q2);
+            stat_out("depth q3\t%.4f\n", depth_q3);
+
+            dsum=0;
+            for (i=0;i<vstat.stats.size();++i) {
+                dsum+=vstat.stats[i].depth;
+            }
+
+            int locii_gtmin=0;
+            for (i=0;i<vstat.stats.size();++i) {
+                if (vstat.stats[i].depth >= min_depth) {
+                    ++locii_gtmin;
+                }
+            }
+            stat_out("locii >= min depth\t%d\n", locii_gtmin);
+            stat_out("locii\t%d\n", vstat.tot_locii);
+
+            // alpha is divided by the number of qualifying loci before the two-sided quantile lookup
+            double stdevfrommean=-qnorm((alpha/locii_gtmin)/2);
+            stat_out("qnorm adj\t%f\n", stdevfrommean);
+
+            pct_qdepth=qnoise_mean+qnoise_dev*stdevfrommean;
+            stat_out("min pct qual\t%.4f\n", 100*pct_qdepth);
+        }
+	}
+
+    // -S: read "name<TAB>value" lines (the format written by the stats pass above) and use
+    // them as defaults for anything the user did not set explicitly
+    if (read_stats){
+        FILE * f = fopen(read_stats, "r");
+        if (!f) {
+            warn("File %s does not exist, quitting\n", read_stats);
+            exit(1);
+        }
+        line l; meminit(l);
+        char *val;
+        while(read_line(f, l)>0) {
+            if ((val=strchr(l.s, '\t')) != NULL) {
+                *val='\0'; ++val;
+                if (!strcasecmp(l.s, "min depth")) {
+                    if (umindepth && umindepth > atoi(val)) {
+			            fprintf(varsum_f,"warning\tsampling depth was less than variation depth\n");
+                    }
+                    if (!umindepth) umindepth=atoi(val); 
+                } else if (!strcasecmp(l.s, "min pct qual")) {
+                    if (upctqdepth<=0) upctqdepth=atof(val); 
+                } else if (!strcasecmp(l.s, "noise mean")) {
+                    if (global_error_rate<=0) global_error_rate=atof(val); 
+                } else if (!strcasecmp(l.s, "locii >= min depth")) {
+                    if (total_locii<0) total_locii=atoi(val); 
+                } else if (!strcasecmp(l.s, "alpha")) {
+                    if (alpha<=0) alpha=atof(val); 
+                }
+            }
+        }
+        // the handle was previously left open for the rest of the run
+        fclose(f);
+    }
+
+    if (total_locii<0) total_locii=DEFAULT_LOCII;
+    if (total_locii==0) total_locii=1;          // no adjustment
+
+    if (eav_f) {
+        fprintf(eav_f,"chr\tpos\tref\tdepth\tnum_states\ttop_consensus\ttop_freq\tvar_base\tvar_depth\tvar_qual\tvar_strands\tforward_strands\treverse_strands\t%cval\n",total_locii>1?'e':'p');
+    }
+
+	if (do_varcall) {
+		// explicit user values win over built-in defaults / values read with -S
+		if (umindepth) min_depth=umindepth;
+		if (upctqdepth > 0) pct_qdepth=(double)upctqdepth/100;
+		if (uminadepth) min_adepth=uminadepth;
+		if (uminidepth) min_idepth=uminidepth;
+
+		if (!min_depth || (!pct_depth  && !pct_qdepth)) {
+			fprintf(varsum_f,"warning\toutputting all variations, no minimum depths specified\n");
+		}
+
+		fprintf(varsum_f,"version\tvarcall-%s.%d\n", VERSION, SVNREV);
+		fprintf(varsum_f,"min depth\t%d\n", min_depth);
+		fprintf(varsum_f,"min call depth\t%d\n", min_adepth);
+		fprintf(varsum_f,"alpha\t%f\n", alpha);
+		fprintf(varsum_f,"min pct qual\t%d\n", (int)(100*pct_qdepth));
+
+		fprintf(varsum_f,"min balance\t%d\n", (int)(100*pct_balance));
+		fprintf(varsum_f,"artifact filter\t%f\n", artifact_filter);
+		fprintf(varsum_f,"min qual\t%d\n", min_qual);
+		fprintf(varsum_f,"min map qual\t%d\n", min_mapq);
+		fprintf(varsum_f,"error rate\t%f\n", global_error_rate);
+		fprintf(varsum_f,"locii used for adjustment\t%d\n", total_locii);
+
+		VarCallVisitor vcall;
+
+        // the window must hold repeat_filter loci on each side of the midpoint
+        if (repeat_filter > 0) {
+		    fprintf(varsum_f,"homopolymer filter\t%d\n", repeat_filter);
+            vcall.WinMax=repeat_filter+repeat_filter+3;
+        } else {
+            vcall.WinMax=5;
+        }
+
+        if (vcf_f) {
+            // print VCF header
+            fprintf(vcf_f, "%s\n", "##fileformat=VCFv4.1");
+        }
+
+		parse_bams(vcall, in_n, in, ref);
+
+        if (vcall.InputType == 'B') {
+        	fprintf(varsum_f,"baq correct\t%s\n", (no_baq?"no":"yes"));
+        }
+        fprintf(varsum_f,"locii\t%d\n", vcall.Locii);
+        fprintf(varsum_f,"hom calls\t%d\n", vcall.Homs);
+        fprintf(varsum_f,"het calls\t%d\n", vcall.Hets);
+        fprintf(varsum_f,"locii below depth\t%d\n", vcall.SkippedDepth);
+
+        if (out_prefix) {
+            // close everything, then drop the .tmp suffix so readers only ever see complete files
+            fclose(var_f);
+            fclose(vcf_f);
+            fclose(eav_f);
+            fclose(noise_f);
+            fclose(varsum_f);
+            if (target_annot) {
+                fclose(tgt_f);
+                fclose(tgtsum_f);
+            }
+            rename_tmp(string_format("%s.var.tmp", out_prefix));
+            rename_tmp(string_format("%s.vcf.tmp", out_prefix));
+            rename_tmp(string_format("%s.eav.tmp", out_prefix));
+            rename_tmp(string_format("%s.noise.tmp", out_prefix));
+            rename_tmp(string_format("%s.varsum.tmp", out_prefix));
+            if (target_annot) {
+                rename_tmp(string_format("%s.tgt.tmp", out_prefix));
+                rename_tmp(string_format("%s.tgtsum.tmp", out_prefix));
+            }
+        }
+	}
+
+	return 0;
+}
+
+// Rename "<name>.tmp" to "<name>".
+//
+// Only a trailing ".tmp" is removed, so a prefix that itself contains ".tmp"
+// (e.g. "run.tmp.d/out.var.tmp") is handled correctly.  A name that does not end
+// in ".tmp" is left untouched; the old size_t ">= 0" test was always true and let
+// std::string::replace() throw std::out_of_range on such names.
+// A failed rename() is reported on stderr instead of being silently ignored.
+void rename_tmp(std::string f) {
+    static const std::string suffix(".tmp");
+
+    if (f.size() < suffix.size())
+        return;
+    if (f.compare(f.size() - suffix.size(), suffix.size(), suffix) != 0)
+        return;
+
+    std::string notmp = f.substr(0, f.size() - suffix.size());
+    if (rename(f.c_str(), notmp.c_str()) != 0) {
+        fprintf(stderr, "Can't rename %s to %s: %s\n", f.c_str(), notmp.c_str(), strerror(errno));
+    }
+}
+
+// normal distribution
+// Approximate quantile function of the standard normal distribution.
+// The argument is first flipped to 1-q and the tail probability min(q', 1-q')
+// is pushed through a rational approximation; the sign is restored at the end.
+// qnorm(.5) is exactly 0.
+double qnorm(double q) {
+     if (q == .5) {
+          return 0;
+     }
+
+     // coefficients of the rational approximation
+     const double c0 = 2.515517, c1 = 0.802853, c2 = 0.010328;
+     const double d1 = 1.432788, d2 = 0.189269, d3 = 0.001308;
+
+     q = 1.0 - q;
+
+     // always work on the smaller tail
+     double tail;
+     if (q > 0.0 && q < 0.5) {
+          tail = q;
+     } else {
+          tail = 1.0 - q;
+     }
+
+     const double t = sqrt(log(1.0 / pow(tail, 2.0)));
+
+     const double numer = c0 + c1 * t + c2 * pow(t, 2.0);
+     const double denom = 1.0 + d1 * t + d2 * pow(t, 2.0) + d3 * pow(t, 3.0);
+     double z = t - numer / denom;
+
+     if (q > .5) {
+          z *= -1.0;
+     }
+     return z;
+}
+
+// Approximate cumulative distribution function of the standard normal,
+// computed as 0.5*(1 + erf(x/sqrt(2))) with a polynomial approximation of erf
+// (A&S formula 7.1.26).
+double pnorm(double x)
+{
+    // polynomial coefficients
+    const double a1 =  0.254829592;
+    const double a2 = -0.284496736;
+    const double a3 =  1.421413741;
+    const double a4 = -1.453152027;
+    const double a5 =  1.061405429;
+    const double p  =  0.3275911;
+
+    // remember which side of zero we are on, then work with |x|/sqrt(2)
+    const int sign = (x < 0) ? -1 : 1;
+    const double z = fabs(x)/sqrt(2.0);
+
+    const double t = 1.0/(1.0 + p*z);
+    const double poly = (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t;
+    const double y = 1.0 - poly*exp(-z*z);
+
+    return 0.5*(1.0 + sign*y);
+}
+
+// Stream pileup lines into the visitor `v`.
+//
+// `in` holds `in_n` input names.  If every name has a ".bam" extension the files are
+// piped through "samtools mpileup" (a reference `ref` is then required, and its .fai
+// index is built with "samtools faidx" when missing).  Otherwise exactly one pileup
+// input is accepted: "-" for stdin, a ".gz" file read through "gunzip -c", or a plain file.
+// Sets v.InputType to 'B' or 'P', calls v.Parse() per line and v.Finish() at the end.
+// Any error, or an input with no lines, terminates the process with exit(1).
+void parse_bams(PileupVisitor &v, int in_n, char **in, const char *ref) {
+
+	if (in_n == 0) {
+		warn("No input files, quitting\n");
+		exit(1);
+	}
+
+	// how many inputs look like BAM files?
+	int n_bam = 0;
+	for (int k = 0; k < in_n; ++k) {
+		if (strcmp(fext(in[k]), ".bam") == 0)
+			n_bam += 1;
+	}
+
+	if (n_bam == in_n) {
+		warn("input\t%d bam\n", n_bam);
+        v.InputType='B';
+	} else if (n_bam > 0) {
+		warn("Can't mix bams and other input files\n");
+		exit(1);
+	} else if (in_n > 1) {
+		warn("Can't handle multiple pileups... TODO\n");
+		exit(1);
+	} else {
+		warn("input\t%d pileup\n", in_n);
+        v.InputType='P';
+	}
+
+	bool piped = false;			// true when the stream came from popen()
+	FILE *src;
+
+	if (n_bam) {
+        if (!ref) {
+            warn("Need a reference file (-f) parameter, try -h for help\n");
+            exit(1);
+        }
+
+        // build the reference index on demand
+        if (!hasdata(string(ref)+".fai")) {
+            if (system(string_format("samtools faidx '%s'", ref).c_str())) {
+                warn("Need a %s.fai file, run samtools faidx\n", ref);
+                exit(1);
+            }
+        }
+
+		const char *baq_flag = "";
+		if (no_baq)
+			baq_flag = "-B";
+
+		string cmd = string_format("samtools mpileup -Q 0 -d 100000 %s -f '%s'", baq_flag, ref);
+
+		// append each input as a single-quoted argument
+		for (int k = 0; k < in_n; ++k) {
+			cmd += string(" '") + in[k] + "' ";
+		}
+
+		warn("command\t%s\n", cmd.c_str());
+
+		src = popen(cmd.c_str(), "r");
+		if (!src) 
+			exit(1);
+
+		piped = true;
+	} else if (strcmp(in[0], "-") == 0) {
+        src=stdin;
+	} else {
+        const bool gzipped = (strcmp(fext(in[0]), ".gz") == 0);
+        if (gzipped) {
+            string unzip_cmd = string_format("gunzip -c '%s'", in[0]);
+            src = popen(unzip_cmd.c_str(), "r");
+            piped = true;
+        } else {
+            src = fopen(in[0], "r");
+        }
+        if (!src) {
+            warn("%s: %s", in[0], strerror(errno));
+            exit(1);
+        }
+	}
+
+    line l; meminit(l);
+	int n_lines=0;
+    if (src) {
+        // each line looks like:
+        //	chr      2       G       6       ^9,^+.^*,^2,^&.^&,      &.'&*-  9+*2&&  166,552,643,201,299,321
+        while (read_line(src, l) > 0) {
+            v.Parse(l.s);
+            n_lines++;
+        }
+        v.Finish();
+
+        if (piped)
+            pclose(src);
+        else
+            fclose(src);
+    }
+
+	if (n_lines == 0) {
+		warn("No data in pileup, quitting\n");
+		exit(1);
+	}
+}
+
+// Indices of the per-locus call slots.
+#define T_A 0
+#define T_C 1
+#define T_G 2
+#define T_T 3
+#define T_SDEL 4
+#define T_NDEL 5
+#define T_INS 6
+#define T_N 7
+// base character -> slot index: A/C/G/T are case-insensitive, '*' -> 4, '-' -> 5, '+' -> 6, anything else -> 7
+#define b2i(c) ((c)=='A'?0:(c)=='a'?0:(c)=='C'?1:(c)=='c'?1:(c)=='G'?2:(c)=='g'?2:(c)=='T'?3:(c)=='t'?3:(c)=='*'?4:(c)=='-'?5:(c)=='+'?6:7)
+// slot index -> base character ('?' for anything outside 0..6).
+// The argument is parenthesised so that expression arguments with low-precedence operators expand correctly.
+#define i2b(i) ((i)==0?'A':(i)==1?'C':(i)==2?'G':(i)==3?'T':(i)==4?'*':(i)==5?'-':(i)==6?'+':'?')
+
+// Comparator ordering ints from high to low.
+bool hitoloint (int i,int j) { return (i>j);}
+
+int track_readlen[10000];
+
+
+
+// Parse one tab-separated pileup line (chr, pos, ref base, depth, read bases, base quals)
+// into per-base call tallies.
+//
+// `rds` carries state between consecutive lines: ReadList holds the reads currently
+// covering the locus (one entry per pileup column, in order), and ReadBin/TotReadLen
+// keep a bounded history of finished reads that MeanReadLen() is called on.
+// `line` is split in place.  Fewer than 6 columns, or trailing text that cannot be
+// parsed, terminates the process with exit(1).
+// On return Depth and TotQual only count the calls that survived the filters
+// (slots 0..4, i.e. inserts are not included).
+PileupSummary::PileupSummary(char *line, PileupReads &rds) {
+
+	vector<char *> d=split(line, "\t");
+
+	if (d.size() < 6) {
+		warn("Can't read pileup : %d fields, need 6 columns\n", (int) d.size());
+		exit(1);
+	}
+
+	const char * p_qual=d[5];
+
+	Chr=d[0];
+	Pos=atoi(d[1]);
+	Base=*(d[2]);
+	Depth = atoi(d[3]);
+	SkipDupReads = 0;
+	SkipN = 0;
+	SkipMinQual = 0;
+	SkipMinMapq = 0;
+	MaxDepthByPos = 0;
+	RepeatCount = 0;
+	RepeatBase = '\0';
+	NumReads = 0;
+
+	int i;
+	vector<int> depthbypos;			// number of reads at this locus, indexed by position within the read
+
+	const char *cur_p = d[4];
+
+    list<Read>::iterator read_i = rds.ReadList.begin();
+
+    int eor=0;						// reads that ended ('$') at this locus
+	for (i=0;i<Depth;++i,++read_i) {
+		bool sor=0;					// start of read at this column?
+		
+		if (*cur_p == '^') {
+			// '^' is followed by the mapping-quality character of the new read
+			sor=1;
+			++cur_p;
+            Read x;
+            x.MapQ = *cur_p-phred;
+            ++cur_p;
+            if (read_i != rds.ReadList.end()) {
+                ++read_i;
+            }
+            read_i=rds.ReadList.insert(read_i,x);
+		}
+
+        if (read_i == rds.ReadList.end()) {
+            warn("warning\tread start without '^', partial pileup: '%s'\n", cur_p);
+            Read x;
+            x.MapQ = -1;
+            read_i=rds.ReadList.insert(read_i,x);
+        }
+
+        // 1-based position of this base within its read
+        int pia = read_i->Seq.length()+1;
+		if (pia >= depthbypos.size()) {
+			depthbypos.resize(pia+1);
+		}
+		depthbypos[pia]++;
+
+
+		if (sor) 
+			++NumReads;
+
+		// NOTE(review): p_qual is indexed up to Depth-1 without checking its length -- confirm the input guarantees this
+		char q = p_qual[i]-phred;				// qual char
+		char mq = read_i->MapQ;
+		char o = *cur_p;				// orig call
+		char c = toupper(o);			// uppercase/ref 
+		bool is_ref = 0;
+
+		if (o == '.' || o == ',') {	
+			c = Base;					// ref instead
+			is_ref = 1;
+		}
+
+		if (o == '>' || o == '<') {	
+			c = 'N';					// no call
+			is_ref = 1;
+		}
+
+		bool skip = 0;
+
+        // probably should not be adding anything here... but the old code added 1 and floored... new code adds .5 and rounds... which is comparable
+        // really.. should just be adding zero, the reason the old code had it was because of a lack of max()
+		if (c == 'N') {
+			++SkipN;
+			skip=1;
+		} else if (artifact_filter > 0 && (depthbypos[pia] > max(1,rand_round(0.5+artifact_filter * (Depth/rds.MeanReadLen()))))) {
+			++SkipDupReads;
+			skip=1;
+		} else if (mq < min_mapq) {
+			++SkipMinMapq;
+			skip=1;
+		} else if (q < min_qual) {
+			++SkipMinQual;
+			skip=1;
+		} else {
+			int j = b2i(c);
+			// grow the call table on demand, labelling any new slots
+			if (j >= Calls.size()) {
+				int was = Calls.size();
+				Calls.resize(j+1);
+				int t; for (t=was;t<=j;++t) {
+					Calls[t].base=i2b(t);
+				}
+			}
+			if (is_ref) 
+				Calls[j].is_ref = 1;
+
+			// lower-case letters and ',' are counted as reverse strand
+			if ( o == ',' || o == 'a' || o == 'c' || o == 't' || o == 'g' ) {
+				++Calls[j].rev;
+			} else if ( c != 'N' ) {
+				++Calls[j].fwd;
+			}
+
+			Calls[j].qual+=q;
+            Calls[j].mn_qual+=min(mq,q);
+            Calls[j].mq_ssq+=mq*mq;
+            Calls[j].mq_sum+=mq;
+            Calls[j].qual_ssq+=q*q;
+/*
+            if (pia <= read_tail_len || (rds.MeanReadLen()-pia) <= read_tail_len) {
+                if ( o == ',' || o == 'a' || o == 'c' || o == 't' || o == 'g' ) {
+                    ++Calls[j].tail_rev;
+                } else {
+                    ++Calls[j].tail_fwd;
+                }
+            }
+*/
+
+            if (vcf_f) {
+                if (mq == 0) 
+                    Calls[j].mq0++; 
+            }
+		}
+
+		if (c == '-' || c == '+') {
+            warn("invalid pileup, at '%s', indel not attached to read?\n", cur_p);
+		} else {
+		    if (c != '*') 
+                read_i->Seq += c;
+			++cur_p;
+		}
+
+        // an indel attached to this read: [+-]<len><sequence>
+        if (*cur_p == '+' || *cur_p == '-') {
+            c = *cur_p;
+            char *end_p;
+            int len = strtol(++cur_p, &end_p, 10);
+            string ins_seq(end_p, len);
+            to_upper(ins_seq);
+            read_i->Seq += ins_seq;
+            if (!skip) {
+                int j = b2i(c);
+                if (j >= Calls.size()) {
+                    int was = Calls.size();
+                    Calls.resize(j+1);
+                    int t; for (t=was;t<=j;++t) {
+                        Calls[t].base=i2b(t);
+                    }
+                }
+                if ( o == ',' || o == 'a' || o == 'c' || o == 't' || o == 'g' ) {
+                    ++Calls[j].rev;
+                } else {
+                    ++Calls[j].fwd;
+                }
+                Calls[j].qual+=q;
+                Calls[j].mn_qual+=min(q, mq);
+                Calls[j].qual_ssq+=q*q;
+                Calls[j].mq_ssq+=mq*mq;
+                Calls[j].mq_sum+=mq;
+                Calls[j].seqs.push_back(ins_seq);
+            }
+            cur_p=end_p+len;
+        }
+
+        // '$' ends the read: move it from the active list to the bounded history
+        if (*cur_p == '$') {
+            if (read_i->MapQ > -1) {
+                rds.TotReadLen+=read_i->Seq.size();
+                rds.ReadBin.push_back(*read_i);
+                if (rds.ReadBin.size() > min(1000,Depth*2)) {
+                    // subtract the length of the read being evicted *before* popping it;
+                    // doing it afterwards subtracted the next read's length instead
+                    rds.TotReadLen-=rds.ReadBin.front().Seq.size();
+                    rds.ReadBin.pop_front();
+                }
+            }
+//            printf("%d\t%s\n", read_i->MapQ, read_i->Seq.c_str());
+            read_i=rds.ReadList.erase(read_i);
+            --read_i;				// the loop's ++read_i brings us back to the element after the erased one
+            ++cur_p;
+            ++eor;
+        }
+	}
+
+    if ((Depth-eor) != rds.ReadList.size()) {
+        warn("warning\tdepth is %d, but read list is: %d\n", Depth, (int) rds.ReadList.size());
+    }
+
+	// tolerate one trailing indel that was not consumed inside the loop
+	if (*cur_p == '-' || *cur_p == '+') {
+		char *end_p;
+		int len = strtol(++cur_p, &end_p, 10);
+		// keep this
+		string idl(end_p, len);
+		cur_p=end_p+len;
+	}
+
+	if (*cur_p) {
+		warn("Failed to parse pileup %s\n", d[4]);
+		exit(1);
+	}
+
+	for (i=0;i<depthbypos.size();++i) {
+		if (depthbypos[i] > MaxDepthByPos) {
+			MaxDepthByPos = depthbypos[i];
+		}
+	}
+
+	Depth=0;
+	for (i=0;i<5 && i < Calls.size();++i) {		// total depth (exclude inserts for tot depth, otherwise they are double-counted)
+		Depth+=Calls[i].depth();
+	}
+
+
+	TotQual=0;
+	for (i=0;i<5 && i < Calls.size();++i) {		// total quality over the same slots as the depth above
+		TotQual+=Calls[i].qual;
+	}
+}
+
+// Scratch summary reused for the placeholder entries pushed into the window by Visit().
+PileupSummary JunkSummary;
+
+// Push locus `p` through a sliding window of WinMax loci and hand the locus at the
+// window midpoint to VisitX(), after filling in its RepeatCount/RepeatBase from the
+// neighbouring reference bases and adjusting the quality of '-' calls from the next locus.
+//
+// Placeholder bases: '-' marks a locus missing from a short gap, '@' marks "nothing"
+// (initial padding, or the flush before a large jump / new chromosome).  Placeholders
+// are never passed to VisitX().  With WinMax < 3 the window is bypassed entirely.
+void VarCallVisitor::Visit(PileupSummary &p) {
+    if (WinMax < 3) {
+        // no real window ... just go straight
+        VisitX(p);
+        return;
+    }
+
+    if (p.Base != '-' && p.Base != '@') {
+        if (Win.size() && (Win.back().Pos != (p.Pos - 1) )) {
+            if (Win.back().Pos < p.Pos && ((p.Pos - Win.back().Pos) <= (WinMax/2))) {
+                // short gap: fill it with '-' placeholders so positions stay contiguous
+                while (Win.back().Pos < (p.Pos - 1)) {
+                    // visit/pop, add a placeholder
+                    JunkSummary.Base = '-';
+                    JunkSummary.Pos = Win.back().Pos + 1;
+                    Visit(JunkSummary);
+                }
+            } else {
+                // large gap or position went backwards: flush until the midpoint is empty
+                while (Win.size() && Win[WinMax/2].Base != '@') {
+                    // visit/pop, but don't add anything, until it's empty
+                    JunkSummary.Base = '@';
+                    JunkSummary.Pos = 0;
+                    Visit(JunkSummary);
+                }
+            }
+        }
+    }
+
+    // initialize the window with nothing, if it's not full
+    while (Win.size() < WinMax) {
+        JunkSummary.Base = '@';
+        JunkSummary.Pos = 0;
+        Win.push_back(JunkSummary);
+    }
+
+    Win.push_back(p);
+
+    //debug("Visit: %d\n", p.Pos);
+
+    if (Win.size() > WinMax)        // queue too big?  pop
+        Win.pop_front();
+    
+    int i;
+    int lrc=0,rrc=0;                // left repeat count, right repeat count
+    // initialised so the comparison below never reads an indeterminate value
+    char lrb='\0', rrb='\0';        // left repeat base...
+    int vx;
+
+    if (Win.size() < WinMax) {    // small window?  look at leading edge only
+        return;
+    } else {
+        vx = WinMax/2;              // larger window? look at midpoint
+    }
+
+    if (Win[vx].Base == '-' || Win[vx].Base == '@') 
+        return;
+
+    // Only Win[vx-1] / Win[vx+1] have to exist; the counting loops handle short runs
+    // themselves.  (For the window sizes main() configures, >= 5, this is the same as before.)
+    if (vx >= 1) {                  // look left
+        lrb = Win[vx-1].Base;
+        for (i=vx-2; i >= 0; --i) { // increment repeat count
+            if (Win[i].Base == lrb) 
+                ++lrc;
+            else 
+                break;
+        }
+    }
+    if (vx+1 < (int) Win.size()) {  // look right
+        rrb = Win[vx+1].Base;
+        for (i=vx+2; i < Win.size(); ++i) {
+            if (Win[i].Base == rrb)
+                ++rrc;
+            else
+                break;
+        }
+    }
+
+    // repeat counts are now 1-based, not 0-based
+    ++lrc;
+    ++rrc;
+
+    // maximum repeat count and associated base
+    if (lrb == rrb ) {
+        Win[vx].RepeatCount = lrc+rrc;
+        Win[vx].RepeatBase = lrb;
+    } else if (lrc > rrc) {
+        Win[vx].RepeatCount = lrc;
+        Win[vx].RepeatBase = lrb;
+    } else {
+        Win[vx].RepeatCount = rrc;
+        Win[vx].RepeatBase = rrb;
+    }
+
+	if (debug_xpos) {
+        if (Win[vx].Pos == debug_xpos && !strcmp(debug_xchr,Win[vx].Chr.data())) {
+            fprintf(stderr,"xpos-window\t");
+            for (i=0;i<Win.size();++i) {
+                fprintf(stderr,"%c", Win[i].Base);
+            }
+            fprintf(stderr,"\n");
+        }
+    }
+
+    if (vx < Win.size()-1) {
+		int dminus = b2i('-');
+		int dstar = b2i('*');
+
+        // a deletion is announced with '-' at this locus and shows up as '*' at the next one
+        if (Win[vx].Calls.size() > dminus && Win[vx].Calls[dminus].depth() > 0) {
+            if (Win[vx+1].Calls.size() > dstar && Win[vx+1].Calls[dstar].depth() > 0) {
+                // baq adjustment works at the 'star' not at the 'indel', so adjust qual using the next locus
+               double adj=Win[vx+1].Calls[dstar].qual_rms()/(double)Win[vx].Calls[dminus].qual_rms();
+               if (debug_xpos) {
+                    if (Win[vx].Pos == debug_xpos && !strcmp(debug_xchr,Win[vx].Chr.data())) {
+                        fprintf(stderr,"xpos-adj-qual\t%d to %d (%f)\n", Win[vx].Calls[dminus].qual_rms(),Win[vx+1].Calls[dstar].qual_rms(), adj);
+                    }
+               }
+               Win[vx].Calls[dminus].qual *= adj; 
+               Win[vx].Calls[dminus].qual_ssq *= adj;
+            } else {    
+                // no matching '*' at the next locus: discard the deletion call
+                vcall none;
+                if (debug_xpos) {
+                    if (Win[vx].Pos == debug_xpos && !strcmp(debug_xchr,Win[vx].Chr.data())) {
+                        fprintf(stderr,"xpos-skip-del-qual\t%d\n", Win[vx].Calls[dminus].depth());
+                    }
+                }
+                Win[vx].Calls[dminus] = none;
+            }
+        }
+    }
+
+    VisitX(Win[vx]);
+}
+
+// End of input: the loci to the right of the window midpoint have not been
+// reported yet, so pass each of them to VisitX() in order.
+void VarCallVisitor::Finish() {
+    for (int k = WinMax/2+1; k < Win.size(); ++k) {
+        ///debug("Finish: %d\n", Win[k].Pos);
+        VisitX(Win[k]);
+    }
+}
+
+void VarCallVisitor::VisitX(PileupSummary &p) {
+    //debug("VisitX: %d\n", p.Pos);
+
+	if (debug_xpos) {
+		if (p.Pos != debug_xpos)
+			return;
+		if (strcmp(debug_xchr,p.Chr.data())) 
+			return;
+	}
+
+	if (p.Depth < min_depth) {
+        if (debug_xpos) {
+            fprintf(stderr,"xpos-skip-depth\t%d < %d\n",p.Depth, min_depth);
+		    fprintf(stderr,"xpos-skip-dup\t%d\n",p.SkipDupReads);
+		    fprintf(stderr,"xpos-skip-n\t%d\n",p.SkipN);
+		    fprintf(stderr,"xpos-skip-mapq\t%d\n",p.SkipMinMapq);
+		    fprintf(stderr,"xpos-skip-qual\t%d\n",p.SkipMinQual);
+        }
+		++SkippedDepth;
+		return;
+	}
+
+	int ins_fwd = p.Calls.size() > 6 ? p.Calls[6].fwd : 0;
+	int ins_rev = p.Calls.size() > 6 ? p.Calls[6].rev : 0;
+
+	int i;
+	if (p.Calls.size() > 6) 
+		p.Calls.resize(7);	// toss N's before sort
+
+	sort(p.Calls.begin(), p.Calls.end(), hitolocall);
+
+	int need_out = -1;
+	int skipped_balance=0;
+	int skipped_alpha=0;
+	int skipped_indel=0;
+	int skipped_tail_hom=0;
+	int skipped_depth=0;
+	int skipped_repeat=0;
+
+    vector<vfinal> final_calls;
+	for (i=0;i<p.Calls.size();++i) {		// all calls
+//        printf("CALL TOP: depth:%d base: %c, pd: %d, calls: %d\n", (int) p.Calls[i].depth(), p.Calls[i].base, p.Depth, (int) p.Calls.size());
+	
+		double pct = (double) p.Calls[i].depth()/p.Depth;
+		double qpct = (double) p.Calls[i].qual/p.TotQual;
+
+		if (!p.Calls[i].base)
+			continue;
+
+		if (!p.Calls[i].depth())
+			continue;
+
+		double bpct = (double) min(p.Calls[i].fwd,p.Calls[i].rev)/p.Calls[i].depth();
+
+		if (pct > pct_depth && qpct >= pct_qdepth && (p.Calls[i].depth() >= min_adepth)) {
+            if (bpct < pct_balance) {
+                int fwd_adj=0, rev_adj=0;
+                // f=b*(f+r); r=f/b-f; adj=r-(f/b-f)
+                if (p.Calls[i].fwd < p.Calls[i].rev) {
+                    rev_adj = (int) p.Calls[i].rev - ( p.Calls[i].fwd/pct_balance  - p.Calls[i].fwd );
+                } else {
+                    fwd_adj = (int) p.Calls[i].fwd - ( p.Calls[i].rev/pct_balance  - p.Calls[i].rev );
+                }
+                if (fwd_adj + rev_adj > 1 && bpct > 0) {
+                    // adjust call down
+                    p.Calls[i].qual -= (rev_adj+fwd_adj)*(p.Calls[i].qual/p.Calls[i].depth()); 
+                    p.Calls[i].mq_sum -= (rev_adj+fwd_adj)*(p.Calls[i].mq_sum/p.Calls[i].depth());
+                    p.Calls[i].qual_ssq -= (rev_adj+fwd_adj)*(p.Calls[i].qual_ssq/p.Calls[i].depth());
+                    p.Calls[i].mq_ssq -= (rev_adj+fwd_adj)*(p.Calls[i].mq_ssq/p.Calls[i].depth());
+                    p.Calls[i].rev -= rev_adj;
+                    p.Calls[i].fwd -= fwd_adj;
+                    skipped_balance+=rev_adj+fwd_adj;
+
+                    // fixed bpct
+                    bpct = (double) min(p.Calls[i].fwd,p.Calls[i].rev)/p.Calls[i].depth();
+                } else {
+                    // it's junk anyway
+                }
+
+                // fix depths after adjustment!
+                pct = (double) p.Calls[i].depth()/p.Depth;
+                qpct = (double) p.Calls[i].qual/p.TotQual;
+            }
+        }
+
+		if (pct > pct_depth && qpct >= pct_qdepth && (p.Calls[i].depth() >= min_adepth)) {
+			// balance is meaningless at low depths
+			if ((bpct >= pct_balance) || (p.Calls[i].depth()<4)) {
+				if (p.Calls[i].base == '+' || p.Calls[i].base == '-') {
+                    // yuk ... time to think about a possible indel call
+					if (p.Calls[i].depth() >= min_idepth) {
+						// should really pick more than 1
+						// but need to allow "similar" indels to pile up
+                        // should group into distinct bins, using some homology thing
+						sort(p.Calls[i].seqs.begin(), p.Calls[i].seqs.end());
+						string prev, maxs;
+						int pcnt=0, maxc=0, j;
+						for (j=0;j<p.Calls[i].seqs.size();++j) {
+							if (prev == p.Calls[i].seqs[j]) {
+								++pcnt;
+							} else {
+								if (pcnt > maxc) {
+									maxs=prev;
+									maxc=pcnt;
+								}
+								prev=p.Calls[i].seqs[j];
+								pcnt=1;
+							}
+						}
+						if (pcnt > maxc) {
+							maxs=prev;
+							maxc=pcnt;
+						}
+						if (maxc >= min_idepth && maxc >= min_adepth) {
+                            // only calls 1 indel at a given position
+                            if ((repeat_filter == 0) || (p.RepeatCount < repeat_filter)) {
+                                // maybe use rms here... see if it helps
+                                double mean_qual = p.Calls[i].qual/(double)p.Calls[i].depth();
+                                double err_rate = mean_qual < max_phred ? pow(10,-mean_qual/10.0) : global_error_rate;
+                                // expected number of non-reference = error_rate*depth
+                                double pval=(p.Depth*err_rate==0)?0:gsl_ran_poisson_pdf(p.Calls[i].depth(), p.Depth*err_rate);
+                                double padj=total_locii ? pval*total_locii : pval;           // multiple-testing adjustment
+
+                                if (padj <= alpha) {
+                                    vfinal final(p.Calls[i]);
+                             
+                                    double mq_padj=max(total_locii*pow(10,-p.Calls[i].mq_sum/10.0),padj);      // never report pval as better than the total mapping quality
+		                            if (debug_xpos) fprintf(stderr,"xpos-debug-pval\tbase:%c, err:%g, pval:%g, padj:%g, mq_padj:%g, mq_sum:%d\n", p.Calls[i].base, err_rate, pval, padj, mq_padj, p.Calls[i].mq_sum);
+
+                                    if (mq_padj > 1) mq_padj=1;
+
+                                    if (need_out == -1) 
+                                        need_out = i;
+
+//                                    printf("FINAL: depth:%d base: %s\n", (int) maxc, maxs.c_str());
+                                    final.padj=mq_padj;
+                                    final.max_idl_cnt=maxc;
+                                    final.max_idl_seq=maxs;
+                                    final_calls.push_back(final);
+                                } else {
+                                    skipped_alpha+=p.Calls[i].depth();
+                                }
+                                // implicitly skip all the ohter indel calls at the same locus
+                                skipped_indel+=p.Calls[i].depth()-maxc;
+                            } else {
+                                skipped_repeat+=p.Calls[i].depth();
+                            }
+						} else {
+							skipped_indel+=p.Calls[i].depth();
+						}
+					} else {
+						skipped_indel+=p.Calls[i].depth();
+					}
+				} else {
+                    if (p.Calls[i].base == '*' && (
+                            ((repeat_filter > 0) && (p.RepeatCount >= repeat_filter)) || 
+                            (p.Calls[i].depth() < min_idepth)
+                       )) {
+					   skipped_indel+=p.Calls[i].depth();
+                    } else {
+                        // subtract inserts from reference .. perhaps > 0 is correct here....
+                        if (p.Calls[i].is_ref && (ins_rev+ins_fwd) > max(min_idepth,min_adepth)) {
+                            p.Calls[i].fwd-=ins_fwd;
+                            p.Calls[i].rev-=ins_rev;
+                        }
+
+                        double mean_qual = p.Calls[i].qual/(double)p.Calls[i].depth();
+
+/*
+                        if ( (repeat_filter > 0) && (p.RepeatCount >= repeat_filter) ) {
+                           p.Calls[i].fwd-=p.Calls[i].tail_fwd; 
+                           p.Calls[i].rev-=p.Calls[i].tail_rev;
+                           skipped_tail_hom+=p.Calls[i].tail_fwd+p.Calls[i].tail_rev;
+                        }
+*/
+                        if (p.Calls[i].depth() >= min_adepth && p.Calls[i].depth() > 0) {
+                            double err_rate = mean_qual < max_phred ? pow(10,-mean_qual/10.0) : global_error_rate;
+                            // expected number of non-reference bases at this position is error_rate*depth
+                            double pval=(p.Depth*err_rate==0)?0:gsl_ran_poisson_pdf(p.Calls[i].depth(), p.Depth*err_rate);
+                            double padj=total_locii ? pval*total_locii : pval;           // multiple-testing adjustment
+
+                            if (padj <= alpha) {
+                                double mq_padj=max(total_locii*pow(10,-p.Calls[i].mq_sum/10.0),padj);      // never report as better than the mapping quality
+
+                                if (mq_padj > 1) mq_padj=1;
+
+		                        if (debug_xpos) fprintf(stderr,"xpos-debug-pval\tbase:%c, err:%g, pval:%g, padj:%g, mq_padj:%g, mq_sum:%d\n", p.Calls[i].base, err_rate, pval, padj, mq_padj, p.Calls[i].mq_sum);
+
+                                if (!p.Calls[i].is_ref || debug_xpos) {
+                                    if (need_out == -1)
+                                        need_out = i;
+                                }
+                                vfinal final(p.Calls[i]);
+                                final.padj=mq_padj;
+                                final_calls.push_back(final);
+                            } else {
+                                skipped_alpha+=p.Calls[i].depth();
+                            }
+                        }
+                    }
+				}
+			} else {
+				skipped_balance+=p.Calls[i].depth();
+			}
+		} else {
+            // depth is too low now.... technically you can just add all the rest of the calls to skipped_depth without checking
+			skipped_depth+=p.Calls[i].depth();
+		}
+	}
+
+    ++Locii;
+
+	if (need_out>=0||debug_xpos) {
+
+        if (final_calls.size() > 1){
+//            printf("HERE1 %c/%c\n", final_calls[0].pcall->base, final_calls[1].pcall->base);
+            if(final_calls[1].pcall->is_ref) {
+                vfinal tmp=final_calls[1];
+                final_calls[1]=final_calls[0];
+                final_calls[0]=tmp;
+//                printf("HERE2 %c/%c\n", final_calls[0].pcall->base, final_calls[1].pcall->base);
+            }
+        }
+
+//        printf("allele_count: %d\n", (int) final_calls.size());
+
+        
+        int total_call_depth=0;
+        int i;
+        for (i=0;i<final_calls.size();++i) {
+            total_call_depth+=final_calls[i].pcall->depth();
+        }
+        double pct_allele = 0;
+        if (need_out >=0) {
+            // more than 1 call at this position = Het
+            if (final_calls.size() > 1) {
+                if (final_calls[0].pcall->is_ref) {
+                    pct_allele = 100.0 * final_calls[1].pcall->depth() / (double) total_call_depth;
+                } else {
+                    // no reference seen... but still het?
+                    pct_allele = 100.0 * final_calls[0].pcall->depth() / (double) total_call_depth;
+                }
+                ++Hets;
+            } else {
+                pct_allele = 100.0 * final_calls[0].pcall->depth() / (double) total_call_depth;
+                ++Homs;
+            }
+        }
+
+        if (var_f) {
+            int i;
+            string pil;
+            for (i=0;i<final_calls.size();++i) {
+               vfinal &f=final_calls[i];
+               if (f.is_indel()) {
+                    pil += string_format("\t%c%s:%d,%d,%.1e",f.pcall->base,f.max_idl_seq.c_str(),f.max_idl_cnt,f.pcall->qual/f.pcall->depth(),f.padj);
+               } else {
+                    pil += string_format("\t%c:%d,%d,%.1e",f.pcall->base,f.pcall->depth(),f.pcall->qual/f.pcall->depth(),f.padj);
+               }
+            }
+            fprintf(var_f,"%s\t%d\t%c\t%d\t%d\t%2.2f%s\n",p.Chr.c_str(), p.Pos, p.Base, p.Depth, skipped_alpha+skipped_depth+skipped_balance+p.SkipN+p.SkipDupReads+p.SkipMinMapq+p.SkipMinQual, pct_allele, pil.c_str());
+        }
+
+        if (vcf_f) {
+
+            for (i=0;i<final_calls.size();++i) {
+               vfinal &f=final_calls[i];
+               int qual = f.padj>0?min(40,10*(-log10(f.padj))):40;
+
+               if (f.is_indel()) {
+                    string base;
+                    string alt;
+                    if (f.pcall->base =='-') {
+                        base = p.Base + f.max_idl_seq;
+                        alt = p.Base;
+                    } else {
+                        base = p.Base;
+                        alt = p.Base + f.max_idl_seq;
+                    }
+                    double freq_allele = f.max_idl_cnt / (double) p.Depth;
+                    fprintf(vcf_f,"%s\t%d\t.\t%s\t%s\t%2d\tPASS\tMQ=%d;BQ=%d;DP=%d;AF=%2.2f\n", 
+                        p.Chr.c_str(), p.Pos, base.c_str(), alt.c_str(), qual, 
+                        (int) f.pcall->mq_rms(),
+                        (int) f.pcall->qual_rms(),
+                        total_call_depth,
+                        freq_allele);
+                } else {
+                    char alt = f.pcall->base;
+                    if (f.pcall->is_ref) 
+                        alt = '.';
+                    double freq_allele = f.pcall->depth() / (double) p.Depth;
+                    fprintf(vcf_f,"%s\t%d\t.\t%c\t%c\t%d\tPASS\tMQ=%d;BQ=%d;DP=%d;AF=%2.2f\n",
+                        p.Chr.c_str(), p.Pos, p.Base, alt, qual,
+                        (int) f.pcall->mq_rms(),
+                        (int) f.pcall->qual_rms(),
+                        total_call_depth,
+                        freq_allele);
+                }
+           }
+        }
+
+        if (eav_f) {
+//            printf(eav_f,"chr\tpos\tref\tdepth\tnum_states\ttop_consensus\ttop_freq\tvar_base\tvar_depth\tvar_qual\tvar_strands\tforward_strands\treverse_strands\n");
+            string top_cons, var_base, var_depth, var_qual, var_strands, forward, reverse;
+           
+            float padj=final_calls[0].padj;
+            if (final_calls[0].pcall->is_ref && final_calls.size() > 1) {
+                padj=final_calls[1].padj;
+            }
+            for (i=0;i<final_calls.size();++i) {
+                vfinal &f=final_calls[i];
+                if (i < 2) {
+                    if (i > 0) top_cons += "/";
+                    top_cons += f.pcall->base;
+                }
+                if (i > 0) var_base += "/";
+                var_base += f.pcall->base;
+                if (f.is_indel()) {
+                    if (i < 2) {
+                        top_cons += f.max_idl_seq;
+                    }
+                    var_base += f.max_idl_seq;
+                }
+                if (i > 0) var_depth+= ";";
+                var_depth+= string_format("%d",f.pcall->depth());
+                if (i > 0) var_qual+= ";";
+                var_qual+= string_format("%d",f.pcall->qual_rms());
+                if (i > 0) var_strands+= ";";
+                var_strands+= string_format("%d",(f.pcall->fwd>0)+(f.pcall->rev>0));
+                if (i > 0) forward += ";";
+                forward+= string_format("%d",f.pcall->fwd);
+                if (i > 0) reverse += ";";
+                reverse+= string_format("%d",f.pcall->rev);
+            }
+            fprintf(eav_f,"%s\t%d\t%c\t%d\t%d\t%s\t%2.2f\t%s\t%s\t%s\t%s\t%s\t%s\t%.1e\n",p.Chr.c_str(), p.Pos, p.Base, p.Depth, (int) final_calls.size(),top_cons.c_str(), pct_allele, var_base.c_str(), var_depth.c_str(), var_qual.c_str(), var_strands.c_str(), forward.c_str(), reverse.c_str(), padj);
+        }
+
+		if (debug_xpos) {
+		    fprintf(stderr,"xpos-skip-dup\t%d\n",p.SkipDupReads);
+		    fprintf(stderr,"xpos-skip-mapq\t%d\n",p.SkipMinMapq);
+		    fprintf(stderr,"xpos-skip-qual\t%d\n",p.SkipMinQual);
+		    fprintf(stderr,"xpos-skip-bal\t%d\n",skipped_balance);
+		    fprintf(stderr,"xpos-skip-depth\t%d\n",skipped_depth);
+		    fprintf(stderr,"xpos-skip-indel\t%d\n",skipped_indel);
+//		    fprintf(stderr,"xpos-skip-tail-imbalance\t%d\n",skipped_tail_hom);
+		    fprintf(stderr,"xpos-skip-repeat\t%d\n",skipped_repeat);
+		    fprintf(stderr,"xpos-skip-alpha\t%d\n",skipped_alpha);
+            if (repeat_filter > 0) {
+                fprintf(stderr,"repeat-count\t%d\n",p.RepeatCount);
+                fprintf(stderr,"repeat-filter\t%d\n",repeat_filter);
+                fprintf(stderr,"repeat-base\t%c\n",p.RepeatBase);
+            }
+			exit(0);
+		}
+	}
+}
+
+
+// Sniff the annotation file at `path` to decide whether it is BED or GTF, then
+// load its position index, building the index when it cannot be read.
+//
+//   path   tab-delimited annotation file
+//
+// Only the first line of the file is inspected. AnnotType becomes 'g' when the
+// 7th field starts with a strand sign ('+' or '-'), otherwise 'b' when the 6th
+// field does; the 7th field takes precedence when both match.
+//
+// Warns and exits with status 1 when the file cannot be opened, when the first
+// line has fewer than 9 tab-separated fields, or when neither field holds a
+// strand sign.
+void PileupVisitor::LoadIndex(const char *path) {
+    FILE *f = fopen(path,"r");
+    if (!f) {
+        warn("Can't open %s : %s\n", path, strerror(errno));
+        exit(1);
+    }
+
+    AnnotType = '\0';
+    line l; meminit(l);
+    // NOTE(review): whatever read_line stores in `l` is not released here -
+    // confirm whether the project has a helper for that.
+    if (read_line(f, l)>0) {
+        vector<char *> d=split(l.s, "\t");
+        if (d.size() < 9) {
+            warn("File must be a GTF or a BED: '%s'\n", path);
+            exit(1);
+        }
+        // Both halves of the GTF test must look at field 7 (d[6]). Testing d[5]
+        // for '-' here made a BED file whose first record is on the minus
+        // strand look like a GTF.
+        if (*d[6]=='+' || *d[6]=='-') {
+            AnnotType = 'g';
+        } else if (*d[5]=='+' || *d[5]=='-') {
+            AnnotType = 'b';
+        }
+    }
+
+    // the handle is only needed for sniffing; the index code gets the path
+    fclose(f);
+
+    if (!AnnotType) {
+        warn("File must be a GTF or a BED: '%s'\n", path);
+        exit(1);
+    }
+
+    if (!AnnotDex.read(path)) {
+        //    void build(const char *path, const char *sep, int nchr, int nbeg, int nend, int skip_i, char skip_c);
+        AnnotDex.build(path, "\t",  0, 1, 2, 0, '#', 1);
+    }
+}
+
+// Accumulate per-locus noise statistics from one pileup position.
+//
+//   p   pileup summary for the locus; p.Calls is truncated to at most five
+//       entries and re-sorted in place by this call
+//
+// Every locus increments tot_locii. Loci with p.Depth below minsampdepth are
+// otherwise ignored. For the rest, one Noise record is appended to each of
+// stats, ins_stats and del_stats, and tot_depth / num_reads are increased.
+// When noise_f is set, one tab-separated line (depth, base, noise, qnoise) is
+// written for the locus.
+void VarStatVisitor::Visit(PileupSummary &p) {
+    tot_locii += 1;
+
+    if (p.Depth < minsampdepth)
+        return;
+
+    // insert and deletions have their own, separate noise levels
+    // (slot 6 is read as the insert call, slot 5 as the delete call; both are
+    // only counted as noise when non-empty and shallower than p.Calls[1])
+
+    int ins_depth = p.Calls.size() > 6 ? p.Calls[6].depth() : 0;
+    int ins_qual = p.Calls.size() > 6 ? p.Calls[6].qual : 0;
+    double ins_noise = 0;
+    double ins_qnoise = 0;
+    if (p.Calls.size() > 1 && p.Calls[1].depth() > ins_depth && ins_depth > 0) {
+        ins_noise = (double) ins_depth/p.Depth;
+        ins_qnoise = (double) ins_qual/p.TotQual;
+    }
+
+    int del_depth = p.Calls.size() > 5 ? p.Calls[5].depth() : 0;
+    int del_qual = p.Calls.size() > 5 ? p.Calls[5].qual : 0;
+    double del_noise = 0;
+    double del_qnoise = 0;
+    if (p.Calls.size() > 1 && p.Calls[1].depth() > del_depth && del_depth > 0) {
+        del_noise = (double) del_depth/p.Depth;
+        del_qnoise = (double) del_qual/p.TotQual;
+    }
+
+    // snp's are "noise" if there are 3 alleles at a given position
+    if (p.Calls.size() > 5)
+        p.Calls.resize(5);		// toss N's and inserts before sort
+
+    // presumably orders calls from highest to lowest - confirm against hitolocall
+    sort(p.Calls.begin(), p.Calls.end(), hitolocall);
+
+    // the third call after sorting is the one treated as noise
+    double noise = p.Calls.size() > 2 ? (double) p.Calls[2].depth()/p.Depth : 0;
+    double qnoise = p.Calls.size() > 2 ? (double) p.Calls[2].qual/p.TotQual : 0;
+
+    double mnqual = (double)p.TotQual/p.Depth;
+
+    char pbase = p.Calls.size() > 2 ? p.Calls[2].base : '.';
+
+    if (noise_f) {
+        // mnqual used to be passed here as a fifth argument with no matching
+        // conversion, so it was never printed. The line keeps its four columns
+        // so existing readers of the noise file are unaffected.
+        fprintf(noise_f,"%d\t%c\t%f\t%f\n", p.Depth, pbase, noise, qnoise);
+/*
+        if (ins_noise > 0) {
+		    fprintf(noise_f,"%d\t%c\t%f\t%f\n", p.Depth, '+', ins_noise, ins_qnoise, mnqual);
+        }
+        if (del_noise > 0) {
+		    fprintf(noise_f,"%d\t%c\t%f\t%f\n", p.Depth, '-', del_noise, del_qnoise, mnqual);
+        }
+*/
+    }
+
+    tot_depth += p.Depth;
+    num_reads += p.NumReads;
+    stats.push_back(Noise(p.Depth, noise, qnoise, mnqual));
+    ins_stats.push_back(Noise(p.Depth, ins_noise, ins_qnoise, mnqual));
+    del_stats.push_back(Noise(p.Depth, del_noise, del_qnoise, mnqual));
+}
+
+
+// Write the varcall help screen to `f`.
+//
+//   f   destination stream
+//
+// The text carries two conversions, filled from VERSION (%s) and SVNREV (%d).
+void usage(FILE *f) {
+    static const char help_text[] =
+"Usage: varcall <-s|-v> <-f REF> [options] bam1 [bam2...]\n"
+"Version: %s.%d (BETA)\n"
+"\n"
+"Either outputs summry stats for the list of files, or performs variant calling\n"
+"\n"
+"Options (later options override earlier):\n"
+"\n"
+"-s          Calculate statistics\n"
+"-v          Calculate variants bases on supplied parameters (see -S)\n"
+"-f          Reference fasta (required if using bams, ignored otherwise)\n"
+"-m          Min locii depth (0)\n"
+"-a          Min allele depth (0)\n"
+"-p          Min allele pct by quality (0)\n"
+"-q          Min qual (3)\n"
+"-Q          Min mapping quality (0)\n"
+"-b          Min pct balance (strand/total) (0)\n"
+"-D FLOAT    Max duplicate read fraction (depth/length per position) (1)\n"
+"-B          Turn off BAQ correction (false)\n"
+"-R          Homopolymer repeat indel filtering (8)\n"
+"-e FLOAT    Alpha filter to use, requires -l or -S (.05)\n"
+"-g FLOAT    Global minimum error rate (default: assume phred is ok)\n"
+"-l INT      Number of locii in total pileup used for bonferroni (1 mil)\n"
+"-x CHR:POS  Output this pos only, then quit\n"
+"-N FIL      Output noise stats to FIL\n"
+"-S FIL      Read in statistics and params from a previous run with -s (do this!)\n"
+"-A ANNOT    Calculate in-target stats using the annotation file (requires -o)\n"
+"-o PREFIX   Output prefix (note: overlaps with -N)\n"
+"\n"
+"Input files\n"
+"\n"
+"Files must be sorted bam files with bai index files available.  Alternatively,\n"
+"a single pileup file can be supplied.\n"
+"\n"
+"Output files\n"
+"\n"
+"Varcalls go to stdout.  Stats go to stdout, or stderr if varcalling too\n"
+"\n"
+"If an output prefix is used, files are created as follows:\n"
+"   PREFIX.var       Variant calls in tab delimited 'varcall' format\n"
+"   PREFIX.eav       Variant calls in tab delimited 'ea-var' format\n"
+"   PREFIX.vcf       Variant calls, in vcf format\n"
+"   PREFIX.varsum    Summary of variant calls\n"
+"   PREFIX.tgt       On-target stats detail\n"
+"   PREFIX.tgtsum    Summary of on-target stats\n"
+"   PREFIX.noise     Noise stats detail\n"
+"\n"
+"Stats Output:\n"
+"\n"
+"Contains mean, median, quartile information for depth, base quality, read len,\n"
+"mapping quality, indel levels. Also estimates parameters suitable for\n"
+"variant calls, and can be passed directly to this program for variant calls\n"
+"\n"
+"Filtering Details:\n"
+"\n";
+
+    fprintf(f, help_text, VERSION, SVNREV);
+}
+
+// printf-style formatting that returns the result as a std::string.
+//
+//   fmt   printf format string
+//   ...   arguments matching the conversions in fmt
+//
+// Formats into a 100-byte buffer first. When vsnprintf reports the length it
+// needs, the buffer is resized to exactly that and the call is retried; when
+// it reports failure with a negative value, the buffer is doubled and the call
+// is retried.
+//
+// NOTE(review): the C++ standard leaves va_start undefined when the last named
+// parameter is a reference, as `fmt` is here. Fixing that means changing the
+// signature together with its declaration, which is outside this block.
+std::string string_format(const std::string &fmt, ...) {
+    int size = 100;
+    std::string str;
+    va_list ap;
+    while (1) {
+        str.resize(size);
+        va_start(ap, fmt);
+        // write through the string's own mutable storage rather than casting
+        // away the const on c_str(); size is never zero here
+        int n = vsnprintf(&str[0], size, fmt.c_str(), ap);
+        va_end(ap);
+        if (n > -1 && n < size) {
+            // everything fit: drop the unused tail, including the NUL
+            str.resize(n);
+            return str;
+        }
+        if (n > -1)
+            size = n+1;     // exact size needed, plus the terminator
+        else
+            size *= 2;      // length unknown: grow and retry
+    }
+}
+
+// Upper-case every character of `str` in place, writing through the buffer
+// returned by str.data().
+//
+// NOTE(review): `str` is passed by value, so the caller only sees the change
+// if the std::string implementation shares one buffer between the caller's
+// string and this copy. With an implementation that copies the characters,
+// this function has no visible effect. The signature is left as it is because
+// its declaration lives outside this block; verify against the callers.
+void to_upper(const std::string str) {
+    char *buf = (char *)(void *)str.data();
+    for (std::string::size_type i = 0; i < str.size(); ++i) {
+        // toupper is only defined for unsigned char values (and EOF); a plain
+        // char above 0x7f would otherwise be passed as a negative int
+        buf[i] = (char) toupper((unsigned char) buf[i]);
+    }
+}
+
+// Linearly interpolated quantile of the `depth` field of `vec`.
+//
+//   vec   must be non-empty (asserted); it is read in its current order, so
+//         presumably callers pass it sorted by depth - confirm
+//   p     quantile as a fraction; p*(size-1) is the fractional index used
+//
+// Returns the depth at that index, interpolated towards the next element
+// when the index is not a whole number.
+double quantile_depth(const std::vector<Noise> &vec, double p) {
+        const int count = vec.size();
+        assert(count > 0);
+
+        const double pos = ((double)count-1)*p;
+        const int lo = (int) pos;
+        const int base = vec[lo].depth;
+
+        // index is a whole number: nothing to interpolate
+        if (!(pos > (double)lo))
+                return base;
+
+        return (base + (pos-lo) * (vec[lo+1].depth - base));
+}
+
+// Linearly interpolated quantile of a vector of ints.
+//
+//   vec   read in its current order; presumably sorted ascending by the
+//         caller - confirm. Not checked for emptiness.
+//   p     quantile as a fraction; p*(size-1) is the fractional index used
+//
+// Returns the element at that index, interpolated towards the next element
+// when the index is not a whole number.
+double quantile(const std::vector<int> &vec, double p) {
+        const int count = vec.size();
+        const double pos = ((double)count-1)*p;
+        const int lo = (int) pos;
+        const int base = vec[lo];
+
+        // index is a whole number: nothing to interpolate
+        if (!(pos > (double)lo))
+                return base;
+
+        return (base + (pos-lo) * (vec[lo+1] - base));
+}
+
+// Linearly interpolated quantile of a vector of doubles.
+//
+//   vec   read in its current order; presumably sorted ascending by the
+//         caller - confirm. Not checked for emptiness.
+//   p     quantile as a fraction; p*(size-1) is the fractional index used
+//
+// Returns the element at that index, interpolated towards the next element
+// when the index is not a whole number.
+double quantile(const std::vector<double> &vec, double p) {
+        int l = vec.size();
+        double t = ((double)l-1)*p;
+        int it = (int) t;
+        double v=vec[it];
+        if (t > (double)it) {
+                // weight by the fractional part of the index, as the int and
+                // depth variants do; weighting by p itself gave wrong values
+                // whenever p differed from that fraction
+                return (v + (t-it) * (vec[it+1] - v));
+        } else {
+                return v;
+        }
+}
+
+// Break `str` into tokens at any character found in `delim`.
+//
+//   str     buffer to tokenize; strtok modifies it in place
+//   delim   set of delimiter characters
+//
+// Returns pointers into `str`, one per token, in order. Because strtok does
+// the work, runs of consecutive delimiters yield no empty tokens, and the
+// function relies on strtok's internal state.
+std::vector<char *> split(char* str,const char* delim)
+{
+    std::vector<char *> pieces;
+    for (char *tok = strtok(str, delim); tok != NULL; tok = strtok(NULL, delim))
+    {
+        pieces.push_back(tok);
+    }
+    return pieces;
+}
+
+// Randomized rounding: returns floor(x) or floor(x)+1, choosing the upper
+// value with probability equal to the fractional part of x. A whole number
+// is therefore always returned unchanged.
+//
+//   x   value to round
+//
+// Draws one value from rand().
+//
+// The earlier version compared the raw rand() result (0..RAND_MAX) against
+// the fraction (below 1), which is true for almost every draw, so it rounded
+// up nearly always - including for whole numbers.
+int rand_round(double x) {
+    double whole = floor(x);
+    double frac = x - whole;                            // in [0,1)
+    double draw = rand() / ((double) RAND_MAX + 1.0);   // in [0,1)
+    return (int) whole + ((draw < frac) ? 1 : 0);
+}
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/ea-utils.git