[med-svn] [libvcflib] 01/02: Imported Upstream version 0.0.20141212

Sun Feb 1 10:55:23 UTC 2015

This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository libvcflib.

commit 47d88078ab2af69a961d344ac607a4e05a790980
Author: Andreas Tille <tille at debian.org>
Date:   Sun Feb 1 11:54:34 2015 +0100

    Imported Upstream version 0.0.20141212
---
 .gitignore                         |  118 ++
 .gitmodules                        |   21 +
 LICENSE                            |   19 +
 Makefile                           |  173 +++
 README.md                          |  650 ++++++++++
 bin/bed2region                     |    9 +
 bin/plot_roc.r                     |  153 +++
 bin/vcf2bed.py                     |   16 +
 bin/vcf2sqlite.py                  |  130 ++
 bin/vcf_strip_extra_headers        |   18 +
 bin/vcfbiallelic                   |   20 +
 bin/vcfclearid                     |   12 +
 bin/vcfclearinfo                   |   12 +
 bin/vcfcomplex                     |   26 +
 bin/vcffirstheader                 |   16 +
 bin/vcfgtcompare.sh                |   16 +
 bin/vcfindelproximity              |   82 ++
 bin/vcfindels                      |   26 +
 bin/vcfmultiallelic                |   20 +
 bin/vcfmultiway                    |   20 +
 bin/vcfmultiwayscripts             |   30 +
 bin/vcfnobiallelicsnps             |   29 +
 bin/vcfnoindels                    |   25 +
 bin/vcfnosnps                      |   25 +
 bin/vcfnulldotslashdot             |   22 +
 bin/vcfplotaltdiscrepancy.r        |  511 ++++++++
 bin/vcfplotaltdiscrepancy.sh       |   12 +
 bin/vcfplotsitediscrepancy.r       |   99 ++
 bin/vcfplottstv.sh                 |   13 +
 bin/vcfprintaltdiscrepancy.r       |   37 +
 bin/vcfprintaltdiscrepancy.sh      |   18 +
 bin/vcfqualfilter                  |   32 +
 bin/vcfregionreduce                |   29 +
 bin/vcfregionreduce_and_cut        |   32 +
 bin/vcfregionreduce_pipe           |   29 +
 bin/vcfregionreduce_uncompressed   |   29 +
 bin/vcfremovenonATGC               |   29 +
 bin/vcfsnps                        |   26 +
 bin/vcfsort                        |    3 +
 bin/vcfvarstats                    |  225 ++++
 samples/sample.vcf                 |   31 +
 src/BedReader.h                    |  176 +++
 src/Variant.cpp                    | 2405 ++++++++++++++++++++++++++++++++++++
 src/Variant.h                      |  586 +++++++++
 src/convert.h                      |   22 +
 src/join.h                         |   36 +
 src/mt19937ar.h                    |  192 +++
 src/split.cpp                      |   23 +
 src/split.h                        |   53 +
 src/ssw.c                          |  834 +++++++++++++
 src/ssw.h                          |  129 ++
 src/ssw_cpp.cpp                    |  399 ++++++
 src/ssw_cpp.h                      |  216 ++++
 src/vcf2dag.cpp                    |  168 +++
 src/vcf2fasta.cpp                  |  264 ++++
 src/vcf2tsv.cpp                    |  241 ++++
 src/vcfaddinfo.cpp                 |  111 ++
 src/vcfafpath.cpp                  |   52 +
 src/vcfallelicprimitives.cpp       |  414 +++++++
 src/vcfaltcount.cpp                |   50 +
 src/vcfannotate.cpp                |  126 ++
 src/vcfannotategenotypes.cpp       |  220 ++++
 src/vcfbreakmulti.cpp              |  114 ++
 src/vcfcat.cpp                     |   34 +
 src/vcfcheck.cpp                   |  139 +++
 src/vcfclassify.cpp                |  162 +++
 src/vcfcleancomplex.cpp            |   71 ++
 src/vcfcombine.cpp                 |  207 ++++
 src/vcfcommonsamples.cpp           |   85 ++
 src/vcfcountalleles.cpp            |   33 +
 src/vcfcreatemulti.cpp             |  197 +++
 src/vcfdistance.cpp                |   92 ++
 src/vcfecho.cpp                    |   31 +
 src/vcfentropy.cpp                 |  159 +++
 src/vcfevenregions.cpp             |  202 +++
 src/vcffilter.cpp                  |  402 ++++++
 src/vcffixup.cpp                   |  117 ++
 src/vcfflatten.cpp                 |  178 +++
 src/vcfgeno2alleles.cpp            |   54 +
 src/vcfgeno2haplo.cpp              |  391 ++++++
 src/vcfgenosamplenames.cpp         |   39 +
 src/vcfgenosummarize.cpp           |  107 ++
 src/vcfgenotypecompare.cpp         |  327 +++++
 src/vcfgenotypes.cpp               |   66 +
 src/vcfglbound.cpp                 |  178 +++
 src/vcfglxgt.cpp                   |  171 +++
 src/vcfhetcount.cpp                |   72 ++
 src/vcfhethomratio.cpp             |   66 +
 src/vcfindex.cpp                   |   42 +
 src/vcfinfo2qual.cpp               |   50 +
 src/vcfinfosummarize.cpp           |  212 ++++
 src/vcfintersect.cpp               |  577 +++++++++
 src/vcfkeepgeno.cpp                |   62 +
 src/vcfkeepinfo.cpp                |   68 +
 src/vcfkeepsamples.cpp             |   54 +
 src/vcfleftalign.cpp               |  781 ++++++++++++
 src/vcflength.cpp                  |   49 +
 src/vcfnumalt.cpp                  |   55 +
 src/vcfoverlay.cpp                 |  109 ++
 src/vcfparsealts.cpp               |   42 +
 src/vcfprimers.cpp                 |  140 +++
 src/vcfqual2info.cpp               |   44 +
 src/vcfrandom.cpp                  |   70 ++
 src/vcfrandomsample.cpp            |  174 +++
 src/vcfremap.cpp                   |  350 ++++++
 src/vcfremoveaberrantgenotypes.cpp |   75 ++
 src/vcfremovesamples.cpp           |   76 ++
 src/vcfroc.cpp                     |  469 +++++++
 src/vcfsample2info.cpp             |  218 ++++
 src/vcfsamplediff.cpp              |  200 +++
 src/vcfsamplenames.cpp             |   29 +
 src/vcfsamplestats.cpp             |  193 +++
 src/vcfsitesummarize.cpp           |   94 ++
 src/vcfsom.cpp                     |  626 ++++++++++
 src/vcfstats.cpp                   |  570 +++++++++
 src/vcfstreamsort.cpp              |  143 +++
 src/vcfuniq.cpp                    |   49 +
 src/vcfuniqalleles.cpp             |   54 +
 tests/lib/Local/vcflib/Test.pm     |   32 +
 tests/vcfdistance.t                |   98 ++
 120 files changed, 19059 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..99ce1a1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,118 @@
+*~
+.Rhistory
+.*swp
+.nfs*
+*.o
+BedReader.cpp
+Fasta.cpp
+Fasta.h
+Makefile.bad
+Multinomial.cpp
+Multinomial.h
+Pilot1
+Pilot2
+VCF.h
+VariantFilter.h
+b.vcf
+bugs/
+callgrind.out.7143
+f
+freebayes.chr20.integrated.nogeno.20101123.vcf
+glorder
+glorder.cpp
+glorder.py
+glorder.pyc
+gmon.out
+multimaptest.cpp
+pooled.sqlite
+pooled.sqlite3
+shunt
+shunt.c
+t.bed
+test.db
+test.vcf
+test.vcf.gz
+test.vcf.gz.tbi
+test/
+vcf2tsv
+vcfaddinfo
+vcfaddtag.cpp
+vcfafpath
+vcfallelicprimitives
+vcfaltcount
+vcfannotate
+vcfannotategenotypes
+vcfbreakmulti
+vcfcheck
+vcfclassify
+vcfcleancomplex
+vcfcommonsamples
+vcfcountalleles
+vcfcreatemulti
+vcfdistance
+vcfecho
+vcfentropy
+vcffilter
+vcffixup
+vcffixup.cpp.bak
+vcfflatten
+vcfgeno2haplo
+vcfgenotypecompare
+vcfgenotypes
+vcfglxgt
+vcfhaplotyecompare.cpp
+vcfhetcount
+vcfhethomratio
+vcfintersect
+vcfkeepfields
+vcfkeepgeno
+vcfkeepinfo
+vcfkeepsamples
+vcflength
+vcfmultiwaywwwindexfilter
+vcfnogeno
+vcfnogeno.cpp
+vcfnumalt
+vcfoverlay
+vcfparallel
+vcfparsealts
+vcfphylo.cpp
+vcfplotaltdiscrepancy.r.loess
+vcfplottstv.r
+vcfprimers
+vcfrandom
+vcfrandomsample
+vcfremap
+vcfremoveaberrantgenotypes
+vcfremovesamples
+vcfroc
+vcfsamplediff
+vcfsamplenames
+vcfsitesummarize
+vcfsom
+vcfsplit.cpp
+vcfstats
+vcfstreamsort
+vcfuniqalleles
+#vcfcountalleles.cpp#
+.vcfplotaltdiscrepancy.r.swo
+.vcfstats.cpp.swn
+.vcfstats.cpp.swo
+a.out
+vcfuniq
+vcfcat
+vcfevenregions
+vcfgenosummarize
+vcfgenosamplenames
+vcf2fasta
+bin/vcf2dag
+bin/vcfcombine
+bin/vcfgeno2alleles
+bin/vcfglbound
+bin/vcfindex
+bin/vcfinfo2qual
+bin/vcfinfosummarize
+bin/vcfleftalign
+bin/vcfqual2info
+bin/vcfsample2info
+libvcflib.a
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..9f675ec
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,21 @@
+[submodule "tabixpp"]
+	path = tabixpp
+	url = https://github.com/ekg/tabixpp.git
+[submodule "smithwaterman"]
+	path = smithwaterman
+	url = https://github.com/ekg/smithwaterman.git
+[submodule "multichoose"]
+	path = multichoose
+	url = https://github.com/ekg/multichoose.git
+[submodule "fastahack"]
+	path = fastahack
+	url = https://github.com/ekg/fastahack.git
+[submodule "intervaltree"]
+	path = intervaltree
+	url = https://github.com/ekg/intervaltree.git
+[submodule "fsom"]
+	path = fsom
+	url = https://github.com/ekg/fsom.git
+[submodule "filevercmp"]
+	path = filevercmp
+	url = https://github.com/ekg/filevercmp.git
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..0708937
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2012 Erik Garrison
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..5a45987
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,173 @@
+#OBJ_DIR = ./
+HEADERS = src/Variant.h \
+		  src/split.h \
+		  src/join.h
+SOURCES = src/Variant.cpp \
+		  src/split.cpp
+OBJECTS= $(SOURCES:.cpp=.o)
+
+# TODO
+#vcfstats.cpp
+
+BIN_SOURCES = src/vcfecho.cpp \
+			  src/vcfaltcount.cpp \
+			  src/vcfhetcount.cpp \
+			  src/vcfhethomratio.cpp \
+			  src/vcffilter.cpp \
+			  src/vcf2tsv.cpp \
+			  src/vcfgenotypes.cpp \
+			  src/vcfannotategenotypes.cpp \
+			  src/vcfcommonsamples.cpp \
+			  src/vcfremovesamples.cpp \
+			  src/vcfkeepsamples.cpp \
+			  src/vcfsamplenames.cpp \
+			  src/vcfgenotypecompare.cpp \
+			  src/vcffixup.cpp \
+			  src/vcfclassify.cpp \
+			  src/vcfsamplediff.cpp \
+			  src/vcfremoveaberrantgenotypes.cpp \
+			  src/vcfrandom.cpp \
+			  src/vcfparsealts.cpp \
+			  src/vcfstats.cpp \
+			  src/vcfflatten.cpp \
+			  src/vcfprimers.cpp \
+			  src/vcfnumalt.cpp \
+			  src/vcfcleancomplex.cpp \
+			  src/vcfintersect.cpp \
+			  src/vcfannotate.cpp \
+			  src/vcfallelicprimitives.cpp \
+			  src/vcfoverlay.cpp \
+			  src/vcfaddinfo.cpp \
+			  src/vcfkeepinfo.cpp \
+			  src/vcfkeepgeno.cpp \
+			  src/vcfafpath.cpp \
+			  src/vcfcountalleles.cpp \
+			  src/vcflength.cpp \
+			  src/vcfdistance.cpp \
+			  src/vcfrandomsample.cpp \
+			  src/vcfentropy.cpp \
+			  src/vcfglxgt.cpp \
+			  src/vcfroc.cpp \
+			  src/vcfcheck.cpp \
+			  src/vcfstreamsort.cpp \
+			  src/vcfuniq.cpp \
+			  src/vcfuniqalleles.cpp \
+			  src/vcfremap.cpp \
+			  src/vcf2fasta.cpp \
+			  src/vcfsitesummarize.cpp \
+			  src/vcfbreakmulti.cpp \
+			  src/vcfcreatemulti.cpp \
+			  src/vcfevenregions.cpp \
+			  src/vcfcat.cpp \
+			  src/vcfgenosummarize.cpp \
+			  src/vcfgenosamplenames.cpp \
+			  src/vcfgeno2haplo.cpp \
+			  src/vcfleftalign.cpp \
+			  src/vcfcombine.cpp \
+			  src/vcfgeno2alleles.cpp \
+			  src/vcfindex.cpp \
+			  src/vcf2dag.cpp \
+			  src/vcfsample2info.cpp \
+			  src/vcfqual2info.cpp \
+			  src/vcfinfo2qual.cpp \
+			  src/vcfglbound.cpp \
+			  src/vcfinfosummarize.cpp
+
+# when we can figure out how to build on mac
+# src/vcfsom.cpp
+
+#BINS = $(BIN_SOURCES:.cpp=)
+BINS = $(addprefix bin/,$(notdir $(BIN_SOURCES:.cpp=)))
+SHORTBINS = $(notdir $(BIN_SOURCES:.cpp=))
+
+TABIX = tabixpp/tabix.o
+
+FASTAHACK = fastahack/Fasta.o
+
+SMITHWATERMAN = smithwaterman/SmithWatermanGotoh.o 
+
+REPEATS = smithwaterman/Repeats.o
+
+INDELALLELE = smithwaterman/IndelAllele.o
+
+DISORDER = smithwaterman/disorder.o
+
+LEFTALIGN = smithwaterman/LeftAlign.o
+
+FSOM = fsom/fsom.o
+
+FILEVERCMP = filevercmp/filevercmp.o
+
+INCLUDES = -I. -L. -Ltabixpp/
+LDFLAGS = -lvcflib -ltabix -lz -lm
+
+
+all: $(OBJECTS) $(BINS)
+
+CXX = g++
+CXXFLAGS = -O3 -D_FILE_OFFSET_BITS=64
+#CXXFLAGS = -O2
+#CXXFLAGS = -pedantic -Wall -Wshadow -Wpointer-arith -Wcast-qual
+
+SSW = src/ssw.o src/ssw_cpp.o
+
+ssw.o: src/ssw.h
+ssw_cpp.o:src/ssw_cpp.h
+
+openmp:
+	$(MAKE) CXXFLAGS="$(CXXFLAGS) -fopenmp -D HAS_OPENMP"
+
+profiling:
+	$(MAKE) CXXFLAGS="$(CXXFLAGS) -g" all
+
+gprof:
+	$(MAKE) CXXFLAGS="$(CXXFLAGS) -pg" all
+
+$(OBJECTS): $(SOURCES) $(HEADERS) $(TABIX)
+	$(CXX) -c -o $@ src/$(*F).cpp $(INCLUDES) $(LDFLAGS) $(CXXFLAGS)
+
+$(TABIX):
+	cd tabixpp && $(MAKE)
+
+$(SMITHWATERMAN):
+	cd smithwaterman && $(MAKE)
+
+$(DISORDER): $(SMITHWATERMAN)
+
+$(REPEATS): $(SMITHWATERMAN)
+
+$(LEFTALIGN): $(SMITHWATERMAN)
+
+$(INDELALLELE): $(SMITHWATERMAN)
+
+$(FASTAHACK):
+	cd fastahack && $(MAKE)
+
+#$(FSOM):
+#	cd fsom && $(CXX) $(CXXFLAGS) -c fsom.c -lm
+
+$(FILEVERCMP):
+	cd filevercmp && make
+
+$(SHORTBINS):
+	$(MAKE) bin/$@
+
+$(BINS): $(BIN_SOURCES) libvcflib.a $(OBJECTS) $(SMITHWATERMAN) $(FASTAHACK) $(DISORDER) $(LEFTALIGN) $(INDELALLELE) $(SSW) $(FILEVERCMP)
+	$(CXX) src/$(notdir $@).cpp -o $@ $(INCLUDES) $(LDFLAGS) $(CXXFLAGS)
+
+libvcflib.a: $(OBJECTS) $(SMITHWATERMAN) $(REPEATS) $(FASTAHACK) $(DISORDER) $(LEFTALIGN) $(INDELALLELE) $(SSW) $(FILEVERCMP) $(TABIX)
+	ar rs libvcflib.a $(OBJECTS) smithwaterman/sw.o $(FASTAHACK) $(SSW) $(FILEVERCMP) $(TABIX) tabixpp/bgzf.o tabixpp/index.o tabixpp/knetfile.o tabixpp/kstring.o
+
+
+test: $(BINS)
+	@prove -Itests/lib -w tests/*.t
+
+clean:
+	rm -f $(BINS) $(OBJECTS)
+	rm -f ssw_cpp.o ssw.o
+	rm -f libvcflib.a
+	cd tabixpp && make clean
+	cd smithwaterman && make clean
+	cd fastahack && make clean
+
+.PHONY: clean all test
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..13603b2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,650 @@
+# vcflib
+### a C++ library for parsing and manipulating VCF files.
+[![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/ekg/vcflib?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+
+#### author: Erik Garrison <erik.garrison at bc.edu>
+
+#### license: MIT
+
+---
+
+## overview
+
+The [Variant Call Format (VCF)](http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41)
+is a flat-file, tab-delimited textual format
+intended to concisely describe reference-indexed variations between individuals. 
+VCF provides a common interchange format for the description of variation in individuals and populations of samples,
+and has become the _de facto_ standard reporting format for a wide array of genomic variant detectors.
+
+vcflib provides methods to manipulate and interpret sequence variation as it can be described by VCF.
+It is both:
+
+ * an API for parsing and operating on records of genomic variation as it can be described by the VCF format,
+ * and a collection of command-line utilities for executing complex manipulations on VCF files.
+
+The API itself provides a quick and extremely permissive method to read and write VCF files.
+Extensions and applications of the library provided in the included utilities (*.cpp) comprise the vast bulk of the library's utility for most users.
+
+## usage
+
+vcflib provides a variety of functions for VCF manipulation:
+
+### comparison
+
+ * Generate **haplotype-aware intersections** ([vcfintersect](#vcfintersect) -i), **unions** (vcfintersect -u), and **complements** (vcfintersect -v -i).
+ * **Overlay-merge** multiple VCF files together, using provided order as precedence ([vcfoverlay](#vcfoverlay)).
+ * **Combine** multiple VCF files together, handling samples when alternate allele descriptions are identical ([vcfcombine](#vcfcombine)).
+ * **Validate** the integrity and identity of the VCF by verifying that the VCF record's REF matches a given reference file ([vcfcheck](#vcfcheck)).
+
+### format conversion
+
+ * Convert a VCF file into a per-allele or per-genotype **tab-separated (.tsv)** file ([vcf2tsv](#vcf2tsv)).
+ * Store a VCF file in an **SQLite3** database (vcf2sqlite.py).
+ * Make a **BED file** from the intervals in a VCF file (vcf2bed.py).
+
+### filtering and subsetting
+
+ * **Filter** variants and genotypes using arbitrary expressions based on values in the INFO and sample fields ([vcffilter](#vcffilter)).
+ * **Randomly sample** a subset of records from a VCF file, given a rate ([vcfrandomsample](#vcfrandomsample)).
+ * **Select variants** of a certain type (vcfsnps, vcfbiallelic, vcfindels, vcfcomplex, etc.)
+
+### annotation
+
+ * **Annotate** one VCF file with fields from the INFO column of another, based on position ([vcfaddinfo](#vcfaddinfo), [vcfintersect](#vcfintersect)).
+ * Incorporate annotations or targets provided by a *BED* file ([vcfannotate](#vcfannotate), [vcfintersect](#vcfintersect)).
+ * Examine **genotype correspondence** between two VCF files by annotating samples in one file with genotypes from another ([vcfannotategenotypes](#vcfannotategenotypes)).
+ * Annotate variants with the **distance** to the nearest variant ([vcfdistance](#vcfdistance)).
+ * Count the number of alternate alleles represented in samples at each variant record ([vcfaltcount](#vcfaltcount)).
+ * **Subset INFO fields** to decrease file size and processing time ([vcfkeepinfo](#vcfkeepinfo)).
+ * Lighten up VCF files by keeping only a **subset of per-sample information** ([vcfkeepgeno](#vcfkeepgeno)).
+ * **Numerically index** alleles in a VCF file ([vcfindex](#vcfindex)).
+
+### samples
+
+ * Quickly obtain the **list of samples** in a given VCF file ([vcfsamplenames](#vcfsamplenames)).
+ * **Remove samples** from a VCF file ([vcfkeepsamples](#vcfkeepsamples), [vcfremovesamples](#vcfremovesamples)).
+
+### ordering
+
+ * **Sort variants** by genome coordinate ([vcfstreamsort](#vcfstreamsort)).
+ * **Remove duplicate** variants in vcfstreamsort'ed files according to their REF and ALT fields ([vcfuniq](#vcfuniq)).
+
+### variant representation
+
+ * **Break multiallelic** records into multiple records ([vcfbreakmulti](#vcfbreakmulti)), retaining allele-specific INFO fields.
+ * **Combine overlapping biallelic** records into a single record ([vcfcreatemulti](#vcfcreatemulti)).
+ * **Decompose complex variants** into a canonical SNP and indel representation ([vcfallelicprimitives](#vcfallelicprimitives)), generating phased genotypes for available samples.
+ * **Reconstitute complex variants** provided a phased VCF with samples ([vcfgeno2haplo](#vcfgeno2haplo)).
+ * **Left-align indel and complex variants** ([vcfleftalign](#vcfleftalign)).
+
+### genotype manipulation
+
+ * **Set genotypes** in a VCF file provided genotype likelihoods in the GL field ([vcfglxgt](#vcfglxgt)).
+ * Establish putative **somatic variants** using reported differences between germline and somatic samples ([vcfsamplediff](#vcfsamplediff)).
+ * Remove samples for which the reported genotype (GT) and observation counts disagree (AO, RO) ([vcfremoveaberrantgenotypes](#vcfremoveaberrantgenotypes)).
+
+### interpretation and classification of variants
+
+ * Obtain aggregate **statistics** about VCF files ([vcfstats](#vcfstats)).
+ * Print the **receiver-operating characteristic (ROC)** of one VCF given a truth set ([vcfroc](#vcfroc)).
+ * Annotate VCF records with the **Shannon entropy** of flanking sequence ([vcfentropy](#vcfentropy)).
+ * Calculate the heterozygosity rate ([vcfhetcount](#vcfhetcount)).
+ * Generate potential **primers** from VCF records ([vcfprimers](#vcfprimers)), to check for genome uniqueness.
+ * Convert the numerical representation of genotypes provided by the GT field to a **human-readable genotype format** ([vcfgenotypes](#vcfgenotypes)).
+ * Observe how different alignment parameters, including context and entropy-dependent ones, influence **variant classification and interpretation** ([vcfremap](#vcfremap)).
+ * **Classify variants** by annotations in the INFO field using a self-organizing map ([vcfsom](#vcfsom)); **re-estimate their quality** given known variants.
+
+
+A number of "helper" perl and python scripts (e.g. vcf2bed.py, vcfbiallelic) further extend functionality.
+
+In practice, users are encouraged to drive the utilities in the library in a streaming fashion, using pipes, to fully utilize resources on multi-core systems during interactive work.  Piping provides a convenient method to interface with other libraries (vcf-tools, BedTools, GATK, htslib, bcftools, freebayes) which interface via VCF files, allowing the composition of an immense variety of processing functions.
+
+## development
+
+See src/vcfecho.cpp for basic usage.  src/Variant.h and src/Variant.cpp describe methods available in the API.
+vcflib is incorporated into several projects, such as [freebayes](https://github.com/ekg/freebayes), which may provide a point of reference for prospective developers.
+Additionally, developers should be aware that vcflib contains submodules (git repositories) comprising its dependencies (outside of zlib and a *nix environment).
+
+
+## installing
+
+vcflib includes submodules, so to obtain vcflib you have to use:
+
+    % git clone --recursive git://github.com/ekg/vcflib.git
+
+or
+
+    % git clone --recursive https://github.com/ekg/vcflib.git
+
+To build, use Make:
+
+    % cd vcflib
+    % make
+
+Executables are built into the ./bin directory in the repository.
+A number of shell, perl, python, and R scripts already reside there.
+This makes installation easy, as users can add vcflib/bin to their path, or copy the contained executables to a directory already in their path.
+
+
+## executables
+
+### vcf2tsv
+    
+    usage: vcf2tsv [-n null_string] [-g] [vcf file]
+    Converts stdin or given VCF file to tab-delimited format, using null string to replace empty values in the table.
+    Specifying -g will output one line per sample with genotype information.
+    
+### vcfaddinfo
+    
+    usage: vcfaddinfo <vcf file> <vcf file>
+    Adds info fields from the second file which are not present in the first vcf file.
+    
+    
+### vcfafpath
+
+Uses allele frequencies in the AF info column to estimate phylogeny at multiallelic sites.
+
+
+### vcfallelicprimitives
+    
+    usage: vcfallelicprimitives [options] [file]
+    
+    options:
+        -m, --use-mnps          Retain MNPs as separate events (default: false)
+        -t, --tag-parsed FLAG   Tag records which are split apart of a complex allele
+                                with this flag
+    
+    If multiple alleleic primitives (gaps or mismatches) are specified in
+    a single VCF record, split the record into multiple lines, but drop all
+    INFO fields.  "Pure" MNPs are split into multiple SNPs unless the -m
+    flag is provided.  Genotypes are phased where complex alleles have been
+    decomposed, provided genotypes in the input.
+
+    
+### vcfaltcount
+    
+Counts the number of alternate alleles in the record.
+    
+    
+### vcfannotate
+    
+    usage: vcfannotate [options] [<vcf file>]
+
+    options:
+        -b, --bed   use annotations provided by this BED file
+        -k, --key   use this INFO field key for the annotations
+        -d, --default  use this INFO field key for records without annotations
+
+    Intersect the records in the VCF file with targets provided in a BED file.
+    Intersections are done on the reference sequences in the VCF file.
+    If no VCF filename is specified on the command line (last argument) the VCF
+    is read from stdin.
+
+    
+### vcfannotategenotypes
+    
+    usage: vcfannotategenotypes <annotation-tag> <vcf file> <vcf file>
+
+    annotates genotypes in the first file with genotypes in the second adding the
+    genotype as another flag to each sample field in the first file.
+    annotation-tag is the name of the sample flag which is added to store the
+    annotation.  also adds a 'has\_variant' flag for sites where the second file has
+    a variant.
+    
+    
+### vcfbreakmulti
+    
+    usage: vcfbreakmulti [options] [file]
+    
+    If multiple alleles are specified in a single record, break the record into
+    multiple lines, preserving allele-specific INFO fields.
+    
+    
+### vcfcheck
+    
+    usage: vcfcheck [options] <vcf file>
+    
+    options: -f, --fasta-reference  FASTA reference file to use to obtain
+                                    primer sequences
+    
+    Verifies that the VCF REF field matches the reference as described.
+    
+    
+    
+### vcfcleancomplex
+    
+Removes reference-matching sequence from complex alleles and adjusts records to
+reflect positional change.
+    
+    
+### vcfcombine
+
+    usage: vcfcombine [vcf file] [vcf file] ...
+
+    options:
+        -h --help           This text.
+        -r --region REGION  A region specifier of the form chrN:x-y to bound the merge
+
+Combines VCF files positionally, combining samples when sites and alleles are identical.
+Any number of VCF files may be combined.  The INFO field and other columns are taken from
+one of the files which are combined when records in multiple files match.  Alleles must
+have identical ordering to be combined into one record.  If they do not, multiple records
+will be emitted.
+
+
+### vcfcommonsamples
+    
+    usage: vcfcommonsamples <vcf file> <vcf file>
+    outputs each record in the first file, removing samples not present in the second
+    
+    
+### vcfcountalleles
+    
+Counts the total number of alleles in the input.
+
+
+### vcfcreatemulti
+
+If overlapping alleles are represented across multiple records, merge them into a single record.
+    
+### vcfdistance
+    
+Adds a value to each VCF record indicating the distance to the nearest variant
+in the file.
+    
+    
+### vcfentropy
+    
+    usage: vcfentropy [options] <vcf file>
+    
+    options: -f, --fasta-reference  FASTA reference file to use to obtain
+                                    primer sequences
+             -w, --window-size      Size of the window over which to calculate entropy
+    
+    Annotates the output VCF file with, for each record, EntropyLeft,
+    EntropyRight, EntropyCenter, which are the entropies of the sequence of the
+    given window size to the left, right, and center of the record.
+    
+    
+    
+### vcffilter
+    
+    usage: vcffilter [options] <vcf file>
+
+    options:
+        -f, --info-filter     specifies a filter to apply to the info fields of records,
+                              removes alleles which do not pass the filter
+        -g, --genotype-filter specifies a filter to apply to the genotype fields of records
+        -s, --filter-sites    filter entire records, not just alleles
+        -t, --tag-pass        tag vcf records as positively filtered with this tag, print all records
+        -F, --tag-fail        tag vcf records as negatively filtered with this tag, print all records
+        -A, --append-filter   append the existing filter tag, don't just replace it
+        -a, --allele-tag      apply -t on a per-allele basis.  adds or sets the corresponding INFO field tag
+        -v, --invert          inverts the filter, e.g. grep -v
+        -o, --or              use logical OR instead of AND to combine filters
+        -r, --region          specify a region on which to target the filtering, requires a BGZF
+                              compressed file which has been indexed with tabix.  any number of
+                              regions may be specified.
+
+    Filter the specified vcf file using the set of filters.
+    Filters are specified in the form "<ID> <operator> <value>":
+     -f "DP > 10"  # for info fields
+     -g "GT = 1|1" # for genotype fields
+     -f "CpG"  # for 'flag' fields
+
+    Operators can be any of: =, !, <, >, |, &
+
+    Any number of filters may be specified.  They are combined via logical AND
+    unless --or is specified on the command line.  Obtain logical negation through
+    the use of parentheses, e.g. "! ( DP = 10 )"
+
+    For convenience, you can specify "QUAL" to refer to the quality of the site, even
+    though it does not appear in the INFO fields.
+
+    
+### vcffixup
+
+Count the allele frequencies across alleles present in each record in the 
+VCF file.  (Similar to vcftools --freq.)
+
+Uses genotypes from the VCF file to correct AC (alternate allele count), AF
+(alternate allele frequency), and NS (number of called samples) in the VCF records.  For
+example:
+
+    % vcfkeepsamples file.vcf NA12878 | vcffixup - | vcffilter -f "AC > 0"
+
+Would downsample file.vcf to only NA12878, removing sites for which the sample
+was not called as polymorphic.
+    
+    
+### vcfflatten
+    
+    usage: vcfflatten [file]
+    
+    Removes multi-allelic sites by picking the most common alternate.  Requires
+    allele frequency specification 'AF' and use of 'G' and 'A' to specify the
+    fields which vary according to the Allele or Genotype. VCF file may be
+    specified on the command line or piped as stdin.
+    
+    
+### vcfgeno2haplo
+    
+    usage: vcfgeno2haplo [options] [<vcf file>]
+    
+    options:
+        -w, --window-size N       compare records up to this many bp away (default 30)
+        -r, --reference FILE      FASTA reference file, required with -i and -u
+    
+    Convert genotype-based phased alleles within --window-size into haplotype alleles.
+    
+    
+    
+### vcfgenotypecompare
+    
+    usage: vcfgenotypecompare <other-genotype-tag> <vcf file>
+    adds statistics to the INFO field of the vcf file describing the
+    amount of discrepancy between the genotypes (GT) in the vcf file and the
+    genotypes reported in the <other-genotype-tag>.  use this after
+    vcfannotategenotypes to get correspondence statistics for two vcfs.
+    
+    
+### vcfgenotypes
+    
+Converts numerical representation of genotypes (standard in GT field) to the
+alleles provided in the call's ALT/REF fields.
+    
+    
+### vcfglxgt
+    
+    usage: vcfglxgt [options] <vcf file>
+    
+    options:
+        -n, --fix-null-genotypes   only apply to null and partly-null genotypes
+    
+    Set genotypes using the maximum genotype likelihood for each sample.
+    
+    
+    
+### vcfhetcount
+    
+Count the number of heterozygotes in the input VCF.
+    
+    
+### vcfhethomratio
+    
+Provides the ratio between heterozygotes and homozygotes.
+
+### vcfindex
+
+Adds a field (id) which contains an allele-specific numerical index.
+    
+### vcfintersect
+    
+    usage: vcfintersect [options] [<vcf file>]
+    
+    options:
+        -b, --bed FILE            use intervals provided by this BED file
+        -v, --invert              invert the selection, printing only records which would
+                                    not have been printed out
+        -i, --intersect-vcf FILE  use this VCF for set intersection generation
+        -u, --union-vcf FILE      use this VCF for set union generation
+        -w, --window-size N       compare records up to this many bp away (default 30)
+        -r, --reference FILE      FASTA reference file, required with -i and -u
+        -l, --loci                output whole loci when one alternate allele matches
+        -m, --ref-match           intersect on the basis of record REF string
+        -t, --tag TAG             attach TAG to each record's info field if it would intersect
+        -V, --tag-value VAL       use this value to indicate that the allele is passing
+                                  '.' will be used otherwise.  default: 'PASS'
+        -M, --merge-from FROM-TAG
+        -T, --merge-to   TO-TAG   merge from FROM-TAG used in the -i file, setting TO-TAG
+                                  in the current file.
+    
+    For bed-vcf intersection, alleles which fall into the targets are retained.
+    
+    For vcf-vcf intersection and union, unify on equivalent alleles within window-size bp
+    as determined by haplotype comparison alleles.
+    
+    
+### vcfkeepgeno
+    
+    usage: vcfkeepgeno <vcf file> [FIELD1] [FIELD2] ...
+    outputs each record in the vcf file, removing FORMAT fields not listed
+    on the command line from sample specifications in the output
+    
+    
+### vcfkeepinfo
+    
+    usage: vcfkeepinfo <vcf file> [FIELD1] [FIELD2] ...
+    outputs each record in the vcf file, removing INFO fields not listed on the command line
+    
+    
+### vcfkeepsamples
+    
+    usage: vcfkeepsamples <vcf file> [SAMPLE1] [SAMPLE2] ...
+    outputs each record in the vcf file, removing samples not listed on the command line
+    
+    
+### vcfleftalign
+
+Left-align indels and complex variants in the input using a pairwise ref/alt
+alignment followed by a heuristic, iterative left realignment process that
+shifts indel representations to their absolute leftmost (5') extent.  This is
+the same procedure used in the internal left alignment in freebayes, and can be
+used when preparing VCF files for input to freebayes to decrease positional
+representation differences between the input alleles and left-realigned
+alignments.
+
+    usage: vcfleftalign [options] [file]
+
+    options:
+        -r, --reference FILE  Use this reference as a basis for realignment.
+        -w, --window N        Use a window of this many bp when left aligning (150).
+
+    Left-aligns variants in the specified input file or stdin.  Window size is determined
+    dynamically according to the entropy of the regions flanking the indel.  These must have
+    entropy > 1 bit/bp, or be shorter than ~5kb.
+
+
+### vcflength
+    
+Adds the length of the variant record (in [-/+]) relative to the reference allele to each VCF record.
+    
+    
+### vcfnumalt
+    
+Annotates the VCF stream on stdin with the number of alternate alleles at the site.
+    
+    
+### vcfoverlay
+    
+    usage: vcfoverlay [options] [<vcf file> ...]
+    
+    options:
+        -h, --help       this dialog
+    
+    Overlays records in the input vcf files in the order in which they appear.
+    
+    
+### vcfparsealts
+    
+Demonstration of alternate allele parsing method.  This method uses pairwise
+alignment of REF and ALTs to determine component allelic primitives for each
+alternate allele.
+
+Use `vcfallelicprimitives` to decompose records while preserving format.
+    
+    
+### vcfprimers
+    
+    usage: vcfprimers [options] <vcf file>
+    
+    options:
+        -f, --fasta-reference  FASTA reference file to use to obtain primer sequences
+        -l, --primer-length    The length of the primer sequences on each side of the variant
+    
+    For each VCF record, extract the flanking sequences, and write them to stdout as FASTA
+    records suitable for alignment.  This tool is intended for use in designing validation
+    experiments.  Primers are extracted which would flank all of the alleles at multi-allelic
+    sites.  The name of the FASTA "reads" indicates the VCF record which they apply to.
+    The form is >CHROM_POS_LEFT for the 3' primer and >CHROM_POS_RIGHT for the 5' primer,
+    for example:
+    
+    >20_233255_LEFT
+    CCATTGTATATATAGACCATAATTTCTTTATCCAATCATCTGTTGATGGA
+    >20_233255_RIGHT
+    ACTCAGTTGATTCCATACCTTTGCCATCATGAATCATGTTGTAATAAACA
+    
+    
+    
+### vcfrandomsample
+    
+    usage: vcfrandomsample [options] [<vcf file>]
+    
+    options:
+        -r, --rate RATE      base sampling probability per locus
+        -s, --scale-by KEY   scale sampling likelihood by this Float info field
+        -p, --random-seed N  use this random seed
+    
+    Randomly sample sites from an input VCF file, which may be provided as stdin.
+    Scale the sampling probability by the field specified in KEY.  This may be
+    used to provide uniform sampling across allele frequencies, for instance.
+    
+    
+### vcfremap
+    
+    usage: vcfremap [options] [<vcf file>]
+    
+    options:
+        -w, --ref-window-size N      align using this many bases flanking each side of the reference allele
+        -s, --alt-window-size N      align using this many flanking bases from the reference around each alternate allele
+        -r, --reference FILE         FASTA reference file, required with -i and -u
+        -m, --match-score N          match score for SW algorithm
+        -x, --mismatch-score N       mismatch score for SW algorithm
+        -o, --gap-open-penalty N     gap open penalty for SW algorithm
+        -e, --gap-extend-penalty N   gap extension penalty for SW algorithm
+        -z, --entropy-gap-open       use entropy scaling for the gap open penalty
+        -R, --repeat-gap-extend N    penalize non-repeat-unit gaps in repeat sequence
+        -a, --adjust-vcf TAG         supply a new cigar as TAG in the output VCF
+    
+    For each alternate allele, attempt to realign against the reference with lowered gap open penalty.
+    If realignment is possible, adjust the cigar and reference/alternate alleles.
+    
+    
+### vcfremoveaberrantgenotypes
+    
+Strips genotypes which are homozygous but have observations implying
+heterozygosity.  Requires RA (reference allele observation) and AA (alternate
+allele observation) for each genotype.
+    
+    
+### vcfremovesamples
+    
+    usage: vcfremovesamples <vcf file> [SAMPLE1] [SAMPLE2] ...
+    outputs each record in the vcf file, removing samples listed on the command line
+    
+    
+### vcfroc
+    
+    usage: vcfroc [options] [<vcf file>]
+    
+    options:
+        -t, --truth-vcf FILE      use this VCF as ground truth for ROC generation
+        -w, --window-size N       compare records up to this many bp away (default 30)
+        -r, --reference FILE      FASTA reference file
+    
+    Generates a pseudo-ROC curve using sensitivity and specificity estimated against
+    a putative truth set.  Thresholding is provided by successive QUAL cutoffs.
+    
+    
+### vcfsamplediff
+    
+    usage: vcfsamplediff <tag> <sample> <sample> [ <sample> ... ] <vcf file>
+    tags each record where the listed sample genotypes differ with <tag>
+    The first sample is assumed to be germline, the second somatic.
+    Each record is tagged with <tag>={germline,somatic,loh} to specify the type of
+    variant given the genotype difference between the two samples.
+    
+    
+### vcfsamplenames
+    
+Prints the names of the samples in the VCF file.
+
+    
+### vcfsom
+    
+    usage: vcfsom [options] [vcf file]
+    
+    training: 
+        vcfsom -s output.som -f "AF DP ABP" training.vcf
+    
+    application: 
+        vcfsom -a output.som -f "AF DP ABP" test.vcf >results.vcf
+    
+    vcfsom trains and/or applies a self-organizing map to the input VCF data
+    on stdin, adding two columns for the x and y coordinates of the winning
+    neuron in the network and an optional euclidean distance from a given
+    node (--center).
+    
+    If a map is provided via --apply,  map will be applied to input without
+    training.  Automated filtering to an estimated FP rate is 
+    
+    options:
+    
+        -h, --help             this dialog
+    
+    training:
+    
+        -f, --fields "FIELD ..."  INFO fields to provide to the SOM
+        -a, --apply FILE       apply the saved map to input data to FILE
+        -s, --save  FILE       train on input data and save the map to FILE
+        -t, --print-training-results
+                               print results of SOM on training input
+                               (you can also just use --apply on the same input)
+        -x, --width X          width in columns of the output array
+        -y, --height Y         height in rows of the output array
+        -i, --iterations N     number of training iterations or epochs
+        -d, --debug            print timing information
+    
+    recalibration:
+    
+        -c, --center X,Y       annotate with euclidean distance from center
+        -p, --paint-true VCF   use VCF file to annotate true variants (multiple)
+        -f, --paint-false VCF  use VCF file to annotate false variants (multiple)
+        -R, --paint-tag TAG    provide estimated FDR% in TAG in variant INFO
+        -N, --false-negative   replace FDR% (false detection) with FNR% (false negative)
+    
+    
+### vcfstats
+    
+    usage: vcfstats [options] <vcf file>
+    
+        -r, --region          specify a region on which to target the stats, requires a BGZF
+                              compressed file which has been indexed with tabix.  any number of
+                              regions may be specified.
+        -a, --add-info        add the statistics intermediate information to the VCF file,
+                              writing out VCF records instead of summary statistics
+        -l, --no-length-frequency    don't output the indel and mnp length-frequency spectra
+        -m, --match-score N          match score for SW algorithm
+        -x, --mismatch-score N       mismatch score for SW algorithm
+        -o, --gap-open-penalty N     gap open penalty for SW algorithm
+        -e, --gap-extend-penalty N   gap extension penalty for SW algorithm
+    
+    Prints statistics about variants in the input VCF file.
+    
+    
+### vcfstreamsort
+    
+Reads VCF on stdin and guarantees that the positional order is correct provided out-of-order
+variants are no more than 100 positions in the VCF file apart.
+
+
+### vcfuniq
+
+Like GNU uniq, but for VCF records.  Remove records which have the same position, ref, and alt
+as the previous record.
+
+
+### vcfuniqalleles
+
+For each record, remove any duplicate alternate alleles that may have resulted from merging
+separate VCF files.
diff --git a/bin/bed2region b/bin/bed2region
new file mode 100755
index 0000000..ffa40ef
--- /dev/null
+++ b/bin/bed2region
@@ -0,0 +1,9 @@
+#!/usr/bin/perl
+# Convert BED intervals on stdin to "chrom:start-end" region strings on stdout.
+while (<STDIN>) {
+    # Capture only the first three whitespace-separated columns so that extra
+    # BED columns (name, score, ...) are not swallowed into the end coordinate.
+    # Lines that do not match are skipped instead of reusing stale captures.
+    next unless /^(\S+)\s+(\S+)\s+(\S+)/;
+    print $1 . ":" . $2 . "-" . $3 . "\n";
+}
diff --git a/bin/plot_roc.r b/bin/plot_roc.r
new file mode 100755
index 0000000..5a99615
--- /dev/null
+++ b/bin/plot_roc.r
@@ -0,0 +1,153 @@
+#!/usr/bin/Rscript
+#
+# Draws ROC-style curves (TPR against FPR) for SNPs and indels and writes summary tables.
+# usage: plot_roc.r <prefix> <truthset> <results> <xmin> <xmax> <ymin> <ymax>
+#
+require(plyr)
+require(ggplot2)
+require(pracma)
+require(grid)
+
+argv <- commandArgs(trailingOnly = TRUE)
+
+prefix <- gsub("\\s","", argv[1])  # output file prefix, with all whitespace removed
+print(prefix)
+truthset <- argv[2]  # value of the 'set' column that marks the truth set
+print(truthset)
+results <- argv[3]  # path of the tab-delimited table read below
+print(results)
+xmin <- as.numeric(argv[4])  # axis limits handed to coord_cartesian()
+xmax <- as.numeric(argv[5])
+ymin <- as.numeric(argv[6])
+ymax <- as.numeric(argv[7])
+
+roc <- read.delim(results)
+
+bests <- ddply(roc, .(set), function(x) { data.frame(best_snps=with(x, min(false_negative_snps + false_positive_snps)), best_snp_threshold=min(subset(x, (false_negative_snps + false_positive_snps) == with(x, min(false_negative_snps + false_positive_snps)))$threshold ), best_indels=with(x, min(false_negative_indels + false_positive_indels)), best_indel_threshold=min(subset(x, (false_negative_indels + false_positive_indels) == with(x, min(false_negative_indels + false_positive_indels)))$th [...]
+
+write.table(bests, paste(prefix, ".bests.tsv", sep=""), row.names=FALSE, quote=FALSE, sep="\t")
+
+#abs(trapz(c(1, roc$complexfpr), c(1, roc$complextpr)))
+# largest num_snps / num_indels among the truth-set rows; used as the TPR denominators below
+true_snps <- with(subset(roc, set==truthset), max(num_snps))
+true_indels <- with(subset(roc, set==truthset), max(num_indels))
+
+# get ROC AUC per set: |trapz(FPR, TPR)| with the point (FPR = 1, TPR = max TPR) prepended; 0 when the truth set has none of that variant type
+auc <- ddply(roc, .(set),
+      function(x) {
+        data.frame(
+                   snp_auc=ifelse(true_snps>0,
+                     with(x,
+                          abs(trapz(c(1,
+                                      false_positive_snps/(false_positive_snps+ max(false_negative_snps + num_snps - false_positive_snps))),
+                                    c(max(1- false_negative_snps/true_snps),
+                                      1- false_negative_snps/true_snps)))),
+                     0),
+                   indel_auc=ifelse(true_indels>0,
+                     with(x,
+                          abs(trapz(c(1,
+                                      false_positive_indels/(false_positive_indels+ max(false_negative_indels + num_indels - false_positive_indels))),
+                                    c(max(1- false_negative_indels/true_indels),
+                                      1- false_negative_indels/true_indels)))),
+                     0)
+                   )
+      }
+      )
+
+write.table(auc, paste(prefix, ".auc.tsv", sep=""), row.names=FALSE, quote=FALSE, sep="\t")
+
+# per-set ROC points for SNPs: FPR and TPR vectors, each with the point (FPR = 1, TPR = max TPR) prepended
+rocsnps <- ddply(roc, .(set),
+      function(x) {
+        data.frame(
+                   FPR=
+                     with(x,
+                          c(1,
+                            false_positive_snps/(false_positive_snps+ max(false_negative_snps + num_snps - false_positive_snps)))),
+                   TPR=
+                      with(x,
+                          c(max(1- false_negative_snps/true_snps),
+                            1- false_negative_snps/true_snps)),
+                   type=as.factor("snps")
+                   )
+          }
+      )
+# the same construction for indels, labelled type = "indels"
+rocindels <- ddply(roc, .(set),
+      function(x) {
+        data.frame(
+                   FPR=
+                     with(x,
+                          c(1,
+                            false_positive_indels/(false_positive_indels+ max(false_negative_indels + num_indels - false_positive_indels)))),
+                   TPR=
+                      with(x,
+                          c(max(1- false_negative_indels/true_indels),
+                            1- false_negative_indels/true_indels)),
+                   type=as.factor("indels")
+                   )
+          }
+      )
+
+# disabled (the if (FALSE) body never runs): earlier plotting code, kept next to the "new versions" that follow
+if (FALSE) {
+if (true_snps>0) {
+  ggplot(subset(roc, set != truthset),
+         aes(false_positive_snps/(false_positive_snps+with(subset(roc, set==set), max(false_negative_snps + num_snps - false_positive_snps))),
+             1- false_negative_snps/with(subset(roc, set==set), max(false_negative_snps + num_snps - false_positive_snps)),
+             group=set,
+             color=set)) + scale_x_continuous("false positive rate") + scale_y_continuous("true positive rate") + geom_path() + theme_bw()
+            + coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
+  ggsave(paste(prefix, ".snps.png", sep=""), height=6, width=9)
+}
+
+if (true_indels>0) {
+  ggplot(subset(roc, set != truthset),
+         aes(false_positive_indels/(false_positive_indels+with(subset(roc, set==set), max(false_negative_indels + num_indels - false_positive_indels))),
+             1- false_negative_indels/with(subset(roc, set==set), max(false_negative_indels + num_indels - false_positive_indels)),
+             group=set,
+             color=set)) + scale_x_continuous("false positive rate") + scale_y_continuous("true positive rate") + geom_path() + theme_bw()
+            + coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
+  ggsave(paste(prefix, ".indels.png", sep=""), height=6, width=9)
+}
+}
+
+
+# new versions: plot the precomputed rocsnps / rocindels points, one path per set, leaving out the truth set itself
+if (true_snps>0) {
+  ggplot(subset(rocsnps, set != truthset),
+         aes(FPR,
+             TPR,
+             group=set,
+             color=set)) + scale_x_continuous("false positive rate") + scale_y_continuous("true positive rate") + geom_path() + theme_bw() + coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
+  ggsave(paste(prefix, ".snps.png", sep=""), height=6, width=9)
+}
+
+if (true_indels>0) {
+  ggplot(subset(rocindels, set != truthset),
+         aes(FPR,
+             TPR,
+             group=set,
+             color=set)) + scale_x_continuous("false positive rate") + scale_y_continuous("true positive rate") + geom_path() + theme_bw() + coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
+  ggsave(paste(prefix, ".indels.png", sep=""), height=6, width=9)
+}
+# combined figure: SNP and indel points bound together and faceted by 'type'; only drawn when both kinds are present
+if (true_indels>0 && true_snps>0) {
+
+(
+  ggplot(subset(rbind(rocsnps,rocindels), set != truthset),
+         aes(FPR,
+             TPR,
+             group=set,
+             color=set))
+    + scale_x_continuous("false positive rate")
+    + scale_y_continuous("true positive rate")
+    + geom_path()
+    + theme_bw()
+    + coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
+    + facet_grid(type ~ .)
+    + theme(panel.margin = unit(1, "lines")) 
+)
+  ggsave(paste(prefix, ".both.png", sep=""), height=5, width=5)
+
+}
diff --git a/bin/vcf2bed.py b/bin/vcf2bed.py
new file mode 100755
index 0000000..c04cb60
--- /dev/null
+++ b/bin/vcf2bed.py
@@ -0,0 +1,16 @@
+#!/usr/bin/python
+
+import sys
+
+for line in sys.stdin:
+    if line.startswith('#'):
+        continue
+    fields = line.strip().split()
+    # VCF is 1-based, BED is 0-based half open
+    # print out chrom, start, end, 
+    chrom = fields[0]
+    start = int(fields[1]) - 1
+    span = len(fields[3]) # handle multi-base alleles
+    end = start + span
+    name = fields[2]
+    print "\t".join(map(str, [chrom, start, end, name]))
diff --git a/bin/vcf2sqlite.py b/bin/vcf2sqlite.py
new file mode 100755
index 0000000..f0170b0
--- /dev/null
+++ b/bin/vcf2sqlite.py
@@ -0,0 +1,130 @@
+#!/usr/bin/python
+
+import sys
+import re
+import sqlite3
+# a database path is required; without it print the usage text and exit with status 1
+if len(sys.argv) < 2:
+    print "usage", sys.argv[0], " [dbname]"
+    print "reads VCF on stdin, and writes output to a sqlite3 db [dbname]"
+    exit(1)
+
+dbname = sys.argv[1]
+
+# parse the header
+# into a mapping from tag -> type
+
+infotypes = {}
+infonumbers = {}
+
+for line in sys.stdin:
+    if line.startswith('##INFO'):
+        #<ID=XRS,Number=1,Type=Float,
+        i = re.search("ID=(.*?),", line)
+        n = re.search("Number=(.*?),", line)
+        t = re.search("Type=(.*?),", line)
+        if i and n and t:
+            id = i.groups()[0]
+            number = n.groups()[0]
+            if number == "A":
+                number = -1
+            elif number == "G" or int(number) > 1:
+                # unclear how to deal with these
+                continue
+            else:
+                number = int(number)
+            typestr = t.groups()[0]
+            infotypes[id] = typestr
+            infonumbers[id] = number
+        else:
+            continue
+    elif line.startswith('##'):
+        continue
+    else:
+        break # header line, sample names etc.
+
+# write the table schema
+
+infotype_to_sqltype = {}
+infotype_to_sqltype["Flag"] = "boolean"
+infotype_to_sqltype["Integer"] = "integer"
+infotype_to_sqltype["Float"] = "real"
+infotype_to_sqltype["String"] = "text"
+
+tablecmd = """create table alleles"""
+specs = ["CHROM text",
+        "POS integer",
+        "ID text",
+        "REF text",
+        "ALT text",
+        "QUAL real",
+        "FILTER text"]
+
+sorted_fields = sorted(infotypes.keys())
+for field in sorted_fields:
+    infotype = infotypes[field]
+    sqltype = infotype_to_sqltype[infotype]
+    field = field.replace(".", "_") # escape periods, which are not allowed
+    specs.append(field + " " + sqltype)
+
+tablecmd += " (" + ", ".join(specs) + ")"
+
+conn = sqlite3.connect(dbname)
+conn.execute(tablecmd)
+
+# for each record
+# parse the record
+# for each allele
+
+for line in sys.stdin:
+    fields = line.split('\t')
+    chrom, pos, id, ref, alts, qual, filter, info = fields[:8]
+    alts = alts.split(",")
+    altindex = 0
+    chrom = "\'" + chrom + "\'"
+    id = "\'" + id + "\'"
+    ref = "\'" + ref + "\'"
+    filter = "\'" + filter + "\'"
+    for alt in alts:
+        alt = "\'" + alt + "\'"
+        info_values = {}
+        for pair in info.split(";"):
+            if pair.find("=") is not -1:
+                pair = pair.split("=")
+                key = pair[0]
+                value = pair[1]
+                if not infonumbers.has_key(key):
+                    continue
+                if infonumbers[key] == -1:
+                    values = value.split(",")
+                    value = values[altindex]
+                info_values[key] = value
+            else:
+               # boolean flag
+                info_values[pair] = "1"
+        ordered_insertion = []
+        for field in sorted_fields:
+            value = "null"
+            if info_values.has_key(field):
+                value = info_values[field]
+                if infotypes[field] == "String":
+                    value = "\'" + value + "\'"
+            else:
+                # missing flag means "false" for that flag
+                if infotypes[field] == "Flag":
+                    value = "0"
+            ordered_insertion.append(value)
+        cmd = "insert into alleles values (" \
+            + ", ".join([chrom, pos, id, ref, alt, qual, filter]) \
+            + ", " \
+            + ", ".join(ordered_insertion) + ")"
+        conn.execute(cmd)
+        altindex += 1
+
+conn.commit()
+
+# TODO ignoring samples (for now)
+
+# add indexes everywhere?
+
+conn.close()
diff --git a/bin/vcf_strip_extra_headers b/bin/vcf_strip_extra_headers
new file mode 100755
index 0000000..c8b05e6
--- /dev/null
+++ b/bin/vcf_strip_extra_headers
@@ -0,0 +1,18 @@
+#!/usr/bin/perl
+# Pass stdin through, keeping only the header lines that precede the first
+# record; any '#' line met after a record has been seen is dropped.
+
+my $in_leading_header = 1;
+
+while (my $line = <STDIN>) {
+    my $is_header = ($line =~ /^#/);
+
+    # the first non-header line ends the leading header section for good
+    $in_leading_header = 0 unless $is_header;
+
+    # header lines that show up after that point are the "extra" ones
+    next if $is_header && !$in_leading_header;
+
+    print $line;
+
+}
diff --git a/bin/vcfbiallelic b/bin/vcfbiallelic
new file mode 100755
index 0000000..7761fec
--- /dev/null
+++ b/bin/vcfbiallelic
@@ -0,0 +1,20 @@
+#!/usr/bin/perl
+#
+# Filter a VCF stream on stdin: header lines pass through unchanged and only
+# records whose ALT column holds a single allele (no comma) are printed.
+
+while (<STDIN>) {
+    if ($_ =~ /^#/) {
+        print;
+        next;
+    }
+    # ALT is the fifth tab-delimited column.  The match result is checked
+    # because a failed match does not reset $1..$5, so a malformed line would
+    # otherwise be judged by the captures of an earlier record.
+    next unless $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+    my $alt = $5;
+    # remove anything which isn't biallelic
+    if ($alt !~ /,/) {
+        print;
+    }
+}
diff --git a/bin/vcfclearid b/bin/vcfclearid
new file mode 100755
index 0000000..f428682
--- /dev/null
+++ b/bin/vcfclearid
@@ -0,0 +1,12 @@
+#!/usr/bin/python
+#
+
+import sys
+
+for line in sys.stdin:
+    if line.startswith("#"):
+        print line.strip()
+    else:
+        fields = line.strip().split("\t")
+        fields[2] = "."
+        print "\t".join(fields)
diff --git a/bin/vcfclearinfo b/bin/vcfclearinfo
new file mode 100755
index 0000000..a0512fd
--- /dev/null
+++ b/bin/vcfclearinfo
@@ -0,0 +1,12 @@
+#!/usr/bin/python
+#
+
+import sys
+
+for line in sys.stdin:
+    if line.startswith("#"):
+        print line.strip()
+    else:
+        fields = line.strip().split("\t")
+        fields[7] = "."
+        print "\t".join(fields)
diff --git a/bin/vcfcomplex b/bin/vcfcomplex
new file mode 100755
index 0000000..9c2b188
--- /dev/null
+++ b/bin/vcfcomplex
@@ -0,0 +1,26 @@
+#!/usr/bin/perl
+#
+# Filter a VCF stream on stdin: header lines pass through unchanged and a
+# record is printed when REF is longer than one base or when any ALT allele
+# differs in length from REF.
+
+while (<STDIN>) {
+    if ($_ =~ /^#/) {
+        print;
+        next;
+    }
+    # REF and ALT are the fourth and fifth tab-delimited columns.  The match
+    # result is checked because a failed match does not reset $1..$5, so a
+    # malformed line would otherwise be judged by an earlier record's captures.
+    next unless $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+    my ($ref, $alts) = ($4, $5);
+    my $snp = 1;
+    foreach my $alt (split(/,/, $alts)) {
+        if (length($ref) > 1 || length($alt) != length($ref)) {
+            $snp = 0;
+        }
+    }
+    if (!$snp) {
+        print;
+    }
+}
diff --git a/bin/vcffirstheader b/bin/vcffirstheader
new file mode 100755
index 0000000..2e77c2e
--- /dev/null
+++ b/bin/vcffirstheader
@@ -0,0 +1,16 @@
+#!/usr/bin/python
+
+import sys
+
+header=True
+for line in sys.stdin:
+    if line.startswith('##'):
+        if header:
+            print line.strip()
+        continue
+    elif line.startswith('#'):
+        if header:
+           print line.strip()
+           header=False
+        continue
+    print line.strip()
diff --git a/bin/vcfgtcompare.sh b/bin/vcfgtcompare.sh
new file mode 100755
index 0000000..d563313
--- /dev/null
+++ b/bin/vcfgtcompare.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Chains vcfcommonsamples, vcfannotategenotypes and vcfgenotypecompare over two VCF files.
+if [ $# -ne 3 ];
+then
+    echo "usage: $0 [annotation] [fileA] [fileB]" >&2
+    echo "annotates records in the first file with genotypes and sites from the second" >&2
+    exit 1  # a usage error must not look like success to the caller
+fi
+
+annotation=$1
+fileA=$2
+fileB=$3
+# expansions are quoted so that paths containing spaces or globs stay intact
+vcfcommonsamples "$fileA" "$fileB" \
+ | vcfannotategenotypes "$annotation" - "$fileB" \
+ | vcfgenotypecompare "$annotation" -
diff --git a/bin/vcfindelproximity b/bin/vcfindelproximity
new file mode 100755
index 0000000..982ba81
--- /dev/null
+++ b/bin/vcfindelproximity
@@ -0,0 +1,82 @@
+#!/usr/bin/perl
+#
+# usage: vcfindelproximity <prox>   (VCF on stdin, filtered VCF on stdout)
+# Prints indels, and other records only if no indel lies within <prox> bp.
+# Records after the last indel of a chromosome are checked against it alone.
+#
+# for line in the vcf
+# stuff the line into a queue
+# when you reach an indel
+# record the position
+# pop lines from the back of the queue until we are at the current position
+#
+
+my @lines;
+
+my $prox = $ARGV[0];
+
+my $lastchrom = "";
+my $indelpos = 0;
+
+# Empty the queue, oldest record first.  Indels are always printed; any other
+# record is printed only when it is at least $prox bp away from both indel
+# positions given.  A position of 0 means "no indel on that side".
+sub flush_queue {
+    my ($previndel, $nextindel) = @_;
+    while (@lines) {
+        my $line = pop(@lines);
+        # queued lines already matched this pattern when they were read
+        $line =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+        my ($qpos, $qref, $qalt) = ($2, $4, $5);
+        # print indels
+        if (length($qref) - length($qalt) != 0) {
+            print $line;
+            next;
+        }
+        # print other events which are more than prox away from indels
+        next if $previndel != 0 and abs($previndel - $qpos) < $prox;
+        next if $nextindel != 0 and abs($nextindel - $qpos) < $prox;
+        print $line;
+    }
+}
+
+while (<STDIN>) {
+
+    if ($_ =~ /^#/) {
+        print $_;
+        next;
+    }
+
+    # a failed match does not reset $1..$5, so malformed lines are skipped
+    # instead of being processed with an earlier record's fields
+    next unless $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+    my $chrom = $1;
+    my $pos = $2;
+    my $ref = $4;
+    my $alt = $5;
+
+    # if new chrom, print out everything from last one.  Names are strings
+    # and need eq/ne: "==" compares them numerically ("chrX" == "chrY" holds).
+    if ($lastchrom ne "" and $chrom ne $lastchrom) {
+        flush_queue($indelpos, 0);
+        $indelpos = 0;
+    }
+    $lastchrom = $chrom;
+
+    unshift(@lines, $_);
+
+    # NOTE(review): $alt is the whole ALT column, so a multi-allelic record
+    # counts as an indel whenever that string's length differs from REF
+    my $diff = length($ref) - length($alt);
+
+    if ($diff != 0) {
+        # release or drop everything queued since the previous indel; the
+        # first indel of a chromosome has no predecessor ($indelpos is 0)
+        flush_queue($indelpos, $pos);
+        $indelpos = $pos;
+    }
+}
+
+# flush lines end of file: whatever was queued after the last indel is
+# still waiting in @lines and is checked against that indel only
+flush_queue($indelpos, 0);
diff --git a/bin/vcfindels b/bin/vcfindels
new file mode 100755
index 0000000..1b92a45
--- /dev/null
+++ b/bin/vcfindels
@@ -0,0 +1,26 @@
+#!/usr/bin/perl
+#
+# Filter a VCF stream on stdin: header lines pass through unchanged and a
+# record is printed when at least one ALT allele differs in length from REF
+# (REF and ALT are the fourth and fifth tab-delimited columns).
+
+while (<STDIN>) {
+    if ($_ =~ /^#/) {
+        print;
+        next;
+    }
+    # The match result is checked because a failed match does not reset
+    # $1..$5, so a malformed line would otherwise be judged by the captures
+    # of an earlier record.
+    next unless $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+    my ($ref, $alts) = ($4, $5);
+    my $hasindel = 0;
+    foreach my $alt (split(/,/, $alts)) {
+        if (length($alt) != length($ref)) {
+            $hasindel = 1;
+        }
+    }
+    if ($hasindel) {
+        print;
+    }
+}
diff --git a/bin/vcfmultiallelic b/bin/vcfmultiallelic
new file mode 100755
index 0000000..47f3dce
--- /dev/null
+++ b/bin/vcfmultiallelic
@@ -0,0 +1,20 @@
+#!/usr/bin/perl
+#
+# Filter a VCF stream on stdin: header lines pass through unchanged and only
+# records whose ALT column lists several alleles (contains a comma) are printed.
+
+while (<STDIN>) {
+    if ($_ =~ /^#/) {
+        print;
+        next;
+    }
+    # ALT is the fifth tab-delimited column.  The match result is checked
+    # because a failed match does not reset $1..$5, so a malformed line would
+    # otherwise be judged by the captures of an earlier record.
+    next unless $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+    my $alt = $5;
+    # remove anything which isn't multiallelic
+    if ($alt =~ /,/) {
+        print;
+    }
+}
diff --git a/bin/vcfmultiway b/bin/vcfmultiway
new file mode 100755
index 0000000..9536a65
--- /dev/null
+++ b/bin/vcfmultiway
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+reference=$1
+shift
+
+echo comparing $@
+
+for fileA in $@;
+do
+    for fileB in $@;
+    do
+	if [ "$fileA" = "$fileB" ]
+	then
+	    vcfstats $fileA >$(basename $fileA).stats.txt
+	else
+	    vcfintersect -r $reference -i $fileA $fileB | vcfstats >$(basename $fileA).common.$(basename $fileB).stats.txt
+	    vcfintersect -r $reference -v -i $fileB $fileA | vcfstats >$(basename $fileA).unique.$(basename $fileB).stats.txt
+	fi
+    done
+done
diff --git a/bin/vcfmultiwayscripts b/bin/vcfmultiwayscripts
new file mode 100755
index 0000000..8373805
--- /dev/null
+++ b/bin/vcfmultiwayscripts
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+reference=$1
+outdir=$2
+scriptsdir=$3
+shift
+shift
+shift
+
+mkdir -p $outdir
+mkdir -p $scriptsdir
+
+echo comparing $@
+
+for fileA in $@;
+do
+    fileA=$(pwd)/$fileA
+    for fileB in $@;
+    do
+	fileB=$(pwd)/$fileB
+	echo $fileA vs $fileB
+	if [ "$fileA" = "$fileB" ]
+	then
+	    echo "vcfstats $fileA >$outdir/$(basename $fileA).stats" >$scriptsdir/$(basename $fileA).sh
+	else
+	    echo "vcfintersect -r $reference -i $fileA $fileB | vcfstats >$outdir/$(basename $fileA).common.$(basename $fileB).stats" >$scriptsdir/$(basename $fileA).common.$(basename $fileB).sh
+	    echo "vcfintersect -r $reference -v -i $fileB $fileA | vcfstats >$outdir/$(basename $fileA).unique.$(basename $fileB).stats" >$scriptsdir/$(basename $fileA).unique.$(basename $fileB).sh
+	fi
+    done
+done
diff --git a/bin/vcfnobiallelicsnps b/bin/vcfnobiallelicsnps
new file mode 100755
index 0000000..2433eee
--- /dev/null
+++ b/bin/vcfnobiallelicsnps
@@ -0,0 +1,29 @@
+#!/usr/bin/perl
+#
+# Filter a VCF stream on stdin: header lines pass through unchanged and every
+# record is printed except biallelic SNPs, i.e. records with a single ALT
+# allele where both REF and ALT are one base long.
+
+while (<STDIN>) {
+    if ($_ =~ /^#/) {
+        print;
+        next;
+    }
+    # REF and ALT are the fourth and fifth tab-delimited columns.  The match
+    # result is checked because a failed match does not reset $1..$5, so a
+    # malformed line would otherwise be judged by an earlier record's captures.
+    next unless $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+    my ($ref, $alts) = ($4, $5);
+    # a comma in ALT means more than one alternate allele
+    my $biallelic = ($alts =~ /,/) ? 0 : 1;
+    my $hasnonsnp = 0;
+    # an allele counts as a SNP only when it and REF are both a single base
+    foreach my $alt (split(/,/, $alts)) {
+        if (!(length($alt)==1 && length($alt) == length($ref))) {
+            $hasnonsnp = 1;
+        }
+    }
+    if ($hasnonsnp || !$biallelic) {
+        print;
+    }
+}
diff --git a/bin/vcfnoindels b/bin/vcfnoindels
new file mode 100755
index 0000000..c2051ac
--- /dev/null
+++ b/bin/vcfnoindels
@@ -0,0 +1,25 @@
+#!/usr/bin/perl
+#
+# Filter a VCF stream on stdin: header lines pass through unchanged and a
+# record is printed only when every ALT allele has the same length as REF.
+
+while (<STDIN>) {
+    if ($_ =~ /^#/) {
+        print;
+        next;
+    }
+    # REF and ALT are the fourth and fifth tab-delimited columns.  The match
+    # result is checked because a failed match does not reset $1..$5, so a
+    # malformed line would otherwise be judged by an earlier record's captures.
+    next unless $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+    my ($ref, $alts) = ($4, $5);
+    my $hasindel = 0;
+    foreach my $alt (split(/,/, $alts)) {
+        if (length($alt) != length($ref)) {
+            $hasindel = 1;
+        }
+    }
+    if (! $hasindel) {
+        print;
+    }
+}
diff --git a/bin/vcfnosnps b/bin/vcfnosnps
new file mode 100755
index 0000000..19ee084
--- /dev/null
+++ b/bin/vcfnosnps
@@ -0,0 +1,25 @@
+#!/usr/bin/perl
+#
+# Filter a VCF stream on stdin: header lines pass through unchanged and a
+# record is printed when at least one ALT allele is not a one-base SNP.
+
+while (<STDIN>) {
+    if ($_ =~ /^#/) {
+        print;
+        next;
+    }
+    # REF and ALT are the fourth and fifth tab-delimited columns.  The match
+    # result is checked because a failed match does not reset $1..$5, so a
+    # malformed line would otherwise be judged by an earlier record's captures.
+    next unless $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+    my ($ref, $alts) = ($4, $5);
+    my $hasnonsnp = 0;
+    foreach my $alt (split(/,/, $alts)) {
+        if (!(length($alt)==1 && length($alt) == length($ref))) {
+            $hasnonsnp = 1;
+        }
+    }
+    if ($hasnonsnp) {
+        print;
+    }
+}
diff --git a/bin/vcfnulldotslashdot b/bin/vcfnulldotslashdot
new file mode 100755
index 0000000..9951a34
--- /dev/null
+++ b/bin/vcfnulldotslashdot
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+
+import sys
+import math
+
+def bincoeff(n,k): return math.factorial(n) / (math.factorial(n-k)*math.factorial(k))
+def multcoeff(n,k): return bincoeff(n+k-1,k)
+
+for line in sys.stdin:
+    if line.startswith("#"):
+        print line.strip()
+        continue
+    fields = line.strip().split("\t")
+    alleles = len(fields[4].split(","))+1
+    # assume that we have GT:GL
+    # how many genotypes?  assume diploid
+    flatgls = ",".join(map(str,[0]*multcoeff(alleles,2)))
+    for i in range(9, len(fields)):
+        if fields[i] == ".":
+            fields[i] = "./.:" + flatgls
+    print "\t".join(fields)
+    
diff --git a/bin/vcfplotaltdiscrepancy.r b/bin/vcfplotaltdiscrepancy.r
new file mode 100755
index 0000000..8717987
--- /dev/null
+++ b/bin/vcfplotaltdiscrepancy.r
@@ -0,0 +1,511 @@
+#!/usr/bin/Rscript
+
+# helper functions
+# nan.to.zero: give back 0 when n is NaN (e.g. the result of 0/0), otherwise n itself
+nan.to.zero <- function(n) {
+    if (!is.nan(n)) return(n) else return(0)
+}
+
+
+# get the input VCF tabular format, assert that sites must have AC > 0
+vcf <- subset(read.table(pipe('cat /dev/stdin'), header=T), AC > 0)
+
+filename <- commandArgs(TRUE)[1]
+tag <- commandArgs(TRUE)[2]
+
+# column names derived from the tag; each is looked up in the input table below
+tag.genotypes_alternate_count <- paste(tag, '.genotypes.alternate_count', sep='')
+tag.non_reference_discrepancy_count <- paste(tag, '.site.non_reference_discrepancy.count', sep='')
+tag.non_reference_discrepancy_normalizer <- paste(tag, '.site.non_reference_discrepancy.normalizer', sep='')
+tag.non_reference_sensitivity_count <- paste(tag, '.site.non_reference_sensitivity.count', sep='')
+tag.non_reference_sensitivity_normalizer <- paste(tag, '.site.non_reference_sensitivity.normalizer', sep='')
+tag.alternate_positive_discrepancy <- paste(tag, '.site.alternate_positive_discrepancy', sep='')
+tag.alternate_negative_discrepancy <- paste(tag, '.site.alternate_negative_discrepancy', sep='')
+tag.has_variant <- paste(tag, '.has_variant', sep='')
+
+vcf.numberOfSites <- length(vcf[, tag.genotypes_alternate_count])
+vcf.totalAltAlleles <- sum(vcf[, tag.genotypes_alternate_count])
+vcf.positiveDiscrepancy <- sum(vcf[, tag.alternate_positive_discrepancy]) / sum(vcf[, tag.genotypes_alternate_count])
+vcf.negativeDiscrepancy <- sum(vcf[, tag.alternate_negative_discrepancy]) / sum(vcf[, tag.genotypes_alternate_count])
+vcf.sitesTruePositive <- mean(vcf[, tag.has_variant])
+
+min_sites <- 5  # number of sites required for "simple plotting"
+
+#library(ggplot2)
+#vcf2 <- data.frame(QUAL=vcf$QUAL, AC=vcf$AC, has_variant=vcf[, tag.has_variant])
+#qplot(AC, has_variant, group=AC, geom="boxplot", data=subset(vcf2, AC <= 20))
+#ggsave(paste(filename, '.', tag, '.PD.vs.AC.boxplot.ac_lt_20.pdf', sep=''))
+
+# whole-file totals, written to stdout
+cat('number of sites', vcf.numberOfSites, '\n')
+cat('total alternate alleles', vcf.totalAltAlleles, '\n')
+cat('positive discrepancy', vcf.positiveDiscrepancy, '\n')
+cat('negative discrepancy', vcf.negativeDiscrepancy, '\n')
+
+#x <- cbind(tapply(vcf, as.list(seq(0,max(vcf$AC))),
+#    function(x) {
+#        sum(x[, tag.alternate_positive_discrepancy]) / sum(x[, tag.genotypes_alternate_count])
+#    }))
+# per-AC table: one row for every allele count from 1 to max(vcf$AC)
+byac <- data.frame(ac=as.vector(seq(1,max(vcf$AC)))) #, fdr=as.vector(x))
+
+# fdr: positive discrepancy / alternate genotype count over the sites at this AC (NaN mapped to 0)
+byac$fdr <- as.vector(cbind(by(byac$ac, byac$ac,
+    function(x) {
+        s <- subset(vcf, AC == x)
+        return(nan.to.zero(sum(s[, tag.alternate_positive_discrepancy]) / sum(s[, tag.genotypes_alternate_count])))
+    })))
+
+# false detection count
+byac$fpc <- as.vector(cbind(by(byac$ac, byac$ac,
+    function(x) {
+        s <- subset(vcf, AC == x)
+        return(sum(s[, tag.alternate_positive_discrepancy]))
+    })))
+# alleles: summed alternate genotype count over the sites at this AC
+byac$alleles <- as.vector(cbind(by(byac$ac, byac$ac,
+    function(x) {
+        s <- subset(vcf, AC == x)
+        return(sum(s[, tag.genotypes_alternate_count]))
+    })))
+# sites: number of input rows at this AC
+byac$sites <- as.vector(cbind(by(byac$ac, byac$ac,
+    function(x) {
+        s <- subset(vcf, AC == x)
+        return(length(s$AC))
+    })))
+
+# count true positive sites
+byac$site_tpc <- as.vector(cbind(by(byac$ac, byac$ac,
+    function(x) {
+        s <- subset(vcf, AC == x)
+        return(sum(s[, tag.has_variant]))
+    })))
+
+# fpc == false detection count
+byac$site_fpc <- byac$sites - byac$site_tpc
+# site detection fpr is 1 - true positive rate
+byac$site_fpr <- 1 - ( byac$site_tpc / byac$sites )
+# summarise the per-AC table built so far
+summary(byac)
+
+#print(byac$sites)
+#print(byac$site_tpc)
+#print(byac$site_fpc)
+#print(byac$site_fpr)
+
+#byac$site_fpr_gt0 <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) { 
+#    s <- subset(byac, ac == i, select=c(site_fpr, sites))
+#if (s$sites >= min_sites) {
+#    return(s$site_fpr)
+#} else {
+#    return(NA)
+#}
+#})))
+# site_fprlt: site_fpc / sites pooled over all rows with ac <= i
+#byac$site_fprlt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) mean(subset(byac, ac <= i, select=site_fpr)))))
+byac$site_fprlt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) { 
+    s <- subset(byac, ac <= i, select=c(site_fpc, sites))
+    return(sum(s$site_fpc) / sum(s$sites))
+})))
+# site_fprgt: the same ratio pooled over all rows with ac >= i
+#byac$site_fprgt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) mean(subset(byac, ac >= i, select=site_fpr)))))
+byac$site_fprgt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) { 
+    s <- subset(byac, ac >= i, select=c(site_fpc, sites))
+    return(sum(s$site_fpc) / sum(s$sites))
+})))
+# cfa: fraction of all alternate alleles found at ac <= i
+byac$cfa <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) sum(subset(byac, ac <= i, select=alleles)) / sum(byac$alleles))))
+# cfs: fraction of all sites found at ac <= i
+byac$cfs <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) sum(subset(byac, ac <= i, select=sites)) / length(vcf$AC))))
+
+# inappropriate collapse via averaging of fdr
+#byac$alternate_pdlt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) mean(subset(byac, ac <= i, select=fdr)))))
+# alternate_pdr: fpc / alleles at exactly this ac
+byac$alternate_pdr <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) { 
+    s <- subset(byac, ac == i, select=c(fpc, alleles))
+    return(sum(s$fpc) / sum(s$alleles))
+})))
+
+# use this one: fpc / alleles pooled over all rows with ac <= i
+byac$alternate_pdlt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) { 
+    s <- subset(byac, ac <= i, select=c(fpc, alleles))
+    return(sum(s$fpc) / sum(s$alleles))
+})))
+
+#byac$alternate_pdgt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) mean(subset(byac, ac >= i, select=fdr)))))
+byac$alternate_pdgt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) { 
+    s <- subset(byac, ac >= i, select=c(fpc, alleles))
+    return(sum(s$fpc) / sum(s$alleles))
+})))
+# nrs / nrd: non-reference sensitivity and discrepancy (count / normalizer) over sites at this AC
+byac$nrs <- as.vector(cbind(by(byac$ac, byac$ac,
+    function(x) {
+        s <- subset(vcf, AC == x)
+        return(nan.to.zero(sum(s[, tag.non_reference_sensitivity_count]) / sum(s[, tag.non_reference_sensitivity_normalizer])))
+    })))
+
+byac$nrd <- as.vector(cbind(by(byac$ac, byac$ac,
+    function(x) {
+        s <- subset(vcf, AC == x)
+        return(nan.to.zero(sum(s[, tag.non_reference_discrepancy_count]) / sum(s[, tag.non_reference_discrepancy_normalizer])))
+    })))
+# nrslt / nrdlt: the same two ratios pooled over all sites with AC <= i
+byac$nrslt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) {
+    s <- subset(vcf, AC <= i, select=c(tag.non_reference_sensitivity_count, tag.non_reference_sensitivity_normalizer))
+    return(nan.to.zero(sum(s[, tag.non_reference_sensitivity_count]) / sum(s[, tag.non_reference_sensitivity_normalizer])))
+})))
+
+byac$nrdlt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) { 
+    s <- subset(vcf, AC <= i, select=c(tag.non_reference_discrepancy_count, tag.non_reference_discrepancy_normalizer))
+    return(nan.to.zero(sum(s[, tag.non_reference_discrepancy_count]) / sum(s[, tag.non_reference_discrepancy_normalizer])))
+})))
+# rows backed by at least min_sites sites; used for the per-AC point series in the plots
+byac_gtsites <- subset(byac, sites >= min_sites)
+
+
+if (FALSE) {  # disabled: loess-smoothed variant of the PD-vs-AC plot, never executed
+pdf(paste(filename, '.', tag, '.PD.vs.AC.smooth.pdf', sep=''))
+par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byac$cfa, ylim=c(0,1.0),
+    xlab='alternate allele count (AC)', xaxt='n',
+    ylab='', yaxt='n', type='l', col='red')
+axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+axis(1, at=seq(0,max(byac$ac),10), labels=seq(0,max(byac$ac),10), cex=0.75)
+grid(lty=5)
+par(new=T)
+title(paste(filename, 'positive discrepancy versus', tag, '(smoothed)'))
+par(new=T)
+countTicks <- seq(0,1,0.1) * vcf.numberOfSites
+axis(4, at=seq(0,1,0.1), labels=round(countTicks))
+mtext("number of sites", side=4, line=3, cex=0.75)
+par(new=T)
+#plot(byac$alternate_pdlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='green')
+plot(byac$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')
+par(new=T)
+lines(byac$ac, predict(loess(byac$alternate_pdr ~ byac$ac, span=0.5)), col="green")
+par(new=T)
+plot(byac$cfa, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='red')
+par(new=T)
+plot(byac$cfs, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='purple')
+par(new=T)
+lines(byac$ac, predict(loess(byac$site_fpr ~ byac$ac, span=0.5)), col="blue")
+par(new=T)
+lines(byac$ac, predict(loess(byac$nrs ~ byac$ac, span=0.5)), col="magenta")
+par(new=T)
+lines(byac$ac, predict(loess(byac$nrd ~ byac$ac, span=0.5)), col="brown")
+par(new=T, cex=0.65)
+mtext(paste("alternate genotype PD: ", round(vcf.positiveDiscrepancy, digits=4), ", site PD: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T, cex=0.65)
+legend('topleft', c('cumulative fraction of alt alleles', 'cumulative fraction of sites', 'alt genotypes PD', 'site PD', 'non-ref sensitivity', 'non-ref discrepancy', 'site PD at AC'),
+    fill=c('red', 'purple', 'green', 'blue', 'magenta', 'brown', 'black'))
+garbage <- dev.off()
+}
+
+
+
+pdf(paste(filename, '.', tag, '.PD.vs.AC.cumulative.pdf', sep=''))  # output file: <filename>.<tag>.PD.vs.AC.cumulative.pdf
+par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byac$cfa, ylim=c(0,1.0), 
+    xlab='alternate allele count (AC)', xaxt='n',
+    ylab='', yaxt='n', type='l', col='red')
+axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+axis(1, at=seq(0,max(byac$ac),10), labels=seq(0,max(byac$ac),10), cex=0.75)
+grid(lty=5)
+par(new=T)
+title(paste(filename, 'positive discrepancy versus', tag, '(cumulative)'))
+par(new=T)
+countTicks <- seq(0,1,0.1) * vcf.numberOfSites  # right-hand axis: the 0..1 scale relabelled as site counts
+axis(4, at=seq(0,1,0.1), labels=round(countTicks))
+mtext("number of sites", side=4, line=3, cex=0.75)
+par(new=T)
+#plot(byac_gtsites$ac, byac_gtsites$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')
+plot(byac$site_fpr, ylim=c(0,1.0),  xlab='', xaxt='n', ylab='', yaxt='n')  # points: site PD at each AC
+par(new=T)
+plot(byac$alternate_pdlt, ylim=c(0,1.0),  xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='green')
+par(new=T)
+plot(byac$cfa, ylim=c(0,1.0),  xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='red')
+par(new=T)
+plot(byac$cfs, ylim=c(0,1.0),  xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='purple')
+par(new=T)
+plot(byac$site_fprlt, ylim=c(0,1.0),  xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='blue')
+par(new=T)
+plot(byac$nrslt, ylim=c(0,1.0),  xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='magenta')
+par(new=T)
+plot(byac$nrdlt, ylim=c(0,1.0),  xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='brown')
+par(new=T, cex=0.65)
+mtext(paste("alternate genotype PD: ", round(vcf.positiveDiscrepancy, digits=4), ", site PD: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T, cex=0.65)
+legend('topleft', c('cumulative fraction of alt alleles', 'cumulative fraction of sites', 'alt genotypes PD', 'site PD', 'non-ref sensitivity', 'non-ref discrepancy', 'site PD at AC'),
+    fill=c('red', 'purple', 'green', 'blue', 'magenta', 'brown', 'black'))  # fill colours listed in the same order as the labels
+garbage <- dev.off()
+
+
+
+pdf(paste(filename, '.', tag, '.PD.vs.AC.cumulative.simple.pdf', sep=''))  # reduced plot: cfs, cumulative site PD, per-AC site PD
+par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byac$cfs, ylim=c(0,1.0), xlim=c(0,max(vcf$AC)),
+    xlab='alternate allele count (AC)', xaxt='n',
+    ylab='', yaxt='n', type='l', col='purple')
+axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+axis(1, at=seq(0,max(byac$ac),10), labels=seq(0,max(byac$ac),10), cex=0.75)
+grid(lty=5)
+par(new=T)
+title(paste(filename, 'positive discrepancy versus', tag, '(cumulative)'))
+par(new=T)
+countTicks <- seq(0,1,0.1) * vcf.numberOfSites
+axis(4, at=seq(0,1,0.1), labels=round(countTicks))
+mtext("number of sites", side=4, line=3, cex=0.75)
+par(new=T)
+plot(byac$site_fprlt, ylim=c(0,1.0), xlim=c(0,max(vcf$AC)), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='blue')
+par(new=T)
+plot(byac_gtsites$ac, byac_gtsites$site_fpr, ylim=c(0,1.0), xlim=c(0,max(vcf$AC)),   xlab='', xaxt='n', ylab='', yaxt='n')  # points only for ACs with >= min_sites sites
+par(new=T, cex=0.65)
+mtext(paste("site PD: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T, cex=0.65)
+legend('topleft', c('cumulative fraction of sites', 'cumulative site PD', paste('site PD at AC (>=', min_sites, 'sites)')),
+    fill=c('purple', 'blue', 'black'))
+garbage <- dev.off()
+
+
+
+
+pdf(paste(filename, '.', tag, '.PD.vs.AC.instantaneous.ac_lt_20.pdf', sep=''))  # site counts per AC, x axis limited to 0..20
+#par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byac$sites, ylim=c(0,max(byac$sites)), xlim=c(0,20),
+    xlab='alternate allele count (AC)', xaxt='n',
+    ylab='number of sites', type='l', pch=19, col='blue')
+#axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+#countTicks <- round(seq(0,1,0.1) * max(byac$sites))
+#axis(2, at=countTicks, labels=countTicks)
+par(new=T)
+axis(1, at=seq(0,max(byac$ac),1), labels=seq(0,max(byac$ac),1))
+grid(lty=5)
+par(new=T)
+plot(byac$sites, ylim=c(0,max(byac$sites)), xlim=c(0,20), type='o', pch=19, col='blue', xlab='', xaxt='n', ylab='', yaxt='n')  # blue: number of sites at each AC
+par(new=T)
+title(paste(filename, 'positive discrepancy versus', tag, '(instantaneous)'))
+par(new=T)
+mtext("number of sites", side=2, line=3) #, cex=0.75)
+#par(new=T)
+#plot(byac$site_fprlt, ylim=c(0,1.0), xlim=c(0,20),  xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='blue')
+par(new=T)
+plot(byac_gtsites$ac, byac_gtsites$site_tpc, ylim=c(0,max(byac$sites)), xlim=c(0,20), xlab='', xaxt='n', ylab='', yaxt='n', col='red', pch=19, type='o')  # red: site_tpc for ACs with >= min_sites sites
+par(new=T) #, cex=0.65)
+mtext(paste("site PD: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T) #, cex=0.65)
+#legend('topright', c('number of sites', 'site PD count'),
+#    fill=c('blue', 'red'))
+garbage <- dev.off()
+
+# stratifying by QUAL
+
+if (FALSE) {
+
+
+x <- cbind(by(vcf, vcf$QUAL,
+    function(x) {
+        sum(x[, tag.alternate_positive_discrepancy]) / sum(x[, tag.genotypes_alternate_count])
+    }))
+# per-QUAL table: one row per distinct QUAL value present in the input
+byqual <- data.frame(qual=as.numeric(rownames(x)), fdr=as.vector(x))
+
+# false detection count
+byqual$fpc <- as.vector(cbind(by(vcf, vcf$QUAL,
+    function(i) { sum(i[, tag.alternate_positive_discrepancy]) } )))
+# alleles: summed alternate genotype count over the sites at this QUAL
+byqual$alleles <- as.vector(cbind(by(vcf, vcf$QUAL,
+    function(i) {
+        sum(i[, tag.genotypes_alternate_count])
+    })))
+# sites: number of input rows at this QUAL
+byqual$sites <- as.vector(cbind(by(vcf$QUAL, vcf$QUAL, function(i) length(i))))
+
+# count true positive sites
+byqual$site_tpc <- as.vector(cbind(by(vcf[, tag.has_variant], vcf$QUAL, function(i) sum(i))))
+# fpc == false detection count
+byqual$site_fpc <- byqual$sites - byqual$site_tpc
+# site detection fpr is 1 - true positive rate
+byqual$site_fpr <- 1 - ( byqual$site_tpc / byqual$sites )
+
+#byqual$site_fpr_gt0 <- as.vector(cbind(tapply(byqual$ac, byqual$ac, function(i) { 
+#    s <- subset(byqual, ac == i, select=c(site_fpr, sites))
+#if (s$sites >= min_sites) {
+#    return(s$site_fpr)
+#} else {
+#    return(NA)
+#}
+#})))
+# site_fprlt / site_fprgt: site_fpc / sites pooled over rows with qual <= i and qual >= i
+#byqual$site_fprlt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) mean(subset(byqual, qual <= i, select=site_fpr)))))
+byqual$site_fprlt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) { 
+    s <- subset(byqual, qual <= i, select=c(site_fpc, sites))
+    return(sum(s$site_fpc) / sum(s$sites))
+})))
+
+#byqual$site_fprgt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) mean(subset(byqual, qual >= i, select=site_fpr)))))
+byqual$site_fprgt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) { 
+    s <- subset(byqual, qual >= i, select=c(site_fpc, sites))
+    return(sum(s$site_fpc) / sum(s$sites))
+})))
+# cfa / cfs: cumulative fraction of alternate alleles and of sites at qual <= i
+byqual$cfa <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) sum(subset(byqual, qual <= i, select=alleles)) / sum(byqual$alleles))))
+
+byqual$cfs <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) sum(subset(byqual, qual <= i, select=sites)) / length(vcf$QUAL))))
+
+# inappropriate collapse via averaging of fdr
+#byqual$alternate_pdlt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) mean(subset(byqual, qual <= i, select=fdr)))))
+# alternate_pdr: fpc / alleles at exactly this qual
+byqual$alternate_pdr <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) { 
+    s <- subset(byqual, qual == i, select=c(fpc, alleles))
+    return(sum(s$fpc) / sum(s$alleles))
+})))
+
+# use this one: fpc / alleles pooled over rows with qual <= i
+byqual$alternate_pdlt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) { 
+    s <- subset(byqual, qual <= i, select=c(fpc, alleles))
+    return(sum(s$fpc) / sum(s$alleles))
+})))
+
+#byqual$alternate_pdgt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) mean(subset(byqual, qual >= i, select=fdr)))))
+byqual$alternate_pdgt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) { 
+    s <- subset(byqual, qual >= i, select=c(fpc, alleles))
+    return(sum(s$fpc) / sum(s$alleles))
+})))
+# return 0 when n is NaN, otherwise n (same body as the helper defined at the top of the script)
+nan.to.zero <- function(n) {
+    if (is.nan(n)) return(0) else return(n)
+}
+# nrs / nrd: non-reference sensitivity and discrepancy (count / normalizer) over sites at this QUAL
+byqual$nrs <- as.vector(cbind(by(vcf, vcf$QUAL, function(i) {
+    return(nan.to.zero(sum(i[, tag.non_reference_sensitivity_count]) / sum(i[, tag.non_reference_sensitivity_normalizer])))
+})))
+
+byqual$nrd <- as.vector(cbind(by(vcf, vcf$QUAL, function(i) { 
+    return(nan.to.zero(sum(i[, tag.non_reference_discrepancy_count]) / sum(i[, tag.non_reference_discrepancy_normalizer])))
+})))
+# nrslt / nrdlt: the same two ratios pooled over all sites with QUAL <= i
+byqual$nrslt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) {
+    s <- subset(vcf, QUAL <= i, select=c(tag.non_reference_sensitivity_count, tag.non_reference_sensitivity_normalizer))
+    return(nan.to.zero(sum(s[, tag.non_reference_sensitivity_count]) / sum(s[, tag.non_reference_sensitivity_normalizer])))
+})))
+
+byqual$nrdlt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) { 
+    s <- subset(vcf, QUAL <= i, select=c(tag.non_reference_discrepancy_count, tag.non_reference_discrepancy_normalizer))
+    return(nan.to.zero(sum(s[, tag.non_reference_discrepancy_count]) / sum(s[, tag.non_reference_discrepancy_normalizer])))
+})))
+# despite the "gt10" name, the threshold applied here is min_sites
+byqual_gt10 <- subset(byqual, sites >= min_sites)
+
+
+if (FALSE) {  # disabled: loess-smoothed variant of the PD-vs-QUAL plot
+pdf(paste(filename, '.', tag, '.PD.vs.QUAL.smooth.pdf', sep=''))
+par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byqual$cfa, ylim=c(0,1.0),
+    xlab='QUAL', xaxt='n',
+    ylab='', yaxt='n', type='l', col='red')
+axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+axis(1, at=seq(0,max(byqual$qual),10), labels=seq(0,max(byqual$qual),10), cex=0.75)
+grid(lty=5)
+par(new=T)
+title(paste(filename, 'positive discrepancy versus', tag, '(smoothed)'))
+par(new=T)
+countTicks <- seq(0,1,0.1) * vcf.numberOfSites
+axis(4, at=seq(0,1,0.1), labels=round(countTicks))
+mtext("number of sites", side=4, line=3, cex=0.75)
+par(new=T)
+#plot(byqual$alternate_pdlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='green')
+plot(byqual$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')
+par(new=T)
+lines(byqual$qual, predict(loess(byqual$alternate_pdr ~ byqual$qual, span=0.5)), col="green")
+par(new=T)
+plot(byqual$cfa, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='red')
+par(new=T)
+plot(byqual$cfs, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='purple')
+par(new=T)
+lines(byqual$qual, predict(loess(byqual$site_fpr ~ byqual$qual, span=0.5)), col="blue")
+par(new=T)
+lines(byqual$qual, predict(loess(byqual$nrs ~ byqual$qual, span=0.5)), col="magenta")
+par(new=T)
+lines(byqual$qual, predict(loess(byqual$nrd ~ byqual$qual, span=0.5)), col="brown")
+par(new=T, cex=0.65)
+mtext(paste("alternate genotype PD: ", round(vcf.positiveDiscrepancy, digits=4), ", site PD: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T, cex=0.65)
+legend('topleft', c('cumulative fraction of alt alleles', 'cumulative fraction of sites', 'alt genotypes PD', 'site PD', 'non-ref sensitivity', 'non-ref discrepancy', 'site PD at QUAL'),
+    fill=c('red', 'purple', 'green', 'blue', 'magenta', 'brown', 'black'))
+garbage <- dev.off()
+}
+
+
+
+pdf(paste(filename, '.', tag, '.PD.vs.QUAL.cumulative.pdf', sep=''))  # output file: <filename>.<tag>.PD.vs.QUAL.cumulative.pdf
+par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byqual$cfa, ylim=c(0,1.0),
+    xlab='QUAL', xaxt='n',
+    ylab='', yaxt='n', type='l', col='red')
+axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+axis(1, at=seq(0,max(byqual$qual),10), labels=seq(0,max(byqual$qual),10), cex=0.75)
+grid(lty=5)
+par(new=T)
+title(paste(filename, 'positive discrepancy versus', tag, '(cumulative)'))
+par(new=T)
+countTicks <- seq(0,1,0.1) * vcf.numberOfSites  # right-hand axis: the 0..1 scale relabelled as site counts
+axis(4, at=seq(0,1,0.1), labels=round(countTicks))
+mtext("number of sites", side=4, line=3, cex=0.75)
+par(new=T)
+plot(byqual_gt10$qual, byqual_gt10$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')  # points only for QUALs with >= min_sites sites
+#plot(byqual$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')
+par(new=T)
+plot(byqual$alternate_pdlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='green')
+par(new=T)
+plot(byqual$cfa, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='red')
+par(new=T)
+plot(byqual$cfs, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='purple')
+par(new=T)
+plot(byqual$site_fprlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='blue')
+par(new=T)
+plot(byqual$nrslt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='magenta')
+par(new=T)
+plot(byqual$nrdlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='brown')
+par(new=T, cex=0.65)
+mtext(paste("alternate genotype PD: ", round(vcf.positiveDiscrepancy, digits=4), ", site PD: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T, cex=0.65)
+legend('topleft', c('cumulative fraction of alt alleles', 'cumulative fraction of sites', 'alt genotypes PD', 'site PD', 'non-ref sensitivity', 'non-ref discrepancy', paste('site PD at QUAL (>=', min_sites, 'sites)')),
+    fill=c('red', 'purple', 'green', 'blue', 'magenta', 'brown', 'black'))  # last label reports the min_sites threshold actually applied
+garbage <- dev.off()
+
+
+
+pdf(paste(filename, '.', tag, '.PD.vs.QUAL.cumulative.simple.pdf', sep=''))  # reduced plot: cfs, cumulative site PD, per-QUAL site PD
+par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byqual$cfs, ylim=c(0,1.0),
+    xlab='QUAL', xaxt='n',
+    ylab='', yaxt='n', type='l', col='purple')
+axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+axis(1, at=seq(0,max(byqual$qual),10), labels=seq(0,max(byqual$qual),10), cex=0.75)
+grid(lty=5)
+par(new=T)
+title(paste(filename, 'positive discrepancy versus', tag, '(cumulative)'))
+par(new=T)
+countTicks <- seq(0,1,0.1) * vcf.numberOfSites
+axis(4, at=seq(0,1,0.1), labels=round(countTicks))
+mtext("number of sites", side=4, line=3, cex=0.75)
+par(new=T)
+plot(byqual$site_fprlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='blue')
+par(new=T)
+plot(byqual_gt10$qual, byqual_gt10$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')  # points only for QUALs with >= min_sites sites
+par(new=T, cex=0.65)
+mtext(paste("site PD: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T, cex=0.65)
+legend('topleft', c('cumulative fraction of sites', 'site PD', paste('site PD at QUAL (>=', min_sites, 'sites)')),
+    fill=c('purple', 'blue', 'black'))  # last label reports the min_sites threshold actually applied
+garbage <- dev.off()
+
+}
diff --git a/bin/vcfplotaltdiscrepancy.sh b/bin/vcfplotaltdiscrepancy.sh
new file mode 100755
index 0000000..0817895
--- /dev/null
+++ b/bin/vcfplotaltdiscrepancy.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+filename=$1
+tag=$2
+
+vcf2tsv \
+    | tsvsplit \
+        QUAL \
+        AC \
+        $tag.has_variant \
+    | tf2binary \
+    | vcfplotsitediscrepancy.r $filename $tag
diff --git a/bin/vcfplotsitediscrepancy.r b/bin/vcfplotsitediscrepancy.r
new file mode 100755
index 0000000..d4f4ecb
--- /dev/null
+++ b/bin/vcfplotsitediscrepancy.r
@@ -0,0 +1,99 @@
+#!/usr/bin/Rscript --vanilla --slave
+
+# get the input VCF tabular format, assert that sites must have AC > 0
+vcf <- subset(read.table(pipe('cat /dev/stdin'), header=T), AC > 0)
+
+filename <- commandArgs(TRUE)[1]
+tag <- commandArgs(TRUE)[2]
+# name of the <tag>.has_variant column in the input table
+tag.has_variant <- paste(tag, '.has_variant', sep='')
+
+vcf.numberOfSites <- length(vcf$AC)
+vcf.sitesTruePositive <- mean(vcf[, tag.has_variant])
+
+# number of sites at each AC value present in the input
+x <- cbind(by(vcf$AC, vcf$AC, function(i) length(i)))
+# per-AC table: one row per observed AC value (taken from the group names of x)
+byac <- data.frame(ac=as.numeric(rownames(x)), sites=as.vector(x))
+
+# count true positive sites
+byac$site_tpc <- as.vector(cbind(by(vcf[, tag.has_variant], vcf$AC, function(i) sum(i))))
+# fpc == false detection count
+byac$site_fpc <- byac$sites - byac$site_tpc
+# site detection fpr is 1 - true positive rate
+byac$site_fpr <- 1 - ( byac$site_tpc / byac$sites )
+# site_fprlt: site_fpc / sites pooled over all rows with ac <= i
+#byac$site_fprlt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) mean(subset(byac, ac <= i, select=site_fpr)))))
+byac$site_fprlt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) { 
+    s <- subset(byac, ac <= i, select=c(site_fpc, sites))
+    return(sum(s$site_fpc) / sum(s$sites))
+})))
+# site_fprgt: the same ratio pooled over all rows with ac >= i
+#byac$site_fprgt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) mean(subset(byac, ac >= i, select=site_fpr)))))
+byac$site_fprgt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) { 
+    s <- subset(byac, ac >= i, select=c(site_fpc, sites))
+    return(sum(s$site_fpc) / sum(s$sites))
+})))
+# cfs: fraction of all sites found at ac <= i
+byac$cfs <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) sum(subset(byac, ac <= i, select=sites)) / length(vcf$AC))))
+
+
+pdf(paste(filename, '.', tag, '.site_FDR.vs.AC.smooth.pdf', sep=''))  # output file: <filename>.<tag>.site_FDR.vs.AC.smooth.pdf
+par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byac$cfs, ylim=c(0,1.0),
+    xlab='alternate allele count (AC)', xaxt='n',
+    ylab='false discovery rate (FDR)', yaxt='n', type='l', col='red')
+axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+axis(1, at=seq(0,max(byac$ac),10), labels=seq(0,max(byac$ac),10), cex=0.75)
+grid(lty=5)
+par(new=T)
+title(paste(filename, 'putative site false discovery rate versus', tag, '(smoothed)'))
+par(new=T)
+countTicks <- seq(0,1,0.1) * vcf.numberOfSites  # right-hand axis: the 0..1 scale relabelled as site counts
+axis(4, at=seq(0,1,0.1), labels=round(countTicks))
+par(col='red')
+mtext("number of sites", side=4, line=3, cex=0.75)
+par(col='black')
+par(new=T)
+plot(byac$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')  # NOTE(review): x is the row index here, but byac$ac in lines() below -- confirm they agree when AC values have gaps
+par(new=T)
+lines(byac$ac, predict(loess(byac$site_fpr ~ byac$ac, span=0.5)), col="blue")
+par(new=T, cex=0.65)
+mtext(paste("site FDR: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T, cex=0.65)
+legend('topleft', c('cumulative sites', 'site FDR (loess smoothed)', 'FDR at AC'),
+    fill=c('red', 'blue', 'black'))
+garbage <- dev.off()
+
+
+
+pdf(paste(filename, '.', tag, '.site_FDR.vs.AC.cumulative.pdf', sep=''))  # output file: <filename>.<tag>.site_FDR.vs.AC.cumulative.pdf
+par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byac$cfs, ylim=c(0,1.0),
+    xlab='alternate allele count (AC)', xaxt='n',
+    ylab='false discovery rate (FDR)', yaxt='n', type='l', col='red')
+axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+axis(1, at=seq(0,max(byac$ac),10), labels=seq(0,max(byac$ac),10), cex=0.75)
+grid(lty=5)
+par(new=T)
+title(paste(filename, 'putative false discovery rate versus', tag, '(cumulative)'))
+par(new=T)
+countTicks <- seq(0,1,0.1) * vcf.numberOfSites  # right-hand axis: the 0..1 scale relabelled as site counts
+axis(4, at=seq(0,1,0.1), labels=round(countTicks))
+par(col='red')
+mtext("number of sites", side=4, line=3, cex=0.75)
+par(col='black')
+par(new=T)
+plot(byac$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')  # points: site FDR at each AC row
+par(new=T)
+plot(byac$site_fprgt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='green')  # pooled over ac >= i
+par(new=T)
+plot(byac$site_fprlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='blue')  # pooled over ac <= i
+par(new=T, cex=0.65)
+mtext(paste("site FDR: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T, cex=0.65)
+legend('topleft', c('cumulative sites', 'site FDR <= AC', 'site FDR >= AC', 'FDR at AC'),
+    fill=c('red', 'blue', 'green', 'black'))
+garbage <- dev.off()
diff --git a/bin/vcfplottstv.sh b/bin/vcfplottstv.sh
new file mode 100755
index 0000000..835b5ef
--- /dev/null
+++ b/bin/vcfplottstv.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+filename=$1
+title=$2
+
+vcf2tsv \
+    | tsvsplit \
+        QUAL \
+        AC \
+        AF \
+        TS \
+    | tf2binary \
+    | vcfplottstv.r $filename $title
diff --git a/bin/vcfprintaltdiscrepancy.r b/bin/vcfprintaltdiscrepancy.r
new file mode 100755
index 0000000..ef33502
--- /dev/null
+++ b/bin/vcfprintaltdiscrepancy.r
@@ -0,0 +1,37 @@
+#!/usr/bin/Rscript --vanilla --slave
+
+# get the input VCF tabular format, assert that sites must have AC > 0
+vcf <- subset(read.table(pipe('cat /dev/stdin'), header=T), AC > 0)
+
+tag <- commandArgs(TRUE)[1]
+# column names derived from the tag
+tag.genotypes_alternate_count <- paste(tag, '.genotypes.alternate_count', sep='')
+tag.non_reference_discrepancy_count <- paste(tag, '.site.non_reference_discrepancy.count', sep='')
+tag.non_reference_discrepancy_normalizer <- paste(tag, '.site.non_reference_discrepancy.normalizer', sep='')
+tag.non_reference_sensitivity_count <- paste(tag, '.site.non_reference_sensitivity.count', sep='')
+tag.non_reference_sensitivity_normalizer <- paste(tag, '.site.non_reference_sensitivity.normalizer', sep='')
+tag.alternate_positive_discrepancy <- paste(tag, '.site.alternate_positive_discrepancy', sep='')
+tag.alternate_negative_discrepancy <- paste(tag, '.site.alternate_negative_discrepancy', sep='')
+tag.has_variant <- paste(tag, '.has_variant', sep='')
+# whole-file totals; both discrepancies are normalised by the total alternate genotype count
+vcf.numberOfSites <- length(vcf[, tag.genotypes_alternate_count])
+vcf.totalAltAlleles <- sum(vcf[, tag.genotypes_alternate_count])
+vcf.positiveDiscrepancy <- sum(vcf[, tag.alternate_positive_discrepancy]) / sum(vcf[, tag.genotypes_alternate_count])
+vcf.negativeDiscrepancy <- sum(vcf[, tag.alternate_negative_discrepancy]) / sum(vcf[, tag.genotypes_alternate_count])
+vcf.sitesTruePositive <- sum(vcf[, tag.has_variant]) / nrow(vcf)
+
+cat('number of sites', vcf.numberOfSites, '\n')
+cat('total alternate alleles', vcf.totalAltAlleles, '\n')
+cat('positive discrepancy', vcf.positiveDiscrepancy, '\n')
+cat('negative discrepancy', vcf.negativeDiscrepancy, '\n')
+# positive discrepancy / alternate genotype count for each AC value present in the input
+x <- cbind(by(vcf, vcf$AC,
+    function(x) {
+        sum(x[, tag.alternate_positive_discrepancy]) / sum(x[, tag.genotypes_alternate_count])
+    }))
+# one row per observed AC value, then print the table
+byac <- data.frame(ac=as.numeric(rownames(x)), fdr=as.vector(x))
+
+print(byac)
+
+
diff --git a/bin/vcfprintaltdiscrepancy.sh b/bin/vcfprintaltdiscrepancy.sh
new file mode 100755
index 0000000..1df0a65
--- /dev/null
+++ b/bin/vcfprintaltdiscrepancy.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+tag=$1
+
+vcf2tsv \
+    | tsvsplit \
+        QUAL \
+        AC \
+        $tag.has_variant \
+        $tag.site.alternate_negative_discrepancy \
+        $tag.site.alternate_positive_discrepancy \
+        $tag.genotypes.alternate_count \
+        $tag.site.non_reference_sensitivity.count \
+        $tag.site.non_reference_sensitivity.normalizer \
+        $tag.site.non_reference_discrepancy.count \
+        $tag.site.non_reference_discrepancy.normalizer \
+    | tf2binary \
+    | vcfprintaltdiscrepancy.r $tag
diff --git a/bin/vcfqualfilter b/bin/vcfqualfilter
new file mode 100755
index 0000000..8fe970f
--- /dev/null
+++ b/bin/vcfqualfilter
@@ -0,0 +1,32 @@
+#!/usr/bin/perl
+# vcfqualfilter: filter VCF records read from STDIN by their QUAL (6th) column.
+#   -c N [-m N]: keep records with QUAL >= N (and QUAL <= max when -m is given);
+#   -s N / -i N: otherwise keep records containing "SNP" / "INS" or "DEL" with QUAL >= N.
+use Getopt::Long;
+my $cutoff = -1;
+my $max = -1;
+my $indel = 0;
+my $snp = 0;
+$result = GetOptions ("c|cutoff=i" => \$cutoff,
+                      "m|max=i" => \$max,
+                      "i|indel=i"   => \$indel,
+                      "s|snp=i"  => \$snp);
+
+# Header lines pass through unchanged; a record is printed when one rule selects it.
+while (<STDIN>) {
+    if ($_ =~ /^#/) {
+        print $_;
+        next;
+    }
+    # Capture exactly the 6th tab-separated field.  Lines without it are skipped
+    # so that the QUAL of an earlier record is never reused for them.
+    next unless $_ =~ /^(?:[^\t]*\t){5}([^\t]*)\t/;
+    my $qual = $1;
+    if ($cutoff != -1 and $qual >= $cutoff and ($max == -1 or $qual <= $max)) {
+        print $_;
+    } elsif ($snp and $_ =~ /SNP/ and $qual >= $snp) {
+        print $_;
+    } elsif ($indel and $_ =~ /INS|DEL/ and $qual >= $indel) {
+        print $_;
+    }
+}
diff --git a/bin/vcfregionreduce b/bin/vcfregionreduce
new file mode 100755
index 0000000..fde0938
--- /dev/null
+++ b/bin/vcfregionreduce
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+if [ $# -ne 2 ];
+then
+    echo "Usage: $0 [region file] [directory]"
+    echo
+    echo "Generates \`basename directory\`.vcf.gz, which is the concatenation"
+    echo "of files in the directory named [directory]/[region1].vcf.gz,"
+    echo "[directory]/[region2].vcf.gz, etc. in the order in which they"
+    echo "occur in the region file."
+    echo
+    echo "A tabix index is subsequently generated."
+    exit 1
+fi
+
+regionfile=$1
+mergedir=$2
+mergename=$(basename $mergedir)
+vcfgenotypes=$mergename.vcf.gz
+#vcfsites=$mergename.sites.vcf.gz
+
+firstfile=$mergedir/$(head -1 $regionfile).vcf.gz
+files=$(for region in $(cat $regionfile); do echo $mergedir/$region.vcf.gz; done)
+
+( zcat $firstfile | head -1000 | grep ^#
+for file in $files
+do
+    zcat $file | grep -v "^#"
+done ) | ( bgzip >$vcfgenotypes && tabix -p vcf $vcfgenotypes )
diff --git a/bin/vcfregionreduce_and_cut b/bin/vcfregionreduce_and_cut
new file mode 100755
index 0000000..f15dc2c
--- /dev/null
+++ b/bin/vcfregionreduce_and_cut
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+if [ $# -ne 2 ];
+then
+    echo "Usage: $0 [region file] [directory]"
+    echo
+    echo "Generates \`basename directory\`.vcf.gz and \`basename directory\`.sites.vcf.gz"
+    echo "which are the concatenation of files in the directory named [directory]/[region1].vcf.gz,"
+    echo "[directory]/[region2].vcf.gz, etc. in the order in which they occur in the region file."
+    echo
+    echo "Tabix indexes are simultaneously generated."
+    exit 1
+fi
+
+regionfile=$1
+mergedir=$2
+mergename=$(basename $mergedir)
+vcfgenotypes=$mergename.vcf.gz
+vcfsites=$mergename.sites.vcf.gz
+
+regions=$(cat $regionfile)
+
+firstfile=$mergedir/$(echo $regions | cut -f 1 -d\  ).vcf.gz
+files=$(for region in $regions; do echo $mergedir/$region.vcf.gz; done)
+
+( zcat $firstfile | head -1000 | grep ^#
+for file in $files
+do
+    zcat $file | grep -v "^#"
+done ) | uniq | pee \
+        "bgzip >$vcfgenotypes && tabix -p vcf $vcfgenotypes" \
+        "cut -f -8 | bgzip >$vcfsites && tabix -p vcf $vcfsites"
diff --git a/bin/vcfregionreduce_pipe b/bin/vcfregionreduce_pipe
new file mode 100755
index 0000000..8a21782
--- /dev/null
+++ b/bin/vcfregionreduce_pipe
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+if [ $# -ne 2 ];
+then
+    echo "Usage: $0 [region file] [directory]"
+    echo
+    echo "Generates \`basename directory\`.vcf.gz, which is the concatenation"
+    echo "of files in the directory named [directory]/[region1].vcf.gz,"
+    echo "[directory]/[region2].vcf.gz, etc. in the order in which they"
+    echo "occur in the region file."
+    echo
+    echo "A tabix index is subsequently generated."
+    exit 1
+fi
+
+regionfile=$1
+mergedir=$2
+mergename=$(basename $mergedir)
+vcfgenotypes=$mergename.vcf.gz
+#vcfsites=$mergename.sites.vcf.gz
+
+firstfile=$mergedir/$(head -1 $regionfile).vcf.gz
+files=$(for region in $(cat $regionfile); do echo $mergedir/$region.vcf.gz; done)
+
+zcat $firstfile | head -1000 | grep ^#
+for file in $files
+do
+    zcat $file | grep -v "^#"
+done
diff --git a/bin/vcfregionreduce_uncompressed b/bin/vcfregionreduce_uncompressed
new file mode 100755
index 0000000..41c7528
--- /dev/null
+++ b/bin/vcfregionreduce_uncompressed
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+if [ $# -ne 2 ];
+then
+    echo "Usage: $0 [region file] [directory]"
+    echo
+    echo "Generates \`basename directory\`.vcf.gz, which is the concatenation"
+    echo "of files in the directory named [directory]/[region1].vcf.gz,"
+    echo "[directory]/[region2].vcf.gz, etc. in the order in which they"
+    echo "occur in the region file."
+    echo
+    echo "A tabix index is subsequently generated."
+    exit 1
+fi
+
+regionfile=$1
+mergedir=$2
+mergename=$(basename $mergedir)
+vcfgenotypes=$mergename.vcf.gz
+#vcfsites=$mergename.sites.vcf.gz
+
+firstfile=$mergedir/$(head -1 $regionfile).vcf
+files=$(for region in $(cat $regionfile); do echo $mergedir/$region.vcf; done)
+
+( cat $firstfile | head -1000 | grep ^#
+for file in $files
+do
+    cat $file | grep -v "^#"
+done ) | ( bgzip >$vcfgenotypes && tabix -p vcf $vcfgenotypes )
diff --git a/bin/vcfremovenonATGC b/bin/vcfremovenonATGC
new file mode 100755
index 0000000..7418843
--- /dev/null
+++ b/bin/vcfremovenonATGC
@@ -0,0 +1,29 @@
+#!/usr/bin/perl
+#
+# Reads VCF on STDIN.  Header lines are echoed unchanged.  A record is echoed
+# only when its REF allele and every comma-separated ALT allele contain at
+# least one of the letters A, T, G or C.
+#
+# NOTE(review): the test is "contains at least one of ATGC", not "consists
+# only of ATGC" -- confirm that this is the intended filter.
+
+while (<STDIN>) {
+    if ($_ =~ /^#/) {
+        print;
+    } else {
+        # capture the first five tab-separated columns
+        $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+        ($chrom, $pos, $tag, $ref, $alts) = ($1, $2, $3, $4, $5);
+
+        # number of alleles (REF first, then each ALT) with no A/T/G/C in them
+        $hasJunk = grep { $_ !~ /A|T|G|C/ } ($ref, split(/,/, $alts));
+
+        print unless $hasJunk;
+    }
+}
diff --git a/bin/vcfsnps b/bin/vcfsnps
new file mode 100755
index 0000000..b2b8b79
--- /dev/null
+++ b/bin/vcfsnps
@@ -0,0 +1,26 @@
+#!/usr/bin/perl
+#
+# Reads VCF on STDIN.  Header lines are echoed unchanged.  A record is echoed
+# only when no ALT allele disqualifies it, where an ALT disqualifies the
+# record if REF is longer than one base or the ALT length differs from the
+# REF length.
+
+while (<STDIN>) {
+    if ($_ =~ /^#/) {
+        print;
+    } else {
+        # capture the first five tab-separated columns
+        $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+        ($chrom, $pos, $tag, $ref, $alts) = ($1, $2, $3, $4, $5);
+
+        # count the ALT alleles that rule the record out as a SNP
+        $nonsnp = grep { length($ref) > 1 || length($_) != length($ref) }
+                       split(/,/, $alts);
+
+        print unless $nonsnp;
+    }
+}
diff --git a/bin/vcfsort b/bin/vcfsort
new file mode 100755
index 0000000..def75ee
--- /dev/null
+++ b/bin/vcfsort
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+head -1000 $1 | grep "^#"; cat $@ | grep -v "^#" | sort -k1,1d -k2,2n
diff --git a/bin/vcfvarstats b/bin/vcfvarstats
new file mode 100755
index 0000000..1f645f7
--- /dev/null
+++ b/bin/vcfvarstats
@@ -0,0 +1,225 @@
+#!/usr/bin/perl
+#
+
+use IPC::Open2;
+
+# Returns the reverse complement of the sequence passed as first argument:
+# the string is reversed and A<->T, C<->G are swapped (case preserved).
+# The result is also left in the global $revcom.
+sub revcomplement {
+    my ($seq) = @_;
+    $revcom = scalar reverse $seq;
+    $revcom =~ tr/ACGTacgt/TGCAtgca/;
+    return $revcom;
+}
+
+# Main script: reads VCF records on STDIN, tallies SNPs (with transitions,
+# transversions and CpG hits), MNPs, insertions and deletions, and prints a
+# summary report.  The optional first argument is passed to `fastahack -c`,
+# which is then queried for the two bases at each SNP tagged "SNP" in order
+# to build a di-nucleotide distribution.
+$reference = $ARGV[0];
+
+if ($reference) {
+    # two-way pipe: regions are written to FASTAHACK_IN, sequences read
+    # back from FASTAHACK_OUT
+    $pid = open2(\*FASTAHACK_OUT, \*FASTAHACK_IN, "fastahack -c $reference");
+}
+
+#print FASTAHACK_IN "1:10000\n";
+#$result = <FASTAHACK_OUT>;
+#print $result;
+
+
+#open(VCF, $file);
+
+# running totals; the hashes map an event length to the number of events
+$ts = 0;
+$tv = 0;
+$cpg = 0;
+$total = 0;
+$snp = 0;
+$mnp = 0;
+$mnplen = 0;
+%mnp = ();
+$ins = 0;
+$inslen = 0;
+%ins = ();
+$del = 0;
+$dellen = 0;
+%del = ();
+
+%dint = (); # di-nucleotide distribution
+
+while (<STDIN>) {
+    if ($_ =~ /^#/) {
+        next;
+    } else {
+        # first five tab-separated columns: CHROM, POS, ID, REF, ALT
+        $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+        $chrom = $1;
+        $pos = $2;
+        $tag = $3;
+        $ref = $4;
+        $alt = $5;
+        #print "chrom: $chrom, pos: $pos, ref: $ref, alt: $alt\n";
+    }
+
+    # positive: REF longer than ALT (deletion); negative: insertion.
+    # The whole ALT column is measured, commas of multi-allelic records
+    # included.
+    $diff = length($ref) - length($alt);
+
+    $is_snp = 0;
+    if ($_ =~ /SNP/) {
+        # any record whose line contains the text "SNP" counts as a SNP
+        $is_snp = 1;
+        $snp += 1;
+        # get di-nt's
+        if ($reference) {
+            # only lines starting with a numeric sequence name and position
+            # are looked up
+            if ($_ =~ /^(\d+)\t(\d+)/) {
+                $seq = $1;
+                $start = $2;
+                $end = $2 + 1;
+                print FASTAHACK_IN "$seq:$start..$end\n";
+                $dibp = <FASTAHACK_OUT>;
+                chomp $dibp;
+                $dint{$dibp} += 1;
+            }
+        }
+    } elsif ($diff eq 0 and length($ref) eq 1) {
+        $snp += 1;
+        $is_snp = 1;
+    } elsif ($diff eq 0 and length($ref) gt 1) {
+        # NOTE(review): `gt` is a string comparison; a numeric `>` was
+        # presumably intended
+        $mnp += 1;
+        $mnplen += length($ref);
+        $mnp{length($ref)} += 1;
+    }
+    if ($is_snp) {
+        # A<->G and C<->T are transitions; every other SNP is counted as a
+        # transversion
+        if ((($ref eq "A" and $alt eq "G") or ($ref eq "G" and $alt eq "A"))
+                or
+            (($ref eq "C" and $alt eq "T") or ($ref eq "T" and $alt eq "C"))) {
+            $ts += 1;
+        } else {
+            $tv += 1;
+        }
+        if ($_ =~ /CpG/) { $cpg += 1; }
+    }
+
+    # NOTE(review): `lt` / `gt` below are string comparisons applied to the
+    # numeric $diff; numeric operators were presumably intended
+    if ($diff lt 0) {
+        $len = abs($diff);
+        $ins += 1;
+        $inslen += $len;
+        $ins{$len} += 1;
+    }
+
+    if ($diff gt 0) {
+        $len = abs($diff);
+        $del += 1;
+        $dellen += $len;
+        $del{$len} += 1;
+    }
+    #elsif (length($ref) > 1 and $diff eq 0) {
+    #    print $_ . "\n";
+    #    $mnp += 1;
+    #    $mnplen += length($ref);
+    #    $mnp{length($ref)} += 1;
+    #}
+
+    $total += 1;
+}
+
+if ($total == 0) {
+    die "no VCF records read on stdin\n";
+}
+
+# ---- report: overall and SNP counts ----
+print "total variants:\t$total" . "\n";
+print "\n";
+if ($snp > 0) {
+    print "total snps:\t$snp\n";
+    print "transitions:\t$ts\n";
+    print "transversions:\t$tv\n";
+    if ($tv > 0) {
+        print "ts/tv ratio:\t" . ($ts / $tv) . "\n";
+    }
+    print "CpG sites:\t$cpg\n";
+    if ($cpg > 0) {
+        print "CpG/total snps:\t" . ($cpg / $snp) . "\n";
+    }
+}
+
+# ---- report: indels ----
+if (($ins + $del) > 0) {
+    print "\n";
+    print "total indels:\t" . ($ins + $del) . "\n";
+    print "insertions:\t$ins\t$inslen bp\n";
+    print "deletions:\t$del\t$dellen bp\n";
+
+    # largest indel length seen, used as the last row of the table below
+    $max = 0;
+    while ( my ($size, $count) = each(%ins) ) {
+        if ($size > $max) { $max = $size; }
+    }
+    while ( my ($size, $count) = each(%del) ) {
+        if ($size > $max) { $max = $size; }
+    }
+
+    print "\n";
+
+    # the size table is only printed when both insertions and deletions
+    # were seen
+    if ($inslen > 0 and $dellen > 0) {
+        $indel_length_ratio = $inslen / $dellen;
+        print "ins/del length ratio:\t$indel_length_ratio\n";
+        print "\n";
+        print "indel size frequency distribution\n";
+        print "size\tins\tdel\tins/del\tcurr/prev\n";
+
+        $last_delcount = 0;
+        $last_inscount = 0;
+        $last_ratio_del = 0;
+        $last_ratio_ins = 0;
+        for (1 .. $max) {
+            # counts are undefined (printed as empty) for sizes never seen
+            $inscount = $ins{$_};
+            $delcount = $del{$_};
+            # ratio of this size's count to the previous size's count; the
+            # old value is kept when the previous count was zero
+            if ($last_delcount != 0) {
+                $last_ratio_del = $delcount / $last_delcount;
+            }
+            if ($last_inscount != 0) {
+                $last_ratio_ins = $inscount / $last_inscount;
+            }
+            $last_delcount = $delcount;
+            $last_inscount = $inscount;
+            if ($inscount > 0 and $delcount > 0) {
+                $ratio = $inscount / $delcount;
+            } else {
+                $ratio = "";
+            }
+            print "$_\t$inscount\t$delcount\t"
+            . sprintf("%.3f", $ratio);
+            if ($last_ratio_ins != 0 or $last_ratio_del != 0) {
+                print "\t";
+                if ($last_ratio_ins != 0) {
+                    print sprintf("%.3f", $last_ratio_ins);
+                }
+                print "\t";
+                if ($last_ratio_del != 0) {
+                    print sprintf("%.3f", $last_ratio_del);
+                }
+                print "\n";
+            } else {
+                print "\n";
+            }
+        }
+        # FIXME
+        #print "\t\t\t\t" . sprintf("%.3f", $even_odd_ratio_sum_ins / $ins)
+        # . "\t" . sprintf("%.3f", $even_odd_ratio_sum_del / $del);
+    }
+}
+
+# ---- report: MNPs ----
+if ($mnplen > 0) {
+    print "\n";
+    print "total mnps:\t$mnp\n";
+    print "mnps length:\t$mnplen\n";
+    print "mnp size distribution\n";
+    $max = 0;
+    while ( my ($size, $count) = each(%mnp) ) {
+        if ($size > $max) { $max = $size; }
+    }
+    print "size\tcount\n";
+    for (2 .. $max) {
+        print $_ . "\t" . $mnp{$_} . "\n";
+    }
+}
+
+# ---- report: di-nucleotide distribution (only with a reference) ----
+if ($reference) {
+
+    print "\n";
+
+    print "di-nucleotide distribution for SNPs\n";
+    print "di-nt\tcount\tcount/(total snps / 16)\n";
+    while ( my ($dibp, $count) = each(%dint) ) {
+        print "$dibp\t$count\t" . ($count / ($snp / 16)) . "\n";
+    }
+
+}
+
diff --git a/samples/sample.vcf b/samples/sample.vcf
new file mode 100644
index 0000000..e8dd794
--- /dev/null
+++ b/samples/sample.vcf
@@ -0,0 +1,31 @@
+##fileformat=VCFv4.0
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=1000GenomesPilot-NCBI36
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+##ALT=<ID=DEL:ME:ALU,Description="Deletion of ALU element">
+##ALT=<ID=CNV,Description="Copy number variable region">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001	NA00002	NA00003
+19	111	.	A	C	9.6	.	.	GT:HQ	0|0:10,10	0|0:10,10	0/1:3,3
+19	112	.	A	G	10	.	.	GT:HQ	0|0:10,10	0|0:10,10	0/1:3,3
+20	14370	rs6054257	G	A	29	PASS	NS=3;DP=14;AF=0.5;DB;H2	GT:GQ:DP:HQ	0|0:48:1:51,51	1|0:48:8:51,51	1/1:43:5:.,.
+20	17330	.	T	A	3	q10	NS=3;DP=11;AF=0.017	GT:GQ:DP:HQ	0|0:49:3:58,50	0|1:3:5:65,3	0/0:41:3:.,.
+20	1110696	rs6040355	A	G,T	67	PASS	NS=2;DP=10;AF=0.333,0.667;AA=T;DB	GT:GQ:DP:HQ	1|2:21:6:23,27	2|1:2:0:18,2	2/2:35:4:.,.
+20	1230237	.	T	.	47	PASS	NS=3;DP=13;AA=T	GT:GQ:DP:HQ	0|0:54:.:56,60	0|0:48:4:51,51	0/0:61:2:.,.
+20	1234567	microsat1	G	GA,GAC	50	PASS	NS=3;DP=9;AA=G;AN=6;AC=3,1	GT:GQ:DP	0/1:.:4	0/2:17:2	1/1:40:3
+20	1235237	.	T	.	.	.	.	GT	0/0	0|0	./.
+X	10	rsTest	AC	A,ATG	10	PASS	.	GT	0	0/1	0|2
diff --git a/src/BedReader.h b/src/BedReader.h
new file mode 100644
index 0000000..9deee16
--- /dev/null
+++ b/src/BedReader.h
@@ -0,0 +1,176 @@
+#ifndef BEDREADER_H
+#define BEDREADER_H
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <deque>
+#include <map>
+#include <iterator>
+#include <algorithm>
+#include "intervaltree/IntervalTree.h"
+#include "split.h"
+
+using namespace std;
+
+// Returns a copy of str with every leading and trailing character that
+// appears in separators removed.  An input consisting only of separator
+// characters (or an empty input) yields an empty string.
+//
+// Declared inline: this is a function definition in a header, and without
+// inline every translation unit including the header emits its own external
+// definition, which fails at link time.
+inline string strip(string const& str, char const* separators = " \t") {
+    string::size_type const first = str.find_first_not_of(separators);
+    if (first == string::npos) {
+        return string();
+    }
+    string::size_type const last = str.find_last_not_of(separators);
+    return str.substr(first, last - first + 1);
+}
+
+// Splits a region string into sequence name and coordinates.
+//
+// Accepted forms and the values written to the output parameters:
+//   "seq"           startSeq = seq, startPos = 0, stopPos = -1
+//   "seq:N"         startPos = N, stopPos = N + 1
+//   "seq:N..M"      startPos = N, stopPos = M   ("-" is accepted for "..")
+//   "seq:N.."       startPos = N, stopPos = -1
+// Numbers are converted with atoi, so non-numeric text yields 0.
+//
+// Declared inline: this is a function definition in a header, and without
+// inline every translation unit including the header emits its own external
+// definition, which fails at link time.
+inline void parseRegion(
+    string& region,
+    string& startSeq,
+    int& startPos,
+    int& stopPos) {
+
+    size_t foundFirstColon = region.find(":");
+
+    // we only have a single string, use the whole sequence as the target
+    if (foundFirstColon == string::npos) {
+        startSeq = region;
+        startPos = 0;
+        stopPos = -1;
+        return;
+    }
+
+    startSeq = region.substr(0, foundFirstColon);
+
+    // the range separator is ".." or, failing that, "-"
+    string sep = "..";
+    size_t foundRangeSep = region.find(sep, foundFirstColon);
+    if (foundRangeSep == string::npos) {
+        sep = "-";
+        foundRangeSep = region.find(sep, foundFirstColon);
+    }
+
+    if (foundRangeSep == string::npos) {
+        startPos = atoi(region.substr(foundFirstColon + 1).c_str());
+        // differ from bamtools in this regard, in that we process only
+        // the specified position if a range isn't given
+        stopPos = startPos + 1;
+        return;
+    }
+
+    // the start coordinate is exactly the text between ':' and the separator
+    startPos = atoi(region.substr(foundFirstColon + 1,
+                                  foundRangeSep - foundFirstColon - 1).c_str());
+    // if we have range sep specified, but no second number, read to the end of sequence
+    if (foundRangeSep + sep.size() != region.size()) {
+        stopPos = atoi(region.substr(foundRangeSep + sep.size()).c_str()); // end-exclusive, bed-format
+    } else {
+        stopPos = -1;
+    }
+}
+
+// Stores the positional information of a single BED target entry.
+class BedTarget {
+
+public:
+
+    string seq;  // sequence name
+    int left;    // left position
+    int right;   // right position, adjusted to 0-base
+    string desc; // descriptive information, target name typically
+
+    // Builds a target from a region string; seq, left and right are filled
+    // in by parseRegion and desc is left empty.
+    BedTarget(string s) {
+        parseRegion(s, seq, left, right);
+    }
+
+    // Builds a target from explicit values; the description is optional.
+    BedTarget(string s, int l, int r, string d = "")
+        : seq(s), left(l), right(r), desc(d) { }
+
+};
+
+
+// Reads BED targets from a file and indexes them, per sequence name, in
+// interval trees so that targets contained in or overlapping a query region
+// can be looked up.
+//
+// The trees store pointers to the BedTarget objects they were built from:
+// open() indexes the member vector `targets`, addTargets() indexes the
+// vector passed in, which therefore has to outlive the reader and must not
+// be resized afterwards.
+class BedReader {
+
+    bool _isOpen;
+    ifstream file;
+
+public:
+
+    // True once open() has successfully opened a file.
+    bool isOpen(void) { return _isOpen; }
+
+    vector<BedTarget> targets;
+    map<string, IntervalTree<BedTarget*> > intervals; // intervals by reference sequence
+
+    BedReader(void)
+        : _isOpen(false)
+    { }
+
+    BedReader(string& fname)
+        : _isOpen(false) {
+        open(fname);
+    }
+
+    // Parses the remaining lines of the open file into targets.  Fields are
+    // split on spaces and tabs: sequence, left, right and an optional
+    // description.  Lines with fewer than three fields (blank lines, for
+    // example) are skipped instead of being indexed out of range.
+    // Terminates the program if no file is open.
+    vector<BedTarget> entries(void) {
+
+        vector<BedTarget> entries;
+
+        if (!isOpen()) {
+            cerr << "bed targets file is not open" << endl;
+            exit(1);
+        }
+
+        string line;
+        while (std::getline(file, line)) {
+            vector<string> fields = split(line, " \t");
+            if (fields.size() < 3) {
+                continue;
+            }
+            BedTarget entry(strip(fields[0]),
+                            atoi(strip(fields[1]).c_str()),
+                            atoi(strip(fields[2]).c_str()),
+                            (fields.size() >= 4) ? strip(fields[3]) : "");
+            entries.push_back(entry);
+        }
+
+        return entries;
+
+    }
+
+    // Targets on target.seq that the tree reports as contained in
+    // [target.left, target.right].
+    vector<BedTarget*> targetsContained(BedTarget& target) {
+        vector<Interval<BedTarget*> > results;
+        intervals[target.seq].findContained(target.left, target.right, results);
+        vector<BedTarget*> contained;
+        for (vector<Interval<BedTarget*> >::iterator r = results.begin(); r != results.end(); ++r) {
+            contained.push_back(r->value);
+        }
+        return contained;
+    }
+
+    // Targets on target.seq that the tree reports as overlapping
+    // [target.left, target.right].
+    vector<BedTarget*> targetsOverlapping(BedTarget& target) {
+        vector<Interval<BedTarget*> > results;
+        intervals[target.seq].findOverlapping(target.left, target.right, results);
+        vector<BedTarget*> overlapping;
+        for (vector<Interval<BedTarget*> >::iterator r = results.begin(); r != results.end(); ++r) {
+            overlapping.push_back(r->value);
+        }
+        return overlapping;
+    }
+
+    // (Re)builds the interval tree of every sequence that occurs in the
+    // given targets.  Each interval spans 1 + left .. right and carries a
+    // pointer to its element of the vector passed in.
+    void addTargets(vector<BedTarget>& targets) {
+        map<string, vector<Interval<BedTarget*> > > intervalsBySeq;
+        for (vector<BedTarget>::iterator t = targets.begin(); t != targets.end(); ++t) {
+            intervalsBySeq[t->seq].push_back(Interval<BedTarget*>(1 + t->left, t->right, &*t));
+        }
+        for (map<string, vector<Interval<BedTarget*> > >::iterator s = intervalsBySeq.begin(); s != intervalsBySeq.end(); ++s) {
+            intervals[s->first] = IntervalTree<BedTarget*>(s->second);
+        }
+    }
+
+    // Opens fname, loads all of its entries into `targets` and indexes
+    // them.  When the file cannot be opened an error is reported on stderr,
+    // isOpen() stays false and no targets are loaded.
+    void open(const string& fname) {
+        file.open(fname.c_str());
+        _isOpen = file.is_open();
+        if (!_isOpen) {
+            cerr << "could not open bed targets file " << fname << endl;
+            return;
+        }
+        targets = entries();
+        addTargets(targets);
+    }
+
+};
+
+#endif
+
diff --git a/src/Variant.cpp b/src/Variant.cpp
new file mode 100644
index 0000000..d6f4a92
--- /dev/null
+++ b/src/Variant.cpp
@@ -0,0 +1,2405 @@
+#include "Variant.h"
+#include <utility>
+
+namespace vcf {
+
+// Parses one tab-separated VCF record into this Variant.
+//
+// line          the record text
+// parseSamples  when true (and the record has a FORMAT column) the
+//               per-sample columns are parsed into `samples`; when false the
+//               unparsed text is kept in `originalLine` instead
+//
+// Terminates the program (exit(1)) when the record has fewer than 7 fields
+// or when the number of sample columns differs from the number of entries
+// in sampleNames.
+void Variant::parse(string& line, bool parseSamples) {
+
+    // clean up potentially variable data structures
+    info.clear();
+    infoFlags.clear();
+    format.clear();
+    alt.clear();
+    alleles.clear();
+
+    // #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT [SAMPLE1 .. SAMPLEN]
+    vector<string> fields = split(line, '\t');
+    if (fields.size() < 7) {
+        cerr << "broken VCF record (less than 7 fields)" << endl
+             << line << endl;
+        exit(1);
+    }
+
+    sequenceName = fields.at(0);
+    char* end; // dummy variable for strtoll
+    position = strtoll(fields.at(1).c_str(), &end, 10);
+    id = fields.at(2);
+    ref = fields.at(3);
+    alt = split(fields.at(4), ","); // a comma-separated list of alternate alleles
+
+    // make a list of all (ref + alts) alleles, allele[0] = ref, alleles[1:] = alts
+    // add the ref allele ([0]), resize for the alt alleles, and then add the alt alleles
+    alleles.push_back(ref);
+    alleles.resize(alt.size()+1);
+    std::copy(alt.begin(), alt.end(), alleles.begin()+1);
+
+    // set up reverse lookup of allele index
+    // (maps each alt allele string to its 0-based position in `alt`)
+    altAlleleIndexes.clear();
+    int n = 0;
+    for (vector<string>::iterator a = alt.begin();
+            a != alt.end(); ++a, ++n) {
+        altAlleleIndexes[*a] = n;
+    }
+
+    convert(fields.at(5), quality);
+    filter = fields.at(6);
+    // INFO column: ';'-separated entries, either key=v1,v2,... or a bare flag
+    if (fields.size() > 7) {
+        vector<string> infofields = split(fields.at(7), ';');
+        for (vector<string>::iterator f = infofields.begin(); f != infofields.end(); ++f) {
+            if (*f == ".") {
+                continue;
+            }
+            vector<string> kv = split(*f, '=');
+            // entries that split into more than two parts on '=' are ignored
+            if (kv.size() == 2) {
+                split(kv.at(1), ',', info[kv.at(0)]);
+            } else if (kv.size() == 1) {
+                infoFlags[kv.at(0)] = true;
+            }
+        }
+    }
+    // check if we have samples specified
+    // and that we are supposed to parse them
+    if (parseSamples && fields.size() > 8) {
+        format = split(fields.at(8), ':');
+        // if the format changed, we have to rebuild the samples
+        if (fields.at(8) != lastFormat) {
+            samples.clear();
+            lastFormat = fields.at(8);
+        }
+        // walk the sample columns (index 9 onwards) and the header's sample
+        // names in lock-step
+        vector<string>::iterator sampleName = sampleNames.begin();
+        vector<string>::iterator sample = fields.begin() + 9;
+        for (; sample != fields.end() && sampleName != sampleNames.end(); ++sample, ++sampleName) {
+            string& name = *sampleName;
+            // a null sample removes any data stored under that name
+            if (*sample == "." || *sample == "./.") {
+                samples.erase(name);
+                continue;
+            }
+            vector<string> samplefields = split(*sample, ':');
+            vector<string>::iterator i = samplefields.begin();
+            if (samplefields.size() != format.size()) {
+                // ignore this case... malformed (or 'null') sample specs are caught above
+                // NOTE(review): nothing is stored or erased here, so when
+                // `samples` was not cleared above the entry under this name
+                // keeps whatever it held before -- confirm this is intended
+                // /*
+                // cerr << "inconsistent number of fields for sample " << name << endl
+                //      << "format is " << join(format, ":") << endl
+                //      << "sample is " << *sample << endl;
+                // exit(1);
+                // *
+            }
+            else {
+                // store each FORMAT key's comma-separated values for this sample
+                for (vector<string>::iterator f = format.begin(); f != format.end(); ++f) {
+                    samples[name][*f] = split(*i, ','); ++i;
+                }
+            }
+        }
+        // the loop stops at whichever list ends first; a leftover on either
+        // side means header and record disagree on the number of samples
+        if (sampleName != sampleNames.end()) {
+            cerr << "error: more sample names in header than sample fields" << endl;
+            cerr << "samples: " << join(sampleNames, " ") << endl;
+            cerr << "line: " << line << endl;
+            exit(1);
+        }
+        if (sample != fields.end()) {
+            cerr << "error: more sample fields than samples listed in header" << endl;
+            cerr << "samples: " << join(sampleNames, " ") << endl;
+            cerr << "line: " << line << endl;
+            cerr << *sample << endl;
+            exit(1);
+        }
+    } else if (!parseSamples) {
+        originalLine = line;
+    }
+
+    //return true; // we should be catching exceptions...
+}
+
+// Associates this variant with the given call file: keeps a pointer to it
+// and copies its sample names into both sampleNames and outputSampleNames.
+void Variant::setVariantCallFile(VariantCallFile& v) {
+    vcf = &v;
+    outputSampleNames = v.sampleNames;
+    sampleNames = v.sampleNames;
+}
+
+// Pointer overload: associates this variant with *v by storing the pointer
+// and copying its sample names into both sampleNames and outputSampleNames.
+void Variant::setVariantCallFile(VariantCallFile* v) {
+    vcf = v;
+    outputSampleNames = v->sampleNames;
+    sampleNames = v->sampleNames;
+}
+
+// Writes the lower-case name of a field type to the stream; any value that
+// is not one of the four named types is written as "unknown".
+ostream& operator<<(ostream& out, VariantFieldType type) {
+    const char* label = "unknown";
+    if (type == FIELD_INTEGER) {
+        label = "integer";
+    } else if (type == FIELD_FLOAT) {
+        label = "float";
+    } else if (type == FIELD_BOOL) {
+        label = "bool";
+    } else if (type == FIELD_STRING) {
+        label = "string";
+    }
+    out << label;
+    return out;
+}
+
+// Maps a header Type string ("Integer", "Float", "Flag", "String") to the
+// matching field type; any other string maps to FIELD_UNKNOWN.
+VariantFieldType typeStrToVariantFieldType(string& typeStr) {
+    if (typeStr == "Integer") return FIELD_INTEGER;
+    if (typeStr == "Float")   return FIELD_FLOAT;
+    if (typeStr == "Flag")    return FIELD_BOOL;
+    if (typeStr == "String")  return FIELD_STRING;
+    return FIELD_UNKNOWN;
+}
+
+// Returns the declared type of INFO field `key`.  The key "QUAL", when not
+// declared as an INFO field, is reported as FIELD_INTEGER.  Any other
+// undeclared key terminates the program.
+VariantFieldType Variant::infoType(string& key) {
+    map<string, VariantFieldType>::iterator found = vcf->infoTypes.find(key);
+    if (found != vcf->infoTypes.end()) {
+        return found->second;
+    }
+    if (key == "QUAL") { // hack to use QUAL as an "info" field
+        return FIELD_INTEGER;
+    }
+    cerr << "no info field " << key << endl;
+    exit(1);
+}
+
+    // Returns the declared type of FORMAT field `key`; an undeclared key
+    // terminates the program.
+    VariantFieldType Variant::formatType(string& key) {
+        map<string, VariantFieldType>::iterator found = vcf->formatTypes.find(key);
+        if (found != vcf->formatTypes.end()) {
+            return found->second;
+        }
+        cerr << "no format field " << key << endl;
+        exit(1);
+    }
+
+    // Returns whether flag-type INFO field `key` is set on this record.
+    //
+    // Terminates the program when `key` is not a declared INFO field, or
+    // when the field's count is ALLELE_NUMBER and no index was supplied.
+    // When the field is declared with a type other than Flag, a message is
+    // written to stderr and false is returned (control used to run off the
+    // end of the function here, which is undefined behaviour).
+    bool Variant::getInfoValueBool(string& key, int index) {
+        map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
+        if (s == vcf->infoTypes.end()) {
+            cerr << "no info field " << key << endl;
+            exit(1);
+        }
+        int count = vcf->infoCounts[key];
+        // XXX TODO, fix for Genotype variants...
+        if (count != ALLELE_NUMBER) {
+            index = 0;
+        }
+        if (index == INDEX_NONE) {
+            if (count != 1) {
+                cerr << "no field index supplied and field count != 1" << endl;
+                exit(1);
+            }
+            index = 0;
+        }
+        if (s->second != FIELD_BOOL) {
+            cerr << "not flag type " << key << endl;
+            return false;
+        }
+        // a flag is set exactly when it was recorded while parsing INFO
+        return infoFlags.find(key) != infoFlags.end();
+    }
+
+    // Returns the value at position `index` of string-type INFO field `key`,
+    // or "" when the record does not carry the field.  The index is forced
+    // to 0 unless the field's count is ALLELE_NUMBER.
+    //
+    // Terminates the program when `key` is not a declared INFO field, or
+    // when no usable index is available.  A field declared with a type other
+    // than String produces a message on stderr and "".
+    string Variant::getInfoValueString(string& key, int index) {
+        map<string, VariantFieldType>::iterator declared = vcf->infoTypes.find(key);
+        if (declared == vcf->infoTypes.end()) {
+            cerr << "no info field " << key << endl;
+            exit(1);
+        }
+
+        int count = vcf->infoCounts[key];
+        // XXX TODO, fix for Genotype variants...
+        if (count != ALLELE_NUMBER) {
+            index = 0;
+        }
+        if (index == INDEX_NONE) {
+            if (count != 1) {
+                cerr << "no field index supplied and field count != 1" << endl;
+                exit(1);
+            }
+            index = 0;
+        }
+
+        if (declared->second != FIELD_STRING) {
+            cerr << "not string type " << key << endl;
+            return "";
+        }
+
+        map<string, vector<string> >::iterator values = info.find(key);
+        if (values == info.end()) {
+            return "";
+        }
+        return values->second.at(index);
+    }
+
+    // Returns the value at position `index` of numeric (Float or Integer)
+    // INFO field `key` as a double, or 0 when the record does not carry the
+    // field.  The undeclared key "QUAL" returns the record's quality.  The
+    // index is forced to 0 unless the field's count is ALLELE_NUMBER.
+    //
+    // Terminates the program when `key` is undeclared (other than "QUAL"),
+    // when no usable index is available, when the stored text cannot be
+    // converted, or when the field is declared with a non-numeric type.
+    double Variant::getInfoValueFloat(string& key, int index) {
+        map<string, VariantFieldType>::iterator declared = vcf->infoTypes.find(key);
+        if (declared == vcf->infoTypes.end()) {
+            if (key == "QUAL") {
+                return quality;
+            }
+            cerr << "no info field " << key << endl;
+            exit(1);
+        }
+
+        int count = vcf->infoCounts[key];
+        // XXX TODO, fix for Genotype variants...
+        if (count != ALLELE_NUMBER) {
+            index = 0;
+        }
+        if (index == INDEX_NONE) {
+            if (count != 1) {
+                cerr << "no field index supplied and field count != 1" << endl;
+                exit(1);
+            }
+            index = 0;
+        }
+
+        VariantFieldType type = declared->second;
+        if (type != FIELD_FLOAT && type != FIELD_INTEGER) {
+            cerr << "unsupported type for variant record " << type << endl;
+            exit(1);
+        }
+
+        map<string, vector<string> >::iterator values = info.find(key);
+        if (values == info.end()) {
+            return 0.0;
+        }
+        double parsed;
+        if (!convert(values->second.at(index), parsed)) {
+            cerr << "could not convert field " << key << "=" << values->second.at(index) << " to " << type << endl;
+            exit(1);
+        }
+        return parsed;
+    }
+
+    // Number of entries in sampleNames.
+    int Variant::getNumSamples(void) {
+        const int numSamples = sampleNames.size();
+        return numSamples;
+    }
+
+    // Counts the parsed samples whose first GT value is not "./.".
+    //
+    // Samples without a GT entry, or with an empty one, are not counted;
+    // calling front() on them was undefined behaviour.  Each sample's map is
+    // read in place instead of being copied.
+    int Variant::getNumValidGenotypes(void) {
+        int valid_genotypes = 0;
+        map<string, map<string, vector<string> > >::const_iterator s     = samples.begin();
+        map<string, map<string, vector<string> > >::const_iterator sEnd  = samples.end();
+        for (; s != sEnd; ++s) {
+            map<string, vector<string> >::const_iterator gt = s->second.find("GT");
+            if (gt == s->second.end() || gt->second.empty()) {
+                continue;
+            }
+            if (gt->second.front() != "./.") {
+                valid_genotypes++;
+            }
+        }
+        return valid_genotypes;
+    }
+
+    // Returns whether flag-type FORMAT field `key` is present for `sample`.
+    //
+    // The lookup result is compared against formatTypes.end(), the container
+    // that was searched (it used to be compared against infoTypes.end()).
+    // Terminates the program when `key` is not a declared FORMAT field, or
+    // when the field's count is ALLELE_NUMBER and no index was supplied.
+    // When the field is declared with a type other than Flag, a message is
+    // written to stderr and false is returned (control used to run off the
+    // end of the function here).
+    bool Variant::getSampleValueBool(string& key, string& sample, int index) {
+        map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
+        if (s == vcf->formatTypes.end()) {
+            cerr << "no format field " << key << endl;
+            exit(1);
+        }
+        int count = vcf->formatCounts[key];
+        // XXX TODO, fix for Genotype variants...
+        if (count != ALLELE_NUMBER) {
+            index = 0;
+        }
+        if (index == INDEX_NONE) {
+            if (count != 1) {
+                cerr << "no field index supplied and field count != 1" << endl;
+                exit(1);
+            }
+            index = 0;
+        }
+        VariantFieldType type = s->second;
+        map<string, vector<string> >& sampleData = samples[sample];
+        if (type != FIELD_BOOL) {
+            cerr << "not bool type " << key << endl;
+            return false;
+        }
+        return sampleData.find(key) != sampleData.end();
+    }
+
+    // Returns the value at position `index` of string-type FORMAT field
+    // `key` for `sample`, or "" when the sample does not carry the field.
+    // The index is forced to 0 unless the field's count is ALLELE_NUMBER.
+    //
+    // The lookup result is compared against formatTypes.end(), the container
+    // that was searched (it used to be compared against infoTypes.end()).
+    // Terminates the program when `key` is not a declared FORMAT field, or
+    // when no usable index is available.  When the field is declared with a
+    // type other than String, a message is written to stderr and "" is
+    // returned (control used to run off the end of the function here).
+    string Variant::getSampleValueString(string& key, string& sample, int index) {
+        map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
+        if (s == vcf->formatTypes.end()) {
+            cerr << "no format field " << key << endl;
+            exit(1);
+        }
+        int count = vcf->formatCounts[key];
+        // XXX TODO, fix for Genotype variants...
+        if (count != ALLELE_NUMBER) {
+            index = 0;
+        }
+        if (index == INDEX_NONE) {
+            if (count != 1) {
+                cerr << "no field index supplied and field count != 1" << endl;
+                exit(1);
+            }
+            index = 0;
+        }
+        VariantFieldType type = s->second;
+        map<string, vector<string> >& sampleData = samples[sample];
+        if (type != FIELD_STRING) {
+            cerr << "not string type " << key << endl;
+            return "";
+        }
+        map<string, vector<string> >::iterator b = sampleData.find(key);
+        if (b == sampleData.end()) {
+            return "";
+        }
+        return b->second.at(index);
+    }
+
+    // Returns the value at position `index` of numeric (Float or Integer)
+    // FORMAT field `key` for `sample` as a double, or 0 when the sample does
+    // not carry the field.  The index is forced to 0 unless the field's
+    // count is ALLELE_NUMBER.
+    //
+    // The lookup result is compared against formatTypes.end(), the container
+    // that was searched (it used to be compared against infoTypes.end()).
+    // Terminates the program when `key` is not a declared FORMAT field, when
+    // no usable index is available, when the stored text cannot be
+    // converted, or when the field is declared with a non-numeric type (the
+    // last case used to run off the end of the function; it now exits like
+    // getInfoValueFloat does).
+    double Variant::getSampleValueFloat(string& key, string& sample, int index) {
+        map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
+        if (s == vcf->formatTypes.end()) {
+            cerr << "no format field " << key << endl;
+            exit(1);
+        }
+        // XXX TODO wrap this with a function call
+        int count = vcf->formatCounts[key];
+        // XXX TODO, fix for Genotype variants...
+        if (count != ALLELE_NUMBER) {
+            index = 0;
+        }
+        if (index == INDEX_NONE) {
+            if (count != 1) {
+                cerr << "no field index supplied and field count != 1" << endl;
+                exit(1);
+            }
+            index = 0;
+        }
+        VariantFieldType type = s->second;
+        map<string, vector<string> >& sampleData = samples[sample];
+        if (type != FIELD_FLOAT && type != FIELD_INTEGER) {
+            cerr << "unsupported type for sample " << type << endl;
+            exit(1);
+        }
+        map<string, vector<string> >::iterator b = sampleData.find(key);
+        if (b == sampleData.end()) {
+            return 0.0;
+        }
+        double r;
+        if (!convert(b->second.at(index), r)) {
+            cerr << "could not convert field " << key << "=" << b->second.at(index) << " to " << type << endl;
+            exit(1);
+        }
+        return r;
+    }
+
+    // Boolean lookup dispatcher: an empty sample name routes to
+    // getInfoValueBool, any other name to getSampleValueBool.
+    bool Variant::getValueBool(string& key, string& sample, int index) {
+        const bool recordLevel = sample.empty();
+        return recordLevel ? getInfoValueBool(key, index)
+                           : getSampleValueBool(key, sample, index);
+    }
+
+    // Numeric lookup dispatcher: an empty sample name routes to
+    // getInfoValueFloat, any other name to getSampleValueFloat.
+    double Variant::getValueFloat(string& key, string& sample, int index) {
+        const bool recordLevel = sample.empty();
+        return recordLevel ? getInfoValueFloat(key, index)
+                           : getSampleValueFloat(key, sample, index);
+    }
+
+    // String lookup dispatcher: an empty sample name routes to
+    // getInfoValueString, any other name to getSampleValueString.
+    string Variant::getValueString(string& key, string& sample, int index) {
+        const bool recordLevel = sample.empty();
+        return recordLevel ? getInfoValueString(key, index)
+                           : getSampleValueString(key, sample, index);
+    }
+
+    // Looks `allele` up in altAlleleIndexes and returns the stored index.
+    // An unknown allele is fatal: a message is written to cerr and the
+    // process exits with status 1.
+    int Variant::getAltAlleleIndex(string& allele) {
+        map<string, int>::iterator hit = altAlleleIndexes.find(allele);
+        if (hit != altAlleleIndexes.end()) {
+            return hit->second;
+        }
+        cerr << "no such allele \'" << allele << "\' in record " << sequenceName << ":" << position << endl;
+        exit(1);
+    }
+
+    // Adds `tag` to the filter string.  An unset filter ("" or ".") is
+    // replaced by the tag; otherwise the tag is appended after a comma.
+    // NOTE(review): the VCF specification describes FILTER as a
+    // semicolon-separated list -- confirm whether "," is intentional here.
+    void Variant::addFilter(string& tag) {
+        const bool unset = filter.empty() || filter == ".";
+        if (unset) {
+            filter = tag;
+            return;
+        }
+        filter += "," + tag;
+    }
+
+    // Appends `key` to the format list unless it is already present.
+    void Variant::addFormatField(string& key) {
+        vector<string>::iterator entry = format.begin();
+        while (entry != format.end()) {
+            if (*entry == key) {
+                return; // already listed, nothing to do
+            }
+            ++entry;
+        }
+        format.push_back(key);
+    }
+
+    // Writes the alternate alleles to `out` as a comma-separated list.
+    void Variant::printAlt(ostream& out) {
+        for (size_t n = 0; n < alt.size(); ++n) {
+            // a comma goes between neighbours, never before the first entry
+            if (n > 0) out << ",";
+            out << alt[n];
+        }
+    }
+
+    // Writes every entry of `alleles` to `out` as a comma-separated list.
+    void Variant::printAlleles(ostream& out) {
+        for (size_t n = 0; n < alleles.size(); ++n) {
+            // a comma goes between neighbours, never before the first entry
+            if (n > 0) out << ",";
+            out << alleles[n];
+        }
+    }
+
+    // Writes `var` to `out` as one tab-separated VCF record (no trailing
+    // newline) and returns `out`.
+    //
+    // Side effect: empty sequenceName/id/ref/alt/filter members of `var` are
+    // replaced by "." before printing.
+    //
+    // INFO column: key=value entries whose value list is empty are skipped;
+    // every key in infoFlags is printed as a bare flag.  Entries are joined
+    // with ";" and "." is written when nothing at all was emitted.
+    // Sample columns are written only when `format` is non-empty, one per
+    // name in outputSampleNames, with "." for missing samples or values.
+    ostream& operator<<(ostream& out, Variant& var) {
+        // ensure there are no empty fields
+        if (var.sequenceName.empty()) var.sequenceName = ".";
+        if (var.id.empty()) var.id = ".";
+        if (var.ref.empty()) var.ref = ".";
+        if (var.alt.empty()) var.alt.push_back(".");
+        if (var.filter.empty()) var.filter = ".";
+
+        out << var.sequenceName << "\t"
+            << var.position << "\t"
+            << var.id << "\t"
+            << var.ref << "\t";
+        // report the list of alternate alleles.
+        var.printAlt(out);
+        out << "\t"
+            << var.quality << "\t"
+            << var.filter << "\t";
+
+        // Track whether anything has been written rather than testing the
+        // iterator position: a skipped (empty) first entry must not cause the
+        // next one to be prefixed with a stray ";".
+        bool wroteInfo = false;
+        for (map<string, vector<string> >::iterator i = var.info.begin(); i != var.info.end(); ++i) {
+            if (i->second.empty()) {
+                continue;
+            }
+            if (wroteInfo) out << ";";
+            out << i->first << "=" << join(i->second, ",");
+            wroteInfo = true;
+        }
+        for (map<string, bool>::iterator i = var.infoFlags.begin(); i != var.infoFlags.end(); ++i) {
+            if (wroteInfo) out << ";";
+            out << i->first;
+            wroteInfo = true;
+        }
+        if (!wroteInfo) {
+            out << "."; // never leave the INFO column blank
+        }
+
+        if (!var.format.empty()) {
+            out << "\t";
+            for (vector<string>::iterator f = var.format.begin(); f != var.format.end(); ++f) {
+                out << ((f == var.format.begin()) ? "" : ":") << *f;
+            }
+            for (vector<string>::iterator s = var.outputSampleNames.begin(); s != var.outputSampleNames.end(); ++s) {
+                out << "\t";
+                map<string, map<string, vector<string> > >::iterator sampleItr = var.samples.find(*s);
+                if (sampleItr == var.samples.end()) {
+                    out << ".";
+                } else {
+                    map<string, vector<string> >& sample = sampleItr->second;
+                    if (sample.size() == 0) {
+                        out << ".";
+                    } else {
+                        for (vector<string>::iterator f = var.format.begin(); f != var.format.end(); ++f) {
+                            map<string, vector<string> >::iterator g = sample.find(*f);
+                            out << ((f == var.format.begin()) ? "" : ":");
+                            if (g != sample.end() && !g->second.empty()) {
+                                out << join(g->second, ",");
+                            } else {
+                                out << ".";
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        return out;
+    }
+
+    // Replaces outputSampleNames with a copy of `samplesToOutput`.
+    void Variant::setOutputSampleNames(vector<string>& samplesToOutput) {
+        // copy first, then swap in, so passing outputSampleNames itself is safe
+        vector<string> chosen(samplesToOutput);
+        outputSampleNames.swap(chosen);
+    }
+
+
+// shunting yard algorithm: drains the infix queue `tokens` (taken by value) and appends the tokens to `prefixtokens` in postfix (RPN) order, despite the "Prefix" in the name; exits with status 1 on mismatched parentheses
+    void infixToPrefix(queue<RuleToken> tokens, queue<RuleToken>& prefixtokens) {
+        stack<RuleToken> ops; // pending operators and left parentheses
+        while (!tokens.empty()) {
+            RuleToken& token = tokens.front(); // reference stays valid until tokens.pop() at the bottom of the loop
+            if (isOperator(token)) {
+                //cerr << "found operator " << token.value << endl;
+                while (ops.size() > 0 && isOperator(ops.top()) // flush stacked operators that must be applied before `token`
+                       && (   (isLeftAssociative(token)  && priority(token) <= priority(ops.top()))
+                              || (isRightAssociative(token) && priority(token) <  priority(ops.top())))) {
+                    prefixtokens.push(ops.top());
+                    ops.pop();
+                }
+                ops.push(token);
+            } else if (isLeftParenthesis(token)) {
+                //cerr << "found paran " << token.value << endl;
+                ops.push(token); // marks where the matching right parenthesis stops unwinding
+            } else if (isRightParenthesis(token)) {
+                //cerr << "found paran " << token.value << endl;
+                while (ops.size() > 0 && !isLeftParenthesis(ops.top())) { // emit everything stacked since the matching "("
+                    prefixtokens.push(ops.top());
+                    ops.pop();
+                }
+                if (ops.size() == 0) { // ran out of stack without meeting a "("
+                    cerr << "error: mismatched parentheses" << endl;
+                    exit(1);
+                }
+                if (isLeftParenthesis(ops.top())) {
+                    ops.pop(); // discard the "(" itself; parentheses never reach the output
+                }
+            } else {
+                //cerr << "found operand " << token.value << endl;
+                prefixtokens.push(token); // operands go straight to the output
+            }
+            tokens.pop();
+        }
+        while (ops.size() > 0) { // input exhausted: flush the remaining operators
+            if (isRightParenthesis(ops.top()) || isLeftParenthesis(ops.top())) { // a parenthesis left on the stack was never closed
+                cerr << "error: mismatched parentheses" << endl;
+                exit(1);
+            }
+            prefixtokens.push(ops.top());
+            ops.pop();
+        }
+    }
+
+    // Classifies one token of a filter expression.
+    //
+    // Single-character operator and parenthesis symbols map to their token
+    // type.  Anything else is an operand: a name found in `variables` (or the
+    // literal "QUAL") stays OPERAND and is flagged as a variable; text that
+    // convert() parses into `number` becomes NUMBER; all other text becomes
+    // STRING_VARIABLE.  `value` always keeps the original token text.
+    RuleToken::RuleToken(string tokenstr, map<string, VariantFieldType>& variables) {
+        isVariable = false;
+        value = tokenstr;
+
+        // only one-character tokens can be operators or parentheses
+        const char symbol = (tokenstr.size() == 1) ? tokenstr[0] : '\0';
+        switch (symbol) {
+            case '!': type = RuleToken::NOT_OPERATOR;          return;
+            case '&': type = RuleToken::AND_OPERATOR;          return;
+            case '|': type = RuleToken::OR_OPERATOR;           return;
+            case '+': type = RuleToken::ADD_OPERATOR;          return;
+            case '-': type = RuleToken::SUBTRACT_OPERATOR;     return;
+            case '*': type = RuleToken::MULTIPLY_OPERATOR;     return;
+            case '/': type = RuleToken::DIVIDE_OPERATOR;       return;
+            case '=': type = RuleToken::EQUAL_OPERATOR;        return;
+            case '>': type = RuleToken::GREATER_THAN_OPERATOR; return;
+            case '<': type = RuleToken::LESS_THAN_OPERATOR;    return;
+            case '(': type = RuleToken::LEFT_PARENTHESIS;      return;
+            case ')': type = RuleToken::RIGHT_PARENTHESIS;     return;
+            default:  break;
+        }
+
+        // operand
+        type = RuleToken::OPERAND;
+        if (variables.find(tokenstr) != variables.end()) {
+            isVariable = true;
+            return;
+        }
+        if (convert(tokenstr, number)) {
+            type = RuleToken::NUMBER;
+        } else if (tokenstr == "QUAL") {
+            isVariable = true;
+        } else {
+            type = RuleToken::STRING_VARIABLE;
+        }
+    }
+
+
+    // Splits `filterspec` into RuleTokens appended to `tokens`.
+    //
+    // Spaces and newlines end the current token.  An operator or parenthesis
+    // character is a token of its own only when it does not occur inside a
+    // token that is already being accumulated; inside a token it is kept as
+    // part of the text (so "1/1" stays one token).
+    void tokenizeFilterSpec(string& filterspec, queue<RuleToken>& tokens, map<string, VariantFieldType>& variables) {
+        string pending;       // text of the token currently being collected
+        bool inToken = false; // true exactly while `pending` is non-empty
+        for (string::size_type pos = 0; pos < filterspec.size(); ++pos) {
+            const char c = filterspec[pos];
+            if (c == ' ' || c == '\n') {
+                inToken = false;
+                if (!pending.empty()) {
+                    tokens.push(RuleToken(pending, variables));
+                    pending.clear();
+                }
+                continue;
+            }
+            if (!inToken && (isOperatorChar(c) || isParanChar(c))) {
+                // `pending` is always empty here, so there is nothing to flush
+                tokens.push(RuleToken(string(1, c), variables));
+                continue;
+            }
+            inToken = true;
+            pending += c;
+        }
+        // get the last token
+        if (inToken) {
+            tokens.push(RuleToken(pending, variables));
+        }
+    }
+
+// class which evaluates filter expressions
+// allow filters to be defined using boolean infix expressions e.g.:
+//
+// "GQ > 10 & (DP < 3 | DP > 5) & SAMPLE = NA12878"
+// or
+// "GT = 1/1 | GT = 0/0"
+//
+// on initialization, tokenizes the input sequence and converts it from infix to postfix;
+// on call to passes(), evaluates the postfix rules against a variant (and, optionally, a sample and allele)
+//
+
+
+    // Builds a filter from the infix expression `filterspec`: stores the
+    // spec and filter type, tokenizes the spec into `tokens`, then converts
+    // the tokens into the evaluation queue `rules`.
+    VariantFilter::VariantFilter(string filterspec, VariantFilterType filtertype, map<string, VariantFieldType>& variables) {
+        spec = filterspec;
+        type = filtertype;
+        // step 1: text -> tokens; step 2: infix tokens -> rule queue
+        tokenizeFilterSpec(filterspec, tokens, variables);
+        infixToPrefix(tokens, rules);
+    }
+
+// true only when every alternate allele of `var` passes for `sample`
+// (and therefore true when there are no alternate alleles)
+    bool VariantFilter::passes(Variant& var, string& sample) {
+        vector<string>::iterator a = var.alt.begin();
+        while (a != var.alt.end()) {
+            if (!passes(var, sample, *a)) {
+                return false; // one failing allele rejects the record
+            }
+            ++a;
+        }
+        return true;
+    }
+
+    bool VariantFilter::passes(Variant& var, string& sample, string& allele) { // evaluates `rules` for one sample/allele and returns the resulting boolean; exits with status 1 on most evaluation errors
+        // Evaluates the postfix (RPN) rule queue built by the constructor.
+        // Operands are pushed onto a result stack; each operator pops its
+        // operand(s), computes a value and pushes it back. NOTE(review): results.top() is never guarded against an empty stack.
+        stack<RuleToken> results;
+        queue<RuleToken> rulesCopy = rules; // work on a copy so `rules` stays intact for the next call
+
+        int index;
+        if (allele.empty()) {
+            index = 0; // apply to the whole record
+        } else {
+            // apply to a specific allele
+            index = var.getAltAlleleIndex(allele);
+        }
+
+        while (!rulesCopy.empty()) {
+            RuleToken token = rulesCopy.front(); // copy: the token is modified below before being pushed
+            rulesCopy.pop();
+        // pop operands from the front of the queue and push them onto the stack
+        if (isOperand(token)) {
+            //cout << "is operand: " << token.value << endl;
+            // if the token is variable, i.e. not evaluated in this context, we
+            // must evaluate it before pushing it onto the stack
+            if (token.isVariable) {
+                //cout << "is variable" << endl;
+                // look up the variable using the Variant, depending on our filter type
+                //cout << "token.value " << token.value << endl;
+                VariantFieldType vtype;
+                if (sample.empty()) { // means we are record-specific
+                    vtype = var.infoType(token.value);
+                } else {
+                    vtype = var.formatType(token.value);
+                    //cout << "type = " << type << endl;
+                }
+                //cout << "type: " << type << endl;
+
+                if (vtype == FIELD_INTEGER || vtype == FIELD_FLOAT) { // integers are evaluated as doubles too
+                    token.type = RuleToken::NUMERIC_VARIABLE;
+                    token.number = var.getValueFloat(token.value, sample, index);
+                    //cerr << "number: " << token.number << endl;
+                } else if (vtype == FIELD_BOOL) {
+                    token.type = RuleToken::BOOLEAN_VARIABLE;
+                    token.state = var.getValueBool(token.value, sample, index);
+                    //cerr << "state: " << token.state << endl;
+                } else if (vtype == FIELD_STRING) {
+                    //cout << "token.value = " << token.value << endl;
+                    token.type = RuleToken::STRING_VARIABLE;
+                    token.str = var.getValueString(token.value, sample, index);
+                } else if (isString(token)) {
+                    token.type = RuleToken::STRING_VARIABLE;
+                    token.str = var.getValueString(token.value, sample, index);
+                    //cerr << "string: " << token.str << endl;
+                } // any other vtype: the token is pushed below with its type unchanged
+            } else {
+                double f;
+                string s;
+                //cerr << "parsing operand" << endl;
+                if (convert(token.value, f)) { // literal operand: try numeric first, then string
+                    token.type = RuleToken::NUMERIC_VARIABLE;
+                    token.number = f;
+                    //cerr << "number: " << token.number << endl;
+                } else if (convert(token.value, s)) {
+                    token.type = RuleToken::STRING_VARIABLE;
+                    token.str = s;
+                    //cerr << "string: " << token.str << endl;
+                } else {
+                    cerr << "could not parse non-variable operand " << token.value << endl;
+                    exit(1);
+                }
+            }
+            results.push(token);
+        } // end of operand handling
+        // apply operators to the first n elements on the stack and push the result back onto the stack
+        else if (isOperator(token)) {
+            //cerr << "is operator: " << token.value << endl;
+            RuleToken a, b, r; // a = top of stack (right-hand operand), b = next (left-hand operand), r = result; NOTE(review): r.type is never set for boolean results, so isBoolean(r) presumably relies on RuleToken's default constructor (not visible here) -- confirm
+            // is it a not-operator?
+            switch (token.type) {
+                case ( RuleToken::NOT_OPERATOR ):
+                    a = results.top();
+                    results.pop();
+                    if (!isBoolean(a)) {
+                        cerr << "cannot negate a non-boolean" << endl; // not fatal: the operand stays popped and nothing is pushed
+                    } else {
+                        a.state = !a.state;
+                        results.push(a);
+                    }
+                    break;
+
+                case ( RuleToken::EQUAL_OPERATOR ):
+                    a = results.top(); results.pop();
+                    b = results.top(); results.pop();
+                    if (a.type == b.type) {
+                        switch (a.type) {
+                            case (RuleToken::STRING_VARIABLE):
+                                r.state = (a.str == b.str);
+                                break;
+                            case (RuleToken::NUMERIC_VARIABLE):
+                                r.state = (a.number == b.number); // exact floating-point equality
+                                break;
+                            case (RuleToken::BOOLEAN_VARIABLE):
+                                r.state = (a.state == b.state);
+                                break;
+                            default:
+                                cerr << "should not get here" << endl; exit(1);
+                                break;
+                        }
+                    } else if (a.type == RuleToken::STRING_VARIABLE && b.type == RuleToken::NUMERIC_VARIABLE) { // mixed string/number: compare the number's text form with the string
+                        r.state = (convert(b.number) == a.str);
+                    } else if (b.type == RuleToken::STRING_VARIABLE && a.type == RuleToken::NUMERIC_VARIABLE) {
+                        r.state = (convert(a.number) == b.str);
+                    } // any other type mix: r.state keeps whatever the default-constructed token holds
+                    results.push(r);
+                    break;
+
+                case ( RuleToken::GREATER_THAN_OPERATOR ):
+                    a = results.top(); results.pop();
+                    b = results.top(); results.pop();
+                    if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
+                        r.state = (b.number > a.number); // b is the left-hand operand
+                    } else {
+                        cerr << "cannot compare (>) objects of dissimilar types" << endl;
+                        cerr << a.type << " " << b.type << endl;
+                        exit(1);
+                    }
+                    results.push(r);
+                    break;
+
+                case ( RuleToken::LESS_THAN_OPERATOR ):
+                    a = results.top(); results.pop();
+                    b = results.top(); results.pop();
+                    if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
+                        r.state = (b.number < a.number); // b is the left-hand operand
+                    } else {
+                        cerr << "cannot compare (<) objects of dissimilar types" << endl;
+                        cerr << a.type << " " << b.type << endl;
+                        exit(1);
+                    }
+                    results.push(r);
+                    break;
+
+                case ( RuleToken::ADD_OPERATOR ):
+                    a = results.top(); results.pop();
+                    b = results.top(); results.pop();
+                    if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
+                        r.number = (b.number + a.number);
+                        r.type = RuleToken::NUMERIC_VARIABLE;
+                    } else {
+                        cerr << "cannot add objects of dissimilar types" << endl;
+                        cerr << a.type << " " << b.type << endl;
+                        exit(1);
+                    }
+                    results.push(r);
+                    break;
+
+                case ( RuleToken::SUBTRACT_OPERATOR ):
+                    a = results.top(); results.pop();
+                    b = results.top(); results.pop();
+                    if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
+                        r.number = (b.number - a.number); // left minus right
+                        r.type = RuleToken::NUMERIC_VARIABLE;
+                    } else {
+                        cerr << "cannot subtract objects of dissimilar types" << endl;
+                        cerr << a.type << " " << b.type << endl;
+                        exit(1);
+                    }
+                    results.push(r);
+                    break;
+
+                case ( RuleToken::MULTIPLY_OPERATOR ):
+                    a = results.top(); results.pop();
+                    b = results.top(); results.pop();
+                    if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
+                        r.number = (b.number * a.number);
+                        r.type = RuleToken::NUMERIC_VARIABLE;
+                    } else {
+                        cerr << "cannot multiply objects of dissimilar types" << endl;
+                        cerr << a.type << " " << b.type << endl;
+                        exit(1);
+                    }
+                    results.push(r);
+                    break;
+
+                case ( RuleToken::DIVIDE_OPERATOR):
+                    a = results.top(); results.pop();
+                    b = results.top(); results.pop();
+                    if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
+                        r.number = (b.number / a.number); // left divided by right; no check for a zero divisor
+                        r.type = RuleToken::NUMERIC_VARIABLE;
+                    } else {
+                        cerr << "cannot divide objects of dissimilar types" << endl;
+                        cerr << a.type << " " << b.type << endl;
+                        exit(1);
+                    }
+                    results.push(r);
+                    break;
+
+                case ( RuleToken::AND_OPERATOR ):
+                case ( RuleToken::OR_OPERATOR ): // AND and OR share the operand handling below
+                    a = results.top(); results.pop();
+                    b = results.top(); results.pop();
+                    if (a.type == b.type && a.type == RuleToken::BOOLEAN_VARIABLE) {
+                        if (token.type == RuleToken::AND_OPERATOR) {
+                            r.state = (a.state && b.state);
+                        } else {
+                            r.state = (a.state || b.state);
+                        }
+                    } else {
+                        cerr << "cannot compare (& or |) objects of dissimilar types" << endl;
+                        exit(1);
+                    }
+                    results.push(r);
+                    break;
+                default:
+                    cerr << "should not get here!" << endl; exit(1);
+                    break;
+            }
+        }
+    }
+    // at the end you should have only one value on the stack, return it as a boolean
+    if (results.size() == 1) {
+        if (isBoolean(results.top())) {
+            return results.top().state;
+        } else {
+            cerr << "error, non-boolean value left on stack" << endl;
+            //cerr << results.top().value << endl;
+            exit(1);
+        }
+    } else if (results.size() > 1) {
+        cerr << "more than one value left on results stack!" << endl;
+        while (!results.empty()) { // dump the leftover tokens to help diagnose the malformed expression
+            cerr << results.top().value << endl;
+            results.pop();
+        }
+        exit(1);
+    } else {
+        cerr << "results stack empty" << endl;
+        exit(1);
+    }
+}
+
+// For every sample of `var` that fails this filter: when keepInfo is true
+// its GT value is replaced by "./." and the other fields are kept; when
+// keepInfo is false the whole sample entry is erased from var.samples.
+void VariantFilter::removeFilteredGenotypes(Variant& var, bool keepInfo) {
+    for (size_t n = 0; n < var.sampleNames.size(); ++n) {
+        string& name = var.sampleNames[n];
+        if (passes(var, name)) {
+            continue; // sample survives untouched
+        }
+        if (!keepInfo) {
+            var.samples.erase(name);
+            continue;
+        }
+        vector<string>& genotype = var.samples[name]["GT"];
+        genotype.clear();
+        genotype.push_back("./.");
+    }
+}
+
+/*
+bool VariantCallFile::openVCF(string& filename) {
+    file.open(filename.c_str(), ifstream::in);
+    if (!file.is_open()) {
+        cerr << "could not open " << filename << endl;
+        return false;
+    } else {
+        return parseHeader();
+    }
+}
+
+bool VariantCallFile::openVCF(ifstream& stream) {
+    file = stream;
+    if (!file.is_open()) {
+        cerr << "provided file is not open" << endl;
+        return false;
+    } else {
+        return parseHeader();
+    }
+}
+*/
+
+// Replaces sampleNames with `newSamples` and rewrites the last line of
+// `header` (the column-name line) so that its sample columns match.
+//
+// At most the first 9 columns of the existing line are kept.  A line with
+// fewer than 9 columns is copied as far as it goes instead of being read
+// past its end; when it stops after the 8th column and samples are being
+// added, a "FORMAT" column is inserted ahead of the sample names.
+// headerLines.at() throws std::out_of_range if the header has no lines.
+void VariantCallFile::updateSamples(vector<string>& newSamples) {
+    sampleNames = newSamples;
+    // regenerate the last line of the header
+    vector<string> headerLines = split(header, '\n');
+    vector<string> colnames = split(headerLines.at(headerLines.size() - 1), '\t'); // get the last, update the samples
+    const size_t fixedColumns = 9; // CHROM .. FORMAT
+    const size_t keep = (colnames.size() < fixedColumns) ? colnames.size() : fixedColumns;
+    vector<string> newcolnames(colnames.begin(), colnames.begin() + keep);
+    if (!sampleNames.empty() && newcolnames.size() == fixedColumns - 1) {
+        // sites-only header: sample columns require a FORMAT column first
+        newcolnames.push_back("FORMAT");
+    }
+    newcolnames.insert(newcolnames.end(), sampleNames.begin(), sampleNames.end());
+    headerLines.at(headerLines.size() - 1) = join(newcolnames, "\t");
+    header = join(headerLines, "\n");
+}
+
+// Non-destructive counterpart of updateSamples(): returns a copy of the
+// header whose last (column-name) line has `newSamples` appended after all
+// of its existing columns.  `header` itself is not modified.
+//
+// Returns the header unchanged when `newSamples` is empty.
+// Terminates the process with a failure status when the column line has
+// fewer than 8 columns.
+string VariantCallFile::headerWithSampleNames(vector<string>& newSamples) {
+    // regenerate the last line of the header
+    if (newSamples.empty()) return header;
+    vector<string> headerLines = split(header, '\n');
+    vector<string> colnames = split(headerLines.at(headerLines.size() - 1), '\t'); // get the last, update the samples
+    vector<string> newcolnames;
+    unsigned int colCount = colnames.size(); // used to be hard-coded 9, hopefully the dynamic colCount isn't an issue
+    if (colCount < 8)
+    {
+        // this is an error: report it on stderr and exit non-zero so that
+        // callers and shell pipelines can detect the failure
+        cerr << "VCF file is not suitable for use because it does not have a format field." << endl;
+        exit(1);
+    }
+    newcolnames.resize(colCount + newSamples.size());
+    copy(colnames.begin(), colnames.begin() + colCount, newcolnames.begin());
+    copy(newSamples.begin(), newSamples.end(), newcolnames.begin() + colCount);
+    headerLines.at(headerLines.size() - 1) = join(newcolnames, "\t");
+    return join(headerLines, "\n");
+}
+
+// TODO cleanup, store header lines instead of bulk header
+// Inserts `line` immediately before the last header line (the column-name
+// line) and removes duplicate lines from the header, keeping the first
+// occurrence of each.
+void VariantCallFile::addHeaderLine(string line) {
+    vector<string> headerLines = split(header, '\n');
+    if (headerLines.empty()) {
+        // end() - 1 is not a valid position in an empty vector
+        headerLines.push_back(line);
+    } else {
+        headerLines.insert(headerLines.end() - 1, line);
+    }
+    header = join(unique(headerLines), "\n");
+}
+
+// helper to addHeaderLine: removes repeated strings in place, keeping the
+// first occurrence of each and the original order; returns `strings`
+vector<string>& unique(vector<string>& strings) {
+    set<string> seen;
+    vector<string> kept;
+    for (size_t n = 0; n < strings.size(); ++n) {
+        const string& candidate = strings[n];
+        // insert() reports through .second whether the value was new
+        if (seen.insert(candidate).second) {
+            kept.push_back(candidate);
+        }
+    }
+    strings = kept;
+    return strings;
+}
+
+// Collects the ID of every header line that starts with "##INFO": the text
+// between the first "ID=" and the next ",".  Lines lacking either marker
+// are ignored.
+vector<string> VariantCallFile::infoIds(void) {
+    vector<string> ids;
+    vector<string> lines = split(header, '\n');
+    for (size_t n = 0; n < lines.size(); ++n) {
+        const string& entry = lines[n];
+        if (entry.find("##INFO") != 0) continue;
+        size_t idStart = entry.find("ID=");
+        if (idStart == string::npos) continue;
+        idStart += 3; // step over "ID="
+        const size_t idEnd = entry.find(",", idStart);
+        if (idEnd == string::npos) continue;
+        ids.push_back(entry.substr(idStart, idEnd - idStart));
+    }
+    return ids;
+}
+
+// Collects the ID of every header line that starts with "##FORMAT": the
+// text between the first "ID=" and the next ",".  Lines lacking either
+// marker are ignored.
+vector<string> VariantCallFile::formatIds(void) {
+    vector<string> ids;
+    vector<string> lines = split(header, '\n');
+    for (size_t n = 0; n < lines.size(); ++n) {
+        const string& entry = lines[n];
+        if (entry.find("##FORMAT") != 0) continue;
+        size_t idStart = entry.find("ID=");
+        if (idStart == string::npos) continue;
+        idStart += 3; // step over "ID="
+        const size_t idEnd = entry.find(",", idStart);
+        if (idEnd == string::npos) continue;
+        ids.push_back(entry.substr(idStart, idEnd - idStart));
+    }
+    return ids;
+}
+
+// Rebuilds `header` without the "##INFO" lines that contain "ID=<tag>,".
+// All other lines are kept in their original order.
+void VariantCallFile::removeInfoHeaderLine(string tag) {
+    const string needle = "ID=" + tag + ",";
+    vector<string> lines = split(header, '\n');
+    vector<string> kept;
+    for (size_t n = 0; n < lines.size(); ++n) {
+        const string& entry = lines[n];
+        const bool drop = entry.find("##INFO") == 0
+                          && entry.find(needle) != string::npos;
+        if (!drop) {
+            kept.push_back(entry);
+        }
+    }
+    header = join(kept, "\n");
+}
+
+// Rebuilds `header` without the "##FORMAT" lines that contain "ID=<tag>,".
+// All other lines are kept in their original order.
+void VariantCallFile::removeGenoHeaderLine(string tag) {
+    const string needle = "ID=" + tag + ",";
+    vector<string> lines = split(header, '\n');
+    vector<string> kept;
+    for (size_t n = 0; n < lines.size(); ++n) {
+        const string& entry = lines[n];
+        const bool drop = entry.find("##FORMAT") == 0
+                          && entry.find(needle) != string::npos;
+        if (!drop) {
+            kept.push_back(entry);
+        }
+    }
+    header = join(kept, "\n");
+}
+
+// Reads the header from the underlying source and returns it split into
+// lines.  With tabix the header comes from tabixFile and a missing header
+// is fatal (exit(1)); otherwise lines starting with '#' are read from
+// *file until the first other line, and a missing header yields an empty
+// vector.  The first non-header line is left in `line` and firstRecord is
+// set to true.
+vector<string> VariantCallFile::getHeaderLinesFromFile()
+{
+    string collected;
+
+    if (!usingTabix) {
+        while (std::getline(*file, line)) {
+            if (!line.empty() && line[0] == '#') {
+                collected += line + '\n';
+                continue;
+            }
+            // done with header
+            if (collected.empty()) {
+                cerr << "error: no VCF header" << endl;
+                return vector<string>();
+            }
+            firstRecord = true;
+            break;
+        }
+    } else {
+        tabixFile->getHeader(collected);
+        if (collected.empty()) {
+            cerr << "error: no VCF header" << endl;
+            exit(1);
+        }
+        tabixFile->getNextLine(line);
+        firstRecord = true;
+    }
+    return split(collected, "\n");
+}
+
+// Reads the raw header text from the underlying source, stores it in
+// vcf_header and hands it to parseHeader(string&).  With tabix a missing
+// header is fatal (exit(1)); when reading from *file a missing header makes
+// the function return false.  The first non-header line is left in `line`
+// and firstRecord is set to true.
+bool VariantCallFile::parseHeader(void) {
+
+    string collected;
+
+    if (!usingTabix) {
+        while (std::getline(*file, line)) {
+            if (!line.empty() && line[0] == '#') {
+                collected += line + '\n';
+                continue;
+            }
+            // done with header
+            if (collected.empty()) {
+                cerr << "error: no VCF header" << endl;
+                return false;
+            }
+            firstRecord = true;
+            break;
+        }
+    } else {
+        tabixFile->getHeader(collected);
+        if (collected.empty()) {
+            cerr << "error: no VCF header" << endl;
+            exit(1);
+        }
+        tabixFile->getNextLine(line);
+        firstRecord = true;
+    }
+    this->vcf_header = collected;
+
+    return parseHeader(collected);
+
+}
+
+// Parses VCF header text and records what it declares on this object.
+//
+// hs holds the header, one header line per '\n'.  A single trailing newline
+// is removed from hs in place, so the caller's string is modified.
+//
+// Effects visible in this function:
+//   - header receives the trimmed text
+//   - for each ##INFO / ##FORMAT line written as <ID=..,Number=..,Type=..,...>
+//     the count and type of the ID are stored in infoCounts/infoTypes or
+//     formatCounts/formatTypes
+//   - a line starting with a single '#' fills sampleNames from the tenth
+//     tab-separated column onwards
+//
+// A malformed INFO/FORMAT line (too few fields, or keys not in the order ID,
+// Number, Type) prints a diagnostic and calls exit(1).  Always returns true.
+bool VariantCallFile::parseHeader(string& hs) {
+
+    // the emptiness test guards size() - 1 against wrapping around
+    if (!hs.empty() && hs[hs.size() - 1] == '\n') {
+        hs.erase(hs.size() - 1, 1); // remove trailing newline
+    }
+    header = hs; // stores the header in the object instance
+
+    vector<string> headerLines = split(header, "\n");
+    for (vector<string>::iterator h = headerLines.begin(); h != headerLines.end(); ++h) {
+        string headerLine = *h;
+        if (headerLine.substr(0,2) == "##") {
+            // meta-information headerLines
+            // TODO parse into map from info/format key to type
+            // ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+            // ##FORMAT=<ID=CB,Number=1,Type=String,Description="Called by S(Sanger), M(UMich), B(BI)">
+            size_t found = headerLine.find_first_of("=");
+            string entryType = headerLine.substr(2, found - 2);
+            // handle reference here, no "<" and ">" given
+                //} else if (entryType == "reference") {
+            size_t dataStart = headerLine.find_first_of("<");
+            size_t dataEnd = headerLine.find_first_of(">");
+            if (dataStart == string::npos || dataEnd == string::npos) {
+                continue;
+            }
+            if (entryType != "INFO" && entryType != "FORMAT") {
+                continue;
+            }
+            string entryData = headerLine.substr(dataStart + 1, dataEnd - dataStart - 1);
+            // XXX bad; this will break if anyone ever moves the order
+            // of the fields around to include a "long form" string
+            // including either a = or , in the first or second field
+            vector<string> fields = split(entryData, "=,");
+            // ID, Number and Type each need a key and a value; check before
+            // indexing so a truncated line cannot read past the vector
+            if (fields.size() < 6) {
+                cerr << "header parse error at:" << endl
+                     << "fields.size() < 6" << endl
+                     << headerLine << endl;
+                exit(1);
+            }
+            if (fields[0] != "ID") {
+                cerr << "header parse error at:" << endl
+                     << "fields[0] != \"ID\"" << endl
+                     << headerLine << endl;
+                exit(1);
+            }
+            string id = fields[1];
+            if (fields[2] != "Number") {
+                cerr << "header parse error at:" << endl
+                     << "fields[2] != \"Number\"" << endl
+                     << headerLine << endl;
+                exit(1);
+            }
+            // initialised so a value that convert() cannot parse never
+            // leaves an indeterminate count in the maps
+            int number = 0;
+            string numberstr = fields[3];
+            // XXX TODO VCF has variable numbers of fields...
+            if (numberstr == "A") {
+                number = ALLELE_NUMBER;
+            } else if (numberstr == "G") {
+                number = GENOTYPE_NUMBER;
+            } else if (numberstr == ".") {
+                number = 1;
+            } else {
+                convert(numberstr, number);
+            }
+            if (fields[4] != "Type") {
+                cerr << "header parse error at:" << endl
+                     << "fields[4] != \"Type\"" << endl
+                     << headerLine << endl;
+                exit(1);
+            }
+            VariantFieldType type = typeStrToVariantFieldType(fields[5]);
+            if (entryType == "INFO") {
+                infoCounts[id] = number;
+                infoTypes[id] = type;
+            } else {
+                formatCounts[id] = number;
+                formatTypes[id] = type;
+            }
+        } else if (headerLine.substr(0,1) == "#") {
+            // field name headerLine
+            vector<string> fields = split(headerLine, '\t');
+            if (fields.size() > 8) {
+                sampleNames.resize(fields.size() - 9);
+                copy(fields.begin() + 9, fields.end(), sampleNames.begin());
+            }
+        }
+    }
+
+    return true;
+}
+
+// Fetches the next record and parses it into var.
+//
+// Returns true when var was filled from a record and false otherwise.  _done
+// is set when the underlying reader has no further line and cleared whenever
+// a record is parsed.
+bool VariantCallFile::getNextVariant(Variant& var) {
+    // `line` may already hold a record left there by header parsing or by
+    // setRegion()
+    const bool bufferedRecord = !line.empty() && line[0] != '#';
+
+    // first request after the header, with no region jump in between
+    if (firstRecord && !justSetRegion) {
+        if (!bufferedRecord) {
+            return false;
+        }
+        var.parse(line, parseSamples);
+        firstRecord = false;
+        _done = false;
+        return true;
+    }
+
+    // first request after setRegion(): consume the line it buffered
+    if (usingTabix && justSetRegion && bufferedRecord) {
+        firstRecord = false;
+        var.parse(line, parseSamples);
+        line.clear();
+        justSetRegion = false;
+        _done = false;
+        return true;
+    }
+
+    // ordinary case: pull a fresh line from whichever reader is active
+    if (usingTabix) {
+        if (!tabixFile->getNextLine(line)) {
+            _done = true;
+            return false;
+        }
+    } else if (!std::getline(*file, line)) {
+        _done = true;
+        return false;
+    }
+    var.parse(line, parseSamples);
+    _done = false;
+    return true;
+}
+
+// Builds a "seq:start" or "seq:start-end" region string (the end is left out
+// when it is zero) and delegates to setRegion(string).
+bool VariantCallFile::setRegion(string seq, long int start, long int end) {
+    stringstream spec;
+    spec << seq << ":" << start;
+    if (end != 0) {
+        spec << "-" << end;
+    }
+    return setRegion(spec.str());
+}
+
+// Positions the tabix reader on `region` and buffers the first line of that
+// region in `line`.
+//
+// The first ".." in the region string is rewritten to "-" before it is passed
+// on.  Returns true (and sets justSetRegion) only when the region was accepted
+// and a line could be read from it.  Calls exit(1) when the file is not
+// tabix-indexed.
+bool VariantCallFile::setRegion(string region) {
+    if (!usingTabix) {
+        cerr << "cannot setRegion on a non-tabix indexed file" << endl;
+        exit(1);
+    }
+
+    // convert between bamtools/freebayes style region string and tabix/samtools style
+    const size_t rangeSep = region.find("..");
+    if (rangeSep != string::npos) {
+        region.replace(rangeSep, 2, "-");
+    }
+
+    if (!tabixFile->setRegion(region)) {
+        return false;
+    }
+    if (!tabixFile->getNextLine(line)) {
+        return false;
+    }
+    justSetRegion = true;
+    return true;
+}
+
+
+// genotype manipulation
+/*
+map<string, int> decomposeGenotype(string& genotype) {
+    string splitter = "/";
+    if (genotype.find("|") != string::npos) {
+        splitter = "|";
+    }
+    vector<string> haps = split(genotype, splitter);
+    map<string, int> decomposed;
+    for (vector<string>::iterator h = haps.begin(); h != haps.end(); ++h) {
+        ++decomposed[*h];
+    }
+    return decomposed;
+}
+*/
+
+// Counts how often each allele index occurs in a GT string.
+//
+// The string is split on '|' when it contains one, otherwise on '/'.  A "."
+// token is counted under NULL_ALLELE; every other token is run through
+// convert() to obtain its integer index.
+map<int, int> decomposeGenotype(const string& genotype) {
+    string splitter = (genotype.find("|") != string::npos) ? "|" : "/";
+    vector<string> haps = split(genotype, splitter);
+
+    map<int, int> decomposed;
+    for (size_t k = 0; k < haps.size(); ++k) {
+        if (haps[k] == ".") {
+            ++decomposed[NULL_ALLELE];
+            continue;
+        }
+        int alleleIndex;
+        convert(haps[k], alleleIndex);
+        ++decomposed[alleleIndex];
+    }
+    return decomposed;
+}
+
+// Returns the allele indexes of a phased GT string in the order written.
+//
+// A "." token becomes NULL_ALLELE; other tokens go through convert().  A
+// genotype with more than one allele and no '|' separator is rejected with a
+// message on cerr and exit(1).
+vector<int> decomposePhasedGenotype(const string& genotype) {
+    const bool phased = genotype.find("|") != string::npos;
+    string splitter = phased ? "|" : "/";
+    vector<string> haps = split(genotype, splitter);
+
+    if (!phased && haps.size() > 1) {
+        cerr << "could not find '|' in genotype, cannot decomposePhasedGenotype on unphased genotypes" << endl;
+        exit(1);
+    }
+
+    vector<int> decomposed;
+    for (size_t k = 0; k < haps.size(); ++k) {
+        if (haps[k] == ".") {
+            decomposed.push_back(NULL_ALLELE);
+            continue;
+        }
+        int alleleIndex;
+        convert(haps[k], alleleIndex);
+        decomposed.push_back(alleleIndex);
+    }
+    return decomposed;
+}
+
+// Renders an allele -> count map as an unphased GT string: each allele index
+// is repeated count times, the indexes are sorted ascending and joined with
+// "/".  NULL_ALLELE is written as ".".
+string genotypeToString(const map<int, int>& genotype) {
+    // expand the counts into a flat list of allele indexes
+    vector<int> alleleList;
+    map<int, int>::const_iterator entry = genotype.begin();
+    while (entry != genotype.end()) {
+        for (int remaining = entry->second; remaining > 0; --remaining) {
+            alleleList.push_back(entry->first);
+        }
+        ++entry;
+    }
+    sort(alleleList.begin(), alleleList.end());
+
+    vector<string> tokens;
+    for (size_t k = 0; k < alleleList.size(); ++k) {
+        if (alleleList[k] == NULL_ALLELE) {
+            tokens.push_back(".");
+        } else {
+            tokens.push_back(convert(alleleList[k]));
+        }
+    }
+    return join(tokens, "/"); // TODO adjust for phased/unphased
+}
+
+// Renders allele indexes, in the given order, as a phased GT string joined
+// with "|".  NULL_ALLELE is written as ".".
+string phasedGenotypeToString(const vector<int>& genotype) {
+    vector<string> tokens;
+    for (size_t k = 0; k < genotype.size(); ++k) {
+        if (genotype[k] == NULL_ALLELE) {
+            tokens.push_back(".");
+        } else {
+            tokens.push_back(convert(genotype[k]));
+        }
+    }
+    return join(tokens, "|");
+}
+
+// True when the decomposed genotype holds more than one distinct allele key.
+bool isHet(const map<int, int>& genotype) {
+    const bool severalAlleles = genotype.size() > 1;
+    return severalAlleles;
+}
+
+// True when the decomposed genotype holds exactly one distinct allele key.
+bool isHom(const map<int, int>& genotype) {
+    const bool singleAllele = genotype.size() == 1;
+    return singleAllele;
+}
+
+// True when any allele key of the decomposed genotype differs from 0 (the
+// reference index).
+bool hasNonRef(const map<int, int>& genotype) {
+    map<int, int>::const_iterator entry = genotype.begin();
+    while (entry != genotype.end() && entry->first == 0) {
+        ++entry;
+    }
+    return entry != genotype.end();
+}
+
+// True for a homozygous genotype (per isHom) without any non-reference
+// allele (per hasNonRef).
+bool isHomRef(const map<int, int>& genotype) {
+    if (!isHom(genotype)) {
+        return false;
+    }
+    return !hasNonRef(genotype);
+}
+
+// True for a homozygous genotype (per isHom) that has a non-reference allele
+// (per hasNonRef).
+bool isHomNonRef(const map<int, int>& genotype) {
+    if (!isHom(genotype)) {
+        return false;
+    }
+    return hasNonRef(genotype);
+}
+
+// True when the decomposed genotype contains the NULL_ALLELE key.
+bool isNull(const map<int, int>& genotype) {
+    return genotype.count(NULL_ALLELE) != 0;
+}
+
+// Sums the per-allele counts of a decomposed genotype.
+int ploidy(const map<int, int>& genotype) {
+    int copies = 0;
+    map<int, int>::const_iterator entry = genotype.begin();
+    while (entry != genotype.end()) {
+        copies += entry->second;
+        ++entry;
+    }
+    return copies;
+}
+
+// Builds a CIGAR string from a list of primitive alleles, such as the ones
+// produced by parsedAlternates.
+//
+// Consecutive alleles whose ref equals their alt are accumulated into a
+// single "M" element.  Any other allele first flushes that run and then adds
+// its own element: equal ref/alt lengths give "X" (or "M" when xForMismatch
+// is false), a longer ref gives "D" and a longer alt gives "I", each with the
+// length difference (or the ref length for substitutions).
+string varCigar(vector<VariantAllele>& vav, bool xForMismatch) {
+    string cigar;
+    bool inMatchRun = false;  // an "M" run is open and not yet written out
+    int matchRunLen = 0;
+
+    for (size_t k = 0; k < vav.size(); ++k) {
+        VariantAllele& va = vav[k];
+
+        if (va.ref == va.alt) {
+            if (inMatchRun) {
+                matchRunLen += va.ref.size();
+            } else {
+                matchRunLen = va.ref.size();
+                inMatchRun = true;
+            }
+            continue;
+        }
+
+        if (inMatchRun) {
+            cigar += convert(matchRunLen) + "M";
+            inMatchRun = false;
+            matchRunLen = 0;
+        }
+
+        const size_t refLen = va.ref.size();
+        const size_t altLen = va.alt.size();
+        if (refLen == altLen) {
+            cigar += convert(refLen) + (xForMismatch ? "X" : "M");
+        } else if (refLen > altLen) {
+            cigar += convert(refLen - altLen) + "D";
+        } else {
+            cigar += convert(altLen - refLen) + "I";
+        }
+    }
+
+    if (inMatchRun) {
+        cigar += convert(matchRunLen) + "M";
+    }
+    return cigar;
+}
+
+// Breaks each ALT allele of this record into primitive alleles relative to REF.
+//
+// Every alternate is aligned to REF with CSmithWatermanGotoh and the resulting
+// CIGAR is walked to emit VariantAlleles: one per aligned base (optionally
+// merged into MNPs), one per insertion and one per deletion.  The returned map
+// is keyed by allele string; the REF key always starts with a ref->ref entry.
+// A record with a single one-base ALT and a one-base REF is returned directly
+// without aligning.
+//
+// Parameters:
+//   includePreviousBaseForIndels  emit an indel together with the base before
+//                                 it, or append it to the preceding allele
+//                                 when that allele already differs from REF
+//   useMNPs                       append an aligned base to the preceding
+//                                 allele when that allele is an equal-length
+//                                 substitution
+//   useEntropy                    enable the aligner's entropy gap penalty
+//   matchScore, mismatchScore,
+//   gapOpenPenalty,
+//   gapExtendPenalty              scores handed to the aligner
+//   repeatGapExtendPenalty        when non-zero, enables the aligner's repeat
+//                                 gap extension penalty with this value
+//   flankingRefLeft/Right         sequence placed around REF and ALT before
+//                                 aligning; when both are empty, synthetic
+//                                 padding with an anchor character is used
+//
+// Calls exit(1) when the alignment is empty or does not start and end with a
+// match covering the padding / flanking sequence.
+map<string, vector<VariantAllele> > Variant::parsedAlternates(bool includePreviousBaseForIndels,
+                                                              bool useMNPs,
+                                                              bool useEntropy,
+                                                              float matchScore,
+                                                              float mismatchScore,
+                                                              float gapOpenPenalty,
+                                                              float gapExtendPenalty,
+                                                              float repeatGapExtendPenalty,
+                                                              string flankingRefLeft,
+                                                              string flankingRefRight) {
+
+    map<string, vector<VariantAllele> > variantAlleles;
+
+    // add the reference allele
+    variantAlleles[ref].push_back(VariantAllele(ref, ref, position));
+
+    // single SNP case, no ambiguity possible, no need to spend a lot of
+    // compute aligning ref and alt fields
+    if (alt.size() == 1 && ref.size() == 1 && alt.front().size() == 1) {
+        variantAlleles[alt.front()].push_back(VariantAllele(ref, alt.front(), position));
+        return variantAlleles;
+    }
+
+    // padding is used to ensure a stable alignment of the alternates to the reference
+    // without having to go back and look at the full reference sequence
+    int paddingLen = max(10, (int) (ref.size()));  // dynamically determine optimum padding length
+    for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
+        paddingLen = max(paddingLen, (int) (a->size()));
+    }
+    char padChar = 'Z';
+    char anchorChar = 'Q';
+    string padding(paddingLen, padChar);
+
+    const bool useFlanks = !(flankingRefLeft.empty() && flankingRefRight.empty());
+
+    // Number of leading / trailing aligned bases that are context, not allele.
+    // They are tracked separately because the two flanks may differ in length;
+    // trimming both ends by the left length leaves stray flank bases behind.
+    int leftPadLen = paddingLen;
+    int rightPadLen = paddingLen;
+
+    // this 'anchored' string is done for stability
+    // the assumption is that there should be a positional match in the first base
+    // this is true for VCF 4.1, and standard best practices
+    // using the anchor char ensures this without other kinds of realignment
+    string reference_M;
+    if (!useFlanks) {
+        reference_M = padding + ref + padding;
+        reference_M[paddingLen] = anchorChar;
+    } else {
+        reference_M = flankingRefLeft + ref + flankingRefRight;
+        leftPadLen = flankingRefLeft.size();
+        rightPadLen = flankingRefRight.size();
+    }
+
+    // passed to sw.Align
+    unsigned int referencePos;
+
+    string cigar;
+
+    for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
+
+        string& alternate = *a;
+        vector<VariantAllele>& variants = variantAlleles[alternate];
+        string alternateQuery_M;
+        if (!useFlanks) {
+            alternateQuery_M = padding + alternate + padding;
+            alternateQuery_M[paddingLen] = anchorChar;
+        } else {
+            alternateQuery_M = flankingRefLeft + alternate + flankingRefRight;
+        }
+
+        if (true) {
+            CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
+            if (useEntropy) sw.EnableEntropyGapPenalty(1);
+            if (repeatGapExtendPenalty != 0) sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty);
+            sw.Align(referencePos, cigar, reference_M, alternateQuery_M);
+        } else {  // disabled for now
+            StripedSmithWaterman::Aligner aligner;
+            StripedSmithWaterman::Filter sswFilter;
+            StripedSmithWaterman::Alignment alignment;
+            aligner.Align(alternateQuery_M.c_str(), reference_M.c_str(), reference_M.size(), sswFilter, &alignment);
+            cigar = alignment.cigar_string;
+        }
+
+        // left-realign the alignment...
+
+        vector<pair<int, string> > cigarData = splitCigar(cigar);
+
+        // an empty CIGAR is rejected here, before front()/back() are touched
+        if (cigarData.empty()
+            || cigarData.front().second != "M" || cigarData.back().second != "M"
+            || cigarData.front().first < leftPadLen || cigarData.back().first < rightPadLen) {
+            cerr << "parsedAlternates: alignment does not start with match over padded sequence" << endl;
+            cerr << cigar << endl;
+            cerr << reference_M << endl;
+            cerr << alternateQuery_M << endl;
+            exit(1);
+        }
+        // strip the context so the CIGAR describes REF vs ALT only
+        cigarData.front().first -= leftPadLen;
+        cigarData.back().first -= rightPadLen;
+        cigar = joinCigar(cigarData);
+
+        int altpos = 0;
+        int refpos = 0;
+
+        for (vector<pair<int, string> >::iterator e = cigarData.begin(); e != cigarData.end(); ++e) {
+
+            int len = e->first;
+            string type = e->second;
+
+            switch (type.at(0)) {
+            case 'I':
+                if (includePreviousBaseForIndels) {
+                    if (!variants.empty() &&
+                        variants.back().ref != variants.back().alt) {
+                        VariantAllele ins = VariantAllele("", alternate.substr(altpos, len), refpos + position);
+                        variants.back() = variants.back() + ins;
+                    } else {
+                        VariantAllele ins = VariantAllele(ref.substr(refpos - 1, 1),
+                                                          alternate.substr(altpos - 1, len + 1),
+                                                          refpos + position - 1);
+                        variants.push_back(ins);
+                    }
+                } else {
+                    variants.push_back(VariantAllele("", alternate.substr(altpos, len), refpos + position));
+                }
+                altpos += len;
+                break;
+            case 'D':
+                if (includePreviousBaseForIndels) {
+                    if (!variants.empty() &&
+                        variants.back().ref != variants.back().alt) {
+                        VariantAllele del = VariantAllele(ref.substr(refpos, len), "", refpos + position);
+                        variants.back() = variants.back() + del;
+                    } else {
+                        VariantAllele del = VariantAllele(ref.substr(refpos - 1, len + 1),
+                                                          alternate.substr(altpos - 1, 1),
+                                                          refpos + position - 1);
+                        variants.push_back(del);
+                    }
+                } else {
+                    variants.push_back(VariantAllele(ref.substr(refpos, len), "", refpos + position));
+                }
+                refpos += len;
+                break;
+            case 'M':
+                {
+                    for (int i = 0; i < len; ++i) {
+                        VariantAllele base = VariantAllele(ref.substr(refpos + i, 1),
+                                                           alternate.substr(altpos + i, 1),
+                                                           refpos + i + position);
+                        // variants starts out empty for every alternate that
+                        // differs from REF, so back() needs the emptiness test
+                        if (useMNPs && !variants.empty()
+                            && variants.back().ref.size() == variants.back().alt.size()
+                            && variants.back().ref != variants.back().alt) {
+                            variants.back() = variants.back() + base;
+                        } else {
+                            variants.push_back(base);
+                        }
+                    }
+                }
+                refpos += len;
+                altpos += len;
+                break;
+            case 'S':
+                refpos += len;
+                altpos += len;
+                break;
+            default:
+                break;
+            }
+
+        }
+    }
+
+    return variantAlleles;
+}
+
+// Returns, for every ALT allele, a single VariantAllele spanning the whole
+// REF -> ALT change at this record's position (no alignment is performed).
+map<string, vector<VariantAllele> > Variant::flatAlternates(void) {
+    map<string, vector<VariantAllele> > byAlternate;
+    for (size_t k = 0; k < alt.size(); ++k) {
+        byAlternate[alt[k]].push_back(VariantAllele(ref, alt[k], position));
+    }
+    return byAlternate;
+}
+
+// Returns the ALT alleles as a set (duplicates collapse, order is that of
+// std::set).
+set<string> Variant::altSet(void) {
+    return set<string>(alt.begin(), alt.end());
+}
+
+// Streams a VariantAllele as "<position> <ref> -> <alt>".
+ostream& operator<<(ostream& out, VariantAllele& var) {
+    out << var.position;
+    out << " " << var.ref;
+    out << " -> " << var.alt;
+    return out;
+}
+
+// Concatenates two alleles: ref and alt strings are appended (a then b) and
+// the position of the left operand is kept.
+VariantAllele operator+(const VariantAllele& a, const VariantAllele& b) {
+    VariantAllele joined(a.ref + b.ref, a.alt + b.alt, a.position);
+    return joined;
+}
+
+// Orders VariantAlleles by their repr member.
+bool operator<(const VariantAllele& a, const VariantAllele& b) {
+    const bool aFirst = a.repr < b.repr;
+    return aFirst;
+}
+
+// Maps every diploid genotype over the indexes of `alleles`, written as an
+// ordered pair (j, k) with j <= k, to the value k * (k + 1) / 2 + j.
+// The genotypes themselves come from multichoose(2, indexes).
+map<pair<int, int>, int> Variant::getGenotypeIndexesDiploid(void) {
+
+    vector<int> indexes;
+    for (int i = 0; i < alleles.size(); ++i) {
+        indexes.push_back(i);
+    }
+
+    int ploidy = 2; // ONLY diploid
+    vector<vector<int> > genotypes = multichoose(ploidy, indexes);
+
+    map<pair<int, int>, int> genotypeIndexes;
+    for (size_t g = 0; g < genotypes.size(); ++g) {
+        vector<int>& genotype = genotypes[g];
+        // enforce e.g. 0/1, 0/2, 1/2 ordering over reverse
+        sort(genotype.begin(), genotype.end());
+        // XXX this does not handle non-diploid!!!!
+        const int lower = genotype.front();
+        const int upper = genotype.back();
+        genotypeIndexes[make_pair(lower, upper)] = (upper * (upper + 1) / 2) + lower;
+    }
+    return genotypeIndexes;
+
+}
+
+// Rebuilds altAlleleIndexes so that each ALT string maps to its 0-based
+// position in `alt`.
+void Variant::updateAlleleIndexes(void) {
+    altAlleleIndexes.clear();
+    for (size_t k = 0; k < alt.size(); ++k) {
+        altAlleleIndexes[alt[k]] = static_cast<int>(k);
+    }
+}
+
+// TODO only works on "A"llele variant fields
+//
+// Removes one ALT allele from this record and rewrites everything in the
+// record that refers to alleles by index:
+//   1. INFO values of fields whose declared count is ALLELE_NUMBER
+//   2. per-sample FORMAT values of fields whose declared count is ALLELE_NUMBER
+//   3. each sample's GT (indexes renumbered, removed allele -> NULL_ALLELE)
+//   4. each sample's GL (entries that glReorder maps to -1 are dropped)
+//   5. alt, alleles and altAlleleIndexes
+//
+// altAllele is looked up through getAltAlleleIndex().
+// NOTE(review): behaviour for an allele that is not among the ALTs depends on
+// getAltAlleleIndex(), which is not visible here -- confirm.
+void Variant::removeAlt(string& altAllele) {
+
+    int altIndex = getAltAlleleIndex(altAllele);  // this is the alt-relative index, 0-based
+
+    // 1. INFO: drop the value at altIndex from every per-allele field
+    for (map<string, int>::iterator c = vcf->infoCounts.begin(); c != vcf->infoCounts.end(); ++c) {
+        int count = c->second;
+        if (count == ALLELE_NUMBER) {
+            string key = c->first;
+            map<string, vector<string> >::iterator v = info.find(key);
+            if (v != info.end()) {
+                vector<string>& vals = v->second;
+                vector<string> tokeep;
+                int i = 0;
+                for (vector<string>::iterator a = vals.begin(); a != vals.end(); ++a, ++i) {
+                    if (i != altIndex) {
+                        tokeep.push_back(*a);
+                    }
+                }
+                vals = tokeep;
+            }
+        }
+    }
+
+    // 2. FORMAT: same filtering, applied to every sample that carries the field
+    for (map<string, int>::iterator c = vcf->formatCounts.begin(); c != vcf->formatCounts.end(); ++c) {
+        int count = c->second;
+        if (count == ALLELE_NUMBER) {
+            string key = c->first;
+            for (map<string, map<string, vector<string> > >::iterator s = samples.begin();
+                 s != samples.end(); ++s) {
+                map<string, vector<string> >& sample = s->second;
+                map<string, vector<string> >::iterator v = sample.find(key);
+                if (v != sample.end()) {
+                    vector<string>& vals = v->second;
+                    vector<string> tokeep;
+                    int i = 0;
+                    for (vector<string>::iterator a = vals.begin(); a != vals.end(); ++a, ++i) {
+                        if (i != altIndex) {
+                            tokeep.push_back(*a);
+                        }
+                    }
+                    vals = tokeep;
+                }
+            }
+        }
+    }
+
+    int altSpecIndex = altIndex + 1; // this is the genotype-spec index, ref=0, 1-based for alts
+
+    // Build the surviving ALT list together with an old-index -> new-index
+    // table in genotype numbering; the removed allele maps to NULL_ALLELE.
+    vector<string> newalt;
+    map<int, int> alleleIndexMapping;
+    // setup the new alt string
+    alleleIndexMapping[0] = 0; // reference allele remains the same
+    alleleIndexMapping[NULL_ALLELE] = NULL_ALLELE; // null allele remains the same
+    int i = 1; // current index
+    int j = 1; // new index
+    for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a, ++i) {
+        if (i != altSpecIndex) {
+            newalt.push_back(*a);
+            // get the mapping between new and old allele indexes
+            alleleIndexMapping[i] = j;
+            ++j;
+        } else {
+            alleleIndexMapping[i] = NULL_ALLELE;
+        }
+    }
+
+    // 3. GT: renumber the alleles of every sample and remember each sample's
+    //    ploidy (number of GT tokens) for the GL step below
+    // fix the sample genotypes, removing reference to the old allele
+    map<string, int> samplePloidy;
+    for (map<string, map<string, vector<string> > >::iterator s = samples.begin(); s != samples.end(); ++s) {
+        map<string, vector<string> >& sample = s->second;
+        if (sample.find("GT") != sample.end()) {
+            string& gt = sample["GT"].front();
+            string splitter = "/";
+            if (gt.find("|") != string::npos) {
+                splitter = "|";
+            }
+
+            if (splitter == "/") {
+                // unphased: work on allele counts, order is regenerated
+                samplePloidy[s->first] = split(gt, splitter).size();
+                map<int, int> genotype = decomposeGenotype(sample["GT"].front());
+                map<int, int> newGenotype;
+                for (map<int, int>::iterator g = genotype.begin(); g != genotype.end(); ++g) {
+                    newGenotype[alleleIndexMapping[g->first]] += g->second;
+                }
+                sample["GT"].clear();
+                sample["GT"].push_back(genotypeToString(newGenotype));
+            } else {
+                // phased: keep the allele order as written
+                samplePloidy[s->first] = split(gt, splitter).size();
+                vector<int> genotype = decomposePhasedGenotype(sample["GT"].front());
+                vector<int> newGenotype;
+                for (vector<int>::iterator g = genotype.begin(); g != genotype.end(); ++g) {
+                    newGenotype.push_back(alleleIndexMapping[*g]);
+                }
+                sample["GT"].clear();
+                sample["GT"].push_back(phasedGenotypeToString(newGenotype));
+            }
+        }
+    }
+
+    // distinct ploidies seen among the samples that have a GT
+    set<int> ploidies;
+    for (map<string, int>::iterator p = samplePloidy.begin(); p != samplePloidy.end(); ++p) {
+        ploidies.insert(p->second);
+    }
+
+    // 4. GL: one glReorder() table per ploidy, computed against the allele
+    //    count before removal (alt.size() + 1 includes the reference)
+    // fix the sample genotype likelihoods, removing reference to the old allele
+    // which GL fields should we remove?
+    vector<int> toRemove;
+    toRemove.push_back(altSpecIndex);
+    map<int, map<int, int> > glMappingByPloidy;
+    for (set<int>::iterator p = ploidies.begin(); p != ploidies.end(); ++p) {
+        glMappingByPloidy[*p] = glReorder(*p, alt.size() + 1, alleleIndexMapping, toRemove);
+    }
+
+    for (map<string, map<string, vector<string> > >::iterator s = samples.begin(); s != samples.end(); ++s) {
+        map<string, vector<string> >& sample = s->second;
+        map<string, vector<string> >::iterator glsit = sample.find("GL");
+        if (glsit != sample.end()) {
+            vector<string>& gls = glsit->second; // should be split already
+            map<int, string> newgls;
+            // operator[] inserts an empty table for a sample without a GT;
+            // every lookup in it then yields 0, so all of its GLs are kept
+            map<int, int>& newOrder = glMappingByPloidy[samplePloidy[s->first]];
+            int i = 0;
+            for (vector<string>::iterator g = gls.begin(); g != gls.end(); ++g, ++i) {
+                int j = newOrder[i];
+                // kept entries are keyed by their old index, which preserves
+                // their original relative order when read back below
+                if (j != -1) {
+                    newgls[i] = *g;
+                }
+            }
+            // update the gls
+            gls.clear();
+            for (map<int, string>::iterator g = newgls.begin(); g != newgls.end(); ++g) {
+                gls.push_back(g->second);
+            }
+        }
+    }
+
+    // 5. commit the new allele lists
+    // reset the alt
+    alt = newalt;
+
+    // and the alleles
+    alleles.clear();
+    alleles.push_back(ref);
+    alleles.insert(alleles.end(), alt.begin(), alt.end());
+
+    updateAlleleIndexes();
+
+}
+
+// Merges two VCF headers given as newline-separated text.
+//
+// The result holds every line of s1 except its "#CHROM" column line, then the
+// "##INFO" lines of s2 that do not occur verbatim in s1 (in std::set order),
+// and finally the "#CHROM" line of s1.  Calls exit(1) when s1 has no "#CHROM"
+// line.
+string unionInfoHeaderLines(string& s1, string& s2) {
+    vector<string> first = split(s1, "\n");
+    vector<string> second = split(s2, "\n");
+
+    // INFO lines of s2; the ones s1 already has are removed below
+    set<string> extraInfo;
+    for (size_t k = 0; k < second.size(); ++k) {
+        if (second[k].compare(0, 6, "##INFO") == 0) {
+            extraInfo.insert(second[k]);
+        }
+    }
+
+    vector<string> merged;
+    string columnLine; // this one needs to be at the end
+    for (size_t k = 0; k < first.size(); ++k) {
+        const string& current = first[k];
+        extraInfo.erase(current);
+        if (current.compare(0, 6, "#CHROM") == 0) {
+            columnLine = current;
+        } else {
+            merged.push_back(current);
+        }
+    }
+    merged.insert(merged.end(), extraInfo.begin(), extraInfo.end());
+
+    if (columnLine.empty()) {
+        cerr << "could not find CHROM POS ... header line" << endl;
+        exit(1);
+    }
+    merged.push_back(columnLine);
+    return join(merged, "\n");
+}
+
+// Concatenates two CIGAR strings.  When the last element of c1 and the first
+// element of c2 have the same operation they are fused into one element.
+//
+// Either input may be empty (or contain no complete element); the result is
+// then the other CIGAR as re-rendered by joinCigar.
+string mergeCigar(const string& c1, const string& c2) {
+    vector<pair<int, string> > merged = splitCigar(c1);
+    vector<pair<int, string> > tail = splitCigar(c2);
+
+    // back()/front() are only valid on non-empty vectors
+    if (!merged.empty() && !tail.empty()
+        && merged.back().second == tail.front().second) {
+        merged.back().first += tail.front().first;
+        tail.erase(tail.begin());
+    }
+    merged.insert(merged.end(), tail.begin(), tail.end());
+    return joinCigar(merged);
+}
+
+// Splits a CIGAR string such as "10M2I3D" into (length, operation) pairs.
+//
+// Digits accumulate into the length and every other character into the
+// operation; a digit that follows an operation closes the current element.
+// A trailing length without an operation is dropped.
+vector<pair<int, string> > splitCigar(const string& cigarStr) {
+    vector<pair<int, string> > cigar;
+    string number;
+    string type;
+    // strings go [Number][Type] ...
+    for (string::const_iterator s = cigarStr.begin(); s != cigarStr.end(); ++s) {
+        const char c = *s;
+        // isdigit() is only defined for values representable as unsigned
+        // char (or EOF); plain char may be negative
+        if (!isdigit(static_cast<unsigned char>(c))) {
+            type += c;
+            continue;
+        }
+        if (!type.empty()) {
+            // signal for next token, push back the last pair, clean up
+            cigar.push_back(make_pair(atoi(number.c_str()), type));
+            number.clear();
+            type.clear();
+        }
+        number += c;
+    }
+    if (!number.empty() && !type.empty()) {
+        cigar.push_back(make_pair(atoi(number.c_str()), type));
+    }
+    return cigar;
+}
+
+// Splits a CIGAR string such as "10M2I3D" into a list of (length, operation)
+// pairs.
+//
+// Digits accumulate into the length and every other character into the
+// operation; a digit that follows an operation closes the current element.
+// A trailing length without an operation is dropped.
+list<pair<int, string> > splitCigarList(const string& cigarStr) {
+    list<pair<int, string> > cigar;
+    string number;
+    string type;
+    // strings go [Number][Type] ...
+    for (string::const_iterator s = cigarStr.begin(); s != cigarStr.end(); ++s) {
+        const char c = *s;
+        // isdigit() is only defined for values representable as unsigned
+        // char (or EOF); plain char may be negative
+        if (!isdigit(static_cast<unsigned char>(c))) {
+            type += c;
+            continue;
+        }
+        if (!type.empty()) {
+            // signal for next token, push back the last pair, clean up
+            cigar.push_back(make_pair(atoi(number.c_str()), type));
+            number.clear();
+            type.clear();
+        }
+        number += c;
+    }
+    if (!number.empty() && !type.empty()) {
+        cigar.push_back(make_pair(atoi(number.c_str()), type));
+    }
+    return cigar;
+}
+
+// Returns a copy of the CIGAR holding only the elements with a length
+// greater than zero, in their original order.
+vector<pair<int, string> > cleanCigar(const vector<pair<int, string> >& cigar) {
+    vector<pair<int, string> > kept;
+    for (size_t k = 0; k < cigar.size(); ++k) {
+        if (cigar[k].first <= 0) {
+            continue;
+        }
+        kept.push_back(cigar[k]);
+    }
+    return kept;
+}
+
+// Renders (length, operation) pairs as a CIGAR string; elements whose length
+// is zero are left out.
+string joinCigar(const vector<pair<int, string> >& cigar) {
+    string rendered;
+    for (size_t k = 0; k < cigar.size(); ++k) {
+        if (cigar[k].first == 0) {
+            continue;
+        }
+        rendered += convert(cigar[k].first) + cigar[k].second;
+    }
+    return rendered;
+}
+
+// Renders (length, operation character) pairs as a CIGAR string; elements
+// whose length is zero are left out.
+string joinCigar(const vector<pair<int, char> >& cigar) {
+    string rendered;
+    for (size_t k = 0; k < cigar.size(); ++k) {
+        if (cigar[k].first == 0) {
+            continue;
+        }
+        rendered += convert(cigar[k].first);
+        rendered += cigar[k].second;
+    }
+    return rendered;
+}
+
+// Renders a list of (length, operation) pairs as a CIGAR string.  Every
+// element is written, including those with a length of zero.
+string joinCigarList(const list<pair<int, string> >& cigar) {
+    string rendered;
+    list<pair<int, string> >::const_iterator elem = cigar.begin();
+    while (elem != cigar.end()) {
+        rendered += convert(elem->first) + elem->second;
+        ++elem;
+    }
+    return rendered;
+}
+
+// Sums the lengths of the 'M', 'D' and 'X' elements of a CIGAR.
+int cigarRefLen(const vector<pair<int, char> >& cigar) {
+    int total = 0;
+    for (size_t k = 0; k < cigar.size(); ++k) {
+        switch (cigar[k].second) {
+        case 'M':
+        case 'D':
+        case 'X':
+            total += cigar[k].first;
+            break;
+        default:
+            break;
+        }
+    }
+    return total;
+}
+
+// Sums the lengths of the "M", "D" and "X" elements of a CIGAR.
+int cigarRefLen(const vector<pair<int, string> >& cigar) {
+    int total = 0;
+    for (size_t k = 0; k < cigar.size(); ++k) {
+        const string& op = cigar[k].second;
+        const bool counted = (op == "M") || (op == "D") || (op == "X");
+        if (counted) {
+            total += cigar[k].first;
+        }
+    }
+    return total;
+}
+
+// True when the element's length is zero.
+bool isEmptyCigarElement(const pair<int, string>& elem) {
+    return !elem.first;
+}
+
+// Recursive helper: enumerates the genotypes of the given ploidy over the
+// allele indexes 0 .. alts-1.  Each genotype is a list of allele indexes in
+// non-increasing order (largest first).
+//
+// A non-positive ploidy yields an empty result instead of recursing without
+// end.
+list<list<int> > _glorder(int ploidy, int alts) {
+    list<list<int> > results;
+    if (ploidy <= 0) {
+        return results;
+    }
+    if (ploidy == 1) {
+        for (int n = 0; n < alts; ++n) {
+            list<int> single;
+            single.push_back(n);
+            results.push_back(single);
+        }
+        return results;
+    }
+
+    // the shorter genotypes do not depend on n, so compute them once
+    const list<list<int> > shorter = _glorder(ploidy - 1, alts);
+    for (int n = 0; n < alts; ++n) {
+        for (list<list<int> >::const_iterator v = shorter.begin(); v != shorter.end(); ++v) {
+            if (v->front() <= n) {
+                results.push_back(*v);
+                results.back().push_front(n);
+            }
+        }
+    }
+    return results;
+}
+
+// Genotype-likelihood ordering of genotypes. Each genotype is a list of
+// integers (as written in the GT field). The genotypes come from
+// _glorder(ploidy, alts); each one is then reversed in place.
+list<list<int> > glorder(int ploidy, int alts) {
+    list<list<int> > genotypes = _glorder(ploidy, alts);
+    list<list<int> >::iterator g = genotypes.begin();
+    while (g != genotypes.end()) {
+        g->reverse();
+        ++g;
+    }
+    return genotypes;
+}
+
+// Which genotype likelihoods would include this alternate allele.
+// Returns, in ascending order, the 0-based positions within
+// glorder(ploidy, numalts) of the genotypes that contain `alt` at least once.
+list<int> glsWithAlt(int alt, int ploidy, int numalts) {
+    list<int> indexes;
+    list<list<int> > genotypes = glorder(ploidy, numalts);
+    int position = 0;
+    for (list<list<int> >::iterator g = genotypes.begin(); g != genotypes.end(); ++g) {
+        bool containsAlt = false;
+        // stop scanning the genotype as soon as the allele is seen
+        for (list<int>::iterator a = g->begin(); !containsAlt && a != g->end(); ++a) {
+            containsAlt = (*a == alt);
+        }
+        if (containsAlt) {
+            indexes.push_back(position);
+        }
+        ++position;
+    }
+    return indexes;
+}
+
+// Describes the mapping between the old GL ordering and a new one in which
+// the GLs that include a removed alt are gone.
+// The result maps every old GL index to its new index; -1 means "remove".
+//
+// alleleIndexMapping is read with operator[], so an allele index missing from
+// it is inserted into the caller's map with value 0.
+// NOTE(review): the original author left the remark "mapping is wrong..." in
+// this function; the result has not been independently validated.
+map<int, int> glReorder(int ploidy, int numalts, map<int, int>& alleleIndexMapping, vector<int>& altsToRemove) {
+    // Old genotypes, rewritten in terms of the new allele indexes.
+    list<list<int> > remapped = glorder(ploidy, numalts);
+    for (list<list<int> >::iterator g = remapped.begin(); g != remapped.end(); ++g) {
+        for (list<int>::iterator a = g->begin(); a != g->end(); ++a) {
+            *a = alleleIndexMapping[*a];
+        }
+    }
+
+    // Position of every genotype in the ordering for the reduced allele count.
+    list<list<int> > reduced = glorder(ploidy, numalts - altsToRemove.size());
+    map<list<int>, int> reducedPosition;
+    int newIndex = 0;
+    for (list<list<int> >::iterator g = reduced.begin(); g != reduced.end(); ++g) {
+        reducedPosition[*g] = newIndex;
+        ++newIndex;
+    }
+
+    // A remapped old genotype keeps its GL only if it appears in the reduced ordering.
+    map<int, int> mapping;
+    int oldIndex = 0;
+    for (list<list<int> >::iterator g = remapped.begin(); g != remapped.end(); ++g) {
+        map<list<int>, int>::iterator hit = reducedPosition.find(*g);
+        if (hit == reducedPosition.end()) {
+            mapping[oldIndex] = -1;
+        } else {
+            mapping[oldIndex] = hit->second;
+        }
+        ++oldIndex;
+    }
+    return mapping;
+}
+
+// Returns the first value of the "GT" field stored for `sample`.
+// Returns an empty string when the sample is unknown, has no "GT" entry, or
+// its "GT" entry holds no values. The last case is checked explicitly because
+// front() on an empty vector is undefined behaviour.
+string Variant::getGenotype(string& sample) {
+    map<string, map<string, vector<string> > >::iterator s = samples.find(sample);
+    if (s == samples.end()) {
+        return "";
+    }
+    map<string, vector<string> >::iterator f = s->second.find("GT");
+    if (f == s->second.end() || f->second.empty()) {
+        return "";
+    }
+    return f->second.front();
+}
+
+// Returns false as soon as any sample has a "GT" value longer than one
+// character that contains no '|'; returns true otherwise.
+// Samples without a "GT" entry, or whose "GT" entry holds no values, are
+// skipped. The empty-vector check prevents calling front() on an empty vector.
+bool Variant::isPhased(void) {
+    for (map<string, map<string, vector<string> > >::iterator s = samples.begin(); s != samples.end(); ++s) {
+        map<string, vector<string> >& sample = s->second;
+        map<string, vector<string> >::iterator g = sample.find("GT");
+        if (g == sample.end() || g->second.empty()) {
+            continue;
+        }
+        // bind by reference: the genotype string is only inspected
+        const string& gt = g->second.front();
+        if (gt.size() > 1 && gt.find("|") == string::npos) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Returns the stored position shifted down by one.
+long Variant::zeroBasedPosition(void) {
+    const long shifted = position - 1;
+    return shifted;
+}
+
+// Builds a tab-separated record of the variant: sequence name, position and
+// the comma-joined list of all alleles.
+string Variant::vrepr(void) {
+    const string positionText = convert(position);
+    const string alleleText = join(alleles, ",");
+    return sequenceName + "\t" + positionText + "\t" + alleleText;
+}
+
+// TODO
+/*
+vector<Variant*> Variant::matchingHaplotypes() {
+
+    int haplotypeStart = var.position;
+    int haplotypeEnd = var.position + var.ref.size();
+
+    for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
+        haplotypeStart = min((*v)->position, (long int) haplotypeStart);
+        haplotypeEnd = max((*v)->position + (*v)->ref.size(), (long unsigned int) haplotypeEnd);
+    }
+
+    // for everything overlapping and the current variant, construct the local haplotype within the bounds
+    // if there is an exact match, the allele in the current VCF does intersect
+
+    string referenceHaplotype = reference.getSubSequence(var.sequenceName, haplotypeStart - 1, haplotypeEnd - haplotypeStart);
+    map<string, vector<pair<Variant*, int> > > haplotypes; // map to variant and alt index
+
+    for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
+        Variant& variant = **v;
+        int altindex = 0;
+        for (vector<string>::iterator a = variant.alt.begin(); a != variant.alt.end(); ++a, ++altindex) {
+            string haplotype = referenceHaplotype;
+            // get the relative start and end coordinates for the variant alternate allele
+            int relativeStart = variant.position - haplotypeStart;
+            haplotype.replace(relativeStart, variant.ref.size(), *a);
+            haplotypes[haplotype].push_back(make_pair(*v, altindex));
+        }
+    }
+
+    Variant originalVar = var;
+
+    // determine the non-intersecting alts
+    vector<string> altsToRemove;
+    vector<int> altIndexesToRemove;
+    for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+        string haplotype = referenceHaplotype;
+        int relativeStart = var.position - haplotypeStart;
+        haplotype.replace(relativeStart, var.ref.size(), *a);
+        map<string, vector<pair<Variant*, int> > >::iterator h = haplotypes.find(haplotype);
+        if ((intersecting && !invert && h == haplotypes.end())
+            || (intersecting && invert && h != haplotypes.end())
+            || (unioning && h != haplotypes.end())) {
+            if (tag.empty() && mergeToTag.empty()) {
+                altsToRemove.push_back(*a);
+            } else {
+                if (!tag.empty()) {
+                    var.info[tag].push_back(".");
+                }
+                if (!mergeToTag.empty()) {
+                    var.info[mergeToTag].push_back(".");
+                }
+            }
+        } else {
+            if (!tag.empty()) {
+                var.info[tag].push_back(tagValue);
+            }
+            // NB: just take the first value for the mergeFromTag
+            if (!mergeToTag.empty()) {
+                Variant* v = h->second.front().first;
+                int index = h->second.front().second;
+                if (v->info.find(mergeFromTag) != v->info.end()) {
+                    // now you have to find the exact allele...
+                    string& otherValue = v->info[mergeFromTag].at(index);
+                    var.info[mergeToTag].push_back(otherValue);
+                } else if (mergeFromTag == "QUAL") {
+                    var.info[mergeToTag].push_back(convert(v->quality));
+                } else {
+                    var.info[mergeToTag].push_back(".");
+                }
+            }
+        }
+    }
+
+    // remove the non-overlapping (intersecting) or overlapping (unioning) alts
+    if (intersecting && loci && altsToRemove.size() != var.alt.size()) {
+        // we have a match in loci mode, so we should output the whole loci, not just the matching sequence
+    } else {
+        for (vector<string>::iterator a = altsToRemove.begin(); a != altsToRemove.end(); ++a) {
+            var.removeAlt(*a);
+        }
+    }
+
+    if (unioning) {
+
+        // somehow sort the records and combine them?
+        map<long int, vector<Variant*> > variants;
+        for (vector<Variant*>::iterator o = overlapping.begin(); o != overlapping.end(); ++o) {
+            if ((*o)->position <= var.position && // check ensures proper ordering of variants on output
+                outputVariants.find(*o) == outputVariants.end()) {
+                outputVariants.insert(*o);
+                variants[(*o)->position].push_back(*o);
+            }
+        }
+        // add in the current variant, if it has alts left
+        if (!var.alt.empty()) {
+            vector<Variant*>& vars = variants[var.position];
+            int numalts = 0;
+            for (vector<Variant*>::iterator v = vars.begin(); v != vars.end(); ++v) {
+                numalts += (*v)->alt.size();
+            }
+            if (numalts + var.alt.size() == originalVar.alt.size()) {
+                variants[var.position].clear();
+                variants[var.position].push_back(&originalVar);
+            } else {
+                variants[var.position].push_back(&var);
+            }
+        }
+
+        for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) {
+            for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) {
+                cout << **o << endl;
+                lastOutputPosition = max(lastOutputPosition, (*o)->position);
+            }
+        }
+    } else {
+        // if any alts remain, output the variant record
+        if (!var.alt.empty()) {
+            cout << var << endl;
+            lastOutputPosition = max(lastOutputPosition, var.position);
+        }
+    }
+
+}
+*/
+
+
+    // Sets up the fixed column names and the ordered sets of recognised
+    // meta-information keys. Every recognised key starts out mapped to an
+    // empty value, which getHeaderString() leaves out of its output.
+    VCFHeader::VCFHeader()
+    {
+        // mandatory columns, in output order
+        static const char* const kColumns[] = {
+            "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"
+        };
+        // keys that hold a single line each; the order is the output order
+        static const char* const kLineNames[] = {
+            "##fileFormat", "##fileDate", "##source", "##reference",
+            "##contig", "##phasing", "##assembly"
+        };
+        // keys that hold a list of lines each; the order is the output order
+        static const char* const kListNames[] = {
+            "##info", "##filter", "##format", "##alt",
+            "##sample", "##pedigree", "##pedigreedb"
+        };
+
+        for (size_t i = 0; i < sizeof(kColumns) / sizeof(kColumns[0]); ++i)
+        {
+            this->header_columns.push_back(kColumns[i]);
+        }
+
+        for (size_t i = 0; i < sizeof(kLineNames) / sizeof(kLineNames[0]); ++i)
+        {
+            const string name(kLineNames[i]);
+            this->header_line_names_ordered.push_back(name);
+            this->header_lines[name] = "";
+        }
+
+        for (size_t i = 0; i < sizeof(kListNames) / sizeof(kListNames[0]); ++i)
+        {
+            const string name(kListNames[i]);
+            this->header_list_names_ordered.push_back(name);
+            this->header_lists[name] = vector<string>(0);
+        }
+    }
+
+    // Stores a meta-information line under its key, where the key is
+    // everything before the first '=' (or the whole line when there is none).
+    //  - If the key is one of header_lines' keys, the stored line is replaced.
+    //  - Otherwise, if the key is one of header_lists' keys and no line with
+    //    the same ID is stored yet, the line is appended to that list.
+    //  - Any other key is silently ignored.
+    // NOTE(review): keys are matched against the spellings registered by the
+    // constructor (e.g. "##fileFormat", "##info"); confirm that the maps'
+    // comparison accepts the spelling used in real headers.
+    void VCFHeader::addMetaInformationLine(const string& meta_line)
+    {
+        // keep find()'s native size_type: narrowing it would mangle string::npos
+        const string::size_type key_end = meta_line.find("=", 0);
+        const string meta_line_prefix = meta_line.substr(0, key_end);
+
+        if (this->header_lines.find(meta_line_prefix) != this->header_lines.end())
+        {
+            // single-line key: replace what was there
+            this->header_lines[meta_line_prefix] = meta_line;
+        }
+        else if (this->header_lists.find(meta_line_prefix) != this->header_lists.end() &&
+            !metaInfoIdExistsInVector(meta_line, this->header_lists[meta_line_prefix]))
+        {
+            // list key: append unless a line with the same ID is already stored
+            this->header_lists[meta_line_prefix].push_back(meta_line);
+        }
+    }
+
+    // Builds the header text from scratch on every call:
+    //   1. each non-empty single-line entry, in header_line_names_ordered order,
+    //   2. every entry of each list, in header_list_names_ordered order,
+    //   3. the column names joined by tabs and terminated by a newline.
+    // Every emitted line ends with "\n".
+    string VCFHeader::getHeaderString()
+    {
+        string header_string;
+
+        // single-line entries; empty ones were never set and are skipped
+        for (vector<string>::const_iterator name = this->header_line_names_ordered.begin(); name != this->header_line_names_ordered.end(); ++name)
+        {
+            const string& line = this->header_lines[*name];
+            if (!line.empty())
+            {
+                header_string += line + "\n";
+            }
+        }
+
+        // list entries; bound by reference so the lists are not copied
+        for (vector<string>::const_iterator name = this->header_list_names_ordered.begin(); name != this->header_list_names_ordered.end(); ++name)
+        {
+            const vector<string>& entries = this->header_lists[*name];
+            for (vector<string>::const_iterator entry = entries.begin(); entry != entries.end(); ++entry)
+            {
+                header_string += (*entry) + "\n";
+            }
+        }
+
+        // column names; index-based so an empty column list is handled safely
+        const size_t column_count = this->header_columns.size();
+        for (size_t i = 0; i < column_count; ++i)
+        {
+            header_string += this->header_columns[i];
+            header_string += (i + 1 == column_count) ? "\n" : "\t";
+        }
+        return header_string;
+    }
+
+    // Returns the ID token of a meta line: the text from the first "ID=" up to
+    // (not including) the next ','. If there is no later ',', the token runs to
+    // the end of the line. Returns "" when the line has no "ID=".
+    static string extractMetaLineIdToken(const string& line)
+    {
+        const size_t token_begin = line.find("ID=", 0);
+        const size_t token_end = line.find(",", token_begin);
+        if (token_begin < token_end)
+        {
+            return line.substr(token_begin, token_end - token_begin);
+        }
+        return "";
+    }
+
+    // True when some entry of meta_lines has the same ID token as meta_line,
+    // compared case-insensitively.
+    // Note: lines without an "ID=" yield an empty token, so two such lines
+    // compare as equal.
+    bool VCFHeader::metaInfoIdExistsInVector(const string& meta_line, vector<string>& meta_lines)
+    {
+        const string wanted = extractMetaLineIdToken(meta_line);
+
+        for (size_t i = 0; i < meta_lines.size(); ++i)
+        {
+            const string candidate = extractMetaLineIdToken(meta_lines[i]);
+            if (strcasecmp(wanted.c_str(), candidate.c_str()) == 0)
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // Appends header_column to header_columns unless an equal entry is
+    // already present.
+    void VCFHeader::addHeaderColumn(const string& header_column)
+    {
+        const bool already_present =
+            find(this->header_columns.begin(), this->header_columns.end(), header_column) != this->header_columns.end();
+        if (already_present)
+        {
+            return;  // don't add duplicates
+        }
+        this->header_columns.push_back(header_column);
+    }
+
+} // end namespace vcf
diff --git a/src/Variant.h b/src/Variant.h
new file mode 100644
index 0000000..307ca84
--- /dev/null
+++ b/src/Variant.h
@@ -0,0 +1,586 @@
+#ifndef __VARIANT_H
+#define __VARIANT_H
+
+#include <vector>
+#include <list>
+#include <map>
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <utility>
+#include <stdlib.h>
+#include <assert.h>
+#include <stack>
+#include <queue>
+#include <set>
+#include "split.h"
+#include "join.h"
+#include "tabixpp/tabix.hpp"
+#include "smithwaterman/SmithWatermanGotoh.h"
+#include "smithwaterman/disorder.h"
+#include "ssw_cpp.h"
+#include "convert.h"
+#include "multichoose/multichoose.h"
+extern "C" {
+    #include "filevercmp/filevercmp.h"
+}
+
+using namespace std;
+
+namespace vcf {
+
+class Variant;
+
+// Value type of a field. Used as the mapped type of
+// VariantCallFile::infoTypes and VariantCallFile::formatTypes, and returned by
+// Variant::infoType() / Variant::formatType().
+enum VariantFieldType { FIELD_FLOAT = 0
+                      , FIELD_INTEGER
+                      , FIELD_BOOL
+                      , FIELD_STRING
+                      , FIELD_UNKNOWN
+                      };
+
+// Negative sentinel values for a field's value count.
+// NOTE(review): presumably stored in infoCounts / formatCounts for fields
+// whose count depends on the number of alleles or genotypes; confirm against
+// the header parser.
+enum VariantFieldNumber { ALLELE_NUMBER = -2
+                        , GENOTYPE_NUMBER = -1
+                        };
+
+const int INDEX_NONE = -1;
+const int NULL_ALLELE = -1;
+
+VariantFieldType typeStrToFieldType(string& typeStr);
+ostream& operator<<(ostream& out, VariantFieldType type);
+
+typedef map<string, map<string, vector<string> > > Samples;
+typedef vector<pair<int, string> > Cigar;
+
+// Reader state for one VCF input: the input source (a plain stream or a Tabix
+// handle), the raw header text and the per-field metadata.
+//
+// All pointer and flag members are given defined values on construction, so
+// is_open(), done() and eof() are safe to call on a file that was never opened.
+class VariantCallFile {
+
+public:
+
+    istream* file;      // stream set by openFile() / open(stream); NULL until then
+    Tabix* tabixFile;   // handle created by openTabix(); NULL until then
+
+    bool usingTabix;    // set by openTabix(); the destructor frees tabixFile only when true
+    string vcf_header;
+
+
+    string header;
+    string line; // the current line
+    string fileformat;
+    string fileDate;
+    string source;
+    string reference;
+    string phasing;
+    map<string, VariantFieldType> infoTypes;
+    map<string, int> infoCounts;
+    map<string, VariantFieldType> formatTypes;
+    map<string, int> formatCounts;
+    vector<string> sampleNames;
+    bool parseSamples;
+    bool _done;
+
+    void updateSamples(vector<string>& newSampleNames);
+    string headerWithSampleNames(vector<string>& newSamples); // non-destructive, for output
+    void addHeaderLine(string line);
+    void removeInfoHeaderLine(string line);
+    void removeGenoHeaderLine(string line);
+    vector<string> infoIds(void);
+    vector<string> formatIds(void);
+
+    // Opens `filename`, choosing the Tabix reader when the last '.'-separated
+    // part of the name is "gz" or "bgz" and the plain-file reader otherwise.
+    // Returns whether the header was parsed.
+    bool open(string& filename) {
+        vector<string> filenameParts = split(filename, ".");
+        // guard against an empty split result before looking at the last part
+        const bool compressed = !filenameParts.empty()
+            && (filenameParts.back() == "gz" || filenameParts.back() == "bgz");
+        if (compressed) {
+            return openTabix(filename);
+        } else {
+            return openFile(filename);
+        }
+    }
+
+    // Opens `filename` as a plain file and parses its header.
+    // Returns false without parsing when the file cannot be opened.
+    bool openFile(string& filename) {
+        file = &_file;
+        _file.open(filename.c_str(), ifstream::in);
+        if (!_file.is_open()) {
+            parsedHeader = false;
+            return false;
+        }
+        parsedHeader = parseHeader();
+        return parsedHeader;
+    }
+
+    // Opens `filename` through Tabix and parses its header. A handle left by
+    // an earlier openTabix() call is released first.
+    bool openTabix(string& filename) {
+        if (usingTabix) {
+            delete tabixFile;
+            tabixFile = NULL;
+        }
+        usingTabix = true;
+        tabixFile = new Tabix(filename);
+        parsedHeader = parseHeader();
+        return parsedHeader;
+    }
+
+    // Reads from a caller-owned stream; the stream must outlive this object's use of it.
+    bool open(istream& stream) {
+        file = &stream;
+        parsedHeader = parseHeader();
+        return parsedHeader;
+    }
+
+    // Same as open(istream&), for callers that hold an ifstream.
+    bool open(ifstream& stream) {
+        file = &stream;
+        parsedHeader = parseHeader();
+        return parsedHeader;
+    }
+
+    // Parses `headerStr` as the header without attaching an input source.
+    bool openForOutput(string& headerStr) {
+        parsedHeader = parseHeader(headerStr);
+        return parsedHeader;
+    }
+
+    // Initializers are listed in member declaration order.
+    VariantCallFile(void)
+        : file(NULL)
+        , tabixFile(NULL)
+        , usingTabix(false)
+        , parseSamples(true)
+        , _done(false)
+        , firstRecord(false)
+        , justSetRegion(false)
+        , usingFile(false)
+        , parsedHeader(false)
+    { }
+
+    ~VariantCallFile(void) {
+        if (usingTabix) {
+            delete tabixFile;
+        }
+    }
+
+    // True once a header has been parsed successfully.
+    bool is_open(void) { return parsedHeader; }
+
+    // End-of-file state of the stream being read: the attached stream when
+    // one is set, otherwise the internal file stream.
+    bool eof(void) { return file ? file->eof() : _file.eof(); }
+
+    bool done(void) { return _done; }
+
+    bool parseHeader(string& headerStr);
+
+    bool parseHeader(void);
+
+    bool getNextVariant(Variant& var);
+
+    bool setRegion(string region);
+    bool setRegion(string seq, long int start, long int end = 0);
+    vector<string> getHeaderLinesFromFile();
+
+private:
+    bool firstRecord;
+    bool justSetRegion;
+    bool usingFile;
+    ifstream _file;     // backing stream used by openFile()
+    bool parsedHeader;  // result of the most recent parseHeader() call
+
+};
+
+// One reference/alternate pair at a position, plus a text form of the triple
+// in `repr`, formatted as "<position>:<ref>/<alt>".
+class VariantAllele {
+    friend ostream& operator<<(ostream& out, VariantAllele& var);
+    friend bool operator<(const VariantAllele& a, const VariantAllele& b);
+    friend VariantAllele operator+(const VariantAllele& a, const VariantAllele& b);
+public:
+    string ref;
+    string alt;
+    string repr;    // "<position>:<ref>/<alt>", built once by the constructor
+    long position;
+    /* // TODO
+    bool isSNP(void);
+    bool isMNP(void);
+    bool isInsertion(void);
+    bool isDeletion(void);
+    bool isIndel(void);
+    */
+    // Stores the three values and derives `repr` from them.
+    VariantAllele(string r, string a, long p)
+        : ref(r)
+        , alt(a)
+        , position(p)
+    {
+        stringstream text;
+        text << position;
+        text << ":" << ref;
+        text << "/" << alt;
+        repr = text.str();
+    }
+};
+
+// One VCF record: location, alleles, quality/filter, INFO fields and
+// per-sample fields, plus a pointer to the VariantCallFile it belongs to.
+//
+// Both constructors give `position`, `quality` and `vcf` defined values so a
+// freshly constructed Variant never exposes indeterminate data.
+class Variant {
+
+    friend ostream& operator<<(ostream& out, Variant& var);
+
+public:
+
+    string sequenceName;
+    long position;
+    long zeroBasedPosition(void);
+    string id;
+    string ref;
+    vector<string> alt;      // a list of all the alternate alleles present at this locus
+    vector<string> alleles;  // a list of all alleles (ref + alt) at this locus
+                             // the indices are organized such that the genotype codes (0,1,2,etc.)
+                             // correspond to the correct offset into the alleles vector.
+                             // that is, alleles[0] = ref, alleles[1] = first alternate allele, etc.
+    string vrepr(void);  // a comparable record of the variation described by the record
+    set<string> altSet(void);  // set of alleles, rather than vector of them
+    map<string, int> altAlleleIndexes;  // reverse lookup for alleles
+    map<string, vector<VariantAllele> > parsedAlternates(bool includePreviousBaseForIndels = false,
+                                                         bool useMNPs = false,
+                                                         bool useEntropy = false,
+                                                         float matchScore = 10.0f,
+                                                         float mismatchScore = -9.0f,
+                                                         float gapOpenPenalty = 15.0f,
+                                                         float gapExtendPenalty = 6.66f,
+                                                         float repeatGapExtendPenalty = 0.0f,
+                                                         string flankingRefLeft = "",
+                                                         string flankingRefRight = "");
+    // the same output format as parsedAlternates, without parsing
+    map<string, vector<VariantAllele> > flatAlternates(void);
+
+    map<string, string> extendedAlternates(long int newPosition, long int length);
+
+    string originalLine; // the literal of the record, as read
+    // TODO
+    // the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j
+    // vector<pair<int, int> > genotypes;  // indexes into the alleles, ordered as per the spec
+    string filter;
+    double quality;
+    VariantFieldType infoType(string& key);
+    map<string, vector<string> > info;  // vector<string> allows for lists by Genotypes or Alternates
+    map<string, bool> infoFlags;
+    VariantFieldType formatType(string& key);
+    vector<string> format;
+    map<string, map<string, vector<string> > > samples;  // vector<string> allows for lists by Genotypes or Alternates
+    vector<string> sampleNames;
+    vector<string> outputSampleNames;
+    VariantCallFile* vcf;  // owning file; NULL until set by a constructor or setVariantCallFile()
+
+    //void addInfoInt(string& tag, int value);
+    //void addInfoFloat(string& tag, double value);
+    //void addInfoString(string& tag, string& value);
+
+    void removeAlt(string& altallele);
+
+public:
+
+    // Detached variant: no file attached, position and quality zeroed.
+    Variant()
+        : position(0)
+        , quality(0)
+        , vcf(NULL)
+    { }
+
+    // Variant bound to `v`; input and output sample names are copied from it.
+    // Initializers are listed in member declaration order.
+    Variant(VariantCallFile& v)
+        : position(0)
+        , quality(0)
+        , sampleNames(v.sampleNames)
+        , outputSampleNames(v.sampleNames)
+        , vcf(&v)
+    { }
+
+    void setVariantCallFile(VariantCallFile& v);
+    void setVariantCallFile(VariantCallFile* v);
+
+    void parse(string& line, bool parseSamples = true);
+    void addFilter(string& tag);
+    bool getValueBool(string& key, string& sample, int index = INDEX_NONE);
+    double getValueFloat(string& key, string& sample, int index = INDEX_NONE);
+    string getValueString(string& key, string& sample, int index = INDEX_NONE);
+    bool getSampleValueBool(string& key, string& sample, int index = INDEX_NONE);
+    double getSampleValueFloat(string& key, string& sample, int index = INDEX_NONE);
+    string getSampleValueString(string& key, string& sample, int index = INDEX_NONE);
+    bool getInfoValueBool(string& key, int index = INDEX_NONE);
+    double getInfoValueFloat(string& key, int index = INDEX_NONE);
+    string getInfoValueString(string& key, int index = INDEX_NONE);
+    void printAlt(ostream& out);      // print a comma-sep list of alternate alleles to an ostream
+    void printAlleles(ostream& out);  // print a comma-sep list of *all* alleles to an ostream
+    int getAltAlleleIndex(string& allele);
+    void updateAlleleIndexes(void);
+    void addFormatField(string& key);
+    void setOutputSampleNames(vector<string>& outputSamples);
+    map<pair<int, int>, int> getGenotypeIndexesDiploid(void);
+    int getNumSamples(void);
+    int getNumValidGenotypes(void);
+    string getGenotype(string& sample);  // first "GT" value of the sample, or "" if absent
+    bool isPhased(void);                 // false if any multi-character GT lacks '|'
+    // TODO
+    //void setInfoField(string& key, string& val);
+
+private:
+
+    string lastFormat;
+
+};
+
+
+// from BamTools
+// RuleToken implementation
+
+// A single token of a filter expression: an operator, a parenthesis, or an
+// operand carrying a numeric, string or boolean value.
+class RuleToken {
+
+public:
+
+    // enums
+    enum RuleTokenType { OPERAND = 0
+                       , NUMBER
+                       , BOOLEAN_VARIABLE
+                       , NUMERIC_VARIABLE
+                       , STRING_VARIABLE
+                       , AND_OPERATOR
+                       , OR_OPERATOR
+                       , ADD_OPERATOR
+                       , SUBTRACT_OPERATOR
+                       , MULTIPLY_OPERATOR
+                       , DIVIDE_OPERATOR
+                       , NOT_OPERATOR
+                       , EQUAL_OPERATOR
+                       , GREATER_THAN_OPERATOR
+                       , LESS_THAN_OPERATOR
+                       , LEFT_PARENTHESIS
+                       , RIGHT_PARENTHESIS
+                       };
+
+    // constructor
+    RuleToken(string token, map<string, VariantFieldType>& variables);
+
+    // Default token: a boolean variable whose state is false. The remaining
+    // scalar members are zeroed so copies never carry indeterminate values.
+    // Initializers are listed in member declaration order.
+    RuleToken(void)
+        : type(BOOLEAN_VARIABLE)
+        , number(0)
+        , state(false)
+        , isVariable(false)
+    { }
+
+    // data members
+    RuleTokenType type;
+    string value;
+
+    double number;
+    string str;
+    bool state;
+
+    bool isVariable; // if this is a variable
+    //bool isEvaluated; // when we evaluate variables
+
+    RuleToken apply(RuleToken& other);
+
+};
+
+// Precedence of an operator or parenthesis token; a larger value is returned
+// for multiply/divide than for add/subtract, and so on down to 0 for
+// parentheses. Any other token type prints "invalid token type" to cerr and
+// terminates the process with exit status 1.
+inline int priority(const RuleToken& token) {
+    switch (token.type) {
+        case RuleToken::MULTIPLY_OPERATOR:
+        case RuleToken::DIVIDE_OPERATOR:
+            return 8;
+        case RuleToken::ADD_OPERATOR:
+        case RuleToken::SUBTRACT_OPERATOR:
+            return 7;
+        case RuleToken::NOT_OPERATOR:
+            return 6;
+        case RuleToken::EQUAL_OPERATOR:
+        case RuleToken::GREATER_THAN_OPERATOR:
+        case RuleToken::LESS_THAN_OPERATOR:
+            return 5;
+        case RuleToken::AND_OPERATOR:
+            return 4;
+        case RuleToken::OR_OPERATOR:
+            return 3;
+        case RuleToken::LEFT_PARENTHESIS:
+        case RuleToken::RIGHT_PARENTHESIS:
+            return 0;
+        default:
+            break;
+    }
+    cerr << "invalid token type" << endl;
+    exit(1);
+}
+
+// True for the NOT operator and for a left parenthesis.
+inline bool isRightAssociative(const RuleToken& token) {
+    switch (token.type) {
+        case RuleToken::NOT_OPERATOR:
+        case RuleToken::LEFT_PARENTHESIS:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// True for every token that is neither the NOT operator nor a left parenthesis.
+inline bool isLeftAssociative(const RuleToken& token) {
+    return token.type != RuleToken::NOT_OPERATOR &&
+           token.type != RuleToken::LEFT_PARENTHESIS;
+}
+
+// True when the token is a left parenthesis.
+inline bool isLeftParenthesis(const RuleToken& token) {
+    const bool opens = (RuleToken::LEFT_PARENTHESIS == token.type);
+    return opens;
+}
+
+// True when the token is a right parenthesis.
+inline bool isRightParenthesis(const RuleToken& token) {
+    const bool closes = (RuleToken::RIGHT_PARENTHESIS == token.type);
+    return closes;
+}
+
+// True for value-carrying tokens: OPERAND, NUMBER and the three *_VARIABLE types.
+inline bool isOperand(const RuleToken& token) {
+    switch (token.type) {
+        case RuleToken::OPERAND:
+        case RuleToken::NUMBER:
+        case RuleToken::NUMERIC_VARIABLE:
+        case RuleToken::STRING_VARIABLE:
+        case RuleToken::BOOLEAN_VARIABLE:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// True for the logical, comparison and arithmetic operator token types.
+// Parentheses are not operators.
+inline bool isOperator(const RuleToken& token) {
+    switch (token.type) {
+        case RuleToken::AND_OPERATOR:
+        case RuleToken::OR_OPERATOR:
+        case RuleToken::NOT_OPERATOR:
+        case RuleToken::EQUAL_OPERATOR:
+        case RuleToken::GREATER_THAN_OPERATOR:
+        case RuleToken::LESS_THAN_OPERATOR:
+        case RuleToken::MULTIPLY_OPERATOR:
+        case RuleToken::DIVIDE_OPERATOR:
+        case RuleToken::ADD_OPERATOR:
+        case RuleToken::SUBTRACT_OPERATOR:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// True when c is one of the operator characters: ! & | = > < * / + -
+inline bool isOperatorChar(const char& c) {
+    switch (c) {
+        case '!':
+        case '&':
+        case '|':
+        case '=':
+        case '>':
+        case '<':
+        case '*':
+        case '/':
+        case '+':
+        case '-':
+            return true;
+        default:
+            return false;
+    }
+}
+
+// True when c is an opening or closing round parenthesis.
+inline bool isParanChar(const char& c) {
+    switch (c) {
+        case '(':
+        case ')':
+            return true;
+        default:
+            return false;
+    }
+}
+
+// True when the token's type is NUMERIC_VARIABLE.
+inline bool isNumeric(const RuleToken& token) {
+    const bool numeric = (RuleToken::NUMERIC_VARIABLE == token.type);
+    return numeric;
+}
+
+// True when the token's type is STRING_VARIABLE.
+inline bool isString(const RuleToken& token) {
+    const bool textual = (RuleToken::STRING_VARIABLE == token.type);
+    return textual;
+}
+
+// True when the token's type is BOOLEAN_VARIABLE.
+inline bool isBoolean(const RuleToken& token) {
+    const bool boolean = (RuleToken::BOOLEAN_VARIABLE == token.type);
+    return boolean;
+}
+
+// True when the token's type is one of the three variable types
+// (numeric, string or boolean).
+inline bool isVariable(const RuleToken& token) {
+    switch (token.type) {
+        case RuleToken::NUMERIC_VARIABLE:
+        case RuleToken::STRING_VARIABLE:
+        case RuleToken::BOOLEAN_VARIABLE:
+            return true;
+        default:
+            return false;
+    }
+}
+
+void tokenizeFilterSpec(string& filterspec, stack<RuleToken>& tokens, map<string, VariantFieldType>& variables);
+
+
+// A filter built from a textual specification and applied to Variant records.
+// Only declarations are visible here; the notes on members below restate the
+// original annotations.
+class VariantFilter {
+
+public:
+
+    // Which kind of filter this is.
+    // NOTE(review): presumably SAMPLE filters test per-sample fields and
+    // RECORD filters test record-level fields; confirm in the implementation.
+    enum VariantFilterType { SAMPLE = 0,
+                             RECORD };
+
+    string spec;
+    queue<RuleToken> tokens; // tokens, infix notation
+    queue<RuleToken> rules;  // tokens, prefix notation
+    VariantFilterType type;
+    // `variables` maps variable names to their field types.
+    VariantFilter(string filterspec, VariantFilterType filtertype, map<string, VariantFieldType>& variables);
+    bool passes(Variant& var, string& sample); // all alts pass
+    bool passes(Variant& var, string& sample, string& allele);
+    void removeFilteredGenotypes(Variant& var, bool keepInfo);
+
+};
+
+
+// genotype manipulation
+//
+// Everything in this section is a declaration; behaviour is defined in the
+// implementation file.  Two genotype representations appear in the signatures:
+// vector<int> (the "phased" functions) and map<int, int>.
+// NOTE(review): presumably the vector holds allele indexes in order and the map
+// holds allele index -> count -- confirm against the implementation.
+
+// TODO
+//map<string, int> decomposeGenotype(string& genotype);
+
+vector<int> decomposePhasedGenotype(const string& genotype);
+map<int, int> decomposeGenotype(const string& genotype);
+
+string genotypeToString(const map<int, int>& genotype);
+
+string phasedGenotypeToString(const vector<int>& genotype);
+
+bool isHet(const map<int, int>& genotype);
+
+bool isHom(const map<int, int>& genotype);
+
+bool hasNonRef(const map<int, int>& genotype);
+
+bool isHomRef(const map<int, int>& genotype);
+
+bool isHomNonRef(const map<int, int>& genotype);
+
+bool isNull(const map<int, int>& genotype);
+
+int ploidy(const map<int, int>& genotype);
+
+// Both arguments are non-const references although the result is returned by value.
+string unionInfoHeaderLines(string& s1, string& s2);
+
+// genotype likelihood ordering
+
+list<list<int> > glorder(int ploidy, int alleles);
+// NOTE(review): same signature as glorder(); the leading underscore suggests an
+// internal helper -- confirm how the two differ.
+list<list<int> > _glorder(int ploidy, int alleles);
+list<int> glsWithAlt(int alt, int ploidy, int numalts);
+map<int, int> glReorder(int ploidy, int numalts, map<int, int>& alleleIndexMapping, vector<int>& altsToRemove);
+
+// Takes and returns a reference to a vector<string>.
+// NOTE(review): presumably de-duplicates `strings` in place and returns it -- confirm.
+vector<string>& unique(vector<string>& strings);
+
+// CIGAR helpers.  cigarRefLen() and joinCigar() are each overloaded for
+// operations stored as char and as string; xForMismatch defaults to false.
+string varCigar(vector<VariantAllele>& vav, bool xForMismatch = false);
+string mergeCigar(const string& c1, const string& c2);
+vector<pair<int, string> > splitCigar(const string& cigarStr);
+list<pair<int, string> > splitCigarList(const string& cigarStr);
+int cigarRefLen(const vector<pair<int, char> >& cigar);
+int cigarRefLen(const vector<pair<int, string> >& cigar);
+vector<pair<int, string> > cleanCigar(const vector<pair<int, string> >& cigar);
+string joinCigar(const vector<pair<int, string> >& cigar);
+string joinCigar(const vector<pair<int, char> >& cigar);
+string joinCigarList(const list<pair<int, string> >& cigar);
+bool isEmptyCigarElement(const pair<int, string>& elem);
+
+// Comparator for sorting and for maps keyed by chromosome name: a is ordered
+// before b exactly when filevercmp() on their C strings returns a negative value.
+class ChromNameCompare {
+public:
+    bool operator()(const string& a, const string& b) const {
+        if (filevercmp(a.c_str(), b.c_str()) < 0) {
+            return true;
+        }
+        return false;
+    }
+};
+
+/*
+ * Collects header columns and ## meta-information lines and renders them back
+ * as one VCF header string (see getHeaderString).  Instances cannot be copied:
+ * the copy constructor and copy assignment operator are declared private and,
+ * per their comments, intentionally left unimplemented.
+ */
+class VCFHeader
+{
+public:
+    VCFHeader();
+    ~VCFHeader() {}
+
+    /*
+     * Adds header_column to this->header_columns if
+     * it doesn't already exist.
+     */
+    void addHeaderColumn(const string& header_column);
+
+    /*
+     * Adds meta_line to either header_lines or header_lists.
+     *
+     * We parse out the ##_type_ from meta_line
+     * - If the meta_line ##_type_ is a key in header_lines then meta_line is added to header_lines
+     * - If the meta_line ##_type_ is a key in header_lists then meta_line is added to header_lists[##_type_] vector<string>
+     *    Unless that header_lists[##_type_] vector already contains the ID that is in meta_line, in that case it is not added
+     */
+    void addMetaInformationLine(const string& meta_line);
+
+    /*
+     * Converts header_lines, header_lists and header_columns to a proper VCF header
+     */
+    string getHeaderString();
+
+private:
+    VCFHeader(const VCFHeader& vcfHeader); // Do not implement the copy constructor, there is no reason to add this functionality
+    VCFHeader& operator=(const VCFHeader& vcfHeader); // Do not implement operator=, there is no reason to add this functionality
+
+    /*
+     * This is a helper function that determines if the ID substring contained in meta_line
+     * exists as an ID substring within the vector<string> meta_lines. Returns true if
+     * the ID exists within the vector and false otherwise.
+     */
+    bool metaInfoIdExistsInVector(const string& meta_line, vector<string>& meta_lines);
+
+    /*
+     * header_line_names_ordered contains all the header lines that
+     * are available and in the expected order for a valid VCF file
+     */
+    vector<string> header_line_names_ordered;
+    /*
+     * header_list_names_ordered contains all the header lists that
+     * are available and in the expected order for a valid VCF file
+     */
+    vector<string> header_list_names_ordered;
+
+    /*
+     * header_columns is set by the constructor to contain the 8 mandatory VCF fields.
+     * Also, unique header_columns for each of the vcf files are added as well.
+     * Duplicates are not allowed, to prevent duplicates use addHeaderColumn when adding header columns
+     */
+    vector<string> header_columns;
+
+    /*
+     * the maps we're going to be using will be case-insensitive
+     * so that "fileFormat" and "fileformat" map to the same item.
+     *
+     * stringcasecmp is a less-than comparator: it compares the two strings
+     * lexicographically after passing each character through tolower().
+     * NOTE(review): std::binary_function was deprecated in C++11 and removed in
+     * C++17; this will need attention if the build moves to a newer standard.
+     */
+    struct stringcasecmp : binary_function<string, string, bool> {
+        // Per-character comparison; taking unsigned char keeps the tolower() argument non-negative.
+        struct charcasecmp : public std::binary_function<unsigned char, unsigned char, bool> {
+            bool operator() (const unsigned char& c1, const unsigned char& c2) const {
+                return tolower (c1) < tolower (c2); 
+            }
+        };
+        bool operator() (const std::string & s1, const std::string & s2) const {
+            return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(), s2.end(), charcasecmp());
+        }
+    };
+
+    // contains all the ##_types_ as keys, the value is either empty or a VCF file has set it
+    map<string, string, stringcasecmp> header_lines; 
+
+    // contains all the ##_types_ as keys, the value is a vector of ##_type_ (since there can be duplicate #INFO for example, duplicate ids are not allowed)
+    map<string, vector<string>, stringcasecmp> header_lists; 
+
+};
+
+} // end namespace VCF
+
+#endif
diff --git a/src/convert.h b/src/convert.h
new file mode 100644
index 0000000..d73d518
--- /dev/null
+++ b/src/convert.h
@@ -0,0 +1,22 @@
+#ifndef __CONVERT_H
+#define __CONVERT_H
+
+#include <sstream>
+
+// Extracts a value of type T from s with operator>> and stores it in r.
+// Returns true when the stream is at end-of-input afterwards, i.e. when the
+// extraction consumed s to its last character; leftover characters such as
+// the "abc" in "12abc" make it return false.
+// NOTE(review): only eofbit is inspected.  An extraction that fails after
+// reaching the end of the input (for instance s == "") also sets eofbit and is
+// therefore reported as success; callers may depend on this, so check them
+// before tightening the test to also require !fail().
+template<typename T>
+bool convert(const std::string& s, T& r) {
+    std::istringstream input(s);
+    input >> r;
+    return input.eof();
+}
+
+// Formats r with operator<< into a string stream and returns the resulting text.
+template<typename T>
+std::string convert(const T& r) {
+    std::ostringstream buffer;
+    buffer << r;
+    const std::string text = buffer.str();
+    return text;
+}
+
+#endif
diff --git a/src/join.h b/src/join.h
new file mode 100644
index 0000000..c46a75f
--- /dev/null
+++ b/src/join.h
@@ -0,0 +1,36 @@
+#ifndef __JOIN_H
+#define __JOIN_H
+
+// functions to join the elements of a container into one delimited string
+#include <string>
+#include <vector>
+#include <sstream>
+#include <string.h>
+
+// Joins the elements of a vector into one string, writing delim between
+// consecutive elements.  ostream<< must be defined for both class S and T, as
+// it is e.g. in the case of strings and character arrays.
+// An empty vector yields an empty string.
+template<class S, class T>
+std::string join(std::vector<T>& elems, S& delim) {
+    std::stringstream ss;
+    typename std::vector<T>::iterator e = elems.begin();
+    // Nothing to dereference when the vector is empty.
+    if (e == elems.end()) {
+        return ss.str();
+    }
+    // The first element is written without a leading delimiter.
+    ss << *e++;
+    for (; e != elems.end(); ++e) {
+        ss << delim << *e;
+    }
+    return ss.str();
+}
+
+// Joins the elements of a list into one string, writing delim between
+// consecutive elements.  ostream<< must be defined for both class S and T.
+// An empty list yields an empty string.
+// Note: this header does not include <list> itself, so the including file
+// has to provide std::list before this template is seen.
+template<class S, class T>
+std::string join(std::list<T>& elems, S& delim) {
+    std::stringstream ss;
+    typename std::list<T>::iterator e = elems.begin();
+    // Nothing to dereference when the list is empty.
+    if (e == elems.end()) {
+        return ss.str();
+    }
+    // The first element is written without a leading delimiter.
+    ss << *e++;
+    for (; e != elems.end(); ++e) {
+        ss << delim << *e;
+    }
+    return ss.str();
+}
+
+#endif
diff --git a/src/mt19937ar.h b/src/mt19937ar.h
new file mode 100644
index 0000000..3f239e1
--- /dev/null
+++ b/src/mt19937ar.h
@@ -0,0 +1,192 @@
+/* 
+   A C-program for MT19937, with initialization improved 2002/1/26.
+   Coded by Takuji Nishimura and Makoto Matsumoto.
+
+   Before using, initialize the state by using init_genrand(seed)  
+   or init_by_array(init_key, key_length).
+
+   Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+   All rights reserved.                          
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+     1. Redistributions of source code must retain the above copyright
+        notice, this list of conditions and the following disclaimer.
+
+     2. Redistributions in binary form must reproduce the above copyright
+        notice, this list of conditions and the following disclaimer in the
+        documentation and/or other materials provided with the distribution.
+
+     3. The names of its contributors may not be used to endorse or promote 
+        products derived from this software without specific prior written 
+        permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+   Any feedback is very welcome.
+   http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+   email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
+*/
+
+#include <stdio.h>
+
+/* Period parameters */  
+/* Note: N and M are plain macros and stay defined for whatever code follows
+   the inclusion of this file. */
+#define N 624
+#define M 397
+#define MATRIX_A 0x9908b0dfUL   /* constant vector a */
+#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
+#define LOWER_MASK 0x7fffffffUL /* least significant r bits */
+
+/* Generator state.  Both objects are file-static and are read and written by
+   the functions below with no locking, so all callers in a translation unit
+   share one generator. */
+static unsigned long mt[N]; /* the array for the state vector  */
+static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */
+
+/* Seeds the generator: fills all N words of mt[] from the seed s and leaves
+   mti equal to N, so the next genrand_int32() call regenerates the state.
+   Every word is reduced to its low 32 bits, which matters where unsigned long
+   is wider than 32 bits. */
+void init_genrand(unsigned long s)
+{
+    unsigned long prev = s & 0xffffffffUL;
+
+    mt[0] = prev;
+    for (mti = 1; mti < N; mti++) {
+        /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
+        /* In the previous versions, MSBs of the seed affect   */
+        /* only MSBs of the array mt[].                        */
+        /* 2002/01/09 modified by Makoto Matsumoto             */
+        prev = (1812433253UL * (prev ^ (prev >> 30)) + mti) & 0xffffffffUL;
+        mt[mti] = prev;
+    }
+}
+
+/* initialize by an array with array-length */
+/* init_key is the array for initializing keys */
+/* key_length is its length */
+/* slight change for C++, 2004/2/26 */
+/* The state is first seeded with init_genrand(19650218UL), then mixed with the
+   key in max(N, key_length) steps, then scrambled in a further N-1 steps. */
+/* NOTE(review): init_key[0] is read even when key_length is 0 or negative, */
+/* so callers presumably must pass key_length >= 1 -- confirm. */
+void init_by_array(unsigned long init_key[], int key_length)
+{
+    int i, j, k;
+    init_genrand(19650218UL);
+    i=1; j=0;
+    k = (N>key_length ? N : key_length);
+    for (; k; k--) {
+        mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL))
+          + init_key[j] + j; /* non linear */
+        mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
+        i++; j++;
+        /* i wraps to 1 (mt[0] takes the last word); j cycles through the key */
+        if (i>=N) { mt[0] = mt[N-1]; i=1; }
+        if (j>=key_length) j=0;
+    }
+    for (k=N-1; k; k--) {
+        mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL))
+          - i; /* non linear */
+        mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
+        i++;
+        if (i>=N) { mt[0] = mt[N-1]; i=1; }
+    }
+
+    mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */ 
+}
+
+/* generates a random number on [0,0xffffffff]-interval */
+/* Returns the next word of mt[] after tempering.  When all N words have been
+   handed out (mti >= N) the whole state array is regenerated first; if no
+   seeding function was ever called, init_genrand(5489UL) is applied. */
+unsigned long genrand_int32(void)
+{
+    unsigned long y;
+    static unsigned long mag01[2]={0x0UL, MATRIX_A};
+    /* mag01[x] = x * MATRIX_A  for x=0,1 */
+
+    if (mti >= N) { /* generate N words at one time */
+        int kk;
+
+        if (mti == N+1)   /* if init_genrand() has not been called, */
+            init_genrand(5489UL); /* a default initial seed is used */
+
+        /* words 0 .. N-M-1 combine with the word M places ahead */
+        for (kk=0;kk<N-M;kk++) {
+            y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
+            mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];
+        }
+        /* words N-M .. N-2 combine with the word N-M places behind (kk+(M-N)) */
+        for (;kk<N-1;kk++) {
+            y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
+            mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1UL];
+        }
+        /* the last word wraps around to mt[0] */
+        y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);
+        mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL];
+
+        mti = 0;
+    }
+  
+    y = mt[mti++];
+
+    /* Tempering */
+    y ^= (y >> 11);
+    y ^= (y << 7) & 0x9d2c5680UL;
+    y ^= (y << 15) & 0xefc60000UL;
+    y ^= (y >> 18);
+
+    return y;
+}
+
+/* Random number on the [0,0x7fffffff]-interval: one genrand_int32() draw
+   with its lowest bit shifted out. */
+long genrand_int31(void)
+{
+    const unsigned long draw = genrand_int32();
+    return (long)(draw >> 1);
+}
+
+/* Random number on the [0,1]-real-interval: one genrand_int32() draw
+   divided by 2^32-1. */
+double genrand_real1(void)
+{
+    const unsigned long draw = genrand_int32();
+    return draw * (1.0 / 4294967295.0);
+}
+
+/* Random number on the [0,1)-real-interval: one genrand_int32() draw
+   divided by 2^32. */
+double genrand_real2(void)
+{
+    const unsigned long draw = genrand_int32();
+    return draw * (1.0 / 4294967296.0);
+}
+
+/* Random number on the (0,1)-real-interval: one genrand_int32() draw,
+   offset by one half and divided by 2^32. */
+double genrand_real3(void)
+{
+    const double draw = (double)genrand_int32();
+    return (draw + 0.5) * (1.0 / 4294967296.0);
+}
+
+/* Random number on [0,1) with 53-bit resolution, assembled from the top 27
+   bits of one draw and the top 26 bits of the next (in that order). */
+double genrand_res53(void) 
+{ 
+    const unsigned long upper = genrand_int32() >> 5;
+    const unsigned long lower = genrand_int32() >> 6;
+    return (upper * 67108864.0 + lower) * (1.0 / 9007199254740992.0);
+} 
+/* These real versions are due to Isaku Wada, 2002/01/09 added */
+
+/*
+int main(void)
+{
+    int i;
+    unsigned long init[4]={0x123, 0x234, 0x345, 0x456}, length=4;
+    init_by_array(init, length);
+    printf("1000 outputs of genrand_int32()\n");
+    for (i=0; i<1000; i++) {
+      printf("%10lu ", genrand_int32());
+      if (i%5==4) printf("\n");
+    }
+    printf("\n1000 outputs of genrand_real2()\n");
+    for (i=0; i<1000; i++) {
+      printf("%10.8f ", genrand_real2());
+      if (i%5==4) printf("\n");
+    }
+    return 0;
+}
+*/
diff --git a/src/split.cpp b/src/split.cpp
new file mode 100644
index 0000000..831dfcd
--- /dev/null
+++ b/src/split.cpp
@@ -0,0 +1,23 @@
+#include "split.h"
+
+
+// Splits s on the single character delim.  The pieces are added to elems by
+// tokenize(), called with a one-character delimiter set and its default
+// trimEmpty setting; elems itself is returned.
+std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
+    const std::string delims(1, delim);
+    tokenize(s, elems, delims);
+    return elems;
+}
+
+// Splits s on the single character delim and returns the pieces as a new vector.
+std::vector<std::string> split(const std::string &s, char delim) {
+    std::vector<std::string> elems;
+    split(s, delim, elems);
+    // Return the local by name so the compiler may elide or move it; returning
+    // the reference handed back by the overload above would force a full copy.
+    return elems;
+}
+
+// Splits s at every character that occurs in delims.  The pieces are added to
+// elems by tokenize() with empty pieces kept (trimEmpty == false); elems itself
+// is returned.
+std::vector<std::string> &split(const std::string &s, const std::string& delims, std::vector<std::string> &elems) {
+    const bool trimEmpty = false;
+    tokenize(s, elems, delims, trimEmpty);
+    return elems;
+}
+
+// Splits s at every character that occurs in delims and returns the pieces as
+// a new vector.
+std::vector<std::string> split(const std::string &s, const std::string& delims) {
+    std::vector<std::string> elems;
+    split(s, delims, elems);
+    // Return the local by name so the compiler may elide or move it; returning
+    // the reference handed back by the overload above would force a full copy.
+    return elems;
+}
diff --git a/src/split.h b/src/split.h
new file mode 100644
index 0000000..e10ba78
--- /dev/null
+++ b/src/split.h
@@ -0,0 +1,53 @@
+#ifndef __SPLIT_H
+#define __SPLIT_H
+
+// functions to split a string by a specific delimiter
+#include <string>
+#include <vector>
+#include <sstream>
+#include <string.h>
+
+// thanks to Evan Teran, http://stackoverflow.com/questions/236129/how-to-split-a-string/236803#236803
+
+// split a string on a single delimiter character (delim)
+std::vector<std::string>& split(const std::string &s, char delim, std::vector<std::string> &elems);
+std::vector<std::string>  split(const std::string &s, char delim);
+
+// split a string on any character found in the string of delimiters (delims)
+std::vector<std::string>& split(const std::string &s, const std::string& delims, std::vector<std::string> &elems);
+std::vector<std::string>  split(const std::string &s, const std::string& delims);
+
+// from Marius, http://stackoverflow.com/a/1493195/238609
+//
+// Splits str at every character contained in delimiters and appends each
+// piece to tokens with push_back.  ContainerT::value_type must be
+// constructible from (const char*, length).
+//   delimiters  set of single-character separators (default: one space)
+//   trimEmpty   when true, empty pieces (between adjacent delimiters, or at
+//               either end of str) are skipped; when false they are appended
+//               as empty tokens.  An empty str therefore yields one empty
+//               token unless trimEmpty is set.
+template < class ContainerT >
+void tokenize(const std::string& str, ContainerT& tokens,
+              const std::string& delimiters = " ", const bool trimEmpty = false)
+{
+    typedef typename ContainerT::value_type token_type;
+    typedef typename token_type::size_type token_size;
+
+    std::string::size_type lastPos = 0;
+    for (;;)
+    {
+        const std::string::size_type found = str.find_first_of(delimiters, lastPos);
+        const bool atEnd = (found == std::string::npos);
+        // The final piece runs to the end of the string.
+        const std::string::size_type pos = atEnd ? str.length() : found;
+
+        if (pos != lastPos || !trimEmpty) {
+            tokens.push_back(token_type(str.data() + lastPos, (token_size)(pos - lastPos)));
+        }
+        if (atEnd) {
+            break;
+        }
+        // Continue just past the delimiter that was found.
+        lastPos = pos + 1;
+    }
+}
+
+
+#endif
diff --git a/src/ssw.c b/src/ssw.c
new file mode 100644
index 0000000..69646f1
--- /dev/null
+++ b/src/ssw.c
@@ -0,0 +1,834 @@
+/*
+ *  ssw.c
+ *
+ *  Created by Mengyao Zhao on 6/22/10.
+ *  Copyright 2010 Boston College. All rights reserved.
+ *	Version 0.1.4
+ *	Last revision by Mengyao Zhao on 07/31/12.
+ *
+ */
+
+#include <emmintrin.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "ssw.h"
+
+/* Branch hints: when __GNUC__ is defined they expand to __builtin_expect with
+   an expected value of 1 (LIKELY) or 0 (UNLIKELY); otherwise they expand to
+   the bare condition. */
+#ifdef __GNUC__
+#define LIKELY(x) __builtin_expect((x),1)
+#define UNLIKELY(x) __builtin_expect((x),0)
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
+
+/* Convert the coordinate in the scoring matrix into the coordinate in one line of the band.
+   Assigns (j) - max((i) - (w), 0) + 1 to u.  Expands to a brace block that
+   declares its own `int x`, so an argument expression mentioning a variable
+   named x would refer to that local instead. */
+#define set_u(u, w, i, j) { int x=(i)-(w); x=x>0?x:0; (u)=(j)-x+1; }
+
+/* Convert the coordinate in the direction matrix into the coordinate in one line of the band.
+   Assigns ((j) - max((i) - (w), 0)) * 3 + (p) to u.  Every argument, p
+   included, is parenthesised so that an expression argument such as `a << 1`
+   or `c ? 1 : 2` is added as a whole.  Expands to a brace block that declares
+   its own `int x`, so an argument expression mentioning a variable named x
+   would refer to that local instead. */
+#define set_d(u, w, i, j, p) { int x=(i)-(w); x=x>0?x:0; x=(j)-x; (u)=x*3+(p); }
+
+/*! @function
+  @abstract  Round an integer up to the next power of two.
+  @param  x  integer to be rounded (in place)
+  @discussion x will be modified.  The expansion names x many times and
+  applies --, |= and ++ to it, so x must be a modifiable lvalue without side
+  effects.  The shifts stop at 16, i.e. the macro covers 32-bit values.
+ */
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+
+/* End point of one alignment as reported by sw_sse2_byte() / sw_sse2_word():
+   a score plus the positions on reference and read where it was found. */
+typedef struct {
+	uint16_t score;
+	int32_t ref;	 //0-based position 
+	int32_t read;    //alignment ending position on read, 0-based 
+} alignment_end;
+
+/* A sequence of 32-bit CIGAR elements together with its element count.
+   NOTE(review): the bit layout of each seq[] element is not visible in this
+   part of the file -- see the code that fills it in. */
+typedef struct {
+	uint32_t* seq;
+	int32_t length;
+} cigar;
+
+/* Query profile bundle: the two striped profiles (8-bit and 16-bit lanes),
+   the read and scoring matrix they were built from, and the sizes and bias
+   that go with them.
+   NOTE(review): presumably filled in by an init routine outside this part of
+   the file; ownership of read and mat is not visible here. */
+struct _profile{
+	__m128i* profile_byte;	// 0: none
+	__m128i* profile_word;	// 0: none
+	const int8_t* read;
+	const int8_t* mat;
+	int32_t readLen;
+	int32_t n;
+	uint8_t bias;
+};
+
+/* Generate the 8-bit query profile: rearrange the query sequence into striped
+   order and precompute the match/mismatch weight of every read position
+   against each of the n reference symbols.
+
+   read_num  read, one symbol index per position
+   mat       n x n scoring matrix, row-major
+   readLen   number of positions in read_num
+   n         edge length of the square matrix mat
+   bias      value added to every weight
+
+   For symbol nt the profile holds segLen = (readLen + 15) / 16 vectors of 16
+   bytes; byte k of vector i describes read position i + k * segLen and is
+   mat[nt * n + read_num[pos]] + bias, or just bias for padding positions at or
+   beyond readLen.
+
+   Returns a malloc'ed array of n * segLen vectors that is handed to the
+   caller, or NULL when the allocation fails. */
+__m128i* qP_byte (const int8_t* read_num,
+				  const int8_t* mat,
+				  const int32_t readLen,
+				  const int32_t n,	/* the edge length of the square matrix mat */
+				  uint8_t bias) {
+ 
+	const int32_t segLen = (readLen + 15) / 16; /* Split the 128 bit register into 16 pieces. 
+								     Each piece is 8 bit. Split the read into 16 segments. 
+								     Calculate 16 segments in parallel.
+								   */
+	/* Multiply in size_t so that a large n * segLen cannot overflow int32_t. */
+	__m128i* vProfile = (__m128i*)malloc((size_t)n * (size_t)segLen * sizeof(__m128i));
+	int8_t* t = (int8_t*)vProfile;
+	int32_t nt, i, j, segNum;
+
+	/* Never write through a failed allocation. */
+	if (vProfile == NULL) return NULL;
+	
+	for (nt = 0; nt < n; nt ++) {
+		for (i = 0; i < segLen; i ++) {
+			j = i; 
+			for (segNum = 0; segNum < 16; segNum ++) {
+				*t++ = j>= readLen ? bias : mat[nt * n + read_num[j]] + bias;
+				j += segLen;
+			}
+		}
+	}
+	return vProfile;
+}
+
+/* Striped Smith-Waterman, 8-bit lanes (16 scores per 128-bit register).
+   Record the highest score of each reference position. 
+   Return the alignment score and ending position of the best alignment, 2nd best alignment, etc. 
+   Gap begin and gap extension are different. 
+   weight_match > 0, all other weights < 0.
+   The returned positions are 0-based.
+
+   The result is a calloc'ed array of two alignment_end entries which this
+   function does not free: [0] is the best score with its end on reference and
+   read, [1] is the largest per-column maximum found outside the maskLen window
+   around the best end.  When max + bias reaches 255 the column loop stops and
+   [0].score is reported as 255 (8-bit overflow).
+
+   NOTE(review): no allocation in this function is checked for NULL.
+ */ 
+alignment_end* sw_sse2_byte (const int8_t* ref,
+							 int8_t ref_dir,	// 0: forward ref; 1: reverse ref
+							 int32_t refLen,
+							 int32_t readLen, 
+							 const uint8_t weight_gapO, /* will be used as - */
+							 const uint8_t weight_gapE, /* will be used as - */
+							 __m128i* vProfile,
+							 uint8_t terminate,	/* the best alignment score: used to terminate 
+												   the matrix calculation when locating the 
+												   alignment beginning point. If this score 
+												   is set to 0, it will not be used */
+	 						 uint8_t bias,  /* Shift 0 point to a positive value. */
+							 int32_t maskLen) {  
+      
+/* Horizontal maximum of the 16 unsigned bytes of vm, stored into m.
+   vm is overwritten in the process; the macro stays defined after this function. */
+#define max16(m, vm) (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 8)); \
+					  (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 4)); \
+					  (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 2)); \
+					  (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 1)); \
+					  (m) = _mm_extract_epi16((vm), 0)
+
+	uint8_t max = 0;		                     /* the max alignment score */
+	int32_t end_read = readLen - 1;
+	int32_t end_ref = -1; /* 0_based best alignment ending point; Initialized as isn't aligned -1. */
+	int32_t segLen = (readLen + 15) / 16; /* number of segment */
+	
+	/* array to record the largest score of each reference position */
+	uint8_t* maxColumn = (uint8_t*) calloc(refLen, 1); 
+	
+	/* array to record the alignment read ending position of the largest score of each reference position */
+	/* NOTE(review): this buffer is allocated and freed but never read or written in between. */
+	int32_t* end_read_column = (int32_t*) calloc(refLen, sizeof(int32_t));
+	
+	/* Define 16 byte 0 vector. */
+	__m128i vZero = _mm_set1_epi32(0);
+
+	__m128i* pvHStore = (__m128i*) calloc(segLen, sizeof(__m128i));
+	__m128i* pvHLoad = (__m128i*) calloc(segLen, sizeof(__m128i));
+	__m128i* pvE = (__m128i*) calloc(segLen, sizeof(__m128i));
+	__m128i* pvHmax = (__m128i*) calloc(segLen, sizeof(__m128i));
+
+	int32_t i, j;
+	/* 16 byte insertion begin vector */
+	__m128i vGapO = _mm_set1_epi8(weight_gapO);
+	
+	/* 16 byte insertion extension vector */
+	__m128i vGapE = _mm_set1_epi8(weight_gapE);	
+	
+	/* 16 byte bias vector */
+	__m128i vBias = _mm_set1_epi8(bias);	
+
+	__m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
+	__m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */	
+	__m128i vTemp;
+	int32_t edge, begin = 0, end = refLen, step = 1; 
+//	int32_t distance = readLen * 2 / 3;
+//	int32_t distance = readLen / 2;
+//	int32_t distance = readLen;
+
+	/* outer loop to process the reference sequence */
+	/* With ref_dir == 1 the reference is walked from refLen - 1 down to 0. */
+	if (ref_dir == 1) {
+		begin = refLen - 1;
+		end = -1;
+		step = -1;
+	}
+	for (i = begin; LIKELY(i != end); i += step) {
+		int32_t cmp;
+		__m128i e = vZero, vF = vZero, vMaxColumn = vZero; /* Initialize F value to 0. 
+							   Any errors to vH values will be corrected in the Lazy_F loop. 
+							 */
+//		max16(maxColumn[i], vMaxColumn);
+//		fprintf(stderr, "middle[%d]: %d\n", i, maxColumn[i]);
+
+		__m128i vH = pvHStore[segLen - 1];
+		vH = _mm_slli_si128 (vH, 1); /* Shift the 128-bit value in vH left by 1 byte. */
+		__m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */
+
+		/* Swap the 2 H buffers. */
+		__m128i* pv = pvHLoad;
+		pvHLoad = pvHStore;
+		pvHStore = pv;
+		
+		/* inner loop to process the query sequence */
+		for (j = 0; LIKELY(j < segLen); ++j) {
+			vH = _mm_adds_epu8(vH, _mm_load_si128(vP + j));
+			vH = _mm_subs_epu8(vH, vBias); /* vH will be always > 0 */
+	//	max16(maxColumn[i], vH);
+	//	fprintf(stderr, "H[%d]: %d\n", i, maxColumn[i]);
+//	int8_t* t;
+//	int32_t ti;
+//for (t = (int8_t*)&vH, ti = 0; ti < 16; ++ti) fprintf(stderr, "%d\t", *t++);
+
+			/* Get max from vH, vE and vF. */
+			e = _mm_load_si128(pvE + j);
+			vH = _mm_max_epu8(vH, e);
+			vH = _mm_max_epu8(vH, vF);
+			vMaxColumn = _mm_max_epu8(vMaxColumn, vH);
+			
+	//	max16(maxColumn[i], vMaxColumn);
+	//	fprintf(stderr, "middle[%d]: %d\n", i, maxColumn[i]);
+//	for (t = (int8_t*)&vMaxColumn, ti = 0; ti < 16; ++ti) fprintf(stderr, "%d\t", *t++);
+
+			/* Save vH values. */
+			_mm_store_si128(pvHStore + j, vH);
+
+			/* Update vE value. */
+			vH = _mm_subs_epu8(vH, vGapO); /* saturation arithmetic, result >= 0 */
+			e = _mm_subs_epu8(e, vGapE);
+			e = _mm_max_epu8(e, vH);
+			_mm_store_si128(pvE + j, e);
+			
+			/* Update vF value. */
+			vF = _mm_subs_epu8(vF, vGapE);
+			vF = _mm_max_epu8(vF, vH);
+			
+			/* Load the next vH. */
+			vH = _mm_load_si128(pvHLoad + j);
+		}
+
+		/* Lazy_F loop: has been revised to disallow adjacent insertion and then deletion, so don't update E(i, j), learn from SWPS3 */
+        /* reset pointers to the start of the saved data */
+        j = 0;
+        vH = _mm_load_si128 (pvHStore + j);
+
+        /*  the computed vF value is for the given column.  since */
+        /*  we are at the end, we need to shift the vF value over */
+        /*  to the next column. */
+        vF = _mm_slli_si128 (vF, 1);
+        /* cmp becomes 0xffff once no lane of vF exceeds vH - gapO, i.e. once F */
+        /* can no longer raise any stored H value. */
+        vTemp = _mm_subs_epu8 (vH, vGapO);
+		vTemp = _mm_subs_epu8 (vF, vTemp);
+		vTemp = _mm_cmpeq_epi8 (vTemp, vZero);
+		cmp  = _mm_movemask_epi8 (vTemp);
+
+        while (cmp != 0xffff) 
+        {
+            vH = _mm_max_epu8 (vH, vF);
+			vMaxColumn = _mm_max_epu8(vMaxColumn, vH);
+            _mm_store_si128 (pvHStore + j, vH);
+            vF = _mm_subs_epu8 (vF, vGapE);
+            j++;
+            if (j >= segLen)
+            {
+                j = 0;
+                vF = _mm_slli_si128 (vF, 1);
+            }
+            vH = _mm_load_si128 (pvHStore + j);
+
+            vTemp = _mm_subs_epu8 (vH, vGapO);
+            vTemp = _mm_subs_epu8 (vF, vTemp);
+            vTemp = _mm_cmpeq_epi8 (vTemp, vZero);
+            cmp  = _mm_movemask_epi8 (vTemp);
+        }
+
+		/* Only reduce to a scalar maximum when some lane of the running maximum changed. */
+		vMaxScore = _mm_max_epu8(vMaxScore, vMaxColumn);
+		vTemp = _mm_cmpeq_epi8(vMaxMark, vMaxScore);
+		cmp = _mm_movemask_epi8(vTemp);
+		if (cmp != 0xffff) {
+			uint8_t temp; 
+			/* max16 destroys its vector argument, so vMaxScore is saved in vMaxMark and restored afterwards. */
+			vMaxMark = vMaxScore;
+			max16(temp, vMaxScore);
+			vMaxScore = vMaxMark;
+			
+			if (LIKELY(temp > max)) {
+				max = temp;
+				if (max + bias >= 255) break;	//overflow
+				end_ref = i;
+			
+				/* Store the column with the highest alignment score in order to trace the alignment ending position on read. */
+				for (j = 0; LIKELY(j < segLen); ++j) pvHmax[j] = pvHStore[j];
+			}
+		}
+
+		/* Record the max score of current column. */	
+		max16(maxColumn[i], vMaxColumn);
+//		fprintf(stderr, "maxColumn[%d]: %d\n", i, maxColumn[i]);
+		/* NOTE(review): this comparison is unconditional, so terminate == 0 stops at the first
+		   column whose maximum is 0, which does not match the "0 = not used" remark on the
+		   parameter -- check what value callers pass to disable it. */
+		if (maxColumn[i] == terminate) break;
+	}
+	
+	/* Trace the alignment ending position on read. */
+	/* Byte i of the striped column belongs to read position i / 16 + i % 16 * segLen;
+	   the smallest position holding the maximum score is kept. */
+	uint8_t *t = (uint8_t*)pvHmax;
+	int32_t column_len = segLen * 16;
+	for (i = 0; LIKELY(i < column_len); ++i, ++t) {
+		int32_t temp;
+		if (*t == max) {
+			temp = i / 16 + i % 16 * segLen;
+			if (temp < end_read) end_read = temp;
+		}
+	}
+
+	free(pvHmax);
+	free(pvE);
+	free(pvHLoad);
+	free(pvHStore); 	
+
+	/* Find the most possible 2nd best alignment. */
+	alignment_end* bests = (alignment_end*) calloc(2, sizeof(alignment_end));
+	bests[0].score = max + bias >= 255 ? 255 : max;
+	bests[0].ref = end_ref;
+	bests[0].read = end_read;
+	
+	bests[1].score = 0;
+	bests[1].ref = 0;
+	bests[1].read = 0;
+
+	/* Columns before end_ref - maskLen ... */
+	edge = (end_ref - maskLen) > 0 ? (end_ref - maskLen) : 0;
+	for (i = 0; i < edge; i ++) {
+//			fprintf (stderr, "maxColumn[%d]: %d\n", i, maxColumn[i]); 
+		if (maxColumn[i] > bests[1].score) {
+			bests[1].score = maxColumn[i];
+			bests[1].ref = i;
+		}
+	}
+	/* ... and columns after end_ref + maskLen.
+	   NOTE(review): this scan starts at edge + 1, whereas the matching loop in
+	   sw_sse2_word starts at edge -- confirm which of the two is intended. */
+	edge = (end_ref + maskLen) > refLen ? refLen : (end_ref + maskLen);
+	for (i = edge + 1; i < refLen; i ++) {
+//			fprintf (stderr, "refLen: %d\tmaxColumn[%d]: %d\n", refLen, i, maxColumn[i]); 
+		if (maxColumn[i] > bests[1].score) {
+			bests[1].score = maxColumn[i];
+			bests[1].ref = i;
+		}
+	}
+	
+	free(maxColumn);
+	free(end_read_column);
+	return bests;
+}
+
+/* Generate the 16-bit query profile: rearrange the query sequence into striped
+   order and precompute the match/mismatch weight of every read position
+   against each of the n reference symbols.
+
+   read_num  read, one symbol index per position
+   mat       n x n scoring matrix, row-major
+   readLen   number of positions in read_num
+   n         edge length of the square matrix mat
+
+   For symbol nt the profile holds segLen = (readLen + 7) / 8 vectors of eight
+   16-bit lanes; lane k of vector i describes read position i + k * segLen and
+   is mat[nt * n + read_num[pos]], or 0 for padding positions at or beyond
+   readLen.  No bias is applied in this variant.
+
+   Returns a malloc'ed array of n * segLen vectors that is handed to the
+   caller, or NULL when the allocation fails. */
+__m128i* qP_word (const int8_t* read_num,
+				  const int8_t* mat,
+				  const int32_t readLen,
+				  const int32_t n) { 
+					
+	const int32_t segLen = (readLen + 7) / 8; 
+	/* Multiply in size_t so that a large n * segLen cannot overflow int32_t. */
+	__m128i* vProfile = (__m128i*)malloc((size_t)n * (size_t)segLen * sizeof(__m128i));
+	int16_t* t = (int16_t*)vProfile;
+	int32_t nt, i, j;
+	int32_t segNum;
+
+	/* Never write through a failed allocation. */
+	if (vProfile == NULL) return NULL;
+	
+	for (nt = 0; nt < n; nt ++) {
+		for (i = 0; i < segLen; i ++) {
+			j = i; 
+			for (segNum = 0; segNum < 8; segNum ++) {
+				*t++ = j>= readLen ? 0 : mat[nt * n + read_num[j]];
+				j += segLen;
+			}
+		}
+	}
+	return vProfile;
+}
+
+/* Striped Smith-Waterman, 16-bit lanes (8 scores per 128-bit register); the
+   word-sized counterpart of sw_sse2_byte without a bias term.
+
+   The result is a calloc'ed array of two alignment_end entries which this
+   function does not free: [0] is the best score with its end on reference and
+   read, [1] is the largest per-column maximum found outside the maskLen window
+   around the best end.
+
+   NOTE(review): no allocation in this function is checked for NULL.
+ */
+alignment_end* sw_sse2_word (const int8_t* ref, 
+							 int8_t ref_dir,	// 0: forward ref; 1: reverse ref
+							 int32_t refLen,
+							 int32_t readLen, 
+							 const uint8_t weight_gapO, /* will be used as - */
+							 const uint8_t weight_gapE, /* will be used as - */
+						     __m128i* vProfile,
+							 uint16_t terminate, 
+							 int32_t maskLen) { 
+
+/* Horizontal maximum of the 8 signed 16-bit lanes of vm, stored into m.
+   vm is overwritten in the process; the macro stays defined after this function. */
+#define max8(m, vm) (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 8)); \
+					(vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 4)); \
+					(vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 2)); \
+					(m) = _mm_extract_epi16((vm), 0)
+	
+	uint16_t max = 0;		                     /* the max alignment score */
+	int32_t end_read = readLen - 1;
+	/* NOTE(review): the comment below says 1-based, but the loop stores the 0-based column
+	   index i here, and sw_sse2_byte initialises the same variable to -1. */
+	int32_t end_ref = 0; /* 1_based best alignment ending point; Initialized as isn't aligned - 0. */
+	int32_t segLen = (readLen + 7) / 8; /* number of segment */
+	
+	/* array to record the largest score of each reference position */
+	uint16_t* maxColumn = (uint16_t*) calloc(refLen, 2); 
+	
+	/* array to record the alignment read ending position of the largest score of each reference position */
+	/* NOTE(review): this buffer is allocated and freed but never read or written in between. */
+	int32_t* end_read_column = (int32_t*) calloc(refLen, sizeof(int32_t));
+	
+	/* Define 16 byte 0 vector. */
+	__m128i vZero = _mm_set1_epi32(0);
+
+	__m128i* pvHStore = (__m128i*) calloc(segLen, sizeof(__m128i));
+	__m128i* pvHLoad = (__m128i*) calloc(segLen, sizeof(__m128i));
+	__m128i* pvE = (__m128i*) calloc(segLen, sizeof(__m128i));
+	__m128i* pvHmax = (__m128i*) calloc(segLen, sizeof(__m128i));
+
+	int32_t i, j, k;
+	/* 16 byte insertion begin vector */
+	__m128i vGapO = _mm_set1_epi16(weight_gapO);
+	
+	/* 16 byte insertion extension vector */
+	__m128i vGapE = _mm_set1_epi16(weight_gapE);	
+
+	__m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
+	__m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */	
+	__m128i vTemp;
+	int32_t edge, begin = 0, end = refLen, step = 1;
+
+	/* outer loop to process the reference sequence */
+	/* With ref_dir == 1 the reference is walked from refLen - 1 down to 0. */
+	if (ref_dir == 1) {
+		begin = refLen - 1;
+		end = -1;
+		step = -1;
+	}
+	for (i = begin; LIKELY(i != end); i += step) {
+		int32_t cmp;
+		__m128i e = vZero, vF = vZero; /* Initialize F value to 0. 
+							   Any errors to vH values will be corrected in the Lazy_F loop. 
+							 */
+		__m128i vH = pvHStore[segLen - 1];
+		vH = _mm_slli_si128 (vH, 2); /* Shift the 128-bit value in vH left by 2 byte. */
+		
+		/* Swap the 2 H buffers. */
+		__m128i* pv = pvHLoad;
+		
+		__m128i vMaxColumn = vZero; /* vMaxColumn is used to record the max values of column i. */
+		
+		__m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */
+		pvHLoad = pvHStore;
+		pvHStore = pv;
+		
+		/* inner loop to process the query sequence */
+		for (j = 0; LIKELY(j < segLen); j ++) {
+			vH = _mm_adds_epi16(vH, _mm_load_si128(vP + j));
+
+			/* Get max from vH, vE and vF. */
+			e = _mm_load_si128(pvE + j);
+			vH = _mm_max_epi16(vH, e);
+			vH = _mm_max_epi16(vH, vF);
+			vMaxColumn = _mm_max_epi16(vMaxColumn, vH);
+			
+			/* Save vH values. */
+			_mm_store_si128(pvHStore + j, vH);
+
+			/* Update vE value. */
+			vH = _mm_subs_epu16(vH, vGapO); /* saturation arithmetic, result >= 0 */
+			e = _mm_subs_epu16(e, vGapE);
+			e = _mm_max_epi16(e, vH);
+			_mm_store_si128(pvE + j, e);
+
+			/* Update vF value. */
+			vF = _mm_subs_epu16(vF, vGapE);
+			vF = _mm_max_epi16(vF, vH);
+			
+			/* Load the next vH. */
+			vH = _mm_load_si128(pvHLoad + j);
+		}
+
+		/* Lazy_F loop: has been revised to disallow adjacent insertion and then deletion, so don't update E(i, j), learn from SWPS3 */
+		/* At most 8 passes (one per lane); the loop is left as soon as no lane of vF exceeds vH - gapO. */
+		for (k = 0; LIKELY(k < 8); ++k) {
+			vF = _mm_slli_si128 (vF, 2);
+			for (j = 0; LIKELY(j < segLen); ++j) {
+				vH = _mm_load_si128(pvHStore + j);
+				vH = _mm_max_epi16(vH, vF);
+				_mm_store_si128(pvHStore + j, vH);
+				vH = _mm_subs_epu16(vH, vGapO);
+				vF = _mm_subs_epu16(vF, vGapE);
+				if (UNLIKELY(! _mm_movemask_epi8(_mm_cmpgt_epi16(vF, vH)))) goto end;
+			}
+		}
+
+end:	
+		/* Only reduce to a scalar maximum when some lane of the running maximum changed. */
+		/* NOTE(review): unlike sw_sse2_byte, vMaxColumn is not updated inside the Lazy_F loop above. */
+		vMaxScore = _mm_max_epi16(vMaxScore, vMaxColumn);	
+		vTemp = _mm_cmpeq_epi16(vMaxMark, vMaxScore);
+		cmp = _mm_movemask_epi8(vTemp);
+		if (cmp != 0xffff) {
+			uint16_t temp; 
+			/* max8 destroys its vector argument, so vMaxScore is saved in vMaxMark and restored afterwards. */
+			vMaxMark = vMaxScore;
+			max8(temp, vMaxScore);
+			vMaxScore = vMaxMark;
+			
+			if (LIKELY(temp > max)) {
+				max = temp;
+				end_ref = i;
+				/* Keep the column holding the best score to trace the end position on the read. */
+				for (j = 0; LIKELY(j < segLen); ++j) pvHmax[j] = pvHStore[j];
+			}
+		}
+		
+		/* Record the max score of current column. */	
+		max8(maxColumn[i], vMaxColumn);
+		/* NOTE(review): unconditional comparison -- terminate == 0 would stop at the first
+		   column whose maximum is 0; check what value callers pass to disable it. */
+		if (maxColumn[i] == terminate) break;
+	} 	
+
+	/* Trace the alignment ending position on read. */
+	/* Lane i of the striped column belongs to read position i / 8 + i % 8 * segLen;
+	   the smallest position holding the maximum score is kept. */
+	uint16_t *t = (uint16_t*)pvHmax;
+	int32_t column_len = segLen * 8;
+	for (i = 0; LIKELY(i < column_len); ++i, ++t) {
+		int32_t temp;
+		if (*t == max) {
+			temp = i / 8 + i % 8 * segLen;
+			if (temp < end_read) end_read = temp;
+		}
+	}
+
+	free(pvHmax);
+	free(pvE);
+	free(pvHLoad);
+	free(pvHStore); 
+	
+	/* Find the most possible 2nd best alignment. */
+	alignment_end* bests = (alignment_end*) calloc(2, sizeof(alignment_end));
+	bests[0].score = max;
+	bests[0].ref = end_ref;
+	bests[0].read = end_read;
+	
+	bests[1].score = 0;
+	bests[1].ref = 0;
+	bests[1].read = 0;
+
+	/* Columns before end_ref - maskLen ... */
+	edge = (end_ref - maskLen) > 0 ? (end_ref - maskLen) : 0;
+	for (i = 0; i < edge; i ++) {
+		if (maxColumn[i] > bests[1].score) { 
+			bests[1].score = maxColumn[i];
+			bests[1].ref = i;
+		}
+	}
+	/* ... and columns from end_ref + maskLen onwards.
+	   NOTE(review): this scan starts at edge, whereas the matching loop in
+	   sw_sse2_byte starts at edge + 1 -- confirm which of the two is intended. */
+	edge = (end_ref + maskLen) > refLen ? refLen : (end_ref + maskLen);
+	for (i = edge; i < refLen; i ++) {
+		if (maxColumn[i] > bests[1].score) {
+			bests[1].score = maxColumn[i];
+			bests[1].ref = i;
+		}
+	}
+	
+	free(maxColumn);
+	free(end_read_column);
+	return bests;
+}
+
+/*! Banded Smith-Waterman with traceback; builds the cigar of the best alignment of read against ref.
+ *  @param ref         reference window as numeric codes (each code selects a row of mat)
+ *  @param read        read window as numeric codes (each code selects a column of mat)
+ *  @param refLen      number of codes in ref
+ *  @param readLen     number of codes in read
+ *  @param score       score the banded search has to reach; the band is doubled until it does
+ *  @param weight_gapO gap open penalty (subtracted from scores)
+ *  @param weight_gapE gap extension penalty (subtracted from scores)
+ *  @param band_width  initial band width
+ *  @param mat         substitution matrix, read as mat[ref_code * n + read_code]
+ *  @param n           row length of mat
+ *  @return malloc'ed cigar whose seq holds `length` entries of (run length << 4 | op), op 0 = M, 1 = I, 2 = D;
+ *          0 when the traceback meets an unknown direction code. The caller frees seq and the struct.
+ *  @note   The process exits when the size of the direction table overflows int32_t.
+ *  @note   set_u, set_d and kroundup32 are macros defined elsewhere in this file.
+ */
+cigar* banded_sw (const int8_t* ref,
+				 const int8_t* read,
+				 int32_t refLen,
+				 int32_t readLen,
+				 int32_t score,
+				 const uint32_t weight_gapO,  /* will be used as - */
+				 const uint32_t weight_gapE,  /* will be used as - */
+				 int32_t band_width,
+				 const int8_t* mat,	/* pointer to the weight matrix */
+				 int32_t n) {
+
+	uint32_t *c = (uint32_t*)malloc(16 * sizeof(uint32_t)), *c1;
+	int32_t i, j, e, f, temp1, temp2, s = 16, s1 = 8, s2 = 1024, l, max = 0;
+	int32_t width, width_d, *h_b, *e_b, *h_c;
+	int8_t *direction, *direction_line;
+	cigar* result = (cigar*)malloc(sizeof(cigar));
+	h_b = (int32_t*)malloc(s1 * sizeof(int32_t));
+	e_b = (int32_t*)malloc(s1 * sizeof(int32_t));
+	h_c = (int32_t*)malloc(s1 * sizeof(int32_t));
+	direction = (int8_t*)malloc(s2 * sizeof(int8_t));
+
+	/* Fill the banded matrix; widen the band and start over until the expected score is reached. */
+	do {
+		width = band_width * 2 + 3, width_d = band_width * 2 + 1;
+		/* Grow the three score rows so that they hold `width` cells. */
+		while (width >= s1) {
+			++s1;
+			kroundup32(s1);
+			h_b = (int32_t*)realloc(h_b, s1 * sizeof(int32_t));
+			e_b = (int32_t*)realloc(e_b, s1 * sizeof(int32_t));
+			h_c = (int32_t*)realloc(h_c, s1 * sizeof(int32_t));
+		}
+		/* Grow the direction table: three direction cells (e, f, h) per band cell and read position. */
+		while (width_d * readLen * 3 >= s2) {
+			++s2;
+			kroundup32(s2);
+			if (s2 < 0) {
+				fprintf(stderr, "Alignment score and position are not consensus.\n");
+				exit(1);
+			}
+			direction = (int8_t*)realloc(direction, s2 * sizeof(int8_t));
+		}
+		direction_line = direction;
+		for (j = 1; LIKELY(j < width - 1); j ++) h_b[j] = 0;
+		for (i = 0; LIKELY(i < readLen); i ++) {
+			int32_t beg = 0, end = refLen - 1, u = 0, edge;
+			j = i - band_width;	beg = beg > j ? beg : j; // band start
+			j = i + band_width; end = end < j ? end : j; // band end
+			edge = end + 1 < width - 1 ? end + 1 : width - 1;
+			f = h_b[0] = e_b[0] = h_b[edge] = e_b[edge] = h_c[0] = 0;
+			direction_line = direction + width_d * i * 3;
+
+			for (j = beg; LIKELY(j <= end); j ++) {
+				int32_t b, e1, f1, d, de, df, dh;
+				set_u(u, band_width, i, j);	set_u(e, band_width, i - 1, j);
+				set_u(b, band_width, i, j - 1); set_u(d, band_width, i - 1, j - 1);
+				set_d(de, band_width, i, j, 0);
+				set_d(df, band_width, i, j, 1);
+				set_d(dh, band_width, i, j, 2);
+
+				/* E: gap reached from read position i - 1; direction 3 = opened from H, 2 = extended. */
+				temp1 = i == 0 ? -weight_gapO : h_b[e] - weight_gapO;
+				temp2 = i == 0 ? -weight_gapE : e_b[e] - weight_gapE;
+				e_b[u] = temp1 > temp2 ? temp1 : temp2;
+				direction_line[de] = temp1 > temp2 ? 3 : 2;
+
+				/* F: gap reached from reference position j - 1; direction 5 = opened from H, 4 = extended. */
+				temp1 = h_c[b] - weight_gapO;
+				temp2 = f - weight_gapE;
+				f = temp1 > temp2 ? temp1 : temp2;
+				direction_line[df] = temp1 > temp2 ? 5 : 4;
+
+				/* H: best of the diagonal step and the two gap states (clamped at 0). */
+				e1 = e_b[u] > 0 ? e_b[u] : 0;
+				f1 = f > 0 ? f : 0;
+				temp1 = e1 > f1 ? e1 : f1;
+				temp2 = h_b[d] + mat[ref[j] * n + read[i]];
+				h_c[u] = temp1 > temp2 ? temp1 : temp2;
+
+				if (h_c[u] > max) max = h_c[u];
+
+				if (temp1 <= temp2) direction_line[dh] = 1;
+				else direction_line[dh] = e1 > f1 ? direction_line[de] : direction_line[df];
+			}
+			for (j = 1; j <= u; j ++) h_b[j] = h_c[j];
+		}
+		band_width *= 2;
+	} while (LIKELY(max < score));
+	band_width /= 2;
+
+	// trace back
+	i = readLen - 1;
+	j = refLen - 1;
+	e = 0;	// Count the number of M, D or I.
+	l = 0;	// record length of current cigar
+	f = max = 0; // M; from here on `max` holds the operation of the run being counted
+	temp2 = 2;	// h
+	while (LIKELY(i > 0)) {
+		set_d(temp1, band_width, i, j, temp2);
+		switch (direction_line[temp1]) {
+			case 1:
+				--i;
+				--j;
+				temp2 = 2;
+				direction_line -= width_d * 3;
+				f = 0;	// M
+				break;
+			case 2:
+			 	--i;
+				temp2 = 0;	// e
+				direction_line -= width_d * 3;
+				f = 1;	// I
+				break;
+			case 3:
+				--i;
+				temp2 = 2;
+				direction_line -= width_d * 3;
+				f = 1;	// I
+				break;
+			case 4:
+				--j;
+				temp2 = 1;
+				f = 2;	// D
+				break;
+			case 5:
+				--j;
+				temp2 = 2;
+				f = 2;	// D
+				break;
+			default:
+				/* Report the cell that was actually tested, then release every buffer before failing. */
+				fprintf(stderr, "Trace back error: %d.\n", direction_line[temp1]);
+				free(direction);
+				free(h_c);
+				free(e_b);
+				free(h_b);
+				free(c);
+				free(result);
+				return 0;
+		}
+		if (f == max) ++e;
+		else {
+			/* The operation changed: flush the finished run and start counting the new one. */
+			++l;
+			while (l >= s) {
+				++s;
+				kroundup32(s);
+				c = (uint32_t*)realloc(c, s * sizeof(uint32_t));
+			}
+			c[l - 1] = e<<4|max;
+			max = f;
+			e = 1;
+		}
+	}
+	/* Flush the last run; the cell at read position 0 is always emitted as a match. */
+	if (f == 0) {
+		++l;
+		while (l >= s) {
+			++s;
+			kroundup32(s);
+			c = (uint32_t*)realloc(c, s * sizeof(uint32_t));
+		}
+		c[l - 1] = (e+1)<<4;
+	}else {
+		l += 2;
+		while (l >= s) {
+			++s;
+			kroundup32(s);
+			c = (uint32_t*)realloc(c, s * sizeof(uint32_t));
+		}
+		c[l - 2] = e<<4|f;
+		c[l - 1] = 16;	// 1M
+	}
+
+	// reverse cigar (the traceback produced it from the alignment end to its start)
+	c1 = (uint32_t*)malloc(l * sizeof(uint32_t));
+	s = 0;
+	e = l - 1;
+	while (LIKELY(s <= e)) {
+		c1[s] = c[e];
+		c1[e] = c[s];
+		++ s;
+		-- e;
+	}
+	result->seq = c1;
+	result->length = l;
+
+	free(direction);
+	free(h_c);
+	free(e_b);
+	free(h_b);
+	free(c);
+	return result;
+}
+
+/* Return a freshly allocated copy of seq[0..end] with the element order reversed; the caller frees it. */
+int8_t* seq_reverse(const int8_t* seq, int32_t end)	/* end is 0-based alignment ending position */
+{
+	const int32_t count = end + 1;
+	int8_t* reversed = (int8_t*)calloc(count, sizeof(int8_t));
+	int32_t pos;
+	/* Element pos of the result is the element `pos` steps before `end` in the input. */
+	for (pos = 0; LIKELY(pos < count); ++pos) reversed[pos] = seq[end - pos];
+	return reversed;
+}
+		
+/* Build the query profile for `read`.
+ * score_size 0 builds only the byte profile, 1 only the word profile, 2 both; any other value builds none.
+ * The returned structure keeps the read and mat pointers (no copy) and is released with init_destroy. */
+s_profile* ssw_init (const int8_t* read, const int32_t readLen, const int8_t* mat, const int32_t n, const int8_t score_size) {
+	const int want_byte = (score_size == 0 || score_size == 2);
+	const int want_word = (score_size == 1 || score_size == 2);
+	s_profile* prof = (s_profile*)calloc(1, sizeof(struct _profile));
+
+	prof->profile_byte = 0;
+	prof->profile_word = 0;
+	prof->bias = 0;
+
+	if (want_byte) {
+		/* The bias is the absolute value of the most negative entry of the substitution matrix. */
+		int32_t bias = 0, cell;
+		const int32_t cells = n*n;
+		for (cell = 0; cell < cells; cell++) {
+			if (mat[cell] < bias) bias = mat[cell];
+		}
+		bias = abs(bias);
+
+		prof->bias = bias;
+		prof->profile_byte = qP_byte (read, mat, readLen, n, bias);
+	}
+	if (want_word) {
+		prof->profile_word = qP_word (read, mat, readLen, n);
+	}
+
+	prof->read = read;
+	prof->mat = mat;
+	prof->readLen = readLen;
+	prof->n = n;
+	return prof;
+}
+
+/* Release a profile created by ssw_init: both profile tables and the structure itself.
+ * A null pointer is accepted and ignored, like free(). */
+void init_destroy (s_profile* p) {
+	if (p == 0) return;
+	free(p->profile_byte);
+	free(p->profile_word);
+	free(p);
+}
+
+/* Align the profiled read against ref.
+ *
+ * Steps:
+ *   1. Forward pass for the best score and its ending positions. The byte kernel runs first when a byte
+ *      profile exists; a score of 255 switches to the word kernel.
+ *   2. Unless flag is 0 (or flag is exactly 2 and the score is below filters), a second pass over the
+ *      reversed read locates the beginning positions.
+ *   3. Unless the filters selected by flag reject the alignment, banded_sw produces the cigar.
+ *
+ * flag masks used below: 1 = request cigar, 2 = apply the score filter (score1 >= filters),
+ * 4 = apply the distance filter (both spans <= filterd); 8 only requests the beginning position.
+ *
+ * Returns a calloc'ed s_align (release it with align_destroy), or 0 when no usable profile exists,
+ * when the byte kernel saturates without a word profile, or when the cigar traceback fails.
+ * Nothing is leaked on the failure paths.
+ */
+s_align* ssw_align (const s_profile* prof,
+					const int8_t* ref,
+				  	int32_t refLen,
+				  	const uint8_t weight_gapO,
+				  	const uint8_t weight_gapE,
+					const uint8_t flag,	/* see the mask description above */
+					const uint16_t filters,
+					const int32_t filterd,
+					const int32_t maskLen) {
+
+	alignment_end* bests = 0, *bests_reverse = 0;
+	__m128i* vP = 0;
+	int32_t word = 0, band_width = 0, readLen = prof->readLen;
+	int8_t* read_reverse = 0;
+	cigar* path;
+	s_align* r = (s_align*)calloc(1, sizeof(s_align));
+	r->ref_begin1 = -1;
+	r->read_begin1 = -1;
+	r->cigar = 0;
+	r->cigarLen = 0;
+	if (maskLen < 15) {
+		fprintf(stderr, "When maskLen < 15, the function ssw_align doesn't return 2nd best alignment information.\n");
+	}
+
+	// Find the alignment scores and ending positions
+	if (prof->profile_byte) {
+		bests = sw_sse2_byte(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_byte, -1, prof->bias, maskLen);
+		if (prof->profile_word && bests[0].score == 255) {
+			/* The byte kernel reported 255: redo the pass with the word kernel. */
+			free(bests);
+			bests = sw_sse2_word(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_word, -1, maskLen);
+			word = 1;
+		} else if (bests[0].score == 255) {
+			fprintf(stderr, "Please set 2 to the score_size parameter of the function ssw_init, otherwise the alignment results will be incorrect.\n");
+			free(bests);
+			free(r);
+			return 0;
+		}
+	}else if (prof->profile_word) {
+		bests = sw_sse2_word(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_word, -1, maskLen);
+		word = 1;
+	}else {
+		fprintf(stderr, "Please call the function ssw_init before ssw_align.\n");
+		free(r);
+		return 0;
+	}
+	r->score1 = bests[0].score;
+	r->ref_end1 = bests[0].ref;
+	r->read_end1 = bests[0].read;
+	if (maskLen >= 15) {
+		r->score2 = bests[1].score;
+		r->ref_end2 = bests[1].ref;
+	} else {
+		/* No sub-optimal information is reported for short masks. */
+		r->score2 = 0;
+		r->ref_end2 = -1;
+	}
+	free(bests);
+	if (flag == 0 || (flag == 2 && r->score1 < filters)) goto end;
+
+	// Find the beginning position of the best alignment.
+	read_reverse = seq_reverse(prof->read, r->read_end1);
+	if (word == 0) {
+		vP = qP_byte(read_reverse, prof->mat, r->read_end1 + 1, prof->n, prof->bias);
+		bests_reverse = sw_sse2_byte(ref, 1, r->ref_end1 + 1, r->read_end1 + 1, weight_gapO, weight_gapE, vP, r->score1, prof->bias, maskLen);
+	} else {
+		vP = qP_word(read_reverse, prof->mat, r->read_end1 + 1, prof->n);
+		bests_reverse = sw_sse2_word(ref, 1, r->ref_end1 + 1, r->read_end1 + 1, weight_gapO, weight_gapE, vP, r->score1, maskLen);
+	}
+	free(vP);
+	free(read_reverse);
+	r->ref_begin1 = bests_reverse[0].ref;
+	r->read_begin1 = r->read_end1 - bests_reverse[0].read;
+	free(bests_reverse);
+	if ((7&flag) == 0 || ((2&flag) != 0 && r->score1 < filters) || ((4&flag) != 0 && (r->ref_end1 - r->ref_begin1 > filterd || r->read_end1 - r->read_begin1 > filterd))) goto end;
+
+	// Generate cigar.
+	refLen = r->ref_end1 - r->ref_begin1 + 1;
+	readLen = r->read_end1 - r->read_begin1 + 1;
+	band_width = abs(refLen - readLen) + 1;
+	path = banded_sw(ref + r->ref_begin1, prof->read + r->read_begin1, refLen, readLen, r->score1, weight_gapO, weight_gapE, band_width, prof->mat, prof->n);
+	if (path == 0) {
+		/* Traceback failed: report failure without leaking the partially filled result. */
+		free(r);
+		r = 0;
+	} else {
+		/* Take ownership of the cigar array; only the wrapper struct is released here. */
+		r->cigar = path->seq;
+		r->cigarLen = path->length;
+		free(path);
+	}
+
+end:
+	return r;
+}
+
+/* Release a result returned by ssw_align: the cigar array and the structure itself.
+ * A null pointer (ssw_align's failure value) is accepted and ignored, like free(). */
+void align_destroy (s_align* a) {
+	if (a == 0) return;
+	free(a->cigar);
+	free(a);
+}
diff --git a/src/ssw.h b/src/ssw.h
new file mode 100644
index 0000000..3cb45c8
--- /dev/null
+++ b/src/ssw.h
@@ -0,0 +1,129 @@
+/*
+ *  ssw.h
+ *
+ *  Created by Mengyao Zhao on 6/22/10.
+ *  Copyright 2010 Boston College. All rights reserved.
+ *	Version 0.1.4
+ *	Last revision by Mengyao Zhao on 07/31/12.
+ *
+ */
+
+#ifndef SSW_H
+#define SSW_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <emmintrin.h>
+
+/*!	@typedef	structure of the query profile; only forward-declared in this header, so it is opaque to callers	*/
+struct _profile;
+typedef struct _profile s_profile;
+
+/*!	@typedef	structure of the alignment result
+	@field	score1	the best alignment score
+	@field	score2	sub-optimal alignment score
+	@field	ref_begin1	0-based best alignment beginning position on reference;	ref_begin1 = -1 when the best alignment beginning 
+						position is not available
+	@field	ref_end1	0-based best alignment ending position on reference
+	@field	read_begin1	0-based best alignment beginning position on read; read_begin1 = -1 when the best alignment beginning 
+						position is not available
+	@field	read_end1	0-based best alignment ending position on read
+	@field	ref_end2	0-based sub-optimal alignment ending position on reference
+	@field	cigar	best alignment cigar; stored the same as that in BAM format, high 28 bits: length, low 4 bits: M/I/D (0/1/2); 
+					cigar = 0 when the best alignment path is not available
+	@field	cigarLen	length of the cigar string; cigarLen = 0 when the best alignment path is not available
+*/
+typedef struct {
+	uint16_t score1;	
+	uint16_t score2;	
+	int32_t ref_begin1;	
+	int32_t ref_end1;	
+	int32_t	read_begin1;	
+	int32_t read_end1;	
+	int32_t ref_end2;
+	uint32_t* cigar;	
+	int32_t cigarLen;	
+} s_align;
+
+#ifdef __cplusplus
+extern "C" {
+#endif	// __cplusplus
+
+/*!	@function	Create the query profile using the query sequence.
+	@param	read	pointer to the query sequence; the query sequence needs to be numbers
+	@param	readLen	length of the query sequence
+	@param	mat	pointer to the substitution matrix; mat needs to be corresponding to the read sequence
+	@param	n	the square root of the number of elements in mat (mat has n*n elements)
+	@param	score_size	estimated Smith-Waterman score; if your estimated best alignment score is surely < 255 please set 0; if 
+						your estimated best alignment score >= 255, please set 1; if you don't know, please set 2 
+	@return	pointer to the query profile structure
+	@note	example for parameter read and mat:
+			The numbers of the sequence are used as row/column indexes into mat, so each of them must be smaller than n.
+			If the query sequence is: ACGTATC, the sequence that read points to can be: 0123031 (A = 0, C = 1, G = 2, T = 3)
+			Then if the score for match is 2 and for mismatch is -2, the substitution matrix of parameter mat will be:
+			//A  C  G  T  
+			  2 -2 -2 -2 //A
+			 -2  2 -2 -2 //C
+			 -2 -2  2 -2 //G
+			 -2 -2 -2  2 //T
+			mat is the pointer to the array {2, -2, -2, -2, -2, 2, -2, -2, -2, -2, 2, -2, -2, -2, -2, 2}
+*/
+s_profile* ssw_init (const int8_t* read, const int32_t readLen, const int8_t* mat, const int32_t n, const int8_t score_size);
+
+/*!	@function	Release the memory allocated by function ssw_init (the profile tables and the profile structure itself).
+	@param	p	pointer to the query profile structure	
+*/
+void init_destroy (s_profile* p);
+
+// @function	ssw alignment.
+/*!	@function	Do Striped Smith-Waterman alignment.
+	@param	prof	pointer to the query profile structure
+	@param	ref	pointer to the target sequence; the target sequence needs to be numbers and corresponding to the mat parameter of
+				function ssw_init
+	@param	refLen	length of the target sequence
+	@param	weight_gapO	the absolute value of gap open penalty  
+	@param	weight_gapE	the absolute value of gap extension penalty
+	@param	flag	bitwise FLAG; bits are counted from high to low, so bit 8 is the least significant bit (mask 0x01), bit 7 is
+					0x02, bit 6 is 0x04 and bit 5 is 0x08. bit 5: when set to 1, function ssw_align will return the best alignment 
+					beginning position; bit 6: when set to 1, if (ref_end1 - ref_begin1 <= filterd && read_end1 - read_begin1 
+					<= filterd), (whatever bit 5 is set to) the function will return the best alignment beginning position and 
+					cigar; bit 7: when set to 1, if the best alignment score >= filters, (whatever bit 5 is set to) the function
+  					will return the best alignment beginning position and cigar; bit 8: when set to 1, the function will return
+ 					the best alignment beginning position and cigar.
+ 					NOTE(review): as implemented, a filter selected by bit 6 or bit 7 is still applied when bit 8 is set, so
+ 					the cigar is unconditional only when bits 6 and 7 are both 0.
+	@param	filters	score filter: used when bit 7 of flag is set to 1 (Please check the description of the flag parameter for
+ 					detailed usage.)
+	@param	filterd	distance filter: used when bit 6 of flag is set to 1 (Please check the description of the flag parameter for
+					detailed usage.)
+	@param	maskLen	The distance between the optimal and suboptimal alignment ending position >= maskLen. We suggest to use 
+					readLen/2, if you don't have special concerns. Note: maskLen has to be >= 15, otherwise this function will NOT 
+					return the suboptimal alignment information. Detailed description of maskLen: After locating the optimal
+					alignment ending position, the suboptimal alignment score can be heuristically found by checking the second 
+					largest score in the array that contains the maximal score of each column of the SW matrix. In order to avoid 
+					picking the scores that belong to the alignments sharing the partial best alignment, SSW C library masks the 
+					reference loci nearby (mask length = maskLen) the best alignment ending position and locates the second largest 
+					score from the unmasked elements.
+	@return	pointer to the alignment result structure, or a null pointer when the alignment cannot be computed (no usable
+			profile, a saturated score without a word profile, or a failed cigar traceback)
+	@note	Whatever the parameter flag is set to, this function will at least return the optimal and sub-optimal alignment score,
+			and the optimal alignment ending positions on target and query sequences. If both bit 6 and 7 of the flag are set,
+			the function will return cigar only when both criteria are fulfilled. All returned positions are 
+			0-based coordinate.  	
+*/
+s_align* ssw_align (const s_profile* prof, 
+					const int8_t* ref, 
+					int32_t refLen, 
+					const uint8_t weight_gapO, 
+					const uint8_t weight_gapE, 
+					const uint8_t flag,	
+					const uint16_t filters,
+					const int32_t filterd,
+					const int32_t maskLen);
+
+/*!	@function	Release the memory allocated by function ssw_align (the cigar array and the result structure itself).
+	@param	a	pointer to the alignment result structure
+*/
+void align_destroy (s_align* a);
+
+#ifdef __cplusplus
+}
+#endif	// __cplusplus
+
+#endif	// SSW_H
diff --git a/src/ssw_cpp.cpp b/src/ssw_cpp.cpp
new file mode 100644
index 0000000..ea260de
--- /dev/null
+++ b/src/ssw_cpp.cpp
@@ -0,0 +1,399 @@
+#include "ssw_cpp.h"
+
+#include <sstream>
+
+extern "C" {
+#include "ssw.h"
+}
+
+namespace {
+
+static int8_t kBaseTranslation[128] = { // indexed by ASCII code: A/a -> 0, C/c -> 1, G/g -> 2, T/t -> 3, everything else -> 4
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+  //   A     C            G
+    4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, 
+  //             T  U   NOTE(review): 'U' shares code 0 with 'A' rather than code 3 with 'T' - confirm this is intended
+    4, 4, 4, 4,  3, 0, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+  //   a     c            g
+    4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, 
+  //             t  u   (same mapping as the upper-case row)
+    4, 4, 4, 4,  3, 0, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4 
+};
+
+// Fills `matrix` (25 cells, row-major, rows/columns ordered A, C, G, T, N).
+// A-T rows/columns get match_score on the diagonal and -mismatch_penalty
+// elsewhere; every cell in the N row or N column is 0. For 2 / 2 this gives
+//                 // A,  C,  G,  T,  N
+//  score_matrix_ = { 2, -2, -2, -2,  0, // A
+//                   -2,  2, -2, -2,  0, // C
+//                   -2, -2,  2, -2,  0, // G
+//                   -2, -2, -2,  2,  0, // T
+//                    0,  0,  0,  0,  0};// N
+void BuildSwScoreMatrix(const uint8_t& match_score, 
+                        const uint8_t& mismatch_penalty,
+			int8_t* matrix) {
+  const int kBases = 4;            // A, C, G, T
+  const int kWidth = kBases + 1;   // plus the N column
+
+  for (int row = 0; row < kBases; ++row) {
+    int8_t* cells = matrix + row * kWidth;
+    for (int col = 0; col < kBases; ++col) {
+      if (row == col) cells[col] = match_score;
+      else            cells[col] = static_cast<int8_t>(-mismatch_penalty);
+    }
+    cells[kBases] = 0;  // anything vs. N
+  }
+
+  // N row
+  int8_t* last_row = matrix + kBases * kWidth;
+  for (int col = 0; col < kWidth; ++col) last_row[col] = 0;
+}
+
+// Copies the C result `s_al` into `al`. When a cigar is present it is copied
+// entry by entry and rendered as text, with a soft clip ('S', op code 4)
+// added for the query bases before query_begin and after query_end.
+void ConvertAlignment(const s_align& s_al, 
+                      const int& query_len, 
+                      StripedSmithWaterman::Alignment* al) {
+  al->sw_score           = s_al.score1;
+  al->sw_score_next_best = s_al.score2;
+  al->ref_begin          = s_al.ref_begin1;
+  al->ref_end            = s_al.ref_end1;
+  al->query_begin        = s_al.read_begin1;
+  al->query_end          = s_al.read_end1;
+  al->ref_end_next_best  = s_al.ref_end2;
+
+  al->cigar.clear();
+  al->cigar_string.clear();
+
+  // No cigar was produced: leave both cigar representations empty.
+  if (s_al.cigarLen <= 0) return;
+
+  static const char kOpLetters[] = "MID";  // op codes 0, 1, 2
+  std::ostringstream text;
+
+  // Leading soft clip.
+  if (al->query_begin > 0) {
+    uint32_t clip = (al->query_begin << 4) | 0x0004;
+    al->cigar.push_back(clip);
+    text << al->query_begin << 'S';
+  }
+
+  // Aligned part, copied verbatim.
+  for (int i = 0; i < s_al.cigarLen; ++i) {
+    const uint32_t entry = s_al.cigar[i];
+    al->cigar.push_back(entry);
+    text << (entry >> 4);
+    const uint8_t op = entry & 0x000f;
+    if (op < 3) text << kOpLetters[op];  // other op codes print no letter
+  }
+
+  // Trailing soft clip.
+  int tail = query_len - al->query_end - 1;
+  if (tail > 0) {
+    uint32_t clip = (tail << 4) | 0x0004;
+    al->cigar.push_back(clip);
+    text << tail << 'S';
+  }
+
+  al->cigar_string = text.str();
+}
+
+// Counts the differences of the alignment described by al.cigar: every M
+// column whose translated reference and query codes differ, plus the full
+// length of every insertion and deletion. Other op codes (e.g. the soft
+// clips added by ConvertAlignment) are ignored.
+//
+// `ref` and `query` are the translated sequences; they are advanced to
+// al.ref_begin / al.query_begin before the walk starts.
+// `matrix` is kept for interface compatibility only. The codes are compared
+// directly, because looking them up in the first matrix row made distinct
+// bases with equal scores (e.g. C and G, both -mismatch) count as a match.
+int CalculateNumberMismatch(
+    const StripedSmithWaterman::Alignment& al,
+    const int8_t* matrix,
+    int8_t const *ref,
+    int8_t const *query) {
+  (void)matrix;
+
+  ref   += al.ref_begin;
+  query += al.query_begin;
+  int mismatch_length = 0;
+  for (unsigned int i = 0; i < al.cigar.size(); ++i) {
+    int32_t op = al.cigar[i] & 0x0000000f;
+    int32_t length = (al.cigar[i] >> 4) & 0x0fffffff;
+    if (op == 0) { // M
+      for (int j = 0; j < length; ++j) {
+        if (*ref != *query) ++mismatch_length;
+        ++ref;
+        ++query;
+      }
+    } else if (op == 1) { // I
+      query += length;
+      mismatch_length += length;
+    } else if (op == 2) { // D
+      ref += length;
+      mismatch_length += length;
+    }
+  }
+
+  return mismatch_length;
+}
+
+// Translates the C++ filter switches into the bit flag passed to ssw_align:
+// 0x08 for report_begin_position, 0x0f for report_cigar. Bits already set
+// in *flag are kept.
+void SetFlag(const StripedSmithWaterman::Filter& filter, uint8_t* flag) {
+  uint8_t bits = *flag;
+  if (filter.report_begin_position) bits |= 0x08;
+  if (filter.report_cigar)          bits |= 0x0f;
+  *flag = bits;
+}
+
+} // namespace
+
+
+
+namespace StripedSmithWaterman {
+
+// Default aligner: match 2, mismatch 2, gap open 3, gap extend 1, using the
+// score matrix and translation table installed by BuildDefaultMatrix().
+Aligner::Aligner(void) :
+    score_matrix_(NULL),
+    score_matrix_size_(5),
+    translation_matrix_(NULL),
+    default_matrix_(false),
+    matrix_built_(false),
+    match_score_(2),
+    mismatch_penalty_(2),
+    gap_opening_penalty_(3),
+    gap_extending_penalty_(1),
+    translated_reference_(NULL),
+    reference_length_(0) {
+  BuildDefaultMatrix();
+}
+
+// Aligner with caller-chosen scores and penalties; the score matrix and
+// translation table are installed by BuildDefaultMatrix() from these values.
+Aligner::Aligner(
+    const uint8_t& match_score,
+    const uint8_t& mismatch_penalty,
+    const uint8_t& gap_opening_penalty,
+    const uint8_t& gap_extending_penalty) :
+    score_matrix_(NULL),
+    score_matrix_size_(5),
+    translation_matrix_(NULL),
+    default_matrix_(false),
+    matrix_built_(false),
+    match_score_(match_score),
+    mismatch_penalty_(mismatch_penalty),
+    gap_opening_penalty_(gap_opening_penalty),
+    gap_extending_penalty_(gap_extending_penalty),
+    translated_reference_(NULL),
+    reference_length_(0) {
+  BuildDefaultMatrix();
+}
+
+// Aligner with a caller-supplied score matrix (score_matrix_size x
+// score_matrix_size cells) and translation table (translation_matrix_size
+// entries). Both are copied, so the caller keeps ownership of its arrays.
+Aligner::Aligner(const int8_t* score_matrix,
+                 const int&    score_matrix_size,
+	         const int8_t* translation_matrix,
+		 const int&    translation_matrix_size)
+    
+    : score_matrix_(NULL)
+    , score_matrix_size_(score_matrix_size)
+    , translation_matrix_(NULL)
+    // The translation table is heap-allocated below, so it must NOT be marked
+    // as the shared default table: Clear() only deletes it when this is false.
+    , default_matrix_(false)
+    , matrix_built_(false)
+    , match_score_(2)
+    , mismatch_penalty_(2)
+    , gap_opening_penalty_(3)
+    , gap_extending_penalty_(1)
+    , translated_reference_(NULL)
+    , reference_length_(0)
+{
+  score_matrix_ = new int8_t[score_matrix_size_ * score_matrix_size_];
+  memcpy(score_matrix_, score_matrix, sizeof(int8_t) * score_matrix_size_ * score_matrix_size_);
+  translation_matrix_ = new int8_t[translation_matrix_size];
+  memcpy(translation_matrix_, translation_matrix, sizeof(int8_t) * translation_matrix_size);
+  matrix_built_ = true;
+}
+
+
+Aligner::~Aligner(void){ // all cleanup is delegated to Clear()
+  Clear();
+}
+
+// Translates and stores the first min(length, strlen(seq)) characters of seq
+// as the reference used by Align(query, filter, alignment).
+// Returns the stored length; when no matrix is built, only the stored length
+// is reset to 0 and 0 is returned.
+int Aligner::SetReferenceSequence(const char* seq, const int& length) {
+  if (!matrix_built_) {
+    reference_length_ = 0;
+    return 0;
+  }
+
+  // Never read past the terminating NUL, even if the caller's length is larger.
+  const int text_length = static_cast<int>(strlen(seq));
+  const int usable = (text_length > length) ? length : text_length;
+
+  // Replace the previous buffer with one sized for the new reference.
+  CleanReferenceSequence();
+  translated_reference_ = new int8_t[usable];
+
+  const int stored = TranslateBase(seq, usable, translated_reference_);
+  reference_length_ = stored;
+  return stored;
+}
+
+// Maps each of the first `length` characters of `bases` through
+// translation_matrix_ into `translated`; returns the number of characters
+// written (0 when length <= 0).
+// NOTE(review): the character value is used as the index unchanged, so the
+// table must cover every value present in the input - confirm with callers.
+int Aligner::TranslateBase(const char* bases, const int& length, 
+    int8_t* translated) const {
+
+  int written = 0;
+  while (written < length) {
+    translated[written] = translation_matrix_[(int) bases[written]];
+    ++written;
+  }
+
+  return written;
+}
+
+
+// Aligns `query` against the reference stored by SetReferenceSequence().
+// Returns false, without aligning, when no matrix is built, no reference is
+// stored, or the query is empty. Also returns false (with *alignment
+// cleared) when ssw_align reports a failure. On success *alignment holds the
+// converted result, including the mismatch count.
+bool Aligner::Align(const char* query, const Filter& filter, 
+                    Alignment* alignment) const
+{
+  if (!matrix_built_) return false;
+  if (reference_length_ == 0) return false;
+
+  int query_len = strlen(query);
+  if (query_len == 0) return false;
+  int8_t* translated_query = new int8_t[query_len];
+  TranslateBase(query, query_len, translated_query);
+
+  const int8_t score_size = 2;
+  s_profile* profile = ssw_init(translated_query, query_len, score_matrix_, 
+                                score_matrix_size_, score_size);
+
+  uint8_t flag = 0;
+  SetFlag(filter, &flag);
+  // The query length doubles as maskLen for the sub-optimal alignment search.
+  s_align* s_al = ssw_align(profile, translated_reference_, reference_length_,
+                                 static_cast<int>(gap_opening_penalty_), 
+				 static_cast<int>(gap_extending_penalty_),
+				 flag, filter.score_filter, filter.distance_filter, query_len);
+  
+  alignment->Clear();
+  // ssw_align returns a null pointer on failure; never dereference it.
+  const bool aligned = (s_al != NULL);
+  if (aligned) {
+    ConvertAlignment(*s_al, query_len, alignment);
+    alignment->mismatches = CalculateNumberMismatch(*alignment, score_matrix_, translated_reference_, translated_query);
+    align_destroy(s_al);
+  }
+
+  // Free memory. The buffer came from new[], so it is always released with delete[].
+  delete [] translated_query;
+  init_destroy(profile);
+
+  return aligned;
+}
+
+
+// Aligns `query` against the first min(ref_len, strlen(ref)) characters of
+// `ref`, without touching the stored reference.
+// Returns false, without aligning, when no matrix is built, the query is
+// empty, or the usable reference length is not positive. Also returns false
+// (with *alignment cleared) when ssw_align reports a failure. On success
+// *alignment holds the converted result, including the mismatch count.
+bool Aligner::Align(const char* query, const char* ref, const int& ref_len,
+                    const Filter& filter, Alignment* alignment) const
+{
+  if (!matrix_built_) return false;
+  
+  int query_len = strlen(query);
+  if (query_len == 0) return false;
+
+  // calculate the valid length
+  int calculated_ref_length = static_cast<int>(strlen(ref));
+  int valid_ref_len = (calculated_ref_length > ref_len) 
+                      ? ref_len : calculated_ref_length;
+  // Same contract as the stored-reference overload: nothing to align against.
+  if (valid_ref_len <= 0) return false;
+
+  int8_t* translated_query = new int8_t[query_len];
+  TranslateBase(query, query_len, translated_query);
+
+  int8_t* translated_ref = new int8_t[valid_ref_len];
+  TranslateBase(ref, valid_ref_len, translated_ref);
+
+
+  const int8_t score_size = 2;
+  s_profile* profile = ssw_init(translated_query, query_len, score_matrix_, 
+                                score_matrix_size_, score_size);
+
+  uint8_t flag = 0;
+  SetFlag(filter, &flag);
+  // The query length doubles as maskLen for the sub-optimal alignment search.
+  s_align* s_al = ssw_align(profile, translated_ref, valid_ref_len,
+                                 static_cast<int>(gap_opening_penalty_), 
+				 static_cast<int>(gap_extending_penalty_),
+				 flag, filter.score_filter, filter.distance_filter, query_len);
+  
+  alignment->Clear();
+  // ssw_align returns a null pointer on failure; never dereference it.
+  const bool aligned = (s_al != NULL);
+  if (aligned) {
+    ConvertAlignment(*s_al, query_len, alignment);
+    alignment->mismatches = CalculateNumberMismatch(*alignment, score_matrix_, translated_ref, translated_query);
+    align_destroy(s_al);
+  }
+
+  // Free memory. Both buffers came from new[], so they are always released with delete[].
+  delete [] translated_query;
+  delete [] translated_ref;
+  init_destroy(profile);
+
+  return aligned;
+}
+
+// Releases the score matrix, the translation table (unless it is flagged as
+// the default one) and the stored reference, then marks the aligner as not
+// built.
+void Aligner::Clear(void) {
+  delete [] score_matrix_;  // delete[] on a null pointer does nothing
+  score_matrix_ = NULL;
+
+  const bool owns_translation = !default_matrix_;
+  if (owns_translation) delete [] translation_matrix_;
+  translation_matrix_ = NULL;
+
+  CleanReferenceSequence();
+
+  matrix_built_   = false;
+  default_matrix_ = false;
+}
+
+// Resets the scalar settings to their defaults. No buffer is allocated or
+// released here.
+void Aligner::SetAllDefault(void) {
+  // scoring scheme
+  match_score_           = 2;
+  mismatch_penalty_      = 2;
+  gap_opening_penalty_   = 3;
+  gap_extending_penalty_ = 1;
+
+  // matrix and reference bookkeeping
+  score_matrix_size_     = 5;
+  reference_length_      = 0;
+  matrix_built_          = false;
+  default_matrix_        = false;
+}
+
+// Rebuilds the aligner with the default settings. Does nothing and returns
+// false when a matrix is already built.
+bool Aligner::ReBuild(void) {
+  const bool can_rebuild = !matrix_built_;
+  if (can_rebuild) {
+    SetAllDefault();
+    BuildDefaultMatrix();
+  }
+  return can_rebuild;
+}
+
+// Rebuilds the aligner with the given scores and penalties. Does nothing and
+// returns false when a matrix is already built.
+bool Aligner::ReBuild(
+    const uint8_t& match_score,
+    const uint8_t& mismatch_penalty,
+    const uint8_t& gap_opening_penalty,
+    const uint8_t& gap_extending_penalty) {
+  const bool can_rebuild = !matrix_built_;
+  if (can_rebuild) {
+    // Start from the defaults, then override the four scoring values.
+    SetAllDefault();
+    match_score_           = match_score;
+    mismatch_penalty_      = mismatch_penalty;
+    gap_opening_penalty_   = gap_opening_penalty;
+    gap_extending_penalty_ = gap_extending_penalty;
+
+    BuildDefaultMatrix();
+  }
+  return can_rebuild;
+}
+
+// Rebuilds the aligner from a caller-supplied score matrix
+// (score_matrix_size x score_matrix_size cells) and translation table
+// (translation_matrix_size entries). Both are copied. Always returns true.
+//
+// The new matrix is sized by the score_matrix_size argument and the member
+// is updated to match. Matrices the aligner still owns are released first.
+bool Aligner::ReBuild(
+    const int8_t* score_matrix,
+    const int&    score_matrix_size,
+    const int8_t* translation_matrix,
+    const int&    translation_matrix_size) {
+
+  // Drop whatever is still held; never delete the shared static table.
+  delete [] score_matrix_;
+  score_matrix_ = NULL;
+  if (!default_matrix_ && translation_matrix_ != kBaseTranslation)
+    delete [] translation_matrix_;
+  translation_matrix_ = NULL;
+
+  score_matrix_size_ = score_matrix_size;
+  score_matrix_ = new int8_t[score_matrix_size_ * score_matrix_size_];
+  memcpy(score_matrix_, score_matrix, sizeof(int8_t) * score_matrix_size_ * score_matrix_size_);
+  translation_matrix_ = new int8_t[translation_matrix_size];
+  memcpy(translation_matrix_, translation_matrix, sizeof(int8_t) * translation_matrix_size);
+
+  // The translation table is now heap-owned, so Clear() has to delete it.
+  default_matrix_ = false;
+  matrix_built_   = true;
+
+  return true;
+}
+
+// Allocates the score matrix, fills it from match_score_ and
+// mismatch_penalty_, and points the translation table at the shared static
+// kBaseTranslation (flagged through default_matrix_).
+void Aligner::BuildDefaultMatrix(void) {
+  int8_t* scores = new int8_t[score_matrix_size_ * score_matrix_size_];
+  BuildSwScoreMatrix(match_score_, mismatch_penalty_, scores);
+  score_matrix_ = scores;
+
+  translation_matrix_ = kBaseTranslation;
+  default_matrix_ = true;
+  matrix_built_   = true;
+}
+} // namespace StripedSmithWaterman
diff --git a/src/ssw_cpp.h b/src/ssw_cpp.h
new file mode 100644
index 0000000..fb10f4f
--- /dev/null
+++ b/src/ssw_cpp.h
@@ -0,0 +1,216 @@
+#ifndef COMPLETE_STRIPED_SMITH_WATERMAN_CPP_H_
+#define COMPLETE_STRIPED_SMITH_WATERMAN_CPP_H_
+
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+namespace StripedSmithWaterman {
+
+// Result of one alignment, filled in by Aligner::Align.
+struct Alignment {
+  uint16_t sw_score;           // Score of the best alignment
+  uint16_t sw_score_next_best; // Score of the next best alignment
+  int32_t  ref_begin;          // Best alignment: begin position on the reference
+  int32_t  ref_end;            // Best alignment: end position on the reference
+  int32_t  query_begin;        // Best alignment: begin position on the query
+  int32_t  query_end;          // Best alignment: end position on the query
+  int32_t  ref_end_next_best;  // Next best alignment: end position on the reference
+  int32_t  mismatches;         // Number of mismatches of the alignment
+  std::string cigar_string;    // Cigar of the best alignment, as text
+  std::vector<uint32_t> cigar; // Cigar of the best alignment in the BAM format
+                               //   high 28 bits: length
+                               //   low 4 bits: M/I/D/S/X (0/1/2/4/8);
+
+  // Zeroes every numeric field and empties both cigar representations.
+  void Clear() {
+    cigar.clear();
+    cigar_string.clear();
+
+    sw_score    = sw_score_next_best = 0;
+    ref_begin   = ref_end = ref_end_next_best = 0;
+    query_begin = query_end = 0;
+    mismatches  = 0;
+  }
+};
+
+// Selects which optional parts of an Alignment are computed.
+struct Filter {
+  // NOTE: Whatever the filter says, these five fields are always reported:
+  //       sw_score; sw_score_next_best; ref_end; query_end; ref_end_next_best.
+
+  bool report_begin_position;    // Report ref_begin and query_begin.
+                                 //   When unset, ref_begin and query_begin are -1.
+  bool report_cigar;             // Report cigar_string and cigar.
+                                 //   report_begin_position is automatically TRUE.
+
+  // With *report_cigar* set, cigar_string and cigar are only reported for
+  //   alignments that pass both of the filters below.
+  uint16_t score_filter;         // score >= score_filter
+  uint16_t distance_filter;      // ((ref_end - ref_begin) < distance_filter) &&
+                                 // ((query_end - query_begin) < distance_filter)
+
+  // Defaults: report everything, accept any score, distance limit 32767.
+  Filter()
+    : report_begin_position(true),
+      report_cigar(true),
+      score_filter(0),
+      distance_filter(32767) {}
+};
+
+// Smith-Waterman aligner: owns a score matrix, a translation matrix and an
+// optional stored reference sequence, and aligns queries against a reference.
+class Aligner {
+ public:
+  // =========
+  // @function Construct an Aligner on default values.
+  //             The function will build the {A,C,G,T,N} aligner.
+  //             If you target other alphabets, please use the matrix-based
+  //             constructor and pass the corresponding matrices in.
+  // =========
+  Aligner(void);
+  
+  // =========
+  // @function Construct an Aligner by assigning scores.
+  //             The function will build the {A,C,G,T,N} aligner.
+  //             If you target other alphabets, please use the matrix-based
+  //             constructor and pass the corresponding matrices in.
+  // =========
+  Aligner(const uint8_t& match_score,
+          const uint8_t& mismatch_penalty,
+	  const uint8_t& gap_opening_penalty,
+	  const uint8_t& gap_extending_penalty);
+  
+  // =========
+  // @function Construct an Aligner from the given matrices.
+  // =========
+  Aligner(const int8_t* score_matrix, 
+          const int&    score_matrix_size,
+          const int8_t* translation_matrix,
+	  const int&    translation_matrix_size);
+  
+  ~Aligner(void);
+
+  // =========
+  // @function Build the reference sequence so that the Align overload
+  //             without a reference argument can be used;
+  //             otherwise the reference has to be given when aligning.
+  //           [NOTICE] If a sequence already exists, it is deleted
+  //                    and replaced.
+  // @param    seq    The reference bases;
+  //                  [NOTICE] It does not need to be null terminated.
+  // @param    length The number of bases that will be built.
+  // @return   The length of the built bases.
+  // =========
+  int SetReferenceSequence(const char* seq, const int& length);
+
+  // Releases the stored reference sequence, if there is one.
+  void CleanReferenceSequence(void);
+
+  // =========
+  // @function Set penalties for opening and extending gaps
+  //           [NOTICE] The defaults are 3 and 1 respectively.
+  // =========
+  void SetGapPenalty(const uint8_t& opening, const uint8_t& extending) {
+    gap_opening_penalty_ = opening;
+    gap_extending_penalty_ = extending;
+  };
+
+  // =========
+  // @function Set the match score and the mismatch penalty
+  //           [NOTICE] The defaults are 2 and 2 respectively.
+  //           NOTE(review): only the two members are assigned here; whether
+  //           an already built score matrix picks the new values up is not
+  //           visible from this header - confirm.
+  // =========
+  void SetMismatchPenalty(const uint8_t& match, const uint8_t& mismatch) {
+    match_score_ = match;
+    mismatch_penalty_ = mismatch;
+  };
+
+  // =========
+  // @function Align the query against the reference that is set by
+  //             SetReferenceSequence.
+  // @param    query     The query sequence.
+  // @param    filter    The filter for the alignment.
+  // @param    alignment The container that receives the result.
+  // @return   True: succeed; false: fail.
+  // =========
+  bool Align(const char* query, const Filter& filter, Alignment* alignment) const;
+
+  // =========
+  // @function Align the query against the reference.
+  //           [NOTICE] The reference won't replace the reference 
+  //                      set by SetReferenceSequence.
+  // @param    query     The query sequence.
+  // @param    ref       The reference sequence.
+  //                     [NOTICE] It does not need to be null terminated.
+  // @param    ref_len   The length of the reference sequence.
+  // @param    filter    The filter for the alignment.
+  // @param    alignment The container that receives the result.
+  // @return   True: succeed; false: fail.
+  // =========
+  bool Align(const char* query, const char* ref, const int& ref_len, 
+             const Filter& filter, Alignment* alignment) const;
+
+  // @function Clear up all containers and thus the aligner is disabled.
+  //             To rebuild the aligner please use the ReBuild functions.
+  void Clear(void);
+
+  // =========
+  // @function Rebuild the aligner's ability on default values.
+  //           [NOTICE] If the aligner is not cleaned, rebuilding will fail.
+  // @return   True: succeed; false: fail.
+  // =========
+  bool ReBuild(void);
+
+  // =========
+  // @function Rebuild the aligner's ability by assigning scores.
+  //           [NOTICE] If the aligner is not cleaned, rebuilding will fail.
+  // @return   True: succeed; false: fail.
+  // =========
+  bool ReBuild(
+          const uint8_t& match_score,
+          const uint8_t& mismatch_penalty,
+	  const uint8_t& gap_opening_penalty,
+	  const uint8_t& gap_extending_penalty);
+  
+  // =========
+  // @function Rebuild the aligner's ability from the given matrices.
+  //           [NOTICE] If the aligner is not cleaned, rebuilding will fail.
+  // @return   True: succeed; false: fail.
+  // =========
+  bool ReBuild(
+          const int8_t* score_matrix, 
+          const int&    score_matrix_size,
+          const int8_t* translation_matrix,
+	  const int&    translation_matrix_size);
+  
+ private:
+  // Scoring and translation tables plus the flags describing their state.
+  int8_t* score_matrix_;
+  int     score_matrix_size_;
+  int8_t* translation_matrix_;
+  bool    default_matrix_;
+  bool    matrix_built_;
+
+  uint8_t match_score_;           // default: 2
+  uint8_t mismatch_penalty_;      // default: 2
+  uint8_t gap_opening_penalty_;   // default: 3
+  uint8_t gap_extending_penalty_; // default: 1
+
+  // Stored reference sequence and its length (0 when none is stored).
+  int8_t* translated_reference_;
+  int32_t reference_length_;
+
+  int TranslateBase(const char* bases, const int& length, int8_t* translated) const;
+  void SetAllDefault(void);
+  void BuildDefaultMatrix(void);
+  
+  // Declared private so that an Aligner cannot be copied from outside the class.
+  Aligner& operator= (const Aligner&);
+  Aligner (const Aligner&); 
+}; // class Aligner
+
+
+// ================
+// inline functions
+// ================
+// Releases the stored reference sequence; a no-op when none is stored.
+inline void Aligner::CleanReferenceSequence(void) {
+  if (reference_length_ != 0) {
+    // NOTE(review): a one-element buffer is released with scalar delete and
+    // anything longer with delete[]. That is only right if the allocation in
+    // SetReferenceSequence (not visible here) makes the same distinction -
+    // confirm.
+    if (reference_length_ > 1) {
+      delete [] translated_reference_;
+    } else {
+      delete translated_reference_;
+    }
+
+    reference_length_ = 0;
+  }
+}
+} // namespace StripedSmithWaterman
+
+#endif // COMPLETE_STRIPED_SMITH_WATERMAN_CPP_H_
diff --git a/src/vcf2dag.cpp b/src/vcf2dag.cpp
new file mode 100644
index 0000000..1257cd0
--- /dev/null
+++ b/src/vcf2dag.cpp
@@ -0,0 +1,168 @@
+#include "Variant.h"
+#include "BedReader.h"
+#include "intervaltree/IntervalTree.h"
+#include <getopt.h>
+#include "fastahack/Fasta.h"
+#include <algorithm>
+#include <list>
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+
+// Prints the usage text of vcf2dag to stderr and terminates with status 0.
+void printSummary(char** argv) {
+    // synopsis
+    cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl;
+    cerr << endl;
+
+    // options
+    cerr << "options:" << endl;
+    cerr << "    -r, --reference FILE         FASTA reference file." << endl;
+    cerr << endl;
+
+    // description
+    cerr << "Modify the VCF file so that homozygous regions are included as REF/. calls." << endl;
+    cerr << "For each ref and alt allele, assign an index.  These steps are sufficient to" << endl;
+    cerr << "enable use of the VCF as a DAG (specifically a partially-ordered graph)." << endl;
+
+    exit(0);
+}
+
+// Writes one reference-only record to stdout: `length` bases of sequenceName
+// starting at the 1-based position `start`, tagged with the next unique id.
+static void emitReferenceRecord(VariantCallFile& variantFile,
+                                FastaReference& reference,
+                                const string& sequenceName,
+                                long int start,
+                                long int length,
+                                const string& idname,
+                                long int& uid) {
+    Variant refvar(variantFile);
+    refvar.quality = 0;
+    refvar.position = start;
+    refvar.sequenceName = sequenceName;
+    // record positions are 1-based, reference offsets are 0-based
+    refvar.ref = reference.getSubSequence(sequenceName, start - 1, length);
+    refvar.info[idname+".ref"].push_back(convert(uid++));
+    cout << refvar << endl;
+}
+
+// Reads a VCF (file argument or stdin) and writes it back to stdout with a
+// reference-only record inserted for every stretch of reference between two
+// variants, before the first and after the last variant of each sequence.
+// Every ref and alt allele receives a unique id in INFO fields id.ref / id.alt.
+int main(int argc, char** argv) {
+
+    string fastaFileName;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                /* These options set a flag. */
+                //{"verbose", no_argument,       &verbose_flag, 1},
+                {"help", no_argument, 0, 'h'},
+                {"reference", required_argument, 0, 'r'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hr:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+        case 'r':
+            fastaFileName = string(optarg);
+            break;
+
+        case 'h':
+            printSummary(argv);
+            break;
+
+        case '?':
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open VCF file" << endl;
+        exit(1);
+    }
+
+    FastaReference reference;
+    if (fastaFileName.empty()) {
+        cerr << "a reference is required" << endl;
+        exit(1);
+    } else {
+        reference.open(fastaFileName);
+    }
+
+    string idname = "id";
+    long int uid = 0;
+
+    variantFile.addHeaderLine("##INFO=<ID="+idname+".alt,Number=A,Type=Integer,Description=\"Unique numerical identifier of alt allele.\">");
+    variantFile.addHeaderLine("##INFO=<ID="+idname+".ref,Number=1,Type=Integer,Description=\"Unique numerical identifier of ref allele.\">");
+    cout << variantFile.header << endl;
+
+    // 1-based position of the first base not yet covered by an emitted record
+    long int last_end = 1;
+    string sequenceName;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+
+        if (sequenceName.empty()) {
+            sequenceName = var.sequenceName;
+        } else if (sequenceName != var.sequenceName) {
+            // Finish the previous sequence: emit whatever lies between the
+            // last variant and the end of that sequence. The amount is taken
+            // from the length of the previous sequence; var already belongs
+            // to the next one, so its position says nothing about it.
+            long int tail = (long int) reference.sequenceLength(sequenceName) - (last_end - 1);
+            if (tail > 0) {
+                emitReferenceRecord(variantFile, reference, sequenceName, last_end, tail, idname, uid);
+            }
+            last_end = 1;
+            sequenceName = var.sequenceName;
+        }
+
+        // reference sequence between the previous record and this variant
+        if (var.position - last_end > 0) {
+            emitReferenceRecord(variantFile, reference, sequenceName, last_end,
+                                var.position - last_end, idname, uid);
+        }
+
+        // now manipulate this record
+        vector<string>& refidx = var.info[idname+".ref"];
+        refidx.clear(); refidx.push_back(convert(uid++));
+
+        vector<string>& idxs = var.info[idname+".alt"];
+        idxs.clear();
+        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+            idxs.push_back(convert(uid++));
+        }
+        cout << var << endl;
+
+        last_end = var.position + var.ref.size();
+
+    }
+
+    // Trailing reference of the last sequence. Nothing to do when the input
+    // had no variants at all: there is no sequence to look up then.
+    if (!sequenceName.empty()) {
+        // last_end is 1-based, so (last_end - 1) bases are already covered
+        long int tail = (long int) reference.sequenceLength(sequenceName) - (last_end - 1);
+        if (tail > 0) {
+            emitReferenceRecord(variantFile, reference, sequenceName, last_end, tail, idname, uid);
+        }
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcf2fasta.cpp b/src/vcf2fasta.cpp
new file mode 100644
index 0000000..88de80a
--- /dev/null
+++ b/src/vcf2fasta.cpp
@@ -0,0 +1,264 @@
+#include "Variant.h"
+#include "convert.h"
+#include "join.h"
+#include "split.h"
+#include <set>
+#include <getopt.h>
+#include "fastahack/Fasta.h"
+#include <iostream>
+#include <fstream>
+
+using namespace std;
+using namespace vcf;
+
+#define ALLELE_NULL -1
+
+
+// One FASTA output file. Sequence is appended piecewise through write() and
+// wrapped into lines of linewidth characters; the unfinished last line is
+// written when the file is finished (on destruction or when reopened).
+class SampleFastaFile {
+
+public:
+
+    ofstream fastafile;
+    long int pos;
+    string linebuffer;   // sequence accepted but not yet written out
+    string filename;
+    string seqname;
+    int linewidth;       // characters per line; <= 0 disables wrapping
+
+    // Appends sequence and writes out every complete line. A line is only
+    // written once more sequence follows it, so up to linewidth characters
+    // stay buffered.
+    void write(const string& sequence) {
+        linebuffer += sequence;
+        if (linewidth <= 0) return; // nothing to wrap at (and no endless loop)
+        const size_t width = static_cast<size_t>(linewidth);
+        // Walk the buffer with an offset and trim it once, instead of copying
+        // the whole remainder for every line that is written.
+        size_t written = 0;
+        while (linebuffer.length() - written > width) {
+            fastafile << linebuffer.substr(written, width) << endl;
+            written += width;
+        }
+        if (written > 0) linebuffer.erase(0, written);
+    }
+
+    // pos and linewidth start out defined even before open() is called
+    SampleFastaFile(void) : pos(0), linewidth(80) { }
+
+    // Opens m_filename for writing and emits the ">m_seqname" header line.
+    // A file that is still open is finished first. Terminates the program
+    // with status 1 when the file cannot be opened.
+    void open(const string& m_filename, const string& m_seqname, int m_linewidth = 80) {
+        finish(); // pending sequence belongs to the previous file
+        filename = m_filename;
+        seqname = m_seqname;
+        pos = 0;
+        linewidth = m_linewidth;
+        fastafile.open(filename.c_str());
+        if (!fastafile.is_open()) {
+            cerr << "could not open " << filename << " for writing, exiting" << endl;
+            exit(1);
+        }
+        fastafile << ">" << seqname << endl;
+    }
+
+    // Writes the buffered remainder as the last line and closes the file.
+    // Does nothing when no file is open.
+    void finish(void) {
+        if (fastafile.is_open()) {
+            write(""); // flush
+            fastafile << linebuffer << endl;
+            linebuffer.clear();
+            fastafile.close();
+        }
+    }
+
+    ~SampleFastaFile(void) {
+        finish();
+    }
+
+};
+
+// Prints the usage text of vcf2fasta to stderr and terminates with status 0.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] [file]" << endl
+         << endl
+         << "options:" << endl
+         << "    -f, --reference REF     Use this reference when decomposing samples." << endl
+         << "    -p, --prefix PREFIX     Affix this output prefix to each file, none by default" << endl
+         << "    -P, --default-ploidy N  Set a default ploidy for samples which do not have information in the first record (2)." << endl
+         << endl
+         // file names are built in initOutputs: <prefix><sample>_<seq>:<N>.fasta
+         << "Outputs sample_seq:N.fasta for each sample, reference sequence, and chromosomal copy N in [0, ploidy)." << endl;
+        //<< "Impossible regions of haplotypes are noted with an error message.  The corresponding" << endl
+        //<< "regions of the output FASTA files will be marked as N." << endl
+    exit(0);
+}
+
+// Stores the ploidy of each sample of var in ploidies, keyed by sample name.
+// A sample whose genotype yields ploidy 0 gets defaultPloidy instead.
+// Returns the ploidies map that was passed in.
+map<string, int>& getPloidies(Variant& var, map<string, int>& ploidies, int defaultPloidy=2) {
+    vector<string>::iterator sample = var.sampleNames.begin();
+    while (sample != var.sampleNames.end()) {
+        const int observed = ploidy(decomposeGenotype(var.getGenotype(*sample)));
+        ploidies[*sample] = (observed == 0) ? defaultPloidy : observed;
+        ++sample;
+    }
+    return ploidies;
+}
+
+// Destroys every output file held in outputs (which finishes and closes it)
+// and empties the container.
+void closeOutputs(map<string, map<int, SampleFastaFile*> >& outputs) {
+    for (map<string, map<int, SampleFastaFile*> >::iterator f = outputs.begin(); f != outputs.end(); ++f) {
+        for (map<int, SampleFastaFile*>::iterator s = f->second.begin(); s != f->second.end(); ++s) {
+            delete s->second;
+        }
+    }
+    // Drop the entries together with the objects. Left in place, the deleted
+    // pointers stay non-null, and a caller that reuses the container (as
+    // initOutputs does for every new sequence) would treat them as live.
+    outputs.clear();
+}
+
+// Closes all current outputs and opens a fresh set for sequence seqName: one
+// file per sample and chromosomal copy, named
+// <prefix><sample>_<seqName>:<copy>.fasta.
+void initOutputs(map<string, map<int, SampleFastaFile*> >& outputs, vector<string>& sampleNames, string& seqName, map<string, int>& ploidies, string& prefix) {
+    closeOutputs(outputs);
+    // closeOutputs has deleted the previous objects; forget their addresses so
+    // that none of them is reused below or found by later iteration.
+    outputs.clear();
+    for (vector<string>::iterator s = sampleNames.begin(); s != sampleNames.end(); ++s) {
+        map<int, SampleFastaFile*>& outs = outputs[*s];
+        int p = ploidies[*s];
+        for (int i = 0; i < p; ++i) {
+            string name = prefix + *s + "_" + seqName + ":" + convert(i) + ".fasta";
+            SampleFastaFile*& slot = outs[i]; // null for a new entry
+            if (!slot) {
+                slot = new SampleFastaFile;
+            }
+            slot->open(name, seqName);
+        }
+    }
+}
+
+// Appends sequence to every output file held in outputs.
+static void writeToAllOutputs(map<string, map<int, SampleFastaFile*> >& outputs, const string& sequence) {
+    for (map<string, map<int, SampleFastaFile*> >::iterator s = outputs.begin(); s != outputs.end(); ++s) {
+        map<int, SampleFastaFile*>& f = s->second;
+        for (map<int, SampleFastaFile*>::iterator o = f.begin(); o != f.end(); ++o) {
+            o->second->write(sequence);
+        }
+    }
+}
+
+// Appends the reference of seqName from 0-based offset `from` up to the end
+// of the sequence to every output file.
+static void writeReferenceTail(map<string, map<int, SampleFastaFile*> >& outputs, FastaReference& reference, string& seqName, long int from) {
+    long int remaining = (long int) reference.sequenceLength(seqName) - from;
+    if (remaining > 0) {
+        writeToAllOutputs(outputs, reference.getSubSequence(seqName, from, remaining));
+    }
+}
+
+// Converts a phased VCF into per-sample, per-copy FASTA files: each haplotype
+// receives the reference between variants plus the allele its phased genotype
+// selects at every variant. Terminates the program with status 1 on unphased,
+// overlapping or out-of-order variants, on a mid-sequence change of ploidy,
+// and on genotypes that cannot be resolved to an allele and an output file.
+void vcf2fasta(VariantCallFile& variantFile, FastaReference& reference, string& outputPrefix, int defaultPloidy) {
+    string lastSeq;
+    long int lastEnd = 0; // 0-based offset of the first base not yet written
+    map<string, map<int, SampleFastaFile*> > outputs;
+    Variant var(variantFile);
+    map<string, int> lastPloidies;
+    while (variantFile.getNextVariant(var)) {
+        if (!var.isPhased()) {
+            cerr << "variant " << var.sequenceName << ":" << var.position << " is not phased, cannot convert to fasta" << endl;
+            exit(1);
+        }
+        map<string, int> ploidies;
+        getPloidies(var, ploidies, defaultPloidy);
+        if (var.sequenceName != lastSeq || lastSeq.empty()) {
+            if (!lastSeq.empty()) {
+                // finish the previous sequence before its files are closed
+                writeReferenceTail(outputs, reference, lastSeq, lastEnd);
+            }
+            initOutputs(outputs, var.sampleNames, var.sequenceName, ploidies, outputPrefix);
+            lastSeq = var.sequenceName;
+            // A new sequence starts at offset 0. Keeping the end of the
+            // previous sequence here would trip the ordering check below and
+            // cut the leading reference of the new sequence.
+            lastEnd = 0;
+        } else if (!lastPloidies.empty() && lastPloidies != ploidies) {
+            cerr << "cannot handle mid-sequence change of ploidy" << endl;
+            // in principle it should be possible...
+            // it's a matter of representation, GFASTA anyone?
+            exit(1);
+        }
+        lastPloidies = ploidies;
+        if (var.position < lastEnd) {
+            cerr << var.position << " vs " << lastEnd << endl;
+            cerr << "overlapping or out-of-order variants at " << var.sequenceName << ":" << var.position << endl;
+            exit(1);
+        }
+        // get reference sequences implied by last->current variant
+        string ref5prime;
+        if (var.position - 1 - lastEnd > 0) {
+            ref5prime = reference.getSubSequence(var.sequenceName, lastEnd, var.position - 1 - lastEnd);
+        }
+        // write alt/ref seqs for current variant based on phased genotypes
+        for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
+            string& sample = *s;
+            vector<int> gt = decomposePhasedGenotype(var.getGenotype(sample));
+            // assume no-call == ref?
+            if (gt.empty()) {
+                cerr << "empty genotype for sample " << *s << " at " << var.sequenceName << ":" << var.position << endl;
+                exit(1);
+            }
+            map<int, SampleFastaFile*>& sampleOutputs = outputs[sample];
+            int i = 0;
+            for (vector<int>::iterator g = gt.begin(); g != gt.end(); ++g, ++i) {
+                // Report unusable genotypes instead of dying on an uncaught
+                // out_of_range: a missing or out-of-range allele index, or
+                // more alleles than this sample has output files.
+                map<int, SampleFastaFile*>::iterator out = sampleOutputs.find(i);
+                if (out == sampleOutputs.end() || *g < 0 || (size_t) *g >= var.alleles.size()) {
+                    cerr << "cannot resolve allele " << *g << " of copy " << i << " for sample " << sample
+                         << " at " << var.sequenceName << ":" << var.position << endl;
+                    exit(1);
+                }
+                out->second->write(ref5prime + var.alleles[*g]);
+            }
+        }
+        // var.position is 1-based; lastEnd is the 0-based end of the ref allele
+        lastEnd = (var.position - 1) + var.ref.size();
+    }
+    // write last sequences (nothing to write when the input had no variants)
+    if (!lastSeq.empty()) {
+        writeReferenceTail(outputs, reference, lastSeq, lastEnd);
+    }
+    // deleting the outputs writes their last lines and closes the files
+    closeOutputs(outputs);
+}
+
+// Entry point of vcf2fasta: parses the options, opens the reference and the
+// VCF (file argument or stdin) and converts the phased genotypes to FASTA.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+    string fastaFileName;
+    int defaultPloidy = 2; // default announced by the usage text
+    string outputPrefix;
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                /* These options set a flag. */
+                //{"verbose", no_argument,       &verbose_flag, 1},
+                {"help", no_argument, 0, 'h'},
+                {"reference", required_argument, 0, 'f'},
+                {"prefix", required_argument, 0, 'p'},
+                {"default-ploidy", required_argument, 0, 'P'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        // only options handled below are listed; anything else is reported by
+        // getopt_long as '?' rather than reaching abort()
+        c = getopt_long (argc, argv, "hf:p:P:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+        case 'f':
+            fastaFileName = optarg;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            break;
+
+        case 'p':
+            outputPrefix = optarg;
+            break;
+
+        case 'P': {
+            // reject anything that is not a positive whole number
+            char* end = 0;
+            long int requested = strtol(optarg, &end, 10);
+            if (end == optarg || *end != '\0' || requested < 1) {
+                cerr << "invalid default ploidy: " << optarg << endl;
+                exit(1);
+            }
+            defaultPloidy = (int) requested;
+            break;
+        }
+
+        case '?':
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    FastaReference reference;
+    if (fastaFileName.empty()) {
+        cerr << "a reference is required for haplotype allele generation" << endl;
+        printSummary(argv);
+        exit(1);
+    }
+    reference.open(fastaFileName);
+
+    if (optind < argc) {
+        string filename = argv[optind];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    vcf2fasta(variantFile, reference, outputPrefix, defaultPloidy);
+
+    return 0;
+
+}
+
diff --git a/src/vcf2tsv.cpp b/src/vcf2tsv.cpp
new file mode 100644
index 0000000..9a1e2dd
--- /dev/null
+++ b/src/vcf2tsv.cpp
@@ -0,0 +1,241 @@
+#include "Variant.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+// Prints the usage text of vcf2tsv to stderr and terminates with status 1.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [-n null_string] [-g]" << " [vcf file]" << endl;
+    cerr << "Converts stdin or given VCF file to tab-delimited format, using null string to replace empty values in the table." << endl;
+    cerr << "Specifying -g will output one line per sample with genotype information." << endl;
+
+    exit(1);
+}
+
+
+// Writes the INFO columns of var to out, each preceded by a tab: first one
+// column per name in infofields, then one 0/1 column per name in infoflags.
+// A field holding a single value prints that value. A field with one value per
+// alt allele prints the value at altindex, or all values joined by commas
+// when altindex is negative. Anything else prints nullval.
+static void writeInfoColumns(ostream& out, Variant& var,
+                             vector<string>& infofields, vector<string>& infoflags,
+                             const string& nullval, int altindex) {
+    for (vector<string>::iterator i = infofields.begin(); i != infofields.end(); ++i) {
+        out << "\t";
+        map<string, vector<string> >::iterator f = var.info.find(*i);
+        if (f == var.info.end()) {
+            out << nullval; // null
+            continue;
+        }
+        vector<string>& value = f->second;
+        if (value.size() == 1) {
+            out << value.front();
+        } else if (value.size() == var.alt.size()) {
+            if (altindex < 0) {
+                out << join(value, ",");
+            } else {
+                out << value.at(altindex);
+            }
+        } else {
+            out << nullval; // null
+        }
+    }
+
+    for (vector<string>::iterator i = infoflags.begin(); i != infoflags.end(); ++i) {
+        out << "\t";
+        if (var.infoFlags.find(*i) != var.infoFlags.end()) {
+            out << 1;
+        } else {
+            out << 0;
+        }
+    }
+}
+
+// Entry point of vcf2tsv: writes the VCF given as argument (or read from
+// stdin) to stdout as a tab-separated table. Without -g there is one row per
+// alt allele; with -g there is one row per sample, with the genotype fields
+// appended.
+int main(int argc, char** argv) {
+
+    string nullval;
+    bool genotypes = false;
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+        {
+            /* These options set a flag. */
+            //{"verbose", no_argument,       &verbose_flag, 1},
+            {"help", no_argument, 0, 'h'},
+            {"null-value", required_argument, 0, 'n'},
+            {"genotypes", no_argument, 0, 'g'},
+            {0, 0, 0, 0}
+        };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hn:g",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+        case 'n':
+            nullval = optarg;
+            break;
+
+        case 'g':
+            genotypes = true;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            break;
+
+        case '?':
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        if (!variantFile.open(std::cin)) {
+            if (argc == 1) {
+                printSummary(argv);
+            } else {
+                cerr << "could not open stdin for reading as VCF" << endl;
+                exit(1);
+            }
+        }
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+    // obtain all possible field names; flags get their own 0/1 columns
+    vector<string> infofields;
+    vector<string> infoflags;
+
+    for (map<string, VariantFieldType>::iterator i = variantFile.infoTypes.begin(); i != variantFile.infoTypes.end(); ++i) {
+        if (i->second == FIELD_BOOL) {
+            infoflags.push_back(i->first);
+        } else {
+            infofields.push_back(i->first);
+        }
+    }
+
+    vector<string> formatfields;
+    if (genotypes) {
+        for (map<string, VariantFieldType>::iterator f = variantFile.formatTypes.begin(); f != variantFile.formatTypes.end(); ++f) {
+            formatfields.push_back(f->first);
+        }
+    }
+
+    // write header
+
+    // defaults
+    cout << "CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER";
+
+    // configurable info field
+    for (vector<string>::iterator i = infofields.begin(); i != infofields.end(); ++i) {
+        cout << "\t" << *i;
+    }
+    for (vector<string>::iterator i = infoflags.begin(); i != infoflags.end(); ++i) {
+        cout << "\t" << *i;
+    }
+
+    if (genotypes) {
+        cout << "\t" << "SAMPLE";
+        for (vector<string>::iterator f = formatfields.begin(); f != formatfields.end(); ++f) {
+            cout << "\t" << *f;
+        }
+    }
+    cout << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+
+        if (!genotypes) {
+
+            // one row per alt allele
+            int altindex = 0;
+            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a, ++altindex) {
+
+                cout << var.sequenceName << "\t"
+                     << var.position << "\t"
+                     << var.id << "\t"
+                     << var.ref << "\t"
+                     << *a << "\t"
+                     << var.quality << "\t"
+                     << var.filter;
+
+                writeInfoColumns(cout, var, infofields, infoflags, nullval, altindex);
+
+                cout << endl;
+
+            }
+        } else {
+
+            // per-genotype output: the site columns are identical for every
+            // sample, so they are rendered once and repeated
+            stringstream o;
+
+            o << var.sequenceName << "\t"
+              << var.position << "\t"
+              << var.id << "\t"
+              << var.ref << "\t"
+              << join(var.alt, ",") << "\t"
+              << var.quality << "\t"
+              << var.filter;
+
+            writeInfoColumns(o, var, infofields, infoflags, nullval, -1);
+
+            string siteinfo = o.str();
+
+            for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
+                cout << siteinfo;
+                cout << "\t" << s->first;
+                map<string, vector<string> >& sample = s->second;
+                for (vector<string>::iterator f = formatfields.begin(); f != formatfields.end(); ++f) {
+                    map<string, vector<string> >::iterator field = sample.find(*f);
+                    if (field != sample.end()) {
+                        cout << "\t" << join(field->second, ",");
+                    } else {
+                        cout << "\t" << nullval;
+                    }
+                }
+                cout << endl;
+            }
+        }
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfaddinfo.cpp b/src/vcfaddinfo.cpp
new file mode 100644
index 0000000..cff66d6
--- /dev/null
+++ b/src/vcfaddinfo.cpp
@@ -0,0 +1,111 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+// Copy into varA every INFO entry of varB whose key varA does not already
+// carry; entries already present in varA are left untouched.
+void addInfo(Variant& varA, Variant& varB) {
+    typedef map<string, vector<string> >::const_iterator InfoIter;
+    for (InfoIter entry = varB.info.begin(); entry != varB.info.end(); ++entry) {
+        varA.info.insert(*entry); // map::insert is a no-op when the key exists
+    }
+}
+
+// Streams the records of the first VCF to stdout, adding to each one the INFO
+// keys that a record at the same sequence name and position in the second
+// VCF carries and the first lacks.  Both inputs are consumed front to back
+// and compared with operator< on (sequenceName, position), so the merge
+// assumes both are sorted in that order.  Returns 1 on usage or open errors.
+int main(int argc, char** argv) {
+    if (argc != 3) {
+        cerr << "usage: " << argv[0] << " <vcf file> <vcf file>" << endl
+             << "Adds info fields from the second file which are not present in the first vcf file." << endl;
+        return 1;
+    }
+    string filenameA = argv[1];
+    string filenameB = argv[2];
+    if (filenameA == filenameB) {
+        cerr << "it won't help to add info data from the same file!" << endl;
+        return 1;
+    }
+    VariantCallFile variantFileA;
+    if (filenameA == "-") {
+        variantFileA.open(std::cin);
+    } else {
+        variantFileA.open(filenameA);
+    }
+
+    VariantCallFile variantFileB;
+    if (filenameB == "-") {
+        variantFileB.open(std::cin);
+    } else {
+        variantFileB.open(filenameB);
+    }
+
+    // say which input failed instead of exiting silently
+    if (!variantFileA.is_open()) {
+        cerr << "could not open " << filenameA << endl;
+        return 1;
+    }
+    if (!variantFileB.is_open()) {
+        cerr << "could not open " << filenameB << endl;
+        return 1;
+    }
+    Variant varA(variantFileA);
+    Variant varB(variantFileB);
+    // prime both cursors with their first record
+    variantFileA.getNextVariant(varA);
+    variantFileB.getNextVariant(varB);
+    variantFileA.header = unionInfoHeaderLines(variantFileA.header, variantFileB.header);
+    cout << variantFileA.header << endl;
+
+    do {
+        // skip records of B that precede the current record of A
+        while (!variantFileB.done()
+               && (varB.sequenceName < varA.sequenceName
+                   || (varB.sequenceName == varA.sequenceName && varB.position < varA.position))
+            ) {
+            variantFileB.getNextVariant(varB);
+        }
+
+        // records of A that precede B have no partner: emit them unchanged
+        while (!variantFileA.done()
+               && (varA.sequenceName < varB.sequenceName
+                   || (varA.sequenceName == varB.sequenceName && varA.position < varB.position))
+            ) {
+            cout << varA << endl;
+            variantFileA.getNextVariant(varA);
+        }
+
+        // A may now have moved past B; let B catch up again
+        while (!variantFileB.done()
+               && (varB.sequenceName < varA.sequenceName
+                   || (varB.sequenceName == varA.sequenceName && varB.position < varA.position))
+            ) {
+            variantFileB.getNextVariant(varB);
+        }
+
+        // same sequence and position: merge B's INFO into A, then advance both
+        while (!variantFileA.done() && varA.sequenceName == varB.sequenceName && varA.position == varB.position) {
+            addInfo(varA, varB);
+            cout << varA << endl;
+            variantFileA.getNextVariant(varA);
+            variantFileB.getNextVariant(varB);
+        }
+    } while (!variantFileA.done() && !variantFileB.done());
+
+    // A is not done, so the loop ended because B was: print varA and the rest of A
+    if (!variantFileA.done()) {
+        cout << varA << endl;
+        while (variantFileA.getNextVariant(varA)) {
+            cout << varA << endl;
+        }
+    }
+    return 0;
+}
+
diff --git a/src/vcfafpath.cpp b/src/vcfafpath.cpp
new file mode 100644
index 0000000..e87ab80
--- /dev/null
+++ b/src/vcfafpath.cpp
@@ -0,0 +1,52 @@
+#include "Variant.h"
+#include <algorithm>
+#include <vector>
+#include <map>
+
+using namespace std;
+using namespace vcf;
+
+// For each record, prints the REF allele followed by the ALT alleles grouped
+// by their INFO/AF value, highest AF first: "REF -> a, b -> c".  The VCF
+// is read from argv[1] when given, otherwise from stdin; returns 1 when the
+// input cannot be opened.
+int main(int argc, char** argv) {
+    VariantCallFile variantFile;
+
+    if (argc > 1) {
+        string filename = argv[1];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        // key: allele frequency, value: ALT alleles reported at that frequency
+        map<double, vector<string> > allelesByAf;
+        // find() rather than operator[] so a record without AF is not given an empty one
+        map<string, vector<string> >::iterator afit = var.info.find("AF");
+        if (afit != var.info.end()) {
+            const vector<string>& afstr = afit->second;
+            // AF may be shorter than ALT in malformed records; never read past either
+            for (size_t k = 0; k < afstr.size() && k < var.alt.size(); ++k) {
+                double freq = 0; // initialized so a failed convert() cannot leak garbage
+                if (convert(afstr[k], freq)) { // skip values convert() reports as unparsable
+                    allelesByAf[freq].push_back(var.alt[k]);
+                }
+            }
+        }
+        cout << var.ref;
+        for (map<double, vector<string> >::reverse_iterator a = allelesByAf.rbegin(); a != allelesByAf.rend(); ++a) {
+            cout << " -> " << join(a->second, ", ");
+        }
+        cout << endl;
+    }
+
+    return 0;
+}
+
diff --git a/src/vcfallelicprimitives.cpp b/src/vcfallelicprimitives.cpp
new file mode 100644
index 0000000..95b6333
--- /dev/null
+++ b/src/vcfallelicprimitives.cpp
@@ -0,0 +1,414 @@
+#include "Variant.h"
+#include "convert.h"
+#include "join.h"
+#include "split.h"
+#include <set>
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+#define ALLELE_NULL -1
+
+double convertStrDbl(const string& s) { // parse s as a double via convert()
+    double r = 0; // defined result (0) even if convert() leaves r unwritten
+    convert(s, r);
+    return r;
+}
+
+// Print usage and option help to stderr, then end the process with exit(0);
+// control never returns to the caller.  argv[0] supplies the program name.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] [file]" << endl << endl
+         << "options:" << endl
+         << "    -m, --use-mnps          Retain MNPs as separate events (default: false)." << endl
+         << "    -t, --tag-parsed FLAG   Tag records which are split apart of a complex allele with this flag." << endl
+         << "    -L, --max-length LEN    Do not manipulate records in which either the ALT or" << endl
+         << "                            REF is longer than LEN (default: 200)." << endl
+         << "    -k, --keep-info         Maintain site and allele-level annotations when decomposing." << endl
+         << "                            Note that in many cases, such as multisample VCFs, these won't" << endl
+         << "                            be valid post-decomposition.  For biallelic loci in single-sample" << endl
+         << "                            VCFs, they should be usable with caution." << endl
+         << "    -g, --keep-geno         Maintain genotype-level annotations when decomposing.  Similar" << endl
+         << "                            caution should be used for this as for --keep-info." << endl << endl
+         << "If multiple allelic primitives (gaps or mismatches) are specified in a single" << endl
+         << "VCF record, split the record into multiple lines, dropping INFO fields other" << endl
+         << "than AF and AC unless -k is given.  MNPs are split into multiple SNPs unless" << endl
+         << "-m is provided.  Records generated by splits carry the -t FLAG, if one is set." << endl;
+    exit(0);
+}
+
+int main(int argc, char** argv) {
+    // Reads a VCF (first non-option argument, otherwise stdin) and prints its header and records to stdout, replacing records by their allelic primitives as described below.
+    bool includePreviousBaseForIndels = true; // never changed; passed to parsedAlternates()
+    bool useMNPs = false; // set by -m
+    string parseFlag; // set by -t; INFO flag put on records produced by a split
+    int maxLength = 200; // set by -L
+    bool keepInfo = false; // set by -k
+    bool keepGeno = false; // set by -g
+
+    VariantCallFile variantFile;
+    // command-line option parsing
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                /* These options set a flag. */
+                //{"verbose", no_argument,       &verbose_flag, 1},
+                {"help", no_argument, 0, 'h'},
+                {"use-mnps", no_argument, 0, 'm'},
+                {"max-length", required_argument, 0, 'L'},
+                {"tag-parsed", required_argument, 0, 't'},
+                {"keep-info", no_argument, 0, 'k'},
+                {"keep-geno", no_argument, 0, 'g'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hmkgt:L:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+	    case 'm':
+            useMNPs = true;
+            break;
+
+	    case 'k':
+            keepInfo = true;
+            break;
+
+	    case 'g':
+            keepGeno = true;
+            break;
+
+        case 'h':
+            printSummary(argv); // printSummary() ends the process with exit(0)
+            break;
+
+	    case 't':
+            parseFlag = optarg;
+            break;
+
+        case 'L':
+            maxLength = atoi(optarg);
+            break;
+
+        case '?':
+            printSummary(argv);
+            exit(1); // NOTE(review): not reached, printSummary() has already called exit(0)
+            break;
+
+        default:
+            abort ();
+        }
+    }
+    // input: first non-option argument if present, otherwise stdin
+    if (optind < argc) {
+        string filename = argv[optind];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+    // declare the INFO fields written on split records (TYPE, LEN and, if -t was given, the flag)
+    variantFile.addHeaderLine("##INFO=<ID=TYPE,Number=A,Type=String,Description=\"The type of allele, either snp, mnp, ins, del, or complex.\">");
+    variantFile.addHeaderLine("##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"allele length\">");
+    if (!parseFlag.empty()) {
+        variantFile.addHeaderLine("##INFO=<ID="+parseFlag+",Number=0,Type=Flag,Description=\"The allele was parsed using vcfallelicprimitives.\">");
+    }
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+
+
+        // we can't decompose *1* bp events, these are already in simplest-form whether SNPs or indels
+        // we also don't handle anything larger than maxLength bp
+        if (var.alt.size() == 1 
+            && (   var.alt.front().size() == 1
+                || var.ref.size() == 1
+                || var.alt.front().size() > maxLength
+                || var.ref.size() > maxLength
+                )) {
+            // nothing to do
+            cout << var << endl;
+            continue;
+        }
+
+        // for each parsed alternate, get the position
+        // build a new vcf record for that position
+        // unless we are already at the position !
+        // take everything which is unique to that allele (records) and append it to the new record
+        // then handle genotypes; determine the mapping between allelic primitives and convert to phased haplotypes
+        // this means taking all the parsedAlternates and, for each one, generating a pattern of allele indices corresponding to it
+
+        map<string, vector<VariantAllele> > varAlleles = var.parsedAlternates(includePreviousBaseForIndels, useMNPs);
+        set<VariantAllele> alleles;
+
+        // collect unique alleles
+        for (map<string, vector<VariantAllele> >::iterator a = varAlleles.begin(); a != varAlleles.end(); ++a) {
+            for (vector<VariantAllele>::iterator va = a->second.begin(); va != a->second.end(); ++va) {
+                alleles.insert(*va);
+            }
+        }
+        // number of distinct primitive alleles whose alt differs from their ref
+        int altcount = 0;
+        for (set<VariantAllele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
+            if (a->ref != a->alt) {
+                ++altcount;
+            }
+        }
+        // NOTE(review): appears unreachable - a single 1 bp ALT already hit the 'continue' above, unless parsedAlternates() alters var.alt
+        if (altcount == 1 && var.alt.size() == 1 && var.alt.front().size() == 1) { // if biallelic SNP
+            cout << var << endl;
+            continue;
+        }
+
+        // collect variant allele indexed membership
+        map<string, vector<int> > variantAlleleIndexes; // from serialized VariantAllele to indexes
+        for (map<string, vector<VariantAllele> >::iterator a = varAlleles.begin(); a != varAlleles.end(); ++a) {
+            int index = var.altAlleleIndexes[a->first] + 1; // make non-relative
+            for (vector<VariantAllele>::iterator va = a->second.begin(); va != a->second.end(); ++va) {
+                variantAlleleIndexes[va->repr].push_back(index);
+            }
+        }
+        // per-primitive-allele accumulators filled from the original record
+        map<VariantAllele, double> alleleFrequencies;
+        map<VariantAllele, int> alleleCounts;
+        map<VariantAllele, map<string, string> > alleleInfos;
+        map<VariantAllele, map<string, map<string, string> > > alleleGenos; // NOTE(review): not referenced again in this function
+        // AF: sum, for each primitive allele, the AF of every ALT it was parsed from
+        bool hasAf = false;
+        if (var.info.find("AF") != var.info.end()) {
+            hasAf = true;
+            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+                vector<VariantAllele>& vars = varAlleles[*a];
+                for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) {
+                    double freq;
+                    try {
+                        convert(var.info["AF"].at(var.altAlleleIndexes[*a]), freq); // at() throws if AF has no entry for this ALT
+                        alleleFrequencies[*va] += freq;
+                    } catch (...) {
+                        cerr << "vcfallelicprimitives WARNING: AF does not have count == alts @ "
+                             << var.sequenceName << ":" << var.position << endl;
+                    }
+                }
+            }
+        }
+        // AC: same accumulation as for AF, with integer counts
+        bool hasAc = false;
+        if (var.info.find("AC") != var.info.end()) {
+            hasAc = true;
+            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+                vector<VariantAllele>& vars = varAlleles[*a];
+                for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) {
+                    int freq;
+                    try {
+                        convert(var.info["AC"].at(var.altAlleleIndexes[*a]), freq);
+                        alleleCounts[*va] += freq;
+                    } catch (...) {
+                        cerr << "vcfallelicprimitives WARNING: AC does not have count == alts @ "
+                             << var.sequenceName << ":" << var.position << endl;
+                    }
+                }
+            }
+        }
+        // -k: record, per primitive allele, INFO values that are per-ALT (one per alt) or single-valued
+        if (keepInfo) {
+            for (map<string, vector<string> >::iterator infoit = var.info.begin();
+                 infoit != var.info.end(); ++infoit) {
+                string key = infoit->first;
+                for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+                    vector<VariantAllele>& vars = varAlleles[*a];
+                    for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) {
+                        string val;
+                        vector<string>& vals = var.info[key];
+                        if (vals.size() == var.alt.size()) { // allele count for info
+                            val = vals.at(var.altAlleleIndexes[*a]);
+                        } else if (vals.size() == 1) { // site-wise count
+                            val = vals.front();
+                        } // don't handle other multiples... how would we do this without going crazy?
+                        if (!val.empty()) {
+                            alleleInfos[*va][key] = val;
+                        }
+                    }
+                }
+            }
+        }
+
+        /*
+        if (keepGeno) {
+            for (map<string, map<string, vector<string> > >::iterator sampleit = var.samples.begin();
+                 sampleit != var.samples.end(); ++sampleit) {
+                string& sampleName = sampleit->first;
+                map<string, vector<string> >& sampleValues = var.samples[sampleName];
+                
+            }
+        }
+        */
+
+        // from old allele index to a new series across the unpacked positions
+        map<int, map<long unsigned int, int> > unpackedAlleleIndexes;
+        // output records keyed by position, so primitives at the same position share one record
+        map<long unsigned int, Variant> variants;
+        //vector<Variant> variants;
+        for (set<VariantAllele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
+            if (a->ref == a->alt) {
+                // ref allele
+                continue;
+            }
+            string type;
+            int len = 0;
+            if (a->ref.at(0) == a->alt.at(0)) { // well-behaved indels
+                if (a->ref.size() > a->alt.size()) {
+                    type = "del";
+                    len = a->ref.size() - a->alt.size();
+                } else if (a->ref.size() < a->alt.size()) {
+                    len = a->alt.size() - a->ref.size();
+                    type = "ins";
+                } // equal sizes with a shared first base leave type empty and len 0
+            } else {
+                if (a->ref.size() == a->alt.size()) {
+                    len = a->ref.size();
+                    if (a->ref.size() == 1) {
+                        type = "snp";
+                    } else {
+                        type = "mnp";
+                    }
+                } else {
+                    len = abs((int) a->ref.size() - (int) a->alt.size());
+                    type = "complex";
+                }
+            }
+            // first primitive seen at this position: start a new record
+            if (variants.find(a->position) == variants.end()) {
+                Variant newvar(variantFile);
+                variants[a->position] = newvar;
+            }
+
+            Variant& v = variants[a->position]; // guaranteed to exist
+            // site-level fields copied from the original record; ID is reset to "."
+            if (!parseFlag.empty()) {
+                v.infoFlags[parseFlag] = true;
+            }
+            v.quality = var.quality;
+            v.filter = var.filter;
+            v.id = ".";
+            //v.format = var.format;
+            vector<string> gtonlyformat;
+            gtonlyformat.push_back("GT");
+            v.format = gtonlyformat;
+            v.info["TYPE"].push_back(type);
+            v.info["LEN"].push_back(convert(len));
+            if (hasAf) {
+                v.info["AF"].push_back(convert(alleleFrequencies[*a]));
+            }
+            if (hasAc) {
+                v.info["AC"].push_back(convert(alleleCounts[*a]));
+            }
+            if (keepInfo) { // -k: append the value remembered for this allele under every other INFO key
+                for (map<string, vector<string> >::iterator infoit = var.info.begin();
+                     infoit != var.info.end(); ++infoit) {
+                    string key = infoit->first;
+                    if (key != "AF" && key != "AC" && key != "TYPE" && key != "LEN") { // don't clobber previous
+                        v.info[key].push_back(alleleInfos[*a][key]); // operator[] yields "" when nothing was remembered
+                    }
+                }
+            }
+
+            // place the record and fold this allele into its REF/ALT
+
+            v.sequenceName = var.sequenceName;
+            v.position = a->position; // ... by definition, this should be == if the variant was found
+            if (v.ref.size() < a->ref.size()) { // longer REF: pad earlier ALTs with the extra bases, then adopt it
+                for (vector<string>::iterator va = v.alt.begin(); va != v.alt.end(); ++va) {
+                    *va += a->ref.substr(v.ref.size());
+                }
+                v.ref = a->ref;
+            }
+            v.alt.push_back(a->alt);
+            // v.alt.size() after the push_back is the 1-based index of the new ALT
+            int alleleIndex = v.alt.size();
+            vector<int>& originalIndexes = variantAlleleIndexes[a->repr];
+            for (vector<int>::iterator i = originalIndexes.begin(); i != originalIndexes.end(); ++i) {
+                unpackedAlleleIndexes[*i][v.position] = alleleIndex;
+            }
+            // add null allele
+            unpackedAlleleIndexes[ALLELE_NULL][v.position] = ALLELE_NULL;
+
+        }
+
+        // genotypes: rewrite each sample's GT in terms of the split records
+        for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
+            string& sampleName = *s;
+            if (var.samples.find(sampleName) == var.samples.end()) {
+                continue;
+            }
+            map<string, vector<string> >& sample = var.samples[sampleName];
+            if (sample.find("GT") == sample.end()) {
+                continue;
+            }
+            string& genotype = sample["GT"].front();
+            vector<string> genotypeStrs = split(genotype, "|/");
+            vector<int> genotypeIndexes;
+            for (vector<string>::iterator s = genotypeStrs.begin(); s != genotypeStrs.end(); ++s) { // NOTE: this 's' shadows the sample-name iterator
+                int i;
+                if (!convert(*s, i)) {
+                    genotypeIndexes.push_back(ALLELE_NULL);
+                } else {
+                    genotypeIndexes.push_back(i);
+                }
+            }
+            map<long unsigned int, vector<int> > positionIndexes; // per output position, the sample's allele indexes in original GT order
+            for (vector<int>::iterator g = genotypeIndexes.begin(); g != genotypeIndexes.end(); ++g) {
+                int oldIndex = *g;
+                for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
+                    const long unsigned int& p = v->first;
+                    if (oldIndex == 0) { // reference
+                        positionIndexes[p].push_back(0);
+                    } else {
+                        positionIndexes[p].push_back(unpackedAlleleIndexes[oldIndex][p]); // operator[] yields 0 when oldIndex has no entry at p
+                    }
+                }
+            }
+            for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
+                Variant& variant = v->second;
+                vector<int>& gtints = positionIndexes[v->first];
+                vector<string> gtstrs;
+                for (vector<int>::iterator i = gtints.begin(); i != gtints.end(); ++i) {
+                    if (*i != ALLELE_NULL) {
+                        gtstrs.push_back(convert(*i));
+                    } else {
+                        gtstrs.push_back(".");
+                    }
+                }
+                string genotype = join(gtstrs, "|"); // always joined with '|', whatever separator the input used
+                // if we are keeping the geno info, pull it over here
+                if (keepGeno) {
+                    variant.format = var.format;
+                    variant.samples[sampleName] = var.samples[sampleName];
+                }
+                // note that this will replace the old geno, but otherwise it is the same
+                variant.samples[sampleName]["GT"].clear();
+                variant.samples[sampleName]["GT"].push_back(genotype);
+            }
+        }
+
+        //for (vector<Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
+        for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
+            cout << v->second << endl;
+        }
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfaltcount.cpp b/src/vcfaltcount.cpp
new file mode 100644
index 0000000..756b5fc
--- /dev/null
+++ b/src/vcfaltcount.cpp
@@ -0,0 +1,50 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+
+// Prints the total number of alternate alleles called across all samples and
+// records of the VCF named by argv[1]: every GT allele other than "0"
+// (reference) and "." (no call) counts once.  Returns 1 on usage/open errors.
+int main(int argc, char** argv) {
+    if (argc != 2) {
+        cerr << "usage: " << argv[0] << " <vcf file>" << endl
+             << "count the number of alternate alleles in all records in the vcf file" << endl;
+        return 1;
+    }
+
+    string filename = argv[1];
+    VariantCallFile variantFile;
+    variantFile.open(filename);
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    unsigned int alternateAlleleCount = 0;
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
+            map<string, vector<string> >& sample = s->second;
+            // a sample may carry no GT at all; front() on an empty vector is undefined
+            map<string, vector<string> >::iterator gtit = sample.find("GT");
+            if (gtit == sample.end() || gtit->second.empty()) {
+                continue;
+            }
+            vector<string> gt = split(gtit->second.front(), "|/");
+            for (vector<string>::iterator g = gt.begin(); g != gt.end(); ++g) {
+                if (*g != "0" && *g != ".") { // "." is a missing call, not an ALT
+                    ++alternateAlleleCount;
+                }
+            }
+        }
+    }
+
+    cout << alternateAlleleCount << endl;
+
+    return 0;
+}
+
diff --git a/src/vcfannotate.cpp b/src/vcfannotate.cpp
new file mode 100644
index 0000000..3499083
--- /dev/null
+++ b/src/vcfannotate.cpp
@@ -0,0 +1,126 @@
+#include "Variant.h"
+#include "BedReader.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+
+// Print usage and option help to stderr, then end the process with exit(0);
+// control never returns to the caller.  argv[0] supplies the program name.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl << endl
+         << "options:" << endl
+         << "    -b, --bed   use annotations provided by this BED file" << endl
+         << "    -k, --key   use this INFO field key for the annotations" << endl
+         << "    -d, --default  use this INFO field value for records without annotations" << endl << endl
+         << "Intersect the records in the VCF file with targets provided in a BED file." << endl
+         << "Intersections are done on the reference sequences in the VCF file." << endl
+         << "If no VCF filename is specified on the command line (last argument) the VCF" << endl
+         << "is read from stdin." << endl;
+    exit(0);
+}
+
+// Annotates each VCF record with the desc field of every BED target that
+// overlaps its reference span, stored under the INFO key given by -k and
+// joined with ':'; records without overlaps get the -d value when one is
+// set.  Writes the amended header and all records to stdout.
+int main(int argc, char** argv) {
+    string bedFileName;
+    string annotationInfoKey;
+    string defaultAnnotationValue;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+        {
+            {"help", no_argument, 0, 'h'},
+            {"bed",  required_argument, 0, 'b'},
+            {"key",  required_argument, 0, 'k'},
+            {"default",  required_argument, 0, 'd'},
+            {0, 0, 0, 0}
+        };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+        c = getopt_long (argc, argv, "hb:k:d:",
+                         long_options, &option_index);
+        if (c == -1)
+            break;
+
+        switch (c) {
+            case 'b':
+                bedFileName = string(optarg);
+                break;
+            case 'k':
+                annotationInfoKey = string(optarg);
+                break;
+            case 'd':
+                defaultAnnotationValue = string(optarg);
+                break;
+            case 'h':
+                printSummary(argv); // prints usage and exits with status 0
+                break;
+            case '?':
+                // printSummary() would exit(0) here; a bad option must fail with a non-zero status
+                cerr << "run " << argv[0] << " --help for usage" << endl;
+                exit(1);
+                break;
+            default:
+                abort ();
+        }
+    }
+
+    if (bedFileName.empty()) {
+        cerr << "a BED file is required when intersecting" << endl;
+        exit(1);
+    }
+    // an empty key would produce a malformed ##INFO line and INFO entry
+    if (annotationInfoKey.empty()) {
+        cerr << "an INFO field key (-k) is required for the annotations" << endl;
+        exit(1);
+    }
+
+    BedReader bed(bedFileName);
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        // stdout carries the VCF stream, so diagnostics belong on stderr
+        cerr << "could not open VCF file" << endl;
+        return 1;
+    }
+
+    string line = "##INFO=<ID=" + annotationInfoKey + ",Number=1,Type=String,Description=\"Annotation from "
+        + bedFileName + " delimited by ':'\">";
+    variantFile.addHeaderLine(line);
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        // query span: position through position + length of REF - 1
+        BedTarget record(var.sequenceName, var.position, var.position + var.ref.size() - 1, "");
+        vector<BedTarget*> overlaps = bed.targetsOverlapping(record);
+        vector<string> annotations;
+        if (!overlaps.empty()) {
+            for (vector<BedTarget*>::iterator t = overlaps.begin(); t != overlaps.end(); ++t) {
+                annotations.push_back((*t)->desc);
+            }
+            var.info[annotationInfoKey].push_back(join(annotations, ":"));
+        } else if (!defaultAnnotationValue.empty()) {
+            var.info[annotationInfoKey].push_back(defaultAnnotationValue);
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+}
diff --git a/src/vcfannotategenotypes.cpp b/src/vcfannotategenotypes.cpp
new file mode 100644
index 0000000..1bb65d3
--- /dev/null
+++ b/src/vcfannotategenotypes.cpp
@@ -0,0 +1,220 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <list>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+// Registers annotationTag as a FORMAT field of var and sets that field to the
+// "no genotype" value "./." for every sample carried by var.  Any value
+// previously stored under the tag is discarded.
+void annotateWithBlankGenotypes(Variant& var, string& annotationTag) {
+    typedef map<string, map<string, vector<string> > > SampleMap;
+
+    var.addFormatField(annotationTag);
+
+    for (SampleMap::iterator entry = var.samples.begin(); entry != var.samples.end(); ++entry) {
+        vector<string>& tagValues = entry->second[annotationTag];
+        tagValues.clear();
+        tagValues.push_back("./."); // the "no genotype" genotype
+    }
+}
+
+// Annotates every sample of varA with the genotype the same-named sample has
+// in varB, stored under the FORMAT key annotationTag.
+//
+// Allele numbers are translated from varB's numbering to varA's by matching
+// the ALT strings; the reference allele (0) maps to itself.  Alleles of varB
+// with no counterpart in varA are accumulated under key -1 before the result
+// is handed to genotypeToString().  Samples that are absent from varB, or
+// that carry no GT value there, receive "./.".
+void annotateWithGenotypes(Variant& varA, Variant& varB, string& annotationTag) {
+
+    varA.addFormatField(annotationTag);
+
+    map<string, map<string, vector<string> > >::iterator s     = varA.samples.begin();
+    map<string, map<string, vector<string> > >::iterator sEnd  = varA.samples.end();
+
+    // ALT string -> 1-based allele number in varA
+    map<string, int> varAAlleleInts;
+    int i = 1;
+    for (vector<string>::iterator a = varA.alt.begin(); a != varA.alt.end(); ++a, ++i) {
+        varAAlleleInts[*a] = i;
+    }
+
+    map<int, int> varBconvertToVarA; // maps alleles in the second file to allele numbers for the first
+    varBconvertToVarA[0] = 0; // reference == reference!
+    i = 1;
+    for (vector<string>::iterator a = varB.alt.begin(); a != varB.alt.end(); ++a, ++i) {
+        map<string, int>::iterator ita = varAAlleleInts.find(*a);
+        if (ita != varAAlleleInts.end()) {
+            varBconvertToVarA[i] = ita->second;
+        }
+    }
+
+    for (; s != sEnd; ++s) {
+        map<string, vector<string> >& sample = s->second;
+        const string& name = s->first;
+        map<string, map<string, vector<string> > >::iterator o = varB.samples.find(name);
+        sample[annotationTag].clear();
+        if (o == varB.samples.end()) {
+            sample[annotationTag].push_back("./."); // means "no genotype"
+            continue;
+        }
+
+        map<string, vector<string> >& other = o->second;
+        // Look GT up with find(): operator[] would insert an empty vector and
+        // calling front() on it is undefined behaviour.
+        map<string, vector<string> >::iterator gt = other.find("GT");
+        if (gt == other.end() || gt->second.empty()) {
+            sample[annotationTag].push_back("./."); // sample present in B but without a genotype
+            continue;
+        }
+
+        string& otherGenotype = gt->second.front();
+        // XXX this must compare the genotypes in the two files
+        map<int, int> gtB = decomposeGenotype(otherGenotype);
+        map<int, int> gtnew;
+        for (map<int, int>::iterator g = gtB.begin(); g != gtB.end(); ++g) {
+            map<int, int>::iterator f = varBconvertToVarA.find(g->first);
+            if (f != varBconvertToVarA.end()) {
+                gtnew[f->second] += g->second;
+            } else {
+                gtnew[-1] += g->second; // allele unknown to varA
+            }
+        }
+        sample[annotationTag].push_back(genotypeToString(gtnew));
+    }
+}
+
+// vcfannotategenotypes: walks two VCF files in parallel and, for records of
+// the first file that share sequence name and position with a record of the
+// second, copies the second file's genotypes into a new per-sample field
+// named by the annotation tag and sets the "<tag>.has_variant" INFO flag.
+//
+// Arguments: <annotation-tag> <vcf file A> <vcf file B>; "-" reads stdin.
+// Returns 0 on success, 1 on a usage error or when a file cannot be opened.
+int main(int argc, char** argv) {
+
+    if (argc != 4) {
+        cerr << "usage: " << argv[0] << " <annotation-tag> <vcf file> <vcf file>" << endl
+             << "annotates genotypes in the first file with genotypes in the second" << endl
+             << "adding the genotype as another flag to each sample field in the first file." << endl
+             << "annotation-tag is the name of the sample flag which is added to store the annotation." << endl
+             << "also adds a 'has_variant' flag for sites where the second file has a variant." << endl;
+        return 1;
+    }
+
+    string annotag = argv[1];
+    string filenameA = argv[2];
+    string filenameB = argv[3];
+
+    // This also rejects reading both inputs from stdin ("-" twice).
+    if (filenameA == filenameB) {
+        cerr << "it won't help to annotate samples with their own genotypes!" << endl;
+        return 1;
+    }
+
+    VariantCallFile variantFileA;
+    if (filenameA == "-") {
+        variantFileA.open(std::cin);
+    } else {
+        variantFileA.open(filenameA);
+    }
+
+    VariantCallFile variantFileB;
+    if (filenameB == "-") {
+        variantFileB.open(std::cin);
+    } else {
+        variantFileB.open(filenameB);
+    }
+
+    // Say which input failed instead of exiting silently.
+    if (!variantFileA.is_open()) {
+        cerr << "could not open " << filenameA << endl;
+        return 1;
+    }
+    if (!variantFileB.is_open()) {
+        cerr << "could not open " << filenameB << endl;
+        return 1;
+    }
+
+    Variant varA(variantFileA);
+    Variant varB(variantFileB);
+
+    // while the first file doesn't match the second positionally,
+    // step forward, annotating each genotype record with an empty genotype
+    // when the two match, iterate through the genotypes from the first file
+    // and get the genotypes reported in the second file
+
+    variantFileA.getNextVariant(varA);
+    variantFileB.getNextVariant(varB);
+
+    string line = "##INFO=<ID=" + annotag + ".has_variant,Number=0,Type=Flag,Description=\"True if "
+        + annotag + " has a called alternate among samples under comparison.\">";
+    variantFileA.addHeaderLine(line);
+    line = "##FORMAT=<ID=" + annotag + ",Number=1,Type=String,Description=\"Genotype from "
+        + annotag + ".\">";
+    variantFileA.addHeaderLine(line);
+
+    cout << variantFileA.header << endl;
+
+    do {
+
+        // this is broken.  to do it right, it'll be necessary to get reference ids from the fasta reference used to make the alignments...
+        // if B is NOT done, and is less than A, read new B.
+        if (!variantFileB.done()
+            && (varB.sequenceName != varA.sequenceName
+                || (varB.sequenceName == varA.sequenceName && varB.position < varA.position)
+                || variantFileA.done())
+            ) {
+            variantFileB.getNextVariant(varB);
+        }
+
+        // if A is not done- and A is less than B, read A.
+        // should also read if variant B is done.
+        // NOTE(review): the record is printed here without the annotation tag.
+        if (!variantFileA.done()
+            && (varA.sequenceName != varB.sequenceName
+                || (varA.sequenceName == varB.sequenceName && varA.position < varB.position)
+                || variantFileB.done())
+            ) {
+            cout << varA << endl;
+            variantFileA.getNextVariant(varA);
+        }
+
+        vector<Variant> varsA;
+        vector<Variant> varsB;
+
+        bool hasMultipleAlts = false;
+
+        long int thisPosition = 0;
+        string thisSequenceName;
+        if (varA.position == varB.position
+            && varA.sequenceName == varB.sequenceName) {
+            thisPosition = varA.position;
+            thisSequenceName = varA.sequenceName;
+        }
+        while (!variantFileA.done()
+               && !variantFileB.done()
+               && thisPosition == varA.position
+               && thisSequenceName == varA.sequenceName
+               && varA.sequenceName == varB.sequenceName
+               && varA.position == varB.position) {
+            // accumulate all the alts at the current position
+            varsA.push_back(varA);
+            varsB.push_back(varB);
+            if (varA.alt.size() > 1 || varB.alt.size() > 1)
+                hasMultipleAlts = true;
+            variantFileA.getNextVariant(varA);
+            variantFileB.getNextVariant(varB);
+        }
+
+        // multiple lines per position
+        if (!hasMultipleAlts && (varsA.size() > 1 || varsB.size() > 1)) {
+
+            // index the collected records by (REF, first ALT)
+            map<pair<string, string>, Variant> varsAParsed;
+            map<pair<string, string>, Variant> varsBParsed;
+            for (vector<Variant>::iterator v = varsA.begin(); v != varsA.end(); ++v) {
+                varsAParsed[make_pair(v->ref, v->alt.front())] = *v;
+            }
+            for (vector<Variant>::iterator v = varsB.begin(); v != varsB.end(); ++v) {
+                varsBParsed[make_pair(v->ref, v->alt.front())] = *v;
+            }
+
+            for (map<pair<string, string>, Variant>::iterator vs = varsAParsed.begin(); vs != varsAParsed.end(); ++vs) {
+                // distinct names: the outer varA/varB hold the look-ahead records
+                Variant& recordA = vs->second;
+                map<pair<string, string>, Variant>::iterator match = varsBParsed.find(vs->first);
+                if (match != varsBParsed.end()) {
+                    annotateWithGenotypes(recordA, match->second, annotag);
+                    recordA.infoFlags[annotag + ".has_variant"] = true;
+                } else {
+                    annotateWithBlankGenotypes(recordA, annotag);
+                }
+                cout << recordA << endl;
+            }
+
+        } else if (!varsA.empty() && !varsB.empty()) { // one line per multi-allelic
+            Variant& recordA = varsA.front();
+            Variant& recordB = varsB.front();
+            annotateWithGenotypes(recordA, recordB, annotag);
+            // XXX TODO, and also allow for records with multiple alts
+            // XXX assume that if the other file has a corresponding record, some kind of variation was detected at the same site
+            recordA.infoFlags[annotag + ".has_variant"] = true;
+            cout << recordA << endl;
+        }
+
+    } while (!variantFileA.done() || !variantFileB.done());
+
+    return 0;
+
+}
+
diff --git a/src/vcfbreakmulti.cpp b/src/vcfbreakmulti.cpp
new file mode 100644
index 0000000..f462111
--- /dev/null
+++ b/src/vcfbreakmulti.cpp
@@ -0,0 +1,114 @@
+#include "Variant.h"
+#include "convert.h"
+#include <set>
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+
+// Converts s to a double with convert() and returns the result.
+// The result starts at 0.0 so that an indeterminate value is never returned
+// should convert() leave its output argument untouched.
+double convertStrDbl(const string& s) {
+    double r = 0.0;
+    convert(s, r);
+    return r;
+}
+
+// Writes the usage text of this tool to stderr and then terminates the
+// process with exit status 0; control never returns to the caller.
+void printSummary(char** argv) {
+    const char* program = argv[0];
+    cerr << "usage: " << program << " [options] [file]" << endl;
+    cerr << endl;
+    cerr << "If multiple alleles are specified in a single record, break the record into" << endl;
+    cerr << "multiple lines, preserving allele-specific INFO fields." << endl;
+    exit(0);
+}
+
+// vcfbreakmulti: reads a VCF from the file given as the first non-option
+// argument (or stdin) and writes it to stdout with every multi-allelic record
+// split into one record per alternate allele.
+//
+// Returns 0 on success and 1 when the input cannot be opened.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+        {
+            {"help", no_argument, 0, 'h'},
+            {0, 0, 0, 0}
+        };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "h",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+            // printSummary() terminates the process itself, so the statements
+            // that follow its calls are not reached.
+            case 'h':
+                printSummary(argv);
+                break;
+
+            case '?':
+                printSummary(argv);
+                exit(1);
+                break;
+
+            default:
+                abort ();
+        }
+    }
+
+    if (optind < argc) {
+        string filename = argv[optind];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open VCF input" << endl;
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+
+        const int numalt = var.alt.size();
+
+        // Nothing to split; "<=" also keeps a record without any ALT entry
+        // from being dropped silently.
+        if (numalt <= 1) {
+            cout << var << endl;
+            continue;
+        }
+
+        // Emit one copy of the record per alternate, each stripped of all the
+        // other alternates.
+        for (int i = 0; i < numalt; ++i) {
+            Variant single = var;
+            for (int j = 0; j < numalt; ++j) {
+                if (j != i) {
+                    string unwanted = var.alt.at(j); // copy: var itself must stay intact
+                    single.removeAlt(unwanted);
+                }
+            }
+            cout << single << endl;
+        }
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfcat.cpp b/src/vcfcat.cpp
new file mode 100644
index 0000000..cf40921
--- /dev/null
+++ b/src/vcfcat.cpp
@@ -0,0 +1,34 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+// vcfcat: writes the header of the first VCF file followed by the records of
+// every file named on the command line, in argument order.
+//
+// Returns 0 on success (or after printing usage when no file is given) and 1
+// as soon as a file cannot be opened.
+int main(int argc, char** argv) {
+
+    // Without file arguments there is nothing to concatenate.
+    if (argc == 1) {
+        cout << "usage: " << argv[0] << " [file1] [file2] ... [fileN]" << endl
+             << "Concatenates VCF files." << endl;
+        return 0;
+    }
+
+    int fileIndex = 1;
+    while (fileIndex < argc) {
+        const bool isFirstFile = (fileIndex == 1);
+
+        VariantCallFile variantFile;
+        string filename = argv[fileIndex];
+        variantFile.open(filename);
+        if (!variantFile.is_open()) {
+            cerr << "could not open " << argv[fileIndex] << endl;
+            return 1;
+        }
+
+        // Only the first file contributes the header.
+        if (isFirstFile) {
+            cout << variantFile.header << endl;
+        }
+
+        Variant var(variantFile);
+        while (variantFile.getNextVariant(var)) {
+            cout << var << endl;
+        }
+
+        ++fileIndex;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfcheck.cpp b/src/vcfcheck.cpp
new file mode 100644
index 0000000..d370ae1
--- /dev/null
+++ b/src/vcfcheck.cpp
@@ -0,0 +1,139 @@
+#include "Variant.h"
+#include "split.h"
+#include "fastahack/Fasta.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+// Writes the usage text of vcfcheck to stderr and terminates the process with
+// exit status 0; control never returns to the caller.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
+         << endl
+         << "options:" << endl
+         << "    -f, --fasta-reference  FASTA reference file that REF fields are checked against" << endl
+         << "    -x, --exclude-failures If a record fails, don't print it.  Otherwise do." << endl
+         << "    -k, --keep-failures    Print if the record fails, otherwise not." << endl
+         << endl
+         << "Verifies that the VCF REF field matches the reference as described." << endl
+         << endl;
+    exit(0);
+}
+
+
+// vcfcheck: compares the REF field of every record with the sequence found at
+// the same place in a FASTA reference.
+//
+// Output modes:
+//   -k  print only the records whose REF does not match (as VCF)
+//   -x  print only the records whose REF matches (as VCF)
+//   neither: print one "mismatched reference ..." line per failing record
+// When both -k and -x are given, -k decides what happens to failing records.
+//
+// Returns 0 on success and 1 when the VCF input cannot be opened; exits with
+// status 1 when no FASTA reference was supplied.
+int main(int argc, char** argv) {
+
+    int c;
+    string fastaRef;
+    bool keepFailures = false;
+    bool excludeFailures = false;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    while (true) {
+        static struct option long_options[] =
+            {
+                /* These options set a flag. */
+                //{"verbose", no_argument,       &verbose_flag, 1},
+                {"help", no_argument, 0, 'h'},
+                {"fasta-reference",  required_argument, 0, 'f'},
+                {"exclude-failures",  no_argument, 0, 'x'},
+                {"keep-failures",  no_argument, 0, 'k'},
+                //{"length",  no_argument, &printLength, true},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hxkf:",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+        case 0:
+            /* If this option set a flag, do nothing else now. */
+            if (long_options[option_index].flag != 0)
+                break;
+            printf ("option %s", long_options[option_index].name);
+            if (optarg)
+                printf (" with arg %s", optarg);
+            printf ("\n");
+            break;
+
+        case 'f':
+            fastaRef = optarg;
+            break;
+
+        case 'x':
+            excludeFailures = true;
+            break;
+
+        case 'k':
+            keepFailures = true;
+            break;
+
+        // printSummary() terminates the process itself, so the exit() calls
+        // after it are not reached.
+        case 'h':
+            printSummary(argv);
+            exit(0);
+            break;
+
+        case '?':
+            /* getopt_long already printed an error message. */
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    if (fastaRef.empty()) {
+        cerr << "a FASTA reference sequence must be specified" << endl;
+        exit(1);
+    }
+
+    FastaReference ref;
+    ref.open(fastaRef);
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    // Report the failure instead of exiting silently.
+    if (!variantFile.is_open()) {
+        cerr << "could not open VCF file" << endl;
+        return 1;
+    }
+
+    // The header is only written in the two modes that emit VCF records.
+    if (keepFailures || excludeFailures) {
+        cout << variantFile.header << endl;
+    }
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        int refstart = var.position - 1; // convert to 0-based
+        string matchedRef = ref.getSubSequence(var.sequenceName, refstart, var.ref.size());
+        if (var.ref != matchedRef) {
+            if (keepFailures) {
+                cout << var << endl;
+            } else if (!excludeFailures) {
+                cout << "mismatched reference " << var.ref << " should be " << matchedRef << " at "
+                     << var.sequenceName << ":" << var.position << endl;
+            }
+        } else if (excludeFailures) {
+            cout << var << endl;
+        }
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfclassify.cpp b/src/vcfclassify.cpp
new file mode 100644
index 0000000..42624ee
--- /dev/null
+++ b/src/vcfclassify.cpp
@@ -0,0 +1,162 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <sstream>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+// Returns true when ref -> alt is one of the four transitions
+// (A<->G or C<->T, upper case only); any other pair of strings yields false.
+bool isTransition(string& ref, string& alt) {
+    const bool purineSwap     = (ref == "A" && alt == "G") || (ref == "G" && alt == "A");
+    const bool pyrimidineSwap = (ref == "C" && alt == "T") || (ref == "T" && alt == "C");
+    return purineSwap || pyrimidineSwap;
+}
+
+// Returns true when at least one alternate allele of var forms a transition
+// with the reference allele, as decided by isTransition().
+bool hasTransition(Variant& var) {
+    vector<string>::iterator allele = var.alt.begin();
+    while (allele != var.alt.end()) {
+        if (isTransition(var.ref, *allele)) {
+            return true;
+        }
+        ++allele;
+    }
+    return false;
+}
+
+// Returns true when at least one alternate allele of var is a transversion
+// SNP: reference and alternate are both a single base, they differ, and the
+// change is not a transition.
+//
+// Without the single-base checks, insertions, deletions, MNPs and the missing
+// allele "." would be reported as transversions simply because they are not
+// transitions.
+bool hasTransversion(Variant& var) {
+    string& ref = var.ref;
+    if (ref.size() != 1) {
+        return false; // not a single-base site, so no SNP of any kind
+    }
+    for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+        string& alt = *a;
+        if (alt.size() == 1 && alt != "." && alt != ref && !isTransition(ref, alt)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// Returns true when at least one alternate allele of var is longer than the
+// reference allele.
+bool hasInsertion(Variant& var) {
+    const string::size_type refLength = var.ref.size();
+    vector<string>::iterator allele = var.alt.begin();
+    while (allele != var.alt.end()) {
+        if (allele->size() > refLength) {
+            return true;
+        }
+        ++allele;
+    }
+    return false;
+}
+
+// Returns true when at least one alternate allele of var is shorter than the
+// reference allele.
+bool hasDeletion(Variant& var) {
+    const string::size_type refLength = var.ref.size();
+    vector<string>::iterator allele = var.alt.begin();
+    while (allele != var.alt.end()) {
+        if (allele->size() < refLength) {
+            return true;
+        }
+        ++allele;
+    }
+    return false;
+}
+
+// Returns true when the reference allele spans more than one base and at
+// least one alternate allele has exactly the same length.
+bool hasMNP(Variant& var) {
+    const string::size_type refLength = var.ref.size();
+    if (refLength <= 1) {
+        return false; // a multi-base reference is required
+    }
+    vector<string>::iterator allele = var.alt.begin();
+    while (allele != var.alt.end()) {
+        if (allele->size() == refLength) {
+            return true;
+        }
+        ++allele;
+    }
+    return false;
+}
+
+// Returns true when the reference allele is a single character and at least
+// one alternate allele is a single character as well.
+bool hasSNP(Variant& var) {
+    if (var.ref.size() != 1) {
+        return false; // a single-base reference is required
+    }
+    vector<string>::iterator allele = var.alt.begin();
+    while (allele != var.alt.end()) {
+        if (allele->size() == 1) {
+            return true;
+        }
+        ++allele;
+    }
+    return false;
+}
+
+// vcfclassify: copies a VCF stream to stdout, adding the INFO flags SNP, TS,
+// TV, INS, DEL and MNP to each record according to the has*() predicates, and
+// declaring those flags in the header.
+//
+// Takes exactly one argument: the VCF path, or "-" for stdin.
+// Returns 0 on success, 1 on a usage error or when the input cannot be opened.
+int main(int argc, char** argv) {
+
+    if (argc != 2) {
+        cerr << "usage: " << argv[0] << " <vcf file>" << endl
+             << "outputs a VCF stream each variant is tagged by allele class: snp, ts/tv, indel, mnp" << endl;
+        return 1;
+    }
+
+    string filename = argv[1];
+
+    VariantCallFile variantFile;
+    if (filename != "-") {
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open " << filename << endl;
+        return 1;
+    }
+
+    Variant var(variantFile);
+
+    // Header declarations for the flags set below, in output order.
+    // TODO handle lengths at poly-allelic sites (a LEN field is not declared yet)
+    static const char* const flagDeclarations[] = {
+        "##INFO=<ID=SNP,Number=0,Type=Flag,Description=\"SNP allele\">",
+        "##INFO=<ID=TS,Number=0,Type=Flag,Description=\"transition SNP\">",
+        "##INFO=<ID=TV,Number=0,Type=Flag,Description=\"transversion SNP\">",
+        "##INFO=<ID=INS,Number=0,Type=Flag,Description=\"insertion allele\">",
+        "##INFO=<ID=DEL,Number=0,Type=Flag,Description=\"deletion allele\">",
+        "##INFO=<ID=MNP,Number=0,Type=Flag,Description=\"MNP allele\">"
+    };
+    const unsigned int declarationCount = sizeof(flagDeclarations) / sizeof(flagDeclarations[0]);
+
+    string line;
+    for (unsigned int k = 0; k < declarationCount; ++k) {
+        line = flagDeclarations[k];
+        variantFile.addHeaderLine(line);
+    }
+
+    // write the new header
+    cout << variantFile.header << endl;
+
+    // A flag is only ever switched on; flags that do not apply are left alone.
+    while (variantFile.getNextVariant(var)) {
+
+        if (hasSNP(var))          var.infoFlags["SNP"] = true;
+        if (hasTransition(var))   var.infoFlags["TS"]  = true;
+        if (hasTransversion(var)) var.infoFlags["TV"]  = true;
+        if (hasInsertion(var))    var.infoFlags["INS"] = true;
+        if (hasDeletion(var))     var.infoFlags["DEL"] = true;
+        if (hasMNP(var))          var.infoFlags["MNP"] = true;
+
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfcleancomplex.cpp b/src/vcfcleancomplex.cpp
new file mode 100644
index 0000000..c0b3c71
--- /dev/null
+++ b/src/vcfcleancomplex.cpp
@@ -0,0 +1,71 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <sstream>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+
+// vcfcleancomplex: copies a VCF stream to stdout.  A record with a single ALT
+// whose parsed alternates show exactly one differing sub-allele preceded by a
+// matching stretch is rewritten to start at that sub-allele: position, REF
+// and ALT are replaced accordingly.  All other records pass through as read.
+//
+// Takes exactly one argument: the VCF path, or "-" for stdin.
+// Returns 0 on success, 1 on a usage error or when the input cannot be opened.
+int main(int argc, char** argv) {
+
+    if (argc != 2) {
+        // note the trailing space after "non-complex": the two literals are
+        // written back to back
+        cerr << "usage: " << argv[0] << " <vcf file>" << endl
+             << "outputs a VCF stream in which 'long' non-complex "
+             << "alleles have their position corrected." << endl
+             << "assumes that VCF records can't overlap 5'->3'" << endl;
+        return 1;
+    }
+
+    string filename = argv[1];
+
+    VariantCallFile variantFile;
+    if (filename == "-") {
+        variantFile.open(std::cin);
+    } else {
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open " << filename << endl;
+        return 1;
+    }
+
+    Variant var(variantFile);
+
+    // the header is passed through unchanged
+    cout << variantFile.header << endl;
+
+    while (variantFile.getNextVariant(var)) {
+        // if we just have one parsed alternate (non-complex case)
+        map<string, vector<VariantAllele> > parsedAlts = var.parsedAlternates(true, true); // use mnps, and previous for indels
+        // but the alt string is long
+        if (var.alt.size() == 1 && parsedAlts.size() > 1) {
+            string& alternate = var.alt.front();
+            vector<VariantAllele>& vs = parsedAlts[alternate];
+
+            // keep only the sub-alleles that actually change the sequence
+            vector<VariantAllele> valleles;
+            for (vector<VariantAllele>::iterator a = vs.begin(); a != vs.end(); ++a) {
+                if (a->ref != a->alt) {
+                    valleles.push_back(*a);
+                }
+            }
+
+            if (valleles.size() == 1) {
+                // do we have extra sequence hanging around?
+                VariantAllele& varallele = valleles.front();
+                if (vs.front().ref == vs.front().alt) {
+                    // drop the matching leading stretch and keep the one real change
+                    var.position = varallele.position;
+                    var.ref = var.ref.substr(vs.front().ref.size(), varallele.ref.size());
+                    var.alt.front() = varallele.alt;
+                }
+            }
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfcombine.cpp b/src/vcfcombine.cpp
new file mode 100644
index 0000000..09201c4
--- /dev/null
+++ b/src/vcfcombine.cpp
@@ -0,0 +1,207 @@
+#include "Variant.h"
+#include <getopt.h>
+#include <utility>
+
+
+using namespace std;
+using namespace vcf;
+
+// Writes the usage text of vcfcombine to stderr and terminates the process
+// with exit status 1 (also when invoked for --help); control never returns.
+void printSummary(char** argv) {
+    const char* program = argv[0];
+
+    cerr << "usage: " << program << " [vcf file] [vcf file] ..." << endl;
+    cerr << endl;
+    cerr << "Combines VCF files positionally, combining samples when sites and alleles are identical." << endl;
+    cerr << "Any number of VCF files may be combined.  The INFO field and other columns are taken from" << endl;
+    cerr << "one of the files which are combined when records in multiple files match.  Alleles must" << endl;
+    cerr << "have identical ordering to be combined into one record.  If they do not, multiple records" << endl;
+    cerr << "will be emitted." << endl;
+    cerr << endl;
+    cerr << "options:" << endl;
+    cerr << "    -h --help           This text." << endl;
+    cerr << "    -r --region REGION  A region specifier of the form chrN:x-y to bound the merge" << endl;
+
+    exit(1);
+}
+
+// vcfcombine: merges any number of VCF files positionally.  The current
+// record of every open file is kept in a nested map ordered by chromosome,
+// position and ALT list; the lowest entry is repeatedly written out, with the
+// samples of all files that share the same chromosome, position and ALT list
+// combined into a single output record.
+//
+// Options: -h/--help, -r/--region REGION.  Always returns 0 when it reaches
+// the end; printSummary() exits the process on usage errors.
+int main(int argc, char** argv) {
+
+    if (argc < 2) {
+        printSummary(argv);
+    }
+
+    string region;
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+        {
+            /* These options set a flag. */
+            //{"verbose", no_argument,       &verbose_flag, 1},
+            {"help", no_argument, 0, 'h'},
+            {"region", required_argument, 0, 'r'},
+            {0, 0, 0, 0}
+        };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hr:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+        // printSummary() calls exit(), so nothing after it in these cases runs
+        case 'h':
+            printSummary(argv);
+            break;
+
+        case 'r':
+            region = optarg;
+            break;
+
+        case '?':
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    // NOTE(review): sampleNames is collected and de-duplicated below, but the
+    // header is built from vcf_header, not from this list; randomHeader is unused.
+    vector<string> sampleNames;
+    string randomHeader;
+    VariantCallFile* vcf;
+
+    // structure to track ordered variants
+    //ChromNameCompare chromCompare;
+
+    // alts -> (source file -> that file's current record)
+    typedef
+        map<vector<string>, // alts
+            map<VariantCallFile*, Variant*> >
+        Position;
+
+    // position -> Position
+    typedef
+        map<long int, Position>
+        ChromVariants;
+
+    // chromosome name -> ChromVariants, ordered by ChromNameCompare
+    typedef
+        map<string, // chrom
+            ChromVariants,
+            ChromNameCompare>
+        VariantsByChromPosAltFile;
+
+    VariantsByChromPosAltFile  variantsByChromPosAltFile;
+
+    // Open every input, queue its first record and gather its header lines.
+    // NOTE(review): the VariantCallFile and Variant objects allocated here are
+    // never deleted, except for a file whose region cannot be set.
+    VariantCallFile* firstVCF = NULL;
+    VCFHeader vcf_header;
+    for (int i = optind; i != argc; ++i) {
+        string inputFilename = argv[i];
+        vcf = new VariantCallFile;
+        vcf->open(inputFilename);
+        // NOTE(review): setRegion() is attempted before is_open() is checked
+        if (!region.empty()) {
+            if (!vcf->setRegion(region)) {
+                cerr << "could not set region on " << inputFilename << endl;
+                delete vcf;
+                continue;
+            }
+        }
+        // a file that failed to open is skipped without any message
+        if (vcf->is_open()) {
+            Variant* var = new Variant(*vcf);
+            if (vcf->getNextVariant(*var)) {
+                variantsByChromPosAltFile[var->sequenceName][var->position][var->alt][vcf] = var;
+                sampleNames.insert(sampleNames.end(), vcf->sampleNames.begin(), vcf->sampleNames.end());
+                // the first file is tracked for header generation
+            }
+            // populate the vcf_header with header_lines from this vcf file
+            vector<string> header_lines = split(vcf->vcf_header, "\n");
+            if (header_lines.size() > 0)
+            {
+                // populate the meta information lines
+                string column_headers_line;
+                for (vector<string>::const_iterator meta_iter = header_lines.begin(); meta_iter != header_lines.end(); ++meta_iter)
+                {
+                    vcf_header.addMetaInformationLine(*meta_iter);
+                    if ((*meta_iter).find("#CHROM") != string::npos) // store the header column position
+                    {
+                        column_headers_line = (*meta_iter);
+                    }
+                }
+                if (column_headers_line.size() > 0) // if there are header columns then add them
+                {
+                    vector<string> header_columns = split(column_headers_line, "\t");
+                    for (vector<string>::const_iterator column_iter = header_columns.begin(); column_iter != header_columns.end(); ++column_iter)
+                    {
+                        vcf_header.addHeaderColumn(*column_iter);
+                    }
+                }
+            }
+
+            // firstVCF is recorded but not read again in this function
+            if (firstVCF == NULL) firstVCF = vcf;
+        }
+    }
+
+    // get sorted, unique samples in all files
+    sort(sampleNames.begin(), sampleNames.end());
+    sampleNames.erase(unique(sampleNames.begin(), sampleNames.end()), sampleNames.end());
+
+    // now that we've accumulated the sample information we can generate the combined header
+    VariantCallFile outputCallFile;
+//    string header = firstVCF->headerWithSampleNames(sampleNames);
+    string header = vcf_header.getHeaderString();
+
+    outputCallFile.openForOutput(header);
+
+    cout << outputCallFile.header << endl;
+
+    // Merge loop: runs until every chromosome entry has been emptied and removed.
+    while (!variantsByChromPosAltFile.empty()) {
+        // get lowest variant(s)
+        // if they have identical alts and position, combine
+        // otherwise just output, but with the same sample names
+
+        ChromVariants& chrom = variantsByChromPosAltFile.begin()->second;
+        if (chrom.empty()) {
+            variantsByChromPosAltFile.erase(variantsByChromPosAltFile.begin());
+            continue;
+        }
+
+        // one output record per distinct ALT list at the lowest position
+        Position& pos = chrom.begin()->second;
+        Position::iterator s = pos.begin();
+        for ( ; s != pos.end(); ++s) {
+            Variant variant(outputCallFile);
+            map<VariantCallFile*, Variant*>& vars = s->second;
+            map<VariantCallFile*, Variant*>::iterator v = vars.begin();
+            for ( ; v != vars.end(); ++v) {
+                VariantCallFile* vcf = v->first;
+                Variant* var = v->second;
+                //if (variant.info.empty()) {
+                if (v == vars.begin()) { // set these using the first matching variant
+                    variant.sequenceName = var->sequenceName;
+                    variant.position = var->position;
+                    variant.id = var->id;
+                    variant.ref = var->ref;
+                    variant.alt = var->alt;
+                    variant.filter = var->filter;
+                    variant.quality = var->quality;
+                    variant.info = var->info;
+                    variant.format = var->format;
+                }
+                // add samples to output variant
+                for (Samples::iterator sample = var->samples.begin(); sample != var->samples.end(); ++sample) {
+                    variant.samples[sample->first] = sample->second;
+                }
+                // Advance this file; the same Variant object is refilled and
+                // re-queued under its new chromosome/position/alts.
+                // NOTE(review): this inserts into the maps being iterated.  If
+                // the next record has the same position but an ALT list that
+                // orders before the current one, it appears to be erased below
+                // without being written; input that is not sorted would also
+                // change which entry chrom.begin() refers to -- confirm.
+                if (vcf->getNextVariant(*var)) {
+                    variantsByChromPosAltFile[var->sequenceName][var->position][var->alt][vcf] = var;
+                }
+            }
+            // what was this chck for?
+            //if (!variant.info.empty())
+            cout << variant << endl;
+        }
+        // pop the last position
+        chrom.erase(chrom.begin());
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfcommonsamples.cpp b/src/vcfcommonsamples.cpp
new file mode 100644
index 0000000..ee594cb
--- /dev/null
+++ b/src/vcfcommonsamples.cpp
@@ -0,0 +1,85 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+// Returns the values that occur in both a and b.  Every shared value is
+// reported exactly once, and the result follows the key ordering of the
+// intermediate std::map (operator< on T), not the order of either input.
+template<class T>
+vector<T> intersection(vector<T>& a, vector<T>& b) {
+    // membership table for everything seen in a
+    map<T, bool> seenInA;
+    for (typename vector<T>::size_type idx = 0; idx < a.size(); ++idx) {
+        seenInA[a[idx]] = true;
+    }
+
+    // the keys of this table are the shared values; the map de-duplicates them
+    map<T, bool> shared;
+    for (typename vector<T>::size_type idx = 0; idx < b.size(); ++idx) {
+        if (seenInA.count(b[idx]) != 0) {
+            shared[b[idx]] = true;
+        }
+    }
+
+    // flatten the shared keys into the returned vector
+    vector<T> result;
+    for (typename map<T, bool>::const_iterator entry = shared.begin(); entry != shared.end(); ++entry) {
+        result.push_back(entry->first);
+    }
+    return result;
+}
+
+// Entry point of vcfcommonsamples: prints the header and records of the first
+// VCF restricted to the sample names that also appear in the second VCF.
+// Returns 1 on bad usage, when both arguments are the same, or when either
+// input cannot be opened; returns 0 otherwise.
+int main(int argc, char** argv) {
+
+    // exactly two positional arguments are required
+    if (argc != 3) {
+        cerr << "usage: " << argv[0] << " <vcf file> <vcf file>" << endl
+             << "outputs each record in the first file, removing samples not present in the second" << endl;
+        return 1;
+    }
+
+    string pathA = argv[1];
+    string pathB = argv[2];
+
+    // matching an input against itself would not remove anything
+    if (pathA == pathB) {
+        cerr << "you're just spinning your wheels matching the samples in "
+            << pathA << " to the samples in " << pathB << endl;
+        return 1;
+    }
+
+    // "-" selects standard input for either argument
+    VariantCallFile variantFileA;
+    if (pathA != "-") {
+        variantFileA.open(pathA);
+    } else {
+        variantFileA.open(std::cin);
+    }
+
+    VariantCallFile variantFileB;
+    if (pathB != "-") {
+        variantFileB.open(pathB);
+    } else {
+        variantFileB.open(std::cin);
+    }
+
+    // both inputs must be usable before anything is written
+    if (!(variantFileA.is_open() && variantFileB.is_open())) {
+        return 1;
+    }
+
+    Variant varA(variantFileA);
+    // NOTE(review): varB is constructed but never read in this function
+    Variant varB(variantFileB);
+
+    vector<string> sharedSamples = intersection(variantFileA.sampleNames, variantFileB.sampleNames);
+
+    // narrow the sample list kept by the file object, then the list used when printing varA
+    variantFileA.updateSamples(sharedSamples);
+    varA.setOutputSampleNames(sharedSamples);
+
+    // emit the header of the first file
+    cout << variantFileA.header << endl;
+
+    // records are printed as they are read; the sample restriction comes
+    // from the output sample names set on varA above
+    while (variantFileA.getNextVariant(varA)) {
+        cout << varA << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfcountalleles.cpp b/src/vcfcountalleles.cpp
new file mode 100644
index 0000000..9ce6aae
--- /dev/null
+++ b/src/vcfcountalleles.cpp
@@ -0,0 +1,33 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+// Entry point of vcfcountalleles: prints the sum of var.alleles.size() over
+// every record of the input VCF.  The input is the file named by the first
+// argument, or standard input when no argument is given.
+// Returns 1 if the input cannot be opened, 0 otherwise.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+
+    if (argc > 1) {
+        string filename = argv[1];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    // 64-bit unsigned accumulator: every record adds a size_t, and a plain
+    // int total would wrap once the sum exceeds INT_MAX on very large inputs.
+    // NOTE(review): despite its name this is a running per-record total, not
+    // a count of distinct alleles across the file.
+    unsigned long long uniqueAlleles = 0;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        uniqueAlleles += var.alleles.size();
+    }
+
+    cout << uniqueAlleles << endl;
+
+    return 0;
+
+}
+
diff --git a/src/vcfcreatemulti.cpp b/src/vcfcreatemulti.cpp
new file mode 100644
index 0000000..d4ac13c
--- /dev/null
+++ b/src/vcfcreatemulti.cpp
@@ -0,0 +1,197 @@
+#include "Variant.h"
+#include "convert.h"
+#include <set>
+#include <sstream>
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+
+// Parses s as a double through convert() and returns the parsed value.
+// The result is initialised to 0.0 so the return value is well defined even
+// if convert() leaves its output argument untouched.
+// NOTE(review): convert() is declared in convert.h; its behaviour on
+// unparsable input is not visible from here -- confirm.
+double convertStrDbl(const string& s) {
+    double r = 0.0;
+    convert(s, r);
+    return r;
+}
+
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] [file]" << endl
+         << endl
+         << "If overlapping alleles are represented across multiple records, merge" << endl
+         << "them into a single record.  Currently only for indels." << endl;
+    exit(0);
+}
+
+// Merges a group of records into a single multi-allelic record.
+//
+// vars: the records to merge.  front() is read unconditionally, so the
+//       vector must not be empty.
+//       NOTE(review): the arithmetic below presumably expects the records to
+//       be sorted by position and mutually overlapping, which is how main()
+//       groups them -- confirm for other callers.
+//
+// Returns a copy of the only record when vars holds exactly one.  Otherwise
+// returns a copy of the first record whose REF spans the whole group, whose
+// ALT list holds every input ALT padded with the REF bases that lie outside
+// that input's own REF span, and whose INFO "combined" entry gains the value
+// "<first position>-<last position>".
+Variant createMultiallelic(vector<Variant>& vars) {
+
+    // a lone record has nothing to merge with
+    if (vars.size() == 1) {
+        return vars.front();
+    }
+
+    // Build the merged REF: start from the first record's REF and, for every
+    // later record whose REF reaches past the current end, append the part
+    // that sticks out.
+    int start = vars.front().position;
+    string ref = vars.front().ref;
+
+    for (vector<Variant>::iterator v = vars.begin() + 1; v != vars.end(); ++v) {
+        // number of bases by which this record's REF extends beyond the merged REF
+        int sdiff = (v->position + v->ref.size()) - (start + ref.size());
+        // offset inside this record's REF at which the merged REF currently ends
+        int pdiff = (start + ref.size()) - v->position;
+        if (sdiff > 0) {
+            ref.append(v->ref.substr(pdiff, sdiff));
+        }
+    }
+
+    // the merged record starts as a copy of the first one, with REF/ALT rebuilt
+    Variant var = vars.front();
+    var.alt.clear();
+    var.ref = ref;
+
+    for (vector<Variant>::iterator v = vars.begin(); v != vars.end(); ++v) {
+        // merged-REF bases in front of (5') and behind (3') this record's own REF
+        int p5diff = v->position - var.position;
+        int p3diff = (var.position + var.ref.size()) - (v->position + v->ref.size());
+        string before;
+        string after;
+        if (p5diff > 0) {
+            before = var.ref.substr(0, p5diff);
+        }
+        if (p3diff > 0 && (size_t) p3diff < var.ref.size()) {
+            after = var.ref.substr(var.ref.size() - p3diff);
+        }
+        // splice every alternate into the merged REF context; when the record
+        // already spans the whole merged REF both paddings are empty and the
+        // alternate is carried over unchanged
+        for (vector<string>::iterator a = v->alt.begin(); a != v->alt.end(); ++a) {
+            var.alt.push_back(before + *a + after);
+        }
+    }
+
+    // record which positions were folded into this record
+    stringstream s;
+    s << vars.front().position << "-" << vars.back().position;
+    var.info["combined"].push_back(s.str());
+
+    return var;
+
+}
+
+// Entry point of vcfcreatemulti.  Reads a VCF (file named by the first
+// non-option argument, or standard input), groups consecutive records on the
+// same sequence whose position lies before the end of the group's REF span,
+// and prints one record per group as produced by createMultiallelic().
+// Returns 1 if the input cannot be opened, 0 otherwise.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+        {
+            {"help", no_argument, 0, 'h'},
+            {0, 0, 0, 0}
+        };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "h",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+            case 'h':
+                printSummary(argv);
+                break;
+
+            case '?':
+                // NOTE(review): printSummary() itself calls exit(0), so the
+                // exit(1) below is never reached and an invalid option ends
+                // the program with status 0.
+                printSummary(argv);
+                exit(1);
+                break;
+
+            default:
+                abort ();
+        }
+    }
+
+    if (optind < argc) {
+        string filename = argv[optind];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    variantFile.addHeaderLine("##INFO=<ID=combined,Number=1,Type=String,Description=\"Range of overlapping variants which were combined into this one using vcfcreatemulti.\">");
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    vector<Variant> vars;   // records of the group currently being collected
+    string lastSeq;         // sequence name of the current group
+    int maxpos = 0;         // largest position + REF length seen in the current group
+
+    while (variantFile.getNextVariant(var)) {
+
+        int varEnd = var.position + var.ref.size();
+
+        // same sequence and starting before the group's end: the record joins the group
+        if (!vars.empty() && var.sequenceName == lastSeq && var.position < maxpos) {
+            vars.push_back(var);
+            if (maxpos < varEnd) {
+                maxpos = varEnd;
+            }
+            continue;
+        }
+
+        // otherwise the current group (if any) is complete: emit it
+        if (!vars.empty()) {
+            Variant result = createMultiallelic(vars);
+            cout << result << endl;
+            vars.clear();
+        }
+
+        // start a new group with this record
+        lastSeq = var.sequenceName;
+        vars.push_back(var);
+        maxpos = varEnd;
+
+    }
+
+    // emit the final group
+    if (!vars.empty()) {
+        Variant result = createMultiallelic(vars);
+        cout << result << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfdistance.cpp b/src/vcfdistance.cpp
new file mode 100644
index 0000000..61e798e
--- /dev/null
+++ b/src/vcfdistance.cpp
@@ -0,0 +1,92 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+// Entry point of vcfdistance.  Reads a VCF from standard input and prints it
+// with an INFO tag (BasesToClosestVariant) on each record that has at least
+// one neighbouring record on the same sequence; the value is the smaller of
+// the position differences to the previous and to the next record.
+// Returns 1 on bad usage or unreadable input, 0 otherwise.
+int main(int argc, char** argv) {
+
+    // input is taken from standard input only; any argument prints the usage text
+    if (argc > 1) {
+        cerr << "usage: " << argv[0] << " <[vcf file]" << endl
+             << "adds a tag (BasesToClosestVariant) to each variant record which indicates" << endl
+             << "the distance to the nearest variant" << endl;
+        return 1;
+    }
+
+    VariantCallFile variantFile;
+    variantFile.open(std::cin);
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    // three reusable records form a sliding window: previous, current, next
+    Variant varA(variantFile);
+    Variant varB(variantFile);
+    Variant varC(variantFile);
+
+    vector<Variant*> vars;
+    vars.push_back(&varA);
+    vars.push_back(&varB);
+    vars.push_back(&varC);
+
+    // prime the window; the return values are ignored and a missing record is
+    // detected below by its empty sequenceName
+    // NOTE(review): this relies on a failed read leaving sequenceName empty -- confirm
+    for (vector<Variant*>::iterator v = vars.begin(); v != vars.end(); ++v) {
+        variantFile.getNextVariant(**v);
+    }
+
+    string tag = "BasesToClosestVariant";
+    string line = "##INFO=<ID=" + tag + ",Number=1,Type=Integer,Description=\""
+        + "Number of bases to the closest variant in the file.\">";
+    variantFile.addHeaderLine(line);
+
+    cout << variantFile.header << endl;
+
+    // Wherever the tag is written, any value already present is cleared first
+    // so that a record which arrives with the tag set does not end up with two
+    // values for a Number=1 field.
+    if (!vars.at(0)->sequenceName.empty()) {
+        if (!vars.at(1)->sequenceName.empty()) {
+            // at least two variants, so calculate the first distance
+            if (vars.at(0)->sequenceName == vars.at(1)->sequenceName) {
+                vars.at(0)->info[tag].clear();
+                vars.at(0)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position));
+            }
+            cout << *vars.at(0) << endl;
+
+            if (!vars.at(2)->sequenceName.empty()) {
+                // at least three variants, so starting with the first three,
+                // calculate the middle variant's closest distance, and then
+                // slide the window forward one.
+                do {
+                    if (vars.at(1)->sequenceName == vars.at(0)->sequenceName &&
+                        vars.at(1)->sequenceName == vars.at(2)->sequenceName) {
+                        vars.at(1)->info[tag].clear();
+                        vars.at(1)->info[tag].push_back(convert(min(vars.at(1)->position - vars.at(0)->position,
+                                                                    vars.at(2)->position - vars.at(1)->position)));
+                    } else if (vars.at(1)->sequenceName == vars.at(0)->sequenceName) {
+                        vars.at(1)->info[tag].clear();
+                        vars.at(1)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position));
+                    } else if (vars.at(2)->sequenceName == vars.at(1)->sequenceName) {
+                        vars.at(1)->info[tag].clear();
+                        vars.at(1)->info[tag].push_back(convert(vars.at(2)->position - vars.at(1)->position));
+                    } else {
+                        // no neighbour on the same sequence: don't add the tag
+                    }
+                    cout << *vars.at(1) << endl;
+                    // rotate: the oldest record becomes the slot for the next read
+                    Variant* v = vars.at(0);
+                    vars.at(0) = vars.at(1);
+                    vars.at(1) = vars.at(2);
+                    vars.at(2) = v;
+                } while (variantFile.getNextVariant(*vars.back()));
+            }
+
+            // assign the last distance and output the last variant
+            if (vars.at(0)->sequenceName == vars.at(1)->sequenceName) {
+                vars.at(1)->info[tag].clear();
+                vars.at(1)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position));
+            }
+            cout << *vars.at(1) << endl;
+        } else {
+            // output the lone variant line untouched
+            cout << *vars.at(0) << endl;
+        }
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfecho.cpp b/src/vcfecho.cpp
new file mode 100644
index 0000000..b850440
--- /dev/null
+++ b/src/vcfecho.cpp
@@ -0,0 +1,31 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+// Entry point of vcfecho: copies a VCF to standard output by printing the
+// header held by the file object and then every record as it is read.  The
+// input is the file named by the first argument, or standard input when no
+// argument is given.  Returns 1 if the input cannot be opened, 0 otherwise.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+
+    // no argument means: read from standard input
+    if (argc <= 1) {
+        variantFile.open(std::cin);
+    } else {
+        string filename = argv[1];
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    // one record object is reused for every line
+    for (Variant record(variantFile); variantFile.getNextVariant(record); ) {
+        cout << record << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfentropy.cpp b/src/vcfentropy.cpp
new file mode 100644
index 0000000..4f92691
--- /dev/null
+++ b/src/vcfentropy.cpp
@@ -0,0 +1,159 @@
+#include "Variant.h"
+#include "split.h"
+#include "fastahack/Fasta.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
+         << endl
+         << "options:" << endl 
+         << "    -f, --fasta-reference  FASTA reference file to use to obtain flanking sequences" << endl
+         << "    -w, --window-size      Size of the window over which to calculate entropy" << endl
+         << endl
+         << "Anotates the output VCF file with, for each record, EntropyLeft, EntropyRight," << endl
+         << "EntropyCenter, which are the entropies of the sequence of the given window size to the" << endl
+         << "left, right, and center  of the record.  Also adds EntropyRef and EntropyAlt for each alt." << endl
+         << endl;
+    exit(0);
+}
+
+
+// Shannon entropy of seq as computed by shannon_H(); an empty sequence is
+// reported as 0 without calling shannon_H().
+static double windowEntropy(const string& seq) {
+    if (seq.empty()) {
+        return 0.0;
+    }
+    return shannon_H((char*) seq.c_str(), seq.size());
+}
+
+// Entry point of vcfentropy.  Annotates every record of the input VCF (last
+// argument, or standard input) with EntropyLeft, EntropyRight, EntropyCenter,
+// EntropyRef and EntropyAlt, using windows of --window-size bases taken from
+// the --fasta-reference file.  Exits with status 1 when the window size or
+// the reference is missing or when the input cannot be opened.
+int main(int argc, char** argv) {
+
+    int c;
+    string fastaRef;
+    int windowSize = 0;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    while (true) {
+        static struct option long_options[] =
+        {
+            {"help", no_argument, 0, 'h'},
+            {"fasta-reference",  required_argument, 0, 'f'},
+            {"window-size", required_argument, 0, 'w'},
+            {0, 0, 0, 0}
+        };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hf:w:",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+        case 0:
+            /* If this option set a flag, do nothing else now. */
+            if (long_options[option_index].flag != 0)
+                break;
+            printf ("option %s", long_options[option_index].name);
+            if (optarg)
+                printf (" with arg %s", optarg);
+            printf ("\n");
+            break;
+
+        case 'f':
+            fastaRef = optarg;
+            break;
+
+        case 'w':
+            windowSize = atoi(optarg);
+            break;
+
+        case 'h':
+            printSummary(argv);
+            exit(0);
+            break;
+
+        case '?':
+            /* getopt_long already printed an error message. */
+            // NOTE(review): printSummary() itself calls exit(0), so the
+            // exit(1) below is never reached and an invalid option ends the
+            // program with status 0.
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    if (windowSize == 0) {
+        cerr << "a window size must be specified" << endl;
+        exit(1);
+    }
+    // a negative size would be used as a length and as an offset below
+    if (windowSize < 0) {
+        cerr << "the window size must be positive" << endl;
+        exit(1);
+    }
+    if (fastaRef.empty()) {
+        cerr << "a FASTA reference sequence must be specified" << endl;
+        exit(1);
+    }
+
+    FastaReference ref;
+    ref.open(fastaRef);
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    variantFile.addHeaderLine("##INFO=<ID=EntropyLeft,Number=1,Type=Float,Description=\"Entropy of left-flanking sequence of "+ convert(windowSize) +"bp\">");
+    variantFile.addHeaderLine("##INFO=<ID=EntropyCenter,Number=1,Type=Float,Description=\"Entropy of centered sequence of "+ convert(windowSize) +"bp\">");
+    variantFile.addHeaderLine("##INFO=<ID=EntropyRight,Number=1,Type=Float,Description=\"Entropy of right-flanking sequence of "+ convert(windowSize) +"bp\">");
+    variantFile.addHeaderLine("##INFO=<ID=EntropyRef,Number=1,Type=Float,Description=\"Entropy of REF allele\">");
+    variantFile.addHeaderLine("##INFO=<ID=EntropyAlt,Number=A,Type=Float,Description=\"Entropy of each ALT allele\">");
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+
+        // get the ref start and end positions
+        int refstart = var.position - 1; // convert to 0-based
+        int refend = var.position + var.ref.size() - 1;
+
+        // left flank: up to windowSize bases directly in front of the record,
+        // shortened when the record sits closer than that to the sequence start
+        int leftStart = refstart - windowSize;
+        if (leftStart < 0) {
+            leftStart = 0;
+        }
+        string leftseq;
+        if (refstart > leftStart) {
+            leftseq = ref.getSubSequence(var.sequenceName, leftStart, refstart - leftStart);
+        }
+
+        string rightseq = ref.getSubSequence(var.sequenceName, refend, windowSize);
+
+        // centred window, likewise kept from starting before the sequence start
+        int centerStart = refstart - windowSize/2;
+        if (centerStart < 0) {
+            centerStart = 0;
+        }
+        string centerseq = ref.getSubSequence(var.sequenceName, centerStart, windowSize);
+
+        // entropy is taken over the bases actually returned: a window that
+        // was cut short must not be read as if it held windowSize characters
+        double entropyLeft = windowEntropy(leftseq);
+        double entropyRight = windowEntropy(rightseq);
+        double entropyCenter = windowEntropy(centerseq);
+        double entropyRef = shannon_H((char*) var.ref.c_str(), var.ref.size());
+
+        // drop values already present on the record before writing new ones
+        var.info["EntropyLeft"].clear();
+        var.info["EntropyRight"].clear();
+        var.info["EntropyCenter"].clear();
+        var.info["EntropyRef"].clear();
+        var.info["EntropyAlt"].clear();
+
+        var.info["EntropyLeft"].push_back(convert(entropyLeft));
+        var.info["EntropyRight"].push_back(convert(entropyRight));
+        var.info["EntropyCenter"].push_back(convert(entropyCenter));
+        var.info["EntropyRef"].push_back(convert(entropyRef));
+
+        // one EntropyAlt value per alternate allele, in ALT order
+        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+            double entropyAlt = shannon_H((char*) a->c_str(), a->size());
+            var.info["EntropyAlt"].push_back(convert(entropyAlt));
+        }
+
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfevenregions.cpp b/src/vcfevenregions.cpp
new file mode 100644
index 0000000..5888c98
--- /dev/null
+++ b/src/vcfevenregions.cpp
@@ -0,0 +1,202 @@
+#include "Variant.h"
+#include "split.h"
+#include "fastahack/Fasta.h"
+#include <getopt.h>
+#include <cmath>
+
+using namespace std;
+using namespace vcf;
+
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
+         << endl
+         << "options:" << endl 
+         << "    -f, --fasta-reference REF    FASTA reference file to use to obtain primer sequences." << endl
+         << "    -n, --number-of-regions N    The number of desired regions." << endl
+         << "    -p, --number-of-positions N  The number of positions per region." << endl
+         << "    -o, --offset N               Add an offset to region positioning, to avoid boundary" << endl
+         << "                                 related artifacts in downstream processing." << endl
+         << "    -l, --overlap N              The number of sites to overlap between regions.  Default 0." << endl
+         << "    -s, --separator SEQ          Specify string to use to separate region output.  Default '-'" << endl
+         << endl
+         << "Generates a list of regions, e.g. chr20:10..30 using the variant" << endl
+         << "density information provided in the VCF file to ensure that the regions have" << endl
+         << "even numbers of variants.  This can be use to reduce the variance in runtime" << endl
+         << "when dividing variant detection or genotyping by genomic coordinates." << endl;
+    exit(0);
+}
+
+
+// A start/end coordinate pair together with a counter of the positions that
+// have been assigned to it.
+struct Region {
+    long int start;   // first coordinate of the span
+    long int end;     // end coordinate of the span
+    int positions;    // number of positions counted into this region
+
+    // default region: all fields zero
+    Region() {
+        start = 0;
+        end = 0;
+        positions = 0;
+    }
+
+    // region from s to e with no positions counted yet
+    Region(long int s, long int e) {
+        start = s;
+        end = e;
+        positions = 0;
+    }
+};
+
+
+// Entry point of vcfevenregions.  Reads every record of the input VCF (last
+// argument, or standard input), then prints, per sequence, regions of the
+// form <sequence>:<start><separator><end> that each cover the same number of
+// records.  The region size is either given directly (-p) or derived from the
+// total record count and the requested number of regions (-n).  The last
+// region of each sequence ends at the sequence length reported by the FASTA
+// reference.  Exits with status 1 on invalid options or unreadable input.
+int main(int argc, char** argv) {
+
+    int c;
+    string fastaRef;
+    int number_of_regions = 1;
+    int number_of_positions = 0;
+    int offset = 0;
+    int overlap = 0;
+    string regionSplitSeq = "-";
+
+    if (argc == 1)
+        printSummary(argv);
+
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"fasta-reference",  required_argument, 0, 'f'},
+                {"number-of-regions",  required_argument, 0, 'n'},
+                {"number-of-positions",  required_argument, 0, 'p'},
+                {"offset",  required_argument, 0, 'o'},
+                {"overlap",  required_argument, 0, 'l'},
+                {"separator",  required_argument, 0, 's'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hf:n:o:l:s:p:",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+        case 0:
+            /* If this option set a flag, do nothing else now. */
+            if (long_options[option_index].flag != 0)
+                break;
+            printf ("option %s", long_options[option_index].name);
+            if (optarg)
+                printf (" with arg %s", optarg);
+            printf ("\n");
+            break;
+
+        case 'f':
+            fastaRef = optarg;
+            break;
+
+        case 'n':
+            number_of_regions = atoi(optarg);
+            break;
+
+        case 'p':
+            number_of_positions = atoi(optarg);
+            break;
+
+        case 'o':
+            offset = atoi(optarg);
+            break;
+
+        case 'l':
+            overlap = atoi(optarg);
+            break;
+
+        case 's':
+            regionSplitSeq = optarg;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            exit(0);
+            break;
+
+        case '?':
+            /* getopt_long already printed an error message. */
+            // NOTE(review): printSummary() itself calls exit(0), so the
+            // exit(1) below is never reached and an invalid option ends the
+            // program with status 0.
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    if (fastaRef.empty()) {
+        cerr << "a FASTA reference sequence must be specified" << endl;
+        exit(1);
+    }
+
+    // the overlap is used both as a count and as a step back through the
+    // record list, so it cannot be negative
+    if (overlap < 0) {
+        cerr << "the overlap must not be negative" << endl;
+        exit(1);
+    }
+
+    // the region count is the divisor when no explicit region size is given
+    if (!number_of_positions && number_of_regions < 1) {
+        cerr << "the number of regions must be at least 1" << endl;
+        exit(1);
+    }
+
+    FastaReference ref;
+    ref.open(fastaRef);
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    // per sequence, the offset-shifted span of every record in input order
+    map<string, vector<Region> > positions_by_chrom;
+    int total_positions = 0;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        int refstart = var.position - 1; // convert to 0-based
+        positions_by_chrom[var.sequenceName].push_back(Region(refstart + offset, refstart + offset + var.ref.size()));
+        ++total_positions;
+    }
+
+    int positions_per_region;
+    if (number_of_positions) {
+        if (number_of_positions - overlap < 0) {
+            cerr << "overlap is greater than the number of positions per region!" << endl;
+            exit(1);
+        } else {
+            positions_per_region = number_of_positions - overlap;
+        }
+    } else {
+        positions_per_region = ceil((double) total_positions / (double) number_of_regions);
+    }
+
+    // todo, update routine to allow overlaps
+
+    for (map<string, vector<Region> >::iterator s = positions_by_chrom.begin();
+         s != positions_by_chrom.end(); ++s) {
+        vector<Region>& regions = s->second;
+        Region current_region;
+        for (vector<Region>::iterator p = regions.begin(); p != regions.end(); ++p) {
+            if (current_region.positions < positions_per_region + overlap) {
+                // still room in the current region: extend it to this record
+                current_region.end = p->end;
+                current_region.positions++;
+            } else {
+                cout << s->first << ":" << current_region.start << regionSplitSeq << current_region.end << endl;
+                // The next region starts at the end of the record overlap+1
+                // entries back, or at the first record if there are fewer.
+                // The step is computed on indices because forming an iterator
+                // in front of begin() is undefined behaviour.
+                long int idx = p - regions.begin();
+                long int back = (long int) overlap + 1;
+                vector<Region>::iterator l = regions.begin() + (idx > back ? idx - back : 0);
+                current_region.start = l->end;
+                current_region.end = p->end;
+                current_region.positions = overlap + 1;
+            }
+        }
+        // get refseq size, use as end coordinate for last region in target
+        current_region.end = ref.sequenceLength(s->first);
+        cout << s->first << ":" << current_region.start << regionSplitSeq << current_region.end << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcffilter.cpp b/src/vcffilter.cpp
new file mode 100644
index 0000000..9150ff1
--- /dev/null
+++ b/src/vcffilter.cpp
@@ -0,0 +1,402 @@
+#include "Variant.h"
+#include "split.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+/*
+ * Print the vcffilter command-line help to stderr and terminate the process
+ * with exit status 0.  This function does not return.
+ *
+ * argv: the program's argument vector; only argv[0] is read, to show the
+ *       name the tool was invoked under.
+ */
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
+         << endl
+         << "options:" << endl
+         << "    -f, --info-filter     specifies a filter to apply to the info fields of records," << endl
+         << "                          removes alleles which do not pass the filter" << endl
+         << "    -g, --genotype-filter specifies a filter to apply to the genotype fields of records" << endl
+         << "    -k, --keep-info       used in conjunction with '-g', keeps variant info, but removes genotype" << endl
+         << "    -s, --filter-sites    filter entire records, not just alleles" << endl
+         << "    -t, --tag-pass        tag vcf records as positively filtered with this tag, print all records" << endl
+         << "    -F, --tag-fail        tag vcf records as negatively filtered with this tag, print all records" << endl
+         << "    -A, --append-filter   append the existing filter tag, don't just replace it" << endl
+         << "    -a, --allele-tag      apply -t on a per-allele basis.  adds or sets the corresponding INFO field tag" << endl
+         << "    -v, --invert          inverts the filter, e.g. grep -v" << endl
+         << "    -o, --or              use logical OR instead of AND to combine filters" << endl
+         << "    -r, --region          specify a region on which to target the filtering, requires a BGZF" << endl
+         << "                          compressed file which has been indexed with tabix.  any number of" << endl
+         << "                          regions may be specified." << endl
+         << endl
+         << "Filter the specified vcf file using the set of filters." << endl
+         // the closing quote after <value> was missing in the printed text
+         << "Filters are specified in the form \"<ID> <operator> <value>\":" << endl
+         << " -f \"DP > 10\"  # for info fields" << endl
+         << " -g \"GT = 1|1\" # for genotype fields" << endl
+         << " -f \"CpG\"  # for 'flag' fields" << endl
+         << endl
+         << "Operators can be any of: =, !, <, >, |, &" << endl
+         << endl
+         << "Any number of filters may be specified.  They are combined via logical AND" << endl
+         << "unless --or is specified on the command line.  Obtain logical negation through" << endl
+         << "the use of parentheses, e.g. \"! ( DP = 10 )\"" << endl
+         << endl
+         << "For convenience, you can specify \"QUAL\" to refer to the quality of the site, even" << endl
+         << "though it does not appear in the INFO fields." << endl
+         << endl;
+    exit(0);
+}
+
+/*
+ * Evaluate the filters in `filters`, in order, against `var` and combine the
+ * individual results.
+ *
+ *   var        record under test
+ *   filters    filters to evaluate
+ *   logicalOr  true:  succeed as soon as one filter passes (OR)
+ *              false: fail as soon as one filter fails (AND)
+ *   alt        when non-empty it is forwarded to the three-argument
+ *              VariantFilter::passes() overload; otherwise the two-argument
+ *              overload is used
+ *
+ * With an empty filter list the result is true under AND and false under OR.
+ */
+bool passesFilters(Variant& var, vector<VariantFilter>& filters, bool logicalOr, string alt = "") {
+    vector<VariantFilter>::iterator filt = filters.begin();
+    while (filt != filters.end()) {
+        // empty string handed to passes() as its second argument on every call
+        string scratch = "";
+        const bool ok = alt.empty() ? filt->passes(var, scratch)
+                                    : filt->passes(var, scratch, alt);
+        // OR stops at the first success, AND at the first failure; in both
+        // cases the deciding result equals logicalOr and is the answer
+        if (ok == logicalOr) {
+            return ok;
+        }
+        ++filt;
+    }
+    // nothing decided early: OR saw no success, AND saw no failure
+    return !logicalOr;
+}
+
+
+/*
+ * vcffilter entry point.
+ *
+ * Parses the command line, opens the VCF input (the single trailing
+ * positional argument, or stdin otherwise), prints the header extended with
+ * a record of the applied filters, then streams every record through the
+ * genotype (-g) and info (-f) filters and prints the result to stdout.
+ *
+ * Returns 0 on success and 1 when the input cannot be opened.  Usage errors
+ * terminate the process from within option handling.
+ */
+int main(int argc, char** argv) {
+
+    int c;
+    bool invert = false;
+    bool logicalOr = false;
+    bool filterSites = false;
+    bool keepInfo = false;
+    vector<string> infofilterStrs;
+    vector<VariantFilter> infofilters;
+    vector<string> genofilterStrs;
+    vector<VariantFilter> genofilters;
+    string tagPass = "";
+    string tagFail = "";
+    string filterSpec;
+    string alleleTag;
+    vector<string> regions;
+    bool replaceFilter = true;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"filter-sites", no_argument, 0, 's'},
+                {"info-filter",  required_argument, 0, 'f'},
+                {"genotype-filter",  required_argument, 0, 'g'},
+                {"tag-pass", required_argument, 0, 't'},
+                // this entry used to repeat the name "tag-pass", which left
+                // the documented --tag-fail option unrecognized
+                {"tag-fail", required_argument, 0, 'F'},
+                {"append-filter", no_argument, 0, 'A'},
+                {"allele-tag", required_argument, 0, 'a'},
+                {"invert", no_argument, 0, 'v'},
+                {"or", no_argument, 0, 'o'},
+                {"region", required_argument, 0, 'r'},
+                {"keep-info", no_argument, 0, 'k'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hvAsof:g:kt:F:r:a:",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+        case 0:
+            /* If this option set a flag, do nothing else now. */
+            if (long_options[option_index].flag != 0)
+                break;
+            printf ("option %s", long_options[option_index].name);
+            if (optarg)
+                printf (" with arg %s", optarg);
+            printf ("\n");
+            break;
+
+        case 'f':
+            filterSpec += " " + string(optarg);
+            infofilterStrs.push_back(string(optarg));
+            break;
+
+        case 's':
+            filterSites = true;
+            break;
+
+        case 'a':
+            alleleTag = optarg;
+            break;
+
+        case 'g':
+            filterSpec += " genotypes filtered with: " + string(optarg);
+            genofilterStrs.push_back(string(optarg));
+            break;
+
+        case 't':
+            tagPass = optarg;
+            break;
+
+        case 'F':
+            tagFail = optarg;
+            break;
+
+        case 'A':
+            replaceFilter = false;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            exit(0);
+            break;
+
+        case 'v':
+            invert = true;
+            break;
+
+        case 'o':
+            logicalOr = true;
+            break;
+
+        case 'r':
+            regions.push_back(optarg);
+            break;
+
+        case 'k':
+            keepInfo = true;
+            break;
+
+        case '?':
+            /* getopt_long already printed an error message. */
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    if (genofilterStrs.empty() && keepInfo) {
+        // diagnostics belong on stderr so they can never end up inside a
+        // VCF stream that is being piped from stdout
+        cerr << "argument '-k' (--keep-info) requires a Genotype filter: ('-g')" << endl
+             << "i.e.: -g \"GT = 1|1\" -k" << endl;
+        exit(1);
+    }
+
+    // every -f/-g appended text starting with " "; strip that leading blank.
+    // Guarded because substr(1) on an empty string throws std::out_of_range,
+    // which happened whenever neither -f nor -g was given.
+    if (!filterSpec.empty()) {
+        filterSpec = filterSpec.substr(1);
+    }
+    // The header loop below clears filterSpec once it has been written, so
+    // keep a copy for the FILTER/INFO descriptions that are added afterwards.
+    const string filterDescription = filterSpec;
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    for (vector<string>::iterator f = infofilterStrs.begin(); f != infofilterStrs.end(); ++f) {
+        infofilters.push_back(VariantFilter(*f, VariantFilter::RECORD, variantFile.infoTypes));
+    }
+
+    for (vector<string>::iterator f = genofilterStrs.begin(); f != genofilterStrs.end(); ++f) {
+        genofilters.push_back(VariantFilter(*f, VariantFilter::SAMPLE, variantFile.formatTypes));
+    }
+
+    // Rebuild the header, inserting one ##filter line in front of the first
+    // line containing "INFO", or in front of the last header line if no such
+    // line exists.
+    vector<string> headerlines = split(variantFile.header, "\n");
+    variantFile.header.clear();
+    for (vector<string>::iterator l = headerlines.begin(); l != headerlines.end(); ++l) {
+        if (!filterSpec.empty() && (l->find("INFO") != string::npos || l + 1 == headerlines.end())) {
+            variantFile.header += "##filter=\"" + filterSpec + "\"\n";
+            filterSpec.clear(); // marks the line as written
+        }
+        variantFile.header += *l + ((l + 1 == headerlines.end()) ? "" : "\n");
+    }
+
+    if (!tagPass.empty()) {
+        variantFile.addHeaderLine("##FILTER=<ID="+ tagPass +",Description=\"Record passes the filters: " + filterDescription + ".\">");
+    }
+
+    if (!tagFail.empty()) {
+        variantFile.addHeaderLine("##FILTER=<ID="+ tagFail +",Description=\"Record fails the filters: " + filterDescription + ".\">");
+    }
+
+    if (!alleleTag.empty()) {
+        // per-allele tagging always needs a pass value; the fail value may
+        // stay empty
+        if (tagPass.empty()) {
+            tagPass = "PASS";
+        }
+        variantFile.addHeaderLine("##INFO=<ID="+ alleleTag +",Number=A,Type=String,Description=\"" + tagPass + " if this allele passes the filters, " + tagFail + " if not, filters are: " + filterDescription + ".\">");
+    }
+
+    cout << variantFile.header << endl;
+
+    if (filterSites) {
+        variantFile.parseSamples = false;
+    }
+
+    Variant var(variantFile);
+
+    // Regions are only applied when reading from a named file.  Previously
+    // the region iterator was never advanced for stdin input, so giving -r
+    // together with stdin made the outer loop spin forever.
+    const bool useRegions = !inputFilename.empty() && !regions.empty();
+    if (!regions.empty() && !useRegions) {
+        cerr << "warning: regions are ignored unless a single input file is given" << endl;
+    }
+    vector<string>::iterator regionItr = regions.begin();
+
+    do {
+
+        if (useRegions) {
+            string regionStr = *regionItr++;
+            variantFile.setRegion(regionStr);
+        }
+
+        while (variantFile.getNextVariant(var)) {
+
+            for (vector<VariantFilter>::iterator f = genofilters.begin(); f != genofilters.end(); ++f) {
+                f->removeFilteredGenotypes(var, keepInfo);
+            }
+
+            // no info filters: print the untouched input line unless the
+            // genotype filters were applied to the record
+            if (infofilters.empty()) {
+                if (genofilters.empty()) {
+                    cout << variantFile.line << endl;
+                } else {
+                    cout << var << endl;
+                }
+                continue;
+            }
+
+            if (filterSites) {
+                // whole-record filtering
+                bool passes = passesFilters(var, infofilters, logicalOr);
+                if (invert) {
+                    passes = !passes;
+                }
+                if (passes) {
+                    if (!tagPass.empty()) {
+                        if (alleleTag.empty()) {
+                            if (replaceFilter) {
+                                var.filter.clear();
+                            }
+                            var.addFilter(tagPass);
+                        } else {
+                            var.info[alleleTag].assign(var.alt.size(), tagPass);
+                        }
+                        cout << var << endl;
+                    } else {
+                        // A passing, untagged record is printed exactly once.
+                        // It used to be printed a second time whenever -F
+                        // was given without -t.
+                        if (!var.originalLine.empty()) {
+                            cout << var.originalLine << endl;
+                        } else {
+                            cout << var << endl;
+                        }
+                    }
+                } else if (!tagFail.empty()) {
+                    // failing records are only printed when they can be tagged
+                    if (alleleTag.empty()) {
+                        if (replaceFilter) {
+                            var.filter.clear();
+                        }
+                        var.addFilter(tagFail);
+                    } else {
+                        var.info[alleleTag].assign(var.alt.size(), tagFail);
+                    }
+                    cout << var << endl;
+                }
+                continue;
+            }
+
+            // Per-allele filtering: evaluate the filters once per alternate.
+            // NOTE(review): --invert is only honoured in the --filter-sites
+            // branch above; confirm whether it should apply here as well.
+            vector<string> failingAlts;
+            vector<bool> passes;
+            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+                const bool ok = passesFilters(var, infofilters, logicalOr, *a);
+                passes.push_back(ok);
+                if (!ok) {
+                    failingAlts.push_back(*a);
+                }
+            }
+            const bool anyPassing = failingAlts.size() < var.alt.size();
+
+            if (tagPass.empty()) {
+                // no tag requested: drop the failing alternates and print the
+                // record only if at least one alternate is left
+                if (anyPassing) {
+                    for (vector<string>::iterator a = failingAlts.begin(); a != failingAlts.end(); ++a) {
+                        var.removeAlt(*a);
+                    }
+                    cout << var << endl;
+                }
+            } else {
+                // tagging requested: keep all alternates, tag, always print
+                if (alleleTag.empty()) {
+                    if (replaceFilter) {
+                        var.filter.clear();
+                    }
+                    if (anyPassing) {
+                        var.addFilter(tagPass);
+                    } else if (!tagFail.empty()) {
+                        var.addFilter(tagFail);
+                    }
+                } else {
+                    var.info[alleleTag].clear();
+                    for (vector<bool>::iterator p = passes.begin(); p != passes.end(); ++p) {
+                        var.info[alleleTag].push_back(*p ? tagPass : tagFail);
+                    }
+                }
+                // TODO
+                // here, if we don't use genotype filters, we shouldn't re-print the samples
+                // we haven't done anything to this part of the input.
+                cout << var << endl;
+            }
+        }
+
+    } while (useRegions && regionItr != regions.end());
+
+    return 0;
+
+}
+
diff --git a/src/vcffixup.cpp b/src/vcffixup.cpp
new file mode 100644
index 0000000..be79d36
--- /dev/null
+++ b/src/vcffixup.cpp
@@ -0,0 +1,117 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <sstream>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+// Sum, over all samples of var that carry a GT field, the count that
+// decomposeGenotype() reports for alleleIndex in the first GT value.
+// Samples without a GT field contribute nothing.
+int countAlts(Variant& var, int alleleIndex) {
+    typedef map<string, vector<string> > SampleFields;
+    int total = 0;
+    for (map<string, SampleFields>::iterator smp = var.samples.begin(); smp != var.samples.end(); ++smp) {
+        SampleFields::iterator gtField = smp->second.find("GT");
+        if (gtField == smp->second.end()) {
+            continue;
+        }
+        map<int, int> counts = decomposeGenotype(gtField->second.front());
+        // map keys are unique, so a lookup replaces the scan over all entries
+        map<int, int>::iterator hit = counts.find(alleleIndex);
+        if (hit != counts.end()) {
+            total += hit->second;
+        }
+    }
+    return total;
+}
+
+// Sum, over all samples of var that carry a GT field, every count that
+// decomposeGenotype() reports for the first GT value, leaving out the entry
+// keyed by NULL_ALLELE.  Samples without a GT field contribute nothing.
+int countAlleles(Variant& var) {
+    typedef map<string, vector<string> > SampleFields;
+    int total = 0;
+    for (map<string, SampleFields>::iterator smp = var.samples.begin(); smp != var.samples.end(); ++smp) {
+        SampleFields::iterator gtField = smp->second.find("GT");
+        if (gtField == smp->second.end()) {
+            continue;
+        }
+        map<int, int> counts = decomposeGenotype(gtField->second.front());
+        for (map<int, int>::iterator entry = counts.begin(); entry != counts.end(); ++entry) {
+            if (entry->first == NULL_ALLELE) {
+                continue;
+            }
+            total += entry->second;
+        }
+    }
+    return total;
+}
+
+/*
+ * vcffixup entry point.
+ *
+ * Reads a VCF file (argv[1], or stdin when the only argument is "-"),
+ * replaces the AC, AF, NS and AN INFO header lines, and rewrites those four
+ * INFO fields of every record from the sample genotypes before printing the
+ * record to stdout.
+ *
+ * Returns 0 on success and 1 on usage requests or when the input cannot be
+ * opened.
+ */
+int main(int argc, char** argv) {
+
+    // argv[1] is only inspected when it exists (argc > 1)
+    if (argc == 1 || strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
+        cerr << "usage: " << argv[0] << " <vcf file>" << endl
+             << "outputs a VCF stream where AC and NS have been generated for each record using sample genotypes" << endl;
+        return 1;
+    }
+
+    VariantCallFile variantFile;
+    // argc is at least 2 here; a lone "-" selects stdin
+    if (argc == 2 && strcmp(argv[1], "-") == 0) {
+        variantFile.open(std::cin);
+        if (!variantFile.is_open()) {
+            cerr << "vcffixup: could not open stdin" << endl;
+            return 1;
+        }
+    } else {
+        string filename = argv[1];
+        variantFile.open(filename);
+        if (!variantFile.is_open()) {
+            cerr << "vcffixup: could not open " << filename << endl;
+            return 1;
+        }
+    }
+
+    Variant var(variantFile);
+
+    // remove header lines we're going to add
+    variantFile.removeInfoHeaderLine("AC");
+    variantFile.removeInfoHeaderLine("AF");
+    variantFile.removeInfoHeaderLine("NS");
+    variantFile.removeInfoHeaderLine("AN");
+
+    // and add them back, so as not to duplicate them if they are already there
+    variantFile.addHeaderLine("##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Total number of alternate alleles in called genotypes\">");
+    variantFile.addHeaderLine("##INFO=<ID=AF,Number=A,Type=Float,Description=\"Estimated allele frequency in the range (0,1]\">");
+    variantFile.addHeaderLine("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">");
+    variantFile.addHeaderLine("##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
+
+    // write the new header
+    cout << variantFile.header << endl;
+
+    // recompute NS, AN, AC and AF for every record and print it
+    while (variantFile.getNextVariant(var)) {
+        stringstream ns;
+        ns << var.samples.size();
+        var.info["NS"].clear();
+        var.info["NS"].push_back(ns.str());
+
+        var.info["AC"].clear();
+        var.info["AF"].clear();
+        var.info["AN"].clear();
+
+        int allelecount = countAlleles(var);
+        stringstream an;
+        an << allelecount;
+        var.info["AN"].push_back(an.str());
+
+        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+            string& allele = *a;
+            // genotype indexes are the alt index + 1 (0 is the reference)
+            int altcount = countAlts(var, var.getAltAlleleIndex(allele) + 1);
+            stringstream ac;
+            ac << altcount;
+            var.info["AC"].push_back(ac.str());
+            stringstream af;
+            // With no called alleles the ratio is 0/0, which used to be
+            // written out as NaN; report a frequency of 0 instead.
+            if (allelecount > 0) {
+                af << (double) altcount / (double) allelecount;
+            } else {
+                af << 0;
+            }
+            var.info["AF"].push_back(af.str());
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfflatten.cpp b/src/vcfflatten.cpp
new file mode 100644
index 0000000..f528360
--- /dev/null
+++ b/src/vcfflatten.cpp
@@ -0,0 +1,178 @@
+#include "Variant.h"
+#include "convert.h"
+
+using namespace std;
+using namespace vcf;
+
+
+// Convert the textual value s to a double via convert() from convert.h.
+// The result is initialised so that the function never returns an
+// indeterminate value should convert() not store anything for an
+// unparsable string (its failure behaviour is not visible here).
+double convertStrDbl(const string& s) {
+    double r = 0.0;
+    convert(s, r);
+    return r;
+}
+
+// Keep only the values of vals whose position is a key of keep.
+static void keepGenotypeValues(vector<string>& vals, const map<int, bool>& keep) {
+    vector<string> kept;
+    int i = 0;
+    for (vector<string>::iterator g = vals.begin(); g != vals.end(); ++g, ++i) {
+        if (keep.find(i) != keep.end()) {
+            kept.push_back(*g);
+        }
+    }
+    vals = kept;
+}
+
+// Keep only the value of vals at position altIndex (none if out of range).
+static void keepAlleleValue(vector<string>& vals, int altIndex) {
+    vector<string> kept;
+    if (altIndex >= 0 && altIndex < (int) vals.size()) {
+        kept.push_back(vals[altIndex]);
+    }
+    vals = kept;
+}
+
+// Prune one field according to its declared count: per-genotype fields keep
+// the genotypes listed in keep, per-allele fields keep the entry at altIndex,
+// every other field is left untouched.
+static void flattenValues(vector<string>& vals, int count, const map<int, bool>& keep, int altIndex) {
+    if (count == GENOTYPE_NUMBER) {
+        keepGenotypeValues(vals, keep);
+    } else if (count == ALLELE_NUMBER) {
+        keepAlleleValue(vals, altIndex);
+    }
+}
+
+/*
+ * vcfflatten entry point.
+ *
+ * Reads a VCF file (argv[1], or stdin when no argument is given) and, for
+ * every record with more than two alleles, keeps only the alternate with the
+ * highest AF value.  INFO and per-sample fields declared per-allele or
+ * per-genotype are pruned accordingly.  All records are printed to stdout.
+ *
+ * Returns 0 on success, 1 when the input cannot be opened or a multi-allelic
+ * record has fewer AF values than alternates; --help/-h exits with status 1.
+ */
+int main(int argc, char** argv) {
+
+    int maxAlleles = 2;
+
+    VariantCallFile variantFile;
+
+    if (argc > 1) {
+        string filename = argv[1];
+        if (filename == "--help" || filename == "-h") {
+            cerr << "usage: vcfflatten [file]" << endl
+                 << endl
+                 << "Removes multi-allelic sites by picking the most common alternate.  Requires" << endl
+                 << "allele frequency specification 'AF' and use of 'G' and 'A' to specify the" << endl
+                 << "fields which vary according to the Allele or Genotype. VCF file may be" << endl
+                 << "specified on the command line or piped as stdin." << endl;
+            exit(1);
+        }
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        // if we have more alleles than allowed, strip the lowest-frequency alternates
+        if ((int) var.alleles.size() > maxAlleles) {
+
+            vector<string>& freqsstr = var.info["AF"];
+            // Each alternate is paired with one AF value below.  A record
+            // with a missing or short AF used to run the frequency iterator
+            // past the end of the vector.
+            if (freqsstr.size() < var.alt.size()) {
+                cerr << "vcfflatten: record at " << var.sequenceName << ":" << var.position
+                     << " has " << freqsstr.size() << " AF value(s) for " << var.alt.size()
+                     << " alternate allele(s), cannot pick the most common alternate" << endl;
+                return 1;
+            }
+            vector<double> freqs;
+            freqs.resize(freqsstr.size());
+            transform(freqsstr.begin(), freqsstr.end(), freqs.begin(), convertStrDbl);
+
+            // order the alternates by frequency
+            multimap<double, string> alleleFrequencies;
+            vector<double>::iterator f = freqs.begin();
+            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a, ++f) {
+                alleleFrequencies.insert(pair<double, string>(*f, *a));
+            }
+
+            // pick the highest frequency alternate
+            string bestalt = alleleFrequencies.rbegin()->second;
+            // and get its index
+            int bestaltIndex = var.getAltAlleleIndex(bestalt);
+            int bestaltGenotypeIndex = bestaltIndex + 1; // per VCF spec
+
+            // keep the RR, RA, and AA genotypes for this alternate;
+            // generate the genotype index table for this variant
+            map<pair<int, int>, int> genotypeIndexes = var.getGenotypeIndexesDiploid();
+
+            // the alleles whose genotypes we keep: reference and best alternate
+            vector<int> alleleIndexes;
+            alleleIndexes.push_back(0);
+            alleleIndexes.push_back(bestaltGenotypeIndex);
+
+            int ploidy = 2;
+            vector<vector<int> > genotypesToKeep = multichoose(ploidy, alleleIndexes);
+            map<int, bool> genotypeIndexesToKeep;
+            for (vector<vector<int> >::iterator k = genotypesToKeep.begin(); k != genotypesToKeep.end(); ++k) {
+                pair<int, int> genotype = make_pair(k->front(), k->back()); // vectors are guaranteed to be diploid per multichoose
+                genotypeIndexesToKeep[genotypeIndexes[genotype]] = true;
+            }
+            // we are diploid, so there should be exactly 3 genotypes
+            assert(genotypeIndexesToKeep.size() == 3);
+
+            // prune the INFO fields declared per-genotype or per-allele
+            for (map<string, int>::iterator c = variantFile.infoCounts.begin(); c != variantFile.infoCounts.end(); ++c) {
+                map<string, vector<string> >::iterator v = var.info.find(c->first);
+                if (v != var.info.end()) {
+                    flattenValues(v->second, c->second, genotypeIndexesToKeep, bestaltIndex);
+                }
+            }
+
+            // and do the same for the matching fields of every sample
+            for (map<string, int>::iterator c = variantFile.formatCounts.begin(); c != variantFile.formatCounts.end(); ++c) {
+                if (c->second != GENOTYPE_NUMBER && c->second != ALLELE_NUMBER) {
+                    continue;
+                }
+                for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
+                    map<string, vector<string> >& sample = s->second;
+                    map<string, vector<string> >::iterator v = sample.find(c->first);
+                    if (v != sample.end()) {
+                        flattenValues(v->second, c->second, genotypeIndexesToKeep, bestaltIndex);
+                    }
+                }
+            }
+
+            var.alt.clear();
+            var.alt.push_back(bestalt);
+
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfgeno2alleles.cpp b/src/vcfgeno2alleles.cpp
new file mode 100644
index 0000000..9fd66f2
--- /dev/null
+++ b/src/vcfgeno2alleles.cpp
@@ -0,0 +1,54 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+
+/*
+ * vcfgeno2alleles entry point.
+ *
+ * Reads VCF from stdin and rewrites the GT field of every sample so that it
+ * lists the literal allele strings instead of allele indexes, joined by "/".
+ * Takes no arguments; any argument prints the usage text and returns 1.
+ *
+ * Returns 0 on success and 1 when stdin cannot be opened as VCF.
+ */
+int main(int argc, char** argv) {
+
+    if (argc > 1) {
+        cerr << "usage: " << argv[0] << " <[vcf file]" << endl
+             << "modifies the genotypes field to provide the literal alleles rather than indexes" << endl;
+        return 1;
+    }
+
+    VariantCallFile variantFile;
+
+    variantFile.open(std::cin);
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        map<string, map<string, vector<string> > >::iterator s     = var.samples.begin();
+        map<string, map<string, vector<string> > >::iterator sEnd  = var.samples.end();
+
+        for (; s != sEnd; ++s) {
+            map<string, vector<string> >& sample = s->second;
+            // Look GT up instead of using sample["GT"]: operator[] would
+            // insert an empty GT for samples that have none, and front() on
+            // that empty vector is undefined.
+            map<string, vector<string> >::iterator gtField = sample.find("GT");
+            if (gtField == sample.end() || gtField->second.empty()) {
+                continue;
+            }
+            vector<string>& gtstrs = gtField->second;
+            vector<string> gt = split(gtstrs.front(), "|/");
+
+            // build the allele-literal genotype for this sample
+            // NOTE(review): alleles are always joined by "/", so the
+            // phased/unphased separator of the input is not preserved.
+            stringstream o;
+            for (vector<string>::iterator g = gt.begin(); g != gt.end(); ++g) {
+                if (g != gt.begin()) o << "/";
+                // Only all-digit tokens are allele indexes.  A missing
+                // allele (".") used to go through atoi(), which yields 0 and
+                // so reported the reference allele.
+                const bool numeric = !g->empty() && g->find_first_not_of("0123456789") == string::npos;
+                const int index = numeric ? atoi(g->c_str()) : -1;
+                if (index >= 0 && index < (int) var.alleles.size()) {
+                    o << var.alleles[index];
+                } else {
+                    // missing or out-of-range index: keep it marked as missing
+                    o << ".";
+                }
+            }
+            gtstrs.clear();
+            gtstrs.push_back(o.str());
+        }
+        cout << var << endl;
+    }
+    return 0;
+
+}
+
diff --git a/src/vcfgeno2haplo.cpp b/src/vcfgeno2haplo.cpp
new file mode 100644
index 0000000..f821c2d
--- /dev/null
+++ b/src/vcfgeno2haplo.cpp
@@ -0,0 +1,391 @@
+#include "Variant.h"
+#include <getopt.h>
+#include "fastahack/Fasta.h"
+#include <algorithm>
+#include <list>
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+
+void printSummary(char** argv) {
+    // Print the usage text to stderr, then terminate the process with status 0 (never returns).
+    cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl << endl
+         << "options:" << endl
+         << "    -h, --help              Print this message and exit" << endl
+         << "    -r, --reference FILE    FASTA reference file (required)" << endl
+         << "    -w, --window-size N     Merge variants at most this many bp apart (default 30)" << endl
+         << "    -o, --only-variants     Don't output the entire haplotype, just concatenate" << endl
+         << "                            REF/ALT strings (delimited by \":\")" << endl << endl
+         << "Convert genotype-based phased alleles within --window-size into haplotype alleles." << endl
+         << "Will break haplotype construction when encountering non-phased genotypes on input." << endl
+         << endl;
+    exit(0);
+}
+
+bool isPhased(Variant& var) {
+    // false as soon as any sample has a GT longer than one character without a '|' separator
+    for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
+        map<string, vector<string> >::iterator g = s->second.find("GT");
+        // a GT key with no values says nothing about phase, and front() on it would be undefined
+        if (g == s->second.end() || g->second.empty()) continue;
+        const string& gt = g->second.front();
+        if (gt.size() > 1 && gt.find("|") == string::npos) {
+            return false;
+        }
+    }
+    return true;
+}
+
+int main(int argc, char** argv) {
+    // Merge runs of phased variants that lie within the window into single haplotype records.
+    string vcfFileName;
+    string fastaFileName;
+    int windowsize = 30;
+    bool onlyVariants = false;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                /* These options set a flag. */
+                //{"verbose", no_argument,       &verbose_flag, 1},
+                {"help", no_argument, 0, 'h'},
+                {"window-size", required_argument, 0, 'w'},
+                {"reference", required_argument, 0, 'r'},
+                {"only-variants", no_argument, 0, 'o'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "how:r:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+        case 'o':
+            onlyVariants = true;
+            break;
+
+        case 'w':
+            windowsize = atoi(optarg);
+            break;
+
+        case 'r':
+            fastaFileName = string(optarg);
+            break;
+
+        case 'h':
+            printSummary(argv);
+            break;
+
+        case '?':
+            printSummary(argv);
+            exit(1); // NOTE(review): not reached, printSummary() itself exits with status 0
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open VCF file" << endl;
+        exit(1);
+    }
+
+    FastaReference reference;
+    if (fastaFileName.empty()) {
+        cerr << "a reference is required for haplotype allele generation" << endl;
+        exit(1);
+    }
+    reference.open(fastaFileName);
+
+    // pattern
+    // when variants are within windowSize from each other, build up local haplotypes
+    // establish all the haplotypes which exist within the window using genotypes+allele#+position map
+    // generate a haplotype allele string for each unique haplotype
+    // for completeness retain phasing information in the genotypes
+    // write a new VCF record in which there are haplotype alleles and correctly described genotypes for each sample
+    // if the variants are outside of the windowSize, just write out the record
+
+    Variant var(variantFile);
+    Variant outputVar(variantFile);
+
+    cout << variantFile.header << endl;
+
+    // phased variants accumulated for the haplotype currently being built
+    vector<Variant> cluster;
+
+    while (variantFile.getNextVariant(var) || !cluster.empty()) {
+
+        bool haplotypeCluster = false;
+        bool varIsPhased = !variantFile.done() && isPhased(var);
+        if (variantFile.done()) {
+            // input exhausted: flush the pending cluster through the haplotype path below.
+            // NOTE(review): the original else-branch here called front() on an empty cluster;
+            // presumably the loop condition guarantees a non-empty cluster -- confirm
+            if (!cluster.empty()) {
+                haplotypeCluster = true;
+            }
+        } else if (varIsPhased) {
+            if (cluster.empty()
+                || (cluster.back().sequenceName == var.sequenceName
+                    && var.position - cluster.back().position + cluster.back().ref.size() - 1 <= windowsize)) {
+                cluster.push_back(var);
+            } else {
+                if (cluster.size() == 1) {
+                    cout << cluster.front() << endl;
+                    cluster.clear();
+                    if (!variantFile.done()) {
+                        cluster.push_back(var);
+                    }
+                } else {
+                    haplotypeCluster = true;
+                }
+            }
+        } else { // not phased
+            // an unphased record never joins a cluster; it ends the pending one
+            if (cluster.size() > 1) {
+                haplotypeCluster = true; // var is written after the cluster, at the bottom of the block below
+            } else {
+                if (!cluster.empty()) cout << cluster.front() << endl;
+                cluster.clear(); // already written; leaving it queued would emit it a second time
+                cout << var << endl;
+            }
+        }
+
+        // we need to deal with the current cluster, as our next var is outside of bounds
+        // process the last cluster if it's more than 1 var
+        if (haplotypeCluster) {
+            // At this point cluster holds the records to merge.  Unless the input is exhausted,
+            // var is the record that ended the cluster; it is handled at the bottom of this block.
+            // Steps: fetch the reference span, read each sample's per-variant allele indexes as
+            // haplotypes, render every distinct haplotype as an allele, then emit one record
+            // whose genotypes index those alleles.
+
+            // generate haplotype alleles and genotypes!
+            // get the reference sequence across the haplotype in question
+            string referenceHaplotype = reference.getSubSequence(cluster.front().sequenceName,
+                                                                 cluster.front().position - 1,
+                                                                 cluster.back().position
+                                                                 + cluster.back().ref.size() - cluster.front().position);
+
+            // establish what haplotypes there are by parsing the (phased) genotypes across the samples over these records
+            map<string, vector<vector<int> > > sampleHaplotypes;
+            for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
+                // build the haplotype using the genotype fields in the variant cluster
+                // only build haplotypes for samples with complete information
+                string& sampleName = *s;
+                vector<vector<int> >& haplotypes = sampleHaplotypes[sampleName];
+
+                bool completeCoverage = true;
+                // ensure complete genotype coverage over the haplotype cluster
+                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
+                    if (v->samples.find(sampleName) == v->samples.end()
+                        || v->samples[sampleName].find("GT") == v->samples[sampleName].end()) {
+                        completeCoverage = false;
+                        break;
+                    }
+                }
+                if (!completeCoverage) {
+                    continue; // skip samples without complete coverage
+                }
+
+                // what's the ploidy?
+                {
+                    string& gt = cluster.front().samples[sampleName]["GT"].front();
+                    vector<string> gtspec = split(gt, "|");
+                    for (vector<string>::iterator g = gtspec.begin(); g != gtspec.end(); ++g) {
+                        vector<int> haplotype;
+                        haplotypes.push_back(haplotype);
+                    }
+                }
+
+                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
+                    string& gt = v->samples[sampleName]["GT"].front();
+                    vector<string> gtspec = split(gt, "|");
+                    vector<string>::iterator g = gtspec.begin();
+                    for (vector<vector<int> >::iterator h = haplotypes.begin(); h != haplotypes.end(); ++h, ++g) {
+                        int j;
+                        convert(*g, j);
+                        h->push_back(j);
+                    }
+                }
+            }
+
+            set<vector<int> > uniqueHaplotypes;
+            for (map<string, vector<vector<int> > >::iterator hs = sampleHaplotypes.begin();
+                 hs != sampleHaplotypes.end(); ++hs) {
+                vector<vector<int> >& haps = hs->second;
+                for (vector<vector<int> >::iterator h = haps.begin(); h != haps.end(); ++h) {
+                    uniqueHaplotypes.insert(*h);
+                }
+            }
+
+            // write new haplotypes
+            map<vector<int>, string> haplotypeSeqs;
+            map<vector<int>, int> haplotypeIndexes;
+            map<int, string> alleles;
+
+            int impossibleHaplotypes = 0;
+
+            // always include the reference haplotype as 0
+            // when we come to it in the haplotypes, we'll ignore it
+            int alleleIndex = 1;
+            for (set<vector<int> >::iterator u = uniqueHaplotypes.begin(); u != uniqueHaplotypes.end(); ++u) {
+
+                // *u holds one allele index per cluster variant (0 = reference allele).
+                // Unless -o is given, start from the reference span and splice in each
+                // non-reference allele; referenceInsertOffset accumulates the length change
+                // of the alleles spliced so far.  With -o the haplotype is instead the
+                // ':'-joined list of the chosen REF/ALT strings, and lastrefend is never
+                // updated, so the overlap test below cannot trigger in that mode.
+
+                string haplotype;
+                if (!onlyVariants) {
+                    haplotype = referenceHaplotype;
+                }
+                bool isreference = true;
+                bool impossibleHaplotype = false;
+                int referenceInsertOffset = 0;
+                int j = 0; // index into variant cluster
+                int lastpos = 0;
+                int lastrefend = 0;
+                for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z, ++j) {
+                    int i = *z;
+                    Variant& vartoInsert = cluster.at(j);
+                    if (i == 0) {
+                        if (onlyVariants) {
+                            if (!haplotype.empty()) haplotype.append(":");
+                            haplotype.append(vartoInsert.ref);
+                        }
+                    }
+                    if (i != 0) {
+                        isreference = false;
+                        string& alternate = vartoInsert.alleles.at(i);
+                        if (vartoInsert.position < lastrefend) {
+                            cerr << "impossible haplotype, overlapping alleles at " << vartoInsert.sequenceName << ":" << vartoInsert.position << endl;
+                            impossibleHaplotype = true;
+                            break;
+                        } else {
+                            // the splice position is relative to the cluster start, corrected for
+                            // the net length change of the alleles already spliced in
+                            if (onlyVariants) {
+                                if (!haplotype.empty()) haplotype.append(":");
+                                haplotype.append(alternate);
+                            } else {
+                                haplotype.replace(vartoInsert.position - cluster.front().position + referenceInsertOffset,
+                                                  vartoInsert.ref.size(), alternate);
+                                if (alternate.size() != vartoInsert.ref.size()) {
+                                    referenceInsertOffset += alternate.size() - vartoInsert.ref.size();
+                                }
+                                lastpos = vartoInsert.position;
+                                lastrefend = vartoInsert.position + vartoInsert.ref.size();
+                            }
+                        }
+                    }
+                }
+
+                if (impossibleHaplotype) {
+                    ++impossibleHaplotypes;
+                    haplotypeIndexes[*u] = -1; // indicates impossible haplotype
+                    impossibleHaplotype = false;
+                } else if (isreference) {
+                    alleles[0] = haplotype;
+                    haplotypeIndexes[*u] = 0;
+                } else {
+                    alleles[alleleIndex] = haplotype;
+                    haplotypeIndexes[*u] = alleleIndex;
+                    ++alleleIndex;
+                }
+                haplotypeSeqs[*u] = haplotype;
+                // if there's not a reference allele, add it
+                if (alleles.find(0) == alleles.end()) {
+                    alleles[0] = referenceHaplotype;
+                    // nb, there is no reference haplotype among
+                    // the samples, so we don't have to add it to
+                    // the haplotypeIndexes
+                }
+            }
+
+            if (onlyVariants) {
+                string newRef;
+                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
+                    if (!newRef.empty()) newRef.append(":");
+                    newRef.append(v->ref);
+                }
+                outputVar.ref = newRef;
+            } else {
+                outputVar.ref = alleles[0];
+            }
+            outputVar.alt.clear();
+            for (int i = 1; i < alleleIndex; ++i) {
+                outputVar.alt.push_back(alleles[i]);
+            }
+
+            outputVar.sequenceName = cluster.front().sequenceName;
+            outputVar.position = cluster.front().position;
+            outputVar.filter = ".";
+            outputVar.id = ".";
+            outputVar.info = cluster.front().info;
+            outputVar.samples.clear();
+            outputVar.format = cluster.front().format;
+
+            // now the genotypes
+            for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
+                string& sampleName = *s;
+                vector<string> gt;
+                vector<vector<int> > & hs = sampleHaplotypes[sampleName];
+                for (vector<vector<int> >::iterator h = hs.begin(); h != hs.end(); ++h) {
+                    int hi = haplotypeIndexes[*h];
+                    if (hi != -1) {
+                        gt.push_back(convert(hi));
+                    } else {
+                        // nonexistent or impossible haplotype
+                        gt.push_back(".");
+                    }
+                }
+                if (gt.size() != 0) {
+                    outputVar.samples[sampleName]["GT"].push_back(join(gt, "|"));
+                }
+            }
+            if (cluster.size() - impossibleHaplotypes < 2) {
+                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
+                    cout << *v << endl;
+                }
+            } else {
+                if (!outputVar.alt.empty()) {
+                    cout << outputVar << endl;
+                } else {
+                    cerr << "no alternate alleles remain at " << outputVar.sequenceName << ":" << outputVar.position << " after haplotype validation" << endl;
+                }
+            }
+            cluster.clear();
+            if (varIsPhased) cluster.push_back(var); // the phased record that ended this cluster starts the next
+            else if (!variantFile.done()) cout << var << endl; // an unphased record passes through unchanged
+        }
+    }
+
+    exit(0);  // why?
+    return 0;
+
+}
+
diff --git a/src/vcfgenosamplenames.cpp b/src/vcfgenosamplenames.cpp
new file mode 100644
index 0000000..32e065a
--- /dev/null
+++ b/src/vcfgenosamplenames.cpp
@@ -0,0 +1,39 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) {
+    // Add an SN format field to every sample of every record, holding that sample's own name.
+    VariantCallFile variantFile;
+
+    // input is stdin unless a first argument names a file
+    if (argc <= 1) {
+        variantFile.open(std::cin);
+    } else {
+        string filename = argv[1];
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    variantFile.addHeaderLine("##FORMAT=<ID=SN,Number=1,Type=String,Description=\"The name of the sample.\">");
+
+    cout << variantFile.header << endl;
+
+    typedef map<string, map<string, vector<string> > > SampleMap;
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        var.format.push_back("SN");
+        for (SampleMap::iterator sampleIt = var.samples.begin(); sampleIt != var.samples.end(); ++sampleIt) {
+            // assign() discards any previous SN values and stores the single sample name
+            sampleIt->second["SN"].assign(1, sampleIt->first);
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+}
+
diff --git a/src/vcfgenosummarize.cpp b/src/vcfgenosummarize.cpp
new file mode 100644
index 0000000..526bca1
--- /dev/null
+++ b/src/vcfgenosummarize.cpp
@@ -0,0 +1,107 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+
+int main(int argc, char** argv) {
+    // Add INFO tags (QR, AQR, QA, AQA, RQA) summarising the per-sample RO/QR/AO/QA values of each record.
+    if (argc > 1 && (string(argv[1]) == "-h" || string(argv[1]) == "--help")) {
+        cerr << "usage: " << argv[0] << " <[input file] >[output vcf]" << endl
+             << "Adds summary statistics to each record summarizing qualities reported in" << endl
+             << "called genotypes.  Uses:" << endl
+             << "RO (reference observation count), QR (quality sum reference observations)" << endl
+             << "AO (alternate observation count), QA (quality sum alternate observations)" << endl;
+        return 1;
+    }
+
+    VariantCallFile variantFile;
+    if (argc == 1) {
+        variantFile.open(std::cin);
+    } else {
+        string filename = argv[argc-1];
+        variantFile.open(filename);
+        if (!variantFile.is_open()) {
+            cerr << "could not open " << filename << endl;
+            return 1;
+        }
+    }
+
+    Variant var(variantFile);
+
+    variantFile.removeInfoHeaderLine("AQR");
+    variantFile.addHeaderLine("##INFO=<ID=AQR,Number=1,Type=Float,Description=\"Mean reference observation quality calculated by RO and QR in called samples.\">");
+    variantFile.removeInfoHeaderLine("AQA");
+    variantFile.addHeaderLine("##INFO=<ID=AQA,Number=A,Type=Float,Description=\"Mean alternate observation quality calculated by AO and QA in called samples.\">");
+    variantFile.removeInfoHeaderLine("QR");
+    variantFile.addHeaderLine("##INFO=<ID=QR,Number=1,Type=Float,Description=\"Quality sum of reference observations calculated by QR in called samples.\">");
+    variantFile.removeInfoHeaderLine("QA");
+    variantFile.addHeaderLine("##INFO=<ID=QA,Number=A,Type=Float,Description=\"Quality sum of alternate observations calculated by QA in called samples.\">");
+    variantFile.removeInfoHeaderLine("RQA");
+    variantFile.addHeaderLine("##INFO=<ID=RQA,Number=A,Type=Float,Description=\"Ratio of mean alternate observation quality to mean reference observation quality (MQA/MQR).\">");
+
+    // write the new header
+    cout << variantFile.header << endl;
+
+    // for each record, sum RO/QR/AO/QA over its samples and add the derived INFO values
+    while (variantFile.getNextVariant(var)) {
+        int refobs = 0;
+        int refqual = 0;
+        vector<int> altobs(var.alt.size(), 0);
+        vector<int> altqual(var.alt.size(), 0);
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
+             s != var.samples.end(); ++s) {
+            map<string, vector<string> >& sample = s->second;
+            int x;
+            if (sample.find("RO") != sample.end() && !sample["RO"].empty()) {
+                convert(sample["RO"].front(), x);
+                refobs += x;
+                if (sample.find("QR") != sample.end() && !sample["QR"].empty()) {
+                    convert(sample["QR"].front(), x);
+                    refqual += x;
+                }
+            }
+            if (sample.find("AO") != sample.end()) {
+                vector<string>& aos = sample["AO"];
+                for (size_t i = 0; i < altobs.size() && i < aos.size(); ++i) {
+                    convert(aos[i], x);
+                    altobs[i] += x;
+                }
+                if (sample.find("QA") != sample.end()) {
+                    vector<string>& qas = sample["QA"];
+                    for (size_t i = 0; i < altqual.size() && i < qas.size(); ++i) {
+                        convert(qas[i], x);
+                        altqual[i] += x;
+                    }
+                }
+            }
+        }
+        // drop values inherited from the input so each tag holds only the freshly computed ones
+        static const char* tags[] = { "QR", "AQR", "QA", "AQA", "RQA" };
+        for (size_t t = 0; t < sizeof(tags) / sizeof(tags[0]); ++t) var.info.erase(tags[t]);
+        const bool haveRef = !(refobs == 0 || refqual == 0);
+        const double meanref = haveRef ? (double)refqual/(double)refobs : 0;
+        var.info["QR"].push_back(convert(refqual));
+        if (haveRef) {
+            var.info["AQR"].push_back(convert(meanref));
+        } else {
+            var.info["AQR"].push_back(convert(0));
+        }
+        for (size_t i = 0; i < altobs.size(); ++i) {
+            const double meanalt = altobs[i] == 0 ? 0 : (double)altqual[i]/(double)altobs[i]; // 0, not 0/0, when unobserved
+            var.info["QA"].push_back(convert(altqual[i]));
+            var.info["AQA"].push_back(convert(meanalt));
+            var.info["RQA"].push_back(haveRef ? convert(meanalt / meanref) : convert(1));
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfgenotypecompare.cpp b/src/vcfgenotypecompare.cpp
new file mode 100644
index 0000000..c81043f
--- /dev/null
+++ b/src/vcfgenotypecompare.cpp
@@ -0,0 +1,327 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+#include <sstream>
+
+using namespace std;
+using namespace vcf;
+
+// TODO fix this for multi-allelic!!!!
+string genotypeSpec(map<int, int>& genotype) {
+    // Classify a decomposed genotype with a two-letter code:
+    //   NN  isNull() holds
+    //   AR  not null, and isHom() fails
+    //   AA  isHom() and hasNonRef() both hold
+    //   RR  isHom() holds, hasNonRef() fails
+    if (isNull(genotype)) {
+        return "NN";
+    }
+    if (!isHom(genotype)) {
+        return "AR";
+    }
+    const char* homCode = hasNonRef(genotype) ? "AA" : "RR";
+    return homCode;
+}
+
+int main(int argc, char** argv) {
+    // Per record, tally how each sample's GT compares with the genotype stored under <other-genotype-tag> and add the tallies to INFO.
+    if (argc != 3) {
+        cerr << "usage: " << argv[0] << " <other-genotype-tag> <vcf file>" << endl
+             << "adds statistics to the INFO field of the vcf file describing the" << endl
+             << "amount of discrepancy between the genotypes (GT) in the vcf file and the" << endl
+             << "genotypes reported in the <other-genotype-tag>.  use this after" << endl
+             << "vcfannotategenotypes to get correspondence statistics for two vcfs." << endl;
+        return 1;
+    }
+
+    string otherGenoTag = argv[1];
+    string filename = argv[2];
+
+    VariantCallFile variantFile;
+    if (filename == "-") {
+        variantFile.open(std::cin);
+    } else {
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    vector<string> specs;
+
+    specs.push_back("AA_AA");
+    specs.push_back("AA_AR");
+    specs.push_back("AA_RR");
+    specs.push_back("AA_NN");
+
+    specs.push_back("AR_AA");
+    specs.push_back("AR_AR");
+    specs.push_back("AR_RR");
+    specs.push_back("AR_NN");
+
+    specs.push_back("RR_AA");
+    specs.push_back("RR_AR");
+    specs.push_back("RR_RR");
+    specs.push_back("RR_NN");
+
+    specs.push_back("NN_AA");
+    specs.push_back("NN_AR");
+    specs.push_back("NN_RR");
+    specs.push_back("NN_NN");
+
+
+    for (vector<string>::iterator spec = specs.begin(); spec != specs.end(); ++spec) {
+        string line = "##INFO=<ID=" + otherGenoTag + ".genotypes." + *spec
+            + ",Number=1,Type=Integer,Description=\"Number of genotypes with "
+            + *spec + " relationship with " + otherGenoTag + "\">";
+        variantFile.addHeaderLine(line);
+    }
+
+    string line;
+
+    line = "##INFO=<ID=" + otherGenoTag + ".genotypes.count,Number=1,Type=Integer,Description=\"Count of genotypes under comparison.\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag + ".genotypes.alternate_count,Number=1,Type=Integer,Description=\"Count of alternate genotypes in the first file.\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.alternate_positive_discrepancy,Number=1,Type=Integer,Description=\"Estimated positive discrepancy rate of "
+        + otherGenoTag + " genotypes, where positive discrepancies are all cases where an alternate allele is called GT "
+        + " but none is represented in " + otherGenoTag + " or " + otherGenoTag + " is null/no-call\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.alternate_negative_discrepancy,Number=1,Type=Integer,Description=\"Estimated negative discrepancy rate of "
+        + otherGenoTag + " genotypes, where negative discrepancies are all cases where no alternate allele is called in "
+        + " GT but an alternate is represented in " + otherGenoTag + ", including no-calls or partly null genotypes\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.alternate_null_discrepancy,Number=1,Type=Integer,Description=\"Estimated null discrepancy rate of "
+        + otherGenoTag + " genotypes, where null discrepancies are all cases where GT is specified and contains an alternate but "
+        + otherGenoTag + " is null.  Cases where GT is null or partly null are excluded.\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.call_discrepancy,Number=1,Type=Integer,Description=\"Estimated call discrepancy rate of "
+        + otherGenoTag + " genotypes (het->hom, hom->het) between " + otherGenoTag + " and GT\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.call_concordance,Number=1,Type=Integer,Description=\"Estimated call concordance rate of "
+        + otherGenoTag + " genotypes between " + otherGenoTag + " and GT\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.non_reference_discrepancy,Number=1,Type=Float,Description=\"Estimated non-reference discrepancy relative to "
+        + otherGenoTag + " genotypes,\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.non_reference_discrepancy.count,Number=1,Type=Integer,Description=\"non-reference discrepancy count relative to "
+        + otherGenoTag + " genotypes,\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.non_reference_discrepancy.normalizer,Number=1,Type=Integer,Description=\"non-reference discrepancy normalizer relative to "
+        + otherGenoTag + " genotypes,\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.non_reference_sensitivity,Number=1,Type=Float,Description=\"Estimated non-reference sensitivity relative to "
+        + otherGenoTag + " genotypes,\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.non_reference_sensitivity.count,Number=1,Type=Integer,Description=\"non-reference sensitivity count relative to "
+        + otherGenoTag + " genotypes,\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.non_reference_sensitivity.normalizer,Number=1,Type=Integer,Description=\"non-reference sensitivity normalizer relative to "
+        + otherGenoTag + " genotypes,\">";
+    variantFile.addHeaderLine(line);
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+
+    while (variantFile.getNextVariant(var)) {
+
+        // genotype A below is the record's GT, genotype B the value of <other-genotype-tag>
+        // for each sample, check GT against <other-genotype-tag>
+        // tally stats, and append to info
+        map<string, map<string, vector<string> > >::iterator s     = var.samples.begin();
+        map<string, map<string, vector<string> > >::iterator sEnd  = var.samples.end();
+
+        map<string, int> genotypeComparisonCounts;
+        int gtCount = var.samples.size();
+        int gtAltCount = 0; // number of alternate-containing genotypes in the first file
+        int pdCount = 0; // positive discrepancy count
+        int ndCount = 0; // negative discrepancy count
+        int nnCount = 0; // null discrepancy count
+        int cdCount = 0; // call discrepancy count
+        int ccCount = 0; // call concordance count
+        int nrdCount = 0; // non-reference discrepancy count
+        int nrdNormalizer = 0; // divisor for nrd rate
+        int nrsCount = 0; // non-reference sensitivity count
+        int nrsNormalizer = 0; // divisor for nrs rate
+
+        for (; s != sEnd; ++s) {
+            map<string, vector<string> >& sample = s->second;
+            const string& name = s->first;
+
+            // decompose genotypes into counts of strings
+            // to facilitate comparison
+
+            string gtA;
+            if (sample.find("GT") == sample.end() || sample["GT"].empty()) {
+                gtA = "./.";
+            } else {
+                gtA = sample["GT"].front();
+            }
+
+            string gtB;
+            if (sample.find(otherGenoTag) == sample.end() || sample[otherGenoTag].empty()) {
+                gtB = "./.";
+            } else {
+                gtB = sample[otherGenoTag].front();
+            }
+
+
+            map<int, int> genotypeA = decomposeGenotype(gtA);
+            map<int, int> genotypeB = decomposeGenotype(gtB);
+
+            string gtspecA = genotypeSpec(genotypeA);
+            string gtspecB = genotypeSpec(genotypeB);
+            // the pair of two-letter classes keys the per-record comparison table,
+            // e.g. "AR_RR" when GT classifies as AR and the other tag as RR
+            ++genotypeComparisonCounts[gtspecA + "_" + gtspecB];
+
+            if (hasNonRef(genotypeA)) {
+                ++gtAltCount;
+            }
+
+            if (genotypeA != genotypeB) {
+                if (isNull(genotypeA)) {
+                    // TODO handle this somehow, maybe via a different flag?
+                    if (!isNull(genotypeB)) {
+                        ++nnCount;  // null discrepancy, the second set makes a call, this one does not
+                    }
+                } else if (hasNonRef(genotypeA)) {
+                    if (!isNull(genotypeB) && hasNonRef(genotypeB)) { // they cannot be the same, but they both represent an alternate
+                        ++cdCount;  // the calls are discrepant
+                    } else { // the other call does not have an alternate
+                        ++pdCount;
+                        // it is also null
+                        if (isNull(genotypeB)) {
+                            ++nnCount;
+                        }
+                    }
+                } else { // the current genotype has no non-ref alternate
+                    if (!isNull(genotypeB) && hasNonRef(genotypeB)) {
+                        ++ndCount;
+                    }
+                    if (isNull(genotypeB)) {
+                        ++nnCount;
+                    }
+                }
+            } else {
+                if (!isNull(genotypeA)) {
+                    ++ccCount;
+                }
+            }
+
+
+            if (!(isNull(genotypeA) || isNull(genotypeB))
+                    && !(isHomRef(genotypeA) && isHomRef(genotypeB))) {
+                ++nrdNormalizer;
+                if (genotypeA != genotypeB) {
+                    ++nrdCount;
+                }
+            }
+
+            if (!(isNull(genotypeB) || isHomRef(genotypeB))) {
+                ++nrsNormalizer;
+                if (!(isNull(genotypeA) || isHomRef(genotypeA))) {
+                    ++nrsCount;
+                }
+            }
+
+        }
+
+        for (map<string, int>::iterator g = genotypeComparisonCounts.begin();
+                g != genotypeComparisonCounts.end(); ++g) {
+            stringstream c;
+            c << g->second;
+            vector<string>& t = var.info[otherGenoTag + ".genotypes." + g->first];
+            t.clear(); t.push_back(c.str());
+        }
+
+        stringstream gtc;
+        gtc << gtCount;
+        var.info[otherGenoTag + ".genotypes.count"].push_back(gtc.str());
+
+        stringstream gtac;
+        gtac << gtAltCount;
+        var.info[otherGenoTag + ".genotypes.alternate_count"].push_back(gtac.str());
+
+        stringstream pd;
+        pd << pdCount;
+        var.info[otherGenoTag + ".site.alternate_positive_discrepancy"].push_back(pd.str());
+
+        stringstream nd;
+        nd << ndCount;
+        var.info[otherGenoTag + ".site.alternate_negative_discrepancy"].push_back(nd.str());
+
+        stringstream nn;
+        nn << nnCount;
+        var.info[otherGenoTag + ".site.alternate_null_discrepancy"].push_back(nn.str());
+
+        stringstream cd;
+        cd << cdCount;
+        var.info[otherGenoTag + ".site.call_discrepancy"].push_back(cd.str());
+
+        stringstream cc;
+        cc << ccCount;
+        var.info[otherGenoTag + ".site.call_concordance"].push_back(cc.str());
+
+        stringstream nrdc;
+        nrdc << nrdCount;
+        var.info[otherGenoTag + ".site.non_reference_discrepancy.count"].push_back(nrdc.str());
+
+        stringstream nrdn;
+        nrdn << nrdNormalizer;
+        var.info[otherGenoTag + ".site.non_reference_discrepancy.normalizer"].push_back(nrdn.str());
+
+        if (nrdNormalizer > 0) {
+            stringstream nrd;
+            nrd << (double) nrdCount / (double) nrdNormalizer;
+            var.info[otherGenoTag + ".site.non_reference_discrepancy"].push_back(nrd.str());
+        }
+
+        stringstream nrsc;
+        nrsc << nrsCount;
+        var.info[otherGenoTag + ".site.non_reference_sensitivity.count"].push_back(nrsc.str());
+
+        stringstream nrsn;
+        nrsn << nrsNormalizer;
+        var.info[otherGenoTag + ".site.non_reference_sensitivity.normalizer"].push_back(nrsn.str());
+
+        if (nrsNormalizer > 0) {
+            stringstream nrs;
+            nrs << (double) nrsCount / (double) nrsNormalizer;
+            var.info[otherGenoTag + ".site.non_reference_sensitivity"].push_back(nrs.str());
+        }
+
+        cout << var << endl;
+
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfgenotypes.cpp b/src/vcfgenotypes.cpp
new file mode 100644
index 0000000..4fe7965
--- /dev/null
+++ b/src/vcfgenotypes.cpp
@@ -0,0 +1,66 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+
+int main(int argc, char** argv) {
+    // Per record: CHROM, POS, REF, printAlt(), printAlleles(), then one "sample:allele/allele" column per sample.
+    if (argc != 2) {
+        cerr << "usage: " << argv[0] << " <vcf file>" << endl
+             << "report the genotypes for each sample, for each variant in the vcf file" << endl;
+        return 1;
+    }
+
+    string filename = argv[1];
+    VariantCallFile variantFile;
+    // "-" selects standard input instead of a named file
+    if (filename == "-") {
+        variantFile.open(std::cin);
+    } else {
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        cout << var.sequenceName << "\t"
+             << var.position     << "\t"
+             << var.ref          << "\t";
+        var.printAlt(cout);     cout << "\t";
+        var.printAlleles(cout); cout << "\t";
+
+        map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
+        for (; s != var.samples.end(); ++s) {
+            cout << s->first << ":";
+            // find(), not operator[]: a sample without GT must not get an empty entry to front()
+            map<string, vector<string> >::iterator gtField = s->second.find("GT");
+            if (gtField == s->second.end() || gtField->second.empty()) {
+                cout << "." << "\t";
+                continue;
+            }
+            // XXX assumes we can only have one GT value
+            vector<string> gt = split(gtField->second.front(), "|/");
+            for (vector<string>::iterator g = gt.begin(); g != gt.end(); ++g) {
+                // compare string contents; c_str() == "." would compare pointer values
+                int index = (*g == ".") ? -1 : atoi(g->c_str());
+                if (index < 0 || (size_t) index >= var.alleles.size()) {
+                    cout << ".";   // null call, or an allele index this record does not have
+                } else {
+                    cout << var.alleles[index];
+                }
+                if (g != (gt.end()-1)) cout << "/";
+            }
+            cout << "\t";
+        }
+        cout << endl;
+    }
+    return 0;
+}
+
diff --git a/src/vcfglbound.cpp b/src/vcfglbound.cpp
new file mode 100644
index 0000000..0b42f22
--- /dev/null
+++ b/src/vcfglbound.cpp
@@ -0,0 +1,178 @@
+#include "Variant.h"
+#include "split.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+void printSummary(char** argv) {
+    // Write the usage text to stderr, then terminate the process with status 0.
+    const char* program = argv[0];
+    cerr << "usage: " << program << " [options] <vcf file>" << endl << endl;
+    cerr << "options:" << endl;
+    cerr << "    -b, --bound N          Bound GLs to this limit." << endl;
+    cerr << "    -x, --exclude-broken   If GLs are > 0, remove site." << endl;
+    cerr << endl << "Adjust GLs so that the maximum GL is 0 by dividing all GLs for each sample by the max." << endl;
+    cerr << "Then cap (bound) at N (e.g. -10)." << endl;
+    exit(0);
+}
+
+
+int main(int argc, char** argv) {
+    // Shift each sample's GLs so the largest becomes 0, then floor them at the -b bound.
+    bool excludeBroken = false;
+    double glBound = 0;
+    int c;
+
+    while (true) {
+        static struct option long_options[] =
+        {
+            /* No entry sets a flag: each one makes getopt_long return its short letter. */
+            //{"verbose", no_argument,       &verbose_flag, 1},
+            {"help", no_argument, 0, 'h'},
+            {"bound",  required_argument, 0, 'b'},
+            {"exclude-broken",  no_argument, 0, 'x'},
+            //{"length",  no_argument, &printLength, true},
+            {0, 0, 0, 0}
+        };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hxb:",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+        case 0:
+            /* If this option set a flag, do nothing else now. */
+            if (long_options[option_index].flag != 0)
+                break;
+            printf ("option %s", long_options[option_index].name);
+            if (optarg)
+                printf (" with arg %s", optarg);
+            printf ("\n");
+            break;
+
+        case 'b':
+            glBound = atof(optarg);
+            break;
+
+        case 'x':
+            excludeBroken = true;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            exit(0);
+            break;
+
+        case '?':
+            /* getopt_long already printed an error message. */
+            printSummary(argv);
+            exit(1); // not reached while printSummary() itself calls exit(0)
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    if (glBound == 0) { // 0 is also the value left behind when -b is not given
+        cerr << "a bound is required when running vcfglbound (try -10)" << endl;
+        exit(1);
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        if (find(var.format.begin(), var.format.end(), "GL") == var.format.end()) { // no GL: pass through
+            cout << var << endl;
+            continue;
+        }
+        if (find(var.format.begin(), var.format.end(), "GT") == var.format.end()) {
+            var.format.push_back("GT");
+            reverse(var.format.begin(), var.format.end()); // GT ends up first; the other keys are reversed too
+        }
+        bool isbroken = false;
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
+             s != var.samples.end(); ++s) {
+            map<string, vector<string> >& sample = s->second;
+            map<string, vector<string> >::iterator l = sample.find("GL");
+            if (l != sample.end()) {
+
+                // parse the GL strings into numbers
+                vector<string>& glstrs = l->second;
+                vector<double> gls;
+                for (vector<string>::iterator gl = glstrs.begin(); gl != glstrs.end(); ++gl) {
+                    double d = 0; // defined value even when convert() cannot parse the text
+                    convert(*gl, d);
+                    gls.push_back(d);
+                }
+
+                isbroken = false; // reset every iteration
+                for (vector<double>::iterator g = gls.begin(); g != gls.end(); ++g) {
+                    if (*g > 0) {
+                        isbroken = true;
+                        break;
+                    }
+                }
+                if (isbroken) {
+                    if (excludeBroken) {
+                        cerr << var.sequenceName << ":" << var.position << ", sample " << s->first << " has GL > 0" << endl;
+                        break;
+                    } else {
+                        cerr << "VCF record @ " << var.sequenceName << ":" << var.position << ", sample " << s->first << " has GL > 0, not processing, but outputting" << endl;
+                        continue;
+                    }
+                }
+
+                // subtract the largest GL from every GL (largest becomes 0), then floor at glBound
+                double minGL = 0;
+                for (vector<double>::iterator g = gls.begin(); g != gls.end(); ++g) {
+                    if (*g < minGL) minGL = *g;
+                }
+                double maxGL = minGL;
+                for (vector<double>::iterator g = gls.begin(); g != gls.end(); ++g) {
+                    if (*g > maxGL) maxGL = *g;
+                }
+                // modify gls
+                for (vector<double>::iterator g = gls.begin(); g != gls.end(); ++g) {
+                    *g = max(glBound, *g - maxGL);
+                }
+
+                // and pack back into GL field
+                glstrs.clear();
+                for (vector<double>::iterator g = gls.begin(); g != gls.end(); ++g) {
+                    glstrs.push_back(convert(*g));
+                }
+            }
+        }
+        if (excludeBroken && isbroken) {
+            cerr << "excluding VCF record @ " << var.sequenceName << ":" << var.position << " due to GLs > 0" << endl;
+        } else {
+            cout << var << endl;
+        }
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfglxgt.cpp b/src/vcfglxgt.cpp
new file mode 100644
index 0000000..5909bdc
--- /dev/null
+++ b/src/vcfglxgt.cpp
@@ -0,0 +1,171 @@
+#include "Variant.h"
+#include "split.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+void printSummary(char** argv) {
+    // Write the usage text to stderr, then terminate the process with status 0.
+    const char* program = argv[0];
+    cerr << "usage: " << program << " [options] <vcf file>" << endl << endl;
+    cerr << "options:" << endl;
+    cerr << "    -n, --fix-null-genotypes   only apply to null and partly-null genotypes" << endl;
+    cerr << endl;
+    cerr << "Set genotypes using the maximum genotype likelihood for each sample." << endl << endl;
+    exit(0);
+}
+
+
+int main(int argc, char** argv) {
+    // For every sample that has GLs, set GT to the genotype whose GL is the largest.
+    bool fixNull = false;
+    int c;
+
+    while (true) {
+        static struct option long_options[] =
+        {
+            /* No entry sets a flag: each one makes getopt_long return its short letter. */
+            //{"verbose", no_argument,       &verbose_flag, 1},
+            {"help", no_argument, 0, 'h'},
+            {"fix-null-genotypes",  no_argument, 0, 'n'},
+            //{"length",  no_argument, &printLength, true},
+            {0, 0, 0, 0}
+        };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hn",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+        case 0:
+            /* If this option set a flag, do nothing else now. */
+            if (long_options[option_index].flag != 0)
+                break;
+            printf ("option %s", long_options[option_index].name);
+            if (optarg)
+                printf (" with arg %s", optarg);
+            printf ("\n");
+            break;
+
+        case 'n':
+            fixNull = true;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            exit(0);
+            break;
+
+        case '?':
+            /* getopt_long already printed an error message. */
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    map<pair<int, int>, list<list<int> > > glOrderCache; // keyed by (ploidy, allele count)
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        if (find(var.format.begin(), var.format.end(), "GL") == var.format.end()) { // no GL: pass through
+            cout << var << endl;
+            continue;
+        }
+        if (find(var.format.begin(), var.format.end(), "GT") == var.format.end()) {
+            var.format.push_back("GT");
+            reverse(var.format.begin(), var.format.end()); // GT ends up first; the other keys are reversed too
+        }
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
+             s != var.samples.end(); ++s) {
+            map<string, vector<string> >& sample = s->second;
+            map<string, vector<string> >::iterator g = sample.find("GT");
+            map<string, vector<string> >::iterator l = sample.find("GL");
+            if (l != sample.end()) {
+                if (g == sample.end()) {
+                    sample["GT"].push_back("./.");
+                    g = sample.find("GT");
+                }
+
+                string& gt = g->second.front();
+                // if we are fixing null but the genotype is fully specified, continue
+                if (fixNull && gt.find(".") == string::npos) continue;
+                string splitter = "/";
+                if (gt.find("|") != string::npos) splitter = "|";
+                int samplePloidy = split(gt, splitter).size();
+                int numAlleles = var.alt.size() + 1; // including reference
+
+                // get the gt GL ordering
+                pair<int, int> pa = make_pair(samplePloidy, numAlleles);
+                map<pair<int, int>, list<list<int> > >::iterator order = glOrderCache.find(pa);
+                if (order == glOrderCache.end()) {
+                    glOrderCache[pa] = glorder(samplePloidy, numAlleles);
+                }
+                list<list<int> >& glOrdering = glOrderCache[pa];
+
+                // find the gl max
+                vector<string>& gls = l->second;
+                if (gls.empty()) continue; // no likelihoods to choose from
+                vector<string>::iterator p = gls.begin();
+                double maxGl = 0;
+                convert(*p, maxGl); ++p;
+                int i = 1, maxindex = 0;
+                for (; p != gls.end(); ++p, ++i) {
+                    double cgl = 0;
+                    convert(*p, cgl);
+                    if (cgl > maxGl) {
+                        maxGl = cgl;
+                        maxindex = i; // prefers == gls in order of listing
+                    }
+                }
+
+                // The ordering is built from the GT ploidy; if the sample carries more GLs than that
+                // (e.g. GT "." next to diploid GLs) the index would run past the end of the list.
+                if ((size_t) maxindex >= glOrdering.size()) {
+                    cerr << "warning: " << var.sequenceName << ":" << var.position << ", sample " << s->first
+                         << " has more GLs than genotypes of its GT ploidy, GT not set from GLs" << endl;
+                    continue;
+                }
+
+                // replace GT with the genotype found at the winning position of the ordering
+                vector<string>& gtv = g->second;
+                list<list<int> >::iterator b = glOrdering.begin();
+                advance(b, maxindex);
+                // gt above refers to the first element of gtv,
+                // so it must not be used once gtv is cleared
+                gtv.clear();
+                gtv.push_back(join(*b, "/"));
+            }
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfhetcount.cpp b/src/vcfhetcount.cpp
new file mode 100644
index 0000000..3dd4561
--- /dev/null
+++ b/src/vcfhetcount.cpp
@@ -0,0 +1,72 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+
+int main(int argc, char** argv) {
+    // Output: a tab-separated row of sample names, then a row with each sample's count.
+    // argv[1] is a char*: wrap it in a string so == compares text rather than pointer values
+    if (argc == 2 && (string(argv[1]) == "-h" || string(argv[1]) == "--help")) {
+        cerr << "usage: " << argv[0] << " <vcf file>" << endl
+             << "count the number of alternate alleles in heterozygous genotypes in all records in the vcf file" << endl
+             << "outputs a count for each individual in the file" << endl;
+        return 1;
+    }
+
+    // getopt() is never called here, so test argc directly: exactly one argument names the input
+    string inputFilename;
+    VariantCallFile variantFile;
+    if (argc == 2) {
+        inputFilename = argv[1];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    // start every sample at zero so each one is reported even if nothing is counted for it
+    map<string, unsigned int> hetCounts;
+    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
+        hetCounts[*s] = 0;
+    }
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
+            // find(), not operator[]: a sample without GT must not get an empty entry to front()
+            map<string, vector<string> >::iterator gtField = s->second.find("GT");
+            if (gtField == s->second.end() || gtField->second.empty()) {
+                continue;
+            }
+            vector<string> gt = split(gtField->second.front(), "|/");
+            size_t alt = 0;
+            for (vector<string>::iterator g = gt.begin(); g != gt.end(); ++g) {
+                if (*g != "0")   // anything other than "0" counts, including "."
+                    ++alt;
+            }
+            // genotypes in which every allele is non-"0" contribute nothing
+            if (alt != gt.size()) {
+                hetCounts[s->first] += alt;
+            }
+        }
+    }
+
+    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
+        cout << (s == variantFile.sampleNames.begin() ? "" : "\t") << *s;
+    }
+    cout << endl;
+    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
+        cout << (s == variantFile.sampleNames.begin() ? "" : "\t") << hetCounts[*s];
+    }
+    cout << endl;
+
+    return 0;
+}
+
diff --git a/src/vcfhethomratio.cpp b/src/vcfhethomratio.cpp
new file mode 100644
index 0000000..10e39f8
--- /dev/null
+++ b/src/vcfhethomratio.cpp
@@ -0,0 +1,66 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+
+int main(int argc, char** argv) {
+    // Output: a tab-separated row of sample names, then each sample's het / hom-non-ref ratio.
+    if (argc != 2) {
+        cerr << "usage: " << argv[0] << " <vcf file>" << endl
+             << "outputs the het/hom ratio for each individual in the file" << endl;
+        return 1;
+    }
+
+    string filename = argv[1];
+    VariantCallFile variantFile;
+    if (filename == "-") {
+        variantFile.open(std::cin);
+    } else {
+        variantFile.open(filename);
+    }
+    if (!variantFile.is_open()) {
+        cerr << "could not open " << filename << endl;
+        return 1;
+    }
+
+    map<string, unsigned int> hetCounts;
+    map<string, unsigned int> homCounts;
+    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
+        hetCounts[*s] = 0;
+        homCounts[*s] = 0;
+    }
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
+            // find(), not operator[]: a sample without GT must not get an empty entry to front()
+            map<string, vector<string> >::iterator gtField = s->second.find("GT");
+            if (gtField == s->second.end() || gtField->second.empty()) {
+                continue;
+            }
+            map<int, int> genotype = decomposeGenotype(gtField->second.front());
+            if (isHet(genotype)) {
+                ++hetCounts[s->first];
+            } else if (isHomNonRef(genotype)) {
+                ++homCounts[s->first];
+            }
+        }
+    }
+
+    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
+        cout << (s == variantFile.sampleNames.begin() ? "" : "\t") << *s;
+    }
+    cout << endl;
+    // floating-point division: with IEEE-754 doubles a zero hom count prints inf (nan if het is zero too)
+    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
+        cout << (s == variantFile.sampleNames.begin() ? "" : "\t") << (double) hetCounts[*s] / (double) homCounts[*s];
+    }
+    cout << endl;
+
+    return 0;
+}
+
diff --git a/src/vcfindex.cpp b/src/vcfindex.cpp
new file mode 100644
index 0000000..24a9401
--- /dev/null
+++ b/src/vcfindex.cpp
@@ -0,0 +1,42 @@
+#include "Variant.h"
+#include "convert.h"
+#include <vector>
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) {
+    // Stamp every alternate allele with a running number, stored in the INFO "id" field.
+    VariantCallFile variantFile;
+
+    if (argc <= 1) {
+        variantFile.open(std::cin);
+    } else {
+        string filename = argv[1];
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    string tag = "id";
+    variantFile.addHeaderLine("##INFO=<ID="+tag+",Number=A,Type=Integer,Description=\"Unique numerical identifier of allele in file.\">");
+    cout << variantFile.header << endl;
+
+    long int nextId = 0;   // declared outside the loop, so numbering continues across records
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        vector<string>& ids = var.info[tag];
+        ids.clear();
+        // one id per ALT entry, in ALT order
+        for (size_t n = 0; n < var.alt.size(); ++n) {
+            ids.push_back(convert(nextId));
+            ++nextId;
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+}
+
diff --git a/src/vcfinfo2qual.cpp b/src/vcfinfo2qual.cpp
new file mode 100644
index 0000000..2c4b961
--- /dev/null
+++ b/src/vcfinfo2qual.cpp
@@ -0,0 +1,50 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) {
+    // Overwrite QUAL with the mean of the INFO values stored under [key].
+    VariantCallFile variantFile;
+    if (argc == 1) {
+        cerr << "usage: " << argv[0] << " [key] [vcf_file]" << endl
+             << "Sets QUAL from info field tag keyed by [key]." << endl
+             << "The VCF file may be omitted and read from stdin." << endl
+             << "The average of the field is used if it contains multiple values." << endl;
+        return 1;
+    }
+
+    string key = argv[1];
+    if (argc > 2) {
+        string filename = argv[2];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        // find(), not operator[] (which would insert an empty entry); with no values QUAL is left alone
+        map<string, vector<string> >::iterator field = var.info.find(key);
+        if (field != var.info.end() && !field->second.empty()) {
+            vector<string>& ivs = field->second;
+            double vs = 0;
+            for (vector<string>::iterator i = ivs.begin(); i != ivs.end(); ++i) {
+                double v = 0; // defined value even when convert() cannot parse the text
+                convert(*i, v);
+                vs += v;
+            }
+            var.quality = vs / (double) ivs.size();
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+}
+
diff --git a/src/vcfinfosummarize.cpp b/src/vcfinfosummarize.cpp
new file mode 100644
index 0000000..3e0f0f9
--- /dev/null
+++ b/src/vcfinfosummarize.cpp
@@ -0,0 +1,213 @@
+#include "Variant.h"
+#include "split.h"
+#include "fastahack/Fasta.h"
+#include <getopt.h>
+#include <algorithm>
+#include <limits>
+#include <numeric>
+
+using namespace std;
+using namespace vcf;
+
+void printSummary(char** argv) {
+    // Write the usage text to stderr, then terminate the process with status 0.
+    const char* program = argv[0];
+    cerr << "usage: " << program << " [options] <vcf file>" << endl << endl;
+    cerr << "options:" << endl;
+    cerr << "    -f, --field         Summarize this field in the INFO column" << endl;
+    cerr << "    -i, --info          Store the computed statistic in this info field" << endl;
+    cerr << "    -a, --average       Take the mean for field (default)" << endl;
+    cerr << "    -m, --median        Use the median" << endl;
+    cerr << "    -n, --min           Use the min" << endl;
+    cerr << "    -x, --max           Use the max" << endl;
+    cerr << endl;
+    cerr << "Take annotations given in the per-sample fields and add the mean, median, min, or max" << endl;
+    cerr << "to the site-level INFO." << endl << endl;
+    exit(0);
+}
+
+double median(vector<double> &v)
+{   // Element at index size/2 of the sorted order (partially reorders v); NaN when v is empty.
+    if (v.empty()) return numeric_limits<double>::quiet_NaN();
+    nth_element(v.begin(), v.begin() + v.size() / 2, v.end());
+    return v[v.size() / 2];
+}
+
+double mean(vector<double> &v)
+{   // Arithmetic mean: left-to-right sum of the elements divided by the element count.
+    const double count = v.size();
+    return accumulate(v.begin(), v.end(), 0.0) / count;
+}
+
+enum StatType { MEAN, MEDIAN, MIN, MAX };
+
+int main(int argc, char** argv) {
+    // Collapse the values of one INFO field (-f) into a single statistic stored in another INFO field (-i).
+    int c;
+    string sitewideField;
+    string infoField;
+    StatType statType = MEAN; 
+
+    if (argc == 1)
+        printSummary(argv);
+
+    while (true) {
+        static struct option long_options[] =
+            {
+                /* No entry sets a flag: each one makes getopt_long return its short letter. */
+                {"help", no_argument, 0, 'h'},
+                {"field",  required_argument, 0, 'f'},
+                {"info",  required_argument, 0, 'i'},
+                {"average", no_argument, 0, 'a'},
+                {"median", no_argument, 0, 'm'},
+                {"min", no_argument, 0, 'n'},
+                {"max", no_argument, 0, 'x'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hamnxf:i:",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+        case 0:
+            /* If this option set a flag, do nothing else now. */
+            if (long_options[option_index].flag != 0)
+                break;
+            printf ("option %s", long_options[option_index].name);
+            if (optarg)
+                printf (" with arg %s", optarg);
+            printf ("\n");
+            break;
+
+        case 'f':
+            sitewideField = optarg;
+            break;
+
+        case 'i':
+            infoField = optarg;
+            break;
+
+        case 'a':
+            statType = MEAN;
+            break;
+
+        case 'm':
+            statType = MEDIAN;
+            break;
+
+        case 'n':
+            statType = MIN;
+            break;
+
+        case 'x':
+            statType = MAX;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            exit(0);
+
+        case '?':
+            /* getopt_long already printed an error message. */
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    if (infoField.empty() || sitewideField.empty()) {
+        cerr << "Error: both a sample field and an info field are required." << endl;
+        return 1;
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    string statTypeStr; // name of the statistic, used in the header description below
+    switch (statType) {
+    case MEAN:
+        statTypeStr = "mean";
+        break;
+    case MEDIAN:
+        statTypeStr = "median";
+        break;
+    case MIN:
+        statTypeStr = "min";
+        break;
+    case MAX:
+        statTypeStr = "max";
+        break;
+    default:
+        cerr << "Error: failure to convert stat type to string" << endl;
+        return 1;
+        break;
+    }
+
+    variantFile.addHeaderLine("##INFO=<ID="+infoField+",Number=1,Type=Float,Description=\"Summary statistic generated by "+statTypeStr+" of site-wide values of "+sitewideField+" \">");
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        vector<double> vals;
+        map<string, vector<string> >::iterator i = var.info.find(sitewideField);
+        if (i != var.info.end()) {
+            for (vector<string>::iterator s = i->second.begin(); s != i->second.end(); ++s) {
+                double d = 0; // defined value even when convert() cannot parse the text
+                convert(*s, d);
+                vals.push_back(d);
+            }
+        }
+
+        // nothing to summarize (field absent or without values): pass the record through untouched
+        if (vals.empty()) {
+            cout << var << endl;
+            continue;
+        }
+        double result = 0;
+        switch (statType) {
+        case MEAN:
+            result = mean(vals);
+            break;
+        case MEDIAN:
+            result = median(vals);
+            break;
+        case MIN:
+            result = *min_element(vals.begin(), vals.end());
+            break;
+        case MAX:
+            result = *max_element(vals.begin(), vals.end());
+            break;
+        default:
+            cerr << "Error: unrecognized StatType" << endl;
+            return 1;
+            break;
+        }
+
+        var.info[infoField].clear();
+        var.info[infoField].push_back(convert(result));
+        cout << var << endl;
+    }
+
+    return 0;
+}
+
diff --git a/src/vcfintersect.cpp b/src/vcfintersect.cpp
new file mode 100644
index 0000000..27f272e
--- /dev/null
+++ b/src/vcfintersect.cpp
@@ -0,0 +1,577 @@
+#include "Variant.h"
+#include "BedReader.h"
+#include "intervaltree/IntervalTree.h"
+#include <getopt.h>
+#include "fastahack/Fasta.h"
+#include <algorithm>
+#include <list>
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+
+void printSummary(char** argv) { // writes the usage text to stderr, then exit(0): never returns
+    cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl
+         << endl
+         << "options:" << endl
+         << "    -b, --bed FILE            use intervals provided by this BED file" << endl
+         << "    -R, --region REGION       use 1-based tabix-style region (e.g. chrZ:10-20), multiples allowed" << endl
+         << "    -S, --start-only          don't use the reference length information in the record to determine" << endl
+         << "                              overlap status, just use the start position" << endl
+         << "    -v, --invert              invert the selection, printing only records which would" << endl
+         << "                                not have been printed out" << endl
+         << "    -i, --intersect-vcf FILE  use this VCF for set intersection generation" << endl
+         << "    -u, --union-vcf FILE      use this VCF for set union generation" << endl
+         << "    -w, --window-size N       compare records up to this many bp away (default 30)" << endl
+         << "    -r, --reference FILE      FASTA reference file, required with -i and -u" << endl
+         << "    -l, --loci                output whole loci when one alternate allele matches" << endl
+         << "    -m, --ref-match           intersect on the basis of record REF string" << endl
+         << "    -t, --tag TAG             attach TAG to each record's info field if it would intersect" << endl
+         << "    -V, --tag-value VAL       use this value to indicate that the allele is passing" << endl
+         << "                              '.' will be used otherwise.  default: 'PASS'" << endl
+         << "    -M, --merge-from FROM-TAG" << endl
+         << "    -T, --merge-to   TO-TAG   merge from FROM-TAG used in the -i file, setting TO-TAG" << endl
+         << "                              in the current file." << endl
+         << endl
+         << "For bed-vcf intersection, alleles which fall into the targets are retained." << endl
+         << endl
+         << "For vcf-vcf intersection and union, unify on equivalent alleles within window-size bp" << endl
+         << "as determined by haplotype comparison alleles." << endl;
+    // Not part of the printed text (previously kept here as commented-out output lines):
+    // intersections are done on the reference sequences in the VCF file, and when no
+    // VCF filename is given as the last command-line argument the VCF is read from stdin.
+    // NOTE: the exit status below is 0 for every caller, including the invalid-option path.
+    exit(0);
+}
+
+
+int main(int argc, char** argv) {
+    // BED/region filtering, or haplotype-based intersection/union against a second VCF; output goes to stdout.
+    string bedFileName;
+    string vcfFileName;
+    string fastaFileName;
+    bool intersecting = false;
+    bool unioning = false;
+    bool invert = false;
+    bool contained = true;      // assigned by -c but never read below
+    bool overlapping = false;   // assigned by -o but never read below (a local vector shadows the name later)
+    bool startPositionOnly = false;
+    int windowsize = 30;
+    bool loci = false;
+    bool refmatch = false;      // assigned by -m but never read below
+    string tag;
+    string tagValue = "PASS";
+    string mergeFromTag;
+    string mergeToTag;
+    vector<BedTarget> regions;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                /* These options set a flag. */
+                //{"verbose", no_argument,       &verbose_flag, 1},
+                {"help", no_argument, 0, 'h'},
+                {"bed",  required_argument, 0, 'b'},
+                {"region",  required_argument, 0, 'R'},
+                {"invert",  no_argument, 0, 'v'},
+                {"intersect-vcf", required_argument, 0, 'i'},
+                {"union-vcf", required_argument, 0, 'u'},
+                {"contained",  no_argument, 0, 'c'},
+                {"overlapping", no_argument, 0, 'o'},
+                {"window-size", required_argument, 0, 'w'},
+                {"reference", required_argument, 0, 'r'},
+                {"loci", no_argument, 0, 'l'},
+                {"ref-match", no_argument, 0, 'm'},
+                {"tag", required_argument, 0, 't'},
+                {"tag-value", required_argument, 0, 'V'},
+                {"merge-from", required_argument, 0, 'M'},
+                {"merge-to", required_argument, 0, 'T'},
+                {"start-only", no_argument, 0, 'S'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hvcSlmob:i:u:w:r:t:V:M:T:R:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+        case 'w':
+            windowsize = atoi(optarg);
+            break;
+
+        case 'b':
+            bedFileName = string(optarg);
+            break;
+
+        case 'i':
+            intersecting = true;
+            vcfFileName = string(optarg);
+            break;
+
+        case 'u':
+            unioning = true;
+            vcfFileName = string(optarg);
+            break;
+
+        case 'r':
+            fastaFileName = string(optarg);
+            break;
+
+        case 'v':
+            invert = true;
+            break;
+
+        case 'c':
+            contained = true;
+            break;
+
+        case 'o':
+            overlapping = true;
+            break;
+
+        case 'l':
+            loci = true;
+            break;
+
+        case 'm':
+            refmatch = true;
+            break;
+
+        case 't':
+            tag = optarg;
+            break;
+
+        case 'R':
+            regions.push_back(BedTarget(optarg));
+            regions.back().left -= 1; // -R is documented as 1-based; shift the left edge down by one
+            break;
+
+        case 'S':
+            startPositionOnly = true;
+            break;
+
+        case 'V':
+            tagValue = optarg;
+            break;
+
+        case 'M':
+            mergeFromTag = optarg;
+            break;
+
+        case 'T':
+            mergeToTag = optarg;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            break;
+
+        case '?':
+            printSummary(argv);
+            exit(1); // NOTE(review): unreachable today, printSummary() itself ends with exit(0)
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    // primary input: the named file when exactly one positional argument remains, otherwise stdin
+    VariantCallFile variantFile;
+    bool usingstdin = false;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+        usingstdin = true;
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open VCF file" << endl;
+        exit(1);
+    }
+
+
+    bool usingBED = false;
+    if (!bedFileName.empty()) {
+        usingBED = true;
+    }
+
+    if (usingBED || !regions.empty()) {
+        variantFile.parseSamples = false;
+    }
+
+    // it runs much faster to do this first.  then downstream processes don't block!
+
+    BedReader bed;
+    if (usingBED) {
+        bed.open(bedFileName);
+    }
+    if (!regions.empty()) {
+        // add to the bed
+        bed.addTargets(regions);
+        usingBED = true;
+    }
+
+    VariantCallFile otherVariantFile;
+    if (!vcfFileName.empty()) {
+        if (vcfFileName == "-") {
+            if (usingstdin) {
+                cerr << "cannot open both VCF file streams from stdin" << endl;
+                exit(1);
+            } else {
+                otherVariantFile.open(std::cin);
+            }
+        } else {
+            otherVariantFile.open(vcfFileName);
+        }
+        if (!otherVariantFile.is_open()) {
+            cerr << "could not open VCF file " << vcfFileName << endl;
+            exit(1);
+        }
+    }
+
+    // declare the INFO fields this run may add before the header is written out
+    if (!tag.empty()) {
+        variantFile.addHeaderLine("##INFO=<ID="+ tag +",Number=A,Type=String,Description=\"" + tagValue + " if this allele intersects with one in " + vcfFileName  +  ", '.' if not.\">");
+    }
+
+    if (!mergeToTag.empty()) {
+        if (mergeFromTag.empty()) {
+            cerr << "must specify a tag to merge from" << endl;
+            exit(1);
+        }
+        // get mergeFromTag type
+        map<string, VariantFieldType>::iterator f = otherVariantFile.infoTypes.find(mergeFromTag);
+        if (f == otherVariantFile.infoTypes.end()) {
+            cerr << "vcfintersect: ERROR could not find " << mergeFromTag << " in header" << endl;
+            exit(1);
+        }
+        VariantFieldType mergeFromType = f->second;
+        stringstream s;
+        s << mergeFromType;
+
+        variantFile.addHeaderLine("##INFO=<ID="+ mergeToTag +",Number=A,Type=" + s.str() + ",Description=\"The value of " + mergeFromTag + " in " + vcfFileName  +  " '.' if the tag does not exist for the given allele in the other file, or if there is no corresponding allele.\">");
+    }
+
+    cout << variantFile.header << endl;
+
+
+    FastaReference reference;
+    if (unioning || intersecting) {
+        if (fastaFileName.empty()) {
+            cerr << "a reference is required for haplotype-based intersection and unioning" << endl;
+            exit(1);
+        }
+        reference.open(fastaFileName);
+    }
+
+    if (!unioning && !intersecting) {
+        variantFile.parseSamples = false; // faster, as when we are
+        // only bed-intersecting we
+        // can do position-only
+        // output and don't have to
+        // manipulate specific
+        // alleles
+    }
+
+    // read the VCF file for union or intersection into an interval tree
+    // indexed using some proximity window
+
+    map<string, IntervalTree<Variant*> > variantIntervals;
+    map<string, list<Variant> > otherVariants;
+    map<string, vector<Interval<Variant*> > > otherVariantIntervals;
+
+    if (unioning || intersecting) {
+
+        Variant ovar(otherVariantFile);
+        while (otherVariantFile.getNextVariant(ovar)) {
+            long int left = ovar.position;
+            long int right = left + ovar.ref.size(); // this should be 1-past the end
+            otherVariants[ovar.sequenceName].push_back(ovar);
+            Variant* v = &otherVariants[ovar.sequenceName].back(); // std::list never relocates elements, so the pointer stays valid
+            otherVariantIntervals[ovar.sequenceName].push_back(Interval<Variant*>(left, right, v));
+        }
+
+        for (map<string, vector<Interval<Variant*> > >::iterator j = otherVariantIntervals.begin(); j != otherVariantIntervals.end(); ++j) {
+            variantIntervals[j->first] = IntervalTree<Variant*>(j->second);
+        }
+
+    }
+
+    set<Variant*> outputVariants; // records of the other file that have already been written
+
+    long int lastOutputPosition = 0;
+    string lastSequenceName;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+
+        if (lastSequenceName.empty()) {
+            lastSequenceName = var.sequenceName;
+        } else if (lastSequenceName != var.sequenceName) {
+            if (unioning) {
+                vector<Interval<Variant*> > previousRecords;
+                long int lastSeqLength = reference.sequenceLength(lastSequenceName);
+                variantIntervals[lastSequenceName].findContained(lastOutputPosition, lastSeqLength, previousRecords);
+                for (vector<Interval<Variant*> >::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) {
+                    Variant* v = r->value;
+                    if (outputVariants.find(v) == outputVariants.end()) {
+                        outputVariants.insert(v);
+                        cout << *v << endl; // Q: does this output everything in correct order?.... A: No.
+                    }
+                }
+                lastSequenceName = var.sequenceName;
+                lastOutputPosition = 0;
+            }
+        }
+
+        if (usingBED) {
+            vector<BedTarget*> overlaps;
+            if (startPositionOnly) {
+                // only intersect if start position (not end) is in target
+                BedTarget record(var.sequenceName, var.position, var.position, "");
+                overlaps = bed.targetsOverlapping(record);
+            } else {
+                // default behavior
+                BedTarget record(var.sequenceName, var.position, var.position + var.ref.size() - 1, "");
+                overlaps = bed.targetsOverlapping(record);
+            }
+
+            if (!invert && !overlaps.empty()) {
+                cout << variantFile.line << endl;
+            } else if (invert && overlaps.empty()) {
+                cout << variantFile.line << endl;
+            }
+
+        } else if (unioning || intersecting) {
+
+            // TODO check overlaps with union/intersection
+            // hmm... for unioning, you might need to step through the original VCF records
+            // but the idea is to exclude the haplotype-based duplicates
+
+            vector<Interval<Variant*> > results;
+
+            variantIntervals[var.sequenceName].findContained(var.position - windowsize, var.position + var.ref.size() + windowsize, results);
+
+            vector<Variant*> overlapping;
+
+            for (vector<Interval<Variant*> >::iterator r = results.begin(); r != results.end(); ++r) {
+                overlapping.push_back(r->value);
+            }
+
+
+            if (unioning) {
+
+                // unioning strategy
+
+                // write out all the records from the last file
+                // between the last one printed out and the first
+                // one we're about to print out
+
+                vector<Interval<Variant*> > previousRecords;
+
+                variantIntervals[var.sequenceName].findOverlapping(lastOutputPosition, var.position - windowsize, previousRecords);
+
+                map<long int, vector<Variant*> > variants;
+
+                for (vector<Interval<Variant*> >::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) {
+                    Variant* v = r->value;
+                    if (outputVariants.find(v) == outputVariants.end()) {
+                        outputVariants.insert(v);
+                        variants[v->position].push_back(v);
+                    }
+                }
+
+                for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) {
+                    for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) {
+                        cout << **o << endl;
+                        lastOutputPosition = max(lastOutputPosition, (*o)->position);
+                    }
+                }
+
+                // TODO find the duplicates for the other file
+            }
+
+
+            if (overlapping.empty()) {
+
+                if (unioning || (intersecting && invert)) {
+                    cout << var << endl;
+                    lastOutputPosition = max(lastOutputPosition, var.position);
+                } else if (intersecting && (!tag.empty() || !mergeToTag.empty())) {
+                    for (size_t i = 0; i < var.alt.size(); ++i) {
+                        if (!tag.empty()) {
+                            var.info[tag].push_back(".");
+                        }
+                        if (!mergeToTag.empty()) {
+                            var.info[mergeToTag].push_back(".");
+                        }
+                    }
+                    cout << var << endl;
+                    lastOutputPosition = max(lastOutputPosition, var.position);
+                }
+
+            } else {
+
+                // get the min and max of the overlaps
+
+                int haplotypeStart = var.position;
+                int haplotypeEnd = var.position + var.ref.size();
+
+                for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
+                    haplotypeStart = min((*v)->position, (long int) haplotypeStart);
+                    haplotypeEnd = max((*v)->position + (*v)->ref.size(), (long unsigned int) haplotypeEnd);
+                }
+
+                // for everything overlapping and the current variant, construct the local haplotype within the bounds
+                // if there is an exact match, the allele in the current VCF does intersect
+
+                string referenceHaplotype = reference.getSubSequence(var.sequenceName, haplotypeStart - 1, haplotypeEnd - haplotypeStart);
+                map<string, vector<pair<Variant*, int> > > haplotypes; // map to variant and alt index
+
+                for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
+                    Variant& variant = **v;
+                    int altindex = 0;
+                    for (vector<string>::iterator a = variant.alt.begin(); a != variant.alt.end(); ++a, ++altindex) {
+                        string haplotype = referenceHaplotype;
+                        // get the relative start and end coordinates for the variant alternate allele
+                        int relativeStart = variant.position - haplotypeStart;
+                        haplotype.replace(relativeStart, variant.ref.size(), *a);
+                        haplotypes[haplotype].push_back(make_pair(*v, altindex));
+                    }
+                }
+
+                Variant originalVar = var;
+
+                // determine the non-intersecting alts
+                vector<string> altsToRemove;
+                vector<int> altIndexesToRemove;
+                for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+                    string haplotype = referenceHaplotype;
+                    int relativeStart = var.position - haplotypeStart;
+                    haplotype.replace(relativeStart, var.ref.size(), *a);
+                    map<string, vector<pair<Variant*, int> > >::iterator h = haplotypes.find(haplotype);
+                    if ((intersecting && !invert && h == haplotypes.end())
+                        || (intersecting && invert && h != haplotypes.end())
+                        || (unioning && h != haplotypes.end())) {
+                        if (tag.empty() && mergeToTag.empty()) {
+                            altsToRemove.push_back(*a);
+                        } else {
+                            if (!tag.empty()) {
+                                var.info[tag].push_back(".");
+                            }
+                            if (!mergeToTag.empty()) {
+                                var.info[mergeToTag].push_back(".");
+                            }
+                        }
+                    } else {
+                        if (!tag.empty()) {
+                            var.info[tag].push_back(tagValue);
+                        }
+                        if (!mergeToTag.empty() && h == haplotypes.end()) {
+                            var.info[mergeToTag].push_back("."); // allele kept by -v/-u has no matching haplotype: h must not be dereferenced
+                        } else if (!mergeToTag.empty()) { // NB: just take the first value for the mergeFromTag
+                            Variant* v = h->second.front().first;
+                            int index = h->second.front().second;
+                            if (v->info.find(mergeFromTag) != v->info.end()) {
+                                string& otherValue = v->info[mergeFromTag].at(index); // now you have to find the exact allele...
+                                var.info[mergeToTag].push_back(otherValue);
+                            } else if (mergeFromTag == "QUAL") {
+                                var.info[mergeToTag].push_back(convert(v->quality));
+                            } else {
+                                var.info[mergeToTag].push_back(".");
+                            }
+                        }
+                    }
+                }
+
+                // remove the non-overlapping (intersecting) or overlapping (unioning) alts
+                if (intersecting && loci && altsToRemove.size() != var.alt.size()) {
+                    // we have a match in loci mode, so we should output the whole loci, not just the matching sequence
+                } else {
+                    for (vector<string>::iterator a = altsToRemove.begin(); a != altsToRemove.end(); ++a) {
+                        var.removeAlt(*a);
+                    }
+                }
+
+                if (unioning) {
+
+                    // somehow sort the records and combine them?
+                    map<long int, vector<Variant*> > variants;
+                    for (vector<Variant*>::iterator o = overlapping.begin(); o != overlapping.end(); ++o) {
+                        if ((*o)->position <= var.position && // check ensures proper ordering of variants on output
+                            outputVariants.find(*o) == outputVariants.end()) {
+                            outputVariants.insert(*o);
+                            variants[(*o)->position].push_back(*o);
+                        }
+                    }
+                    // add in the current variant, if it has alts left
+                    if (!var.alt.empty()) {
+                        vector<Variant*>& vars = variants[var.position];
+                        int numalts = 0;
+                        for (vector<Variant*>::iterator v = vars.begin(); v != vars.end(); ++v) {
+                            numalts += (*v)->alt.size();
+                        }
+                        if (numalts + var.alt.size() == originalVar.alt.size()) {
+                            variants[var.position].clear();
+                            variants[var.position].push_back(&originalVar);
+                        } else {
+                            variants[var.position].push_back(&var);
+                        }
+                    }
+
+                    for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) {
+                        for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) {
+                            cout << **o << endl;
+                            lastOutputPosition = max(lastOutputPosition, (*o)->position);
+                        }
+                    }
+                } else {
+                    // if any alts remain, output the variant record
+                    if (!var.alt.empty()) {
+                        cout << var << endl;
+                        lastOutputPosition = max(lastOutputPosition, var.position);
+                    }
+                }
+
+            }
+
+        }
+
+    }
+
+    // if unioning, output every record of the other file that has not been written yet.  Walk the whole
+    // map: starting at find(lastSequenceName) missed sequences sorting before it, and everything on empty input.
+    if (unioning) {
+        for (map<string, list<Variant> >::iterator chrom = otherVariants.begin();
+             chrom != otherVariants.end();
+             ++chrom) {
+            for (list<Variant>::iterator v = chrom->second.begin(); v != chrom->second.end(); ++v) {
+                Variant* variant = &*v;
+                if (outputVariants.find(variant) == outputVariants.end()) {
+                    outputVariants.insert(variant);
+                    cout << *variant << endl;
+                    // TODO guarantee sorting
+                }
+            }
+        }
+    }
+
+    exit(0);  // why?
+    return 0;
+
+}
+
diff --git a/src/vcfkeepgeno.cpp b/src/vcfkeepgeno.cpp
new file mode 100644
index 0000000..5616b22
--- /dev/null
+++ b/src/vcfkeepgeno.cpp
@@ -0,0 +1,62 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+
+int main(int argc, char** argv) {
+    // Sets every record's FORMAT list to the fields named on the command line and prunes the header to match.
+    if (argc < 3) {
+        cerr << "usage: " << argv[0] << " <vcf file> [FIELD1] [FIELD2] ..." << endl
+             << "outputs each record in the vcf file, removing FORMAT fields not listed "
+             << "on the command line from sample specifications in the output"
+             << endl;
+        return 1;
+    }
+
+    string filename = argv[1];
+
+    vector<string> newFormat;   // requested fields, in command-line order
+    set<string> fieldsToKeep;   // the same fields, for membership tests against the header
+    for (int i = 2; i < argc; ++i) {
+        fieldsToKeep.insert(argv[i]);
+        newFormat.push_back(argv[i]);
+    }
+
+    VariantCallFile variantFile;
+    if (filename == "-") {
+        variantFile.open(std::cin);
+    } else {
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    Variant var(variantFile);
+
+    vector<string> formatIds = variantFile.formatIds();
+    for (vector<string>::iterator i = formatIds.begin(); i != formatIds.end(); ++i) {
+        if (!fieldsToKeep.count(*i)) {
+            variantFile.removeGenoHeaderLine(*i);
+        }
+    }
+
+    // write the header
+    cout << variantFile.header << endl;
+
+    // print the records, each with its FORMAT list replaced by the requested fields
+    while (variantFile.getNextVariant(var)) {
+        var.format = newFormat;
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfkeepinfo.cpp b/src/vcfkeepinfo.cpp
new file mode 100644
index 0000000..916ca89
--- /dev/null
+++ b/src/vcfkeepinfo.cpp
@@ -0,0 +1,68 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) {
+    // Drops every INFO field and INFO flag not named on the command line, from the header and from each record.
+    if (argc < 3) {
+        cerr << "usage: " << argv[0] << " <vcf file> [FIELD1] [FIELD2] ..." << endl
+             << "outputs each record in the vcf file, removing INFO fields not listed on the command line" << endl;
+        return 1;
+    }
+
+    string filename = argv[1];
+
+    set<string> fieldsToKeep;
+    for (int i = 2; i < argc; ++i) {
+        fieldsToKeep.insert(argv[i]);
+    }
+
+    VariantCallFile variantFile;
+    if (filename == "-") {
+        variantFile.open(std::cin);
+    } else {
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    Variant var(variantFile);
+
+    vector<string> infoIds = variantFile.infoIds();
+    for (vector<string>::iterator i = infoIds.begin(); i != infoIds.end(); ++i) {
+        if (!fieldsToKeep.count(*i)) {
+            variantFile.removeInfoHeaderLine(*i);
+        }
+    }
+
+    // write the header
+    cout << variantFile.header << endl;
+
+    // print the records after erasing every INFO entry and flag that was not requested
+    while (variantFile.getNextVariant(var)) {
+        for (map<string, vector<string> >::iterator i = var.info.begin(); i != var.info.end(); ) {
+            map<string, vector<string> >::iterator current = i++; // step first: erase() invalidates only the erased iterator
+            if (!fieldsToKeep.count(current->first)) {
+                var.info.erase(current);
+            }
+        }
+        for (map<string, bool>::iterator i = var.infoFlags.begin(); i != var.infoFlags.end(); ) {
+            map<string, bool>::iterator current = i++; // step first: erase() invalidates only the erased iterator
+            if (!fieldsToKeep.count(current->first)) {
+                var.infoFlags.erase(current);
+            }
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfkeepsamples.cpp b/src/vcfkeepsamples.cpp
new file mode 100644
index 0000000..935c8a1
--- /dev/null
+++ b/src/vcfkeepsamples.cpp
@@ -0,0 +1,54 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) {
+
+    // need the VCF path (or "-") and at least one sample name
+    if (argc < 3) {
+        cerr << "usage: " << argv[0] << " <vcf file> [SAMPLE1] [SAMPLE2] ..." << endl
+             << "outputs each record in the vcf file, removing samples not listed on the command line" << endl;
+        return 1;
+    }
+
+    string vcfPath = argv[1];
+
+    // every argument after the path is a sample to retain, in command-line order
+    vector<string> keptSamples(argv + 2, argv + argc);
+
+    // "-" selects standard input; anything else is opened as a file
+    VariantCallFile variantFile;
+    if (vcfPath != "-") {
+        variantFile.open(vcfPath);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    // give up with status 1, printing nothing, when the input could not be opened
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    // the record object is created before the file's sample list is rewritten
+    Variant record(variantFile);
+
+    // update the sample list in the header, then limit the record's output samples to match
+    variantFile.updateSamples(keptSamples);
+    record.setOutputSampleNames(keptSamples);
+
+    // emit the rewritten header
+    cout << variantFile.header << endl;
+
+    // stream the records; sample filtering comes from the output sample names set above
+    while (variantFile.getNextVariant(record)) {
+        cout << record << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfleftalign.cpp b/src/vcfleftalign.cpp
new file mode 100644
index 0000000..f4b992e
--- /dev/null
+++ b/src/vcfleftalign.cpp
@@ -0,0 +1,781 @@
+#include "Variant.h"
+#include "convert.h"
+#include "join.h"
+#include "split.h"
+#include "fastahack/Fasta.h"
+#include <set>
+#include <vector>
+#include <getopt.h>
+#include <cmath>
+
+using namespace std;
+using namespace vcf;
+
+
+// Attempts to left-realign all the indels represented by the alignment cigar.
+//
+// This is done by shifting all indels as far left as they can go without
+// mismatch, then merging neighboring indels of the same class.  leftAlign
+// updates the alignment cigar with changes, and returns true if realignment
+// changed the alignment cigar.
+//
+// To left-align, we move multi-base indels left by their own length as long as
+// the preceding bases match the inserted or deleted sequence.  After this
+// step, we handle multi-base homopolymer indels by shifting them one base to
+// the left until they mismatch the reference.
+//
+// To merge neighboring indels, we iterate through the set of left-stabilized
+// indels.  For each indel we add a new cigar element to the new cigar.  If a
+// deletion follows a deletion, or an insertion occurs at the same place as
+// another insertion, we merge the events by extending the previous cigar
+// element.
+//
+// In practice, we must call this function until the alignment is stabilized.
+
+#define VCFLEFTALIGN_DEBUG(msg) \
+    if (false) { cerr << msg; }
+
+// A single insertion or deletion extracted from an alignment cigar.
+class VCFIndelAllele {
+    friend ostream& operator<<(ostream&, const VCFIndelAllele&);
+    friend bool operator==(const VCFIndelAllele&, const VCFIndelAllele&);
+    friend bool operator!=(const VCFIndelAllele&, const VCFIndelAllele&);
+    friend bool operator<(const VCFIndelAllele&, const VCFIndelAllele&);
+public:
+    bool insertion;    // true for an insertion, false for a deletion
+    int length;        // number of bases inserted or deleted
+    int position;      // offset of the indel in the reference sequence
+    int readPosition;  // offset of the indel in the alternate (read) sequence
+    string sequence;   // the inserted or deleted bases
+
+    // reports whether every base of `sequence` is the same character
+    bool homopolymer(void);
+
+    // stores the five attributes exactly as given
+    VCFIndelAllele(bool isInsertion, int indelLength, int refOffset, int readOffset, string bases)
+        : insertion(isInsertion)
+        , length(indelLength)
+        , position(refOffset)
+        , readPosition(readOffset)
+        , sequence(bases)
+    { }
+};
+
+bool FBhomopolymer(string sequence);
+ostream& operator<<(ostream& out, const VCFIndelAllele& indel);
+bool operator==(const VCFIndelAllele& a, const VCFIndelAllele& b);
+bool operator!=(const VCFIndelAllele& a, const VCFIndelAllele& b);
+bool operator<(const VCFIndelAllele& a, const VCFIndelAllele& b);
+
+// Reports whether the indel's bases are all the same character.
+//
+// An empty sequence is treated as a homopolymer; the guard avoids reading the
+// first character of a string that has none.
+bool VCFIndelAllele::homopolymer(void) {
+    if (sequence.empty()) {
+        return true;
+    }
+    // no character differing from the first one => homopolymer
+    return sequence.find_first_not_of(sequence[0]) == string::npos;
+}
+
+// Reports whether `sequence` consists of a single repeated character.
+//
+// An empty string is treated as a homopolymer; the guard avoids reading the
+// first character of a string that has none.
+bool FBhomopolymer(string sequence) {
+    if (sequence.empty()) {
+        return true;
+    }
+    // no character differing from the first one => homopolymer
+    return sequence.find_first_not_of(sequence[0]) == string::npos;
+}
+
+// Writes the indel as "<i|d>:<position>:<readPosition>:<sequence>", where the
+// leading letter is "i" for an insertion and "d" for a deletion.
+ostream& operator<<(ostream& out, const VCFIndelAllele& indel) {
+    string kind("d");
+    if (indel.insertion) {
+        kind = "i";
+    }
+    out << kind << ":" << indel.position;
+    out << ":" << indel.readPosition;
+    out << ":" << indel.sequence;
+    return out;
+}
+
+// Two indels are equal when type, length, reference position and sequence all
+// agree; readPosition is not part of the comparison.
+bool operator==(const VCFIndelAllele& a, const VCFIndelAllele& b) {
+    if (a.insertion != b.insertion) return false;
+    if (a.length != b.length) return false;
+    if (a.position != b.position) return false;
+    return a.sequence == b.sequence;
+}
+
+// Negation of operator== for indels.
+bool operator!=(const VCFIndelAllele& a, const VCFIndelAllele& b) {
+    if (a == b) {
+        return false;
+    }
+    return true;
+}
+
+// Orders indels by comparing their printed forms (see operator<<) as strings.
+bool operator<(const VCFIndelAllele& a, const VCFIndelAllele& b) {
+    ostringstream lhs;
+    lhs << a;
+    ostringstream rhs;
+    rhs << b;
+    const string left = lhs.str();
+    const string right = rhs.str();
+    return left < right;
+}
+
+
+// One alternate allele together with its alignment: the position handed in by
+// the caller, the aligned sequence, and the cigar parsed into
+// (length, operation) pairs by splitCigar().
+class AltAlignment {
+public:
+    unsigned int pos;
+    string seq;
+    vector<pair<int, string> > cigar;
+
+    // p: alignment position, s: aligned sequence, c: cigar in string form
+    AltAlignment(unsigned int& p,
+                 string& s,
+                 string& c)
+        : pos(p)
+        , seq(s)
+        , cigar(splitCigar(c))
+    { }
+};
+
+// Shannon entropy of the characters of `st`, in bits per character: the
+// negated sum of f * log2(f) over the frequency f of each distinct character.
+double entropy(const string& st) {
+    // a sorted multiset lets us visit each distinct character once, in
+    // ascending order, and ask how often it occurs
+    const multiset<char> symbols(st.begin(), st.end());
+    const double total = (double) st.size();
+    const double ln2 = log(2);
+
+    double sum = 0;
+    multiset<char>::const_iterator c = symbols.begin();
+    while (c != symbols.end()) {
+        const double freq = (double) symbols.count(*c) / total;
+        sum += freq * log(freq)/ln2;
+        c = symbols.upper_bound(*c);  // jump to the next distinct character
+    }
+    return -sum;
+}
+
+// Aligns every alternate allele of `var` against its reference context.
+//
+// `ref` is overwritten with pad + left flank + var.ref + right flank + pad,
+// where each flank is window/2 bases fetched from `reference` and each pad is
+// window/2 'Z' characters.  Each alt is embedded in the same flanks and pads,
+// aligned to `ref` with Smith-Waterman-Gotoh, and appended to `alignments` in
+// the order of var.alt.
+void getAlignment(Variant& var, FastaReference& reference, string& ref, vector<AltAlignment>& alignments, int window) {
+
+    // default alignment params
+    float matchScore = 10.0f;
+    float mismatchScore = -9.0f;
+    float gapOpenPenalty = 25.0f;
+    float gapExtendPenalty = 3.33f;
+
+    // establish reference sequence
+    const int flank = window/2;
+    string pad = string(flank, 'Z');
+    string leftFlank = reference.getSubSequence(var.sequenceName, var.zeroBasedPosition() - flank, flank);
+    string rightFlank = reference.getSubSequence(var.sequenceName, var.zeroBasedPosition() + var.ref.size(), flank);
+    const string prefix = pad + leftFlank;
+    const string suffix = rightFlank + pad;
+    ref = prefix + var.ref + suffix;
+
+    // and iterate through the alternates, generating alignments
+    for (size_t i = 0; i < var.alt.size(); ++i) {
+        string paddedAlt = prefix + var.alt[i] + suffix;
+        CSmithWatermanGotoh aligner(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
+        unsigned int referencePos;
+        string cigar;
+        aligner.Align(referencePos, cigar, ref, paddedAlt);
+        alignments.push_back(AltAlignment(referencePos, paddedAlt, cigar));
+    }
+}
+
+
+// Forward declarations.  The stablyLeftAlign prototype includes the Cigar&
+// argument so that it matches the definition later in this file; without it
+// the default arguments were attached to an overload that is never defined.
+bool stablyLeftAlign(string& alternateSequence, string referenceSequence, Cigar& cigar, int maxiterations = 50, bool debug = false);
+int countMismatches(string& alternateSequence, string referenceSequence);
+
+// Performs one pass of left-realignment over the indels described by `cigar`.
+//
+//   alternateSequence  the aligned (read/alternate) sequence; only read here
+//   cigar              alignment as (length, operation) pairs; when the
+//                      alignment contains indels it is replaced by a cigar
+//                      rebuilt from S, M, I and D elements
+//   referenceSequence  the sequence the alternate was aligned against; only
+//                      read here
+//   debug              consulted only when VERBOSE_DEBUG is defined
+//
+// Returns true when the rebuilt cigar differs from the incoming one, and false
+// when it is identical or when the alignment has no indels.  Terminates the
+// process with exit(1) if an indel ends up left of its predecessor.
+bool leftAlign(string& alternateSequence, Cigar& cigar, string& referenceSequence, bool debug = false) {
+
+    int arsOffset = 0; // pointer to insertion point in aligned reference sequence
+    string alignedReferenceSequence = referenceSequence;
+    int aabOffset = 0;
+    string alignmentAlignedBases = alternateSequence;
+
+    // store information about the indels
+    vector<VCFIndelAllele> indels;
+
+    int rp = 0;  // read position, 0-based relative to read
+    int sp = 0;  // sequence position
+
+    string softBegin;
+    string softEnd;
+
+    // walk the cigar once, recording every indel with its reference and read
+    // offsets and building gapped copies of both sequences for debug output
+    stringstream cigar_before, cigar_after;
+    for (vector<pair<int, string> >::const_iterator c = cigar.begin();
+        c != cigar.end(); ++c) {
+        unsigned int l = c->first;
+        char t = c->second.at(0);
+
+        cigar_before << l << t;
+        if (t == 'M') { // match or mismatch
+            sp += l;
+            rp += l;
+        } else if (t == 'D') { // deletion
+            indels.push_back(VCFIndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l)));
+            alignmentAlignedBases.insert(rp + aabOffset, string(l, '-'));
+            aabOffset += l;
+            sp += l;  // update reference sequence position
+        } else if (t == 'I') { // insertion
+            indels.push_back(VCFIndelAllele(true, l, sp, rp, alternateSequence.substr(rp, l)));
+            alignedReferenceSequence.insert(sp + softBegin.size() + arsOffset, string(l, '-'));
+            arsOffset += l;
+            rp += l;
+        } else if (t == 'S') { // soft clip, clipped sequence present in the read not matching the reference
+            // remove these bases from the refseq and read seq, but don't modify the alignment sequence
+            if (rp == 0) {
+                alignedReferenceSequence = string(l, '*') + alignedReferenceSequence;
+                softBegin = alignmentAlignedBases.substr(0, l);
+            } else {
+                alignedReferenceSequence = alignedReferenceSequence + string(l, '*');
+                softEnd = alignmentAlignedBases.substr(alignmentAlignedBases.size() - l, l);
+            }
+            rp += l;
+        } else if (t == 'H') { // hard clip on the read, clipped sequence is not present in the read
+        } else if (t == 'N') { // skipped region in the reference not present in read, aka splice
+            sp += l;
+        }
+    }
+
+
+    int alignedLength = sp;
+
+    VCFLEFTALIGN_DEBUG("| " << cigar_before.str() << endl
+       << "| " << alignedReferenceSequence << endl
+       << "| " << alignmentAlignedBases << endl);
+
+    // if no indels, return the alignment
+    if (indels.empty()) { return false; }
+
+    // for each indel, from left to right
+    //     while the indel sequence repeated to the left and we're not matched up with the left-previous indel
+    //         move the indel left
+
+    vector<VCFIndelAllele>::iterator previous = indels.begin();
+    for (vector<VCFIndelAllele>::iterator id = indels.begin(); id != indels.end(); ++id) {
+
+        // left shift by repeats
+        //
+        // from 1 base to the length of the indel, attempt to shift left
+        // if the move would cause no change in alignment optimality (no
+        // introduction of mismatches, and by definition no change in gap
+        // length), move to the new position.
+        // in practice this moves the indel left when we reach the size of
+        // the repeat unit.
+        //
+        int steppos, readsteppos;
+        VCFIndelAllele& indel = *id;
+        int i = 1;
+        while (i <= indel.length) {
+
+            steppos = indel.position - i;
+            readsteppos = indel.readPosition - i;
+
+#ifdef VERBOSE_DEBUG
+            if (debug) {
+                if (steppos >= 0 && readsteppos >= 0) {
+                    cerr << referenceSequence.substr(steppos, indel.length) << endl;
+                    cerr << alternateSequence.substr(readsteppos, indel.length) << endl;
+                    cerr << indel.sequence << endl;
+                }
+            }
+#endif
+            while (steppos >= 0 && readsteppos >= 0
+                   && indel.sequence == referenceSequence.substr(steppos, indel.length)
+                   && indel.sequence == alternateSequence.substr(readsteppos, indel.length)
+                   && (id == indels.begin()
+                       || (previous->insertion && steppos >= previous->position)
+                       || (!previous->insertion && steppos >= previous->position + previous->length))) {
+                VCFLEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " shifting " << i << "bp left" << endl);
+                indel.position -= i;
+                indel.readPosition -= i;
+                steppos = indel.position - i;
+                readsteppos = indel.readPosition - i;
+            }
+            // advance i to the next divisor of the indel length (or past it)
+            do {
+                ++i;
+            } while (i <= indel.length && indel.length % i != 0);
+        }
+
+        // left shift indels with exchangeable flanking sequence
+        //
+        // for example:
+        //
+        //    GTTACGTT           GTTACGTT
+        //    GT-----T   ---->   G-----TT
+        //
+        // GTGTGACGTGT           GTGTGACGTGT
+        // GTGTG-----T   ---->   GTG-----TGT
+        //
+        // GTGTG-----T           GTG-----TGT
+        // GTGTGACGTGT   ---->   GTGTGACGTGT
+        //
+        //
+        steppos = indel.position - 1;
+        readsteppos = indel.readPosition - 1;
+        while (steppos >= 0 && readsteppos >= 0
+               && alternateSequence.at(readsteppos) == referenceSequence.at(steppos)
+               && alternateSequence.at(readsteppos) == indel.sequence.at(indel.sequence.size() - 1)
+               && (id == indels.begin()
+                   || (previous->insertion && indel.position - 1 >= previous->position)
+                   || (!previous->insertion && indel.position - 1 >= previous->position + previous->length))) {
+            VCFLEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " exchanging bases " << 1 << "bp left" << endl);
+            // rotate the indel sequence right by one base to reflect the shift
+            indel.sequence = indel.sequence.at(indel.sequence.size() - 1) + indel.sequence.substr(0, indel.sequence.size() - 1);
+            indel.position -= 1;
+            indel.readPosition -= 1;
+            steppos = indel.position - 1;
+            readsteppos = indel.readPosition - 1;
+        }
+        // tracks previous indel, so we don't run into it with the next shift
+        previous = id;
+    }
+
+    // bring together floating indels
+    // from left to right
+    // check if we could merge with the next indel
+    // if so, adjust so that we will merge in the next step
+    if (indels.size() > 1) {
+        previous = indels.begin();
+        for (vector<VCFIndelAllele>::iterator id = (indels.begin() + 1); id != indels.end(); ++id) {
+            VCFIndelAllele& indel = *id;
+            // parsimony: could we shift right and merge with the previous indel?
+            // if so, do it
+            int prev_end_ref = previous->insertion ? previous->position : previous->position + previous->length;
+            int prev_end_read = !previous->insertion ? previous->readPosition : previous->readPosition + previous->length;
+            // same-class neighbours with a gap between them: an insertion
+            // occupies `length` bases of the read, a deletion `length` bases of
+            // the reference, so that is the coordinate extended by the length
+            // (the insertion case used to add readPosition to itself)
+            if (previous->insertion == indel.insertion
+                    && ((previous->insertion
+                        && (previous->position < indel.position
+                        && previous->readPosition + previous->length < indel.readPosition))
+                        ||
+                        (!previous->insertion
+                        && (previous->position + previous->length < indel.position)
+                        && (previous->readPosition < indel.readPosition)
+                        ))) {
+                if (previous->homopolymer()) {
+                    string seq = referenceSequence.substr(prev_end_ref, indel.position - prev_end_ref);
+                    string readseq = alternateSequence.substr(prev_end_read, indel.position - prev_end_ref);
+                    VCFLEFTALIGN_DEBUG("seq: " << seq << endl << "readseq: " << readseq << endl);
+                    if (previous->sequence.at(0) == seq.at(0)
+                            && FBhomopolymer(seq)
+                            && FBhomopolymer(readseq)) {
+                        VCFLEFTALIGN_DEBUG("moving " << *previous << " right to "
+                                << (indel.insertion ? indel.position : indel.position - previous->length) << endl);
+                        previous->position = indel.insertion ? indel.position : indel.position - previous->length;
+                    }
+                }
+                else {
+                    int pos = previous->position;
+                    while (pos < (int) referenceSequence.length() &&
+                            ((previous->insertion && pos + previous->length <= indel.position)
+                            ||
+                            (!previous->insertion && pos + previous->length < indel.position))
+                            && previous->sequence
+                                == referenceSequence.substr(pos + previous->length, previous->length)) {
+                        pos += previous->length;
+                    }
+                    // NOTE(review): pos starts at previous->position and only
+                    // grows in the loop above, so `pos < previous->position`
+                    // cannot hold and this tandem-repeat merge never runs as
+                    // written.  Left unchanged; confirm the intended comparison
+                    // before enabling it.
+                    if (pos < previous->position &&
+                        ((previous->insertion && pos + previous->length == indel.position)
+                        ||
+                        (!previous->insertion && pos == indel.position - previous->length))
+                       ) {
+                        VCFLEFTALIGN_DEBUG("right-merging tandem repeat: moving " << *previous << " right to " << pos << endl);
+                        previous->position = pos;
+                    }
+                }
+            }
+            previous = id;
+        }
+    }
+
+    // for each indel
+    //     if ( we're matched up to the previous insertion (or deletion)
+    //          and it's also an insertion or deletion )
+    //         merge the indels
+    //
+    // and simultaneously reconstruct the cigar
+
+    Cigar newCigar;
+
+    if (!softBegin.empty()) {
+        newCigar.push_back(make_pair(softBegin.size(), "S"));
+    }
+
+    // the first indel is preceded by a match block unless it sits at offset 0
+    vector<VCFIndelAllele>::iterator id = indels.begin();
+    VCFIndelAllele last = *id++;
+    if (last.position > 0) {
+        newCigar.push_back(make_pair(last.position, "M"));
+    }
+    newCigar.push_back(make_pair(last.length, (last.insertion ? "I" : "D")));
+    // reference offset just past the last emitted indel
+    int lastend = last.insertion ? last.position : (last.position + last.length);
+    VCFLEFTALIGN_DEBUG(last << ",");
+
+    for (; id != indels.end(); ++id) {
+        VCFIndelAllele& indel = *id;
+        VCFLEFTALIGN_DEBUG(indel << ",");
+        if (indel.position < lastend) {
+            cerr << "impossibility?: indel realigned left of another indel" << endl
+                 << referenceSequence << endl << alternateSequence << endl;
+            exit(1);
+        } else if (indel.position == lastend && indel.insertion == last.insertion) {
+            // abutting indel of the same class: extend the previous element
+            pair<int, string>& op = newCigar.back();
+            op.first += indel.length;
+        } else if (indel.position >= lastend) {  // also catches differential indels, but with the same position
+            newCigar.push_back(make_pair(indel.position - lastend, "M"));
+            newCigar.push_back(make_pair(indel.length, (indel.insertion ? "I" : "D")));
+        }
+        last = *id;
+        lastend = last.insertion ? last.position : (last.position + last.length);
+    }
+
+    // trailing match block up to the end of the aligned reference span
+    if (lastend < alignedLength) {
+        newCigar.push_back(make_pair(alignedLength - lastend, "M"));
+    }
+
+    if (!softEnd.empty()) {
+        newCigar.push_back(make_pair(softEnd.size(), "S"));
+    }
+
+    VCFLEFTALIGN_DEBUG(endl);
+
+    cigar = newCigar;
+
+    for (vector<pair<int, string> >::const_iterator c = cigar.begin();
+        c != cigar.end(); ++c) {
+        unsigned int l = c->first;
+        char t = c->second.at(0);
+        cigar_after << l << t;
+    }
+
+    //cerr << cigar_before.str() << " changes to " << cigar_after.str() << endl;
+    VCFLEFTALIGN_DEBUG(cigar_after.str() << endl);
+
+    // we realigned exactly when the textual cigar changed
+    return cigar_after.str() != cigar_before.str();
+
+}
+
+// Iteratively left-aligns the indels in the alignment until we have a stable
+// realignment.  Returns true on realignment success or non-realignment.
+// Returns false if we exceed the maximum number of realignment iterations.
+//
+bool stablyLeftAlign(string& alternateSequence, string referenceSequence, Cigar& cigar, int maxiterations, bool debug) {
+
+    // a first pass that changes nothing means the alignment is already stable
+    if (!leftAlign(alternateSequence, cigar, referenceSequence, debug)) {
+        return true;
+    }
+
+    // keep realigning while passes still change the cigar; the iteration
+    // budget is only decremented after a pass that changed something
+    for (;;) {
+        if (!leftAlign(alternateSequence, cigar, referenceSequence, debug)) {
+            break;
+        }
+        if (--maxiterations <= 0) {
+            break;
+        }
+    }
+
+    // an exhausted (or never positive) budget is reported as failure
+    return maxiterations > 0;
+
+}
+
+
+// Writes the usage text to stderr and terminates the process with status 0.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] [file]" << endl;
+    cerr << endl;
+    cerr << "options:" << endl;
+    cerr << "    -r, --reference FILE  Use this reference as a basis for realignment." << endl;
+    cerr << "    -w, --window N        Use a window of this many bp when left aligning (150)." << endl;
+    cerr << endl;
+    cerr << "Left-aligns variants in the specified input file or stdin.  Window size is determined" << endl;
+    cerr << "dynamically according to the entropy of the regions flanking the indel.  These must have" << endl;
+    cerr << "entropy > 1 bit/bp, or be shorter than ~5kb." << endl;
+    exit(0);
+}
+
+// Entry point for vcfleftalign.
+//
+// Reads a VCF from the file named on the command line (or stdin), left-aligns
+// every record that contains an indel against the reference given with -r,
+// and writes the resulting VCF to stdout.  Records without indels, records
+// whose alignment cannot be interpreted, and records whose realignment would
+// change the haplotype are written unchanged.  Exits with status 1 when the
+// VCF cannot be opened, no reference is given, or the window is negative.
+int main(int argc, char** argv) {
+
+    int window = 150;
+    VariantCallFile variantFile;
+    string fastaFileName;
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"reference", required_argument, 0, 'r'},
+                {"window", required_argument, 0, 'w'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hw:r:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+        case 'r':
+            fastaFileName = optarg;
+            break;
+
+        case 'w':
+            window = atoi(optarg);
+            break;
+
+        case '?':
+            // NOTE(review): printSummary() itself calls exit(0), so the
+            // exit(1) below is not reached and a bad option ends with status 0
+            printSummary(argv);
+            exit(1);
+            break;
+
+        case 'h':
+            printSummary(argv);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    // a negative window would turn into a negative flank length and a window
+    // that the doubling loop below can never grow past its limit
+    if (window < 0) {
+        cerr << "window size must not be negative" << endl;
+        exit(1);
+    }
+
+    if (optind < argc) {
+        string filename = argv[optind];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open VCF file" << endl;
+        exit(1);
+    }
+
+    FastaReference fastaReference;
+    if (fastaFileName.empty()) {
+        cerr << "a reference is required" << endl;
+        exit(1);
+    } else {
+        fastaReference.open(fastaFileName);
+    }
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+
+        // if there is no indel, there is nothing to realign
+        bool hasIndel = false;
+        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+            if (a->size() != var.ref.size()) {
+                hasIndel = true;
+                break;
+            }
+        }
+        if (!hasIndel) {
+            cout << var << endl;
+            continue;
+        }
+
+        vector<AltAlignment> alignments;
+        string ref;
+
+        // determine window size to prevent mismapping with SW algorithm:
+        // at least `scale` times the longest allele
+        int currentWindow = window;
+        int scale = 2;
+        if (var.ref.size()*scale > currentWindow) currentWindow = var.ref.size()*scale;
+        for (vector<string>::iterator a = var.alleles.begin(); a != var.alleles.end(); ++a) {
+            if (a->size()*scale > currentWindow) {
+                currentWindow = a->size()*scale;
+            }
+        }
+
+        // while the entropy of either flank is < some target entropy (~1 is fine), increase the flank sizes
+        while (currentWindow < 2000) { // limit to one step > than this
+            string refTarget = fastaReference.getSubSequence(var.sequenceName, var.position - 1 - currentWindow/2, currentWindow);
+            if (entropy(refTarget.substr(0, refTarget.size()/2)) < 1 ||
+                entropy(refTarget.substr(refTarget.size()/2)) < 1) {
+                currentWindow *= scale;
+            } else {
+                break;
+            }
+        }
+
+        // do the alignments
+        getAlignment(var, fastaReference, ref, alignments, currentWindow);
+
+        // stably left align the alignments
+        for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end(); ++a) {
+            long int prev = a->pos;
+            stablyLeftAlign(a->seq, ref, a->cigar, 20, false);
+            if (a->pos != prev) cerr << "modified alignment @ " << var << endl;
+        }
+
+        // transform the mappings
+        // chop off leading matching bases
+        // find the range of bp in the alleles
+        // make the new ref allele
+        // make the new alt alleles
+        // emit the var
+
+        // start from an inverted range so the min/max below pick up the
+        // extent of the first alignment
+        long int newPosition = var.position+currentWindow/2;
+        long int newEndPosition = var.position-currentWindow/2;
+        bool giveUp = false;
+        for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end() && !giveUp; ++a) {
+            Cigar::iterator c = a->cigar.begin();
+
+            int matchingBpAtStart = 0;
+            int matchingBpAtEnd = 0;
+            // will be set to true if the first reference position match is broken by a SNP, not an indel
+            bool leadingSNP = false;
+
+            while (c != a->cigar.end()) {
+                char op = c->second[0];
+                if (c == a->cigar.begin()) {
+                    if (op != 'M') {
+                        cerr << "alignment does not start on matched sequence" << endl;
+                        cerr << var << endl;
+                        exit(1);
+                    }
+                    // count identical bases at the start of the first match block
+                    int i = 0;
+                    for ( ; i < c->first; ++i) {
+                        if (ref[i] != a->seq[i]) {
+                            leadingSNP = true;
+                            break;
+                        }
+                    }
+                    matchingBpAtStart = i;
+                }
+                if (!leadingSNP && c == (a->cigar.begin()+1)) {
+                    // if the first thing we run into is an indel, step back, per VCF spec
+                    if (op == 'D' || op == 'I') {
+                        --matchingBpAtStart;
+                    }
+                }
+                if (c == (a->cigar.end()-1)) {
+                    if (op != 'M') {
+                        // soft clip at end
+                        // it'll be hard to interpret this
+                        // the alignments sometimes generate this
+                        // best thing to do is to move on
+                        giveUp = true;
+                        break;
+                    }
+                    // count identical bases at the end of the last match block
+                    int i = 0;
+                    for ( ; i < c->first; ++i) {
+                        if (ref[ref.size()-1-i] != a->seq[a->seq.size()-1-i]) {
+                            break;
+                        }
+                    }
+                    matchingBpAtEnd = i;
+                }
+                ++c;
+            }
+
+            int refMismatchLength = (var.ref.size() + currentWindow) - matchingBpAtEnd - matchingBpAtStart;
+            long int newStart = var.position - currentWindow/2 + matchingBpAtStart;
+            long int newEnd = newStart + refMismatchLength;
+            newPosition = min(newStart, newPosition);
+            newEndPosition = max(newEnd, newEndPosition);
+        }
+
+        // the alignment failed for some reason, continue
+        if (giveUp) {
+            cout << var << endl;
+            continue;
+        }
+
+        int newRefSize = newEndPosition - newPosition;
+        string newRef = fastaReference.getSubSequence(var.sequenceName, newPosition-1, newRefSize);
+        // get the number of bp to strip from the alts
+        int stripFromStart = currentWindow/2 - (var.position - newPosition);
+        int stripFromEnd = (currentWindow + newRefSize) - (stripFromStart + newRefSize) + (var.ref.size() - newRefSize);
+
+        // cut each padded alt down to the span covered by the new ref allele
+        vector<string> newAlt;
+        bool failedAlt = false;
+        for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end(); ++a) {
+            string alt = a->seq.substr(stripFromStart, a->seq.size() - (stripFromEnd + stripFromStart));
+            newAlt.push_back(alt);
+            if (alt.empty()) failedAlt = true;
+        }
+
+        // check the before/after haplotypes
+        bool brokenRealignment = false;
+        if (!newRef.empty() && !failedAlt) {
+            int slop = 50; // 50 extra bp!
+            int haplotypeStart = min(var.position, newPosition) - slop;
+            int haplotypeEnd = max(var.position + var.ref.size(), newPosition + newRef.size()) + slop;
+            string referenceHaplotype = fastaReference.getSubSequence(var.sequenceName, haplotypeStart - 1,
+                                                                      haplotypeEnd - haplotypeStart);
+            vector<string>::iterator o = var.alt.begin();
+            vector<string>::iterator n = newAlt.begin();
+            for ( ; o != var.alt.end() ; ++o, ++n) {
+                // substitute the old and the new allele into the same
+                // reference stretch; the two results must be identical
+                string oldHaplotype = referenceHaplotype;
+                string newHaplotype = referenceHaplotype;
+                oldHaplotype.replace(var.position - haplotypeStart, var.ref.size(), *o);
+                newHaplotype.replace(newPosition - haplotypeStart, newRef.size(), *n);
+                if (oldHaplotype != newHaplotype) {
+                    cerr << "broken left alignment!" << endl
+                         << "old " << oldHaplotype << endl
+                         << "new " << newHaplotype << endl;
+                    cerr << "was: " << var << endl;
+                    brokenRealignment = true;
+                }
+            }
+        }
+
+        // *if* everything is OK, update the variant
+        if (!brokenRealignment && !newRef.empty() && !failedAlt) {
+            var.ref = newRef;
+            var.alt = newAlt;
+            var.position = newPosition;
+        }
+
+        cout << var << endl;
+
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcflength.cpp b/src/vcflength.cpp
new file mode 100644
index 0000000..ebcc1a2
--- /dev/null
+++ b/src/vcflength.cpp
@@ -0,0 +1,49 @@
+#include "Variant.h"
+#include "convert.h"
+#include <vector>
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) {
+    // Annotate every record with INFO length, length.ref and length.alt.
+    VariantCallFile variantFile;
+
+    // Read the file named by the first argument, otherwise stdin.
+    if (argc > 1) {
+        string filename = argv[1];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    variantFile.addHeaderLine("##INFO=<ID=length,Number=A,Type=Integer,Description=\"length(ALT) - length(REF) for each ALT\">");
+    variantFile.addHeaderLine("##INFO=<ID=length.ref,Number=1,Type=Integer,Description=\"length(REF)\">");
+    variantFile.addHeaderLine("##INFO=<ID=length.alt,Number=A,Type=Integer,Description=\"length(ALT) for each ALT\">");
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        // Start from empty value lists so values already present in the input are replaced.
+        vector<string>& diffs = var.info["length"];
+        vector<string>& refLens = var.info["length.ref"];
+        vector<string>& altLens = var.info["length.alt"];
+        diffs.clear();
+        refLens.clear();
+        altLens.clear();
+
+        refLens.push_back(convert(var.ref.size()));
+        for (vector<string>::iterator alt = var.alt.begin(); alt != var.alt.end(); ++alt) {
+            diffs.push_back(convert((int) alt->size() - (int) var.ref.size()));
+            altLens.push_back(convert((int) alt->size()));
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+}
+
diff --git a/src/vcfnumalt.cpp b/src/vcfnumalt.cpp
new file mode 100644
index 0000000..a7c66cb
--- /dev/null
+++ b/src/vcfnumalt.cpp
@@ -0,0 +1,55 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <sstream>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) {
+    // Exactly one argument is expected: a VCF path, or "-" to read stdin.
+    if (argc != 2) {
+        cerr << "usage: " << argv[0] << " <vcf file>" << endl;
+        cerr << "outputs a VCF stream where NUMALT has been generated for each record using sample genotypes" << endl;
+        return 1;
+    }
+
+    string filename = argv[1];
+    const bool fromStdin = (filename == "-");
+
+    VariantCallFile variantFile;
+    if (fromStdin) {
+        variantFile.open(std::cin);
+    } else {
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open " << filename << endl;
+        return 1;
+    }
+
+    Variant var(variantFile);
+
+    // Replace any existing NUMALT declaration so the header carries exactly one.
+    variantFile.removeInfoHeaderLine("NUMALT");
+    variantFile.addHeaderLine("##INFO=<ID=NUMALT,Number=1,Type=Integer,Description=\"Total number of segregating alternate alleles at the loci\">");
+
+    // Emit the updated header before any records.
+    cout << variantFile.header << endl;
+
+    // NUMALT is set to the number of ALT alleles listed on each record.
+    while (variantFile.getNextVariant(var)) {
+        stringstream altCount;
+        altCount << var.alt.size();
+
+        // Overwrite rather than append, in case the input already had NUMALT.
+        vector<string>& numalt = var.info["NUMALT"];
+        numalt.assign(1, altCount.str());
+        cout << var << endl;
+    }
+
+    return 0;
+}
+
diff --git a/src/vcfoverlay.cpp b/src/vcfoverlay.cpp
new file mode 100644
index 0000000..1bb415c
--- /dev/null
+++ b/src/vcfoverlay.cpp
@@ -0,0 +1,109 @@
+#include "Variant.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+
+// Print the vcfoverlay usage text to stderr, then terminate with status 0.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] [<vcf file> ...]" << endl << endl;
+    cerr << "options:" << endl;
+    cerr << "    -h, --help       this dialog" << endl << endl;
+    cerr << "Overlays records in the input vcf files in the order in which they appear."
+         << endl;
+    exit(0);
+}
+
+int main(int argc, char** argv) {
+    if (argc == 1)
+        printSummary(argv);
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {0, 0, 0, 0}
+            };
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "h",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+            case 'h':
+                printSummary(argv);
+                break;
+
+            case '?':
+                // NOTE(review): printSummary() calls exit(0) itself, so the
+                // exit(1) below is never reached.
+                printSummary(argv);
+                exit(1);
+                break;
+
+            default:
+                abort ();
+        }
+    }
+
+    // Shadow-merge: files are given in order of precedence.  For every
+    // (chromosome, position, variant representation) the raw line from each
+    // file is stored under that file's argv index; the lowest index is printed.
+    map<int, pair<VariantCallFile*, Variant > > variantFiles;
+    map<string, map<long int, map<string, map<int, string> > > > linesByPrecedence;
+    VariantCallFile* headerSource = 0; // first input that opened successfully
+
+    if (!(optind < argc - 1)) {
+        cerr << "more than one input file must be specified" << endl;
+        exit(1);
+    }
+
+    for (int index = optind; index < argc; ++index) {
+        VariantCallFile*& variantFile = variantFiles[index].first;
+        Variant& var = variantFiles[index].second;
+        string inputFilename = argv[index];
+        variantFile = new VariantCallFile;
+        try {
+            if (!variantFile->open(inputFilename)) {
+                // Best effort: report the failure and carry on with the other inputs.
+                cerr << "vcfoverlay could not open VCF file " << inputFilename << endl;
+            } else {
+                if (!headerSource) headerSource = variantFile;
+                var.setVariantCallFile(variantFile);
+                while (variantFile->getNextVariant(var)) {
+                    linesByPrecedence[var.sequenceName][var.position][var.vrepr()][index] = variantFile->line;
+                }
+            }
+        } catch (...) {
+            cerr << "vcfoverlay encountered errors when opening " << inputFilename << endl;
+        }
+    }
+
+    // Take the header from the first readable input, since the first argument may have failed.
+    if (!headerSource) {
+        cerr << "vcfoverlay could not open any input VCF file" << endl;
+        exit(1);
+    }
+    cout << headerSource->header << endl;
+
+    while (!linesByPrecedence.empty()) {
+        // Take the lowest chromosome/position still buffered.  The chromosome key
+        // is copied because the map entry that owns it may be erased below.
+        const string lowestChrom = linesByPrecedence.begin()->first;
+        map<long int, map<string, map<int, string> > >& positions = linesByPrecedence.begin()->second;
+        map<string, map<int, string> >& pos = positions.begin()->second;
+        for (map<string, map<int, string> >::iterator m = pos.begin(); m != pos.end(); ++m) {
+            cout << m->second.begin()->second << endl;
+        }
+        positions.erase(positions.begin());
+        if (positions.empty()) linesByPrecedence.erase(lowestChrom);
+    }
+
+    return 0;
+}
+
diff --git a/src/vcfparsealts.cpp b/src/vcfparsealts.cpp
new file mode 100644
index 0000000..5b4e508
--- /dev/null
+++ b/src/vcfparsealts.cpp
@@ -0,0 +1,42 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) {
+    // Print each record, then one line listing what parsedAlternates() returned for it.
+    typedef map<string, vector<VariantAllele> > AlleleMap;
+    VariantCallFile variantFile;
+    if (argc > 1) {
+        string filename = argv[1];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);   // no path given: read from stdin
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        AlleleMap parsed = var.parsedAlternates();
+        cout << var << endl;
+
+        // One parenthesised group per map entry, all written on a single line.
+        for (AlleleMap::iterator entry = parsed.begin(); entry != parsed.end(); ++entry) {
+            vector<VariantAllele>& parts = entry->second;
+            cout << " ( " << entry->first << " :: ";
+            for (size_t i = 0; i < parts.size(); ++i) {
+                cout << parts[i] << "; ";
+            }
+            cout << " ) ";
+        }
+        cout << endl;
+    }
+
+    return 0;
+}
+
diff --git a/src/vcfprimers.cpp b/src/vcfprimers.cpp
new file mode 100644
index 0000000..2a5c46a
--- /dev/null
+++ b/src/vcfprimers.cpp
@@ -0,0 +1,140 @@
+#include "Variant.h"
+#include "split.h"
+#include "fastahack/Fasta.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+// Write the vcfprimers help text to stderr and exit with status 0.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl << endl;
+    cerr << "options:" << endl;
+    cerr << "    -f, --fasta-reference  FASTA reference file to use to obtain primer sequences" << endl;
+    cerr << "    -l, --primer-length    The length of the primer sequences on each side of the variant" << endl << endl;
+
+    cerr << "For each VCF record, extract the flanking sequences, and write them to stdout as FASTA" << endl;
+    cerr << "records suitable for alignment.  This tool is intended for use in designing validation" << endl;
+    cerr << "experiments.  Primers extracted which would flank all of the alleles at multi-allelic" << endl;
+    cerr << "sites.  The name of the FASTA \"reads\" indicates the VCF record which they apply to." << endl;
+    cerr << "The form is >CHROM_POS_LEFT for the 3' primer and >CHROM_POS_RIGHT for the 5' primer," << endl;
+    cerr << "for example:" << endl << endl;
+
+    // Sample of the records the tool writes.
+    cerr << ">20_233255_LEFT" << endl;
+    cerr << "CCATTGTATATATAGACCATAATTTCTTTATCCAATCATCTGTTGATGGA" << endl;
+    cerr << ">20_233255_RIGHT" << endl;
+    cerr << "ACTCAGTTGATTCCATACCTTTGCCATCATGAATCATGTTGTAATAAACA" << endl << endl;
+    exit(0);
+}
+
+
+int main(int argc, char** argv) {
+    // Write the reference flanks on each side of every VCF record as FASTA.
+    int c;
+    string fastaRef;
+    int primerLength = 0;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    while (true) {
+        static struct option long_options[] =
+        {
+            {"help", no_argument, 0, 'h'},
+            {"fasta-reference",  required_argument, 0, 'f'},
+            {"primer-length", required_argument, 0, 'l'},
+            {0, 0, 0, 0}
+        };
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hf:l:",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+        case 'f':
+            fastaRef = optarg;
+            break;
+
+        case 'l':
+            primerLength = atoi(optarg);
+            break;
+
+        case 'h':
+            printSummary(argv);
+            exit(0);
+            break;
+
+        case '?':
+            /* getopt_long already printed an error message. */
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    // atoi() yields 0 for non-numeric input, and a negative length is
+    // meaningless, so anything that is not strictly positive is rejected.
+    if (primerLength <= 0) {
+        cerr << "a primer length must be specified" << endl;
+        exit(1);
+    }
+    if (fastaRef.empty()) {
+        cerr << "a FASTA reference sequence must be specified" << endl;
+        exit(1);
+    }
+
+    FastaReference ref;
+    ref.open(fastaRef);
+
+    // Read the VCF named by the single positional argument, else stdin.
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        // 0-based start of REF, and the 0-based offset of the first base after it.
+        int refstart = var.position - 1;
+        int refend = var.position + var.ref.size() - 1;
+
+        // The left flank ends just before REF.  Near the start of a sequence there
+        // may be fewer than primerLength bases available; shorten the request
+        // instead of asking the reference for a negative offset.
+        int leftstart = refstart - primerLength;
+        int leftlength = primerLength;
+        if (leftstart < 0) {
+            leftlength += leftstart;
+            leftstart = 0;
+        }
+        string leftprimer;
+        if (leftlength > 0) {
+            leftprimer = ref.getSubSequence(var.sequenceName, leftstart, leftlength);
+        }
+        string rightprimer = ref.getSubSequence(var.sequenceName, refend, primerLength);
+        cout << ">" << var.sequenceName << "_" << var.position << "_LEFT" << endl
+             << leftprimer << endl
+             << ">" << var.sequenceName << "_" << var.position << "_RIGHT" << endl
+             << rightprimer << endl;
+    }
+
+    return 0;
+}
+
diff --git a/src/vcfqual2info.cpp b/src/vcfqual2info.cpp
new file mode 100644
index 0000000..71c3335
--- /dev/null
+++ b/src/vcfqual2info.cpp
@@ -0,0 +1,44 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) {
+    // Copy each record's QUAL into an INFO tag whose name is given as argv[1].
+    if (argc == 1) {
+        cerr << "usage: " << argv[0] << " [key] [vcf_file]" << endl;
+        cerr << "Puts QUAL into an info field tag keyed by [key]." << endl;
+        cerr << "The VCF file may be omitted and read from stdin." << endl;
+        return 1;
+    }
+
+    const string key = argv[1];
+
+    VariantCallFile variantFile;
+    if (argc > 2) {
+        string filename = argv[2];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    // Declare the new tag in the header before echoing it.
+    string headerLine = "##INFO=<ID="+key+",Number=1,Type=Float,Description=\"QUAL value of site field.\">";
+    variantFile.addHeaderLine(headerLine);
+    cout << variantFile.header << endl;
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        // Replace any previous value stored under the key with the current QUAL.
+        vector<string>& slot = var.info[key];
+        slot.clear();
+        slot.push_back(convert(var.quality));
+        cout << var << endl;
+    }
+
+    return 0;
+}
+
diff --git a/src/vcfrandom.cpp b/src/vcfrandom.cpp
new file mode 100644
index 0000000..debab84
--- /dev/null
+++ b/src/vcfrandom.cpp
@@ -0,0 +1,70 @@
+#include <sstream>
+#include <stdlib.h>
+#include <time.h>
+#include "Variant.h"
+#include <cmath>
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) {
+    // Emit a short VCF of random records; the header declares one sample column, "bill".
+    static const char* const bases[] = { "A", "T", "G", "C" };
+
+    VariantCallFile variantFile;
+
+    stringstream headerss;
+    headerss << "##fileformat=VCFv4.0" << endl;
+    headerss << "##source=vcfrandom" << endl;
+    headerss << "##reference=/d2/data/references/build_37/human_reference_v37.fa" << endl;
+    headerss << "##phasing=none" << endl;
+    headerss << "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">" << endl;
+    headerss << "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total read depth at the locus\">" << endl;
+    headerss << "##INFO=<ID=AC,Number=1,Type=Integer,Description=\"Total number of alternate alleles in called genotypes\">" << endl;
+    headerss << "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">" << endl;
+    headerss << "##INFO=<ID=AF,Number=1,Type=Float,Description=\"Estimated allele frequency in the range (0,1]\">" << endl;
+    headerss << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << endl;
+    headerss << "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype\">" << endl;
+    headerss << "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">" << endl;
+    headerss << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tbill";
+
+    string header = headerss.str();
+    variantFile.openForOutput(header);
+    cout << variantFile.header << endl;
+
+    srand(time(NULL));   // seeded from the current time
+
+    // Nine records, at positions 1 through 9 of sequence "one".
+    for (int pos = 1; pos < 10; ++pos) {
+        Variant var(variantFile);
+        var.sequenceName = "one";
+        var.position = pos;
+        var.id = ".";
+        var.filter = ".";
+        var.quality = 100;
+        var.format.push_back("GT");
+        var.format.push_back("DP");
+
+        // rand() is consumed in this order: REF base, site depth, then per sample.
+        var.ref = bases[rand() % 4];
+        stringstream siteDepth;
+        siteDepth << rand() % 100;
+        var.info["DP"].push_back(siteDepth.str());
+
+        vector<string>::iterator sample = var.sampleNames.begin();
+        for ( ; sample != var.sampleNames.end(); ++sample) {
+            // Two random ALT bases; nothing stops them matching each other or REF.
+            var.alt.clear();
+            var.alt.push_back(bases[rand() % 4]);
+            var.alt.push_back(bases[rand() % 4]);
+            map<string, vector<string> >& fields = var.samples[*sample];
+            fields["GT"].push_back("0/1");
+            stringstream dp;
+            dp << floor(rand() % 100);
+            fields["DP"].push_back(dp.str());
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+}
diff --git a/src/vcfrandomsample.cpp b/src/vcfrandomsample.cpp
new file mode 100644
index 0000000..3cc565b
--- /dev/null
+++ b/src/vcfrandomsample.cpp
@@ -0,0 +1,174 @@
+#include "Variant.h"
+#include "BedReader.h"
+#include <getopt.h>
+#include "mt19937ar.h"
+#include <sstream>
+#include <iostream>
+#include "convert.h"
+
+using namespace std;
+using namespace vcf;
+
+
+// Print usage for vcfrandomsample on stderr, then exit with status 0.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl << endl;
+    cerr << "options:" << endl;
+    cerr << "    -r, --rate RATE          base sampling probability per locus" << endl;
+    cerr << "    -s, --scale-by KEY       scale sampling likelihood by this Float info field" << endl;
+    cerr << "    -p, --random-seed N      use this random seed (by default read from /dev/random)" << endl;
+    cerr << "    -q, --pseudorandom-seed  use a pseudorandom seed (by default read from /dev/random)" << endl << endl;
+
+    cerr << "Randomly sample sites from an input VCF file, which may be provided as stdin." << endl;
+    cerr << "Scale the sampling probability by the field specified in KEY.  This may be" << endl;
+    cerr << "used to provide uniform sampling across allele frequencies, for instance." << endl;
+    exit(0);
+}
+
+int main(int argc, char** argv) {
+    // Randomly keep records from a VCF stream, optionally scaled by a Float INFO field.
+    double rate = 1.0;
+    int seed = 0;
+    bool useprng = false;
+    string scaleByKey;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"rate",  required_argument, 0, 'r'},
+                {"scale-by",  required_argument, 0, 's'},
+                {"random-seed",  required_argument, 0, 'p'},
+                {"pseudorandom-seed",  no_argument, 0, 'q'},
+                {0, 0, 0, 0}
+            };
+        // -q is a switch ("q" without ':' below), so its long form takes no value either.
+        int option_index = 0;
+        c = getopt_long (argc, argv, "hqr:s:p:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+        case 'r':
+            rate = atof(optarg);
+            break;
+
+        case 's':
+            scaleByKey = optarg;
+            break;
+
+        case 'p':
+            seed = atoi(optarg);
+            break;
+
+        case 'q':
+            useprng = true;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            break;
+
+        case '?':
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open VCF file" << endl;
+        return 1;
+    }
+
+    // No seed given (a seed of 0 counts as "not given"): fill it with raw bytes from the random device.
+    if (!seed) {
+        fstream random;
+        if (useprng) {
+            random.open("/dev/urandom", fstream::in);
+        } else {
+            random.open("/dev/random", fstream::in);
+        }
+        random.read((char*) &seed, sizeof(int)); // read(), unlike get(), copies all sizeof(int) bytes verbatim
+        random.close();
+    }
+
+    init_genrand(seed);
+
+    vector<string> args;
+    for (int i = 0; i < argc; ++i) {
+        args.push_back(argv[i]);
+    }
+
+    stringstream liness;
+    liness << "##sampling=\"random sampling using "
+           << join(args, " ")
+           << " using random seed "
+           << seed << "\"";
+    variantFile.addHeaderLine(liness.str());
+
+    cout << variantFile.header << endl;
+
+    // check that we can use the scaling key
+    if (!scaleByKey.empty()) {
+        if (variantFile.infoTypes.find(scaleByKey) == variantFile.infoTypes.end()) {
+            cerr << "could not find info key " << scaleByKey << endl;
+            exit(1);
+        } else {
+            if (variantFile.infoTypes[scaleByKey] != FIELD_FLOAT) {
+                cerr << "cannot use " << scaleByKey << " as a scaling factor, as it is not of type Float" << endl;
+                exit(1);
+            }
+        }
+    }
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        double randN = genrand_real1();
+        if (!scaleByKey.empty()) {
+            if (var.info.find(scaleByKey) != var.info.end()) {
+                double val = 0; // accumulator for the mean below; must start at zero
+
+                // hack: when the field holds several values, scale by their mean
+                // really, this is only suitable for AF stuff
+                vector<string>& vals = var.info[scaleByKey];
+                for (vector<string>::iterator b = vals.begin(); b != vals.end(); ++b) {
+                    double f = 0;
+                    convert(*b, f);
+                    val += f;
+                }
+                val /= vals.size();
+
+                if (val > 1) {
+                    cerr << "cannot scale by " << scaleByKey << "=" << val << " as it is > 1" << endl;
+                    exit(1);
+                }
+                randN *= val;
+            }
+        }
+        if (randN < rate) {
+            cout << var << endl;
+        }
+    }
+
+    return 0;
+
+}
diff --git a/src/vcfremap.cpp b/src/vcfremap.cpp
new file mode 100644
index 0000000..82c9997
--- /dev/null
+++ b/src/vcfremap.cpp
@@ -0,0 +1,350 @@
+#include "Variant.h"
+#include "BedReader.h"
+#include "intervaltree/IntervalTree.h"
+#include <getopt.h>
+#include "fastahack/Fasta.h"
+#include <algorithm>
+#include <list>
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+
+// Print the vcfremap option summary on stderr and exit with status 0.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl << endl;
+    cerr << "options:" << endl;
+    cerr << "    -w, --ref-window-size N      align using this many bases flanking each side of the reference allele" << endl;
+    cerr << "    -s, --alt-window-size N      align using this many flanking bases from the reference around each alternate allele" << endl;
+    cerr << "    -r, --reference FILE         FASTA reference file, required with -i and -u" << endl;
+    cerr << "    -m, --match-score N          match score for SW algorithm" << endl;
+    cerr << "    -x, --mismatch-score N       mismatch score for SW algorithm" << endl;
+    cerr << "    -o, --gap-open-penalty N     gap open penalty for SW algorithm" << endl;
+    cerr << "    -e, --gap-extend-penalty N   gap extension penalty for SW algorithm" << endl;
+    cerr << "    -z, --entropy-gap-open       use entropy scaling for the gap open penalty" << endl;
+    cerr << "    -R, --repeat-gap-extend N    penalize non-repeat-unit gaps in repeat sequence" << endl;
+    cerr << "    -a, --adjust-vcf TAG         supply a new cigar as TAG in the output VCF" << endl << endl;
+
+    cerr << "For each alternate allele, attempt to realign against the reference with lowered gap open penalty." << endl;
+    cerr << "If realignment is possible, adjust the cigar and reference/alternate alleles." << endl;
+    exit(0);
+}
+
+int main(int argc, char** argv) {
+    // Realign every ALT allele to a window of reference sequence and print the alignment.
+    string vcfFileName;
+    string fastaFileName;
+    int windowsize = 100;
+    bool includePreviousBaseForIndels = false;
+    bool useMNPs = true;
+    int altwindowsize = 50;
+
+    // constants for SmithWaterman algorithm
+    float matchScore = 10.0f;
+    float mismatchScore = -9.0f;
+    float gapOpenPenalty = 15.0f;
+    float gapExtendPenalty = 6.66f;
+
+    bool useEntropy = false;
+    bool useRepeatGapExtendPenalty = false;
+    float repeatGapExtendPenalty = 1;
+
+    bool adjustVcf = false;
+    string adjustedTag = "remappedCIGAR";
+
+    if (argc == 1)
+        printSummary(argv);
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                // Each long option maps onto its short letter.  --repeat-gap-extend must
+                // require a value, matching "R:" below, because case 'R' reads optarg.
+                {"help", no_argument, 0, 'h'},
+                {"ref-window-size", required_argument, 0, 'w'},
+                {"reference", required_argument, 0, 'r'},
+                {"match-score", required_argument, 0, 'm'},
+                {"mismatch-score", required_argument, 0, 'x'},
+                {"gap-open-penalty", required_argument, 0, 'o'},
+                {"gap-extend-penalty", required_argument, 0, 'e'},
+                {"alt-window-size", required_argument, 0, 's'},
+                {"entropy-gap-open", no_argument, 0, 'z'},
+                {"repeat-gap-extend", required_argument, 0, 'R'},
+                {"adjust-vcf", required_argument, 0, 'a'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hza:w:r:m:x:o:e:s:R:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+        case 'w':
+            windowsize = atoi(optarg);
+            break;
+
+        case 'a':
+            adjustVcf = true;
+            adjustedTag = optarg;
+            break;
+
+        case 'r':
+            fastaFileName = string(optarg);
+            break;
+
+        case 'h':
+            printSummary(argv);
+            break;
+
+        case 'm':
+            matchScore = atof(optarg);
+            break;
+
+        case 'x':
+            mismatchScore = atof(optarg);
+            break;
+
+        case 'o':
+            gapOpenPenalty = atof(optarg);
+            break;
+
+        case 'e':
+            gapExtendPenalty = atof(optarg);
+            break;
+
+        case 's':
+            altwindowsize = atoi(optarg);
+            break;
+
+        case 'z':
+            useEntropy = true;
+            break;
+
+        case 'R':
+            useRepeatGapExtendPenalty = true;
+            repeatGapExtendPenalty = atof(optarg);
+            break;
+
+        case '?':
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open VCF file" << endl;
+        exit(1);
+    }
+
+    FastaReference freference;
+    if (fastaFileName.empty()) {
+        cerr << "a reference is required" << endl;
+        exit(1);
+    } else {
+        freference.open(fastaFileName);
+    }
+
+    if (adjustVcf) {
+        vector<string> commandline;
+        for (int i = 0; i < argc; ++i)
+            commandline.push_back(argv[i]);
+        variantFile.addHeaderLine("##INFO=<ID=" + adjustedTag + ",Number=A,Type=String,Description=\"CIGAR when remapped using"+ join(commandline, " ") +"\">");
+    }
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        // The incoming record is echoed first, with or without -a.
+        cout << endl;
+        cout << var << endl;
+
+        map<string, vector<VariantAllele> > variantAlleles;
+        vector<vector<pair<int, char> > > cigars;
+        vector<int> positionDiffs;
+        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+            // a blank line separates the report for each alternate
+            cout << endl;
+
+            // try to remap locally
+
+            string reference = freference.getSubSequence(var.sequenceName, var.position - 1 - windowsize, windowsize * 2 + var.ref.size());
+
+            // passed to sw align
+            unsigned int referencePos;
+            string cigar;
+
+            string& alternate = *a;
+
+            vector<VariantAllele>& variants = variantAlleles[alternate];
+
+            string alternateQuery = reference.substr(windowsize - altwindowsize, altwindowsize) + alternate + reference.substr(reference.size() - windowsize, altwindowsize);
+
+            // The query is the alternate allele with altwindowsize reference bases on each side.
+            // NOTE(review): substr() would throw if altwindowsize exceeded windowsize -- confirm intended limits.
+
+            CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
+            if (useEntropy) sw.EnableEntropyGapPenalty(1);
+            if (useRepeatGapExtendPenalty) sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty);
+            sw.Align(referencePos, cigar, reference, alternateQuery);
+
+            int altpos = 0;
+            int refpos = 0;
+            int len;
+            string slen;
+            vector<pair<int, char> > cigarData;
+
+            string ref = reference.substr(referencePos);
+            positionDiffs.push_back(referencePos); // TODO this... is borked
+
+            stringstream refss;
+            stringstream altss;
+
+            if (!adjustVcf) cout << cigar << endl;
+            cout << cigar << endl; // NOTE(review): so the cigar is printed twice when -a is not given
+            for (string::iterator c = cigar.begin(); c != cigar.end(); ++c) {
+                switch (*c) {
+                case 'I':
+                    len = atoi(slen.c_str());
+                    slen.clear();
+                    if (altpos < altwindowsize) { // insertion starting inside the left flank is recorded as 'M'
+                        cigarData.push_back(make_pair(len, 'M'));
+                    } else {
+                        cigarData.push_back(make_pair(len, *c));
+                    }
+                    altss << alternateQuery.substr(altpos, len);
+                    refss << string(len, '-');
+                    altpos += len;
+                    break;
+                case 'D':
+                    len = atoi(slen.c_str());
+                    slen.clear();
+                    if (altpos < altwindowsize) { // deletion starting inside the left flank is not recorded
+                    } else {
+                        cigarData.push_back(make_pair(len, *c));
+                    }
+                    refss << ref.substr(refpos, len);
+                    altss << string(len, '-');
+                    refpos += len;
+                    break;
+                case 'M':
+                    len = atoi(slen.c_str());
+                    slen.clear();
+                    {
+                        for (int i = 0; i < len; ++i) { // split the run into 'M' (equal) and 'X' (different) stretches
+                            if (ref.at(refpos + i) == alternateQuery.at(altpos + i)) {
+                                if (!cigarData.empty() && cigarData.back().second == 'M') {
+                                    cigarData.back().first++;
+                                } else {
+                                    cigarData.push_back(make_pair(1, 'M'));
+                                }
+                            } else {
+                                if (!cigarData.empty() && cigarData.back().second == 'X') {
+                                    cigarData.back().first++;
+                                } else {
+                                    cigarData.push_back(make_pair(1, 'X'));
+                                }
+                            }
+                        }
+                    }
+                    refss << ref.substr(refpos, len);
+                    altss << alternateQuery.substr(altpos, len);
+                    refpos += len;
+                    altpos += len;
+                    break;
+                case 'S':
+                    len = atoi(slen.c_str());
+                    slen.clear();
+                    cigarData.push_back(make_pair(len, *c));
+                    refss << ref.substr(refpos, len);
+                    //altss << alternateQuery.substr(altpos, len); // TODO deal with soft clipping, weird behavior
+                    refpos += len;
+                    altpos += len;
+                    break;
+                default: // any other character (the digits of a length) is collected until the next operation letter
+                    len = 0;
+                    slen += *c;
+                    break;
+                }
+            }
+
+            if (!adjustVcf) {
+                cout << "ref:\t" << refss.str() << endl;
+                cout << "alt:\t" << altss.str() << endl;
+            } else {
+                cout << "ref:\t" << refss.str() << endl;
+                cout << "alt:\t" << altss.str() << endl;
+                cigars.push_back(cigarData);
+            }
+
+        }
+
+        if (adjustVcf) {
+            int substart = cigars.front().front().first; // NOTE(review): assumes at least one ALT and a non-empty cigar
+            int subend = cigars.front().back().first;
+
+            // find the shortest leading and trailing match shared by all alternates
+            for (vector<vector<pair<int, char> > >::iterator c = cigars.begin(); c != cigars.end(); ++c) {
+                if (c->front().second == 'M' && c->front().first <= substart) {
+                    substart = c->front().first;
+                    if (c->size() > 1 && c->at(1).second != 'X') {
+                        --substart;
+                    }
+                }
+                if (c->back().second == 'M' && c->back().first <= subend) {
+                    subend = c->back().first;
+                }
+            }
+
+            // adjust the cigars and get the new reference length
+            int reflen = 0;
+            for (vector<vector<pair<int, char> > >::iterator c = cigars.begin(); c != cigars.end(); ++c) {
+                c->front().first -= substart;
+                c->back().first -= subend;
+                int crf = cigarRefLen(*c);
+                if (crf > reflen)
+                    reflen = crf;
+                var.info[adjustedTag].push_back(joinCigar(*c));
+            }
+
+            // find the lowest positional difference
+            int pdiff = 0;
+            for (vector<int>::iterator d = positionDiffs.begin(); d != positionDiffs.end(); ++d) {
+                if (*d + altwindowsize < pdiff)
+                    pdiff = *d + altwindowsize;
+            }
+
+            // adjust the variant position
+            var.position += pdiff;
+
+            // re-read the reference allele at the adjusted position
+            var.ref = freference.getSubSequence(var.sequenceName, var.position - 1, reflen);
+
+            cout << var << endl;
+        }
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfremoveaberrantgenotypes.cpp b/src/vcfremoveaberrantgenotypes.cpp
new file mode 100644
index 0000000..75ebc32
--- /dev/null
+++ b/src/vcfremoveaberrantgenotypes.cpp
@@ -0,0 +1,75 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <sstream>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+// Removes from `var` every sample whose homozygous genotype call is
+// contradicted by its own observation counts:
+//   - homozygous non-reference calls with a positive reference count (RO)
+//   - homozygous reference calls with a positive count (AO) for any alternate
+// Samples without a GT value are left untouched, and a missing RO/AO entry is
+// treated as "no observations".
+void stripAberrant(Variant& var) {
+    typedef map<string, vector<string> > SampleFields;
+
+    map<string, SampleFields>::iterator s = var.samples.begin();
+    while (s != var.samples.end()) {
+        SampleFields& sample = s->second;
+        bool aberrant = false;
+
+        // look the fields up with find(): operator[] would insert an empty
+        // vector for a missing key and front() on it is undefined
+        SampleFields::iterator gt = sample.find("GT");
+        if (gt != sample.end() && !gt->second.empty()) {
+            map<int, int> genotype = decomposeGenotype(gt->second.front());
+            if (isHomNonRef(genotype)) {
+                int refobs = 0;
+                SampleFields::iterator ro = sample.find("RO");
+                if (ro != sample.end() && !ro->second.empty()) {
+                    convert(ro->second.front(), refobs);
+                }
+                aberrant = refobs > 0;
+            } else if (isHomRef(genotype)) {
+                SampleFields::iterator ao = sample.find("AO");
+                if (ao != sample.end()) {
+                    for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+                        int alleleIndex = var.altAlleleIndexes[*a];
+                        if (alleleIndex < 0 || (size_t) alleleIndex >= ao->second.size()) {
+                            continue;  // no AO value recorded for this alternate
+                        }
+                        int altobs = 0;
+                        convert(ao->second.at(alleleIndex), altobs);
+                        if (altobs > 0) {
+                            aberrant = true;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+
+        // erasing invalidates the erased iterator, so step past the element
+        // before it is removed instead of incrementing afterwards
+        if (aberrant) {
+            var.samples.erase(s++);
+        } else {
+            ++s;
+        }
+    }
+}
+
+// Entry point: reads the VCF named on the command line ("-" for stdin),
+// appends a header line describing the filter, and writes every record after
+// stripAberrant() has been applied to it.
+int main(int argc, char** argv) {
+
+    if (argc != 2) {
+        cerr << "usage: " << argv[0] << " <vcf file>" << endl
+             << "strips samples which are homozygous but have observations implying heterozygosity" << endl;
+        return 1;
+    }
+
+    string path = argv[1];
+
+    VariantCallFile input;
+    if (path != "-") {
+        input.open(path);
+    } else {
+        input.open(std::cin);
+    }
+
+    if (!input.is_open()) {
+        cerr << "could not open " << path << endl;
+        return 1;
+    }
+
+    Variant record(input);
+
+    // record in the header what this tool did to the samples
+    // NOTE(review): the line ends in '>' without a matching '<' -- confirm the intended header syntax
+    string filterLine = "##filter=\"removed homozygous genotypes which have observations implying heterozygosity\">";
+    input.addHeaderLine(filterLine);
+
+    // emit the amended header
+    cout << input.header << endl;
+
+    // emit each record once its aberrant samples have been stripped
+    while (input.getNextVariant(record)) {
+        stripAberrant(record);
+        cout << record << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfremovesamples.cpp b/src/vcfremovesamples.cpp
new file mode 100644
index 0000000..b4b31df
--- /dev/null
+++ b/src/vcfremovesamples.cpp
@@ -0,0 +1,76 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+// Returns the elements of b that do not occur in a.
+// The result is keyed through a map on the way out, so it comes back
+// de-duplicated and in ascending order rather than in b's original order.
+template<class T>
+vector<T> removeElems(vector<T>& a, vector<T>& b) {
+    // membership table for everything that has to be dropped
+    map<T, bool> dropped;
+    for (typename vector<T>::size_type i = 0; i < a.size(); ++i) {
+        dropped[a[i]] = true;
+    }
+
+    // survivors of b, collected by key
+    map<T, bool> kept;
+    for (typename vector<T>::size_type i = 0; i < b.size(); ++i) {
+        if (dropped.count(b[i]) == 0) {
+            kept[b[i]] = true;
+        }
+    }
+
+    vector<T> survivors;
+    typename map<T, bool>::iterator k = kept.begin();
+    while (k != kept.end()) {
+        survivors.push_back(k->first);
+        ++k;
+    }
+    return survivors;
+}
+
+// Entry point: writes the VCF named by the first argument ("-" for stdin) with
+// the samples listed in the remaining arguments removed from the header and
+// from every record.
+int main(int argc, char** argv) {
+
+    if (argc < 3) {
+        cerr << "usage: " << argv[0] << " <vcf file> [SAMPLE1] [SAMPLE2] ..." << endl
+             << "outputs each record in the vcf file, removing samples listed on the command line" << endl;
+        return 1;
+    }
+
+    string path = argv[1];
+
+    // everything after the file name is a sample to drop
+    vector<string> unwanted(argv + 2, argv + argc);
+
+    VariantCallFile input;
+    if (path != "-") {
+        input.open(path);
+    } else {
+        input.open(std::cin);
+    }
+
+    if (!input.is_open()) {
+        return 1;
+    }
+
+    Variant record(input);
+
+    vector<string> retained = removeElems(unwanted, input.sampleNames);
+
+    // the header lists only the retained samples
+    input.updateSamples(retained);
+
+    // and each record prints only the retained samples
+    record.setOutputSampleNames(retained);
+
+    // emit the amended header
+    cout << input.header << endl;
+
+    // emit the records; the sample filtering happens through the output
+    // sample names set on the record above
+    while (input.getNextVariant(record)) {
+        cout << record << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfroc.cpp b/src/vcfroc.cpp
new file mode 100644
index 0000000..e77562e
--- /dev/null
+++ b/src/vcfroc.cpp
@@ -0,0 +1,469 @@
+#include "Variant.h"
+#include "BedReader.h"
+#include "intervaltree/IntervalTree.h"
+#include <getopt.h>
+#include "fastahack/Fasta.h"
+#include <algorithm>
+#include <list>
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+
+// Writes the usage text for vcfroc to stderr and terminates the process with
+// exit status 0.
+void printSummary(char** argv) {
+    static const char* const body[] = {
+        "",
+        "options:",
+        "    -t, --truth-vcf FILE      use this VCF as ground truth for ROC generation",
+        "    -w, --window-size N       compare records up to this many bp away (default 30)",
+        "    -c, --complex             directly compare complex alleles, don't parse into primitives",
+        "    -r, --reference FILE      FASTA reference file",
+        "",
+        "Generates a pseudo-ROC curve using sensitivity and specificity estimated against",
+        "a putative truth set.  Thresholding is provided by successive QUAL cutoffs."
+    };
+
+    cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl;
+    for (size_t i = 0; i < sizeof(body) / sizeof(body[0]); ++i) {
+        cerr << body[i] << endl;
+    }
+    exit(0);
+}
+
+// Reads every record from variantFile, stores a copy in `variants`, and builds
+// one interval tree per sequence name in `variantIntervals`.  Each interval
+// spans [position, position + ref length) and carries a pointer to the copy
+// held in `variants`, so that list must outlive the trees.
+void buildVariantIntervalTree(VariantCallFile& variantFile,
+                              map<string, IntervalTree<Variant*> >& variantIntervals,
+                              list<Variant>& variants) {
+
+    typedef vector<Interval<Variant*> > IntervalList;
+
+    map<string, IntervalList> intervalsBySequence;
+    Variant record(variantFile);
+    while (variantFile.getNextVariant(record)) {
+        // store the copy first; the interval points at the stored element
+        variants.push_back(record);
+        Variant* stored = &variants.back();
+
+        long int start = record.position;
+        long int stop = start + record.ref.size(); // one past the last reference base
+        intervalsBySequence[record.sequenceName].push_back(Interval<Variant*>(start, stop, stored));
+    }
+
+    map<string, IntervalList>::iterator seq = intervalsBySequence.begin();
+    while (seq != intervalsBySequence.end()) {
+        variantIntervals[seq->first] = IntervalTree<Variant*>(seq->second);
+        ++seq;
+    }
+}
+
+
+// Sorts the alternate alleles of `var` into those that also occur in the
+// variants stored in `variantIntervals` (commonAlleles) and those that do not
+// (uniqueAlleles).  Candidates are the stored variants returned by
+// findContained() for the span of `var` padded by `windowsize` on both sides.
+// Alleles are compared as haplotypes: each alternate is substituted into the
+// reference sequence covering `var` and all candidates, and two alleles match
+// when the resulting strings are identical.  The vectors receive pointers into
+// var.alt.
+void intersectVariant(Variant& var,
+                      map<string, IntervalTree<Variant*> >& variantIntervals,
+                      vector<string*>& commonAlleles,
+                      vector<string*>& uniqueAlleles,
+                      FastaReference& reference,
+                      int windowsize = 50) {
+
+    vector<Interval<Variant*> > hits;
+    variantIntervals[var.sequenceName].findContained(var.position - windowsize, var.position + var.ref.size() + windowsize, hits);
+
+    // nothing nearby: every alternate is unique to this variant
+    if (hits.empty()) {
+        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+            uniqueAlleles.push_back(&*a);
+        }
+        return;
+    }
+
+    // widen the haplotype bounds so they cover this variant and every hit
+    int haplotypeStart = var.position;
+    int haplotypeEnd = var.position + var.ref.size();
+    for (vector<Interval<Variant*> >::iterator h = hits.begin(); h != hits.end(); ++h) {
+        Variant* other = h->value;
+        haplotypeStart = min(other->position, (long int) haplotypeStart);
+        haplotypeEnd = max(other->position + other->ref.size(), (long unsigned int) haplotypeEnd);
+    }
+
+    string referenceHaplotype = reference.getSubSequence(var.sequenceName, haplotypeStart - 1, haplotypeEnd - haplotypeStart);
+
+    // every haplotype produced by an alternate of a hit, with the variant and
+    // alternate index that produced it
+    map<string, vector<pair<Variant*, int> > > knownHaplotypes;
+    for (vector<Interval<Variant*> >::iterator h = hits.begin(); h != hits.end(); ++h) {
+        Variant* other = h->value;
+        int offset = other->position - haplotypeStart;
+        for (size_t altindex = 0; altindex < other->alt.size(); ++altindex) {
+            string haplotype = referenceHaplotype;
+            haplotype.replace(offset, other->ref.size(), other->alt[altindex]);
+            knownHaplotypes[haplotype].push_back(make_pair(other, (int) altindex));
+        }
+    }
+
+    // an alternate of var is common exactly when its haplotype is already known
+    int ownOffset = var.position - haplotypeStart;
+    for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+        string haplotype = referenceHaplotype;
+        haplotype.replace(ownOffset, var.ref.size(), *a);
+        vector<string*>& bucket =
+            knownHaplotypes.find(haplotype) == knownHaplotypes.end() ? uniqueAlleles : commonAlleles;
+        bucket.push_back(&*a);
+    }
+}
+
+
+// Classifies an allele by the lengths of its ref and alt strings and adds
+// `delta` to the matching counter: ref and alt both of length 1 count as a
+// SNP, unequal lengths where either side has length 1 count as an indel, and
+// everything else counts as complex.
+static void tallyAllele(const VariantAllele& allele, int delta,
+                        int& snps, int& indels, int& complexes) {
+    if (allele.ref.size() == 1 && allele.ref.size() == allele.alt.size()) {
+        snps += delta;
+    } else if (allele.ref.size() != allele.alt.size()
+               && (allele.ref.size() == 1 || allele.alt.size() == 1)) {
+        indels += delta;
+    } else {
+        complexes += delta;
+    }
+}
+
+// Entry point of vcfroc.  Reads a test VCF (file argument or stdin) and a
+// truth VCF (-t), intersects their alleles by local haplotype using the FASTA
+// reference (-r), and prints one tab-separated row per distinct QUAL value of
+// the test set with the number of SNP, indel and complex alleles remaining
+// and the false positive / false negative counts in each class.
+int main(int argc, char** argv) {
+
+    string truthVcfFileName;
+    string fastaFileName;
+    bool complex = false;
+    int windowsize = 30;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"window-size", required_argument, 0, 'w'},
+                {"reference", required_argument, 0, 'r'},
+                // a plain switch, matching the bare 'c' in the short option string
+                {"complex", no_argument, 0, 'c'},
+                {"truth-vcf", required_argument, 0, 't'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hcw:r:t:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+        case 'w':
+            windowsize = atoi(optarg);
+            break;
+
+        case 'r':
+            fastaFileName = string(optarg);
+            break;
+
+        case 't':
+            truthVcfFileName = optarg;
+            break;
+
+        case 'c':
+            complex = true;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            break;
+
+        case '?':
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    bool usingstdin = false;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+        usingstdin = true;
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open VCF file" << endl;
+        exit(1);
+    }
+
+    // without a truth set there is nothing to compare against, and reading
+    // from a VariantCallFile that was never opened must be avoided
+    if (truthVcfFileName.empty()) {
+        cerr << "a truth VCF (-t) is required for ROC generation" << endl;
+        exit(1);
+    }
+
+    VariantCallFile truthVariantFile;
+    if (truthVcfFileName == "-") {
+        if (usingstdin) {
+            cerr << "cannot open both VCF file streams from stdin" << endl;
+            exit(1);
+        } else {
+            truthVariantFile.open(std::cin);
+        }
+    } else {
+        truthVariantFile.open(truthVcfFileName);
+    }
+    if (!truthVariantFile.is_open()) {
+        cerr << "could not open VCF file " << truthVcfFileName << endl;
+        exit(1);
+    }
+
+    FastaReference reference;
+    if (fastaFileName.empty()) {
+        cerr << "a reference is required for the haplotype-based intersection used by vcfroc" << endl;
+        exit(1);
+    }
+    reference.open(fastaFileName);
+
+    // load both VCFs into per-sequence interval trees; the lists own the
+    // records the trees point at
+    map<string, IntervalTree<Variant*> > truthVariantIntervals;
+    list<Variant> truthVariants;
+    buildVariantIntervalTree(truthVariantFile, truthVariantIntervals, truthVariants);
+
+    map<string, IntervalTree<Variant*> > testVariantIntervals;
+    list<Variant> testVariants;
+    buildVariantIntervalTree(variantFile, testVariantIntervals, testVariants);
+
+    map<long double, vector<VariantAllele*> > falseNegativeAllelesAtCutoff;  // false negative after this cutoff
+    map<long double, vector<VariantAllele*> > falsePositiveAllelesAtCutoff;  // false positive until this cutoff
+    list<VariantAllele*> allFalsePositiveAlleles;
+    map<long double, vector<VariantAllele*> > allelesAtCutoff;
+    // owns every VariantAllele referenced by pointer from the containers above
+    map<Variant*, map<string, vector<VariantAllele> > > parsedAlleles;
+    map<long double, vector<Variant*> > callsByCutoff;
+
+    // replicate this method, where Q is for each unique Q in the set
+    //vcfintersect -r $reference -v -i $results.$Q.vcf $answers_primitives | vcfstats >false_negatives.$Q.stats
+    //vcfintersect -r $reference -v -i $answers_primitives $results.$Q.vcf | vcfstats >false_positives.$Q.stats
+
+    for (list<Variant>::iterator v = testVariants.begin(); v != testVariants.end(); ++v) {
+        // TODO allow different cutoff sources
+        callsByCutoff[v->quality].push_back(&*v);
+    }
+
+    // truth alleles with no match in the test set are false negatives at any cutoff
+    for (list<Variant>::iterator v = truthVariants.begin(); v != truthVariants.end(); ++v) {
+        Variant& variant = *v;
+        vector<string*> commonAlleles;
+        vector<string*> uniqueAlleles;
+        intersectVariant(variant, testVariantIntervals,
+                         commonAlleles, uniqueAlleles, reference, windowsize);
+        if (complex) {
+            parsedAlleles[&*v] = variant.flatAlternates();
+        } else {
+            parsedAlleles[&*v] = variant.parsedAlternates();
+        }
+        map<string, vector<VariantAllele> >& parsedAlts = parsedAlleles[&*v];
+        for (vector<string*>::iterator a = uniqueAlleles.begin(); a != uniqueAlleles.end(); ++a) {
+            vector<VariantAllele>& alleles = parsedAlts[**a];
+            for (vector<VariantAllele>::iterator va = alleles.begin(); va != alleles.end(); ++va) {
+                if (va->ref != va->alt) {         // use only non-reference alleles
+                    // stored under the pseudo-threshold -1 XXX --- may not apply if threshold is generalized
+                    falseNegativeAllelesAtCutoff[-1].push_back(&*va);
+                }
+            }
+        }
+    }
+
+    // file the test alleles under the QUAL of their record
+    for (map<long double, vector<Variant*> >::iterator q = callsByCutoff.begin(); q != callsByCutoff.end(); ++q) {
+        long double threshold = q->first;
+        vector<Variant*>& variants = q->second;
+        for (vector<Variant*>::iterator v = variants.begin(); v != variants.end(); ++v) {
+            Variant& variant = **v;
+            vector<string*> commonAlleles;
+            vector<string*> uniqueAlleles;
+            intersectVariant(variant, truthVariantIntervals,
+                             commonAlleles, uniqueAlleles, reference, windowsize);
+            if (complex) {
+                parsedAlleles[*v] = variant.flatAlternates();
+            } else {
+                parsedAlleles[*v] = variant.parsedAlternates();
+            }
+
+            map<string, vector<VariantAllele> >& parsedAlts = parsedAlleles[*v];
+            // alleles shared with the truth set turn into false negatives once
+            // the cutoff passes this record's QUAL
+            for (vector<string*>::iterator a = commonAlleles.begin(); a != commonAlleles.end(); ++a) {
+                vector<VariantAllele>& alleles = parsedAlts[**a];
+                for (vector<VariantAllele>::iterator va = alleles.begin(); va != alleles.end(); ++va) {
+                    if (va->ref != va->alt) {         // use only non-reference alleles
+                        allelesAtCutoff[threshold].push_back(&*va);
+                        falseNegativeAllelesAtCutoff[threshold].push_back(&*va);
+                    }
+                }
+            }
+            // alleles absent from the truth set are false positives until the
+            // cutoff passes this record's QUAL
+            for (vector<string*>::iterator a = uniqueAlleles.begin(); a != uniqueAlleles.end(); ++a) {
+                vector<VariantAllele>& alleles = parsedAlts[**a];
+                for (vector<VariantAllele>::iterator va = alleles.begin(); va != alleles.end(); ++va) {
+                    if (va->ref != va->alt) {         // use only non-reference alleles
+                        allelesAtCutoff[threshold].push_back(&*va);
+                        allFalsePositiveAlleles.push_back(&*va);
+                        falsePositiveAllelesAtCutoff[threshold].push_back(&*va);
+                    }
+                }
+            }
+        }
+    }
+
+
+    // output results
+    int totalSNPs = 0;
+    int falsePositiveSNPs = 0;
+    int falseNegativeSNPs = 0;
+    int totalIndels = 0;
+    int falsePositiveIndels = 0;
+    int falseNegativeIndels = 0;
+    int totalComplex = 0;
+    int falsePositiveComplex = 0;
+    int falseNegativeComplex = 0;
+
+    // write header
+
+    cout << "threshold" << "\t"
+         << "num_snps" << "\t"
+         << "false_positive_snps" << "\t"
+         << "false_negative_snps" << "\t"
+         << "num_indels" << "\t"
+         << "false_positive_indels" << "\t"
+         << "false_negative_indels" << "\t"
+         << "num_complex" << "\t"
+         << "false_positive_complex" << "\t"
+         << "false_negative_complex" << endl;
+
+    // count total alleles in set
+    for (map<long double, vector<VariantAllele*> >::iterator a = allelesAtCutoff.begin(); a != allelesAtCutoff.end(); ++a) {
+        vector<VariantAllele*>& alleles = a->second;
+        for (vector<VariantAllele*>::iterator va = alleles.begin(); va != alleles.end(); ++va) {
+            tallyAllele(**va, 1, totalSNPs, totalIndels, totalComplex);
+        }
+    }
+
+    // tally total false positives
+    for (list<VariantAllele*>::iterator va = allFalsePositiveAlleles.begin(); va != allFalsePositiveAlleles.end(); ++va) {
+        tallyAllele(**va, 1, falsePositiveSNPs, falsePositiveIndels, falsePositiveComplex);
+    }
+
+    // get categorical false negatives
+    vector<VariantAllele*>& categoricalFalseNegatives = falseNegativeAllelesAtCutoff[-1];
+    for (vector<VariantAllele*>::iterator va = categoricalFalseNegatives.begin(); va != categoricalFalseNegatives.end(); ++va) {
+        tallyAllele(**va, 1, falseNegativeSNPs, falseNegativeIndels, falseNegativeComplex);
+    }
+    cout << -1 << "\t"
+         << totalSNPs << "\t"
+         << falsePositiveSNPs << "\t"
+         << falseNegativeSNPs << "\t"
+         << totalIndels << "\t"
+         << falsePositiveIndels << "\t"
+         << falseNegativeIndels << "\t"
+         << totalComplex << "\t"
+         << falsePositiveComplex << "\t"
+         << falseNegativeComplex << endl;
+
+    // walk the cutoffs in ascending order; at each one the alleles filed under
+    // it leave the call set, its shared alleles become false negatives and its
+    // unmatched alleles stop being false positives
+    for (map<long double, vector<VariantAllele*> >::iterator a = allelesAtCutoff.begin(); a != allelesAtCutoff.end(); ++a) {
+        vector<VariantAllele*>& alleles = a->second;
+        long double threshold = a->first;
+        for (vector<VariantAllele*>::iterator va = alleles.begin(); va != alleles.end(); ++va) {
+            tallyAllele(**va, -1, totalSNPs, totalIndels, totalComplex);
+        }
+        vector<VariantAllele*>& falseNegatives = falseNegativeAllelesAtCutoff[threshold];
+        for (vector<VariantAllele*>::iterator va = falseNegatives.begin(); va != falseNegatives.end(); ++va) {
+            tallyAllele(**va, 1, falseNegativeSNPs, falseNegativeIndels, falseNegativeComplex);
+        }
+        vector<VariantAllele*>& falsePositives = falsePositiveAllelesAtCutoff[threshold];
+        for (vector<VariantAllele*>::iterator va = falsePositives.begin(); va != falsePositives.end(); ++va) {
+            tallyAllele(**va, -1, falsePositiveSNPs, falsePositiveIndels, falsePositiveComplex);
+        }
+        cout << threshold << "\t"
+             << totalSNPs << "\t"
+             << falsePositiveSNPs << "\t"
+             << falseNegativeSNPs << "\t"
+             << totalIndels << "\t"
+             << falsePositiveIndels << "\t"
+             << falseNegativeIndels << "\t"
+             << totalComplex << "\t"
+             << falsePositiveComplex << "\t"
+             << falseNegativeComplex << endl;
+
+    }
+
+    // NOTE(review): the explicit exit() skips the destructors of the locals
+    // above; the original author also questioned it ("why?") -- kept as found
+    exit(0);
+    return 0;
+
+}
+
diff --git a/src/vcfsample2info.cpp b/src/vcfsample2info.cpp
new file mode 100644
index 0000000..2beab59
--- /dev/null
+++ b/src/vcfsample2info.cpp
@@ -0,0 +1,218 @@
+#include "Variant.h"
+#include "split.h"
+#include "fastahack/Fasta.h"
+#include <getopt.h>
+#include <algorithm>
+#include <numeric>
+
+using namespace std;
+using namespace vcf;
+
+// Writes the usage text for vcfsample2info to stderr and terminates the
+// process with exit status 0.
+void printSummary(char** argv) {
+    static const char* const body[] = {
+        "",
+        "options:",
+        "    -f, --field         Add information about this field in samples to INFO column",
+        "    -i, --info          Store the computed statistic in this info field",
+        "    -a, --average       Take the mean of samples for field (default)",
+        "    -m, --median        Use the median",
+        "    -n, --min           Use the min",
+        "    -x, --max           Use the max",
+        "",
+        "Take annotations given in the per-sample fields and add the mean, median, min, or max",
+        "to the site-level INFO.",
+        ""
+    };
+
+    cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl;
+    for (size_t i = 0; i < sizeof(body) / sizeof(body[0]); ++i) {
+        cerr << body[i] << endl;
+    }
+    exit(0);
+}
+
+// Returns the element that would sit at index size()/2 if v were sorted, i.e.
+// the middle value for odd sizes and the upper of the two middle values for
+// even sizes.  Partially reorders v in place.  v must not be empty.
+double median(vector<double> &v)
+{
+    vector<double>::iterator middle = v.begin() + v.size() / 2;
+    nth_element(v.begin(), middle, v.end());
+    return *middle;
+}
+
+// Returns the arithmetic mean of v: the left-to-right sum of its elements
+// divided by the element count.
+double mean(vector<double> &v)
+{
+    double total = 0.0;
+    for (vector<double>::const_iterator it = v.begin(); it != v.end(); ++it) {
+        total = total + *it;
+    }
+    return total / v.size();
+}
+
+enum StatType { MEAN, MEDIAN, MIN, MAX };
+
+// Entry point of vcfsample2info.  For every record of the input VCF (file
+// argument or stdin) collects the value of the per-sample field given by -f
+// from all samples that carry it, reduces the values with the selected
+// statistic (mean by default, or median / min / max), and stores the result
+// in the INFO field given by -i.  Records in which no sample carries the
+// field are written without that INFO field.
+int main(int argc, char** argv) {
+
+    int c;
+    string sampleField;
+    string infoField;
+    StatType statType = MEAN;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"field",  required_argument, 0, 'f'},
+                {"info",  required_argument, 0, 'i'},
+                {"average", no_argument, 0, 'a'},
+                {"median", no_argument, 0, 'm'},
+                {"min", no_argument, 0, 'n'},
+                {"max", no_argument, 0, 'x'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hamnxf:i:",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+        case 0:
+            /* If this option set a flag, do nothing else now. */
+            if (long_options[option_index].flag != 0)
+                break;
+            printf ("option %s", long_options[option_index].name);
+            if (optarg)
+                printf (" with arg %s", optarg);
+            printf ("\n");
+            break;
+
+        case 'f':
+            sampleField = optarg;
+            break;
+
+        case 'i':
+            infoField = optarg;
+            break;
+
+        case 'a':
+            statType = MEAN;
+            break;
+
+        case 'm':
+            statType = MEDIAN;
+            break;
+
+        case 'n':
+            statType = MIN;
+            break;
+
+        case 'x':
+            statType = MAX;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            exit(0);
+
+        case '?':
+            /* getopt_long already printed an error message. */
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    if (infoField.empty() || sampleField.empty()) {
+        cerr << "Error: both a sample field and an info field are required." << endl;
+        return 1;
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    string statTypeStr;
+
+    switch (statType) {
+    case MEAN:
+        statTypeStr = "mean";
+        break;
+    case MEDIAN:
+        statTypeStr = "median";
+        break;
+    case MIN:
+        statTypeStr = "min";
+        break;
+    case MAX:
+        statTypeStr = "max";
+        break;
+    default:
+        cerr << "Error: failure to convert stat type to string" << endl;
+        return 1;
+        break;
+    }
+
+    // describe the new INFO field in the header
+    variantFile.addHeaderLine("##INFO=<ID="+infoField+",Number=1,Type=Float,Description=\"Summary statistic generated by "+statTypeStr+" of per-sample values of "+sampleField+"\">");
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        // gather the field's value from every sample that carries it
+        vector<double> vals;
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
+             s != var.samples.end(); ++s) {
+            map<string, vector<string> >& sample = s->second;
+            map<string, vector<string> >::iterator f = sample.find(sampleField);
+            if (f == sample.end() || f->second.empty()) {
+                continue;
+            }
+            if (f->second.size() > 1) {
+                cerr << "Error: cannot handle sample fields with multiple values" << endl;
+                return 1;
+            }
+            double val = 0;
+            convert(f->second.front(), val);
+            vals.push_back(val);
+        }
+
+        // no sample carries the field: there is nothing to summarise, and the
+        // statistics below are undefined on an empty vector
+        if (vals.empty()) {
+            var.info.erase(infoField);
+            cout << var << endl;
+            continue;
+        }
+
+        double result = 0;
+        switch (statType) {
+        case MEAN:
+            result = mean(vals);
+            break;
+        case MEDIAN:
+            result = median(vals);
+            break;
+        case MIN:
+            result = *min_element(vals.begin(), vals.end());
+            break;
+        case MAX:
+            result = *max_element(vals.begin(), vals.end());
+            break;
+        default:
+            cerr << "Error: unrecognized StatType" << endl;
+            return 1;
+            break;
+        }
+
+        // replace whatever the record already held under this INFO key
+        var.info[infoField].clear();
+        var.info[infoField].push_back(convert(result));
+
+        cout << var << endl;
+
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfsamplediff.cpp b/src/vcfsamplediff.cpp
new file mode 100644
index 0000000..09ca242
--- /dev/null
+++ b/src/vcfsamplediff.cpp
@@ -0,0 +1,200 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+// Report whether the listed samples carry different GT strings in this record.
+// Samples missing from the record, or lacking a GT entry, are ignored; the
+// first non-empty GT string found is what every later one is compared to.
+bool samplesDiffer(vector<string>& samples, Variant& var) {
+    typedef map<string, vector<string> > SampleFields;
+    string reference;
+    for (size_t n = 0; n < samples.size(); ++n) {
+        map<string, SampleFields>::iterator entry = var.samples.find(samples[n]);
+        if (entry == var.samples.end()) {
+            continue; // sample not present in this record
+        }
+        SampleFields::iterator call = entry->second.find("GT");
+        if (call == entry->second.end()) {
+            continue; // no genotype recorded for this sample
+        }
+        const string& current = call->second.front();
+        if (reference.empty()) {
+            // nothing usable seen so far: adopt this genotype as the reference
+            reference = current;
+        } else if (reference != current) {
+            return true;
+        }
+    }
+    // no genotype disagreed with the reference
+    return false;
+}
+
+
+// Print usage to stderr. The text mirrors main(): exactly two samples, and tag values germline/somatic/reversion.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] <tag> <germline sample> <somatic sample> <vcf file>" << endl
+         << "Tags each record where the listed sample genotypes differ with <tag>." << endl
+         << "The first sample is assumed to be germline, the second somatic." << endl
+         << "Each record is tagged with <tag>={germline,somatic,reversion} to specify the type of" << endl
+         << "variant given the genotype difference between the two samples." << endl << endl
+         << "options:" << endl
+         << "    -s --strict     Require that no observations in the germline support the somatic alternate." << endl
+         << endl;
+}
+
+
+// vcfsamplediff entry point: tags each record with <tag>=germline|somatic|reversion by
+// comparing the GT of the germline (first) and somatic (second) sample, and adds an SSC
+// score when both samples carry GQ. Returns 0 on success, 1 on usage or input errors.
+int main(int argc, char** argv) {
+    bool strict = false;
+    int c;
+
+    while (true) {
+        static struct option long_options[] =
+            {
+                /* name, has_arg, flag, val */
+                {"help", no_argument, 0, 'h'},
+                {"strict",  no_argument, 0, 's'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hs",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+        switch (c)
+        {
+        case 0:
+            /* If this option set a flag, do nothing else now. */
+            if (long_options[option_index].flag != 0)
+                break;
+            printf ("option %s", long_options[option_index].name);
+            if (optarg)
+                printf (" with arg %s", optarg);
+            printf ("\n");
+            break;
+
+        case 's':
+            strict = true;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            exit(0);
+            break;
+
+        case '?':
+            /* getopt_long already printed an error message. */
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    if (argc - optind < 4) {
+        // too few positional arguments is a usage error, so report failure
+        printSummary(argv);
+        exit(1);
+    }
+
+    string tag = argv[optind];
+    vector<string> samples;
+    for (int i = optind+1; i < argc - 1; ++i) {
+        samples.push_back(argv[i]);
+    }
+    string filename = argv[argc-1];
+
+    VariantCallFile variantFile;
+    if (filename == "-") {
+        variantFile.open(std::cin);
+    } else {
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open " << filename << endl;
+        return 1;
+    }
+
+    // explicit check instead of assert(): it survives NDEBUG builds and explains the failure
+    if (samples.size() != 2) {
+        cerr << "error: exactly two samples (germline, somatic) are required, got " << samples.size() << endl;
+        return 1;
+    }
+
+    Variant var(variantFile);
+
+    // describe the INFO tags this program adds
+    string line = "##INFO=<ID=" + tag + ",Number=1,Type=String,Description=\"Samples";
+    for (vector<string>::iterator s = samples.begin(); s != samples.end(); ++s) {
+        line += " " + *s;
+    }
+    line += " have different genotypes\">";
+    variantFile.addHeaderLine(line);
+    variantFile.addHeaderLine("##INFO=<ID=SSC,Number=1,Type=Float,Description=\"Somatic variant score (phred-scaled probability that the somatic variant call is correct).\">");
+
+    // write the new header
+    cout << variantFile.header << endl;
+
+    // tag and print every record; records lacking either sample or its GT pass through untagged
+    while (variantFile.getNextVariant(var)) {
+        map<string, map<string, vector<string> > >::iterator g = var.samples.find(samples.front());
+        map<string, map<string, vector<string> > >::iterator t = var.samples.find(samples.back());
+        if (g != var.samples.end() && t != var.samples.end()
+            && g->second.count("GT") && t->second.count("GT")
+            && !g->second["GT"].empty() && !t->second["GT"].empty()) {
+            map<string, vector<string> >& germline = g->second;
+            map<string, vector<string> >& somatic = t->second;
+            map<int, int> gtGermline = decomposeGenotype(germline["GT"].front());
+            map<int, int> gtSomatic  = decomposeGenotype(somatic["GT"].front());
+            int germlineAltCount = 0;
+            if (germline.find("AO") != germline.end()) {
+                convert(germline["AO"].front(), germlineAltCount);
+            }
+            var.info[tag].clear(); // remove previous
+            if (gtGermline == gtSomatic) {
+                var.info[tag].push_back("germline");
+            } else {
+                if ((isHet(gtGermline) && isHomNonRef(gtSomatic)) ||
+                    (isHomRef(gtGermline) && (isHet(gtSomatic) || isHomNonRef(gtSomatic)))) {
+                    // --strict additionally requires zero germline alternate observations (AO)
+                    if (!strict || germlineAltCount == 0) {
+                        var.info[tag].push_back("somatic");
+                    }
+                } else if (isHom(gtGermline) && isHet(gtSomatic)) {
+                    if (var.alt.size() == 1) {
+                        var.info[tag].push_back("reversion");
+                    } else {
+                        var.info[tag].push_back("somatic");
+                    }
+                }
+            }
+            if (germline.find("GQ") != germline.end() && somatic.find("GQ") != somatic.end()) {
+                double germlineGQ;
+                convert(germline["GQ"].front(), germlineGQ);
+                double somaticGQ;
+                convert(somatic["GQ"].front(), somaticGQ);
+                double somaticScore = min(var.quality, min(germlineGQ, somaticGQ));
+                var.info["SSC"].clear();
+                var.info["SSC"].push_back(convert(somaticScore));
+            }
+        }
+        cout << var << endl;
+    }
+    return 0;
+}
+
diff --git a/src/vcfsamplenames.cpp b/src/vcfsamplenames.cpp
new file mode 100644
index 0000000..23f68f7
--- /dev/null
+++ b/src/vcfsamplenames.cpp
@@ -0,0 +1,29 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+// vcfsamplenames: print the sample names of a VCF, one per line.
+// Input is argv[1] when given, otherwise standard input.
+// Exit status is 1 when the input cannot be opened, 0 otherwise.
+int main(int argc, char** argv) {
+    VariantCallFile variantFile;
+    // choose the input source
+    if (argc <= 1) {
+        variantFile.open(std::cin);
+    } else {
+        string path = argv[1];
+        variantFile.open(path);
+    }
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    // emit names in the order stored in variantFile.sampleNames
+    const size_t total = variantFile.sampleNames.size();
+    for (size_t k = 0; k < total; ++k) {
+        cout << variantFile.sampleNames[k] << endl;
+    }
+    return 0;
+}
+
diff --git a/src/vcfsamplestats.cpp b/src/vcfsamplestats.cpp
new file mode 100644
index 0000000..ceceb98
--- /dev/null
+++ b/src/vcfsamplestats.cpp
@@ -0,0 +1,193 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+#include <getopt.h>
+
+using namespace vcf;
+
+
+// Print usage to stderr. Does not exit: main() picks the status (an exit(0) here made bad options report success).
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
+         << "options:" << endl << endl
+         << "    -h, --help    this dialog" << endl
+         << endl
+         << "By default, output a table of this form:" << endl
+         << "sample" << " "
+         << "sitecount" << " "
+         << "refcount" << " "
+         << "altcount" << " "
+         << "homcount" << " "
+         << "hetcount" << " "
+         << "avg_gq" << " "
+         << "avg_dp" << endl
+         << endl
+         << "for each sample in the VCF file." << endl
+         << "Reads from stdin if no file is specified on the command line." << endl
+         << endl;
+}
+
+
+// vcfsamplestats entry point: reads a VCF (file argument or stdin) and prints one
+// tab-separated row of genotype counts and mean GQ/DP per sample.
+// Returns 0 on success, 1 when the input cannot be opened.
+int main(int argc, char** argv) {
+    int c;
+
+    while (true) {
+        static struct option long_options[] =
+        {
+            /* name, has_arg, flag, val */
+            {"help", no_argument, 0, 'h'},
+            {0, 0, 0, 0}
+        };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "h",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+        case 0:
+            /* If this option set a flag, do nothing else now. */
+            if (long_options[option_index].flag != 0)
+                break;
+            printf ("option %s", long_options[option_index].name);
+            if (optarg)
+                printf (" with arg %s", optarg);
+            printf ("\n");
+            break;
+
+        case 'h':
+            printSummary(argv);
+            exit(0);
+            break;
+
+        case '?':
+            /* getopt_long already printed an error message. */
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    if (optind == argc - 1) {
+        string inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    // per-sample accumulators, keyed by sample name
+    map<string, int> sitecount;
+    map<string, int> refcount;
+    map<string, int> altcount;
+    map<string, int> homcount;
+    map<string, int> hetcount;
+    map<string, int> gqsum;
+    map<string, int> dpsum;
+
+    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
+        string& sample = *s;
+        sitecount[sample] = 0;
+        refcount[sample] = 0;
+        altcount[sample] = 0;
+        homcount[sample] = 0;
+        hetcount[sample] = 0;
+        gqsum[sample] = 0;
+        dpsum[sample] = 0;
+    }
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
+
+            string name = s->first;
+            map<string, vector<string> >& sample = s->second;
+
+            sitecount[name] += 1;
+
+            // operator[] yields an empty vector for an absent field, and front() on an
+            // empty vector is undefined, so test for emptiness before reading a value
+            int gq;
+            if (!sample["GQ"].empty() && convert(sample["GQ"].front(), gq)) {
+                gqsum[name] += gq;
+            }
+
+            int dp;
+            if (!sample["DP"].empty() && convert(sample["DP"].front(), dp))
+                dpsum[name] += dp;
+
+            // without a genotype there are no alleles to count for this sample
+            if (sample["GT"].empty())
+                continue;
+            string& genotype = sample["GT"].front();
+            vector<string> gt = split(genotype, "|/");
+            const int ploidy = (int) gt.size(); // as int to avoid signed/unsigned comparisons
+
+            int alt = 0;
+            int ref = 0;
+            // NOTE(review): any allele other than "0" (including ".") is counted as alt
+            for (vector<string>::iterator g = gt.begin(); g != gt.end(); ++g) {
+                if (*g != "0") {
+                    ++alt;
+                } else {
+                    ++ref;
+                }
+            }
+
+            if (alt != ploidy) {
+                hetcount[name] += alt;
+            }
+
+            if (alt == ploidy || ref == ploidy) {
+                homcount[name] += 1;
+            }
+
+            refcount[name] += ref;
+            altcount[name] += alt;
+
+        }
+    }
+
+    cout << "sample" << "\t"
+         << "sitecount" << "\t"
+         << "refcount" << "\t"
+         << "altcount" << "\t"
+         << "homcount" << "\t"
+         << "hetcount" << "\t"
+         << "avg_gq" << "\t"
+         << "avg_dp" << endl;
+    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
+        string& sample = *s;
+        cout << sample << "\t"
+             << sitecount[sample] << "\t"
+             << refcount[sample] << "\t"
+             << altcount[sample] << "\t"
+             << homcount[sample] << "\t"
+             << hetcount[sample] << "\t"
+             << (float) gqsum[sample] / (float) sitecount[sample] << "\t"
+             << (float) dpsum[sample] / (float) sitecount[sample]
+             << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfsitesummarize.cpp b/src/vcfsitesummarize.cpp
new file mode 100644
index 0000000..067d0d4
--- /dev/null
+++ b/src/vcfsitesummarize.cpp
@@ -0,0 +1,94 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+// vcfsitesummarize entry point: prints a tab-separated table with one row per record.
+// Columns are CHROM, POS, ID, REF, QUAL, FILTER, then one column per entry of
+// variantFile.infoTypes whose count is not ALLELE_NUMBER: value fields first, then
+// flags (printed as 1/0). Input is argv[1] when given, otherwise stdin.
+// Returns 0 on success, 1 when the input cannot be opened.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+
+    // read from the named file, or from stdin when no argument is given
+    if (argc > 1) {
+        string filename = argv[1];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    // collect the column names: entries counted per allele (ALLELE_NUMBER) are
+    // skipped, and boolean flags are kept apart from value-carrying fields
+    vector<string> infofields;
+    vector<string> infoflags;
+
+    for (map<string, VariantFieldType>::iterator i = variantFile.infoTypes.begin(); i != variantFile.infoTypes.end(); ++i) {
+        if (variantFile.infoCounts[i->first] != ALLELE_NUMBER) {
+            if (i->second == FIELD_BOOL) {
+                infoflags.push_back(i->first);
+            } else {
+                infofields.push_back(i->first);
+            }
+        }
+    }
+
+    // write header
+
+    // defaults
+    cout << "CHROM\tPOS\tID\tREF\tQUAL\tFILTER";
+
+    // configurable info field
+    for (vector<string>::iterator i = infofields.begin(); i != infofields.end(); ++i) {
+        cout << "\t" << *i;
+    }
+    for (vector<string>::iterator i = infoflags.begin(); i != infoflags.end(); ++i) {
+        cout << "\t" << *i;
+    }
+    cout << endl;
+
+    // one output row per record
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+
+        cout << var.sequenceName << "\t"
+             << var.position << "\t"
+             << var.id << "\t"
+             << var.ref << "\t"
+             << var.quality << "\t"
+             << var.filter;
+
+        // value fields: the column separator is always emitted, but only a
+        // single-valued field contributes text; absent or multi-valued ones stay empty
+        for (vector<string>::iterator i = infofields.begin(); i != infofields.end(); ++i) {
+            map<string, vector<string> >::iterator f = var.info.find(*i);
+            cout << "\t";
+            if (f != var.info.end() && f->second.size() == 1) {
+                cout << f->second.front();
+            }
+        }
+
+        // flag fields: 1 when the flag is present on the record, 0 otherwise
+        for (vector<string>::iterator i = infoflags.begin(); i != infoflags.end(); ++i) {
+            cout << "\t";
+            if (var.infoFlags.find(*i) != var.infoFlags.end()) {
+                cout << 1;
+            } else {
+                cout << 0;
+            }
+        }
+
+        cout << endl;
+
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfsom.cpp b/src/vcfsom.cpp
new file mode 100644
index 0000000..1e53ec1
--- /dev/null
+++ b/src/vcfsom.cpp
@@ -0,0 +1,626 @@
+#include "Variant.h"
+#include "split.h"
+#include "convert.h"
+#include <string>
+#include <iostream>
+#include <set>
+#include <sys/time.h>
+#include "fsom/fsom.h"
+#include <getopt.h>
+#include <cmath>
+
+using namespace std;
+using namespace vcf;
+
+// Arithmetic mean of data: the sum of all elements divided by data.size().
+double mean(const vector<double>& data) {
+    double sum = 0;
+    const size_t count = data.size();
+    for (size_t k = 0; k < count; ++k) sum += data[k];
+    return sum / count;
+}
+
+// Median of data. Sorts data in place (ascending). Returns 0 for an empty vector.
+double median(vector <double>& data) {
+    size_t size = data.size();
+    if (size == 0) {
+        return 0; // the even-size branch below would otherwise read data[size/2-1] out of bounds
+    }
+    sort(data.begin(), data.end()); // ascending order
+    if (size % 2 == 0) {
+        // even count: average the two middle values
+        return (data[size/2-1] + data[size/2]) / 2;
+    }
+    return data[size/2];
+}
+
+// Population variance: the mean squared deviation of data from `mean` (divides by n).
+double variance(const vector <double>& data, const double mean) {
+    double sumSquares = 0;
+    for (size_t k = 0; k < data.size(); ++k) sumSquares += (data[k] - mean) * (data[k] - mean);
+    return sumSquares / data.size();
+}
+
+// Standard deviation about `mean`: the square root of variance(data, mean).
+double standard_deviation(const vector <double>& data, const double mean) {
+    return sqrt(variance(data, mean));
+}
+
+// Normalization constants for one field; a default-constructed Stats has mean 0 and stdev 1.
+struct Stats {
+    double mean, stdev;
+    Stats() : mean(0.0), stdev(1.0) { }
+};
+
+// Load SOM metadata: line 1 holds "x y", every further line holds "field mean stdev".
+// Fills x, y, fields and stats. Returns false when the file cannot be opened or its
+// first line does not yield two parsable dimensions.
+bool load_som_metadata(string& som_metadata_file, int& x, int& y, vector<string>& fields, map<string, Stats>& stats) {
+    ifstream in(som_metadata_file.c_str());
+    string linebuf;
+    if (!in.is_open() || !getline(in, linebuf)) return false;
+    vector<string> xy = split(linebuf, "\t ");
+    if (xy.size() < 2 || !convert(xy.front(), x) || !convert(xy.back(), y)) {
+        return false; // front()/back() on an empty split result would be undefined
+    }
+    while (getline(in, linebuf)) {
+        // format is: field_name, mean, stdev
+        vector<string> m = split(linebuf, "\t ");
+        if (m.size() < 3) continue; // blank or truncated line: m[1]/m[2] would be out of range
+        fields.push_back(m[0]);
+        Stats& s = stats[m[0]];
+        convert(m[1], s.mean);
+        convert(m[2], s.stdev);
+    }
+    return true;
+}
+
+// Write SOM metadata: a first line "x<TAB>y", then one "field<TAB>mean<TAB>stdev" line
+// per entry of fields. Returns false only when the file cannot be opened for writing.
+bool save_som_metadata(string& som_metadata_file, int x, int y, vector<string>& fields, map<string, Stats>& stats) {
+    ofstream sink(som_metadata_file.c_str());
+    if (!sink.is_open()) return false;
+    sink << x << "\t" << y << endl;
+    for (size_t k = 0; k < fields.size(); ++k) {
+        Stats& entry = stats[fields[k]];
+        sink << fields[k] << "\t" << entry.mean << "\t" << entry.stdev << endl;
+    }
+    sink.close();
+    return true;
+}
+
+// Z-score record in place, value i using the Stats of fields[i]; stops at the shorter of the two.
+void normalize_inputs(vector<double>& record, vector<string>& fields, map<string, Stats>& stats) {
+    for (size_t i = 0; i < fields.size() && i < record.size(); ++i) {
+        const Stats& s = stats[fields[i]];
+        record[i] = (record[i] - s.mean) / s.stdev; // NOTE(review): stdev == 0 is not guarded
+    }
+}
+
+// Append one numeric value per entry of `fields` to `record`, for alternate allele index `ai`.
+// "QUAL" maps to var.quality; absent, too-short or unparsable INFO values become 0.
+void read_fields(Variant& var, int ai, vector<string>& fields, vector<double>& record) {
+    for (vector<string>::iterator j = fields.begin(); j != fields.end(); ++j) {
+        double td = 0;
+        if (*j == "QUAL") { // special handling...
+            td = var.quality;
+        } else {
+            map<string, vector<string> >::iterator f = var.info.find(*j);
+            if (f != var.info.end()) {
+                // an infoCounts value of 1 means a single value; otherwise index by alt allele
+                size_t idx = (var.vcf->infoCounts[*j] == 1) ? 0 : (size_t) ai;
+                if (idx >= f->second.size() || !convert(f->second[idx], td)) {
+                    td = 0;
+                }
+            }
+        }
+        record.push_back(td);
+    }
+}
+
+// One cell of the painted SOM grid: true/false tallies and a probability, all starting at 0.
+struct SomPaint {
+    int true_count, false_count;
+    double prob_true;
+    SomPaint() : true_count(0), false_count(0), prob_true(0.0) { }
+};
+
+// Timestamp, in microseconds, recorded by the most recent start_timer() call.
+static unsigned long prev_uticks = 0;
+// Current time from gettimeofday(), expressed in microseconds.
+static unsigned long get_uticks(){
+    struct timeval now;
+    gettimeofday(&now, 0);
+    const unsigned long micros = (now.tv_sec * 1000000) + now.tv_usec;
+    return micros;
+}
+// Record the current time as the reference point for the next print_timing().
+static void start_timer(){ prev_uticks = get_uticks(); }
+
+// Report on stderr the time elapsed since the timer was last started, labelled with
+// msg and scaled to the largest unit it reaches (us, ms, s, m, h), then restart
+// the timer so the next call measures from here.
+static void print_timing( const char *msg ){
+#define MS_DELTA (1000.0)
+#define SS_DELTA (MS_DELTA * 1000.0)
+#define MM_DELTA (SS_DELTA * 60.0)
+#define HH_DELTA (MM_DELTA * 60.0)
+
+    const double elapsed = get_uticks() - prev_uticks; // microseconds
+
+    // walk the units from largest to smallest; the first threshold reached wins
+    if (elapsed >= HH_DELTA) {
+        fprintf(stderr, "%s\t : %lf h\n", msg, elapsed / HH_DELTA );
+    } else if (elapsed >= MM_DELTA) {
+        fprintf(stderr, "%s\t : %lf m\n", msg, elapsed / MM_DELTA );
+    } else if (elapsed >= SS_DELTA) {
+        fprintf(stderr, "%s\t : %lf s\n", msg, elapsed / SS_DELTA );
+    } else if (elapsed >= MS_DELTA) {
+        fprintf(stderr, "%s\t : %lf ms\n", msg, elapsed / MS_DELTA );
+    } else {
+        fprintf(stderr, "%s\t : %lf us\n", msg, elapsed );
+    }
+
+    prev_uticks = get_uticks(); // same effect as start_timer()
+}
+
+
+// Print usage for vcfsom to stderr.
+// NOTE(review): -R/--paint-tag and -N/--false-negative are listed below but are absent from
+// the getopt_long option table in main() — confirm whether they should be implemented or dropped.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] [vcf file]" << endl
+         << endl
+         << "training: " << endl
+         << "    " << argv[0] << " -s output.som -x 20 -y 20 -f \"AF DP ABP\" training.vcf" << endl
+         << endl
+         << "application: " << endl
+         << "    " << argv[0] << " -a output.som test.vcf >results.vcf" << endl
+         << endl
+         << argv[0] << " trains and/or applies a self-organizing map to the input VCF data" << endl
+         << "on stdin, adding two columns for the x and y coordinates of the winning" << endl
+         << "neuron in the network and an optional euclidean distance from a given" << endl
+         << "node (--center)." << endl
+         << endl
+         << "If a map is provided via --apply, it will be applied to input without" << endl
+         << "training.  A .meta file describing network parameters and input parameter" << endl
+         << "distributions is used to automatically setup the network." << endl
+         << endl
+         << "options:" << endl << endl
+         << "    -h, --help             this dialog" << endl
+         << endl
+         << "training:" << endl << endl
+         << "    -f, --fields \"FIELD ...\"  INFO fields to provide to the SOM" << endl
+         << "    -a, --apply FILE       apply the saved map in FILE to input data" << endl
+         << "    -s, --save  FILE       train on input data and save the map to FILE" << endl
+         << "    -p, --print-training-results" << endl
+         << "                           print results of SOM on training input" << endl
+         << "                           (you can also just use --apply on the same input)" << endl
+         << "    -x, --width X          width in columns of the output array" << endl
+         << "    -y, --height Y         height in rows of the output array" << endl
+         << "    -i, --iterations N     number of training iterations or epochs" << endl
+         << "    -d, --debug            print timing information" << endl
+         << endl
+         << "recalibration:" << endl
+         << endl
+         << "    -c, --center X,Y       annotate with euclidean distance from center" << endl
+         << "    -T, --paint-true VCF   use VCF file to annotate true variants (multiple)" << endl
+         << "    -F, --paint-false VCF  use VCF file to annotate false variants (multiple)" << endl
+         << "    -R, --paint-tag TAG    provide estimated FDR% in TAG in variant INFO" << endl
+         << "    -N, --false-negative   replace FDR% (false detection) with FNR% (false negative)" << endl;
+}
+
+
+int main(int argc, char** argv) {
+
+    int width = 100;
+    int height = 100;
+    int num_dimensions = 2;
+    int iterations = 1000;
+    string som_file;
+    string som_metadata_file;
+    bool apply = false;
+    bool train = false;
+    bool apply_to_training_data = false; // print results against training data
+    bool debug = false;
+    vector<string> fields;
+    vector<string> centerv;
+    int centerx;
+    int centery;
+    string trueVCF;
+    string falseVCF;
+    bool normalize = true;
+
+    int c;
+
+    if (argc == 1) {
+        printSummary(argv);
+        exit(1);
+    }
+
+    while (true) {
+        static struct option long_options[] =
+        {  
+            /* These options set a flag. */
+            //{"verbose", no_argument,       &verbose_flag, 1},
+            {"help", no_argument, 0, 'h'},
+            {"iterations", required_argument, 0, 'i'},
+            {"width", required_argument, 0, 'x'},
+            {"height", required_argument, 0, 'y'},
+            {"apply", required_argument, 0, 'a'},
+            {"save", required_argument, 0, 's'},
+            {"fields", required_argument, 0, 'f'},
+            {"print-training-results", no_argument, 0, 'p'},
+            {"center", required_argument, 0, 'c'},
+            {"paint-true", required_argument, 0, 'T'},
+            {"paint-false", required_argument, 0, 'F'},
+            {"debug", no_argument, 0, 'd'},
+            {0, 0, 0, 0}
+        };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hpdi:x:y:a:s:f:c:T:F:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        string field;
+
+        switch (c)
+        {
+
+            case 'x':
+                if (!convert(optarg, width)) {
+                    cerr << "could not parse --width, -x" << endl;
+                    exit(1);
+                }
+                break;
+
+            case 'y':
+                if (!convert(optarg, height)) {
+                    cerr << "could not parse --height, -y" << endl;
+                    exit(1);
+                }
+                break;
+
+            case 'i':
+                if (!convert(optarg, iterations)) {
+                    cerr << "could not parse --iterations, -i" << endl;
+                    exit(1);
+                }
+                break;
+
+            case 'p':
+                apply_to_training_data = true;
+                break;
+
+            case 'T':
+                trueVCF = optarg;
+                break;
+
+            case 'F':
+                falseVCF = optarg;
+                break;
+
+            case 'd':
+                debug = true;
+                break;
+
+            case 'a':
+                som_file = optarg;
+                apply = true;
+                break;
+                
+            case 's':
+                som_file = optarg;
+                train = true;
+                break;
+
+            case 'f':
+                fields = split(string(optarg), ' ');
+                break;
+
+            case 'c':
+                centerv = split(string(optarg), ',');
+                convert(centerv.at(0), centerx);
+                convert(centerv.at(1), centery);
+                break;
+
+            case 'h':
+                printSummary(argv);
+                exit(0);
+                break;
+
+            default:
+                break;
+        }
+    }
+
+    size_t i, j;
+    som_network_t *net = NULL;
+    vector<string> inputs;
+    vector<vector<double> > data;
+    map<string, Stats> stats;
+
+    string line;
+    stringstream ss;
+
+    VariantCallFile variantFile;
+    bool usingstdin = false;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+        usingstdin = true;
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open VCF file" << endl;
+        return 1;
+    }
+
+    som_metadata_file = som_file + ".meta";
+
+    Variant var(variantFile);
+
+    variantFile.addHeaderLine("##INFO=<ID=SOMX,Number=A,Type=Integer,Description=\"X position of best neuron for variant in self-ordering map defined in " + som_file + "\">");
+    variantFile.addHeaderLine("##INFO=<ID=SOMY,Number=A,Type=Integer,Description=\"Y position of best neuron for variant in self-ordering map defined in " + som_file + "\">");
+    if (!centerv.empty()) {
+        variantFile.addHeaderLine("##INFO=<ID=SOMD,Number=A,Type=Float,Description=\"Euclidean distance from "
+                                  + convert(centerx) + "," + convert(centery) + " as defined by " + som_file + "\">");
+    }
+    if (!trueVCF.empty() && !falseVCF.empty()) {
+        variantFile.addHeaderLine("##INFO=<ID=SOMP,Number=A,Type=Float,Description=\"Estimated probability the variant is true using som "
+                                  + som_file + ", true variants from " + trueVCF + ", and false variants from " + falseVCF + "\">");
+    }
+
+    if (debug) start_timer();
+    
+    vector<Variant> variants;
+    if (train) {
+        map<string, pair<double, double> > normalizationLimits;
+        while (variantFile.getNextVariant(var)) {
+            variants.push_back(var);
+            int ai = 0;
+            vector<string>::iterator a = var.alt.begin();
+            for ( ; a != var.alt.end(); ++a, ++ai) {
+                vector<double> record;
+                double td;
+                vector<string>::iterator j = fields.begin();
+                for (; j != fields.end(); ++j) {
+                    if (*j == "QUAL") { // special handling...
+                        td = var.quality;
+                    } else {
+                        if (var.info.find(*j) == var.info.end()) {
+                            td = 0;
+                        } else {
+                            if (variantFile.infoCounts[*j] == 1) { // for non Allele-variant fields
+                                convert(var.info[*j][0], td);
+                            } else {
+                                convert(var.info[*j][ai], td);
+                            }
+                        }
+                    }
+                    if (normalize) {
+                        pair<double, double>& limits = normalizationLimits[*j];
+                        if (td < limits.first) limits.first = td;
+                        if (td > limits.second) limits.second = td;
+                    }
+                    record.push_back(td);
+                }
+                data.push_back(record);
+            }
+        }
+        // normalize inputs
+        if (normalize) {
+            // get normalization vector
+            // goal is normalization at 0, sd=1
+            int i = 0;
+            for (vector<string>::iterator f = fields.begin(); f != fields.end(); ++f, ++i) {
+                vector<double> fv;
+                for (vector<vector<double> >::iterator d = data.begin(); d != data.end(); ++d) {
+                    fv.push_back(d->at(i));
+                }
+                Stats& s = stats[*f];
+                // get normalization constants
+                s.mean = mean(fv);
+                s.stdev = standard_deviation(fv, s.mean);
+                // normalize
+                for (vector<vector<double> >::iterator d = data.begin(); d != data.end(); ++d) {
+                    double v = d->at(i);
+                    d->at(i) = (v - s.mean) / s.stdev;
+                }
+            }
+        }
+    }
+
+    vector<double*> dataptrs (data.size());
+    for (unsigned i=0, e=dataptrs.size(); i<e; ++i) {
+        dataptrs[i] = &(data[i][0]); // assuming !thing[i].empty()
+    }
+
+    if (debug) print_timing( "Input Processing" );
+
+    if (apply) {
+        if (! (net = som_deserialize(som_file.c_str()))) {
+            cerr << "could not load SOM from " << som_file << endl;
+            return 1;
+        }
+        if (!fields.empty()) {
+            cerr << "fields specified, but a SOM is to be applied, and metadata should be stored at " << som_metadata_file << endl;
+            return 1;
+        }
+        if (!load_som_metadata(som_metadata_file, width, height, fields, stats)) {
+            cerr << "could not load SOM metadata from " << som_metadata_file << endl;
+            return 1;
+        }
+    } else {
+
+        net = som_network_new(data[0].size(), height, width);
+	
+        if ( !net )	{
+            printf( "ERROR: som_network_new failed.\n" );
+            return 1;
+        }
+    }
+
+    if (debug) print_timing( "Network Creation" );
+
+    if (train) {
+        if (debug) cerr << "Training using " << data.size() << " input vectors" << endl;
+        som_init_weights ( net, &dataptrs[0], data.size() );
+        som_train ( net, &dataptrs[0], data.size(), iterations );
+    }
+
+    if (debug) print_timing( "Network Training" );
+
+    // open and calibrate using the true and false datasets
+
+    if (train && apply_to_training_data) {
+        // currently disabled
+        /*
+        cout << variantFile.header << endl;
+        vector<Variant>::iterator v = variants.begin(); int di = 0;
+        for ( ; v != variants.end() && di < data.size(); ++v) {
+            var.info["SOMX"].clear();
+            var.info["SOMY"].clear();
+            var.info["SOMP"].clear();
+            var.info["SOMD"].clear();
+            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a, ++di) {
+                som_set_inputs ( net, dataptrs[di] );
+                size_t x=0, y=0;
+                som_get_best_neuron_coordinates ( net, &x, &y );
+                v->info["SOMX"].push_back(convert(x));
+                v->info["SOMY"].push_back(convert(y));
+                if (!centerv.empty()) {
+                    float distance = sqrt(pow(abs((float)centerx - (float)x), 2)
+                                          + pow(abs((float)centery - (float)y), 2));
+                    var.info["SOMD"].clear();
+                    var.info["SOMD"].push_back(convert(distance));
+                }
+            }
+            cout << *v << endl;
+        }
+        */
+    } else if (apply) {
+
+        // if we have true and false sets, use them to "paint" the map
+        vector<vector<SomPaint> > paintedSOM;
+        paintedSOM.resize(width);
+        for (vector<vector<SomPaint> >::iterator t = paintedSOM.begin();
+             t != paintedSOM.end(); ++t) {
+            t->resize(height);
+        }
+
+        // handle trues
+        if (!trueVCF.empty()) {
+            VariantCallFile trueVariantFile;
+            trueVariantFile.open(trueVCF);
+            Variant v(trueVariantFile);
+            while (trueVariantFile.getNextVariant(v)) {
+                int ai = 0;
+                vector<string>::iterator a = v.alt.begin();
+                for ( ; a != v.alt.end(); ++a, ++ai) {
+                    vector<double> record;
+                    read_fields(v, ai, fields, record);
+                    if (normalize) {
+                        normalize_inputs(record, fields, stats);
+                    }
+                    som_set_inputs ( net, &record[0] );
+                    size_t x=0, y=0;
+                    som_get_best_neuron_coordinates ( net, &x, &y );
+                    paintedSOM[x][y].true_count += 1;
+                }
+            }
+        }
+
+        // get falses
+        if (!falseVCF.empty()) {
+            VariantCallFile falseVariantFile;
+            falseVariantFile.open(falseVCF);
+            Variant v(falseVariantFile);
+            while (falseVariantFile.getNextVariant(v)) {
+                int ai = 0;
+                vector<string>::iterator a = v.alt.begin();
+                for ( ; a != v.alt.end(); ++a, ++ai) {
+                    vector<double> record;
+                    read_fields(v, ai, fields, record);
+                    if (normalize) {
+                        normalize_inputs(record, fields, stats);
+                    }
+                    som_set_inputs ( net, &record[0] );
+                    size_t x=0, y=0;
+                    som_get_best_neuron_coordinates ( net, &x, &y );
+                    paintedSOM[x][y].false_count += 1;
+                }
+            }
+        }
+
+        // estimate probability of each node using true and false set
+        for (vector<vector<SomPaint> >::iterator t = paintedSOM.begin();
+             t != paintedSOM.end(); ++t) {
+            for (vector<SomPaint>::iterator p = t->begin(); p != t->end(); ++p) {
+                //cout << "count at node " << t - paintedSOM.begin() << "," << p - t->begin()
+                //     << " is " << p->true_count << " true, " << p->false_count << " false" << endl;
+                if (p->true_count + p->false_count > 0) {
+                    p->prob_true = (double) p->true_count / (double) (p->true_count + p->false_count);
+                } else {
+                    // for nodes without training data, could we estimate from surrounding nodes?
+                    // yes, TODO, but for now we can be conservative and say "0"
+                    p->prob_true = 0;
+                }
+            }
+        }
+
+        cout << variantFile.header << endl;
+        while (variantFile.getNextVariant(var)) {
+            var.info["SOMX"].clear();
+            var.info["SOMY"].clear();
+            var.info["SOMP"].clear();
+            var.info["SOMD"].clear();
+            int ai = 0;
+            vector<string>::iterator a = var.alt.begin();
+            for ( ; a != var.alt.end(); ++a, ++ai) {
+                vector<double> record;
+                read_fields(var, ai, fields, record);
+                if (normalize) {
+                    normalize_inputs(record, fields, stats);
+                }
+                som_set_inputs ( net, &record[0] );
+                size_t x=0, y=0;
+                som_get_best_neuron_coordinates ( net, &x, &y );
+                if (!trueVCF.empty() && !falseVCF.empty()) {
+                    SomPaint& paint = paintedSOM[x][y];
+                    var.info["SOMP"].push_back(convert(paint.prob_true));
+                }
+                var.info["SOMX"].push_back(convert(x));
+                var.info["SOMY"].push_back(convert(y));
+                if (!centerv.empty()) {
+                    float distance = sqrt(pow(abs((float)centerx - (float)x), 2)
+                                          + pow(abs((float)centery - (float)y), 2));
+                    var.info["SOMD"].push_back(convert(distance));
+                }
+            }
+            cout << var << endl;
+        }
+    }
+
+    if (debug) print_timing( "Input Recognition" );
+
+    if (train) {
+        if (!save_som_metadata(som_metadata_file, width, height, fields, stats)) {
+            cerr << "could not save metadata to " << som_metadata_file << endl;
+        }
+        som_serialize(net, som_file.c_str());
+    }
+
+    som_network_destroy ( net );
+
+    if (debug) print_timing( "Network Destruction" );
+
+    return 0;
+
+}
diff --git a/src/vcfstats.cpp b/src/vcfstats.cpp
new file mode 100644
index 0000000..da8137b
--- /dev/null
+++ b/src/vcfstats.cpp
@@ -0,0 +1,570 @@
+#include "Variant.h"
+#include "split.h"
+#include "convert.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+// Returns true when the single-base change ref -> alt is A<->G or C<->T
+// (in either direction); every other pair of strings yields false.
+bool isTransition(const string& ref, const string& alt) {
+    const bool swapsAG = (ref == "A" && alt == "G") || (ref == "G" && alt == "A");
+    const bool swapsCT = (ref == "C" && alt == "T") || (ref == "T" && alt == "C");
+
+    return swapsAG || swapsCT;
+}
+
+// Returns true only for the substitutions G -> A and C -> T; the reverse
+// directions (see isAmination) and all other inputs yield false.
+bool isDeamination(const string& ref, const string& alt) {
+    if (ref == "G") {
+        return alt == "A";
+    }
+    return ref == "C" && alt == "T";
+}
+
+// Returns true only for the substitutions A -> G and T -> C, i.e. the
+// reverse of the two changes recognised by isDeamination.
+bool isAmination(const string& ref, const string& alt) {
+    const bool aToG = (ref == "A") && (alt == "G");
+    const bool tToC = (ref == "T") && (alt == "C");
+
+    return aToG || tToC;
+}
+
+// Event counters kept for one allele (or an aggregate); all start at zero.
+class AlleleStats {
+public:
+    int transitions;
+    int transversions;
+    int deaminations;
+    int aminations;
+    int mismatches;
+    int insertedbases;
+    int insertions;
+    int deletedbases;
+    int deletions;
+    // Initializers follow declaration order, the order C++ actually uses.
+    AlleleStats(void)
+        : transitions(0)
+        , transversions(0)
+        , deaminations(0)
+        , aminations(0)
+        , mismatches(0)
+        , insertedbases(0)
+        , insertions(0)
+        , deletedbases(0)
+        , deletions(0) { }
+};
+
+// Writes the vcfstats usage text to stderr; argv[0] supplies the program name.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl << endl
+         << "    -h, --help            print this usage message" << endl
+         << "    -r, --region          specify a region on which to target the stats, requires a BGZF" << endl
+         << "                          compressed file which has been indexed with tabix.  any number of" << endl
+         << "                          regions may be specified." << endl
+         << "    -a, --add-info        add the statistics intermediate information to the VCF file," << endl
+         << "                          writing out VCF records instead of summary statistics" << endl
+         << "    -t, --add-type        only add the type= field to the info (faster than -a)" << endl
+         << "    -l, --no-length-frequency    don't output the indel and mnp length-frequency spectra" << endl
+         << "    -m, --match-score N          match score for SW algorithm" << endl
+         << "    -x, --mismatch-score N       mismatch score for SW algorithm" << endl
+         << "    -o, --gap-open-penalty N     gap open penalty for SW algorithm" << endl
+         << "    -e, --gap-extend-penalty N   gap extension penalty for SW algorithm" << endl << endl
+         << "Prints statistics about variants in the input VCF file." << endl;
+}
+
+
+// Entry point for vcfstats.  Parses options, reads variants from the named
+// VCF file (or stdin), and either prints summary statistics or, with -a/-t,
+// re-emits each record annotated with per-allele INFO tags.
+// Returns 0 on success and 1 if the input could not be opened.
+int main(int argc, char** argv) {
+
+    vector<string> regions;
+    bool addTags = false;
+    bool addType = false;
+    bool lengthFrequency = true;
+
+    // constants for SmithWaterman algorithm
+    float matchScore = 10.0f;
+    float mismatchScore = -9.0f;
+    float gapOpenPenalty = 15.0f;
+    float gapExtendPenalty = 6.66f;
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                /* These options set a flag. */
+                //{"verbose", no_argument,       &verbose_flag, 1},
+                {"help", no_argument, 0, 'h'},
+                {"region", required_argument, 0, 'r'},
+                {"add-info", no_argument, 0, 'a'},
+                {"add-type", no_argument, 0, 't'},
+                {"no-length-frequency", no_argument, 0, 'l'},
+                {"match-score", required_argument, 0, 'm'},
+                {"mismatch-score", required_argument, 0, 'x'},
+                {"gap-open-penalty", required_argument, 0, 'o'},
+                {"gap-extend-penalty", required_argument, 0, 'e'},
+                //{"length",  no_argument, &printLength, true},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hlatr:m:x:o:e:",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+        case 0:
+            /* If this option set a flag, do nothing else now. */
+            if (long_options[option_index].flag != 0)
+                break;
+            printf ("option %s", long_options[option_index].name);
+            if (optarg)
+                printf (" with arg %s", optarg);
+            printf ("\n");
+            break;
+
+        case 'h':
+            printSummary(argv);
+            exit(0);
+            break;
+
+        case 'r':
+            regions.push_back(optarg);
+            break;
+
+        case 'l':
+            lengthFrequency = false;
+            break;
+
+        case 'a':
+            addTags = true;
+            break;
+
+        case 't':
+            addType = true;
+            break;
+
+        case 'm':
+            matchScore = atof(optarg);
+            break;
+
+        case 'x':
+            mismatchScore = atof(optarg);
+            break;
+
+        case 'o':
+            gapOpenPenalty = atof(optarg);
+            break;
+
+        case 'e':
+            gapExtendPenalty = atof(optarg);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    if (inputFilename.empty() && !regions.empty()) {
+        // Regions are only applied when reading a named file.  On stdin the
+        // region iterator would never advance and the do/while below would spin forever.
+        cerr << "warning: --region requires an indexed input file; ignoring regions when reading from stdin" << endl;
+        regions.clear();
+    }
+
+    if (addType && !addTags) {
+        variantFile.addHeaderLine("##INFO=<ID=type,Number=A,Type=String,Description=\"The type of the allele, either snp, ins, del, complex, or ref.\">");
+        variantFile.addHeaderLine("##INFO=<ID=cigar,Number=A,Type=String,Description=\"The CIGAR-style representation of the alternate allele as aligned to the reference\">");
+        cout << variantFile.header << endl;
+    }
+
+    if (addTags) {
+        variantFile.addHeaderLine("##INFO=<ID=transitions,Number=A,Type=Integer,Description=\"Total number of transitions in the alternate allele\">");
+        variantFile.addHeaderLine("##INFO=<ID=transversions,Number=A,Type=Integer,Description=\"Total number of transversions in the alternate allele\">");
+        variantFile.addHeaderLine("##INFO=<ID=deaminations,Number=A,Type=Integer,Description=\"Total number of deaminations in the alternate allele\">");
+        variantFile.addHeaderLine("##INFO=<ID=aminations,Number=A,Type=Integer,Description=\"Total number of aminations in the alternate allele\">");
+        variantFile.addHeaderLine("##INFO=<ID=mismatches,Number=A,Type=Integer,Description=\"Total number of mismatches in the alternate allele\">");
+        variantFile.addHeaderLine("##INFO=<ID=insertions,Number=A,Type=Integer,Description=\"Total number of inserted bases in the alternate allele\">");
+        variantFile.addHeaderLine("##INFO=<ID=deletions,Number=A,Type=Integer,Description=\"Total number of deleted bases in the alternate allele\">");
+        variantFile.addHeaderLine("##INFO=<ID=cigar,Number=A,Type=String,Description=\"The CIGAR-style representation of the alternate allele as aligned to the reference\">");
+        variantFile.addHeaderLine("##INFO=<ID=type,Number=A,Type=String,Description=\"The type of the allele, either snp, ins, del, complex, or ref.\">");
+        variantFile.addHeaderLine("##INFO=<ID=reflen,Number=1,Type=Integer,Description=\"The length of the reference allele\">");
+        variantFile.addHeaderLine("##INFO=<ID=altlen,Number=A,Type=Integer,Description=\"The length of the alternate allele\">");
+        cout << variantFile.header << endl;
+    }
+
+    Variant var(variantFile);
+
+    vector<string>::iterator regionItr = regions.begin();
+
+    int variantAlleles = 0;
+    int uniqueVariantAlleles = 0;
+    int variantSites = 0;
+    int snps = 0;
+    int transitions = 0;
+    int transversions = 0;
+    int deaminations = 0;
+    int aminations = 0;
+    int totalinsertions = 0;
+    int totaldeletions = 0;
+    int insertedbases = 0;
+    int deletedbases = 0;
+    int totalmnps = 0;
+    int totalcomplex = 0;
+    int mismatchbases = 0;
+    int mnpbases = 0;
+    int biallelics = 0;
+    int multiallelics = 0;
+    map<int, int> insertions;
+    map<int, int> deletions;
+    map<int, int> mnps;
+
+    bool includePreviousBaseForIndels = false;
+    bool useMNPs = true;
+    bool useEntropy = false;
+
+    AlleleStats biallelicSNPs;
+
+    // todo, add biallelic snp dialog to output and ts/tv for snps and mnps
+
+    do {
+        // one pass per requested region, or a single pass over the whole input
+        if (!inputFilename.empty() && !regions.empty()) {
+            string regionStr = *regionItr++;
+            variantFile.setRegion(regionStr);
+        }
+
+        while (variantFile.getNextVariant(var)) {
+            ++variantSites;
+            if (var.alt.size() > 1) {
+                ++multiallelics;
+            } else {
+                ++biallelics;
+            }
+            map<string, vector<VariantAllele> > alternates = var.parsedAlternates(includePreviousBaseForIndels,
+                                                                                  useMNPs,
+                                                                                  useEntropy,
+                                                                                  matchScore,
+                                                                                  mismatchScore,
+                                                                                  gapOpenPenalty,
+                                                                                  gapExtendPenalty);
+            map<VariantAllele, vector<string> > uniqueVariants;
+
+            vector<string> cigars;
+
+            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+                string& alternate = *a;
+                if (addTags)
+                    var.info["altlen"].push_back(convert(alternate.size()));
+                vector<VariantAllele>& vav = alternates[alternate];
+                if (vav.size() > 1) {
+                    // check that there are actually multiple non-reference alleles
+                    int nonRefAlleles = 0;
+                    for (vector<VariantAllele>::iterator z = vav.begin(); z != vav.end(); ++z) {
+                        if (z->ref != z->alt)
+                            ++nonRefAlleles;
+                    }
+                    if (nonRefAlleles > 1)
+                        ++totalcomplex;
+                }
+                for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) {
+                    uniqueVariants[*v].push_back(alternate);
+                }
+
+                if (addTags || addType) {
+                    string cigar;
+                    pair<int, string> element;
+                    for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) {
+                        VariantAllele& va = *v;
+                        if (va.ref != va.alt) {
+                            if (element.second == "M") {
+                                cigar += convert(element.first) + element.second;
+                                element.second = ""; element.first = 0;
+                            }
+                            if (va.ref.size() == va.alt.size()) {
+                                cigar += convert(va.ref.size()) + "X";
+                            } else if (va.ref.size() > va.alt.size()) {
+                                cigar += convert(va.ref.size() - va.alt.size()) + "D";
+                            } else {
+                                cigar += convert(va.alt.size() - va.ref.size()) + "I";
+                            }
+                        } else {
+                            if (element.second == "M") {
+                                element.first += va.ref.size();
+                            } else {
+                                element = make_pair(va.ref.size(), "M");
+                            }
+                        }
+                    }
+                    if (element.second == "M") {
+                        cigar += convert(element.first) + element.second;
+                    }
+                    element.second = ""; element.first = 0;
+                    cigars.push_back(cigar);
+                }
+            }
+
+            if (addTags) {
+                var.info["cigar"] = cigars;
+                var.info["reflen"].push_back(convert(var.ref.size()));
+            } else if (addType) {
+                var.info["cigar"] = cigars;
+            }
+
+            variantAlleles += var.alt.size();
+            map<string, AlleleStats> alleleStats;
+
+            for (map<VariantAllele, vector<string> >::iterator v = uniqueVariants.begin(); v != uniqueVariants.end(); ++v) {
+                const VariantAllele& va = v->first;
+                vector<string>& alternates = v->second;
+
+                if (!(addTags || addType)) { // don't add any tag information if we're not going to output it
+                    alternates.clear();
+                }
+
+                if (va.ref != va.alt) {
+                    ++uniqueVariantAlleles;
+                    if (va.ref.size() == va.alt.size()) {
+                        if (va.ref.size() == 1) {
+                            ++snps;
+                            ++mismatchbases;
+                            for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
+                                ++alleleStats[*a].mismatches;
+                            }
+                            if (isTransition(va.ref, va.alt)) {
+                                ++transitions;
+                                for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
+                                    ++alleleStats[*a].transitions;
+                                }
+                            } else {
+                                ++transversions;
+                                for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
+                                    ++alleleStats[*a].transversions;
+                                }
+                            }
+                            if (isAmination(va.ref, va.alt)) {
+                                ++aminations;
+                                for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
+                                    ++alleleStats[*a].aminations;
+                                }
+                            }
+                            if (isDeamination(va.ref, va.alt)) {
+                                ++deaminations;
+                                for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
+                                    ++alleleStats[*a].deaminations;
+                                }
+                            }
+                        } else {
+                            ++totalmnps;
+                            ++mnps[va.alt.size()]; // not entirely correct
+                            for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
+                                alleleStats[*a].mismatches += va.alt.size();
+                            }
+                            string::const_iterator r = va.ref.begin();
+                            for (string::const_iterator a = va.alt.begin(); a != va.alt.end(); ++a, ++r) {
+                                string rstr = string(1, *r);
+                                string astr = string(1, *a);
+                                if (rstr == astr) {
+                                    continue;
+                                }
+                                if (isTransition(rstr, astr)) {
+                                    ++transitions;
+                                    for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
+                                        ++alleleStats[*a].transitions;
+                                    }
+                                } else {
+                                    ++transversions;
+                                    for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
+                                        ++alleleStats[*a].transversions;
+                                    }
+                                }
+                                if (isAmination(rstr, astr)) {
+                                    ++aminations;
+                                    for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
+                                        ++alleleStats[*a].aminations;
+                                    }
+                                }
+                                if (isDeamination(rstr, astr)) {
+                                    ++deaminations;
+                                    for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
+                                        ++alleleStats[*a].deaminations;
+                                    }
+                                }
+                                ++mismatchbases;
+                                ++mnpbases;
+                            }
+                        }
+                    } else if (va.ref.size() > va.alt.size()) {
+                        int diff = va.ref.size() - va.alt.size();
+                        deletedbases += diff;
+                        ++totaldeletions;
+                        ++deletions[diff];
+                        for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
+                            alleleStats[*a].deletedbases += diff;
+                            alleleStats[*a].deletions += 1;
+                        }
+                    } else {
+                        int diff = va.alt.size() - va.ref.size();
+                        insertedbases += diff;
+                        ++totalinsertions;
+                        ++insertions[diff];
+                        for (vector<string>::iterator a = alternates.begin(); a != alternates.end(); ++a) {
+                            alleleStats[*a].insertedbases += diff;
+                            alleleStats[*a].insertions += 1;
+                        }
+                    }
+                }
+            }
+            if (addTags || addType) {
+                for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+                    string vartype;
+                    if (alleleStats[*a].insertions + alleleStats[*a].deletions == 0) {
+                        if (alleleStats[*a].mismatches == 1) {
+                            vartype = "snp";
+                        } else if (alleleStats[*a].mismatches > 1) {
+                            vartype = "complex";
+                        } else {
+                            vartype = "ref";
+                        }
+                    } else if (alleleStats[*a].insertions + alleleStats[*a].deletions == 1) {
+                        if (alleleStats[*a].insertions == 1) {
+                            vartype = "ins";
+                        } else {
+                            vartype = "del";
+                        }
+                    } else {
+                        vartype = "complex";
+                    }
+                    if (addTags) {
+                        var.info["mismatches"].push_back(convert(alleleStats[*a].mismatches));
+                        var.info["insertions"].push_back(convert(alleleStats[*a].insertions));
+                        var.info["deletions"].push_back(convert(alleleStats[*a].deletions));
+                        var.info["transitions"].push_back(convert(alleleStats[*a].transitions));
+                        var.info["transversions"].push_back(convert(alleleStats[*a].transversions));
+                        var.info["deaminations"].push_back(convert(alleleStats[*a].deaminations));
+                        var.info["aminations"].push_back(convert(alleleStats[*a].aminations));
+                    }
+                    var.info["type"].push_back(vartype);
+                }
+                cout << var << endl;
+            }
+            // biallelic SNP case
+            if (var.alt.size() == 1 && var.ref.size() == 1 && var.alt.front().size() == 1) {
+                if (isTransition(var.ref, var.alt.front())) {
+                    biallelicSNPs.transitions++;
+                } else {
+                    biallelicSNPs.transversions++;
+                }
+                biallelicSNPs.mismatches++;
+            }
+        }
+
+    } while (regionItr != regions.end());
+
+
+    // find the maximum indel size
+    int maxindel = 0;
+    for (map<int, int>::iterator i = insertions.begin(); i != insertions.end(); ++i) {
+        if (i->first > maxindel) {
+            maxindel = i->first;
+        }
+    }
+    for (map<int, int>::iterator i = deletions.begin(); i != deletions.end(); ++i) {
+        if (i->first > maxindel) {
+            maxindel = i->first;
+        }
+    }
+
+    // and maximum mnp
+    int maxmnp = 0;
+    for (map<int, int>::iterator i = mnps.begin(); i != mnps.end(); ++i) {
+        if (i->first > maxmnp) {
+            maxmnp = i->first;
+        }
+    }
+
+    // now print the results
+
+    if (!addTags && !addType) {
+        cout << "total variant sites:\t" << variantSites << endl
+             << "of which " << biallelics << " (" << (double) biallelics / variantSites << ") are biallelic and "
+                            << multiallelics << " (" << (double) multiallelics / variantSites << ") are multiallelic" << endl
+             << "total variant alleles:\t" << variantAlleles << endl
+             << "unique variant alleles:\t" << uniqueVariantAlleles << endl
+             << endl
+             << "snps:\t" << snps << endl
+             << "mnps:\t" << totalmnps << endl
+             << "indels:\t" << totalinsertions + totaldeletions << endl
+             << "complex:\t" << totalcomplex << endl
+             << endl
+             << "mismatches:\t" << mismatchbases << endl
+             << endl
+             << "ts/tv ratio:\t" << (double) transitions / (double) transversions << endl
+             << "deamination ratio:\t" << (double) deaminations / aminations << endl
+             << "biallelic snps:\t" << biallelicSNPs.mismatches << " @ "
+             << (double) biallelicSNPs.transitions / (double) biallelicSNPs.transversions << endl;
+
+        if (lengthFrequency) {
+            cout << endl
+                 << "ins/del length frequency distribution" << endl
+                 << "length\tins\tdel\tins/del" << endl;
+            for (int i = 1; i <= maxindel; ++i) {
+                int ins = insertions[i];
+                int del = deletions[i];
+                cout << i << "\t"
+                     << (ins > 0 ? convert(ins) : "" ) << "\t"
+                     << (del > 0 ? convert(del) : "") << "\t"
+                     << (ins > 0 && del > 0 ? convert((double) ins / (double) del) : "")
+                     << endl;
+            }
+        }
+
+        cout << endl
+             << "insertion alleles / deletion alleles:\t"
+             << (double) totalinsertions / (double) totaldeletions << endl
+             << "inserted bases / deleted bases:\t"
+             << (double) insertedbases / (double) deletedbases << endl
+             << endl;
+
+        if (lengthFrequency) {
+            cout << "mnp length frequency distribution" << endl
+                 << "length\tcount" << endl;
+            for (int i = 2; i <= maxmnp; ++i) {
+                int mnp = mnps[i];
+                cout << i << "\t"
+                     << (mnp > 0 ? convert(mnp) : "")
+                     << endl;
+            }
+        }
+
+        cout << "total bases in mnps:\t" << mnpbases << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfstreamsort.cpp b/src/vcfstreamsort.cpp
new file mode 100644
index 0000000..834de64
--- /dev/null
+++ b/src/vcfstreamsort.cpp
@@ -0,0 +1,143 @@
+#include "Variant.h"
+#include <algorithm>
+#include <getopt.h>
+#include "convert.h"
+
+using namespace std;
+using namespace vcf;
+
+// Reports whether `v` compares equal to any element of `l`.
+// Linear scan; neither argument is modified.
+bool listContains(list<string>& l, string& v) {
+    list<string>::iterator hit = std::find(l.begin(), l.end(), v);
+    return hit != l.end();
+}
+
+// Writes the command-line usage text to stderr.
+// argv[0] is echoed as the program name in the first line.
+void printSummary(char** argv) {
+    const char* program = argv[0];
+
+    cerr << "usage: " << program << " [options] [vcf file]" << endl;
+    cerr << endl;
+    cerr << "Sorts the input (either stdin or file) using a streaming sort algorithm." << endl;
+    cerr << "options:" << endl;
+    cerr << endl;
+    cerr << "    -h, --help             this dialog" << endl;
+    cerr << "    -w, --window N         number of sites to sort (default 10000)" << endl;
+    cerr << "    -a, --all              load all sites and then sort in memory" << endl;
+}
+
+// Entry point for vcfstreamsort.
+//
+// Reads VCF records from the file named on the command line (or stdin when
+// no file is given), prints the header, and re-emits the records' original
+// lines ordered by position within each reference sequence.
+//
+// Records are buffered in a map keyed by sequence name, then position, then
+// the variant's vrepr() string.  Unless -a/--all is given, as soon as more
+// than `sortSitesWindow` distinct sites are buffered, the lowest-position
+// site of the sequence at the front of the name queue is written out and
+// dropped.  Whatever is still buffered at end of input is written out
+// sequence by sequence, in queue order.
+//
+// Returns 0 on success and 1 if the input could not be opened; exits with
+// status 1 if the -w/--window value cannot be parsed.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+    int sortSitesWindow = 10000;
+    bool sortAll = false;
+
+    int c;
+
+    while (true) {
+        static struct option long_options[] =
+        {
+            {"help", no_argument, 0, 'h'},
+            {"window", required_argument, 0, 'w'},
+            // --all is a plain flag.  It has to agree with the bare 'a' in
+            // the short-option string below; declaring it with
+            // required_argument made `--all file.vcf` consume the file name
+            // and made a trailing `--all` an error.
+            {"all", no_argument, 0, 'a'},
+            {0, 0, 0, 0}
+        };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "haw:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+
+        case 'w':
+            if (!convert(optarg, sortSitesWindow)) {
+                cerr << "could not parse --window, -w" << endl;
+                exit(1);
+            }
+            break;
+
+        case 'a':
+            sortAll = true;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            exit(0);
+            break;
+
+        default:
+            break;
+        }
+    }
+
+    // Exactly one trailing non-option argument is taken as the input file;
+    // anything else falls back to stdin.
+    if (optind == argc - 1) {
+        string inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    // sequence name -> position -> vrepr() -> records, in input order
+    map<string, map<long int, map<string, vector<Variant> > > > records;
+    // number of distinct (sequence, position) sites currently buffered
+    int numrecords = 0;
+    // sequence names with buffered records, in the order they were queued
+    list<string> sequenceNames;
+
+    variantFile.parseSamples = false;
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        if (!listContains(sequenceNames, var.sequenceName)) {
+            sequenceNames.push_back(var.sequenceName);
+        }
+
+        map<string, vector<Variant> >& site = records[var.sequenceName][var.position];
+        // A site counts once, when its first record arrives.  Testing the
+        // size after insertion would count it again for every repeated
+        // record with the same vrepr() and shrink the effective window.
+        if (site.empty()) ++numrecords;
+        site[var.vrepr()].push_back(var);
+
+        if (!sortAll && numrecords > sortSitesWindow) {
+            // The front sequence may have been drained by earlier flushes;
+            // move on to the next queued sequence in that case.
+            if (records[sequenceNames.front()].empty()) {
+                sequenceNames.pop_front();
+            }
+            map<long int, map<string, vector<Variant> > >& frecords = records[sequenceNames.front()];
+            map<string, vector<Variant> >& vars = frecords.begin()->second;
+            for (map<string, vector<Variant> >::iterator v = vars.begin(); v != vars.end(); ++v) {
+                for (vector<Variant>::iterator r = v->second.begin(); r != v->second.end(); ++r) {
+                    cout << r->originalLine << endl;
+                }
+            }
+            frecords.erase(frecords.begin());
+            --numrecords;
+        }
+    }
+
+    // End of input: write out everything still buffered.
+    for (list<string>::iterator name = sequenceNames.begin(); name != sequenceNames.end(); ++name) {
+        map<long int, map<string, vector<Variant> > >& q = records[*name];
+        for (map<long int, map<string, vector<Variant> > >::iterator p = q.begin(); p != q.end(); ++p) {
+            for (map<string, vector<Variant> >::iterator v = p->second.begin(); v != p->second.end(); ++v) {
+                for (vector<Variant>::iterator r = v->second.begin(); r != v->second.end(); ++r) {
+                    cout << r->originalLine << endl;
+                }
+            }
+        }
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfuniq.cpp b/src/vcfuniq.cpp
new file mode 100644
index 0000000..30ad21b
--- /dev/null
+++ b/src/vcfuniq.cpp
@@ -0,0 +1,49 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+// Entry point for vcfuniq.
+//
+// Reads VCF records from argv[1] (or stdin when no argument is given),
+// prints the header, and echoes each record's original line unless it has
+// the same sequence name, position, ref and alt list as the last record
+// that was printed.  Only adjacent repeats are dropped.
+//
+// Returns 0 on success, 1 if the input could not be opened.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+
+    if (argc <= 1) {
+        variantFile.open(std::cin);
+    } else {
+        string path = argv[1];
+        variantFile.open(path);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    // Key fields of the most recently printed record.  An empty prevSeq
+    // means nothing has been recorded yet, so prevPos is never compared
+    // before it has been assigned from a record.
+    string prevSeq;
+    long int prevPos = 0;
+    string prevRef;
+    vector<string> prevAlt;
+
+    variantFile.parseSamples = false;
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        const bool repeat = !prevSeq.empty()
+            && prevSeq == var.sequenceName
+            && prevPos == var.position
+            && prevRef == var.ref
+            && prevAlt == var.alt;
+        if (repeat) {
+            continue;
+        }
+
+        prevSeq = var.sequenceName;
+        prevPos = var.position;
+        prevRef = var.ref;
+        prevAlt = var.alt;
+        cout << var.originalLine << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfuniqalleles.cpp b/src/vcfuniqalleles.cpp
new file mode 100644
index 0000000..3c1c7e2
--- /dev/null
+++ b/src/vcfuniqalleles.cpp
@@ -0,0 +1,54 @@
+#include "Variant.h"
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+// Entry point for vcfuniqalleles.
+//
+// Reads VCF records from argv[1] (or stdin when no argument is given) and
+// prints the header followed by every record.  Before a record is printed,
+// each alt allele that equals the ref allele, or that repeats an alt seen
+// earlier in the same record, is passed to Variant::removeAlt(); a
+// "removing ..." line is logged to stderr for each such allele.
+//
+// Returns 0 on success, 1 if the input could not be opened.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+
+    if (argc > 1) {
+        string filename = argv[1];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        set<string> alleles;
+        vector<string> alleles_to_remove;
+        // Collect first and remove afterwards, so var.alt is not modified
+        // while it is being iterated.
+        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+            // insert().second is false when the allele was already present;
+            // alleles equal to ref are never added to the set.
+            if (*a == var.ref || !alleles.insert(*a).second) {
+                alleles_to_remove.push_back(*a);
+            }
+        }
+        for (vector<string>::iterator a = alleles_to_remove.begin(); a != alleles_to_remove.end(); ++a) {
+            cerr << "removing " << *a << " from " << var.sequenceName << ":" << var.position << endl;
+            var.removeAlt(*a);
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/tests/lib/Local/vcflib/Test.pm b/tests/lib/Local/vcflib/Test.pm
new file mode 100644
index 0000000..761c22a
--- /dev/null
+++ b/tests/lib/Local/vcflib/Test.pm
@@ -0,0 +1,32 @@
+use strict;
+use warnings;
+
+package Local::vcflib::Test;
+use base 'Exporter';
+
+use File::Basename qw< dirname >;
+use IPC::Run3 qw< run3 >;
+use Test::More;
+
+# Subs exported by default into any package that uses this module.
+our @EXPORT = qw( run run_ok );
+# Directory holding the executables that run() launches, resolved relative
+# to the location of this file.
+our $BIN    = dirname(__FILE__) . "/../../../../bin";
+
+# Run "$BIN/<command>" with the given options.
+#
+# Arguments: an array ref [ command, @options ] and a string to feed to the
+# command's standard input.
+# Returns the list (stdout, stderr, $?).
+sub run {
+    my ($spec, $input)   = @_;
+    my ($program, @args) = @$spec;
+    my @argv = ("$BIN/$program", @args);
+    my ($out, $err);
+    run3(\@argv, \$input, \$out, \$err);
+    return ($out, $err, $?);
+}
+
+# Run a command via run() and record one "exit code" test that passes only
+# if the command finished with a zero wait status.
+#
+# Takes the same arguments as run().  Returns the list (stdout, stderr).
+# On failure, the command line, its input, the exit code, the raw status
+# and the captured stderr are reported through diag.
+sub run_ok {
+    # Report failures at the caller's line rather than inside this helper.
+    local $Test::Builder::Level = $Test::Builder::Level + 1;
+    my ($stdout, $stderr, $exit) = run(@_);
+    # Check the whole status word: the high byte is the exit code, while the
+    # low bits are set when the child was killed by a signal.  Looking only
+    # at ($exit >> 8) would let a crashed command count as a success.
+    ok $exit == 0, "exit code"
+        or diag "error running command: " . join(" ", @{$_[0]}) . "\n"
+               ."with input:\n$_[1]\n--\n"
+               ."exit code = " . ($exit >> 8) . " (system() return value = $exit)\n"
+               ."stderr = \n$stderr";
+    return ($stdout, $stderr);
+}
+
+1;
diff --git a/tests/vcfdistance.t b/tests/vcfdistance.t
new file mode 100644
index 0000000..233ddfa
--- /dev/null
+++ b/tests/vcfdistance.t
@@ -0,0 +1,98 @@
+use strict;
+use warnings;
+use Test::More;
+use Local::vcflib::Test;
+
+my @vcf = split /\n/, <<'';
+##fileformat=VCFv4.0
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+refseq	502	.	G	A	38553	PASS	
+refseq	552	.	G	A	24044	PASS	
+refseq	660	.	G	A	38553	PASS	
+refseq	678	.	G	A	24044	PASS	
+refseq	684	.	G	A	24044	PASS	
+
+# Return the first two lines of @vcf (the header lines) plus the next
+# $count lines, joined with newlines and without a trailing newline.
+sub variants {
+    my ($count) = @_;
+    my $last = $count + 1;
+    return join("\n", @vcf[0 .. $last]);
+}
+
+my ($output, $header) = ('', <<'');
+##fileformat=VCFv4.0
+##INFO=<ID=BasesToClosestVariant,Number=1,Type=Integer,Description="Number of bases to the closest variant in the file.">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+
+#
+# Various numbers of variant lines
+#
+($output) = run_ok(["vcfdistance"], variants(5));
+is $output, $header . <<'', "distances for 5 variant lines";
+refseq	502	.	G	A	38553	PASS	BasesToClosestVariant=50;
+refseq	552	.	G	A	24044	PASS	BasesToClosestVariant=50;
+refseq	660	.	G	A	38553	PASS	BasesToClosestVariant=18;
+refseq	678	.	G	A	24044	PASS	BasesToClosestVariant=6;
+refseq	684	.	G	A	24044	PASS	BasesToClosestVariant=6;
+
+($output) = run_ok(["vcfdistance"], variants(4));
+is $output, $header . <<'', "distances for 4 variant lines";
+refseq	502	.	G	A	38553	PASS	BasesToClosestVariant=50;
+refseq	552	.	G	A	24044	PASS	BasesToClosestVariant=50;
+refseq	660	.	G	A	38553	PASS	BasesToClosestVariant=18;
+refseq	678	.	G	A	24044	PASS	BasesToClosestVariant=18;
+
+($output) = run_ok(["vcfdistance"], variants(3));
+is $output, $header . <<'', "distances for 3 variant lines";
+refseq	502	.	G	A	38553	PASS	BasesToClosestVariant=50;
+refseq	552	.	G	A	24044	PASS	BasesToClosestVariant=50;
+refseq	660	.	G	A	38553	PASS	BasesToClosestVariant=108;
+
+($output) = run_ok(["vcfdistance"], variants(2));
+is $output, $header . <<'', "distances for 2 variant lines";
+refseq	502	.	G	A	38553	PASS	BasesToClosestVariant=50;
+refseq	552	.	G	A	24044	PASS	BasesToClosestVariant=50;
+
+($output) = run_ok(["vcfdistance"], variants(1));
+is $output, $header . <<'', "distances for 1 variant line";
+refseq	502	.	G	A	38553	PASS	
+
+($output) = run_ok(["vcfdistance"], variants(0));
+is $output, $header, "distances for 0 variant lines";
+
+#
+# Various combinations of reference sequences (obviously non-comparable)
+#
+@vcf = split /\n/, <<'';
+##fileformat=VCFv4.0
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+ref1	502	.	G	A	38553	PASS	
+ref2	552	.	G	A	24044	PASS	
+ref2	660	.	G	A	38553	PASS	
+ref2	678	.	G	A	24044	PASS	
+ref3	684	.	G	A	24044	PASS	
+
+($output) = run_ok(["vcfdistance"], variants(5));
+is $output, $header . <<'', "distances for 5 variant lines; three references";
+ref1	502	.	G	A	38553	PASS	
+ref2	552	.	G	A	24044	PASS	BasesToClosestVariant=108;
+ref2	660	.	G	A	38553	PASS	BasesToClosestVariant=18;
+ref2	678	.	G	A	24044	PASS	BasesToClosestVariant=18;
+ref3	684	.	G	A	24044	PASS	
+
+($output) = run_ok(["vcfdistance"], variants(4));
+is $output, $header . <<'', "distances for 4 variant lines, two references";
+ref1	502	.	G	A	38553	PASS	
+ref2	552	.	G	A	24044	PASS	BasesToClosestVariant=108;
+ref2	660	.	G	A	38553	PASS	BasesToClosestVariant=18;
+ref2	678	.	G	A	24044	PASS	BasesToClosestVariant=18;
+
+($output) = run_ok(["vcfdistance"], variants(3));
+is $output, $header . <<'', "distances for 3 variant lines, two references";
+ref1	502	.	G	A	38553	PASS	
+ref2	552	.	G	A	24044	PASS	BasesToClosestVariant=108;
+ref2	660	.	G	A	38553	PASS	BasesToClosestVariant=108;
+
+($output) = run_ok(["vcfdistance"], variants(2));
+is $output, $header . <<'', "distances for 2 variant lines, two references";
+ref1	502	.	G	A	38553	PASS	
+ref2	552	.	G	A	24044	PASS	
+
+done_testing;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/libvcflib.git