[med-svn] [libvcflib] 01/02: Imported Upstream version 0.0.20141212
Andreas Tille
tille at debian.org
Sun Feb 1 10:55:23 UTC 2015
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository libvcflib.
commit 47d88078ab2af69a961d344ac607a4e05a790980
Author: Andreas Tille <tille at debian.org>
Date: Sun Feb 1 11:54:34 2015 +0100
Imported Upstream version 0.0.20141212
---
.gitignore | 118 ++
.gitmodules | 21 +
LICENSE | 19 +
Makefile | 173 +++
README.md | 650 ++++++++++
bin/bed2region | 9 +
bin/plot_roc.r | 153 +++
bin/vcf2bed.py | 16 +
bin/vcf2sqlite.py | 130 ++
bin/vcf_strip_extra_headers | 18 +
bin/vcfbiallelic | 20 +
bin/vcfclearid | 12 +
bin/vcfclearinfo | 12 +
bin/vcfcomplex | 26 +
bin/vcffirstheader | 16 +
bin/vcfgtcompare.sh | 16 +
bin/vcfindelproximity | 82 ++
bin/vcfindels | 26 +
bin/vcfmultiallelic | 20 +
bin/vcfmultiway | 20 +
bin/vcfmultiwayscripts | 30 +
bin/vcfnobiallelicsnps | 29 +
bin/vcfnoindels | 25 +
bin/vcfnosnps | 25 +
bin/vcfnulldotslashdot | 22 +
bin/vcfplotaltdiscrepancy.r | 511 ++++++++
bin/vcfplotaltdiscrepancy.sh | 12 +
bin/vcfplotsitediscrepancy.r | 99 ++
bin/vcfplottstv.sh | 13 +
bin/vcfprintaltdiscrepancy.r | 37 +
bin/vcfprintaltdiscrepancy.sh | 18 +
bin/vcfqualfilter | 32 +
bin/vcfregionreduce | 29 +
bin/vcfregionreduce_and_cut | 32 +
bin/vcfregionreduce_pipe | 29 +
bin/vcfregionreduce_uncompressed | 29 +
bin/vcfremovenonATGC | 29 +
bin/vcfsnps | 26 +
bin/vcfsort | 3 +
bin/vcfvarstats | 225 ++++
samples/sample.vcf | 31 +
src/BedReader.h | 176 +++
src/Variant.cpp | 2405 ++++++++++++++++++++++++++++++++++++
src/Variant.h | 586 +++++++++
src/convert.h | 22 +
src/join.h | 36 +
src/mt19937ar.h | 192 +++
src/split.cpp | 23 +
src/split.h | 53 +
src/ssw.c | 834 +++++++++++++
src/ssw.h | 129 ++
src/ssw_cpp.cpp | 399 ++++++
src/ssw_cpp.h | 216 ++++
src/vcf2dag.cpp | 168 +++
src/vcf2fasta.cpp | 264 ++++
src/vcf2tsv.cpp | 241 ++++
src/vcfaddinfo.cpp | 111 ++
src/vcfafpath.cpp | 52 +
src/vcfallelicprimitives.cpp | 414 +++++++
src/vcfaltcount.cpp | 50 +
src/vcfannotate.cpp | 126 ++
src/vcfannotategenotypes.cpp | 220 ++++
src/vcfbreakmulti.cpp | 114 ++
src/vcfcat.cpp | 34 +
src/vcfcheck.cpp | 139 +++
src/vcfclassify.cpp | 162 +++
src/vcfcleancomplex.cpp | 71 ++
src/vcfcombine.cpp | 207 ++++
src/vcfcommonsamples.cpp | 85 ++
src/vcfcountalleles.cpp | 33 +
src/vcfcreatemulti.cpp | 197 +++
src/vcfdistance.cpp | 92 ++
src/vcfecho.cpp | 31 +
src/vcfentropy.cpp | 159 +++
src/vcfevenregions.cpp | 202 +++
src/vcffilter.cpp | 402 ++++++
src/vcffixup.cpp | 117 ++
src/vcfflatten.cpp | 178 +++
src/vcfgeno2alleles.cpp | 54 +
src/vcfgeno2haplo.cpp | 391 ++++++
src/vcfgenosamplenames.cpp | 39 +
src/vcfgenosummarize.cpp | 107 ++
src/vcfgenotypecompare.cpp | 327 +++++
src/vcfgenotypes.cpp | 66 +
src/vcfglbound.cpp | 178 +++
src/vcfglxgt.cpp | 171 +++
src/vcfhetcount.cpp | 72 ++
src/vcfhethomratio.cpp | 66 +
src/vcfindex.cpp | 42 +
src/vcfinfo2qual.cpp | 50 +
src/vcfinfosummarize.cpp | 212 ++++
src/vcfintersect.cpp | 577 +++++++++
src/vcfkeepgeno.cpp | 62 +
src/vcfkeepinfo.cpp | 68 +
src/vcfkeepsamples.cpp | 54 +
src/vcfleftalign.cpp | 781 ++++++++++++
src/vcflength.cpp | 49 +
src/vcfnumalt.cpp | 55 +
src/vcfoverlay.cpp | 109 ++
src/vcfparsealts.cpp | 42 +
src/vcfprimers.cpp | 140 +++
src/vcfqual2info.cpp | 44 +
src/vcfrandom.cpp | 70 ++
src/vcfrandomsample.cpp | 174 +++
src/vcfremap.cpp | 350 ++++++
src/vcfremoveaberrantgenotypes.cpp | 75 ++
src/vcfremovesamples.cpp | 76 ++
src/vcfroc.cpp | 469 +++++++
src/vcfsample2info.cpp | 218 ++++
src/vcfsamplediff.cpp | 200 +++
src/vcfsamplenames.cpp | 29 +
src/vcfsamplestats.cpp | 193 +++
src/vcfsitesummarize.cpp | 94 ++
src/vcfsom.cpp | 626 ++++++++++
src/vcfstats.cpp | 570 +++++++++
src/vcfstreamsort.cpp | 143 +++
src/vcfuniq.cpp | 49 +
src/vcfuniqalleles.cpp | 54 +
tests/lib/Local/vcflib/Test.pm | 32 +
tests/vcfdistance.t | 98 ++
120 files changed, 19059 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..99ce1a1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,118 @@
+*~
+.Rhistory
+.*swp
+.nfs*
+*.o
+BedReader.cpp
+Fasta.cpp
+Fasta.h
+Makefile.bad
+Multinomial.cpp
+Multinomial.h
+Pilot1
+Pilot2
+VCF.h
+VariantFilter.h
+b.vcf
+bugs/
+callgrind.out.7143
+f
+freebayes.chr20.integrated.nogeno.20101123.vcf
+glorder
+glorder.cpp
+glorder.py
+glorder.pyc
+gmon.out
+multimaptest.cpp
+pooled.sqlite
+pooled.sqlite3
+shunt
+shunt.c
+t.bed
+test.db
+test.vcf
+test.vcf.gz
+test.vcf.gz.tbi
+test/
+vcf2tsv
+vcfaddinfo
+vcfaddtag.cpp
+vcfafpath
+vcfallelicprimitives
+vcfaltcount
+vcfannotate
+vcfannotategenotypes
+vcfbreakmulti
+vcfcheck
+vcfclassify
+vcfcleancomplex
+vcfcommonsamples
+vcfcountalleles
+vcfcreatemulti
+vcfdistance
+vcfecho
+vcfentropy
+vcffilter
+vcffixup
+vcffixup.cpp.bak
+vcfflatten
+vcfgeno2haplo
+vcfgenotypecompare
+vcfgenotypes
+vcfglxgt
+vcfhaplotyecompare.cpp
+vcfhetcount
+vcfhethomratio
+vcfintersect
+vcfkeepfields
+vcfkeepgeno
+vcfkeepinfo
+vcfkeepsamples
+vcflength
+vcfmultiwaywwwindexfilter
+vcfnogeno
+vcfnogeno.cpp
+vcfnumalt
+vcfoverlay
+vcfparallel
+vcfparsealts
+vcfphylo.cpp
+vcfplotaltdiscrepancy.r.loess
+vcfplottstv.r
+vcfprimers
+vcfrandom
+vcfrandomsample
+vcfremap
+vcfremoveaberrantgenotypes
+vcfremovesamples
+vcfroc
+vcfsamplediff
+vcfsamplenames
+vcfsitesummarize
+vcfsom
+vcfsplit.cpp
+vcfstats
+vcfstreamsort
+vcfuniqalleles
+#vcfcountalleles.cpp#
+.vcfplotaltdiscrepancy.r.swo
+.vcfstats.cpp.swn
+.vcfstats.cpp.swo
+a.out
+vcfuniq
+vcfcat
+vcfevenregions
+vcfgenosummarize
+vcfgenosamplenames
+vcf2fasta
+bin/vcf2dag
+bin/vcfcombine
+bin/vcfgeno2alleles
+bin/vcfglbound
+bin/vcfindex
+bin/vcfinfo2qual
+bin/vcfinfosummarize
+bin/vcfleftalign
+bin/vcfqual2info
+bin/vcfsample2info
+libvcflib.a
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..9f675ec
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,21 @@
+[submodule "tabixpp"]
+ path = tabixpp
+ url = https://github.com/ekg/tabixpp.git
+[submodule "smithwaterman"]
+ path = smithwaterman
+ url = https://github.com/ekg/smithwaterman.git
+[submodule "multichoose"]
+ path = multichoose
+ url = https://github.com/ekg/multichoose.git
+[submodule "fastahack"]
+ path = fastahack
+ url = https://github.com/ekg/fastahack.git
+[submodule "intervaltree"]
+ path = intervaltree
+ url = https://github.com/ekg/intervaltree.git
+[submodule "fsom"]
+ path = fsom
+ url = https://github.com/ekg/fsom.git
+[submodule "filevercmp"]
+ path = filevercmp
+ url = https://github.com/ekg/filevercmp.git
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..0708937
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2012 Erik Garrison
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..5a45987
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,173 @@
+#OBJ_DIR = ./
+HEADERS = src/Variant.h \
+ src/split.h \
+ src/join.h
+SOURCES = src/Variant.cpp \
+ src/split.cpp
+OBJECTS= $(SOURCES:.cpp=.o)
+
+# TODO
+#vcfstats.cpp
+
+BIN_SOURCES = src/vcfecho.cpp \
+ src/vcfaltcount.cpp \
+ src/vcfhetcount.cpp \
+ src/vcfhethomratio.cpp \
+ src/vcffilter.cpp \
+ src/vcf2tsv.cpp \
+ src/vcfgenotypes.cpp \
+ src/vcfannotategenotypes.cpp \
+ src/vcfcommonsamples.cpp \
+ src/vcfremovesamples.cpp \
+ src/vcfkeepsamples.cpp \
+ src/vcfsamplenames.cpp \
+ src/vcfgenotypecompare.cpp \
+ src/vcffixup.cpp \
+ src/vcfclassify.cpp \
+ src/vcfsamplediff.cpp \
+ src/vcfremoveaberrantgenotypes.cpp \
+ src/vcfrandom.cpp \
+ src/vcfparsealts.cpp \
+ src/vcfstats.cpp \
+ src/vcfflatten.cpp \
+ src/vcfprimers.cpp \
+ src/vcfnumalt.cpp \
+ src/vcfcleancomplex.cpp \
+ src/vcfintersect.cpp \
+ src/vcfannotate.cpp \
+ src/vcfallelicprimitives.cpp \
+ src/vcfoverlay.cpp \
+ src/vcfaddinfo.cpp \
+ src/vcfkeepinfo.cpp \
+ src/vcfkeepgeno.cpp \
+ src/vcfafpath.cpp \
+ src/vcfcountalleles.cpp \
+ src/vcflength.cpp \
+ src/vcfdistance.cpp \
+ src/vcfrandomsample.cpp \
+ src/vcfentropy.cpp \
+ src/vcfglxgt.cpp \
+ src/vcfroc.cpp \
+ src/vcfcheck.cpp \
+ src/vcfstreamsort.cpp \
+ src/vcfuniq.cpp \
+ src/vcfuniqalleles.cpp \
+ src/vcfremap.cpp \
+ src/vcf2fasta.cpp \
+ src/vcfsitesummarize.cpp \
+ src/vcfbreakmulti.cpp \
+ src/vcfcreatemulti.cpp \
+ src/vcfevenregions.cpp \
+ src/vcfcat.cpp \
+ src/vcfgenosummarize.cpp \
+ src/vcfgenosamplenames.cpp \
+ src/vcfgeno2haplo.cpp \
+ src/vcfleftalign.cpp \
+ src/vcfcombine.cpp \
+ src/vcfgeno2alleles.cpp \
+ src/vcfindex.cpp \
+ src/vcf2dag.cpp \
+ src/vcfsample2info.cpp \
+ src/vcfqual2info.cpp \
+ src/vcfinfo2qual.cpp \
+ src/vcfglbound.cpp \
+ src/vcfinfosummarize.cpp
+
+# when we can figure out how to build on mac
+# src/vcfsom.cpp
+
+#BINS = $(BIN_SOURCES:.cpp=)
+BINS = $(addprefix bin/,$(notdir $(BIN_SOURCES:.cpp=)))
+SHORTBINS = $(notdir $(BIN_SOURCES:.cpp=))
+
+TABIX = tabixpp/tabix.o
+
+FASTAHACK = fastahack/Fasta.o
+
+SMITHWATERMAN = smithwaterman/SmithWatermanGotoh.o
+
+REPEATS = smithwaterman/Repeats.o
+
+INDELALLELE = smithwaterman/IndelAllele.o
+
+DISORDER = smithwaterman/disorder.o
+
+LEFTALIGN = smithwaterman/LeftAlign.o
+
+FSOM = fsom/fsom.o
+
+FILEVERCMP = filevercmp/filevercmp.o
+
+INCLUDES = -I. -L. -Ltabixpp/
+LDFLAGS = -lvcflib -ltabix -lz -lm
+
+
+all: $(OBJECTS) $(BINS)
+
+CXX = g++
+CXXFLAGS = -O3 -D_FILE_OFFSET_BITS=64
+#CXXFLAGS = -O2
+#CXXFLAGS = -pedantic -Wall -Wshadow -Wpointer-arith -Wcast-qual
+
+SSW = src/ssw.o src/ssw_cpp.o
+
+ssw.o: src/ssw.h
+ssw_cpp.o:src/ssw_cpp.h
+
+openmp:
+ $(MAKE) CXXFLAGS="$(CXXFLAGS) -fopenmp -D HAS_OPENMP"
+
+profiling:
+ $(MAKE) CXXFLAGS="$(CXXFLAGS) -g" all
+
+gprof:
+ $(MAKE) CXXFLAGS="$(CXXFLAGS) -pg" all
+
+$(OBJECTS): $(SOURCES) $(HEADERS) $(TABIX)
+ $(CXX) -c -o $@ src/$(*F).cpp $(INCLUDES) $(LDFLAGS) $(CXXFLAGS)
+
+$(TABIX):
+ cd tabixpp && $(MAKE)
+
+$(SMITHWATERMAN):
+ cd smithwaterman && $(MAKE)
+
+$(DISORDER): $(SMITHWATERMAN)
+
+$(REPEATS): $(SMITHWATERMAN)
+
+$(LEFTALIGN): $(SMITHWATERMAN)
+
+$(INDELALLELE): $(SMITHWATERMAN)
+
+$(FASTAHACK):
+ cd fastahack && $(MAKE)
+
+#$(FSOM):
+# cd fsom && $(CXX) $(CXXFLAGS) -c fsom.c -lm
+
+$(FILEVERCMP):
+ cd filevercmp && make
+
+$(SHORTBINS):
+ $(MAKE) bin/$@
+
+$(BINS): $(BIN_SOURCES) libvcflib.a $(OBJECTS) $(SMITHWATERMAN) $(FASTAHACK) $(DISORDER) $(LEFTALIGN) $(INDELALLELE) $(SSW) $(FILEVERCMP)
+ $(CXX) src/$(notdir $@).cpp -o $@ $(INCLUDES) $(LDFLAGS) $(CXXFLAGS)
+
+libvcflib.a: $(OBJECTS) $(SMITHWATERMAN) $(REPEATS) $(FASTAHACK) $(DISORDER) $(LEFTALIGN) $(INDELALLELE) $(SSW) $(FILEVERCMP) $(TABIX)
+ ar rs libvcflib.a $(OBJECTS) smithwaterman/sw.o $(FASTAHACK) $(SSW) $(FILEVERCMP) $(TABIX) tabixpp/bgzf.o tabixpp/index.o tabixpp/knetfile.o tabixpp/kstring.o
+
+
+test: $(BINS)
+ @prove -Itests/lib -w tests/*.t
+
+clean:
+ rm -f $(BINS) $(OBJECTS)
+ rm -f ssw_cpp.o ssw.o
+ rm -f libvcflib.a
+ cd tabixpp && make clean
+ cd smithwaterman && make clean
+ cd fastahack && make clean
+
+.PHONY: clean all test
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..13603b2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,650 @@
+# vcflib
+### a C++ library for parsing and manipulating VCF files.
+[![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/ekg/vcflib?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+
+#### author: Erik Garrison <erik.garrison at bc.edu>
+
+#### license: MIT
+
+---
+
+## overview
+
+The [Variant Call Format (VCF)](http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41)
+is a flat-file, tab-delimited textual format
+intended to concisely describe reference-indexed variations between individuals.
+VCF provides a common interchange format for the description of variation in individuals and populations of samples,
+and has become the _de facto_ standard reporting format for a wide array of genomic variant detectors.
+
+vcflib provides methods to manipulate and interpret sequence variation as it can be described by VCF.
+It is both:
+
+ * an API for parsing and operating on records of genomic variation as it can be described by the VCF format,
+ * and a collection of command-line utilities for executing complex manipulations on VCF files.
+
+The API itself provides a quick and extremely permissive method to read and write VCF files.
+Extensions and applications of the library provided in the included utilities (*.cpp) comprise the vast bulk of the library's utility for most users.
+
+## usage
+
+vcflib provides a variety of functions for VCF manipulation:
+
+### comparison
+
+ * Generate **haplotype-aware intersections** ([vcfintersect](#vcfintersect) -i), **unions** (vcfintersect -u), and **complements** (vcfintersect -v -i).
+ * **Overlay-merge** multiple VCF files together, using provided order as precedence ([vcfoverlay](#vcfoverlay)).
+ * **Combine** multiple VCF files together, handling samples when alternate allele descriptions are identical ([vcfcombine](#vcfcombine)).
+ * **Validate** the integrity and identity of the VCF by verifying that the VCF record's REF matches a given reference file ([vcfcheck](#vcfcheck)).
+
+### format conversion
+
+ * Convert a VCF file into a per-allele or per-genotype **tab-separated (.tsv)** file ([vcf2tsv](#vcf2tsv)).
+ * Store a VCF file in an **SQLite3** database (vcf2sqlite.py).
+ * Make a **BED file** from the intervals in a VCF file (vcf2bed.py).
+
+### filtering and subsetting
+
+ * **Filter** variants and genotypes using arbitrary expressions based on values in the INFO and sample fields ([vcffilter](#vcffilter)).
+ * **Randomly sample** a subset of records from a VCF file, given a rate ([vcfrandomsample](#vcfrandomsample)).
+ * **Select variants** of a certain type (vcfsnps, vcfbiallelic, vcfindels, vcfcomplex, etc.)
+
+### annotation
+
+ * **Annotate** one VCF file with fields from the INFO column of another, based on position ([vcfaddinfo](#vcfaddinfo), [vcfintersect](#vcfintersect)).
+ * Incorporate annotations or targets provided by a *BED* file ([vcfannotate](#vcfannotate), [vcfintersect](#vcfintersect)).
+ * Examine **genotype correspondence** between two VCF files by annotating samples in one file with genotypes from another ([vcfannotategenotypes](#vcfannotategenotypes)).
+ * Annotate variants with the **distance** to the nearest variant ([vcfdistance](#vcfdistance)).
+ * Count the number of alternate alleles represented in samples at each variant record ([vcfaltcount](#vcfaltcount)).
+ * **Subset INFO fields** to decrease file size and processing time ([vcfkeepinfo](#vcfkeepinfo)).
+ * Lighten up VCF files by keeping only a **subset of per-sample information** ([vcfkeepgeno](#vcfkeepgeno)).
+ * **Numerically index** alleles in a VCF file ([vcfindex](#vcfindex)).
+
+### samples
+
+ * Quickly obtain the **list of samples** in a given VCF file ([vcfsamplenames](#vcfsamplenames)).
+ * **Remove samples** from a VCF file ([vcfkeepsamples](#vcfkeepsamples), [vcfremovesamples](#vcfremovesamples)).
+
+### ordering
+
+ * **Sort variants** by genome coordinate ([vcfstreamsort](#vcfstreamsort)).
+ * **Remove duplicate** variants in vcfstreamsort'ed files according to their REF and ALT fields ([vcfuniq](#vcfuniq)).
+
+### variant representation
+
+ * **Break multiallelic** records into multiple records ([vcfbreakmulti](#vcfbreakmulti)), retaining allele-specific INFO fields.
+ * **Combine overlapping biallelic** records into a single record ([vcfcreatemulti](#vcfcreatemulti)).
+ * **Decompose complex variants** into a canonical SNP and indel representation ([vcfallelicprimitives](#vcfallelicprimitives)), generating phased genotypes for available samples.
+ * **Reconstitute complex variants** provided a phased VCF with samples ([vcfgeno2haplo](#vcfgeno2haplo)).
+ * **Left-align indel and complex variants** ([vcfleftalign](#vcfleftalign)).
+
+### genotype manipulation
+
+ * **Set genotypes** in a VCF file provided genotype likelihoods in the GL field ([vcfglxgt](#vcfglxgt)).
+ * Establish putative **somatic variants** using reported differences between germline and somatic samples ([vcfsamplediff](#vcfsamplediff)).
+ * Remove samples for which the reported genotype (GT) and observation counts (AO, RO) disagree ([vcfremoveaberrantgenotypes](#vcfremoveaberrantgenotypes)).
+
+### interpretation and classification of variants
+
+ * Obtain aggregate **statistics** about VCF files ([vcfstats](#vcfstats)).
+ * Print the **receiver-operating characteristic (ROC)** of one VCF given a truth set ([vcfroc](#vcfroc)).
+ * Annotate VCF records with the **Shannon entropy** of flanking sequence ([vcfentropy](#vcfentropy)).
+ * Calculate the heterozygosity rate ([vcfhetcount](#vcfhetcount)).
+ * Generate potential **primers** from VCF records ([vcfprimers](#vcfprimers)), to check for genome uniqueness.
+ * Convert the numerical representation of genotypes provided by the GT field to a **human-readable genotype format** ([vcfgenotypes](#vcfgenotypes)).
+ * Observe how different alignment parameters, including context and entropy-dependent ones, influence **variant classification and interpretation** ([vcfremap](#vcfremap)).
+ * **Classify variants** by annotations in the INFO field using a self-organizing map ([vcfsom](#vcfsom)); **re-estimate their quality** given known variants.
+
+
+A number of "helper" perl and python scripts (e.g. vcf2bed.py, vcfbiallelic) further extend functionality.
+
+In practice, users are encouraged to drive the utilities in the library in a streaming fashion, using pipes, to fully utilize resources on multi-core systems during interactive work. Piping provides a convenient method to interface with other libraries (vcf-tools, BedTools, GATK, htslib, bcftools, freebayes) which interface via VCF files, allowing the composition of an immense variety of processing functions.
+
+## development
+
+See src/vcfecho.cpp for basic usage. src/Variant.h and src/Variant.cpp describe methods available in the API.
+vcflib is incorporated into several projects, such as [freebayes](https://github.com/ekg/freebayes), which may provide a point of reference for prospective developers.
+Additionally, developers should be aware that vcflib contains submodules (git repositories) comprising its dependencies (outside of zlib and a *nix environment).
+
+
+## installing
+
+vcflib includes submodules, so to obtain vcflib you have to use:
+
+ % git clone --recursive git://github.com/ekg/vcflib.git
+
+or
+
+ % git clone --recursive https://github.com/ekg/vcflib.git
+
+To build, use Make:
+
+ % cd vcflib
+ % make
+
+Executables are built into the ./bin directory in the repository.
+A number of shell, perl, python, and R scripts already reside there.
+This makes installation easy, as users can add vcflib/bin to their path, or copy the contained executables to a directory already in their path.
+
+
+## executables
+
+### vcf2tsv
+
+ usage: vcf2tsv [-n null_string] [-g] [vcf file]
+ Converts stdin or given VCF file to tab-delimited format, using null string to replace empty values in the table.
+ Specifying -g will output one line per sample with genotype information.
+
+### vcfaddinfo
+
+ usage: vcfaddinfo <vcf file> <vcf file>
+ Adds info fields from the second file which are not present in the first vcf file.
+
+
+### vcfafpath
+
+Uses allele frequencies in the AF info column to estimate phylogeny at multiallelic sites.
+
+
+### vcfallelicprimitives
+
+ usage: vcfallelicprimitives [options] [file]
+
+ options:
+ -m, --use-mnps Retain MNPs as separate events (default: false)
+ -t, --tag-parsed FLAG Tag records which are split apart of a complex allele
+ with this flag
+
+ If multiple alleleic primitives (gaps or mismatches) are specified in
+ a single VCF record, split the record into multiple lines, but drop all
+ INFO fields. "Pure" MNPs are split into multiple SNPs unless the -m
+ flag is provided. Genotypes are phased where complex alleles have been
+ decomposed, provided genotypes in the input.
+
+
+### vcfaltcount
+
+Counts the number of alternate alleles in the record.
+
+
+### vcfannotate
+
+ usage: vcfannotate [options] [<vcf file>]
+
+ options:
+ -b, --bed use annotations provided by this BED file
+ -k, --key use this INFO field key for the annotations
+ -d, --default use this INFO field key for records without annotations
+
+ Intersect the records in the VCF file with targets provided in a BED file.
+ Intersections are done on the reference sequences in the VCF file.
+ If no VCF filename is specified on the command line (last argument) the VCF
+ read from stdin.
+
+
+### vcfannotategenotypes
+
+ usage: vcfannotategenotypes <annotation-tag> <vcf file> <vcf file>
+
+ annotates genotypes in the first file with genotypes in the second adding the
+ genotype as another flag to each sample filed in the first file.
+ annotation-tag is the name of the sample flag which is added to store the
+ annotation. also adds a 'has\_variant' flag for sites where the second file has
+ a variant.
+
+
+### vcfbreakmulti
+
+ usage: vcfbreakmulti [options] [file]
+
+ If multiple alleles are specified in a single record, break the record into
+ multiple lines, preserving allele-specific INFO fields.
+
+
+### vcfcheck
+
+ usage: vcfcheck [options] <vcf file>
+
+ options: -f, --fasta-reference FASTA reference file to use to obtain
+ primer sequences
+
+ Verifies that the VCF REF field matches the reference as described.
+
+
+
+### vcfcleancomplex
+
+Removes reference-matching sequence from complex alleles and adjusts records to
+reflect positional change.
+
+
+### vcfcombine
+
+ usage: vcfcombine [vcf file] [vcf file] ...
+
+ options:
+ -h --help This text.
+ -r --region REGION A region specifier of the form chrN:x-y to bound the merge
+
+Combines VCF files positionally, combining samples when sites and alleles are identical.
+Any number of VCF files may be combined. The INFO field and other columns are taken from
+one of the files which are combined when records in multiple files match. Alleles must
+have identical ordering to be combined into one record. If they do not, multiple records
+will be emitted.
+
+
+### vcfcommonsamples
+
+ usage: vcfcommonsamples <vcf file> <vcf file> outputs each record in the
+ first file, removing samples not present in the second
+
+
+### vcfcountalleles
+
+Counts the total number of alleles in the input.
+
+
+### vcfcreatemulti
+
+If overlapping alleles are represented across multiple records, merge them into a single record.
+
+### vcfdistance
+
+Adds a value to each VCF record indicating the distance to the nearest variant
+in the file.
+
+
+### vcfentropy
+
+ usage: vcfentropy [options] <vcf file>
+
+ options: -f, --fasta-reference FASTA reference file to use to obtain
+primer sequences -w, --window-size Size of the window over which to
+calculate entropy
+
+ Anotates the output VCF file with, for each record, EntropyLeft,
+EntropyRight, EntropyCenter, which are the entropies of the sequence of the
+given window size to the left, right, and center of the record.
+
+
+
+### vcffilter
+
+ usage: vcffilter [options] <vcf file>
+
+ options:
+ -f, --info-filter specifies a filter to apply to the info fields of records,
+ removes alleles which do not pass the filter
+ -g, --genotype-filter specifies a filter to apply to the genotype fields of records
+ -s, --filter-sites filter entire records, not just alleles
+ -t, --tag-pass tag vcf records as positively filtered with this tag, print all records
+ -F, --tag-fail tag vcf records as negatively filtered with this tag, print all records
+ -A, --append-filter append the existing filter tag, don't just replace it
+ -a, --allele-tag apply -t on a per-allele basis. adds or sets the corresponding INFO field tag
+ -v, --invert inverts the filter, e.g. grep -v
+ -o, --or use logical OR instead of AND to combine filters
+ -r, --region specify a region on which to target the filtering, requires a BGZF
+ compressed file which has been indexed with tabix. any number of
+ regions may be specified.
+
+ Filter the specified vcf file using the set of filters.
+ Filters are specified in the form "<ID> <operator> <value>:
+ -f "DP > 10" # for info fields
+ -g "GT = 1|1" # for genotype fields
+ -f "CpG" # for 'flag' fields
+
+ Operators can be any of: =, !, <, >, |, &
+
+ Any number of filters may be specified. They are combined via logical AND
+ unless --or is specified on the command line. Obtain logical negation through
+ the use of parentheses, e.g. "! ( DP = 10 )"
+
+ For convenience, you can specify "QUAL" to refer to the quality of the site, even
+ though it does not appear in the INFO fields.
+
+
+### vcffixup
+
+Count the allele frequencies across alleles present in each record in the
+VCF file. (Similar to vcftools --freq.)
+
+Uses genotypes from the VCF file to correct AC (alternate allele count), AF
+(alternate allele frequency), NS (number of called), in the VCF records. For
+example:
+
+ % vcfkeepsamples file.vcf NA12878 | vcffixup - | vcffilter -f "AC > 0"
+
+Would downsample file.vcf to only NA12878, removing sites for which the sample
+was not called as polymorphic.
+
+
+### vcfflatten
+
+ usage: vcfflatten [file]
+
+ Removes multi-allelic sites by picking the most common alternate. Requires
+ allele frequency specification 'AF' and use of 'G' and 'A' to specify the
+ fields which vary according to the Allele or Genotype. VCF file may be
+ specified on the command line or piped as stdin.
+
+
+### vcfgeno2haplo
+
+ usage: vcfgeno2haplo [options] [<vcf file>]
+
+ options:
+ -w, --window-size N compare records up to this many bp away (default 30)
+ -r, --reference FILE FASTA reference file, required with -i and -u
+
+ Convert genotype-based phased alleles within --window-size into haplotype alleles.
+
+
+
+### vcfgenotypecompare
+
+ usage: vcfgenotypecompare <other-genotype-tag> <vcf file>
+ adds statistics to the INFO field of the vcf file describing the
+ amount of discrepancy between the genotypes (GT) in the vcf file and the
+ genotypes reported in the <other-genotype-tag>. use this after
+ vcfannotategenotypes to get correspondence statistics for two vcfs.
+
+
+### vcfgenotypes
+
+Converts numerical representation of genotypes (standard in GT field) to the
+alleles provided in the call's ALT/REF fields.
+
+
+### vcfglxgt
+
+ usage: vcfglxgt [options] <vcf file>
+
+ options:
+ -n, --fix-null-genotypes only apply to null and partly-null genotypes
+
+ Set genotypes using the maximum genotype likelihood for each sample.
+
+
+
+### vcfhetcount
+
+Count the number of heterozygotes in the input VCF.
+
+
+### vcfhethomratio
+
+Provides the ratio between heterozygotes and homozygotes.
+
+### vcfindex
+
+Adds a field (id) which contains an allele-specific numerical index.
+
+### vcfintersect
+
+ usage: vcfintersect [options] [<vcf file>]
+
+ options:
+ -b, --bed FILE use intervals provided by this BED file
+ -v, --invert invert the selection, printing only records which would
+ not have been printed out
+ -i, --intersect-vcf FILE use this VCF for set intersection generation
+ -u, --union-vcf FILE use this VCF for set union generation
+ -w, --window-size N compare records up to this many bp away (default 30)
+ -r, --reference FILE FASTA reference file, required with -i and -u
+ -l, --loci output whole loci when one alternate allele matches
+ -m, --ref-match intersect on the basis of record REF string
+ -t, --tag TAG attach TAG to each record's info field if it would intersect
+ -V, --tag-value VAL use this value to indicate that the allele is passing
+ '.' will be used otherwise. default: 'PASS'
+ -M, --merge-from FROM-TAG
+ -T, --merge-to TO-TAG merge from FROM-TAG used in the -i file, setting TO-TAG
+ in the current file.
+
+ For bed-vcf intersection, alleles which fall into the targets are retained.
+
+ For vcf-vcf intersection and union, unify on equivalent alleles within window-size bp
+ as determined by haplotype comparison alleles.
+
+
+### vcfkeepgeno
+
+ usage: vcfkeepgeno <vcf file> [FIELD1] [FIELD2] ...
+ outputs each record in the vcf file, removing FORMAT fields not listed
+ on the command line from sample specifications in the output
+
+
+### vcfkeepinfo
+
+ usage: vcfkeepinfo <vcf file> [FIELD1] [FIELD2] ...
+ outputs each record in the vcf file, removing INFO fields not listed on the command line
+
+
+### vcfkeepsamples
+
+ usage: vcfkeepsamples <vcf file> [SAMPLE1] [SAMPLE2] ...
+ outputs each record in the vcf file, removing samples not listed on the command line
+
+
+### vcfleftalign
+
+Left-align indels and complex variants in the input using a pairwise ref/alt
+alignment followed by a heuristic, iterative left realignment process that
+shifts indel representations to their absolute leftmost (5') extent. This is
+the same procedure used in the internal left alignment in freebayes, and can be
+used when preparing VCF files for input to freebayes to decrease positional
+representation differences between the input alleles and left-realigned
+alignments.
+
+ usage: vcfleftalign [options] [file]
+
+ options:
+ -r, --reference FILE Use this reference as a basis for realignment.
+ -w, --window N Use a window of this many bp when left aligning (150).
+
+ Left-aligns variants in the specified input file or stdin. Window size is determined
+ dynamically according to the entropy of the regions flanking the indel. These must have
+ entropy > 1 bit/bp, or be shorter than ~5kb.
+
+
+### vcflength
+
+Adds the length of the variant record (in [-/+]) relative to the reference allele to each VCF record.
+
+
+### vcfnumalt
+
+Annotates the VCF stream on stdin with the number of alternate alleles at the site.
+
+
+### vcfoverlay
+
+ usage: vcfoverlay [options] [<vcf file> ...]
+
+ options:
+ -h, --help this dialog
+
+ Overlays records in the input vcf files in the order in which they appear.
+
+
+### vcfparsealts
+
+Demonstration of alternate allele parsing method. This method uses pairwise
+alignment of REF and ALTs to determine component allelic primitives for each
+alternate allele.
+
+Use `vcfallelicprimitives` to decompose records while preserving format.
+
+
+### vcfprimers
+
+ usage: vcfprimers [options] <vcf file>
+
+ options:
+ -f, --fasta-reference FASTA reference file to use to obtain primer sequences
+ -l, --primer-length The length of the primer sequences on each side of the variant
+
+ For each VCF record, extract the flanking sequences, and write them to stdout as FASTA
+ records suitable for alignment. This tool is intended for use in designing validation
+ experiments. Primers extracted which would flank all of the alleles at multi-allelic
+ sites. The name of the FASTA "reads" indicates the VCF record which they apply to.
+ The form is >CHROM_POS_LEFT for the 3' primer and >CHROM_POS_RIGHT for the 5' primer,
+ for example:
+
+ >20_233255_LEFT
+ CCATTGTATATATAGACCATAATTTCTTTATCCAATCATCTGTTGATGGA
+ >20_233255_RIGHT
+ ACTCAGTTGATTCCATACCTTTGCCATCATGAATCATGTTGTAATAAACA
+
+
+
+### vcfrandomsample
+
+ usage: vcfrandomsample [options] [<vcf file>]
+
+ options:
+ -r, --rate RATE base sampling probability per locus
+ -s, --scale-by KEY scale sampling likelihood by this Float info field
+ -p, --random-seed N use this random seed
+
+ Randomly sample sites from an input VCF file, which may be provided as stdin.
+ Scale the sampling probability by the field specified in KEY. This may be
+ used to provide uniform sampling across allele frequencies, for instance.
+
+
+### vcfremap
+
+ usage: vcfremap [options] [<vcf file>]
+
+ options:
+ -w, --ref-window-size N align using this many bases flanking each side of the reference allele
+ -s, --alt-window-size N align using this many flanking bases from the reference around each alternate allele
+ -r, --reference FILE FASTA reference file, required with -i and -u
+ -m, --match-score N match score for SW algorithm
+ -x, --mismatch-score N mismatch score for SW algorithm
+ -o, --gap-open-penalty N gap open penalty for SW algorithm
+ -e, --gap-extend-penalty N gap extension penalty for SW algorithm
+ -z, --entropy-gap-open use entropy scaling for the gap open penalty
+ -R, --repeat-gap-extend N penalize non-repeat-unit gaps in repeat sequence
+ -a, --adjust-vcf TAG supply a new cigar as TAG in the output VCF
+
+ For each alternate allele, attempt to realign against the reference with lowered gap open penalty.
+ If realignment is possible, adjust the cigar and reference/alternate alleles.
+
+
+### vcfremoveaberrantgenotypes
+
+Strips genotypes which are homozygous but have observations implying
+heterozygosity. Requires RA (reference allele observation) and AA (alternate
+allele observation) for each genotype.
+
+
+### vcfremovesamples
+
+ usage: vcfremovesamples <vcf file> [SAMPLE1] [SAMPLE2] ...
+ outputs each record in the vcf file, removing samples listed on the command line
+
+
+### vcfroc
+
+ usage: vcfroc [options] [<vcf file>]
+
+ options:
+ -t, --truth-vcf FILE use this VCF as ground truth for ROC generation
+ -w, --window-size N compare records up to this many bp away (default 30)
+ -r, --reference FILE FASTA reference file
+
+ Generates a pseudo-ROC curve using sensitivity and specificity estimated against
+ a putative truth set. Thresholding is provided by successive QUAL cutoffs.
+
+
+### vcfsamplediff
+
+ usage: vcfsamplediff <tag> <sample> <sample> [ <sample> ... ] <vcf file>
+ tags each record where the listed sample genotypes differ with <tag>
+ The first sample is assumed to be germline, the second somatic.
+ Each record is tagged with <tag>={germline,somatic,loh} to specify the type of
+ variant given the genotype difference between the two samples.
+
+
+### vcfsamplenames
+
+Prints the names of the samples in the VCF file.
+
+
+### vcfsom
+
+ usage: vcfsom [options] [vcf file]
+
+ training:
+ vcfsom -s output.som -f "AF DP ABP" training.vcf
+
+ application:
+ vcfsom -a output.som -f "AF DP ABP" test.vcf >results.vcf
+
+ vcfsom trains and/or applies a self-organizing map to the input VCF data
+ on stdin, adding two columns for the x and y coordinates of the winning
+ neuron in the network and an optional euclidean distance from a given
+ node (--center).
+
+ If a map is provided via --apply, map will be applied to input without
+ training. Automated filtering to an estimated FP rate is
+
+ options:
+
+ -h, --help this dialog
+
+ training:
+
+ -f, --fields "FIELD ..." INFO fields to provide to the SOM
+ -a, --apply FILE apply the saved map in FILE to input data
+ -s, --save FILE train on input data and save the map to FILE
+ -t, --print-training-results
+ print results of SOM on training input
+ (you can also just use --apply on the same input)
+ -x, --width X width in columns of the output array
+ -y, --height Y height in rows of the output array
+ -i, --iterations N number of training iterations or epochs
+ -d, --debug print timing information
+
+ recalibration:
+
+ -c, --center X,Y annotate with euclidean distance from center
+ -p, --paint-true VCF use VCF file to annotate true variants (multiple)
+ -f, --paint-false VCF use VCF file to annotate false variants (multiple)
+ -R, --paint-tag TAG provide estimated FDR% in TAG in variant INFO
+ -N, --false-negative replace FDR% (false detection) with FNR% (false negative)
+
+
+### vcfstats
+
+ usage: vcfstats [options] <vcf file>
+
+ -r, --region specify a region on which to target the stats, requires a BGZF
+ compressed file which has been indexed with tabix. any number of
+ regions may be specified.
+ -a, --add-info add the statistics intermediate information to the VCF file,
+ writing out VCF records instead of summary statistics
+ -l, --no-length-frequency don't output the indel and mnp length-frequency spectra
+ -m, --match-score N match score for SW algorithm
+ -x, --mismatch-score N mismatch score for SW algorithm
+ -o, --gap-open-penalty N gap open penalty for SW algorithm
+ -e, --gap-extend-penalty N gap extension penalty for SW algorithm
+
+ Prints statistics about variants in the input VCF file.
+
+
+### vcfstreamsort
+
+Reads VCF on stdin and guarantees that the positional order is correct provided out-of-order
+variants are no more than 100 positions in the VCF file apart.
+
+
+### vcfuniq
+
+Like GNU uniq, but for VCF records. Remove records which have the same position, ref, and alt
+as the previous record.
+
+
+### vcfuniqalleles
+
+For each record, remove any duplicate alternate alleles that may have resulted from merging
+separate VCF files.
diff --git a/bin/bed2region b/bin/bed2region
new file mode 100755
index 0000000..ffa40ef
--- /dev/null
+++ b/bin/bed2region
@@ -0,0 +1,9 @@
+#!/usr/bin/perl
+# Convert BED records on stdin into chrom:start-end region strings on stdout.
+while (<STDIN>) {
+    # Take only the first three columns so that optional BED columns (name,
+    # score, ...) do not leak into the end coordinate; skip lines that do not
+    # start with a chrom/start/end triple (blank lines, track headers).
+    my ($chrom, $pos, $end) = /^(\S+)\s+(\d+)\s+(\d+)/ or next;
+    print $chrom . ":" . $pos . "-" . $end . "\n";
+}
diff --git a/bin/plot_roc.r b/bin/plot_roc.r
new file mode 100755
index 0000000..5a99615
--- /dev/null
+++ b/bin/plot_roc.r
@@ -0,0 +1,153 @@
+#!/usr/bin/Rscript
+
+
+
+
+require(plyr)
+require(ggplot2)
+require(pracma)
+require(grid)
+
+argv <- commandArgs(trailingOnly = TRUE)
+
+prefix <- gsub("\\s","", argv[1])
+print(prefix)
+truthset <- argv[2]
+print(truthset)
+results <- argv[3]
+print(results)
+xmin <- as.numeric(argv[4])
+xmax <- as.numeric(argv[5])
+ymin <- as.numeric(argv[6])
+ymax <- as.numeric(argv[7])
+
+roc <- read.delim(results)
+
+bests <- ddply(roc, .(set), function(x) { data.frame(best_snps=with(x, min(false_negative_snps + false_positive_snps)), best_snp_threshold=min(subset(x, (false_negative_snps + false_positive_snps) == with(x, min(false_negative_snps + false_positive_snps)))$threshold ), best_indels=with(x, min(false_negative_indels + false_positive_indels)), best_indel_threshold=min(subset(x, (false_negative_indels + false_positive_indels) == with(x, min(false_negative_indels + false_positive_indels)))$th [...]
+
+write.table(bests, paste(prefix, ".bests.tsv", sep=""), row.names=FALSE, quote=FALSE, sep="\t")
+
+#abs(trapz(c(1, roc$complexfpr), c(1, roc$complextpr)))
+
+true_snps <- with(subset(roc, set==truthset), max(num_snps))
+true_indels <- with(subset(roc, set==truthset), max(num_indels))
+
+# get ROC AUC
+auc <- ddply(roc, .(set),
+ function(x) {
+ data.frame(
+ snp_auc=ifelse(true_snps>0,
+ with(x,
+ abs(trapz(c(1,
+ false_positive_snps/(false_positive_snps+ max(false_negative_snps + num_snps - false_positive_snps))),
+ c(max(1- false_negative_snps/true_snps),
+ 1- false_negative_snps/true_snps)))),
+ 0),
+ indel_auc=ifelse(true_indels>0,
+ with(x,
+ abs(trapz(c(1,
+ false_positive_indels/(false_positive_indels+ max(false_negative_indels + num_indels - false_positive_indels))),
+ c(max(1- false_negative_indels/true_indels),
+ 1- false_negative_indels/true_indels)))),
+ 0)
+ )
+ }
+ )
+
+write.table(auc, paste(prefix, ".auc.tsv", sep=""), row.names=FALSE, quote=FALSE, sep="\t")
+
+
+rocsnps <- ddply(roc, .(set),
+ function(x) {
+ data.frame(
+ FPR=
+ with(x,
+ c(1,
+ false_positive_snps/(false_positive_snps+ max(false_negative_snps + num_snps - false_positive_snps)))),
+ TPR=
+ with(x,
+ c(max(1- false_negative_snps/true_snps),
+ 1- false_negative_snps/true_snps)),
+ type=as.factor("snps")
+ )
+ }
+ )
+
+rocindels <- ddply(roc, .(set),
+ function(x) {
+ data.frame(
+ FPR=
+ with(x,
+ c(1,
+ false_positive_indels/(false_positive_indels+ max(false_negative_indels + num_indels - false_positive_indels)))),
+ TPR=
+ with(x,
+ c(max(1- false_negative_indels/true_indels),
+ 1- false_negative_indels/true_indels)),
+ type=as.factor("indels")
+ )
+ }
+ )
+
+
+if (FALSE) {
+if (true_snps>0) {
+ ggplot(subset(roc, set != truthset),
+ aes(false_positive_snps/(false_positive_snps+with(subset(roc, set==set), max(false_negative_snps + num_snps - false_positive_snps))),
+ 1- false_negative_snps/with(subset(roc, set==set), max(false_negative_snps + num_snps - false_positive_snps)),
+ group=set,
+ color=set)) + scale_x_continuous("false positive rate") + scale_y_continuous("true positive rate") + geom_path() + theme_bw()
+ + coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
+ ggsave(paste(prefix, ".snps.png", sep=""), height=6, width=9)
+}
+
+if (true_indels>0) {
+ ggplot(subset(roc, set != truthset),
+ aes(false_positive_indels/(false_positive_indels+with(subset(roc, set==set), max(false_negative_indels + num_indels - false_positive_indels))),
+ 1- false_negative_indels/with(subset(roc, set==set), max(false_negative_indels + num_indels - false_positive_indels)),
+ group=set,
+ color=set)) + scale_x_continuous("false positive rate") + scale_y_continuous("true positive rate") + geom_path() + theme_bw()
+ + coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
+ ggsave(paste(prefix, ".indels.png", sep=""), height=6, width=9)
+}
+}
+
+
+# new versions
+if (true_snps>0) {
+ ggplot(subset(rocsnps, set != truthset),
+ aes(FPR,
+ TPR,
+ group=set,
+ color=set)) + scale_x_continuous("false positive rate") + scale_y_continuous("true positive rate") + geom_path() + theme_bw() + coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
+ ggsave(paste(prefix, ".snps.png", sep=""), height=6, width=9)
+}
+
+if (true_indels>0) {
+ ggplot(subset(rocindels, set != truthset),
+ aes(FPR,
+ TPR,
+ group=set,
+ color=set)) + scale_x_continuous("false positive rate") + scale_y_continuous("true positive rate") + geom_path() + theme_bw() + coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
+ ggsave(paste(prefix, ".indels.png", sep=""), height=6, width=9)
+}
+
+if (true_indels>0 && true_snps>0) {
+
+(
+ ggplot(subset(rbind(rocsnps,rocindels), set != truthset),
+ aes(FPR,
+ TPR,
+ group=set,
+ color=set))
+ + scale_x_continuous("false positive rate")
+ + scale_y_continuous("true positive rate")
+ + geom_path()
+ + theme_bw()
+ + coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
+ + facet_grid(type ~ .)
+ + theme(panel.margin = unit(1, "lines"))
+)
+ ggsave(paste(prefix, ".both.png", sep=""), height=5, width=5)
+
+}
diff --git a/bin/vcf2bed.py b/bin/vcf2bed.py
new file mode 100755
index 0000000..c04cb60
--- /dev/null
+++ b/bin/vcf2bed.py
@@ -0,0 +1,16 @@
+#!/usr/bin/python
+
+import sys
+
+for line in sys.stdin:
+ if line.startswith('#'):
+ continue
+ fields = line.strip().split()
+ # VCF is 1-based, BED is 0-based half open
+ # print out chrom, start, end,
+ chrom = fields[0]
+ start = int(fields[1]) - 1
+ span = len(fields[3]) # handle multi-base alleles
+ end = start + span
+ name = fields[2]
+ print "\t".join(map(str, [chrom, start, end, name]))
diff --git a/bin/vcf2sqlite.py b/bin/vcf2sqlite.py
new file mode 100755
index 0000000..f0170b0
--- /dev/null
+++ b/bin/vcf2sqlite.py
@@ -0,0 +1,130 @@
+#!/usr/bin/python
+
+import sys
+import re
+import sqlite3
+
+if len(sys.argv) < 2:
+ print "usage", sys.argv[0], " [dbname]"
+ print "reads VCF on stdin, and writes output to a sqlite3 db [dbname]"
+ exit(1)
+
+dbname = sys.argv[1]
+
+# parse the header
+# into a mapping from tag -> type
+
+infotypes = {}
+infonumbers = {}
+
+for line in sys.stdin:
+ if line.startswith('##INFO'):
+ #<ID=XRS,Number=1,Type=Float,
+ i = re.search("ID=(.*?),", line)
+ n = re.search("Number=(.*?),", line)
+ t = re.search("Type=(.*?),", line)
+ if i and n and t:
+ id = i.groups()[0]
+ number = n.groups()[0]
+ if number == "A":
+ number = -1
+ elif number == "G" or int(number) > 1:
+ # unclear how to deal with these
+ continue
+ else:
+ number = int(number)
+ typestr = t.groups()[0]
+ infotypes[id] = typestr
+ infonumbers[id] = number
+ else:
+ continue
+ elif line.startswith('##'):
+ continue
+ else:
+ break # header line, sample names etc.
+
+# write the table schema
+
+infotype_to_sqltype = {}
+infotype_to_sqltype["Flag"] = "boolean"
+infotype_to_sqltype["Integer"] = "integer"
+infotype_to_sqltype["Float"] = "real"
+infotype_to_sqltype["String"] = "text"
+
+tablecmd = """create table alleles"""
+specs = ["CHROM text",
+ "POS integer",
+ "ID text",
+ "REF text",
+ "ALT text",
+ "QUAL real",
+ "FILTER text"]
+
+sorted_fields = sorted(infotypes.keys())
+for field in sorted_fields:
+ infotype = infotypes[field]
+ sqltype = infotype_to_sqltype[infotype]
+ field = field.replace(".", "_") # escape periods, which are not allowed
+ specs.append(field + " " + sqltype)
+
+tablecmd += " (" + ", ".join(specs) + ")"
+
+conn = sqlite3.connect(dbname)
+conn.execute(tablecmd)
+
+# for each record
+# parse the record
+# for each allele
+
+for line in sys.stdin:
+ fields = line.split('\t')
+ chrom, pos, id, ref, alts, qual, filter, info = fields[:8]
+ alts = alts.split(",")
+ altindex = 0
+ chrom = "\'" + chrom + "\'"
+ id = "\'" + id + "\'"
+ ref = "\'" + ref + "\'"
+ filter = "\'" + filter + "\'"
+ for alt in alts:
+ alt = "\'" + alt + "\'"
+ info_values = {}
+ for pair in info.split(";"):
+ if pair.find("=") is not -1:
+ pair = pair.split("=")
+ key = pair[0]
+ value = pair[1]
+ if not infonumbers.has_key(key):
+ continue
+ if infonumbers[key] == -1:
+ values = value.split(",")
+ value = values[altindex]
+ info_values[key] = value
+ else:
+ # boolean flag
+ info_values[pair] = "1"
+ ordered_insertion = []
+ for field in sorted_fields:
+ value = "null"
+ if info_values.has_key(field):
+ value = info_values[field]
+ if infotypes[field] == "String":
+ value = "\'" + value + "\'"
+ else:
+ # missing flag means "false" for that flag
+ if infotypes[field] == "Flag":
+ value = "0"
+ ordered_insertion.append(value)
+ cmd = "insert into alleles values (" \
+ + ", ".join([chrom, pos, id, ref, alt, qual, filter]) \
+ + ", " \
+ + ", ".join(ordered_insertion) + ")"
+ conn.execute(cmd)
+ altindex += 1
+
+conn.commit()
+
+# TODO ignoring samples (for now)
+
+# add indexes everywhere?
+
+conn.close()
diff --git a/bin/vcf_strip_extra_headers b/bin/vcf_strip_extra_headers
new file mode 100755
index 0000000..c8b05e6
--- /dev/null
+++ b/bin/vcf_strip_extra_headers
@@ -0,0 +1,18 @@
+#!/usr/bin/perl
+
+my $seen_non_header = 0;
+
+while (<STDIN>) {
+ if (! $seen_non_header) {
+ if (/^#/) {
+ } else {
+ $seen_non_header = 1;
+ }
+ print;
+ } else {
+ if (! /^#/) {
+ print;
+ }
+ }
+
+}
diff --git a/bin/vcfbiallelic b/bin/vcfbiallelic
new file mode 100755
index 0000000..7761fec
--- /dev/null
+++ b/bin/vcfbiallelic
@@ -0,0 +1,20 @@
+#!/usr/bin/perl
+#
+
+while (<STDIN>) {
+ if ($_ =~ /^#/) {
+ print;
+ } else {
+ $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+ $chrom = $1;
+ $pos = $2;
+ $tag = $3;
+ $ref = $4;
+ $alt = $5;
+ if ($alt =~ /,/) {
+ # remove anything which isn't biallelic
+ } else {
+ print;
+ }
+ }
+}
diff --git a/bin/vcfclearid b/bin/vcfclearid
new file mode 100755
index 0000000..f428682
--- /dev/null
+++ b/bin/vcfclearid
@@ -0,0 +1,12 @@
+#!/usr/bin/python
+#
+
+import sys
+
+for line in sys.stdin:
+ if line.startswith("#"):
+ print line.strip()
+ else:
+ fields = line.strip().split("\t")
+ fields[2] = "."
+ print "\t".join(fields)
diff --git a/bin/vcfclearinfo b/bin/vcfclearinfo
new file mode 100755
index 0000000..a0512fd
--- /dev/null
+++ b/bin/vcfclearinfo
@@ -0,0 +1,12 @@
+#!/usr/bin/python
+#
+
+import sys
+
+for line in sys.stdin:
+ if line.startswith("#"):
+ print line.strip()
+ else:
+ fields = line.strip().split("\t")
+ fields[7] = "."
+ print "\t".join(fields)
diff --git a/bin/vcfcomplex b/bin/vcfcomplex
new file mode 100755
index 0000000..9c2b188
--- /dev/null
+++ b/bin/vcfcomplex
@@ -0,0 +1,26 @@
+#!/usr/bin/perl
+#
+
+while (<STDIN>) {
+ if ($_ =~ /^#/) {
+ print;
+ } else {
+ $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+ $chrom = $1;
+ $pos = $2;
+ $tag = $3;
+ $ref = $4;
+ $alts = $5;
+ $hasindel = 0;
+ @alts = split(/,/, $alts);
+ $snp = 1;
+ foreach $alt (@alts) {
+ if (length($ref) > 1 || length($alt) != length($ref)) {
+ $snp = 0;
+ }
+ }
+ if (!$snp) {
+ print;
+ }
+ }
+}
diff --git a/bin/vcffirstheader b/bin/vcffirstheader
new file mode 100755
index 0000000..2e77c2e
--- /dev/null
+++ b/bin/vcffirstheader
@@ -0,0 +1,16 @@
+#!/usr/bin/python
+
+import sys
+
+header=True
+for line in sys.stdin:
+ if line.startswith('##'):
+ if header:
+ print line.strip()
+ continue
+ elif line.startswith('#'):
+ if header:
+ print line.strip()
+ header=False
+ continue
+ print line.strip()
diff --git a/bin/vcfgtcompare.sh b/bin/vcfgtcompare.sh
new file mode 100755
index 0000000..d563313
--- /dev/null
+++ b/bin/vcfgtcompare.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Runs vcfcommonsamples | vcfannotategenotypes | vcfgenotypecompare on two files.
+if [ $# -ne 3 ];
+then
+    echo "usage: $0 [annotation] [fileA] [fileB]"
+    echo "annotates records in the first file with genotypes and sites from the second"
+    exit 1 # a wrong argument count is an error, not a success
+fi
+
+annotation=$1
+fileA=$2
+fileB=$3
+# quoted so that file names containing spaces or globs stay single arguments
+vcfcommonsamples "$fileA" "$fileB" \
+    | vcfannotategenotypes "$annotation" - "$fileB" \
+    | vcfgenotypecompare "$annotation" -
diff --git a/bin/vcfindelproximity b/bin/vcfindelproximity
new file mode 100755
index 0000000..982ba81
--- /dev/null
+++ b/bin/vcfindelproximity
@@ -0,0 +1,82 @@
+#!/usr/bin/perl
+#
+# usage: vcfindelproximity <prox> <in.vcf >out.vcf
+#
+# Drops non-indel records which are closer than <prox> bp to an indel on the
+# same chromosome.  Header lines and indel records are always printed.  Only
+# the previous and the current indel are looked at, so the input is expected
+# to be sorted by position within each chromosome.
+#
+# for line in the vcf
+#     stuff the line into a queue
+#     when you reach an indel, a new chromosome or the end of the input
+#         pop lines from the back of the queue, printing indels and those
+#         other records which are far enough from the neighbouring indels
+#
+
+use strict;
+use warnings;
+
+# queued records as [line, position, is_indel]; the oldest one is at the back
+my @lines;
+
+# minimum distance to an indel; without an argument nothing is filtered out
+# because every distance is >= 0
+my $prox = defined $ARGV[0] ? $ARGV[0] : 0;
+
+my $lastchrom = "";
+
+# position of the most recent indel on this chromosome, 0 if there is none
+my $indelpos = 0;
+
+# Print and remove everything in the queue.  Indels are always printed, other
+# records only if they are at least $prox away from each position in @_.
+sub drain {
+    my @indels = @_;
+    while (@lines) {
+        my ($line, $p, $isindel) = @{ pop(@lines) };
+        my $near = grep { abs($_ - $p) < $prox } @indels;
+        if ($isindel or !$near) {
+            print $line;
+        }
+    }
+}
+
+while (<STDIN>) {
+
+    if ($_ =~ /^#/) {
+        print $_;
+        next;
+    }
+
+    # CHROM, POS, ID, REF, ALT; a line without them (e.g. a blank line) is
+    # not a record and is skipped instead of reusing stale captures
+    my ($chrom, $pos, $tag, $ref, $alt) =
+        $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/ or next;
+
+    # if new chrom, print out everything from last one; positions are not
+    # comparable across chromosomes, so the previous indel is forgotten
+    if ($chrom ne $lastchrom) {
+        drain($indelpos ? ($indelpos) : ());
+        $lastchrom = $chrom;
+        $indelpos = 0;
+    }
+
+    # an indel is any record with an ALT allele whose length differs from
+    # REF; ALT may hold several comma-separated alleles
+    my $isindel = grep { length($_) != length($ref) } split(/,/, $alt);
+
+    unshift(@lines, [$_, $pos, $isindel]);
+
+    if ($isindel) {
+        # this indel is queued as well, so drain() prints it after the
+        # records that came before it
+        my @near = $indelpos ? ($indelpos, $pos) : ($pos);
+        drain(@near);
+        $indelpos = $pos;
+    }
+}
+
+# flush lines end of file; they follow the last indel, so they are still
+# filtered against its position
+drain($indelpos ? ($indelpos) : ());
diff --git a/bin/vcfindels b/bin/vcfindels
new file mode 100755
index 0000000..1b92a45
--- /dev/null
+++ b/bin/vcfindels
@@ -0,0 +1,26 @@
+#!/usr/bin/perl
+#
+
+while (<STDIN>) {
+ if ($_ =~ /^#/) {
+ print;
+ } else {
+ $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+ $chrom = $1;
+ $pos = $2;
+ $tag = $3;
+ $ref = $4;
+ $alts = $5;
+ $hasindel = 0;
+ @alts = split(/,/, $alts);
+ $snp = 1;
+ foreach $alt (@alts) {
+ if (length($alt) != length($ref)) {
+ $snp = 0;
+ }
+ }
+ if (!$snp) {
+ print;
+ }
+ }
+}
diff --git a/bin/vcfmultiallelic b/bin/vcfmultiallelic
new file mode 100755
index 0000000..47f3dce
--- /dev/null
+++ b/bin/vcfmultiallelic
@@ -0,0 +1,20 @@
+#!/usr/bin/perl
+#
+
+while (<STDIN>) {
+ if ($_ =~ /^#/) {
+ print;
+ } else {
+ $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+ $chrom = $1;
+ $pos = $2;
+ $tag = $3;
+ $ref = $4;
+ $alt = $5;
+ if ($alt =~ /,/) {
+ print;
+ } else {
+ # remove anything which isn't multiallelic
+ }
+ }
+}
diff --git a/bin/vcfmultiway b/bin/vcfmultiway
new file mode 100755
index 0000000..9536a65
--- /dev/null
+++ b/bin/vcfmultiway
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+reference=$1
+shift
+
+echo comparing $@
+
+for fileA in $@;
+do
+ for fileB in $@;
+ do
+ if [ "$fileA" = "$fileB" ]
+ then
+ vcfstats $fileA >$(basename $fileA).stats.txt
+ else
+ vcfintersect -r $reference -i $fileA $fileB | vcfstats >$(basename $fileA).common.$(basename $fileB).stats.txt
+ vcfintersect -r $reference -v -i $fileB $fileA | vcfstats >$(basename $fileA).unique.$(basename $fileB).stats.txt
+ fi
+ done
+done
diff --git a/bin/vcfmultiwayscripts b/bin/vcfmultiwayscripts
new file mode 100755
index 0000000..8373805
--- /dev/null
+++ b/bin/vcfmultiwayscripts
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+reference=$1
+outdir=$2
+scriptsdir=$3
+shift
+shift
+shift
+
+mkdir -p $outdir
+mkdir -p $scriptsdir
+
+echo comparing $@
+
+for fileA in $@;
+do
+ fileA=$(pwd)/$fileA
+ for fileB in $@;
+ do
+ fileB=$(pwd)/$fileB
+ echo $fileA vs $fileB
+ if [ "$fileA" = "$fileB" ]
+ then
+ echo "vcfstats $fileA >$outdir/$(basename $fileA).stats" >$scriptsdir/$(basename $fileA).sh
+ else
+ echo "vcfintersect -r $reference -i $fileA $fileB | vcfstats >$outdir/$(basename $fileA).common.$(basename $fileB).stats" >$scriptsdir/$(basename $fileA).common.$(basename $fileB).sh
+ echo "vcfintersect -r $reference -v -i $fileB $fileA | vcfstats >$outdir/$(basename $fileA).unique.$(basename $fileB).stats" >$scriptsdir/$(basename $fileA).unique.$(basename $fileB).sh
+ fi
+ done
+done
diff --git a/bin/vcfnobiallelicsnps b/bin/vcfnobiallelicsnps
new file mode 100755
index 0000000..2433eee
--- /dev/null
+++ b/bin/vcfnobiallelicsnps
@@ -0,0 +1,29 @@
+#!/usr/bin/perl
+#
+
+while (<STDIN>) {
+ if ($_ =~ /^#/) {
+ print;
+ } else {
+ $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+ $chrom = $1;
+ $pos = $2;
+ $tag = $3;
+ $ref = $4;
+ $alts = $5;
+ $hasnonsnp = 0;
+ $biallelic = 1;
+ if ($alts =~ /,/) {
+ $biallelic = 0;
+ }
+ @alts = split(/,/, $alts);
+ foreach $alt (@alts) {
+ if (!(length($alt)==1 && length($alt) == length($ref))) {
+ $hasnonsnp = 1;
+ }
+ }
+ if ($hasnonsnp || !$biallelic) {
+ print;
+ }
+ }
+}
diff --git a/bin/vcfnoindels b/bin/vcfnoindels
new file mode 100755
index 0000000..c2051ac
--- /dev/null
+++ b/bin/vcfnoindels
@@ -0,0 +1,25 @@
+#!/usr/bin/perl
+#
+
+while (<STDIN>) {
+ if ($_ =~ /^#/) {
+ print;
+ } else {
+ $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+ $chrom = $1;
+ $pos = $2;
+ $tag = $3;
+ $ref = $4;
+ $alts = $5;
+ $hasindel = 0;
+ @alts = split(/,/, $alts);
+ foreach $alt (@alts) {
+ if (length($alt ) != length($ref)) {
+ $hasindel = 1;
+ }
+ }
+ if (! $hasindel) {
+ print;
+ }
+ }
+}
diff --git a/bin/vcfnosnps b/bin/vcfnosnps
new file mode 100755
index 0000000..19ee084
--- /dev/null
+++ b/bin/vcfnosnps
@@ -0,0 +1,25 @@
+#!/usr/bin/perl
+#
+
+while (<STDIN>) {
+ if ($_ =~ /^#/) {
+ print;
+ } else {
+ $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+ $chrom = $1;
+ $pos = $2;
+ $tag = $3;
+ $ref = $4;
+ $alts = $5;
+ $hasnonsnp = 0;
+ @alts = split(/,/, $alts);
+ foreach $alt (@alts) {
+ if (!(length($alt)==1 && length($alt) == length($ref))) {
+ $hasnonsnp = 1;
+ }
+ }
+ if ($hasnonsnp) {
+ print;
+ }
+ }
+}
diff --git a/bin/vcfnulldotslashdot b/bin/vcfnulldotslashdot
new file mode 100755
index 0000000..9951a34
--- /dev/null
+++ b/bin/vcfnulldotslashdot
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+
+import sys
+import math
+
+def bincoeff(n,k): return math.factorial(n) / (math.factorial(n-k)*math.factorial(k))
+def multcoeff(n,k): return bincoeff(n+k-1,k)
+
+for line in sys.stdin:
+ if line.startswith("#"):
+ print line.strip()
+ continue
+ fields = line.strip().split("\t")
+ alleles = len(fields[4].split(","))+1
+ # assume that we have GT:GL
+ # how many genotypes? assume diploid
+ flatgls = ",".join(map(str,[0]*multcoeff(alleles,2)))
+ for i in range(9, len(fields)):
+ if fields[i] == ".":
+ fields[i] = "./.:" + flatgls
+ print "\t".join(fields)
+
diff --git a/bin/vcfplotaltdiscrepancy.r b/bin/vcfplotaltdiscrepancy.r
new file mode 100755
index 0000000..8717987
--- /dev/null
+++ b/bin/vcfplotaltdiscrepancy.r
@@ -0,0 +1,511 @@
+#!/usr/bin/Rscript
+# usage: vcfplotaltdiscrepancy.r <filename> <tag>; the input table is read from stdin
+# helper functions
+# nan.to.zero: return 0 when n is NaN (e.g. the result of 0/0), otherwise return n unchanged
+nan.to.zero <- function(n) {
+ if (is.nan(n)) return(0) else return(n)
+}
+
+
+# read the input table (first row is the header) from stdin and keep only rows with AC > 0
+vcf <- subset(read.table(pipe('cat /dev/stdin'), header=T), AC > 0)
+# positional arguments: prefix of the output PDF names, then the tag that prefixes the column names
+filename <- commandArgs(TRUE)[1]
+tag <- commandArgs(TRUE)[2]
+# NOTE(review): tag.genotypes_count repeats the '.genotypes.alternate_count' suffix and is not referenced again in this script -- confirm the intended column
+tag.genotypes_count <- paste(tag, '.genotypes.alternate_count', sep='')
+tag.genotypes_alternate_count <- paste(tag, '.genotypes.alternate_count', sep='')
+tag.non_reference_discrepancy_count <- paste(tag, '.site.non_reference_discrepancy.count', sep='')
+tag.non_reference_discrepancy_normalizer <- paste(tag, '.site.non_reference_discrepancy.normalizer', sep='')
+tag.non_reference_sensitivity_count <- paste(tag, '.site.non_reference_sensitivity.count', sep='')
+tag.non_reference_sensitivity_normalizer <- paste(tag, '.site.non_reference_sensitivity.normalizer', sep='')
+tag.alternate_positive_discrepancy <- paste(tag, '.site.alternate_positive_discrepancy', sep='')
+tag.alternate_negative_discrepancy <- paste(tag, '.site.alternate_negative_discrepancy', sep='')
+tag.has_variant <- paste(tag, '.has_variant', sep='')
+# summary statistics over all retained rows
+vcf.numberOfSites <- length(vcf[, tag.genotypes_alternate_count])
+vcf.totalAltAlleles <- sum(vcf[, tag.genotypes_alternate_count])
+vcf.positiveDiscrepancy <- sum(vcf[, tag.alternate_positive_discrepancy]) / sum(vcf[, tag.genotypes_alternate_count])
+vcf.negativeDiscrepancy <- sum(vcf[, tag.alternate_negative_discrepancy]) / sum(vcf[, tag.genotypes_alternate_count])
+vcf.sitesTruePositive <- mean(vcf[, tag.has_variant])
+
+min_sites <- 5 # number of sites required for "simple plotting"
+
+#library(ggplot2)
+#vcf2 <- data.frame(QUAL=vcf$QUAL, AC=vcf$AC, has_variant=vcf[, tag.has_variant])
+#qplot(AC, has_variant, group=AC, geom="boxplot", data=subset(vcf2, AC <= 20))
+#ggsave(paste(filename, '.', tag, '.PD.vs.AC.boxplot.ac_lt_20.pdf', sep=''))
+
+# print the summary statistics
+cat('number of sites', vcf.numberOfSites, '\n')
+cat('total alternate alleles', vcf.totalAltAlleles, '\n')
+cat('positive discrepancy', vcf.positiveDiscrepancy, '\n')
+cat('negative discrepancy', vcf.negativeDiscrepancy, '\n')
+
+#x <- cbind(tapply(vcf, as.list(seq(0,max(vcf$AC))),
+# function(x) {
+# sum(x[, tag.alternate_positive_discrepancy]) / sum(x[, tag.genotypes_alternate_count])
+# }))
+# byac: one row per alternate allele count from 1 to max(AC); the columns below are added row by row
+byac <- data.frame(ac=as.vector(seq(1,max(vcf$AC)))) #, fdr=as.vector(x))
+
+# fdr: per-AC positive discrepancy divided by the alternate genotype count (NaN mapped to 0)
+byac$fdr <- as.vector(cbind(by(byac$ac, byac$ac,
+ function(x) {
+ s <- subset(vcf, AC == x)
+ return(nan.to.zero(sum(s[, tag.alternate_positive_discrepancy]) / sum(s[, tag.genotypes_alternate_count])))
+ })))
+
+# fpc: per-AC sum of the alternate positive discrepancy (false detection count)
+byac$fpc <- as.vector(cbind(by(byac$ac, byac$ac,
+ function(x) {
+ s <- subset(vcf, AC == x)
+ return(sum(s[, tag.alternate_positive_discrepancy]))
+ })))
+# alleles: per-AC sum of the alternate genotype counts
+byac$alleles <- as.vector(cbind(by(byac$ac, byac$ac,
+ function(x) {
+ s <- subset(vcf, AC == x)
+ return(sum(s[, tag.genotypes_alternate_count]))
+ })))
+# sites: number of input rows at each AC
+byac$sites <- as.vector(cbind(by(byac$ac, byac$ac,
+ function(x) {
+ s <- subset(vcf, AC == x)
+ return(length(s$AC))
+ })))
+
+# count true positive sites
+byac$site_tpc <- as.vector(cbind(by(byac$ac, byac$ac,
+ function(x) {
+ s <- subset(vcf, AC == x)
+ return(sum(s[, tag.has_variant]))
+ })))
+
+# fpc == false detection count
+byac$site_fpc <- byac$sites - byac$site_tpc
+# site detection fpr is 1 - true positive rate
+byac$site_fpr <- 1 - ( byac$site_tpc / byac$sites )
+
+summary(byac)
+
+#print(byac$sites)
+#print(byac$site_tpc)
+#print(byac$site_fpc)
+#print(byac$site_fpr)
+
+#byac$site_fpr_gt0 <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) {
+# s <- subset(byac, ac == i, select=c(site_fpr, sites))
+#if (s$sites >= min_sites) {
+# return(s$site_fpr)
+#} else {
+# return(NA)
+#}
+#})))
+# site_fprlt: site_fpc / sites pooled over all rows with ac <= i
+#byac$site_fprlt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) mean(subset(byac, ac <= i, select=site_fpr)))))
+byac$site_fprlt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) {
+ s <- subset(byac, ac <= i, select=c(site_fpc, sites))
+ return(sum(s$site_fpc) / sum(s$sites))
+})))
+# site_fprgt: site_fpc / sites pooled over all rows with ac >= i
+#byac$site_fprgt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) mean(subset(byac, ac >= i, select=site_fpr)))))
+byac$site_fprgt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) {
+ s <- subset(byac, ac >= i, select=c(site_fpc, sites))
+ return(sum(s$site_fpc) / sum(s$sites))
+})))
+# cfa: cumulative fraction of alternate alleles at ac <= i
+byac$cfa <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) sum(subset(byac, ac <= i, select=alleles)) / sum(byac$alleles))))
+# cfs: cumulative fraction of sites at ac <= i
+byac$cfs <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) sum(subset(byac, ac <= i, select=sites)) / length(vcf$AC))))
+
+# inappropriate collapse via averaging of fdr
+#byac$alternate_pdlt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) mean(subset(byac, ac <= i, select=fdr)))))
+# alternate_pdr: fpc / alleles for the single row ac == i
+byac$alternate_pdr <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) {
+ s <- subset(byac, ac == i, select=c(fpc, alleles))
+ return(sum(s$fpc) / sum(s$alleles))
+})))
+
+# use this one: alternate_pdlt pools fpc / alleles over all rows with ac <= i
+byac$alternate_pdlt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) {
+ s <- subset(byac, ac <= i, select=c(fpc, alleles))
+ return(sum(s$fpc) / sum(s$alleles))
+})))
+# alternate_pdgt: fpc / alleles pooled over all rows with ac >= i
+#byac$alternate_pdgt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) mean(subset(byac, ac >= i, select=fdr)))))
+byac$alternate_pdgt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) {
+ s <- subset(byac, ac >= i, select=c(fpc, alleles))
+ return(sum(s$fpc) / sum(s$alleles))
+})))
+# nrs / nrd: per-AC non-reference sensitivity and discrepancy (count / normalizer, NaN mapped to 0)
+byac$nrs <- as.vector(cbind(by(byac$ac, byac$ac,
+ function(x) {
+ s <- subset(vcf, AC == x)
+ return(nan.to.zero(sum(s[, tag.non_reference_sensitivity_count]) / sum(s[, tag.non_reference_sensitivity_normalizer])))
+ })))
+
+byac$nrd <- as.vector(cbind(by(byac$ac, byac$ac,
+ function(x) {
+ s <- subset(vcf, AC == x)
+ return(nan.to.zero(sum(s[, tag.non_reference_discrepancy_count]) / sum(s[, tag.non_reference_discrepancy_normalizer])))
+ })))
+# nrslt / nrdlt: the same two ratios pooled over all input rows with AC <= i
+byac$nrslt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) {
+ s <- subset(vcf, AC <= i, select=c(tag.non_reference_sensitivity_count, tag.non_reference_sensitivity_normalizer))
+ return(nan.to.zero(sum(s[, tag.non_reference_sensitivity_count]) / sum(s[, tag.non_reference_sensitivity_normalizer])))
+})))
+
+byac$nrdlt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) {
+ s <- subset(vcf, AC <= i, select=c(tag.non_reference_discrepancy_count, tag.non_reference_discrepancy_normalizer))
+ return(nan.to.zero(sum(s[, tag.non_reference_discrepancy_count]) / sum(s[, tag.non_reference_discrepancy_normalizer])))
+})))
+# byac_gtsites: only the rows backed by at least min_sites sites
+byac_gtsites <- subset(byac, sites >= min_sites)
+
+# disabled (if (FALSE)): loess-smoothed PD vs. AC plot; nothing in this block runs
+if (FALSE) {
+pdf(paste(filename, '.', tag, '.PD.vs.AC.smooth.pdf', sep=''))
+par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byac$cfa, ylim=c(0,1.0),
+ xlab='alternate allele count (AC)', xaxt='n',
+ ylab='', yaxt='n', type='l', col='red')
+axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+axis(1, at=seq(0,max(byac$ac),10), labels=seq(0,max(byac$ac),10), cex=0.75)
+grid(lty=5)
+par(new=T)
+title(paste(filename, 'positive discrepancy versus', tag, '(smoothed)'))
+par(new=T)
+countTicks <- seq(0,1,0.1) * vcf.numberOfSites
+axis(4, at=seq(0,1,0.1), labels=round(countTicks))
+mtext("number of sites", side=4, line=3, cex=0.75)
+par(new=T)
+#plot(byac$alternate_pdlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='green')
+plot(byac$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')
+par(new=T)
+lines(byac$ac, predict(loess(byac$alternate_pdr ~ byac$ac, span=0.5)), col="green")
+par(new=T)
+plot(byac$cfa, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='red')
+par(new=T)
+plot(byac$cfs, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='purple')
+par(new=T)
+lines(byac$ac, predict(loess(byac$site_fpr ~ byac$ac, span=0.5)), col="blue")
+par(new=T)
+lines(byac$ac, predict(loess(byac$nrs ~ byac$ac, span=0.5)), col="magenta")
+par(new=T)
+lines(byac$ac, predict(loess(byac$nrd ~ byac$ac, span=0.5)), col="brown")
+par(new=T, cex=0.65)
+mtext(paste("alternate genotype PD: ", round(vcf.positiveDiscrepancy, digits=4), ", site PD: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T, cex=0.65)
+legend('topleft', c('cumulative fraction of alt alleles', 'cumulative fraction of sites', 'alt genotypes PD', 'site PD', 'non-ref sensitivity', 'non-ref discrepancy', 'site PD at AC'),
+ fill=c('red', 'purple', 'green', 'blue', 'magenta', 'brown', 'black'))
+garbage <- dev.off()
+}
+
+
+# cumulative plot -> <filename>.<tag>.PD.vs.AC.cumulative.pdf: cumulative fractions and the ac <= i pooled rates overlaid on a 0..1 y axis
+pdf(paste(filename, '.', tag, '.PD.vs.AC.cumulative.pdf', sep=''))
+par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byac$cfa, ylim=c(0,1.0),
+ xlab='alternate allele count (AC)', xaxt='n',
+ ylab='', yaxt='n', type='l', col='red')
+axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+axis(1, at=seq(0,max(byac$ac),10), labels=seq(0,max(byac$ac),10), cex=0.75)
+grid(lty=5)
+par(new=T)
+title(paste(filename, 'positive discrepancy versus', tag, '(cumulative)'))
+par(new=T)
+countTicks <- seq(0,1,0.1) * vcf.numberOfSites
+axis(4, at=seq(0,1,0.1), labels=round(countTicks))
+mtext("number of sites", side=4, line=3, cex=0.75)
+par(new=T)
+#plot(byac_gtsites$ac, byac_gtsites$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')
+plot(byac$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')
+par(new=T)
+plot(byac$alternate_pdlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='green')
+par(new=T)
+plot(byac$cfa, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='red')
+par(new=T)
+plot(byac$cfs, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='purple')
+par(new=T)
+plot(byac$site_fprlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='blue')
+par(new=T)
+plot(byac$nrslt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='magenta')
+par(new=T)
+plot(byac$nrdlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='brown')
+par(new=T, cex=0.65)
+mtext(paste("alternate genotype PD: ", round(vcf.positiveDiscrepancy, digits=4), ", site PD: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T, cex=0.65)
+legend('topleft', c('cumulative fraction of alt alleles', 'cumulative fraction of sites', 'alt genotypes PD', 'site PD', 'non-ref sensitivity', 'non-ref discrepancy', 'site PD at AC'),
+ fill=c('red', 'purple', 'green', 'blue', 'magenta', 'brown', 'black'))
+garbage <- dev.off()
+
+
+# simple cumulative plot -> <filename>.<tag>.PD.vs.AC.cumulative.simple.pdf: cfs, site_fprlt, and per-AC site_fpr for rows with >= min_sites sites
+pdf(paste(filename, '.', tag, '.PD.vs.AC.cumulative.simple.pdf', sep=''))
+par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byac$cfs, ylim=c(0,1.0), xlim=c(0,max(vcf$AC)),
+ xlab='alternate allele count (AC)', xaxt='n',
+ ylab='', yaxt='n', type='l', col='purple')
+axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+axis(1, at=seq(0,max(byac$ac),10), labels=seq(0,max(byac$ac),10), cex=0.75)
+grid(lty=5)
+par(new=T)
+title(paste(filename, 'positive discrepancy versus', tag, '(cumulative)'))
+par(new=T)
+countTicks <- seq(0,1,0.1) * vcf.numberOfSites
+axis(4, at=seq(0,1,0.1), labels=round(countTicks))
+mtext("number of sites", side=4, line=3, cex=0.75)
+par(new=T)
+plot(byac$site_fprlt, ylim=c(0,1.0), xlim=c(0,max(vcf$AC)), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='blue')
+par(new=T)
+plot(byac_gtsites$ac, byac_gtsites$site_fpr, ylim=c(0,1.0), xlim=c(0,max(vcf$AC)), xlab='', xaxt='n', ylab='', yaxt='n')
+par(new=T, cex=0.65)
+mtext(paste("site PD: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T, cex=0.65)
+legend('topleft', c('cumulative fraction of sites', 'cumulative site PD', paste('site PD at AC (>=', min_sites, 'sites)')),
+ fill=c('purple', 'blue', 'black'))
+garbage <- dev.off()
+
+
+
+# instantaneous plot for x in 0..20 -> <filename>.<tag>.PD.vs.AC.instantaneous.ac_lt_20.pdf: sites per AC (blue) and site_tpc for rows with >= min_sites sites (red)
+pdf(paste(filename, '.', tag, '.PD.vs.AC.instantaneous.ac_lt_20.pdf', sep=''))
+#par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byac$sites, ylim=c(0,max(byac$sites)), xlim=c(0,20),
+ xlab='alternate allele count (AC)', xaxt='n',
+ ylab='number of sites', type='l', pch=19, col='blue')
+#axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+#countTicks <- round(seq(0,1,0.1) * max(byac$sites))
+#axis(2, at=countTicks, labels=countTicks)
+par(new=T)
+axis(1, at=seq(0,max(byac$ac),1), labels=seq(0,max(byac$ac),1))
+grid(lty=5)
+par(new=T)
+plot(byac$sites, ylim=c(0,max(byac$sites)), xlim=c(0,20), type='o', pch=19, col='blue', xlab='', xaxt='n', ylab='', yaxt='n')
+par(new=T)
+title(paste(filename, 'positive discrepancy versus', tag, '(instantaneous)'))
+par(new=T)
+mtext("number of sites", side=2, line=3) #, cex=0.75)
+#par(new=T)
+#plot(byac$site_fprlt, ylim=c(0,1.0), xlim=c(0,20), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='blue')
+par(new=T)
+plot(byac_gtsites$ac, byac_gtsites$site_tpc, ylim=c(0,max(byac$sites)), xlim=c(0,20), xlab='', xaxt='n', ylab='', yaxt='n', col='red', pch=19, type='o')
+par(new=T) #, cex=0.65)
+mtext(paste("site PD: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T) #, cex=0.65)
+#legend('topright', c('number of sites', 'site PD count'),
+# fill=c('blue', 'red'))
+garbage <- dev.off()
+
+# stratifying by QUAL
+# NOTE: everything from here to the final closing brace is wrapped in if (FALSE) and never runs
+if (FALSE) {
+
+# x: positive discrepancy / alternate genotype count for each distinct QUAL value
+x <- cbind(by(vcf, vcf$QUAL,
+ function(x) {
+ sum(x[, tag.alternate_positive_discrepancy]) / sum(x[, tag.genotypes_alternate_count])
+ }))
+# byqual: one row per distinct QUAL value; columns mirror the by-AC table built earlier in the script
+byqual <- data.frame(qual=as.numeric(rownames(x)), fdr=as.vector(x))
+
+# false detection count
+byqual$fpc <- as.vector(cbind(by(vcf, vcf$QUAL,
+ function(i) { sum(i[, tag.alternate_positive_discrepancy]) } )))
+
+byqual$alleles <- as.vector(cbind(by(vcf, vcf$QUAL,
+ function(i) {
+ sum(i[, tag.genotypes_alternate_count])
+ })))
+
+byqual$sites <- as.vector(cbind(by(vcf$QUAL, vcf$QUAL, function(i) length(i))))
+
+# count true positive sites
+byqual$site_tpc <- as.vector(cbind(by(vcf[, tag.has_variant], vcf$QUAL, function(i) sum(i))))
+# fpc == false detection count
+byqual$site_fpc <- byqual$sites - byqual$site_tpc
+# site detection fpr is 1 - true positive rate
+byqual$site_fpr <- 1 - ( byqual$site_tpc / byqual$sites )
+
+#byqual$site_fpr_gt0 <- as.vector(cbind(tapply(byqual$ac, byqual$ac, function(i) {
+# s <- subset(byqual, ac == i, select=c(site_fpr, sites))
+#if (s$sites >= min_sites) {
+# return(s$site_fpr)
+#} else {
+# return(NA)
+#}
+#})))
+
+#byqual$site_fprlt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) mean(subset(byqual, qual <= i, select=site_fpr)))))
+byqual$site_fprlt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) {
+ s <- subset(byqual, qual <= i, select=c(site_fpc, sites))
+ return(sum(s$site_fpc) / sum(s$sites))
+})))
+
+#byqual$site_fprgt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) mean(subset(byqual, qual >= i, select=site_fpr)))))
+byqual$site_fprgt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) {
+ s <- subset(byqual, qual >= i, select=c(site_fpc, sites))
+ return(sum(s$site_fpc) / sum(s$sites))
+})))
+
+byqual$cfa <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) sum(subset(byqual, qual <= i, select=alleles)) / sum(byqual$alleles))))
+
+byqual$cfs <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) sum(subset(byqual, qual <= i, select=sites)) / length(vcf$QUAL))))
+
+# inappropriate collapse via averaging of fdr
+#byqual$alternate_pdlt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) mean(subset(byqual, qual <= i, select=fdr)))))
+
+byqual$alternate_pdr <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) {
+ s <- subset(byqual, qual == i, select=c(fpc, alleles))
+ return(sum(s$fpc) / sum(s$alleles))
+})))
+
+# use this one
+byqual$alternate_pdlt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) {
+ s <- subset(byqual, qual <= i, select=c(fpc, alleles))
+ return(sum(s$fpc) / sum(s$alleles))
+})))
+
+#byqual$alternate_pdgt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) mean(subset(byqual, qual >= i, select=fdr)))))
+byqual$alternate_pdgt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) {
+ s <- subset(byqual, qual >= i, select=c(fpc, alleles))
+ return(sum(s$fpc) / sum(s$alleles))
+})))
+# NOTE(review): this repeats the nan.to.zero helper already defined at the top of the script
+nan.to.zero <- function(n) {
+ if (is.nan(n)) return(0) else return(n)
+}
+
+byqual$nrs <- as.vector(cbind(by(vcf, vcf$QUAL, function(i) {
+ return(nan.to.zero(sum(i[, tag.non_reference_sensitivity_count]) / sum(i[, tag.non_reference_sensitivity_normalizer])))
+})))
+
+byqual$nrd <- as.vector(cbind(by(vcf, vcf$QUAL, function(i) {
+ return(nan.to.zero(sum(i[, tag.non_reference_discrepancy_count]) / sum(i[, tag.non_reference_discrepancy_normalizer])))
+})))
+
+byqual$nrslt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) {
+ s <- subset(vcf, QUAL <= i, select=c(tag.non_reference_sensitivity_count, tag.non_reference_sensitivity_normalizer))
+ return(nan.to.zero(sum(s[, tag.non_reference_sensitivity_count]) / sum(s[, tag.non_reference_sensitivity_normalizer])))
+})))
+
+byqual$nrdlt <- as.vector(cbind(tapply(byqual$qual, byqual$qual, function(i) {
+ s <- subset(vcf, QUAL <= i, select=c(tag.non_reference_discrepancy_count, tag.non_reference_discrepancy_normalizer))
+ return(nan.to.zero(sum(s[, tag.non_reference_discrepancy_count]) / sum(s[, tag.non_reference_discrepancy_normalizer])))
+})))
+# NOTE(review): despite the _gt10 name and the '(>= 10 sites)' legend labels below, the threshold applied is min_sites
+byqual_gt10 <- subset(byqual, sites >= min_sites)
+
+# nested if (FALSE): loess-smoothed PD vs. QUAL plot
+if (FALSE) {
+pdf(paste(filename, '.', tag, '.PD.vs.QUAL.smooth.pdf', sep=''))
+par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byqual$cfa, ylim=c(0,1.0),
+ xlab='QUAL', xaxt='n',
+ ylab='', yaxt='n', type='l', col='red')
+axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+axis(1, at=seq(0,max(byqual$qual),10), labels=seq(0,max(byqual$qual),10), cex=0.75)
+grid(lty=5)
+par(new=T)
+title(paste(filename, 'positive discrepancy versus', tag, '(smoothed)'))
+par(new=T)
+countTicks <- seq(0,1,0.1) * vcf.numberOfSites
+axis(4, at=seq(0,1,0.1), labels=round(countTicks))
+mtext("number of sites", side=4, line=3, cex=0.75)
+par(new=T)
+#plot(byqual$alternate_pdlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='green')
+plot(byqual$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')
+par(new=T)
+lines(byqual$qual, predict(loess(byqual$alternate_pdr ~ byqual$qual, span=0.5)), col="green")
+par(new=T)
+plot(byqual$cfa, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='red')
+par(new=T)
+plot(byqual$cfs, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='purple')
+par(new=T)
+lines(byqual$qual, predict(loess(byqual$site_fpr ~ byqual$qual, span=0.5)), col="blue")
+par(new=T)
+lines(byqual$qual, predict(loess(byqual$nrs ~ byqual$qual, span=0.5)), col="magenta")
+par(new=T)
+lines(byqual$qual, predict(loess(byqual$nrd ~ byqual$qual, span=0.5)), col="brown")
+par(new=T, cex=0.65)
+mtext(paste("alternate genotype PD: ", round(vcf.positiveDiscrepancy, digits=4), ", site PD: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T, cex=0.65)
+legend('topleft', c('cumulative fraction of alt alleles', 'cumulative fraction of sites', 'alt genotypes PD', 'site PD', 'non-ref sensitivity', 'non-ref discrepancy', 'site PD at QUAL'),
+ fill=c('red', 'purple', 'green', 'blue', 'magenta', 'brown', 'black'))
+garbage <- dev.off()
+}
+
+
+# cumulative PD vs. QUAL plot
+pdf(paste(filename, '.', tag, '.PD.vs.QUAL.cumulative.pdf', sep=''))
+par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byqual$cfa, ylim=c(0,1.0),
+ xlab='QUAL', xaxt='n',
+ ylab='', yaxt='n', type='l', col='red')
+axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+axis(1, at=seq(0,max(byqual$qual),10), labels=seq(0,max(byqual$qual),10), cex=0.75)
+grid(lty=5)
+par(new=T)
+title(paste(filename, 'positive discrepancy versus', tag, '(cumulative)'))
+par(new=T)
+countTicks <- seq(0,1,0.1) * vcf.numberOfSites
+axis(4, at=seq(0,1,0.1), labels=round(countTicks))
+mtext("number of sites", side=4, line=3, cex=0.75)
+par(new=T)
+plot(byqual_gt10$qual, byqual_gt10$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')
+#plot(byqual$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')
+par(new=T)
+plot(byqual$alternate_pdlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='green')
+par(new=T)
+plot(byqual$cfa, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='red')
+par(new=T)
+plot(byqual$cfs, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='purple')
+par(new=T)
+plot(byqual$site_fprlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='blue')
+par(new=T)
+plot(byqual$nrslt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='magenta')
+par(new=T)
+plot(byqual$nrdlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='brown')
+par(new=T, cex=0.65)
+mtext(paste("alternate genotype PD: ", round(vcf.positiveDiscrepancy, digits=4), ", site PD: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T, cex=0.65)
+legend('topleft', c('cumulative fraction of alt alleles', 'cumulative fraction of sites', 'alt genotypes PD', 'site PD', 'non-ref sensitivity', 'non-ref discrepancy', 'site PD at QUAL (>= 10 sites)'),
+ fill=c('red', 'purple', 'green', 'blue', 'magenta', 'brown', 'black'))
+garbage <- dev.off()
+
+
+# simple cumulative PD vs. QUAL plot
+pdf(paste(filename, '.', tag, '.PD.vs.QUAL.cumulative.simple.pdf', sep=''))
+par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byqual$cfs, ylim=c(0,1.0),
+ xlab='QUAL', xaxt='n',
+ ylab='', yaxt='n', type='l', col='purple')
+axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+axis(1, at=seq(0,max(byqual$qual),10), labels=seq(0,max(byqual$qual),10), cex=0.75)
+grid(lty=5)
+par(new=T)
+title(paste(filename, 'positive discrepancy versus', tag, '(cumulative)'))
+par(new=T)
+countTicks <- seq(0,1,0.1) * vcf.numberOfSites
+axis(4, at=seq(0,1,0.1), labels=round(countTicks))
+mtext("number of sites", side=4, line=3, cex=0.75)
+par(new=T)
+plot(byqual$site_fprlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='blue')
+par(new=T)
+plot(byqual_gt10$qual, byqual_gt10$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')
+par(new=T, cex=0.65)
+mtext(paste("site PD: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T, cex=0.65)
+legend('topleft', c('cumulative fraction of sites', 'site PD', 'site PD at QUAL (>= 10 sites)'),
+ fill=c('purple', 'blue', 'black'))
+garbage <- dev.off()
+
+}
diff --git a/bin/vcfplotaltdiscrepancy.sh b/bin/vcfplotaltdiscrepancy.sh
new file mode 100755
index 0000000..0817895
--- /dev/null
+++ b/bin/vcfplotaltdiscrepancy.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+filename=$1
+tag=$2
+
+vcf2tsv \
+ | tsvsplit \
+ QUAL \
+ AC \
+ $tag.has_variant \
+ | tf2binary \
+ | vcfplotsitediscrepancy.r $filename $tag
diff --git a/bin/vcfplotsitediscrepancy.r b/bin/vcfplotsitediscrepancy.r
new file mode 100755
index 0000000..d4f4ecb
--- /dev/null
+++ b/bin/vcfplotsitediscrepancy.r
@@ -0,0 +1,99 @@
+#!/usr/bin/Rscript --vanilla --slave
+# usage: vcfplotsitediscrepancy.r <filename> <tag>; the input table is read from stdin
+# read the input table (first row is the header) from stdin and keep only rows with AC > 0
+vcf <- subset(read.table(pipe('cat /dev/stdin'), header=T), AC > 0)
+# positional arguments: prefix of the output PDF names, then the tag that prefixes the column name
+filename <- commandArgs(TRUE)[1]
+tag <- commandArgs(TRUE)[2]
+
+tag.has_variant <- paste(tag, '.has_variant', sep='')
+
+vcf.numberOfSites <- length(vcf$AC)
+vcf.sitesTruePositive <- mean(vcf[, tag.has_variant])
+
+# number of sites at each distinct AC value
+x <- cbind(by(vcf$AC, vcf$AC, function(i) length(i)))
+# byac: one row per distinct AC value present in the input
+byac <- data.frame(ac=as.numeric(rownames(x)), sites=as.vector(x))
+
+# count true positive sites
+byac$site_tpc <- as.vector(cbind(by(vcf[, tag.has_variant], vcf$AC, function(i) sum(i))))
+# fpc == false detection count
+byac$site_fpc <- byac$sites - byac$site_tpc
+# site detection fpr is 1 - true positive rate
+byac$site_fpr <- 1 - ( byac$site_tpc / byac$sites )
+# site_fprlt: site_fpc / sites pooled over all rows with ac <= i
+#byac$site_fprlt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) mean(subset(byac, ac <= i, select=site_fpr)))))
+byac$site_fprlt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) {
+ s <- subset(byac, ac <= i, select=c(site_fpc, sites))
+ return(sum(s$site_fpc) / sum(s$sites))
+})))
+# site_fprgt: site_fpc / sites pooled over all rows with ac >= i
+#byac$site_fprgt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) mean(subset(byac, ac >= i, select=site_fpr)))))
+byac$site_fprgt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) {
+ s <- subset(byac, ac >= i, select=c(site_fpc, sites))
+ return(sum(s$site_fpc) / sum(s$sites))
+})))
+# cfs: cumulative fraction of sites with ac <= i
+byac$cfs <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) sum(subset(byac, ac <= i, select=sites)) / length(vcf$AC))))
+
+# smoothed plot -> <filename>.<tag>.site_FDR.vs.AC.smooth.pdf: cfs (red), per-AC site_fpr points, and a loess fit of site_fpr (blue)
+pdf(paste(filename, '.', tag, '.site_FDR.vs.AC.smooth.pdf', sep=''))
+par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byac$cfs, ylim=c(0,1.0),
+ xlab='alternate allele count (AC)', xaxt='n',
+ ylab='false discovery rate (FDR)', yaxt='n', type='l', col='red')
+axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+axis(1, at=seq(0,max(byac$ac),10), labels=seq(0,max(byac$ac),10), cex=0.75)
+grid(lty=5)
+par(new=T)
+title(paste(filename, 'putative site false discovery rate versus', tag, '(smoothed)'))
+par(new=T)
+countTicks <- seq(0,1,0.1) * vcf.numberOfSites
+axis(4, at=seq(0,1,0.1), labels=round(countTicks))
+par(col='red')
+mtext("number of sites", side=4, line=3, cex=0.75)
+par(col='black')
+par(new=T)
+plot(byac$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')
+par(new=T)
+lines(byac$ac, predict(loess(byac$site_fpr ~ byac$ac, span=0.5)), col="blue")
+par(new=T, cex=0.65)
+mtext(paste("site FDR: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T, cex=0.65)
+legend('topleft', c('cumulative sites', 'site FDR (loess smoothed)', 'FDR at AC'),
+ fill=c('red', 'blue', 'black'))
+garbage <- dev.off()
+
+
+# cumulative plot -> <filename>.<tag>.site_FDR.vs.AC.cumulative.pdf: cfs (red), site_fpr points, site_fprgt (green) and site_fprlt (blue)
+pdf(paste(filename, '.', tag, '.site_FDR.vs.AC.cumulative.pdf', sep=''))
+par(cex=0.75)
+par(mar=c(5,4,4,5) + 0.1)
+plot(byac$cfs, ylim=c(0,1.0),
+ xlab='alternate allele count (AC)', xaxt='n',
+ ylab='false discovery rate (FDR)', yaxt='n', type='l', col='red')
+axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1))
+axis(1, at=seq(0,max(byac$ac),10), labels=seq(0,max(byac$ac),10), cex=0.75)
+grid(lty=5)
+par(new=T)
+title(paste(filename, 'putative false discovery rate versus', tag, '(cumulative)'))
+par(new=T)
+countTicks <- seq(0,1,0.1) * vcf.numberOfSites
+axis(4, at=seq(0,1,0.1), labels=round(countTicks))
+par(col='red')
+mtext("number of sites", side=4, line=3, cex=0.75)
+par(col='black')
+par(new=T)
+plot(byac$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n')
+par(new=T)
+plot(byac$site_fprgt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='green')
+par(new=T)
+plot(byac$site_fprlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='blue')
+par(new=T, cex=0.65)
+mtext(paste("site FDR: ", round(1 - vcf.sitesTruePositive, digits=4), sep=''))
+par(new=T, cex=0.65)
+legend('topleft', c('cumulative sites', 'site FDR <= AC', 'site FDR >= AC', 'FDR at AC'),
+ fill=c('red', 'blue', 'green', 'black'))
+garbage <- dev.off()
diff --git a/bin/vcfplottstv.sh b/bin/vcfplottstv.sh
new file mode 100755
index 0000000..835b5ef
--- /dev/null
+++ b/bin/vcfplottstv.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+filename=$1
+title=$2
+
+vcf2tsv \
+ | tsvsplit \
+ QUAL \
+ AC \
+ AF \
+ TS \
+ | tf2binary \
+ | vcfplottstv.r $filename $title
diff --git a/bin/vcfprintaltdiscrepancy.r b/bin/vcfprintaltdiscrepancy.r
new file mode 100755
index 0000000..ef33502
--- /dev/null
+++ b/bin/vcfprintaltdiscrepancy.r
@@ -0,0 +1,37 @@
+#!/usr/bin/Rscript --vanilla --slave
+# usage: vcfprintaltdiscrepancy.r <tag>; the input table is read from stdin
+# read the input table (first row is the header) from stdin and keep only rows with AC > 0
+vcf <- subset(read.table(pipe('cat /dev/stdin'), header=T), AC > 0)
+# the single positional argument is the tag that prefixes the column names below
+tag <- commandArgs(TRUE)[1]
+
+tag.genotypes_alternate_count <- paste(tag, '.genotypes.alternate_count', sep='')
+tag.non_reference_discrepancy_count <- paste(tag, '.site.non_reference_discrepancy.count', sep='')
+tag.non_reference_discrepancy_normalizer <- paste(tag, '.site.non_reference_discrepancy.normalizer', sep='')
+tag.non_reference_sensitivity_count <- paste(tag, '.site.non_reference_sensitivity.count', sep='')
+tag.non_reference_sensitivity_normalizer <- paste(tag, '.site.non_reference_sensitivity.normalizer', sep='')
+tag.alternate_positive_discrepancy <- paste(tag, '.site.alternate_positive_discrepancy', sep='')
+tag.alternate_negative_discrepancy <- paste(tag, '.site.alternate_negative_discrepancy', sep='')
+tag.has_variant <- paste(tag, '.has_variant', sep='')
+# summary statistics over all retained rows
+vcf.numberOfSites <- length(vcf[, tag.genotypes_alternate_count])
+vcf.totalAltAlleles <- sum(vcf[, tag.genotypes_alternate_count])
+vcf.positiveDiscrepancy <- sum(vcf[, tag.alternate_positive_discrepancy]) / sum(vcf[, tag.genotypes_alternate_count])
+vcf.negativeDiscrepancy <- sum(vcf[, tag.alternate_negative_discrepancy]) / sum(vcf[, tag.genotypes_alternate_count])
+vcf.sitesTruePositive <- sum(vcf[, tag.has_variant]) / nrow(vcf)
+
+cat('number of sites', vcf.numberOfSites, '\n')
+cat('total alternate alleles', vcf.totalAltAlleles, '\n')
+cat('positive discrepancy', vcf.positiveDiscrepancy, '\n')
+cat('negative discrepancy', vcf.negativeDiscrepancy, '\n')
+# x: positive discrepancy / alternate genotype count for each distinct AC value
+x <- cbind(by(vcf, vcf$AC,
+ function(x) {
+ sum(x[, tag.alternate_positive_discrepancy]) / sum(x[, tag.genotypes_alternate_count])
+ }))
+# byac: one row per distinct AC value with its ratio in the fdr column
+byac <- data.frame(ac=as.numeric(rownames(x)), fdr=as.vector(x))
+
+print(byac)
+
+
diff --git a/bin/vcfprintaltdiscrepancy.sh b/bin/vcfprintaltdiscrepancy.sh
new file mode 100755
index 0000000..1df0a65
--- /dev/null
+++ b/bin/vcfprintaltdiscrepancy.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+tag=$1
+
+vcf2tsv \
+ | tsvsplit \
+ QUAL \
+ AC \
+ $tag.has_variant \
+ $tag.site.alternate_negative_discrepancy \
+ $tag.site.alternate_positive_discrepancy \
+ $tag.genotypes.alternate_count \
+ $tag.site.non_reference_sensitivity.count \
+ $tag.site.non_reference_sensitivity.normalizer \
+ $tag.site.non_reference_discrepancy.count \
+ $tag.site.non_reference_discrepancy.normalizer \
+ | tf2binary \
+ | vcfprintaltdiscrepancy.r $tag
diff --git a/bin/vcfqualfilter b/bin/vcfqualfilter
new file mode 100755
index 0000000..8fe970f
--- /dev/null
+++ b/bin/vcfqualfilter
@@ -0,0 +1,32 @@
+#!/usr/bin/perl
+#
+# Filter VCF records on STDIN by QUAL (column 6); header lines always pass.
+#   -c N [-m M]: keep N <= QUAL (<= M);  -s N / -i N: QUAL floor for lines matching SNP / INS|DEL
+use Getopt::Long;
+my $cutoff = -1;
+my $max = -1;
+my $indel = 0;
+my $snp = 0;
+$result = GetOptions ("c|cutoff=i" => \$cutoff,
+                      "m|max=i" => \$max,
+                      "i|indel=i" => \$indel,
+                      "s|snp=i" => \$snp);
+
+
+while (<STDIN>) {
+    if ($_ =~ /^#/) {
+        print $_;
+        next;
+    }
+    # Capture QUAL on its own and per record: previously $qual was a global set
+    # only on a successful match, so a malformed line reused the prior QUAL.
+    my ($qual) = $_ =~ /^(?:[^\t]*\t){5}([^\t]*)\t[^\t]*\t/;
+    unless (defined $qual) { warn "vcfqualfilter: skipping malformed line $.\n"; next; }
+    if ($cutoff ne -1 and $qual >= $cutoff and ($max eq -1 or $qual <= $max)) {
+        print $_;
+    } elsif ($snp and $_ =~ "SNP" and $qual >= $snp) {
+        print $_;
+    } elsif ($indel and $_ =~ "INS\|DEL" and $qual >= $indel) {
+        print $_;
+    }
+}
diff --git a/bin/vcfregionreduce b/bin/vcfregionreduce
new file mode 100755
index 0000000..fde0938
--- /dev/null
+++ b/bin/vcfregionreduce
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Concatenate per-region VCF files into one bgzipped, tabix-indexed VCF.
+if [ $# -ne 2 ];
+then
+    echo "Usage: $0 [region file] [directory]"
+    echo
+    echo "Generates \`basename directory\`.vcf.gz, which is the concatenation"
+    echo "of files in the directory named [directory]/[region1].vcf.gz,"
+    echo "[directory]/[region2].vcf.gz, etc. in the order in which they"
+    echo "occur in the region file."
+    echo
+    echo "A tabix index is subsequently generated."
+    exit 1
+fi
+
+regionfile=$1
+mergedir=$2
+mergename=$(basename "$mergedir")
+vcfgenotypes=$mergename.vcf.gz
+# Path expansions below are quoted so directories containing spaces work.
+
+firstfile=$mergedir/$(head -1 "$regionfile").vcf.gz
+regions=$(cat "$regionfile")   # region names; split on whitespace by the loop
+
+( zcat "$firstfile" | head -1000 | grep '^#'
+for region in $regions
+do
+    zcat "$mergedir/$region.vcf.gz" | grep -v "^#"
+done ) | ( bgzip >"$vcfgenotypes" && tabix -p vcf "$vcfgenotypes" )
diff --git a/bin/vcfregionreduce_and_cut b/bin/vcfregionreduce_and_cut
new file mode 100755
index 0000000..f15dc2c
--- /dev/null
+++ b/bin/vcfregionreduce_and_cut
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+if [ $# -ne 2 ];
+then
+ echo "Usage: $0 [region file] [directory]"
+ echo
+ echo "Generates \`basename directory\`.vcf.gz and \`basename directory\`.sites.vcf.gz"
+ echo "which are the concatenation of files in the directory named [directory]/[region1].vcf.gz,"
+ echo "[directory]/[region2].vcf.gz, etc. in the order in which they occur in the region file."
+ echo
+ echo "Tabix indexes are simultaneously generated."
+ exit 1
+fi
+
+regionfile=$1
+mergedir=$2
+mergename=$(basename $mergedir)
+vcfgenotypes=$mergename.vcf.gz
+vcfsites=$mergename.sites.vcf.gz
+
+regions=$(cat $regionfile)
+
+firstfile=$mergedir/$(echo $regions | cut -f 1 -d\ ).vcf.gz
+files=$(for region in $regions; do echo $mergedir/$region.vcf.gz; done)
+
+( zcat $firstfile | head -1000 | grep ^#
+for file in $files
+do
+ zcat $file | grep -v "^#"
+done ) | uniq | pee \
+ "bgzip >$vcfgenotypes && tabix -p vcf $vcfgenotypes" \
+ "cut -f -8 | bgzip >$vcfsites && tabix -p vcf $vcfsites"
diff --git a/bin/vcfregionreduce_pipe b/bin/vcfregionreduce_pipe
new file mode 100755
index 0000000..8a21782
--- /dev/null
+++ b/bin/vcfregionreduce_pipe
@@ -0,0 +1,29 @@
+#!/bin/bash
+#
+# Writes to standard output the concatenation of the per-region files
+# [directory]/[region].vcf.gz, in region file order: the header of the first
+# file followed by the records of every file.
+
+if [ $# -ne 2 ];
+then
+    echo "Usage: $0 [region file] [directory]"
+    echo
+    echo "Writes to standard output the concatenation of the files in the"
+    echo "directory named [directory]/[region1].vcf.gz,"
+    echo "[directory]/[region2].vcf.gz, etc. in the order in which they"
+    echo "occur in the region file."
+    echo
+    echo "The output is uncompressed VCF; no file or tabix index is generated."
+    exit 1
+fi
+
+regionfile=$1
+mergedir=$2
+
+# the header is taken from the file of the first region listed
+firstfile=$mergedir/$(head -1 "$regionfile").vcf.gz
+
+# Paths are quoted so that names containing whitespace are not split; the
+# region list itself is still word-split.
+zcat "$firstfile" | head -1000 | grep "^#"
+for region in $(cat "$regionfile")
+do
+    zcat "$mergedir/$region.vcf.gz" | grep -v "^#"
+done
diff --git a/bin/vcfregionreduce_uncompressed b/bin/vcfregionreduce_uncompressed
new file mode 100755
index 0000000..41c7528
--- /dev/null
+++ b/bin/vcfregionreduce_uncompressed
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+if [ $# -ne 2 ];
+then
+ echo "Usage: $0 [region file] [directory]"
+ echo
+ echo "Generates \`basename directory\`.vcf.gz, which is the concatenation"
+ echo "of files in the directory named [directory]/[region1].vcf.gz,"
+ echo "[directory]/[region2].vcf.gz, etc. in the order in which they"
+ echo "occur in the region file."
+ echo
+ echo "A tabix index is subsequently generated."
+ exit 1
+fi
+
+regionfile=$1
+mergedir=$2
+mergename=$(basename $mergedir)
+vcfgenotypes=$mergename.vcf.gz
+#vcfsites=$mergename.sites.vcf.gz
+
+firstfile=$mergedir/$(head -1 $regionfile).vcf
+files=$(for region in $(cat $regionfile); do echo $mergedir/$region.vcf; done)
+
+( cat $firstfile | head -1000 | grep ^#
+for file in $files
+do
+ cat $file | grep -v "^#"
+done ) | ( bgzip >$vcfgenotypes && tabix -p vcf $vcfgenotypes )
diff --git a/bin/vcfremovenonATGC b/bin/vcfremovenonATGC
new file mode 100755
index 0000000..7418843
--- /dev/null
+++ b/bin/vcfremovenonATGC
@@ -0,0 +1,29 @@
+#!/usr/bin/perl
+#
+# Copies a VCF from stdin to stdout, dropping every record whose REF allele
+# or any ALT allele contains a character other than A, T, G or C.
+# Header lines are passed through unchanged.
+
+while (<STDIN>) {
+    if ($_ =~ /^#/) {
+        print;
+        next;
+    }
+
+    # CHROM POS ID REF ALT ...; a line lacking these columns is dropped rather
+    # than being judged on the alleles captured from the previous record
+    next unless $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+    my ($ref, $alts) = ($4, $5);
+
+    # an allele is clean only when it consists entirely of A, T, G and C;
+    # merely containing one of them (e.g. "AN") is not enough
+    my $hasJunk = ($ref !~ /^[ATGC]+$/) ? 1 : 0;
+    foreach my $alt (split(/,/, $alts)) {
+        if ($alt !~ /^[ATGC]+$/) {
+            $hasJunk = 1;
+        }
+    }
+    if (!$hasJunk) {
+        print;
+    }
+}
diff --git a/bin/vcfsnps b/bin/vcfsnps
new file mode 100755
index 0000000..b2b8b79
--- /dev/null
+++ b/bin/vcfsnps
@@ -0,0 +1,26 @@
+#!/usr/bin/perl
+#
+# Copies a VCF from stdin to stdout, keeping only the records in which REF
+# is a single character and every ALT allele has the same length as REF.
+# Header lines are passed through unchanged.
+
+while (<STDIN>) {
+    if ($_ =~ /^#/) {
+        print;
+        next;
+    }
+
+    # CHROM POS ID REF ALT ...; a line lacking these columns is dropped rather
+    # than being judged on the alleles captured from the previous record
+    next unless $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+    my ($ref, $alts) = ($4, $5);
+
+    my $snp = 1;
+    foreach my $alt (split(/,/, $alts)) {
+        if (length($ref) > 1 || length($alt) != length($ref)) {
+            $snp = 0;
+        }
+    }
+    if ($snp) {
+        print;
+    }
+}
diff --git a/bin/vcfsort b/bin/vcfsort
new file mode 100755
index 0000000..def75ee
--- /dev/null
+++ b/bin/vcfsort
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+head -1000 $1 | grep "^#"; cat $@ | grep -v "^#" | sort -k1,1d -k2,2n
diff --git a/bin/vcfvarstats b/bin/vcfvarstats
new file mode 100755
index 0000000..1f645f7
--- /dev/null
+++ b/bin/vcfvarstats
@@ -0,0 +1,225 @@
+#!/usr/bin/perl
+#
+
+use IPC::Open2;
+
+# Returns the reverse complement of the given string: the characters are
+# reversed and A/C/G/T (either case) are swapped with T/G/C/A; any other
+# character is left as it is.
+sub revcomplement {
+    # lexical, so that calling the sub does not overwrite a global $revcom
+    my $revcom = reverse shift;
+    $revcom =~ tr/ACGTacgt/TGCAtgca/;
+    return $revcom;
+}
+
+# optional first argument: a reference FASTA that is queried through a
+# fastahack child process to obtain the di-nucleotide at each SNP
+$reference = $ARGV[0];
+
+if ($reference) {
+    # list form of the command: the reference path reaches fastahack as one
+    # argument and is never interpreted by a shell
+    $pid = open2(\*FASTAHACK_OUT, \*FASTAHACK_IN, "fastahack", "-c", $reference);
+}
+
+#print FASTAHACK_IN "1:10000\n";
+#$result = <FASTAHACK_OUT>;
+#print $result;
+
+
+#open(VCF, $file);
+
+# counters filled while reading records; each %hash maps a length to the
+# number of variants of that length
+$ts = 0;
+$tv = 0;
+$cpg = 0;
+$total = 0;
+$snp = 0;
+$mnp = 0;
+$mnplen = 0;
+%mnp = ();
+$ins = 0;
+$inslen = 0;
+%ins = ();
+$del = 0;
+$dellen = 0;
+%del = ();
+
+%dint = (); # di-nucleotide distribution
+
+# Reads VCF records from stdin and accumulates the SNP, MNP, insertion and
+# deletion counters reported after the loop.
+while (<STDIN>) {
+    # header lines carry no variants
+    next if $_ =~ /^#/;
+
+    # CHROM POS ID REF ALT ...; a line lacking these columns is skipped
+    # rather than being counted again with the previous record's fields
+    next unless $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
+    $chrom = $1;
+    $pos = $2;
+    $tag = $3;
+    $ref = $4;
+    $alt = $5;
+    #print "chrom: $chrom, pos: $pos, ref: $ref, alt: $alt\n";
+
+    # positive for a deletion, negative for an insertion; $alt is the whole
+    # ALT column, so a multi-allelic record is measured on that full string
+    $diff = length($ref) - length($alt);
+
+    $is_snp = 0;
+    if ($_ =~ /SNP/) {
+        $is_snp = 1;
+        $snp += 1;
+        # get di-nt's
+        if ($reference) {
+            # NOTE(review): only records whose sequence name is all digits
+            # are looked up; confirm whether names such as X are meant to be
+            # left out of the di-nucleotide distribution
+            if ($_ =~ /^(\d+)\t(\d+)/) {
+                $seq = $1;
+                $start = $2;
+                $end = $2 + 1;
+                print FASTAHACK_IN "$seq:$start..$end\n";
+                $dibp = <FASTAHACK_OUT>;
+                chomp $dibp;
+                $dint{$dibp} += 1;
+            }
+        }
+    } elsif ($diff == 0 and length($ref) == 1) {
+        $snp += 1;
+        $is_snp = 1;
+    } elsif ($diff == 0 and length($ref) > 1) {
+        $mnp += 1;
+        $mnplen += length($ref);
+        $mnp{length($ref)} += 1;
+    }
+    if ($is_snp) {
+        # A<->G and C<->T are transitions; every other SNP is counted as a
+        # transversion
+        if ((($ref eq "A" and $alt eq "G") or ($ref eq "G" and $alt eq "A"))
+            or
+            (($ref eq "C" and $alt eq "T") or ($ref eq "T" and $alt eq "C"))) {
+            $ts += 1;
+        } else {
+            $tv += 1;
+        }
+        if ($_ =~ /CpG/) { $cpg += 1; }
+    }
+
+    if ($diff < 0) {
+        $len = abs($diff);
+        $ins += 1;
+        $inslen += $len;
+        $ins{$len} += 1;
+    }
+
+    if ($diff > 0) {
+        $len = abs($diff);
+        $del += 1;
+        $dellen += $len;
+        $del{$len} += 1;
+    }
+
+    $total += 1;
+}
+
+# nothing to report when stdin held no records
+die "no VCF records read on stdin\n" if $total == 0;
+
+print "total variants:\t$total\n";
+print "\n";
+
+# SNP summary; each ratio is printed only when its count is non-zero
+if ($snp > 0) {
+    print "total snps:\t$snp\n",
+          "transitions:\t$ts\n",
+          "transversions:\t$tv\n";
+    print "ts/tv ratio:\t" . ($ts / $tv) . "\n" if $tv > 0;
+    print "CpG sites:\t$cpg\n";
+    print "CpG/total snps:\t" . ($cpg / $snp) . "\n" if $cpg > 0;
+}
+
+# Indel summary, followed (when both insertions and deletions were seen) by
+# a table with one row per size from 1 to the largest size observed.
+if (($ins + $del) > 0) {
+    print "\n";
+    print "total indels:\t" . ($ins + $del) . "\n";
+    print "insertions:\t$ins\t$inslen bp\n";
+    print "deletions:\t$del\t$dellen bp\n";
+
+    # largest insertion or deletion size seen
+    $max = 0;
+    while ( my ($size, $count) = each(%ins) ) {
+        if ($size > $max) { $max = $size; }
+    }
+    while ( my ($size, $count) = each(%del) ) {
+        if ($size > $max) { $max = $size; }
+    }
+
+    print "\n";
+
+    if ($inslen > 0 and $dellen > 0) {
+        $indel_length_ratio = $inslen / $dellen;
+        print "ins/del length ratio:\t$indel_length_ratio\n";
+        print "\n";
+        print "indel size frequency distribution\n";
+        print "size\tins\tdel\tins/del\tcurr/prev\n";
+
+        $last_delcount = 0;
+        $last_inscount = 0;
+        for (1 .. $max) {
+            $inscount = $ins{$_};
+            $delcount = $del{$_};
+            # curr/prev ratios are recomputed for every row; after a size
+            # with no events there is no previous count, so the ratio stays
+            # 0 and is not printed (instead of repeating an older ratio)
+            $last_ratio_del = 0;
+            $last_ratio_ins = 0;
+            if ($last_delcount != 0) {
+                $last_ratio_del = $delcount / $last_delcount;
+            }
+            if ($last_inscount != 0) {
+                $last_ratio_ins = $inscount / $last_inscount;
+            }
+            $last_delcount = $delcount;
+            $last_inscount = $inscount;
+            if ($inscount > 0 and $delcount > 0) {
+                $ratio = $inscount / $delcount;
+            } else {
+                $ratio = "";
+            }
+            print "$_\t$inscount\t$delcount\t"
+                . sprintf("%.3f", $ratio);
+            if ($last_ratio_ins != 0 or $last_ratio_del != 0) {
+                print "\t";
+                if ($last_ratio_ins != 0) {
+                    print sprintf("%.3f", $last_ratio_ins);
+                }
+                print "\t";
+                if ($last_ratio_del != 0) {
+                    print sprintf("%.3f", $last_ratio_del);
+                }
+                print "\n";
+            } else {
+                print "\n";
+            }
+        }
+        # FIXME
+        #print "\t\t\t\t" . sprintf("%.3f", $even_odd_ratio_sum_ins / $ins)
+        #    . "\t" . sprintf("%.3f", $even_odd_ratio_sum_del / $del);
+    }
+}
+
+# MNP summary and a size table running from 2 to the largest MNP length seen
+if ($mnplen > 0) {
+    print "\n",
+          "total mnps:\t$mnp\n",
+          "mnps length:\t$mnplen\n",
+          "mnp size distribution\n";
+    $max = 0;
+    foreach my $size (keys %mnp) {
+        $max = $size if $size > $max;
+    }
+    print "size\tcount\n";
+    foreach my $size (2 .. $max) {
+        print $size . "\t" . $mnp{$size} . "\n";
+    }
+}
+
+# Di-nucleotide table, printed only when a reference was supplied: one row
+# per di-nucleotide seen, with its count and the count divided by
+# (total snps / 16).
+if ($reference) {
+
+    print "\n";
+
+    print "di-nucleotide distribution for SNPs\n";
+    print "di-nt\tcount\tcount/(total snps / 16)\n";
+    foreach my $dibp (keys %dint) {
+        my $count = $dint{$dibp};
+        print "$dibp\t$count\t" . ($count / ($snp / 16)) . "\n";
+    }
+
+}
+
diff --git a/samples/sample.vcf b/samples/sample.vcf
new file mode 100644
index 0000000..e8dd794
--- /dev/null
+++ b/samples/sample.vcf
@@ -0,0 +1,31 @@
+##fileformat=VCFv4.0
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=1000GenomesPilot-NCBI36
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+##ALT=<ID=DEL:ME:ALU,Description="Deletion of ALU element">
+##ALT=<ID=CNV,Description="Copy number variable region">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
+19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3
+19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3
+20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
+20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,.
+20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,.
+20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,.
+20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 1/1:40:3
+20 1235237 . T . . . . GT 0/0 0|0 ./.
+X 10 rsTest AC A,ATG 10 PASS . GT 0 0/1 0|2
diff --git a/src/BedReader.h b/src/BedReader.h
new file mode 100644
index 0000000..9deee16
--- /dev/null
+++ b/src/BedReader.h
@@ -0,0 +1,176 @@
+#ifndef BEDREADER_H
+#define BEDREADER_H
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <deque>
+#include <map>
+#include <iterator>
+#include <algorithm>
+#include "intervaltree/IntervalTree.h"
+#include "split.h"
+
+using namespace std;
+
+// Returns str without the leading and trailing characters listed in
+// separators (space and tab by default); returns an empty string when str
+// consists only of such characters.
+// inline: the function is defined in a header, so a plain definition would
+// be emitted by every translation unit that includes it.
+inline string strip(string const& str, char const* separators = " \t") {
+    string::size_type const first = str.find_first_not_of(separators);
+    return (first == string::npos) ? string()
+        : str.substr(first, str.find_last_not_of(separators) - first + 1);
+}
+
+// Splits a region string of the form "seq", "seq:start", "seq:start..stop"
+// or "seq:start-stop" into its parts.
+//   region   - the region string to parse
+//   startSeq - receives the text before the first ':' (or the whole string)
+//   startPos - receives the start position, 0 when none is given
+//   stopPos  - receives the stop position; startPos + 1 when only a start is
+//              given, -1 when no position or no stop is given
+// inline: the function is defined in a header, so a plain definition would
+// be emitted by every translation unit that includes it.
+inline void parseRegion(
+    string& region,
+    string& startSeq,
+    int& startPos,
+    int& stopPos) {
+
+    size_t foundFirstColon = region.find(":");
+
+    // we only have a single string, use the whole sequence as the target
+    if (foundFirstColon == string::npos) {
+        startSeq = region;
+        startPos = 0;
+        stopPos = -1;
+        return;
+    }
+
+    startSeq = region.substr(0, foundFirstColon);
+
+    // the range separator is ".." or, failing that, "-"
+    string sep = "..";
+    size_t foundRangeSep = region.find(sep, foundFirstColon);
+    if (foundRangeSep == string::npos) {
+        sep = "-";
+        foundRangeSep = region.find(sep, foundFirstColon);
+    }
+
+    if (foundRangeSep == string::npos) {
+        startPos = atoi(region.substr(foundFirstColon + 1).c_str());
+        // differ from bamtools in this regard, in that we process only
+        // the specified position if a range isn't given
+        stopPos = startPos + 1;
+        return;
+    }
+
+    // the start number lies strictly between the colon and the separator
+    startPos = atoi(region.substr(foundFirstColon + 1, foundRangeSep - foundFirstColon - 1).c_str());
+    // if we have range sep specified, but no second number, read to the end of sequence
+    if (foundRangeSep + sep.size() != region.size()) {
+        stopPos = atoi(region.substr(foundRangeSep + sep.size()).c_str()); // end-exclusive, bed-format
+    } else {
+        //stopPos = reference.sequenceLength(startSeq);
+        stopPos = -1;
+    }
+}
+
+// stores the positional information of a bed target entry
+class BedTarget {
+
+public:
+
+    string seq;   // sequence name
+    int left;     // left position
+    int right;    // right position, adjusted to 0-base
+    string desc;  // descriptive information, target name typically
+
+    // Builds a target from a region string: parseRegion() fills in seq,
+    // left and right, and desc is left empty.
+    BedTarget(string s) {
+        parseRegion(s, seq, left, right);
+    }
+
+    // Builds a target from its individual fields.
+    BedTarget(string s, int l, int r, string d = "")
+        : seq(s), left(l), right(r), desc(d)
+    { }
+
+};
+
+
+// Reads the targets of a BED file and indexes them, per sequence name, in
+// interval trees that can be queried for contained or overlapping targets.
+class BedReader {
+
+    bool _isOpen;   // true once open() has successfully opened a file
+    ifstream file;
+
+public:
+
+    bool isOpen(void) { return _isOpen; }
+
+    vector<BedTarget> targets;
+    map<string, IntervalTree<BedTarget*> > intervals; // intervals by reference sequence
+
+    // Parses the remaining lines of the open file into targets: the first
+    // three fields are sequence, left and right, an optional fourth field is
+    // the description.  Exits the process when no file is open.
+    vector<BedTarget> entries(void) {
+
+        vector<BedTarget> entries;
+
+        if (!isOpen()) {
+            cerr << "bed targets file is not open" << endl;
+            exit(1);
+        }
+
+        string line;
+        while (std::getline(file, line)) {
+            vector<string> fields = split(line, " \t");
+            // a line with fewer than three fields (for instance a blank
+            // line) holds no target; indexing into it would run past the
+            // end of the vector
+            if (fields.size() < 3) {
+                continue;
+            }
+            BedTarget entry(strip(fields[0]),
+                            atoi(strip(fields[1]).c_str()),
+                            atoi(strip(fields[2]).c_str()),
+                            (fields.size() >= 4) ? strip(fields[3]) : "");
+            entries.push_back(entry);
+        }
+
+        return entries;
+
+    }
+
+    // Returns the indexed targets that findContained() reports for the
+    // [left, right] range of target on target's sequence.
+    vector<BedTarget*> targetsContained(BedTarget& target) {
+        vector<Interval<BedTarget*> > results;
+        intervals[target.seq].findContained(target.left, target.right, results);
+        vector<BedTarget*> contained;
+        for (vector<Interval<BedTarget*> >::iterator r = results.begin(); r != results.end(); ++r) {
+            contained.push_back(r->value);
+        }
+        return contained;
+    }
+
+    // Returns the indexed targets that findOverlapping() reports for the
+    // [left, right] range of target on target's sequence.
+    vector<BedTarget*> targetsOverlapping(BedTarget& target) {
+        vector<Interval<BedTarget*> > results;
+        intervals[target.seq].findOverlapping(target.left, target.right, results);
+        vector<BedTarget*> overlapping;
+        for (vector<Interval<BedTarget*> >::iterator r = results.begin(); r != results.end(); ++r) {
+            overlapping.push_back(r->value);
+        }
+        return overlapping;
+    }
+
+    BedReader(void)
+        : _isOpen(false)
+    { }
+
+    BedReader(string& fname)
+        : _isOpen(false) {
+        open(fname);
+    }
+
+    // Builds one interval tree per sequence from the given targets and
+    // stores it in `intervals`, replacing any tree already held for that
+    // sequence.  The trees keep pointers to the elements of the vector
+    // passed in, so that vector has to stay alive and unmodified for as
+    // long as the trees are queried.
+    void addTargets(vector<BedTarget>& targets) {
+        map<string, vector<Interval<BedTarget*> > > intervalsBySeq;
+        for (vector<BedTarget>::iterator t = targets.begin(); t != targets.end(); ++t) {
+            intervalsBySeq[t->seq].push_back(Interval<BedTarget*>(1 + t->left, t->right, &*t));
+        }
+        for (map<string, vector<Interval<BedTarget*> > >::iterator s = intervalsBySeq.begin(); s != intervalsBySeq.end(); ++s) {
+            intervals[s->first] = IntervalTree<BedTarget*>(s->second);
+        }
+    }
+
+    // Opens fname, reads its targets into `targets` and indexes them.
+    // When the file cannot be opened a message is written to cerr, isOpen()
+    // stays false and the targets are left untouched.
+    void open(const string& fname) {
+        file.open(fname.c_str());
+        _isOpen = file.is_open();
+        if (!_isOpen) {
+            cerr << "could not open bed targets file " << fname << endl;
+            return;
+        }
+        targets = entries();
+        // index the member vector; the local parameter of addTargets is
+        // bound to it, so the stored pointers refer to `targets`
+        addTargets(targets);
+    }
+
+};
+
+#endif
+
diff --git a/src/Variant.cpp b/src/Variant.cpp
new file mode 100644
index 0000000..d6f4a92
--- /dev/null
+++ b/src/Variant.cpp
@@ -0,0 +1,2405 @@
+#include "Variant.h"
+#include <utility>
+
+namespace vcf {
+
+// Fills this Variant from one tab-separated VCF record.
+//   line         - the record text
+//   parseSamples - when true the FORMAT and sample columns are parsed into
+//                  `format` and `samples`; when false the line is kept in
+//                  `originalLine` instead
+// Exits the process when the record has fewer than 7 fields or when the
+// number of sample columns differs from the number of sample names.
+void Variant::parse(string& line, bool parseSamples) {
+
+    // clean up potentially variable data structures
+    info.clear();
+    infoFlags.clear();
+    format.clear();
+    alt.clear();
+    alleles.clear();
+
+    // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT [SAMPLE1 .. SAMPLEN]
+    vector<string> fields = split(line, '\t');
+    if (fields.size() < 7) {
+        cerr << "broken VCF record (less than 7 fields)" << endl
+             << line << endl;
+        exit(1);
+    }
+
+    sequenceName = fields.at(0);
+    char* end; // dummy variable for strtoll
+    position = strtoll(fields.at(1).c_str(), &end, 10);
+    id = fields.at(2);
+    ref = fields.at(3);
+    alt = split(fields.at(4), ","); // a comma-separated list of alternate alleles
+
+    // make a list of all (ref + alts) alleles, allele[0] = ref, alleles[1:] = alts
+    // add the ref allele ([0]), resize for the alt alleles, and then add the alt alleles
+    alleles.push_back(ref);
+    alleles.resize(alt.size()+1);
+    std::copy(alt.begin(), alt.end(), alleles.begin()+1);
+
+    // set up reverse lookup of allele index
+    altAlleleIndexes.clear();
+    int n = 0;
+    for (vector<string>::iterator a = alt.begin();
+            a != alt.end(); ++a, ++n) {
+        altAlleleIndexes[*a] = n;
+    }
+
+    convert(fields.at(5), quality);
+    filter = fields.at(6);
+    if (fields.size() > 7) {
+        // INFO: "key=v1,v2" entries go to `info`, bare keys to `infoFlags`
+        vector<string> infofields = split(fields.at(7), ';');
+        for (vector<string>::iterator f = infofields.begin(); f != infofields.end(); ++f) {
+            if (*f == ".") {
+                continue;
+            }
+            vector<string> kv = split(*f, '=');
+            if (kv.size() == 2) {
+                split(kv.at(1), ',', info[kv.at(0)]);
+            } else if (kv.size() == 1) {
+                infoFlags[kv.at(0)] = true;
+            }
+        }
+    }
+    // check if we have samples specified
+    // and that we are supposed to parse them
+    if (parseSamples && fields.size() > 8) {
+        format = split(fields.at(8), ':');
+        // if the format changed, we have to rebuild the samples
+        if (fields.at(8) != lastFormat) {
+            samples.clear();
+            lastFormat = fields.at(8);
+        }
+        vector<string>::iterator sampleName = sampleNames.begin();
+        vector<string>::iterator sample = fields.begin() + 9;
+        for (; sample != fields.end() && sampleName != sampleNames.end(); ++sample, ++sampleName) {
+            string& name = *sampleName;
+            if (*sample == "." || *sample == "./.") {
+                samples.erase(name);
+                continue;
+            }
+            vector<string> samplefields = split(*sample, ':');
+            if (samplefields.size() != format.size()) {
+                // The sample does not match FORMAT and cannot be mapped onto
+                // it.  `samples` survives from the previous record whenever
+                // FORMAT is unchanged, so the entry is removed: otherwise
+                // this record would report the previous record's values for
+                // this sample.
+                samples.erase(name);
+                continue;
+            }
+            vector<string>::iterator i = samplefields.begin();
+            for (vector<string>::iterator f = format.begin(); f != format.end(); ++f) {
+                samples[name][*f] = split(*i, ','); ++i;
+            }
+        }
+        if (sampleName != sampleNames.end()) {
+            cerr << "error: more sample names in header than sample fields" << endl;
+            cerr << "samples: " << join(sampleNames, " ") << endl;
+            cerr << "line: " << line << endl;
+            exit(1);
+        }
+        if (sample != fields.end()) {
+            cerr << "error: more sample fields than samples listed in header" << endl;
+            cerr << "samples: " << join(sampleNames, " ") << endl;
+            cerr << "line: " << line << endl;
+            cerr << *sample << endl;
+            exit(1);
+        }
+    } else if (!parseSamples) {
+        originalLine = line;
+    }
+
+    //return true; // we should be catching exceptions...
+}
+
+// Attaches this variant to the given VCF file object: remembers its address
+// and takes its sample names as both the input and the output sample list.
+void Variant::setVariantCallFile(VariantCallFile& v) {
+    vcf = &v;
+    sampleNames = v.sampleNames;
+    outputSampleNames = sampleNames;
+}
+
+// Pointer overload: attaches this variant to *v, exactly as the reference
+// overload does.  v is dereferenced without a null check.
+void Variant::setVariantCallFile(VariantCallFile* v) {
+    setVariantCallFile(*v);
+}
+
+// Writes the lower-case name of a field type ("integer", "float", "bool",
+// "string"), or "unknown" for any other value.
+ostream& operator<<(ostream& out, VariantFieldType type) {
+    const char* label = "unknown";
+    switch (type) {
+        case FIELD_INTEGER: label = "integer"; break;
+        case FIELD_FLOAT:   label = "float";   break;
+        case FIELD_BOOL:    label = "bool";    break;
+        case FIELD_STRING:  label = "string";  break;
+        default:                               break;
+    }
+    out << label;
+    return out;
+}
+
+// Maps the Type= value of a header line ("Integer", "Float", "Flag",
+// "String") to the matching field type; anything else is FIELD_UNKNOWN.
+VariantFieldType typeStrToVariantFieldType(string& typeStr) {
+    if (typeStr == "Integer") return FIELD_INTEGER;
+    if (typeStr == "Float")   return FIELD_FLOAT;
+    if (typeStr == "Flag")    return FIELD_BOOL;
+    if (typeStr == "String")  return FIELD_STRING;
+    return FIELD_UNKNOWN;
+}
+
+// Returns the declared type of the INFO field `key`.  "QUAL" is accepted as
+// a pseudo INFO field of integer type when it is not declared; any other
+// undeclared key ends the process with an error message.
+VariantFieldType Variant::infoType(string& key) {
+    map<string, VariantFieldType>::iterator declared = vcf->infoTypes.find(key);
+    if (declared != vcf->infoTypes.end()) {
+        return declared->second;
+    }
+    if (key == "QUAL") { // hack to use QUAL as an "info" field
+        return FIELD_INTEGER;
+    }
+    cerr << "no info field " << key << endl;
+    exit(1);
+}
+
+ // Returns the declared type of the FORMAT field `key`; an undeclared key
+ // ends the process with an error message.
+ VariantFieldType Variant::formatType(string& key) {
+     map<string, VariantFieldType>::iterator declared = vcf->formatTypes.find(key);
+     if (declared != vcf->formatTypes.end()) {
+         return declared->second;
+     }
+     cerr << "no format field " << key << endl;
+     exit(1);
+ }
+
+ // Returns whether the flag-typed INFO field `key` is present on this
+ // record.  Exits the process when the field is not declared in the
+ // header; for a field that is declared with a non-flag type an error is
+ // written to cerr and false is returned.
+ bool Variant::getInfoValueBool(string& key, int index) {
+     map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
+     if (s == vcf->infoTypes.end()) {
+         cerr << "no info field " << key << endl;
+         exit(1);
+     }
+     int count = vcf->infoCounts[key];
+     // XXX TODO, fix for Genotype variants...
+     if (count != ALLELE_NUMBER) {
+         index = 0;
+     }
+     if (index == INDEX_NONE) {
+         if (count != 1) {
+             cerr << "no field index supplied and field count != 1" << endl;
+             exit(1);
+         } else {
+             index = 0;
+         }
+     }
+     if (s->second != FIELD_BOOL) {
+         cerr << "not flag type " << key << endl;
+         // return a defined value instead of running off the end
+         return false;
+     }
+     return infoFlags.find(key) != infoFlags.end();
+ }
+
+ // Returns element `index` of the string-typed INFO field `key`, or "" when
+ // the record does not carry the field.  Exits the process when the field
+ // is not declared in the header; for a field declared with another type
+ // an error is written to cerr and "" is returned.
+ string Variant::getInfoValueString(string& key, int index) {
+     map<string, VariantFieldType>::iterator declared = vcf->infoTypes.find(key);
+     if (declared == vcf->infoTypes.end()) {
+         cerr << "no info field " << key << endl;
+         exit(1);
+     }
+
+     int count = vcf->infoCounts[key];
+     // XXX TODO, fix for Genotype variants...
+     if (count != ALLELE_NUMBER) {
+         index = 0;
+     }
+     if (index == INDEX_NONE) {
+         if (count != 1) {
+             cerr << "no field index supplied and field count != 1" << endl;
+             exit(1);
+         }
+         index = 0;
+     }
+
+     if (declared->second != FIELD_STRING) {
+         cerr << "not string type " << key << endl;
+         return "";
+     }
+
+     map<string, vector<string> >::iterator values = info.find(key);
+     if (values == info.end()) {
+         return "";
+     }
+     return values->second.at(index);
+ }
+
+ // Returns element `index` of the float- or integer-typed INFO field `key`
+ // as a double, or 0 when the record does not carry the field.  "QUAL"
+ // yields the record quality when it is not a declared INFO field.  Exits
+ // the process when the field is undeclared, has another type, or its
+ // value cannot be converted.
+ double Variant::getInfoValueFloat(string& key, int index) {
+     map<string, VariantFieldType>::iterator declared = vcf->infoTypes.find(key);
+     if (declared == vcf->infoTypes.end()) {
+         if (key == "QUAL") {
+             return quality;
+         }
+         cerr << "no info field " << key << endl;
+         exit(1);
+     }
+
+     int count = vcf->infoCounts[key];
+     // XXX TODO, fix for Genotype variants...
+     if (count != ALLELE_NUMBER) {
+         index = 0;
+     }
+     if (index == INDEX_NONE) {
+         if (count != 1) {
+             cerr << "no field index supplied and field count != 1" << endl;
+             exit(1);
+         }
+         index = 0;
+     }
+
+     VariantFieldType type = declared->second;
+     if (type != FIELD_FLOAT && type != FIELD_INTEGER) {
+         cerr << "unsupported type for variant record " << type << endl;
+         exit(1);
+     }
+
+     map<string, vector<string> >::iterator values = info.find(key);
+     if (values == info.end()) {
+         return 0;
+     }
+     double r;
+     if (!convert(values->second.at(index), r)) {
+         cerr << "could not convert field " << key << "=" << values->second.at(index) << " to " << type << endl;
+         exit(1);
+     }
+     return r;
+ }
+
+ // Returns the number of sample names attached to this variant.
+ int Variant::getNumSamples(void) {
+     const int sampleCount = sampleNames.size();
+     return sampleCount;
+ }
+
+ // Returns the number of samples whose first GT value is something other
+ // than "./.".  A sample without a GT entry, or with an empty one, is not
+ // counted (reading front() of its missing value would be undefined).
+ int Variant::getNumValidGenotypes(void) {
+     int valid_genotypes = 0;
+     map<string, map<string, vector<string> > >::const_iterator s = samples.begin();
+     map<string, map<string, vector<string> > >::const_iterator sEnd = samples.end();
+     for (; s != sEnd; ++s) {
+         // look the field up in place instead of copying the sample's map
+         map<string, vector<string> >::const_iterator gt = s->second.find("GT");
+         if (gt == s->second.end() || gt->second.empty()) {
+             continue;
+         }
+         if (gt->second.front() != "./.") {
+             valid_genotypes++;
+         }
+     }
+     return valid_genotypes;
+ }
+
+ // Returns whether `sample` carries the bool-typed FORMAT field `key`.
+ // Exits the process when the field is not declared in the header; for a
+ // field declared with another type an error is written to cerr and false
+ // is returned.
+ bool Variant::getSampleValueBool(string& key, string& sample, int index) {
+     map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
+     // the iterator comes from formatTypes, so it is compared with that
+     // map's end()
+     if (s == vcf->formatTypes.end()) {
+         cerr << "no format field " << key << endl;
+         exit(1);
+     }
+     int count = vcf->formatCounts[key];
+     // XXX TODO, fix for Genotype variants...
+     if (count != ALLELE_NUMBER) {
+         index = 0;
+     }
+     if (index == INDEX_NONE) {
+         if (count != 1) {
+             cerr << "no field index supplied and field count != 1" << endl;
+             exit(1);
+         } else {
+             index = 0;
+         }
+     }
+     VariantFieldType type = s->second;
+     map<string, vector<string> >& sampleData = samples[sample];
+     if (type != FIELD_BOOL) {
+         cerr << "not bool type " << key << endl;
+         // return a defined value instead of running off the end
+         return false;
+     }
+     return sampleData.find(key) != sampleData.end();
+ }
+
+ // Returns element `index` of the string-typed FORMAT field `key` for
+ // `sample`, or "" when the sample does not carry the field.  Exits the
+ // process when the field is not declared in the header; for a field
+ // declared with another type an error is written to cerr and "" is
+ // returned.
+ string Variant::getSampleValueString(string& key, string& sample, int index) {
+     map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
+     // the iterator comes from formatTypes, so it is compared with that
+     // map's end()
+     if (s == vcf->formatTypes.end()) {
+         cerr << "no format field " << key << endl;
+         exit(1);
+     }
+     int count = vcf->formatCounts[key];
+     // XXX TODO, fix for Genotype variants...
+     if (count != ALLELE_NUMBER) {
+         index = 0;
+     }
+     if (index == INDEX_NONE) {
+         if (count != 1) {
+             cerr << "no field index supplied and field count != 1" << endl;
+             exit(1);
+         } else {
+             index = 0;
+         }
+     }
+     VariantFieldType type = s->second;
+     map<string, vector<string> >& sampleData = samples[sample];
+     if (type != FIELD_STRING) {
+         cerr << "not string type " << key << endl;
+         // return a defined value instead of running off the end
+         return "";
+     }
+     map<string, vector<string> >::iterator b = sampleData.find(key);
+     if (b == sampleData.end()) {
+         return "";
+     }
+     return b->second.at(index);
+ }
+
+ // Returns element `index` of the float- or integer-typed FORMAT field
+ // `key` for `sample` as a double, or 0 when the sample does not carry the
+ // field.  Exits the process when the field is not declared in the header
+ // or its value cannot be converted; for a field declared with another
+ // type an error is written to cerr and 0 is returned.
+ double Variant::getSampleValueFloat(string& key, string& sample, int index) {
+     map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
+     // the iterator comes from formatTypes, so it is compared with that
+     // map's end()
+     if (s == vcf->formatTypes.end()) {
+         cerr << "no format field " << key << endl;
+         exit(1);
+     }
+     // XXX TODO wrap this with a function call
+     int count = vcf->formatCounts[key];
+     // XXX TODO, fix for Genotype variants...
+     if (count != ALLELE_NUMBER) {
+         index = 0;
+     }
+     if (index == INDEX_NONE) {
+         if (count != 1) {
+             cerr << "no field index supplied and field count != 1" << endl;
+             exit(1);
+         } else {
+             index = 0;
+         }
+     }
+     VariantFieldType type = s->second;
+     map<string, vector<string> >& sampleData = samples[sample];
+     if (type != FIELD_FLOAT && type != FIELD_INTEGER) {
+         cerr << "unsupported type for sample " << type << endl;
+         // return a defined value instead of running off the end
+         return 0;
+     }
+     map<string, vector<string> >::iterator b = sampleData.find(key);
+     if (b == sampleData.end()) {
+         return 0;
+     }
+     double r;
+     if (!convert(b->second.at(index), r)) {
+         cerr << "could not convert field " << key << "=" << b->second.at(index) << " to " << type << endl;
+         exit(1);
+     }
+     return r;
+ }
+
+ // Reads a bool value: from the INFO field `key` when `sample` is empty,
+ // otherwise from the FORMAT field `key` of that sample.
+ bool Variant::getValueBool(string& key, string& sample, int index) {
+     return sample.empty()
+         ? getInfoValueBool(key, index)
+         : getSampleValueBool(key, sample, index);
+ }
+
+ // Reads a numeric value: from the INFO field `key` when `sample` is empty,
+ // otherwise from the FORMAT field `key` of that sample.
+ double Variant::getValueFloat(string& key, string& sample, int index) {
+     return sample.empty()
+         ? getInfoValueFloat(key, index)
+         : getSampleValueFloat(key, sample, index);
+ }
+
+ // Reads a string value: from the INFO field `key` when `sample` is empty,
+ // otherwise from the FORMAT field `key` of that sample.
+ string Variant::getValueString(string& key, string& sample, int index) {
+     return sample.empty()
+         ? getInfoValueString(key, index)
+         : getSampleValueString(key, sample, index);
+ }
+
+ // Returns the position of `allele` in the alternate allele list, as
+ // recorded in altAlleleIndexes; exits the process when the allele is not
+ // one of this record's alternates.
+ int Variant::getAltAlleleIndex(string& allele) {
+     map<string, int>::iterator found = altAlleleIndexes.find(allele);
+     if (found != altAlleleIndexes.end()) {
+         return found->second;
+     }
+     cerr << "no such allele \'" << allele << "\' in record " << sequenceName << ":" << position << endl;
+     exit(1);
+ }
+
+ // Adds `tag` to the FILTER value: it replaces an empty or "." filter and
+ // is otherwise appended after a comma.
+ // NOTE(review): the VCF specification describes FILTER as a
+ // semicolon-separated list - confirm whether the comma is intended.
+ void Variant::addFilter(string& tag) {
+     const bool unset = (filter == "" || filter == ".");
+     if (unset) {
+         filter = tag;
+         return;
+     }
+     filter += "," + tag;
+ }
+
+ // Appends `key` to the FORMAT list unless it is already listed.
+ void Variant::addFormatField(string& key) {
+     vector<string>::iterator t = format.begin();
+     while (t != format.end() && *t != key) {
+         ++t;
+     }
+     // reaching the end means the key was not found
+     if (t == format.end()) {
+         format.push_back(key);
+     }
+ }
+
+ // Writes the alternate alleles to `out`, separated by commas.
+ void Variant::printAlt(ostream& out) {
+     for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
+         // a comma precedes every allele except the first
+         if (a != alt.begin()) out << ",";
+         out << *a;
+     }
+ }
+
+ // Writes every entry of `alleles` to `out`, separated by commas.
+ void Variant::printAlleles(ostream& out) {
+     for (vector<string>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
+         // a comma precedes every allele except the first
+         if (a != alleles.begin()) out << ",";
+         out << *a;
+     }
+ }
+
+ // Writes `var` as one tab-separated VCF record (without a trailing
+ // newline).  Empty sequence name, id, ref, alt and filter are first set to
+ // "." on `var` itself.  The FORMAT and sample columns are written only when
+ // `format` is not empty; a sample that is missing or has no data is
+ // written as ".", as is any FORMAT field a sample lacks.
+ ostream& operator<<(ostream& out, Variant& var) {
+     // ensure there are no empty fields
+     if (var.sequenceName.empty()) var.sequenceName = ".";
+     if (var.id.empty()) var.id = ".";
+     if (var.ref.empty()) var.ref = ".";
+     if (var.alt.empty()) var.alt.push_back(".");
+     if (var.filter.empty()) var.filter = ".";
+
+     out << var.sequenceName << "\t"
+         << var.position << "\t"
+         << var.id << "\t"
+         << var.ref << "\t";
+     // report the list of alternate alleles.
+     var.printAlt(out);
+     out << "\t"
+         << var.quality << "\t"
+         << var.filter << "\t";
+
+     // INFO column: key=value entries, then flags, separated by ';'.  The
+     // separator depends on whether anything has been written so far, so an
+     // entry with an empty value list (which is skipped) can neither cause a
+     // leading ';' nor leave the column empty.
+     bool wroteInfo = false;
+     for (map<string, vector<string> >::iterator i = var.info.begin(); i != var.info.end(); ++i) {
+         if (i->second.empty()) {
+             continue;
+         }
+         out << (wroteInfo ? ";" : "") << i->first << "=" << join(i->second, ",");
+         wroteInfo = true;
+     }
+     for (map<string, bool>::iterator i = var.infoFlags.begin(); i != var.infoFlags.end(); ++i) {
+         out << (wroteInfo ? ";" : "") << i->first;
+         wroteInfo = true;
+     }
+     if (!wroteInfo) {
+         out << ".";
+     }
+
+     if (!var.format.empty()) {
+         out << "\t";
+         for (vector<string>::iterator f = var.format.begin(); f != var.format.end(); ++f) {
+             out << ((f == var.format.begin()) ? "" : ":") << *f;
+         }
+         for (vector<string>::iterator s = var.outputSampleNames.begin(); s != var.outputSampleNames.end(); ++s) {
+             out << "\t";
+             map<string, map<string, vector<string> > >::iterator sampleItr = var.samples.find(*s);
+             if (sampleItr == var.samples.end()) {
+                 out << ".";
+             } else {
+                 map<string, vector<string> >& sample = sampleItr->second;
+                 if (sample.size() == 0) {
+                     out << ".";
+                 } else {
+                     for (vector<string>::iterator f = var.format.begin(); f != var.format.end(); ++f) {
+                         map<string, vector<string> >::iterator g = sample.find(*f);
+                         out << ((f == var.format.begin()) ? "" : ":");
+                         if (g != sample.end() && !g->second.empty()) {
+                             out << join(g->second, ",");
+                         } else {
+                             out << ".";
+                         }
+                     }
+                 }
+             }
+         }
+     }
+     return out;
+ }
+
+ // Replaces the list of sample names that operator<< writes columns for
+ // with a copy of samplesToOutput.
+ void Variant::setOutputSampleNames(vector<string>& samplesToOutput) {
+ outputSampleNames = samplesToOutput;
+ }
+
+
+// shunting yard algorithm
+//
+// Reorders the infix token sequence `tokens` into `prefixtokens`.
+// tokens - taken by value; the local copy is consumed while reading
+// prefixtokens - receives the reordered tokens, appended to whatever it
+// already holds
+// Operands are appended as soon as they are read; operators wait on a stack
+// and are appended when popped, so each operator is emitted after the
+// operands read before it. Parentheses only steer the stack and are never
+// emitted. Exits the process on mismatched parentheses.
+// NOTE(review): despite the name, the emitted order looks like postfix
+// (operands before operator) - confirm against the evaluator.
+ void infixToPrefix(queue<RuleToken> tokens, queue<RuleToken>& prefixtokens) {
+ stack<RuleToken> ops;
+ while (!tokens.empty()) {
+ RuleToken& token = tokens.front();
+ if (isOperator(token)) {
+ //cerr << "found operator " << token.value << endl;
+ // emit the stacked operators that have to be applied before this one:
+ // equal or higher priority for a left-associative token, strictly
+ // higher priority for a right-associative token
+ while (ops.size() > 0 && isOperator(ops.top())
+ && ( (isLeftAssociative(token) && priority(token) <= priority(ops.top()))
+ || (isRightAssociative(token) && priority(token) < priority(ops.top())))) {
+ prefixtokens.push(ops.top());
+ ops.pop();
+ }
+ ops.push(token);
+ } else if (isLeftParenthesis(token)) {
+ //cerr << "found paran " << token.value << endl;
+ ops.push(token);
+ } else if (isRightParenthesis(token)) {
+ //cerr << "found paran " << token.value << endl;
+ // emit everything stacked since the matching left parenthesis
+ while (ops.size() > 0 && !isLeftParenthesis(ops.top())) {
+ prefixtokens.push(ops.top());
+ ops.pop();
+ }
+ // an empty stack means no left parenthesis was open
+ if (ops.size() == 0) {
+ cerr << "error: mismatched parentheses" << endl;
+ exit(1);
+ }
+ // discard the left parenthesis itself
+ if (isLeftParenthesis(ops.top())) {
+ ops.pop();
+ }
+ } else {
+ //cerr << "found operand " << token.value << endl;
+ prefixtokens.push(token);
+ }
+ // `token` refers to the front element, so it is popped only after use
+ tokens.pop();
+ }
+ // flush the remaining operators; a parenthesis left on the stack was
+ // never closed
+ while (ops.size() > 0) {
+ if (isRightParenthesis(ops.top()) || isLeftParenthesis(ops.top())) {
+ cerr << "error: mismatched parentheses" << endl;
+ exit(1);
+ }
+ prefixtokens.push(ops.top());
+ ops.pop();
+ }
+ }
+
+ // Classify one token of a filter expression.
+ //   tokenstr   the token text; it is always stored in `value`
+ //   variables  names that are looked up in the record at evaluation time
+ // A single-character operator or parenthesis gets the matching type. Anything
+ // else is an operand: a name found in `variables` (or the literal "QUAL") is
+ // flagged isVariable, a token that converts to a number becomes NUMBER (with
+ // `number` set), and any other text becomes STRING_VARIABLE.
+ RuleToken::RuleToken(string tokenstr, map<string, VariantFieldType>& variables) {
+     isVariable = false;
+     // give every scalar member a defined value: tokens are copied through
+     // queues and stacks, and copying an indeterminate bool is undefined
+     state = false;
+     number = 0;
+     value = tokenstr;
+
+     if (tokenstr.size() == 1) {
+         switch (tokenstr[0]) {
+         case '!': type = RuleToken::NOT_OPERATOR; return;
+         case '&': type = RuleToken::AND_OPERATOR; return;
+         case '|': type = RuleToken::OR_OPERATOR; return;
+         case '+': type = RuleToken::ADD_OPERATOR; return;
+         case '-': type = RuleToken::SUBTRACT_OPERATOR; return;
+         case '*': type = RuleToken::MULTIPLY_OPERATOR; return;
+         case '/': type = RuleToken::DIVIDE_OPERATOR; return;
+         case '=': type = RuleToken::EQUAL_OPERATOR; return;
+         case '>': type = RuleToken::GREATER_THAN_OPERATOR; return;
+         case '<': type = RuleToken::LESS_THAN_OPERATOR; return;
+         case '(': type = RuleToken::LEFT_PARENTHESIS; return;
+         case ')': type = RuleToken::RIGHT_PARENTHESIS; return;
+         default: break;   // a one-character operand
+         }
+     }
+
+     // operand
+     type = RuleToken::OPERAND;
+     if (variables.find(tokenstr) != variables.end()) {
+         isVariable = true;
+     } else if (convert(tokenstr, number)) {
+         type = RuleToken::NUMBER;
+     } else if (tokenstr == "QUAL") {
+         isVariable = true;
+     } else {
+         type = RuleToken::STRING_VARIABLE;
+     }
+ }
+
+
+ // Split a filter expression into RuleTokens, appended to `tokens` in input order.
+ // A space or newline ends the operand being accumulated. An operator or
+ // parenthesis character becomes its own one-character token only when it
+ // starts a new token; inside an operand it is kept as ordinary text, so "1/1"
+ // stays a single operand.
+ void tokenizeFilterSpec(string& filterspec, queue<RuleToken>& tokens, map<string, VariantFieldType>& variables) {
+     string pending;   // text of the operand currently being accumulated
+     for (string::size_type pos = 0; pos < filterspec.size(); ++pos) {
+         const char ch = filterspec[pos];
+         if (ch == ' ' || ch == '\n') {
+             if (!pending.empty()) {
+                 tokens.push(RuleToken(pending, variables));
+                 pending.clear();
+             }
+         } else if (pending.empty() && (isOperatorChar(ch) || isParanChar(ch))) {
+             tokens.push(RuleToken(string(1, ch), variables));
+         } else {
+             pending += ch;
+         }
+     }
+     // get the last token
+     if (!pending.empty()) {
+         tokens.push(RuleToken(pending, variables));
+     }
+ }
+
+// class which evaluates filter expressions
+// allow filters to be defined using boolean infix expressions e.g.:
+//
+// "GQ > 10 & (DP < 3 | DP > 5) & SAMPLE = NA12878"
+// or
+// "GT = 1/1 | GT = 0/0"
+//
+// on initialization, tokenizes the input sequence and converts it from infix to postfix;
+// on a call to passes(), evaluates the postfix rules against the given variant
+//
+
+
+ // Build a filter from an infix expression. The text is kept in `spec`,
+ // tokenized into `tokens`, and converted by infixToPrefix() into the queue
+ // `rules` that passes() evaluates.
+ VariantFilter::VariantFilter(string filterspec, VariantFilterType filtertype, map<string, VariantFieldType>& variables) {
+     spec = filterspec;
+     type = filtertype;
+     tokenizeFilterSpec(filterspec, tokens, variables);
+     infixToPrefix(tokens, rules);
+ }
+
+// True only if the filter passes for every alternate allele of the record
+// (and therefore also when the record has no alternates).
+ bool VariantFilter::passes(Variant& var, string& sample) {
+     vector<string>::iterator a = var.alt.begin();
+     while (a != var.alt.end() && passes(var, sample, *a)) {
+         ++a;
+     }
+     return a == var.alt.end();
+ }
+
+ // Exit with a diagnostic unless both operands of a numeric operator are numbers.
+ // `action` completes the message "cannot <action> objects of dissimilar types".
+ static void requireNumericOperands(const RuleToken& a, const RuleToken& b, const char* action) {
+     if (a.type == b.type && a.type == RuleToken::NUMERIC_VARIABLE) {
+         return;
+     }
+     cerr << "cannot " << action << " objects of dissimilar types" << endl;
+     cerr << a.type << " " << b.type << endl;
+     exit(1);
+ }
+
+ // Evaluate the filter's postfix rule queue for one allele of one record.
+ //   var     record supplying the variable values
+ //   sample  empty: variables are typed via var.infoType() (record level);
+ //           otherwise via var.formatType() for that sample
+ //   allele  empty: allele index 0 is used; otherwise the index returned by
+ //           var.getAltAlleleIndex(allele)
+ // Returns the boolean left on the evaluation stack. Malformed expressions
+ // (type mismatches, missing operands, leftover values) print a diagnostic and
+ // call exit(1).
+ bool VariantFilter::passes(Variant& var, string& sample, string& allele) {
+     stack<RuleToken> results;
+     queue<RuleToken> rulesCopy = rules; // evaluation consumes the queue, so work on a copy
+
+     int index;
+     if (allele.empty()) {
+         index = 0; // apply to the whole record
+     } else {
+         index = var.getAltAlleleIndex(allele); // apply to a specific allele
+     }
+
+     while (!rulesCopy.empty()) {
+         RuleToken token = rulesCopy.front();
+         rulesCopy.pop();
+
+         if (isOperand(token)) {
+             if (token.isVariable) {
+                 // resolve the variable against the record before pushing it
+                 VariantFieldType vtype;
+                 if (sample.empty()) { // means we are record-specific
+                     vtype = var.infoType(token.value);
+                 } else {
+                     vtype = var.formatType(token.value);
+                 }
+
+                 if (vtype == FIELD_INTEGER || vtype == FIELD_FLOAT) {
+                     token.type = RuleToken::NUMERIC_VARIABLE;
+                     token.number = var.getValueFloat(token.value, sample, index);
+                 } else if (vtype == FIELD_BOOL) {
+                     token.type = RuleToken::BOOLEAN_VARIABLE;
+                     token.state = var.getValueBool(token.value, sample, index);
+                 } else if (vtype == FIELD_STRING || isString(token)) {
+                     token.type = RuleToken::STRING_VARIABLE;
+                     token.str = var.getValueString(token.value, sample, index);
+                 }
+             } else {
+                 // literal operand: a number if it parses as one, otherwise a string
+                 double f;
+                 string s;
+                 if (convert(token.value, f)) {
+                     token.type = RuleToken::NUMERIC_VARIABLE;
+                     token.number = f;
+                 } else if (convert(token.value, s)) {
+                     token.type = RuleToken::STRING_VARIABLE;
+                     token.str = s;
+                 } else {
+                     cerr << "could not parse non-variable operand " << token.value << endl;
+                     exit(1);
+                 }
+             }
+             results.push(token);
+         } else if (isOperator(token)) {
+             // `a` is the most recently pushed (right-hand) operand, `b` the left-hand one
+             RuleToken a, b, r;
+             const bool unary = (token.type == RuleToken::NOT_OPERATOR);
+             const size_t needed = unary ? 1 : 2;
+             if (results.size() < needed) {
+                 // top()/pop() on an empty stack is undefined behaviour
+                 cerr << "error: operator " << token.value << " is missing an operand" << endl;
+                 exit(1);
+             }
+             a = results.top(); results.pop();
+             if (!unary) {
+                 b = results.top(); results.pop();
+             }
+
+             switch (token.type) {
+                 case ( RuleToken::NOT_OPERATOR ):
+                     if (!isBoolean(a)) {
+                         // the operand is already popped; carrying on would
+                         // evaluate the rest of the expression one value short
+                         cerr << "cannot negate a non-boolean" << endl;
+                         exit(1);
+                     }
+                     a.state = !a.state;
+                     results.push(a);
+                     break;
+
+                 case ( RuleToken::EQUAL_OPERATOR ):
+                     r.type = RuleToken::BOOLEAN_VARIABLE;
+                     if (a.type == b.type) {
+                         switch (a.type) {
+                             case (RuleToken::STRING_VARIABLE):
+                                 r.state = (a.str == b.str);
+                                 break;
+                             case (RuleToken::NUMERIC_VARIABLE):
+                                 r.state = (a.number == b.number);
+                                 break;
+                             case (RuleToken::BOOLEAN_VARIABLE):
+                                 r.state = (a.state == b.state);
+                                 break;
+                             default:
+                                 cerr << "should not get here" << endl; exit(1);
+                                 break;
+                         }
+                     } else if (a.type == RuleToken::STRING_VARIABLE && b.type == RuleToken::NUMERIC_VARIABLE) {
+                         // mixed comparison: compare the number's text form
+                         r.state = (convert(b.number) == a.str);
+                     } else if (b.type == RuleToken::STRING_VARIABLE && a.type == RuleToken::NUMERIC_VARIABLE) {
+                         r.state = (convert(a.number) == b.str);
+                     }
+                     results.push(r);
+                     break;
+
+                 case ( RuleToken::GREATER_THAN_OPERATOR ):
+                     requireNumericOperands(a, b, "compare (>)");
+                     r.type = RuleToken::BOOLEAN_VARIABLE;
+                     r.state = (b.number > a.number);
+                     results.push(r);
+                     break;
+
+                 case ( RuleToken::LESS_THAN_OPERATOR ):
+                     requireNumericOperands(a, b, "compare (<)");
+                     r.type = RuleToken::BOOLEAN_VARIABLE;
+                     r.state = (b.number < a.number);
+                     results.push(r);
+                     break;
+
+                 case ( RuleToken::ADD_OPERATOR ):
+                     requireNumericOperands(a, b, "add");
+                     r.number = (b.number + a.number);
+                     r.type = RuleToken::NUMERIC_VARIABLE;
+                     results.push(r);
+                     break;
+
+                 case ( RuleToken::SUBTRACT_OPERATOR ):
+                     requireNumericOperands(a, b, "subtract");
+                     r.number = (b.number - a.number);
+                     r.type = RuleToken::NUMERIC_VARIABLE;
+                     results.push(r);
+                     break;
+
+                 case ( RuleToken::MULTIPLY_OPERATOR ):
+                     requireNumericOperands(a, b, "multiply");
+                     r.number = (b.number * a.number);
+                     r.type = RuleToken::NUMERIC_VARIABLE;
+                     results.push(r);
+                     break;
+
+                 case ( RuleToken::DIVIDE_OPERATOR ):
+                     requireNumericOperands(a, b, "divide");
+                     r.number = (b.number / a.number);
+                     r.type = RuleToken::NUMERIC_VARIABLE;
+                     results.push(r);
+                     break;
+
+                 case ( RuleToken::AND_OPERATOR ):
+                 case ( RuleToken::OR_OPERATOR ):
+                     if (!(a.type == b.type && a.type == RuleToken::BOOLEAN_VARIABLE)) {
+                         cerr << "cannot compare (& or |) objects of dissimilar types" << endl;
+                         exit(1);
+                     }
+                     r.type = RuleToken::BOOLEAN_VARIABLE;
+                     if (token.type == RuleToken::AND_OPERATOR) {
+                         r.state = (a.state && b.state);
+                     } else {
+                         r.state = (a.state || b.state);
+                     }
+                     results.push(r);
+                     break;
+
+                 default:
+                     cerr << "should not get here!" << endl; exit(1);
+                     break;
+             }
+         }
+     }
+
+     // at the end there should be exactly one value on the stack, and it must be a boolean
+     if (results.size() == 1) {
+         if (isBoolean(results.top())) {
+             return results.top().state;
+         } else {
+             cerr << "error, non-boolean value left on stack" << endl;
+             exit(1);
+         }
+     } else if (results.size() > 1) {
+         cerr << "more than one value left on results stack!" << endl;
+         while (!results.empty()) {
+             cerr << results.top().value << endl;
+             results.pop();
+         }
+         exit(1);
+     } else {
+         cerr << "results stack empty" << endl;
+         exit(1);
+     }
+ }
+
+// Blank out or drop the genotype of every sample in var.sampleNames that
+// fails this filter. With keepInfo the sample is kept and its "GT" field is
+// replaced by "./."; otherwise the sample's entry is erased from var.samples.
+void VariantFilter::removeFilteredGenotypes(Variant& var, bool keepInfo) {
+    for (size_t n = 0; n < var.sampleNames.size(); ++n) {
+        string& name = var.sampleNames[n];
+        if (passes(var, name)) {
+            continue;
+        }
+        if (!keepInfo) {
+            var.samples.erase(name);
+            continue;
+        }
+        vector<string>& gt = var.samples[name]["GT"];
+        gt.clear();
+        gt.push_back("./.");
+    }
+}
+
+/*
+bool VariantCallFile::openVCF(string& filename) {
+ file.open(filename.c_str(), ifstream::in);
+ if (!file.is_open()) {
+ cerr << "could not open " << filename << endl;
+ return false;
+ } else {
+ return parseHeader();
+ }
+}
+
+bool VariantCallFile::openVCF(ifstream& stream) {
+ file = stream;
+ if (!file.is_open()) {
+ cerr << "provided file is not open" << endl;
+ return false;
+ } else {
+ return parseHeader();
+ }
+}
+*/
+
+// Replace the sample columns of the last header line (the column-name line)
+// with `newSamples`, and store them in sampleNames.
+// The first nine columns of that line are kept as they are. A line with only
+// eight columns (no FORMAT) gets a "FORMAT" column inserted before the samples;
+// a shorter line, or an empty header, is reported on stderr and exits with 1.
+void VariantCallFile::updateSamples(vector<string>& newSamples) {
+    sampleNames = newSamples;
+
+    vector<string> headerLines = split(header, '\n');
+    if (headerLines.empty()) {
+        cerr << "error: cannot update samples, VCF header is empty" << endl;
+        exit(1);
+    }
+
+    // regenerate the last line of the header
+    vector<string> colnames = split(headerLines.back(), '\t');
+    const size_t fixedColumns = 9;   // columns that precede the sample columns
+    if (colnames.size() + 1 < fixedColumns) {
+        cerr << "error: cannot update samples, VCF column header has fewer than 8 fields" << endl;
+        exit(1);
+    }
+    if (colnames.size() > fixedColumns) {
+        colnames.resize(fixedColumns);   // drop the old sample columns
+    } else if (colnames.size() < fixedColumns && !sampleNames.empty()) {
+        colnames.push_back("FORMAT");    // sample columns must follow a FORMAT column
+    }
+    colnames.insert(colnames.end(), sampleNames.begin(), sampleNames.end());
+
+    headerLines.back() = join(colnames, "\t");
+    header = join(headerLines, "\n");
+}
+
+// Return a copy of the header whose last line (the column-name line) has
+// `newSamples` appended after all of its existing columns. Unlike
+// updateSamples(), no member is modified and existing sample columns are kept.
+// An empty `newSamples` returns the header unchanged.
+// A header with no lines, or a column line with fewer than 8 fields, is
+// reported on stderr and exits with status 1.
+string VariantCallFile::headerWithSampleNames(vector<string>& newSamples) {
+    if (newSamples.empty()) return header;
+
+    vector<string> headerLines = split(header, '\n');
+    if (headerLines.empty()) {
+        cerr << "error: VCF header is empty" << endl;
+        exit(1);
+    }
+
+    // regenerate the last line of the header
+    vector<string> colnames = split(headerLines.back(), '\t');
+    // NOTE(review): a line with exactly 8 columns has no FORMAT column either,
+    // yet passes this check; confirm whether the limit should be 9.
+    if (colnames.size() < 8) {
+        // this is a fatal error: report it like the rest of the file does,
+        // on stderr with a failing status, and keep it out of the VCF stream
+        cerr << "VCF file is not suitable for use because it does not have a format field." << endl;
+        exit(1);
+    }
+    colnames.insert(colnames.end(), newSamples.begin(), newSamples.end());
+
+    headerLines.back() = join(colnames, "\t");
+    return join(headerLines, "\n");
+}
+
+// TODO cleanup, store header lines instead of bulk header
+// Insert `line` immediately before the last header line, then drop lines that
+// duplicate an earlier one. With an empty header, `line` becomes the only line.
+void VariantCallFile::addHeaderLine(string line) {
+    vector<string> headerLines = split(header, '\n');
+    if (headerLines.empty()) {
+        // end() - 1 does not exist for an empty vector
+        headerLines.push_back(line);
+    } else {
+        headerLines.insert(headerLines.end() - 1, line);
+    }
+    header = join(unique(headerLines), "\n");
+}
+
+// helper to addHeaderLine
+// Removes repeated strings in place, keeping the first occurrence of each in
+// its original position order. Returns the vector that was passed in.
+vector<string>& unique(vector<string>& strings) {
+    set<string> seen;
+    vector<string> firsts;
+    for (size_t n = 0; n < strings.size(); ++n) {
+        // insert() reports whether the string was new
+        if (seen.insert(strings[n]).second) {
+            firsts.push_back(strings[n]);
+        }
+    }
+    strings.swap(firsts);
+    return strings;
+}
+
+// Collect the ID of every header line that starts with "##INFO", in header
+// order. A line contributes only if it contains "ID=" with a ',' after it; the
+// ID is the text between the two.
+vector<string> VariantCallFile::infoIds(void) {
+    vector<string> ids;
+    const vector<string> lines = split(header, '\n');
+    for (size_t n = 0; n < lines.size(); ++n) {
+        const string& entry = lines[n];
+        if (entry.find("##INFO") != 0) continue;
+        const size_t idAt = entry.find("ID=");
+        if (idAt == string::npos) continue;
+        const size_t from = idAt + 3;   // skip past "ID="
+        const size_t comma = entry.find(",", from);
+        if (comma == string::npos) continue;
+        ids.push_back(entry.substr(from, comma - from));
+    }
+    return ids;
+}
+
+// Collect the ID of every header line that starts with "##FORMAT", in header
+// order. A line contributes only if it contains "ID=" with a ',' after it; the
+// ID is the text between the two.
+vector<string> VariantCallFile::formatIds(void) {
+    vector<string> ids;
+    const vector<string> lines = split(header, '\n');
+    for (size_t n = 0; n < lines.size(); ++n) {
+        const string& entry = lines[n];
+        if (entry.find("##FORMAT") != 0) continue;
+        const size_t idAt = entry.find("ID=");
+        if (idAt == string::npos) continue;
+        const size_t from = idAt + 3;   // skip past "ID="
+        const size_t comma = entry.find(",", from);
+        if (comma == string::npos) continue;
+        ids.push_back(entry.substr(from, comma - from));
+    }
+    return ids;
+}
+
+// Rebuild the header without any "##INFO" line that contains "ID=<tag>,".
+// Every other line is kept, in order.
+void VariantCallFile::removeInfoHeaderLine(string tag) {
+    const string needle = "ID=" + tag + ",";
+    const vector<string> lines = split(header, '\n');
+    vector<string> kept;
+    for (size_t n = 0; n < lines.size(); ++n) {
+        const bool isInfoLine = (lines[n].find("##INFO") == 0);
+        if (isInfoLine && lines[n].find(needle) != string::npos) {
+            continue;   // this is the entry being removed
+        }
+        kept.push_back(lines[n]);
+    }
+    header = join(kept, "\n");
+}
+
+// Rebuild the header without any "##FORMAT" line that contains "ID=<tag>,".
+// Every other line is kept, in order.
+void VariantCallFile::removeGenoHeaderLine(string tag) {
+    const string needle = "ID=" + tag + ",";
+    const vector<string> lines = split(header, '\n');
+    vector<string> kept;
+    for (size_t n = 0; n < lines.size(); ++n) {
+        const bool isFormatLine = (lines[n].find("##FORMAT") == 0);
+        if (isFormatLine && lines[n].find(needle) != string::npos) {
+            continue;   // this is the entry being removed
+        }
+        kept.push_back(lines[n]);
+    }
+    header = join(kept, "\n");
+}
+
+// Read the header from the underlying source and return it split into lines.
+// Tabix: the header comes from tabixFile->getHeader(); an empty header prints
+// "error: no VCF header" and exits with 1. The next line is then read into
+// `line` and firstRecord is set.
+// Plain stream: lines starting with '#' are accumulated until the first other
+// line, which stays buffered in `line` with firstRecord set. If that line
+// arrives before any header line, the error is printed and an empty vector
+// is returned.
+vector<string> VariantCallFile::getHeaderLinesFromFile()
+{
+    string headerText;
+
+    if (!usingTabix) {
+        while (std::getline(*file, line)) {
+            if (!line.empty() && line[0] == '#') {
+                headerText += line + '\n';
+                continue;
+            }
+            // done with header
+            if (headerText.empty()) {
+                cerr << "error: no VCF header" << endl;
+                return vector<string>();
+            }
+            firstRecord = true;
+            break;
+        }
+    } else {
+        tabixFile->getHeader(headerText);
+        if (headerText.empty()) {
+            cerr << "error: no VCF header" << endl;
+            exit(1);
+        }
+        tabixFile->getNextLine(line);
+        firstRecord = true;
+    }
+    return split(headerText, "\n");
+}
+
+// Read the header text from the underlying source, keep the raw text in
+// vcf_header, and hand it to parseHeader(string&).
+// Tabix: an empty header prints "error: no VCF header" and exits with 1; the
+// next line is then read into `line` and firstRecord is set.
+// Plain stream: lines starting with '#' are accumulated until the first other
+// line, which stays buffered in `line` with firstRecord set. If that line
+// arrives before any header line, the error is printed and false is returned.
+bool VariantCallFile::parseHeader(void) {
+
+    string headerText;
+
+    if (!usingTabix) {
+        while (std::getline(*file, line)) {
+            if (!line.empty() && line[0] == '#') {
+                headerText += line + '\n';
+                continue;
+            }
+            // done with header
+            if (headerText.empty()) {
+                cerr << "error: no VCF header" << endl;
+                return false;
+            }
+            firstRecord = true;
+            break;
+        }
+    } else {
+        tabixFile->getHeader(headerText);
+        if (headerText.empty()) {
+            cerr << "error: no VCF header" << endl;
+            exit(1);
+        }
+        tabixFile->getNextLine(line);
+        firstRecord = true;
+    }
+
+    // stored before parseHeader(string&) edits its argument
+    this->vcf_header = headerText;
+    return parseHeader(headerText);
+
+}
+
+// Parse VCF header text.
+//   hs  the header; one trailing newline, if present, is removed in place
+// The text is stored in `header`. For each "##INFO=<...>" and "##FORMAT=<...>"
+// line the ID, Number and Type are recorded in infoCounts/infoTypes or
+// formatCounts/formatTypes. For a '#' column line with more than eight fields,
+// the fields after the ninth become sampleNames.
+// Always returns true; a malformed INFO/FORMAT entry prints a diagnostic and
+// exits with status 1.
+bool VariantCallFile::parseHeader(string& hs) {
+
+    // remove trailing newline; the empty() test matters because size() - 1
+    // wraps around for an empty string
+    if (!hs.empty() && hs[hs.size() - 1] == '\n') {
+        hs.erase(hs.size() - 1, 1);
+    }
+    header = hs; // stores the header in the object instance
+
+    vector<string> headerLines = split(header, "\n");
+    for (vector<string>::iterator h = headerLines.begin(); h != headerLines.end(); ++h) {
+        string headerLine = *h;
+        if (headerLine.substr(0,2) == "##") {
+            // meta-information headerLines
+            // ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+            // ##FORMAT=<ID=CB,Number=1,Type=String,Description="Called by S(Sanger), M(UMich), B(BI)">
+            size_t found = headerLine.find_first_of("=");
+            string entryType = headerLine.substr(2, found - 2);
+            size_t dataStart = headerLine.find_first_of("<");
+            size_t dataEnd = headerLine.find_first_of(">");
+            if (dataStart != string::npos && dataEnd != string::npos) {
+                string entryData = headerLine.substr(dataStart + 1, dataEnd - dataStart - 1);
+                // XXX bad; this will break if anyone ever moves the order
+                // of the fields around to include a "long form" string
+                // including either a = or , in the first or second field
+                if (entryType == "INFO" || entryType == "FORMAT") {
+                    vector<string> fields = split(entryData, "=,");
+                    // ID=<id>,Number=<n>,Type=<t> needs six pieces; indexing
+                    // past the end of a shorter entry would be undefined
+                    if (fields.size() < 6) {
+                        cerr << "header parse error at:" << endl
+                             << "expected ID, Number and Type fields" << endl
+                             << headerLine << endl;
+                        exit(1);
+                    }
+                    if (fields[0] != "ID") {
+                        cerr << "header parse error at:" << endl
+                             << "fields[0] != \"ID\"" << endl
+                             << headerLine << endl;
+                        exit(1);
+                    }
+                    string id = fields[1];
+                    if (fields[2] != "Number") {
+                        cerr << "header parse error at:" << endl
+                             << "fields[2] != \"Number\"" << endl
+                             << headerLine << endl;
+                        exit(1);
+                    }
+                    // starts at 0 so a Number that is not an integer
+                    // yields a defined count instead of an indeterminate one
+                    int number = 0;
+                    string numberstr = fields[3];
+                    // XXX TODO VCF has variable numbers of fields...
+                    if (numberstr == "A") {
+                        number = ALLELE_NUMBER;
+                    } else if (numberstr == "G") {
+                        number = GENOTYPE_NUMBER;
+                    } else if (numberstr == ".") {
+                        number = 1;
+                    } else {
+                        convert(numberstr, number);
+                    }
+                    if (fields[4] != "Type") {
+                        cerr << "header parse error at:" << endl
+                             << "fields[4] != \"Type\"" << endl
+                             << headerLine << endl;
+                        exit(1);
+                    }
+                    VariantFieldType type = typeStrToVariantFieldType(fields[5]);
+                    if (entryType == "INFO") {
+                        infoCounts[id] = number;
+                        infoTypes[id] = type;
+                    } else if (entryType == "FORMAT") {
+                        formatCounts[id] = number;
+                        formatTypes[id] = type;
+                    }
+                }
+            }
+        } else if (headerLine.substr(0,1) == "#") {
+            // field name headerLine: everything after the ninth column is a sample name
+            vector<string> fields = split(headerLine, '\t');
+            if (fields.size() > 8) {
+                sampleNames.assign(fields.begin() + 9, fields.end());
+            }
+        }
+    }
+
+    return true;
+}
+
+// Parse the next record into `var`.
+// Returns true when a record was parsed. Returns false when no record is
+// available; _done is set when the underlying source has run out of lines.
+bool VariantCallFile::getNextVariant(Variant& var) {
+    // Header parsing leaves the first non-header line buffered in `line`.
+    if (firstRecord && !justSetRegion) {
+        if (line.empty() || line[0] == '#') {
+            return false;
+        }
+        var.parse(line, parseSamples);
+        firstRecord = false;
+        _done = false;
+        return true;
+    }
+
+    if (usingTabix) {
+        // setRegion() leaves the first line of the region buffered in `line`.
+        const bool bufferedRegionLine = justSetRegion && !line.empty() && line[0] != '#';
+        if (bufferedRegionLine) {
+            firstRecord = false;
+            var.parse(line, parseSamples);
+            line.clear();
+            justSetRegion = false;
+            _done = false;
+            return true;
+        }
+        if (!tabixFile->getNextLine(line)) {
+            _done = true;
+            return false;
+        }
+    } else if (!std::getline(*file, line)) {
+        _done = true;
+        return false;
+    }
+
+    var.parse(line, parseSamples);
+    _done = false;
+    return true;
+}
+
+// Build the region string "seq:start-end", or "seq:start" when `end` is 0,
+// and forward it to setRegion(string).
+bool VariantCallFile::setRegion(string seq, long int start, long int end) {
+    stringstream spec;
+    spec << seq << ":" << start;
+    if (end) {
+        spec << "-" << end;
+    }
+    return setRegion(spec.str());
+}
+
+// Restrict reading to `region` on a tabix-indexed file.
+// The first ".." in the region is rewritten to "-" (bamtools/freebayes style
+// to tabix/samtools style). On success the first line of the region is
+// buffered in `line` and justSetRegion is set. Returns false if the region
+// cannot be set or yields no line. Exits with 1 on a file that is not tabix
+// indexed.
+bool VariantCallFile::setRegion(string region) {
+    if (!usingTabix) {
+        cerr << "cannot setRegion on a non-tabix indexed file" << endl;
+        exit(1);
+    }
+    const size_t dots = region.find("..");
+    if (dots != string::npos) {
+        region.replace(dots, 2, "-");
+    }
+    if (!tabixFile->setRegion(region)) {
+        return false;
+    }
+    if (!tabixFile->getNextLine(line)) {
+        return false;
+    }
+    justSetRegion = true;
+    return true;
+}
+
+
+// genotype manipulation
+/*
+map<string, int> decomposeGenotype(string& genotype) {
+ string splitter = "/";
+ if (genotype.find("|") != string::npos) {
+ splitter = "|";
+ }
+ vector<string> haps = split(genotype, splitter);
+ map<string, int> decomposed;
+ for (vector<string>::iterator h = haps.begin(); h != haps.end(); ++h) {
+ ++decomposed[*h];
+ }
+ return decomposed;
+}
+*/
+
+// Count the alleles named in a genotype string such as "0/1" or "1|1".
+// The string is split on '|' if it contains one, otherwise on '/'.
+// Returns allele index -> number of copies. "." is counted under NULL_ALLELE,
+// and so is any piece that does not parse as an integer (previously such a
+// piece was counted under an uninitialized index).
+map<int, int> decomposeGenotype(const string& genotype) {
+    const string splitter = (genotype.find("|") != string::npos) ? "|" : "/";
+    vector<string> haps = split(genotype, splitter);
+    map<int, int> decomposed;
+    for (vector<string>::iterator h = haps.begin(); h != haps.end(); ++h) {
+        int alt = NULL_ALLELE;
+        if (*h != "." && !convert(*h, alt)) {
+            alt = NULL_ALLELE;   // convert() may have written to alt before failing
+        }
+        ++decomposed[alt];
+    }
+    return decomposed;
+}
+
+// Split a phased genotype such as "0|1" into its allele indexes, in order.
+// "." becomes NULL_ALLELE, and so does any piece that does not parse as an
+// integer (previously such a piece produced an uninitialized index).
+// A genotype with more than one allele and no '|' is unphased: an error is
+// printed and the program exits with status 1.
+vector<int> decomposePhasedGenotype(const string& genotype) {
+    const bool phased = (genotype.find("|") != string::npos);
+    const string splitter = phased ? "|" : "/";
+    vector<string> haps = split(genotype, splitter);
+    if (haps.size() > 1 && !phased) {
+        cerr << "could not find '|' in genotype, cannot decomposePhasedGenotype on unphased genotypes" << endl;
+        exit(1);
+    }
+    vector<int> decomposed;
+    for (vector<string>::iterator h = haps.begin(); h != haps.end(); ++h) {
+        int alt = NULL_ALLELE;
+        if (*h != "." && !convert(*h, alt)) {
+            alt = NULL_ALLELE;   // convert() may have written to alt before failing
+        }
+        decomposed.push_back(alt);
+    }
+    return decomposed;
+}
+
+// Render an allele-count map as an unphased genotype string: each allele
+// index is repeated once per copy, the indexes are sorted ascending and joined
+// with "/". NULL_ALLELE is written as ".".
+string genotypeToString(const map<int, int>& genotype) {
+    vector<int> expanded;
+    for (map<int, int>::const_iterator g = genotype.begin(); g != genotype.end(); ++g) {
+        for (int copies = g->second; copies > 0; --copies) {
+            expanded.push_back(g->first);
+        }
+    }
+    sort(expanded.begin(), expanded.end());
+
+    vector<string> pieces;
+    for (size_t n = 0; n < expanded.size(); ++n) {
+        if (expanded[n] == NULL_ALLELE) {
+            pieces.push_back(".");
+        } else {
+            pieces.push_back(convert(expanded[n]));
+        }
+    }
+    return join(pieces, "/"); // TODO adjust for phased/unphased
+}
+
+// Render allele indexes, in the given order, as a phased genotype string
+// joined with "|". NULL_ALLELE is written as ".".
+string phasedGenotypeToString(const vector<int>& genotype) {
+    vector<string> pieces;
+    for (size_t n = 0; n < genotype.size(); ++n) {
+        if (genotype[n] == NULL_ALLELE) {
+            pieces.push_back(".");
+        } else {
+            pieces.push_back(convert(genotype[n]));
+        }
+    }
+    return join(pieces, "|");
+}
+
+// True when the genotype map holds more than one distinct allele index.
+bool isHet(const map<int, int>& genotype) {
+    const map<int, int>::size_type distinctAlleles = genotype.size();
+    return distinctAlleles >= 2;
+}
+
+// True when the genotype map holds exactly one distinct allele index.
+bool isHom(const map<int, int>& genotype) {
+    const map<int, int>::size_type distinctAlleles = genotype.size();
+    return distinctAlleles == 1;
+}
+
+// True when any allele index in the genotype map is different from 0.
+bool hasNonRef(const map<int, int>& genotype) {
+    map<int, int>::const_iterator g = genotype.begin();
+    while (g != genotype.end() && g->first == 0) {
+        ++g;
+    }
+    return g != genotype.end();
+}
+
+// True when the genotype has a single distinct allele and that allele is 0.
+bool isHomRef(const map<int, int>& genotype) {
+    if (!isHom(genotype)) {
+        return false;
+    }
+    return !hasNonRef(genotype);
+}
+
+// True when the genotype has a single distinct allele and that allele is not 0.
+bool isHomNonRef(const map<int, int>& genotype) {
+    if (!isHom(genotype)) {
+        return false;
+    }
+    return hasNonRef(genotype);
+}
+
+// True when the genotype map has an entry for NULL_ALLELE.
+bool isNull(const map<int, int>& genotype) {
+    return genotype.count(NULL_ALLELE) != 0;
+}
+
+// Sum of the copy counts over all alleles in the genotype map.
+int ploidy(const map<int, int>& genotype) {
+    int copies = 0;
+    map<int, int>::const_iterator g = genotype.begin();
+    while (g != genotype.end()) {
+        copies += g->second;
+        ++g;
+    }
+    return copies;
+}
+
+// generates cigar from allele parsed by parsedAlternates
+// Consecutive unchanged alleles (ref == alt) are merged into one "<n>M" run.
+// A changed allele ends the current run and adds its own element: same-length
+// changes give "<len>X" when xForMismatch is set and "<len>M" otherwise, a
+// longer ref gives "<diff>D", and a longer alt gives "<diff>I".
+string varCigar(vector<VariantAllele>& vav, bool xForMismatch) {
+    string cigar;
+    int matchRun = 0;       // length of the open run of unchanged bases
+    bool runOpen = false;   // whether such a run is being accumulated
+    for (size_t n = 0; n < vav.size(); ++n) {
+        VariantAllele& va = vav[n];
+        if (va.ref == va.alt) {
+            if (runOpen) {
+                matchRun += va.ref.size();
+            } else {
+                matchRun = va.ref.size();
+                runOpen = true;
+            }
+            continue;
+        }
+        if (runOpen) {
+            cigar += convert(matchRun) + "M";
+            runOpen = false;
+            matchRun = 0;
+        }
+        const size_t refLen = va.ref.size();
+        const size_t altLen = va.alt.size();
+        if (refLen == altLen) {
+            cigar += convert(refLen) + (xForMismatch ? "X" : "M");
+        } else if (refLen > altLen) {
+            cigar += convert(refLen - altLen) + "D";
+        } else {
+            cigar += convert(altLen - refLen) + "I";
+        }
+    }
+    if (runOpen) {
+        cigar += convert(matchRun) + "M";
+    }
+    return cigar;
+}
+
+// Decompose every alternate allele into primitive VariantAllele pieces by
+// aligning it to the reference allele.
+//   includePreviousBaseForIndels  give an indel that does not follow another
+//                                 change the base before it
+//   useMNPs                       merge adjacent single-base mismatches
+//   useEntropy, matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty,
+//   repeatGapExtendPenalty        passed on to the Smith-Waterman aligner
+//   flankingRefLeft/Right         real flanking sequence; when both are empty
+//                                 synthetic padding with an anchor character
+//                                 is used instead
+// Returns a map from allele string to its pieces; the entry for `ref` holds a
+// single ref -> ref allele. A record with one single-base alt and a
+// single-base ref skips the alignment. An alignment that does not begin and
+// end with a match covering the padding prints the alignment and exits with
+// status 1.
+map<string, vector<VariantAllele> > Variant::parsedAlternates(bool includePreviousBaseForIndels,
+                                                             bool useMNPs,
+                                                             bool useEntropy,
+                                                             float matchScore,
+                                                             float mismatchScore,
+                                                             float gapOpenPenalty,
+                                                             float gapExtendPenalty,
+                                                             float repeatGapExtendPenalty,
+                                                             string flankingRefLeft,
+                                                             string flankingRefRight) {
+
+    map<string, vector<VariantAllele> > variantAlleles;
+
+    // add the reference allele
+    variantAlleles[ref].push_back(VariantAllele(ref, ref, position));
+
+    // single SNP case, no ambiguity possible, no need to spend a lot of
+    // compute aligning ref and alt fields
+    if (alt.size() == 1 && ref.size() == 1 && alt.front().size() == 1) {
+        variantAlleles[alt.front()].push_back(VariantAllele(ref, alt.front(), position));
+        return variantAlleles;
+    }
+
+    // padding is used to ensure a stable alignment of the alternates to the reference
+    // without having to go back and look at the full reference sequence
+    int paddingLen = max(10, (int) (ref.size())); // at least as long as the longest allele
+    for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
+        paddingLen = max(paddingLen, (int) (a->size()));
+    }
+    const char padChar = 'Z';
+    const char anchorChar = 'Q';
+    const string padding(paddingLen, padChar);
+    const bool synthetic = flankingRefLeft.empty() && flankingRefRight.empty();
+
+    // this 'anchored' string is done for stability
+    // the assumption is that there should be a positional match in the first base
+    // this is true for VCF 4.1, and standard best practices
+    // using the anchor char ensures this without other kinds of realignment
+    string reference_M;
+    int rightPaddingLen = paddingLen;
+    if (synthetic) {
+        reference_M = padding + ref + padding;
+        reference_M[paddingLen] = anchorChar;
+    } else {
+        reference_M = flankingRefLeft + ref + flankingRefRight;
+        paddingLen = flankingRefLeft.size();
+        // the right flank has its own length; trimming it by the left flank's
+        // length leaves flank bases in the cigar when the two differ
+        rightPaddingLen = flankingRefRight.size();
+    }
+
+    // passed to sw.Align
+    unsigned int referencePos;
+
+    string cigar;
+
+    for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a) {
+
+        string& alternate = *a;
+        vector<VariantAllele>& variants = variantAlleles[alternate];
+        string alternateQuery_M;
+        if (synthetic) {
+            alternateQuery_M = padding + alternate + padding;
+            alternateQuery_M[paddingLen] = anchorChar;
+        } else {
+            alternateQuery_M = flankingRefLeft + alternate + flankingRefRight;
+        }
+
+        if (true) {
+            CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
+            if (useEntropy) sw.EnableEntropyGapPenalty(1);
+            if (repeatGapExtendPenalty != 0) sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty);
+            sw.Align(referencePos, cigar, reference_M, alternateQuery_M);
+        } else {  // disabled for now
+            StripedSmithWaterman::Aligner aligner;
+            StripedSmithWaterman::Filter sswFilter;
+            StripedSmithWaterman::Alignment alignment;
+            aligner.Align(alternateQuery_M.c_str(), reference_M.c_str(), reference_M.size(), sswFilter, &alignment);
+            cigar = alignment.cigar_string;
+        }
+
+        // strip the padding from both ends of the alignment
+        vector<pair<int, string> > cigarData = splitCigar(cigar);
+
+        // an empty cigar has no front()/back() to inspect
+        if (cigarData.empty()
+            || cigarData.front().second != "M" || cigarData.back().second != "M"
+            || cigarData.front().first < paddingLen || cigarData.back().first < rightPaddingLen) {
+            cerr << "parsedAlternates: alignment does not start with match over padded sequence" << endl;
+            cerr << cigar << endl;
+            cerr << reference_M << endl;
+            cerr << alternateQuery_M << endl;
+            exit(1);
+        } else {
+            cigarData.front().first -= paddingLen;
+            cigarData.back().first -= rightPaddingLen;
+        }
+        cigar = joinCigar(cigarData);
+
+        int altpos = 0;
+        int refpos = 0;
+
+        for (vector<pair<int, string> >::iterator e = cigarData.begin(); e != cigarData.end(); ++e) {
+
+            int len = e->first;
+            string type = e->second;
+
+            switch (type.at(0)) {
+            case 'I':
+                if (includePreviousBaseForIndels) {
+                    if (!variants.empty() &&
+                        variants.back().ref != variants.back().alt) {
+                        // extend the change that precedes this insertion
+                        VariantAllele piece = VariantAllele("", alternate.substr(altpos, len), refpos + position);
+                        variants.back() = variants.back() + piece;
+                    } else {
+                        VariantAllele piece = VariantAllele(ref.substr(refpos - 1, 1),
+                                                            alternate.substr(altpos - 1, len + 1),
+                                                            refpos + position - 1);
+                        variants.push_back(piece);
+                    }
+                } else {
+                    variants.push_back(VariantAllele("", alternate.substr(altpos, len), refpos + position));
+                }
+                altpos += len;
+                break;
+            case 'D':
+                if (includePreviousBaseForIndels) {
+                    if (!variants.empty() &&
+                        variants.back().ref != variants.back().alt) {
+                        // extend the change that precedes this deletion
+                        VariantAllele piece = VariantAllele(ref.substr(refpos, len), "", refpos + position);
+                        variants.back() = variants.back() + piece;
+                    } else {
+                        VariantAllele piece = VariantAllele(ref.substr(refpos - 1, len + 1),
+                                                            alternate.substr(altpos - 1, 1),
+                                                            refpos + position - 1);
+                        variants.push_back(piece);
+                    }
+                } else {
+                    variants.push_back(VariantAllele(ref.substr(refpos, len), "", refpos + position));
+                }
+                refpos += len;
+                break;
+            case 'M':
+                {
+                    for (int i = 0; i < len; ++i) {
+                        VariantAllele piece = VariantAllele(ref.substr(refpos + i, 1),
+                                                            alternate.substr(altpos + i, 1),
+                                                            refpos + i + position);
+                        // the list starts out empty for each alternate, so
+                        // back() must not be touched before a first push
+                        if (useMNPs && !variants.empty()
+                            && variants.back().ref.size() == variants.back().alt.size()
+                            && variants.back().ref != variants.back().alt) {
+                            variants.back() = variants.back() + piece;
+                        } else {
+                            variants.push_back(piece);
+                        }
+                    }
+                }
+                refpos += len;
+                altpos += len;
+                break;
+            case 'S':
+                refpos += len;
+                altpos += len;
+                break;
+            default:
+                break;
+            }
+
+        }
+    }
+
+    return variantAlleles;
+}
+
+// Map each alternate allele string to one VariantAllele(ref, alternate,
+// position), with no alignment or decomposition. An alternate listed more than
+// once gets one entry per listing.
+map<string, vector<VariantAllele> > Variant::flatAlternates(void) {
+    map<string, vector<VariantAllele> > byAlternate;
+    for (size_t n = 0; n < alt.size(); ++n) {
+        byAlternate[alt[n]].push_back(VariantAllele(ref, alt[n], position));
+    }
+    return byAlternate;
+}
+
+// The alternate alleles of this record as a set (duplicates collapse).
+set<string> Variant::altSet(void) {
+    set<string> distinct;
+    distinct.insert(alt.begin(), alt.end());
+    return distinct;
+}
+
+// Write an allele as "<position> <ref> -> <alt>".
+ostream& operator<<(ostream& out, VariantAllele& var) {
+    out << var.position;
+    out << " " << var.ref;
+    out << " -> " << var.alt;
+    return out;
+}
+
+ // Concatenates two alleles: refs and alts are appended in (a, b) order and the
+ // result keeps a's position.
+ VariantAllele operator+(const VariantAllele& a, const VariantAllele& b) {
+     string joinedRef = a.ref;
+     joinedRef += b.ref;
+     string joinedAlt = a.alt;
+     joinedAlt += b.alt;
+     return VariantAllele(joinedRef, joinedAlt, a.position);
+ }
+
+ // Orders alleles by their textual repr ("position:ref/alt"), i.e. lexicographically.
+ bool operator<(const VariantAllele& a, const VariantAllele& b) {
+     return a.repr.compare(b.repr) < 0;
+ }
+
+ // Maps every diploid genotype (j, k) with j <= k over this record's alleles to
+ // the index (k * (k + 1) / 2) + j.  Only ploidy 2 is handled.
+ map<pair<int, int>, int> Variant::getGenotypeIndexesDiploid(void) {
+
+     int ploidy = 2; // ONLY diploid
+
+     // allele indexes 0 .. alleles.size() - 1
+     vector<int> alleleIdx;
+     for (int n = 0; n < alleles.size(); ++n) {
+         alleleIdx.push_back(n);
+     }
+
+     vector<vector<int> > combos = multichoose(ploidy, alleleIdx);
+
+     map<pair<int, int>, int> indexOfGenotype;
+     for (size_t c = 0; c < combos.size(); ++c) {
+         vector<int>& gt = combos[c];
+         // sort so that e.g. 0/1, 0/2, 1/2 are used rather than the reversed pairs
+         sort(gt.begin(), gt.end());
+         // XXX front/back only describe the genotype when it has exactly two alleles
+         const int lo = gt.front();
+         const int hi = gt.back();
+         indexOfGenotype[make_pair(lo, hi)] = (hi * (hi + 1) / 2) + lo;
+     }
+     return indexOfGenotype;
+
+ }
+
+ // Rebuilds altAlleleIndexes so that each alternate string maps to its 0-based
+ // position in alt.  If an alternate is repeated, the last position wins.
+ void Variant::updateAlleleIndexes(void) {
+     altAlleleIndexes.clear();
+     for (int idx = 0; idx < (int) alt.size(); ++idx) {
+         altAlleleIndexes[alt[idx]] = idx;
+     }
+ }
+
+ /* TODO only works on "A"llele variant fields
+  *
+  * Removes the alternate allele altAllele from this record and rewrites the
+  * per-allele data that refers to it:
+  *   - INFO fields, and FORMAT fields of every sample, whose declared count is
+  *     ALLELE_NUMBER lose the value at the removed alt's position;
+  *   - GT values are renumbered through an old -> new allele index map in which
+  *     the removed allele becomes NULL_ALLELE;
+  *   - GL vectors keep only the entries that glReorder() does not map to -1;
+  *   - alt, alleles and altAlleleIndexes are rebuilt at the end.
+  * Field counts are read through the vcf pointer, so it must point at a
+  * VariantCallFile when this is called.
+  */
+ void Variant::removeAlt(string& altAllele) {
+
+ int altIndex = getAltAlleleIndex(altAllele); // this is the alt-relative index, 0-based
+
+ for (map<string, int>::iterator c = vcf->infoCounts.begin(); c != vcf->infoCounts.end(); ++c) { // pass 1: INFO fields
+ int count = c->second;
+ if (count == ALLELE_NUMBER) { // only fields declared with the ALLELE_NUMBER count are touched
+ string key = c->first;
+ map<string, vector<string> >::iterator v = info.find(key);
+ if (v != info.end()) {
+ vector<string>& vals = v->second;
+ vector<string> tokeep;
+ int i = 0;
+ for (vector<string>::iterator a = vals.begin(); a != vals.end(); ++a, ++i) {
+ if (i != altIndex) { // copy everything except the removed alt's value
+ tokeep.push_back(*a);
+ }
+ }
+ vals = tokeep;
+ }
+ }
+ }
+
+ for (map<string, int>::iterator c = vcf->formatCounts.begin(); c != vcf->formatCounts.end(); ++c) { // pass 2: the same filter for FORMAT fields, per sample
+ int count = c->second;
+ if (count == ALLELE_NUMBER) {
+ string key = c->first;
+ for (map<string, map<string, vector<string> > >::iterator s = samples.begin();
+ s != samples.end(); ++s) {
+ map<string, vector<string> >& sample = s->second;
+ map<string, vector<string> >::iterator v = sample.find(key);
+ if (v != sample.end()) {
+ vector<string>& vals = v->second;
+ vector<string> tokeep;
+ int i = 0;
+ for (vector<string>::iterator a = vals.begin(); a != vals.end(); ++a, ++i) {
+ if (i != altIndex) {
+ tokeep.push_back(*a);
+ }
+ }
+ vals = tokeep;
+ }
+ }
+ }
+ }
+
+ int altSpecIndex = altIndex + 1; // this is the genotype-spec index, ref=0, 1-based for alts
+
+ vector<string> newalt;
+ map<int, int> alleleIndexMapping; // old genotype index -> new genotype index
+ // setup the new alt string
+ alleleIndexMapping[0] = 0; // reference allele remains the same
+ alleleIndexMapping[NULL_ALLELE] = NULL_ALLELE; // null allele remains the same
+ int i = 1; // current index
+ int j = 1; // new index
+ for (vector<string>::iterator a = alt.begin(); a != alt.end(); ++a, ++i) {
+ if (i != altSpecIndex) {
+ newalt.push_back(*a);
+ // get the mapping between new and old allele indexes
+ alleleIndexMapping[i] = j;
+ ++j;
+ } else {
+ alleleIndexMapping[i] = NULL_ALLELE; // the removed allele becomes the null allele
+ }
+ }
+
+ // fix the sample genotypes, removing reference to the old allele
+ map<string, int> samplePloidy; // sample name -> number of alleles in its GT string
+ for (map<string, map<string, vector<string> > >::iterator s = samples.begin(); s != samples.end(); ++s) {
+ map<string, vector<string> >& sample = s->second;
+ if (sample.find("GT") != sample.end()) {
+ string& gt = sample["GT"].front();
+ string splitter = "/";
+ if (gt.find("|") != string::npos) { // a "|" anywhere selects the phased code path
+ splitter = "|";
+ }
+
+ if (splitter == "/") {
+ samplePloidy[s->first] = split(gt, splitter).size();
+ map<int, int> genotype = decomposeGenotype(sample["GT"].front());
+ map<int, int> newGenotype;
+ for (map<int, int>::iterator g = genotype.begin(); g != genotype.end(); ++g) {
+ newGenotype[alleleIndexMapping[g->first]] += g->second; // values of entries that map to the same new index are summed
+ }
+ sample["GT"].clear();
+ sample["GT"].push_back(genotypeToString(newGenotype));
+ } else {
+ samplePloidy[s->first] = split(gt, splitter).size();
+ vector<int> genotype = decomposePhasedGenotype(sample["GT"].front());
+ vector<int> newGenotype;
+ for (vector<int>::iterator g = genotype.begin(); g != genotype.end(); ++g) {
+ newGenotype.push_back(alleleIndexMapping[*g]); // element order is kept for phased genotypes
+ }
+ sample["GT"].clear();
+ sample["GT"].push_back(phasedGenotypeToString(newGenotype));
+ }
+ }
+ }
+
+ set<int> ploidies; // distinct ploidies seen, so glReorder runs once per ploidy
+ for (map<string, int>::iterator p = samplePloidy.begin(); p != samplePloidy.end(); ++p) {
+ ploidies.insert(p->second);
+ }
+
+ // fix the sample genotype likelihoods, removing reference to the old allele
+ // which GL fields should we remove?
+ vector<int> toRemove;
+ toRemove.push_back(altSpecIndex);
+ map<int, map<int, int> > glMappingByPloidy;
+ for (set<int>::iterator p = ploidies.begin(); p != ploidies.end(); ++p) {
+ glMappingByPloidy[*p] = glReorder(*p, alt.size() + 1, alleleIndexMapping, toRemove); // alt.size() + 1 counts the reference allele too
+ }
+
+ for (map<string, map<string, vector<string> > >::iterator s = samples.begin(); s != samples.end(); ++s) {
+ map<string, vector<string> >& sample = s->second;
+ map<string, vector<string> >::iterator glsit = sample.find("GL");
+ if (glsit != sample.end()) {
+ vector<string>& gls = glsit->second; // should be split already
+ map<int, string> newgls;
+ map<int, int>& newOrder = glMappingByPloidy[samplePloidy[s->first]]; // NOTE(review): operator[] inserts empty entries for a sample that had no GT, so newOrder[i] is then 0 and nothing is dropped -- confirm intended
+ int i = 0;
+ for (vector<string>::iterator g = gls.begin(); g != gls.end(); ++g, ++i) {
+ int j = newOrder[i];
+ if (j != -1) { // -1 marks a likelihood to drop
+ newgls[i] = *g; // keyed by the OLD index, so kept entries stay in their original relative order
+ }
+ }
+ // update the gls
+ gls.clear();
+ for (map<int, string>::iterator g = newgls.begin(); g != newgls.end(); ++g) {
+ gls.push_back(g->second);
+ }
+ }
+ }
+
+ // reset the alt
+ alt = newalt;
+
+ // and the alleles
+ alleles.clear();
+ alleles.push_back(ref);
+ alleles.insert(alleles.end(), alt.begin(), alt.end());
+
+ updateAlleleIndexes();
+
+ }
+
+ // Merges two VCF header texts: the result is every line of s1 (in order, with
+ // the "#CHROM" line moved to the end) plus each "##INFO" line of s2 that does
+ // not occur verbatim in s1.  Prints a message and exits with status 1 when s1
+ // has no "#CHROM" line.
+ string unionInfoHeaderLines(string& s1, string& s2) {
+     vector<string> firstLines = split(s1, "\n");
+     vector<string> secondLines = split(s2, "\n");
+
+     // ##INFO lines of the second header that may still need to be added
+     set<string> pendingInfo;
+     for (size_t i = 0; i < secondLines.size(); ++i) {
+         if (secondLines[i].compare(0, 6, "##INFO") == 0) {
+             pendingInfo.insert(secondLines[i]);
+         }
+     }
+
+     vector<string> merged;
+     string columnLine; // the #CHROM line; this one needs to be at the end
+     for (size_t i = 0; i < firstLines.size(); ++i) {
+         const string& current = firstLines[i];
+         pendingInfo.erase(current); // already present in the first header
+         if (current.compare(0, 6, "#CHROM") == 0) {
+             columnLine = current;
+         } else {
+             merged.push_back(current);
+         }
+     }
+
+     // whatever is left only exists in the second header
+     merged.insert(merged.end(), pendingInfo.begin(), pendingInfo.end());
+
+     if (columnLine.empty()) {
+         cerr << "could not find CHROM POS ... header line" << endl;
+         exit(1);
+     }
+     merged.push_back(columnLine);
+     return join(merged, "\n");
+ }
+
+ // Concatenates two CIGAR strings.  When the last element of c1 and the first
+ // element of c2 have the same operation they are fused into one element whose
+ // length is the sum of both.  Either input may be empty (or contain no
+ // parsable element), in which case the other one is returned re-joined.
+ string mergeCigar(const string& c1, const string& c2) {
+     vector<pair<int, string> > cigar1 = splitCigar(c1);
+     vector<pair<int, string> > cigar2 = splitCigar(c2);
+     // back()/front() on an empty vector is undefined behaviour, so the junction
+     // is only inspected when both sides actually have elements
+     if (!cigar1.empty() && !cigar2.empty()
+         && cigar1.back().second == cigar2.front().second) {
+         cigar1.back().first += cigar2.front().first;
+         cigar2.erase(cigar2.begin());
+     }
+     cigar1.insert(cigar1.end(), cigar2.begin(), cigar2.end());
+     return joinCigar(cigar1);
+ }
+
+ // Parses a CIGAR string of the form [Number][Type][Number][Type]... into
+ // (length, operation) pairs, in order.  A trailing number without a type, or
+ // an empty string, yields no element for that part.
+ vector<pair<int, string> > splitCigar(const string& cigarStr) {
+     vector<pair<int, string> > cigar;
+     string number;
+     string type;
+     for (string::const_iterator s = cigarStr.begin(); s != cigarStr.end(); ++s) {
+         char c = *s;
+         // isdigit() is undefined for negative values, which a plain char can
+         // hold for bytes >= 0x80, hence the cast
+         if (isdigit(static_cast<unsigned char>(c))) {
+             if (!type.empty()) {
+                 // a digit after a type starts the next token: flush the finished pair
+                 cigar.push_back(make_pair(atoi(number.c_str()), type));
+                 number.clear();
+                 type.clear();
+             }
+             number += c;
+         } else {
+             type += c;
+         }
+     }
+     // flush the final pair, if it is complete
+     if (!number.empty() && !type.empty()) {
+         cigar.push_back(make_pair(atoi(number.c_str()), type));
+     }
+     return cigar;
+ }
+
+ // Same parsing as for a vector-based CIGAR, but the (length, operation) pairs
+ // are returned in a std::list.  Input has the form [Number][Type]...; a
+ // trailing number without a type, or an empty string, yields no element.
+ list<pair<int, string> > splitCigarList(const string& cigarStr) {
+     list<pair<int, string> > cigar;
+     string number;
+     string type;
+     for (string::const_iterator s = cigarStr.begin(); s != cigarStr.end(); ++s) {
+         char c = *s;
+         // isdigit() is undefined for negative values, which a plain char can
+         // hold for bytes >= 0x80, hence the cast
+         if (isdigit(static_cast<unsigned char>(c))) {
+             if (!type.empty()) {
+                 // a digit after a type starts the next token: flush the finished pair
+                 cigar.push_back(make_pair(atoi(number.c_str()), type));
+                 number.clear();
+                 type.clear();
+             }
+             number += c;
+         } else {
+             type += c;
+         }
+     }
+     // flush the final pair, if it is complete
+     if (!number.empty() && !type.empty()) {
+         cigar.push_back(make_pair(atoi(number.c_str()), type));
+     }
+     return cigar;
+ }
+
+ // Returns a copy of cigar without the elements whose length is zero or negative.
+ vector<pair<int, string> > cleanCigar(const vector<pair<int, string> >& cigar) {
+     vector<pair<int, string> > kept;
+     for (size_t i = 0; i < cigar.size(); ++i) {
+         if (cigar[i].first <= 0) {
+             continue;
+         }
+         kept.push_back(cigar[i]);
+     }
+     return kept;
+ }
+
+ // Serialises (length, operation) pairs back into a CIGAR string.  Elements
+ // whose length is 0 are left out.
+ string joinCigar(const vector<pair<int, string> >& cigar) {
+     string cigarStr;
+     for (size_t i = 0; i < cigar.size(); ++i) {
+         const pair<int, string>& elem = cigar[i];
+         if (elem.first == 0) {
+             continue;
+         }
+         cigarStr += convert(elem.first);
+         cigarStr += elem.second;
+     }
+     return cigarStr;
+ }
+
+ // Serialises (length, operation-character) pairs into a CIGAR string.
+ // Elements whose length is 0 are left out.
+ string joinCigar(const vector<pair<int, char> >& cigar) {
+     string cigarStr;
+     for (size_t i = 0; i < cigar.size(); ++i) {
+         const pair<int, char>& elem = cigar[i];
+         if (elem.first == 0) {
+             continue;
+         }
+         cigarStr += convert(elem.first);
+         cigarStr += elem.second;
+     }
+     return cigarStr;
+ }
+
+ // Serialises a list of (length, operation) pairs into a CIGAR string.
+ // Every element is written, including ones whose length is 0.
+ string joinCigarList(const list<pair<int, string> >& cigar) {
+     string cigarStr;
+     list<pair<int, string> >::const_iterator elem = cigar.begin();
+     while (elem != cigar.end()) {
+         cigarStr += convert(elem->first);
+         cigarStr += elem->second;
+         ++elem;
+     }
+     return cigarStr;
+ }
+
+ // Sums the lengths of the 'M', 'D' and 'X' elements of a CIGAR; all other
+ // operations contribute nothing.
+ int cigarRefLen(const vector<pair<int, char> >& cigar) {
+     int total = 0;
+     for (size_t i = 0; i < cigar.size(); ++i) {
+         switch (cigar[i].second) {
+             case 'M':
+             case 'D':
+             case 'X':
+                 total += cigar[i].first;
+                 break;
+             default:
+                 break;
+         }
+     }
+     return total;
+ }
+
+ // Sums the lengths of the "M", "D" and "X" elements of a CIGAR; all other
+ // operations contribute nothing.
+ int cigarRefLen(const vector<pair<int, string> >& cigar) {
+     int total = 0;
+     for (size_t i = 0; i < cigar.size(); ++i) {
+         const string& op = cigar[i].second;
+         const bool counted = (op == "M") || (op == "D") || (op == "X");
+         if (!counted) {
+             continue;
+         }
+         total += cigar[i].first;
+     }
+     return total;
+ }
+
+ // True when the CIGAR element has length exactly 0 (the operation is ignored).
+ bool isEmptyCigarElement(const pair<int, string>& elem) {
+     const int length = elem.first;
+     return 0 == length;
+ }
+
+ // Recursive worker for glorder().  Returns every genotype of `ploidy` alleles
+ // drawn from the indexes 0 .. alts-1 as a list whose elements are in
+ // non-increasing order (glorder() reverses each one afterwards).
+ // A ploidy below 1 yields an empty result instead of recursing without end.
+ list<list<int> > _glorder(int ploidy, int alts) {
+     list<list<int> > results;
+     if (ploidy < 1) {
+         return results;
+     }
+     if (ploidy == 1) {
+         for (int n = 0; n < alts; ++n) {
+             list<int> single;
+             single.push_back(n);
+             results.push_back(single);
+         }
+         return results;
+     }
+     // the (ploidy - 1) genotypes do not depend on n, so build them once
+     const list<list<int> > shorter = _glorder(ploidy - 1, alts);
+     for (int n = 0; n < alts; ++n) {
+         for (list<list<int> >::const_iterator v = shorter.begin(); v != shorter.end(); ++v) {
+             // only extend genotypes whose leading allele does not exceed n,
+             // which keeps each genotype ordered and generated exactly once
+             if (v->front() <= n) {
+                 results.push_back(*v);
+                 results.back().push_front(n);
+             }
+         }
+     }
+     return results;
+ }
+
+ // Genotype-likelihood ordering of genotypes.  Each genotype is a list of
+ // allele indexes (as written in the GT field); the lists produced by
+ // _glorder() are reversed in place before being returned.
+ list<list<int> > glorder(int ploidy, int alts) {
+     list<list<int> > ordered = _glorder(ploidy, alts);
+     list<list<int> >::iterator genotype = ordered.begin();
+     while (genotype != ordered.end()) {
+         genotype->reverse();
+         ++genotype;
+     }
+     return ordered;
+ }
+
+ // Returns the positions, within the glorder(ploidy, numalts) ordering, of the
+ // genotypes that contain the allele index `alt` at least once.
+ list<int> glsWithAlt(int alt, int ploidy, int numalts) {
+     list<list<int> > orderedGenotypes = glorder(ploidy, numalts);
+     list<int> gls;
+     int glIndex = 0;
+     for (list<list<int> >::iterator gt = orderedGenotypes.begin();
+          gt != orderedGenotypes.end(); ++gt, ++glIndex) {
+         // each matching genotype is reported once, however often it holds alt
+         if (find(gt->begin(), gt->end(), alt) != gt->end()) {
+             gls.push_back(glIndex);
+         }
+     }
+     return gls;
+ }
+
+ // Describes the mapping between the old GL ordering and a new one in which
+ // the GLs that include a removed alt are gone: result[oldIndex] is the index
+ // in the new ordering, or -1 when the old entry has to be removed.
+ // alleleIndexMapping translates old allele indexes to new ones (it is indexed
+ // with operator[], so unknown alleles are inserted and read as 0).
+ // NOTE: the original author marked this mapping as possibly wrong.
+ map<int, int> glReorder(int ploidy, int numalts, map<int, int>& alleleIndexMapping, vector<int>& altsToRemove) {
+     // genotypes in the old GL order, every allele renumbered through the mapping
+     list<list<int> > remapped = glorder(ploidy, numalts);
+     for (list<list<int> >::iterator gt = remapped.begin(); gt != remapped.end(); ++gt) {
+         for (list<int>::iterator allele = gt->begin(); allele != gt->end(); ++allele) {
+             *allele = alleleIndexMapping[*allele];
+         }
+     }
+
+     // position of each genotype in the GL order of the reduced allele set
+     list<list<int> > reduced = glorder(ploidy, numalts - altsToRemove.size());
+     map<list<int>, int> newIndexOf;
+     int pos = 0;
+     for (list<list<int> >::iterator gt = reduced.begin(); gt != reduced.end(); ++gt, ++pos) {
+         newIndexOf[*gt] = pos;
+     }
+
+     // an old genotype survives only if its renumbered form exists in the new order
+     map<int, int> mapping;
+     pos = 0;
+     for (list<list<int> >::iterator gt = remapped.begin(); gt != remapped.end(); ++gt, ++pos) {
+         map<list<int>, int>::iterator hit = newIndexOf.find(*gt);
+         if (hit == newIndexOf.end()) {
+             mapping[pos] = -1;
+         } else {
+             mapping[pos] = hit->second;
+         }
+     }
+     return mapping;
+ }
+
+ // Returns the first GT value stored for the named sample, or "" when the
+ // sample is unknown or has no GT field.
+ string Variant::getGenotype(string& sample) {
+     map<string, map<string, vector<string> > >::iterator entry = samples.find(sample);
+     if (entry == samples.end()) {
+         return "";
+     }
+     map<string, vector<string> >& fields = entry->second;
+     map<string, vector<string> >::iterator gt = fields.find("GT");
+     if (gt == fields.end()) {
+         return "";
+     }
+     return gt->second.front();
+ }
+
+ // True unless some sample has a GT value longer than one character that
+ // contains no "|".  Samples without a GT field are ignored.
+ bool Variant::isPhased(void) {
+     map<string, map<string, vector<string> > >::iterator s = samples.begin();
+     for (; s != samples.end(); ++s) {
+         map<string, vector<string> >::iterator g = s->second.find("GT");
+         if (g == s->second.end()) {
+             continue;
+         }
+         const string gt = g->second.front();
+         const bool hasPhaseBar = gt.find("|") != string::npos;
+         if (gt.size() > 1 && !hasPhaseBar) {
+             return false;
+         }
+     }
+     return true;
+ }
+
+ // Returns position shifted down by one.
+ long Variant::zeroBasedPosition(void) {
+     const long shifted = position - 1;
+     return shifted;
+ }
+
+ // Builds "<sequenceName>\t<position>\t<allele,allele,...>" from this record.
+ string Variant::vrepr(void) {
+     string text = sequenceName;
+     text += "\t";
+     text += convert(position);
+     text += "\t";
+     text += join(alleles, ",");
+     return text;
+ }
+
+// TODO
+/*
+vector<Variant*> Variant::matchingHaplotypes() {
+
+ int haplotypeStart = var.position;
+ int haplotypeEnd = var.position + var.ref.size();
+
+ for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
+ haplotypeStart = min((*v)->position, (long int) haplotypeStart);
+ haplotypeEnd = max((*v)->position + (*v)->ref.size(), (long unsigned int) haplotypeEnd);
+ }
+
+ // for everything overlapping and the current variant, construct the local haplotype within the bounds
+ // if there is an exact match, the allele in the current VCF does intersect
+
+ string referenceHaplotype = reference.getSubSequence(var.sequenceName, haplotypeStart - 1, haplotypeEnd - haplotypeStart);
+ map<string, vector<pair<Variant*, int> > > haplotypes; // map to variant and alt index
+
+ for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
+ Variant& variant = **v;
+ int altindex = 0;
+ for (vector<string>::iterator a = variant.alt.begin(); a != variant.alt.end(); ++a, ++altindex) {
+ string haplotype = referenceHaplotype;
+ // get the relative start and end coordinates for the variant alternate allele
+ int relativeStart = variant.position - haplotypeStart;
+ haplotype.replace(relativeStart, variant.ref.size(), *a);
+ haplotypes[haplotype].push_back(make_pair(*v, altindex));
+ }
+ }
+
+ Variant originalVar = var;
+
+ // determine the non-intersecting alts
+ vector<string> altsToRemove;
+ vector<int> altIndexesToRemove;
+ for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+ string haplotype = referenceHaplotype;
+ int relativeStart = var.position - haplotypeStart;
+ haplotype.replace(relativeStart, var.ref.size(), *a);
+ map<string, vector<pair<Variant*, int> > >::iterator h = haplotypes.find(haplotype);
+ if ((intersecting && !invert && h == haplotypes.end())
+ || (intersecting && invert && h != haplotypes.end())
+ || (unioning && h != haplotypes.end())) {
+ if (tag.empty() && mergeToTag.empty()) {
+ altsToRemove.push_back(*a);
+ } else {
+ if (!tag.empty()) {
+ var.info[tag].push_back(".");
+ }
+ if (!mergeToTag.empty()) {
+ var.info[mergeToTag].push_back(".");
+ }
+ }
+ } else {
+ if (!tag.empty()) {
+ var.info[tag].push_back(tagValue);
+ }
+ // NB: just take the first value for the mergeFromTag
+ if (!mergeToTag.empty()) {
+ Variant* v = h->second.front().first;
+ int index = h->second.front().second;
+ if (v->info.find(mergeFromTag) != v->info.end()) {
+ // now you have to find the exact allele...
+ string& otherValue = v->info[mergeFromTag].at(index);
+ var.info[mergeToTag].push_back(otherValue);
+ } else if (mergeFromTag == "QUAL") {
+ var.info[mergeToTag].push_back(convert(v->quality));
+ } else {
+ var.info[mergeToTag].push_back(".");
+ }
+ }
+ }
+ }
+
+ // remove the non-overlapping (intersecting) or overlapping (unioning) alts
+ if (intersecting && loci && altsToRemove.size() != var.alt.size()) {
+ // we have a match in loci mode, so we should output the whole loci, not just the matching sequence
+ } else {
+ for (vector<string>::iterator a = altsToRemove.begin(); a != altsToRemove.end(); ++a) {
+ var.removeAlt(*a);
+ }
+ }
+
+ if (unioning) {
+
+ // somehow sort the records and combine them?
+ map<long int, vector<Variant*> > variants;
+ for (vector<Variant*>::iterator o = overlapping.begin(); o != overlapping.end(); ++o) {
+ if ((*o)->position <= var.position && // check ensures proper ordering of variants on output
+ outputVariants.find(*o) == outputVariants.end()) {
+ outputVariants.insert(*o);
+ variants[(*o)->position].push_back(*o);
+ }
+ }
+ // add in the current variant, if it has alts left
+ if (!var.alt.empty()) {
+ vector<Variant*>& vars = variants[var.position];
+ int numalts = 0;
+ for (vector<Variant*>::iterator v = vars.begin(); v != vars.end(); ++v) {
+ numalts += (*v)->alt.size();
+ }
+ if (numalts + var.alt.size() == originalVar.alt.size()) {
+ variants[var.position].clear();
+ variants[var.position].push_back(&originalVar);
+ } else {
+ variants[var.position].push_back(&var);
+ }
+ }
+
+ for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) {
+ for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) {
+ cout << **o << endl;
+ lastOutputPosition = max(lastOutputPosition, (*o)->position);
+ }
+ }
+ } else {
+ // if any alts remain, output the variant record
+ if (!var.alt.empty()) {
+ cout << var << endl;
+ lastOutputPosition = max(lastOutputPosition, var.position);
+ }
+ }
+
+}
+*/
+
+
+ // Sets up an empty header: the eight mandatory column names, the ordered
+ // names of the single-valued meta lines and of the list-valued meta lines,
+ // and an empty entry in header_lines / header_lists for each of those names.
+ VCFHeader::VCFHeader()
+ {
+     // mandatory fields
+     static const char* const mandatoryColumns[] = {
+         "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"
+     };
+     // single-valued lines; this order is the output order of getHeaderString
+     static const char* const lineNames[] = {
+         "##fileFormat", "##fileDate", "##source", "##reference",
+         "##contig", "##phasing", "##assembly"
+     };
+     // list-valued lines; this order is the output order of getHeaderString
+     static const char* const listNames[] = {
+         "##info", "##filter", "##format", "##alt",
+         "##sample", "##pedigree", "##pedigreedb"
+     };
+
+     for (size_t i = 0; i < sizeof(mandatoryColumns) / sizeof(mandatoryColumns[0]); ++i)
+     {
+         this->header_columns.push_back(mandatoryColumns[i]);
+     }
+
+     // each known line starts out as an empty string, which getHeaderString skips
+     for (size_t i = 0; i < sizeof(lineNames) / sizeof(lineNames[0]); ++i)
+     {
+         this->header_line_names_ordered.push_back(lineNames[i]);
+         this->header_lines[lineNames[i]] = "";
+     }
+
+     // each known list starts out empty, which getHeaderString skips as well
+     for (size_t i = 0; i < sizeof(listNames) / sizeof(listNames[0]); ++i)
+     {
+         this->header_list_names_ordered.push_back(listNames[i]);
+         this->header_lists[listNames[i]] = vector<string>();
+     }
+ }
+
+ // Stores one "##key=value" meta line.  The key is everything before the first
+ // "=" (the whole line when there is none).
+ //  - If the key is one of the single-valued header lines, the stored line is replaced.
+ //  - Otherwise, if the key names one of the header lists and no line with the
+ //    same ID is in that list yet, the line is appended to the list.
+ //  - Lines with any other key are ignored.
+ void VCFHeader::addMetaInformationLine(const string& meta_line)
+ {
+     // string::find returns size_t; keeping it in a narrower unsigned type would
+     // truncate string::npos
+     const size_t meta_line_index = meta_line.find("=", 0);
+     const string meta_line_prefix = meta_line.substr(0, meta_line_index);
+
+     if (this->header_lines.find(meta_line_prefix) != header_lines.end())
+     {
+         // a single-valued header line: replace what was there
+         this->header_lines[meta_line_prefix] = meta_line;
+     }
+     else if (header_lists.find(meta_line_prefix) != header_lists.end() &&
+              !metaInfoIdExistsInVector(meta_line, this->header_lists[meta_line_prefix]))
+     {
+         // a list-valued header line whose ID has not been seen yet
+         this->header_lists[meta_line_prefix].push_back(meta_line);
+     }
+ }
+
+ // Renders the header as text; the string is rebuilt on every call.
+ // Output order: non-empty single-valued lines (in header_line_names_ordered
+ // order), then every line of each list (in header_list_names_ordered order),
+ // then the column names joined by tabs and terminated by a newline.
+ string VCFHeader::getHeaderString()
+ {
+     string header_string;
+
+     // single-valued lines; empty ones were never set and are skipped
+     for (size_t i = 0; i < this->header_line_names_ordered.size(); ++i)
+     {
+         const string& name = this->header_line_names_ordered[i];
+         if (this->header_lines[name] != "")
+         {
+             header_string += this->header_lines[name] + "\n";
+         }
+     }
+
+     // list-valued lines
+     for (size_t i = 0; i < this->header_list_names_ordered.size(); ++i)
+     {
+         const vector<string>& entries = this->header_lists[this->header_list_names_ordered[i]];
+         for (size_t k = 0; k < entries.size(); ++k)
+         {
+             header_string += entries[k] + "\n";
+         }
+     }
+
+     // column names: tab between columns, newline after the last one
+     const size_t column_count = this->header_columns.size();
+     for (size_t i = 0; i < column_count; ++i)
+     {
+         header_string += this->header_columns[i];
+         header_string += (i + 1 == column_count) ? "\n" : "\t";
+     }
+     return header_string;
+ }
+
+ // Returns the ID token of a meta line: the text from "ID=" up to (not
+ // including) the next ",", or up to the end of the line when no "," follows.
+ // Returns "" when the line contains no "ID=".
+ static string metaLineIdToken(const string& line)
+ {
+     const size_t id_start = line.find("ID=", 0);
+     const size_t id_end = line.find(",", id_start);
+     return (id_start < id_end) ? line.substr(id_start, id_end - id_start) : string();
+ }
+
+ // Tells whether meta_lines already holds an entry equivalent to meta_line.
+ //  - When meta_line has an ID token, entries are compared by that token,
+ //    ignoring case.
+ //  - When meta_line has no ID token, only an identical line counts.  Comparing
+ //    empty tokens instead would make every ID-less line look like a duplicate
+ //    of any other ID-less line, and all but the first would be dropped.
+ bool VCFHeader::metaInfoIdExistsInVector(const string& meta_line, vector<string>& meta_lines)
+ {
+     const string meta_line_id = metaLineIdToken(meta_line);
+
+     for (vector<string>::const_iterator iter = meta_lines.begin(); iter != meta_lines.end(); ++iter)
+     {
+         if (meta_line_id.empty())
+         {
+             if (*iter == meta_line)
+             {
+                 return true;
+             }
+             continue;
+         }
+         const string iter_meta_line_id = metaLineIdToken(*iter);
+         if (strcasecmp(meta_line_id.c_str(), iter_meta_line_id.c_str()) == 0)
+         {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ // Appends header_column to the column list unless an identical name is
+ // already present.
+ void VCFHeader::addHeaderColumn(const string& header_column)
+ {
+     vector<string>::iterator existing =
+         find(this->header_columns.begin(), this->header_columns.end(), header_column);
+     if (existing != this->header_columns.end())
+     {
+         return; // don't add duplicates
+     }
+     this->header_columns.push_back(header_column);
+ }
+
+} // end namespace vcf
diff --git a/src/Variant.h b/src/Variant.h
new file mode 100644
index 0000000..307ca84
--- /dev/null
+++ b/src/Variant.h
@@ -0,0 +1,586 @@
+#ifndef __VARIANT_H
+#define __VARIANT_H
+
+#include <vector>
+#include <list>
+#include <map>
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <utility>
+#include <stdlib.h>
+#include <assert.h>
+#include <stack>
+#include <queue>
+#include <set>
+#include "split.h"
+#include "join.h"
+#include "tabixpp/tabix.hpp"
+#include "smithwaterman/SmithWatermanGotoh.h"
+#include "smithwaterman/disorder.h"
+#include "ssw_cpp.h"
+#include "convert.h"
+#include "multichoose/multichoose.h"
+extern "C" {
+ #include "filevercmp/filevercmp.h"
+}
+
+using namespace std;
+
+namespace vcf {
+
+class Variant;
+
+ // Value type of an INFO or FORMAT field; numbering starts at 0.
+ enum VariantFieldType {
+     FIELD_FLOAT = 0,
+     FIELD_INTEGER,
+     FIELD_BOOL,
+     FIELD_STRING,
+     FIELD_UNKNOWN
+ };
+
+ // Negative sentinels stored in place of a fixed field count.
+ // ALLELE_NUMBER marks fields carrying one value per alternate allele;
+ // GENOTYPE_NUMBER presumably marks one value per genotype -- confirm at the
+ // place where header counts are parsed.
+ enum VariantFieldNumber {
+     ALLELE_NUMBER = -2,
+     GENOTYPE_NUMBER = -1
+ };
+
+ const int INDEX_NONE = -1; // default for the `index` argument of the Variant get*Value accessors
+ const int NULL_ALLELE = -1; // allele index standing for "no allele"; same value as INDEX_NONE
+
+ VariantFieldType typeStrToFieldType(string& typeStr); // declared here, defined outside this header
+ ostream& operator<<(ostream& out, VariantFieldType type); // declared here, defined outside this header
+
+ typedef map<string, map<string, vector<string> > > Samples; // sample name -> field key -> list of values
+ typedef vector<pair<int, string> > Cigar; // sequence of (length, operation) pairs
+
+ // Reader state for one VCF source: a plain file, a caller-owned stream, or a
+ // tabix-indexed file.  Holds the parsed header text and the INFO/FORMAT type
+ // and count tables.
+ class VariantCallFile {
+
+ public:
+
+     istream* file;      // stream records are read from; not owned when supplied via open(istream&)
+     Tabix* tabixFile;   // allocated by openTabix(), released by the destructor
+
+     bool usingTabix;
+     string vcf_header;
+
+
+     string header;
+     string line; // the current line
+     string fileformat;
+     string fileDate;
+     string source;
+     string reference;
+     string phasing;
+     map<string, VariantFieldType> infoTypes;
+     map<string, int> infoCounts;
+     map<string, VariantFieldType> formatTypes;
+     map<string, int> formatCounts;
+     vector<string> sampleNames;
+     bool parseSamples;
+     bool _done;
+
+     void updateSamples(vector<string>& newSampleNames);
+     string headerWithSampleNames(vector<string>& newSamples); // non-destructive, for output
+     void addHeaderLine(string line);
+     void removeInfoHeaderLine(string line);
+     void removeGenoHeaderLine(string line);
+     vector<string> infoIds(void);
+     vector<string> formatIds(void);
+
+     // Opens filename, choosing the tabix reader when the last "."-separated
+     // part of the name is "gz" or "bgz" and the plain file reader otherwise.
+     bool open(string& filename) {
+         vector<string> filenameParts = split(filename, ".");
+         if (filenameParts.back() == "gz" || filenameParts.back() == "bgz") {
+             return openTabix(filename);
+         } else {
+             return openFile(filename);
+         }
+     }
+
+     // Opens filename through the internal ifstream and parses the header.
+     bool openFile(string& filename) {
+         file = &_file;
+         _file.open(filename.c_str(), ifstream::in);
+         parsedHeader = parseHeader();
+         return parsedHeader;
+     }
+
+     // Opens filename through a newly allocated Tabix reader and parses the header.
+     bool openTabix(string& filename) {
+         usingTabix = true;
+         tabixFile = new Tabix(filename);
+         parsedHeader = parseHeader();
+         return parsedHeader;
+     }
+
+     // Reads from a caller-owned stream and parses the header.
+     bool open(istream& stream) {
+         file = &stream;
+         parsedHeader = parseHeader();
+         return parsedHeader;
+     }
+
+     // Reads from a caller-owned file stream and parses the header.
+     bool open(ifstream& stream) {
+         file = &stream;
+         parsedHeader = parseHeader();
+         return parsedHeader;
+     }
+
+     // Parses headerStr as the header without attaching any input source.
+     bool openForOutput(string& headerStr) {
+         parsedHeader = parseHeader(headerStr);
+         return parsedHeader;
+     }
+
+     // Every scalar member gets a defined starting value, so reading a flag or
+     // pointer before a successful open() is not undefined behaviour.
+     // NOTE(review): firstRecord starts as false on the assumption that the
+     // header parser raises it when it has buffered a first record -- confirm
+     // against parseHeader()/getNextVariant().
+     VariantCallFile(void)
+         : file(NULL)
+         , tabixFile(NULL)
+         , usingTabix(false)
+         , parseSamples(true)
+         , _done(false)
+         , firstRecord(false)
+         , justSetRegion(false)
+         , usingFile(false)
+         , parsedHeader(false)
+     { }
+
+     ~VariantCallFile(void) {
+         if (usingTabix) {
+             delete tabixFile;
+         }
+     }
+
+     bool is_open(void) { return parsedHeader; }
+
+     // reflects the internal ifstream only
+     bool eof(void) { return _file.eof(); }
+
+     bool done(void) { return _done; }
+
+     bool parseHeader(string& headerStr);
+
+     bool parseHeader(void);
+
+     bool getNextVariant(Variant& var);
+
+     bool setRegion(string region);
+     bool setRegion(string seq, long int start, long int end = 0);
+     vector<string> getHeaderLinesFromFile();
+
+ private:
+     bool firstRecord;
+     bool justSetRegion;
+     bool usingFile;
+     ifstream _file;
+     bool parsedHeader;
+
+ };
+
+ // One ref -> alt substitution at a position.  repr caches the text
+ // "<position>:<ref>/<alt>", which operator< compares.
+ class VariantAllele {
+ public:
+     string ref;
+     string alt;
+     string repr;
+     long position;
+
+     /* // TODO
+     bool isSNP(void);
+     bool isMNP(void);
+     bool isInsertion(void);
+     bool isDeletion(void);
+     bool isIndel(void);
+     */
+
+     // Stores the three fields and derives repr from them.
+     VariantAllele(string r, string a, long p)
+         : ref(r), alt(a), position(p)
+     {
+         stringstream text;
+         text << p << ":" << r << "/" << a;
+         repr = text.str();
+     }
+
+     friend ostream& operator<<(ostream& out, VariantAllele& var);
+     friend bool operator<(const VariantAllele& a, const VariantAllele& b);
+     friend VariantAllele operator+(const VariantAllele& a, const VariantAllele& b);
+ };
+
+ // One VCF record: locus, alleles, INFO values and per-sample FORMAT values,
+ // plus a pointer to the VariantCallFile whose header describes the fields.
+ class Variant {
+
+     friend ostream& operator<<(ostream& out, Variant& var);
+
+ public:
+
+     string sequenceName;
+     long position;
+     long zeroBasedPosition(void);
+     string id;
+     string ref;
+     vector<string> alt;      // a list of all the alternate alleles present at this locus
+     vector<string> alleles;  // a list of all alleles (ref + alt) at this locus
+                              // the indices are organized such that the genotype codes (0, 1, 2, etc.)
+                              // correspond to the correct offset into the alleles vector.
+                              // that is, alleles[0] = ref, alleles[1] = first alternate allele, etc.
+     string vrepr(void);  // a comparable record of the variation described by the record
+     set<string> altSet(void);  // set of alleles, rather than vector of them
+     map<string, int> altAlleleIndexes;  // reverse lookup for alleles
+     map<string, vector<VariantAllele> > parsedAlternates(bool includePreviousBaseForIndels = false,
+                                                          bool useMNPs = false,
+                                                          bool useEntropy = false,
+                                                          float matchScore = 10.0f,
+                                                          float mismatchScore = -9.0f,
+                                                          float gapOpenPenalty = 15.0f,
+                                                          float gapExtendPenalty = 6.66f,
+                                                          float repeatGapExtendPenalty = 0.0f,
+                                                          string flankingRefLeft = "",
+                                                          string flankingRefRight = "");
+     // the same output format as parsedAlternates, without parsing
+     map<string, vector<VariantAllele> > flatAlternates(void);
+
+     map<string, string> extendedAlternates(long int newPosition, long int length);
+
+     string originalLine; // the literal of the record, as read
+     // TODO
+     // the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j
+     // vector<pair<int, int> > genotypes;  // indexes into the alleles, ordered as per the spec
+     string filter;
+     double quality;
+     VariantFieldType infoType(string& key);
+     map<string, vector<string> > info;  // vector<string> allows for lists by Genotypes or Alternates
+     map<string, bool> infoFlags;
+     VariantFieldType formatType(string& key);
+     vector<string> format;
+     map<string, map<string, vector<string> > > samples;  // vector<string> allows for lists by Genotypes or Alternates
+     vector<string> sampleNames;
+     vector<string> outputSampleNames;
+     VariantCallFile* vcf;  // not owned; NULL until a VariantCallFile is attached
+
+     //void addInfoInt(string& tag, int value);
+     //void addInfoFloat(string& tag, double value);
+     //void addInfoString(string& tag, string& value);
+
+     void removeAlt(string& altallele);
+
+ public:
+
+     // A detached record: numeric fields start at 0 and vcf at NULL instead of
+     // holding indeterminate values.
+     Variant()
+         : position(0)
+         , quality(0)
+         , vcf(NULL)
+     { }
+
+     // A record bound to v: copies v's sample names for both input and output
+     // and keeps a pointer to v.
+     Variant(VariantCallFile& v)
+         : position(0)
+         , quality(0)
+         , sampleNames(v.sampleNames)
+         , outputSampleNames(v.sampleNames)
+         , vcf(&v)
+     { }
+
+     void setVariantCallFile(VariantCallFile& v);
+     void setVariantCallFile(VariantCallFile* v);
+
+     void parse(string& line, bool parseSamples = true);
+     void addFilter(string& tag);
+     bool getValueBool(string& key, string& sample, int index = INDEX_NONE);
+     double getValueFloat(string& key, string& sample, int index = INDEX_NONE);
+     string getValueString(string& key, string& sample, int index = INDEX_NONE);
+     bool getSampleValueBool(string& key, string& sample, int index = INDEX_NONE);
+     double getSampleValueFloat(string& key, string& sample, int index = INDEX_NONE);
+     string getSampleValueString(string& key, string& sample, int index = INDEX_NONE);
+     bool getInfoValueBool(string& key, int index = INDEX_NONE);
+     double getInfoValueFloat(string& key, int index = INDEX_NONE);
+     string getInfoValueString(string& key, int index = INDEX_NONE);
+     void printAlt(ostream& out);      // print a comma-sep list of alternate alleles to an ostream
+     void printAlleles(ostream& out);  // print a comma-sep list of *all* alleles to an ostream
+     int getAltAlleleIndex(string& allele);
+     void updateAlleleIndexes(void);
+     void addFormatField(string& key);
+     void setOutputSampleNames(vector<string>& outputSamples);
+     map<pair<int, int>, int> getGenotypeIndexesDiploid(void);
+     int getNumSamples(void);
+     int getNumValidGenotypes(void);
+     string getGenotype(string& sample);
+     bool isPhased(void);
+     // TODO
+     //void setInfoField(string& key, string& val);
+
+ private:
+
+     string lastFormat;
+
+ };
+
+
+// from BamTools
+// RuleToken implementation
+
+class RuleToken {
+
+public:
+
+    // Kinds of token a filter expression is made of: operands (literals and
+    // typed variables), operators, and parentheses.
+    enum RuleTokenType { OPERAND = 0
+                       , NUMBER
+                       , BOOLEAN_VARIABLE
+                       , NUMERIC_VARIABLE
+                       , STRING_VARIABLE
+                       , AND_OPERATOR
+                       , OR_OPERATOR
+                       , ADD_OPERATOR
+                       , SUBTRACT_OPERATOR
+                       , MULTIPLY_OPERATOR
+                       , DIVIDE_OPERATOR
+                       , NOT_OPERATOR
+                       , EQUAL_OPERATOR
+                       , GREATER_THAN_OPERATOR
+                       , LESS_THAN_OPERATOR
+                       , LEFT_PARENTHESIS
+                       , RIGHT_PARENTHESIS
+                       };
+
+    // Constructs a token from `token`, given the known variable names and
+    // their field types.  Declared here, defined elsewhere.
+    RuleToken(string token, map<string, VariantFieldType>& variables);
+
+    // Default token: a boolean whose state is false.  Every scalar member is
+    // given a value so that copying or inspecting a default-constructed token
+    // never reads an indeterminate `number` or `isVariable`.
+    RuleToken(void)
+        : type(BOOLEAN_VARIABLE)
+        , number(0)
+        , state(false)
+        , isVariable(false)
+    { }
+
+    // data members
+    RuleTokenType type;
+    string value;
+
+    // typed payloads: numeric, string and boolean
+    double number;
+    string str;
+    bool state;
+
+    bool isVariable; // if this is a variable
+    //bool isEvaluated; // when we evaluate variables
+
+    // Combines this token with `other`; declared here, defined elsewhere.
+    RuleToken apply(RuleToken& other);
+
+};
+
+// Returns the precedence rank assigned to an operator or parenthesis token.
+// Any other token type is reported on stderr and terminates the program.
+inline int priority(const RuleToken& token) {
+    switch ( token.type ) {
+        case RuleToken::MULTIPLY_OPERATOR:
+        case RuleToken::DIVIDE_OPERATOR:
+            return 8;
+        case RuleToken::ADD_OPERATOR:
+        case RuleToken::SUBTRACT_OPERATOR:
+            return 7;
+        case RuleToken::NOT_OPERATOR:
+            return 6;
+        case RuleToken::EQUAL_OPERATOR:
+        case RuleToken::GREATER_THAN_OPERATOR:
+        case RuleToken::LESS_THAN_OPERATOR:
+            return 5;
+        case RuleToken::AND_OPERATOR:
+            return 4;
+        case RuleToken::OR_OPERATOR:
+            return 3;
+        case RuleToken::LEFT_PARENTHESIS:
+        case RuleToken::RIGHT_PARENTHESIS:
+            return 0;
+        default:
+            break;
+    }
+    // only reached for token types that carry no rank
+    cerr << "invalid token type" << endl;
+    exit(1);
+}
+
+// True for the two token types treated as right-associative: NOT and "(".
+inline bool isRightAssociative(const RuleToken& token) {
+    switch ( token.type ) {
+        case RuleToken::NOT_OPERATOR:
+        case RuleToken::LEFT_PARENTHESIS:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// Everything that is not right-associative counts as left-associative.
+inline bool isLeftAssociative(const RuleToken& token) {
+    const bool rightAssociative = isRightAssociative(token);
+    return !rightAssociative;
+}
+
+// True when the token's type is LEFT_PARENTHESIS.
+inline bool isLeftParenthesis(const RuleToken& token) {
+    const RuleToken::RuleTokenType kind = token.type;
+    return RuleToken::LEFT_PARENTHESIS == kind;
+}
+
+// True when the token's type is RIGHT_PARENTHESIS.
+inline bool isRightParenthesis(const RuleToken& token) {
+    const RuleToken::RuleTokenType kind = token.type;
+    return RuleToken::RIGHT_PARENTHESIS == kind;
+}
+
+// True for the operand token types: generic operands, numbers and the three
+// kinds of variable.
+inline bool isOperand(const RuleToken& token) {
+    switch ( token.type ) {
+        case RuleToken::OPERAND:
+        case RuleToken::NUMBER:
+        case RuleToken::NUMERIC_VARIABLE:
+        case RuleToken::STRING_VARIABLE:
+        case RuleToken::BOOLEAN_VARIABLE:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// True for the logical, comparison and arithmetic operator token types.
+// Parentheses are not operators.
+inline bool isOperator(const RuleToken& token) {
+    switch ( token.type ) {
+        case RuleToken::AND_OPERATOR:
+        case RuleToken::OR_OPERATOR:
+        case RuleToken::NOT_OPERATOR:
+        case RuleToken::EQUAL_OPERATOR:
+        case RuleToken::GREATER_THAN_OPERATOR:
+        case RuleToken::LESS_THAN_OPERATOR:
+        case RuleToken::MULTIPLY_OPERATOR:
+        case RuleToken::DIVIDE_OPERATOR:
+        case RuleToken::ADD_OPERATOR:
+        case RuleToken::SUBTRACT_OPERATOR:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// True when c is one of the single-character operator symbols
+// ! & | = > < * / + -
+inline bool isOperatorChar(const char& c) {
+    switch (c) {
+        case '!': case '&': case '|':
+        case '=': case '>': case '<':
+        case '*': case '/': case '+': case '-':
+            return true;
+        default:
+            return false;
+    }
+}
+
+// True when c is an opening or closing round bracket.
+inline bool isParanChar(const char& c) {
+    if (c == '(') {
+        return true;
+    }
+    return c == ')';
+}
+
+// True when the token is a numeric variable (not a NUMBER literal).
+inline bool isNumeric(const RuleToken& token) {
+    const RuleToken::RuleTokenType kind = token.type;
+    return RuleToken::NUMERIC_VARIABLE == kind;
+}
+
+// True when the token is a string variable.
+inline bool isString(const RuleToken& token) {
+    const RuleToken::RuleTokenType kind = token.type;
+    return RuleToken::STRING_VARIABLE == kind;
+}
+
+// True when the token is a boolean variable.
+inline bool isBoolean(const RuleToken& token) {
+    const RuleToken::RuleTokenType kind = token.type;
+    return RuleToken::BOOLEAN_VARIABLE == kind;
+}
+
+// True when the token's type is any of the three variable kinds.
+inline bool isVariable(const RuleToken& token) {
+    if (isNumeric(token)) {
+        return true;
+    }
+    if (isString(token)) {
+        return true;
+    }
+    return isBoolean(token);
+}
+
+// Declaration only; the definition is not part of this header.
+// NOTE(review): this signature takes a stack<RuleToken>, whereas VariantFilter
+// keeps its tokens in queue<RuleToken> members -- confirm that the definition
+// and its callers really use this exact signature.
+void tokenizeFilterSpec(string& filterspec, stack<RuleToken>& tokens, map<string, VariantFieldType>& variables);
+
+
+// A filter expression (spec) kept together with two token queues and a flag
+// saying whether it is a SAMPLE or a RECORD filter.  Only declarations appear
+// here; the member functions are defined elsewhere.
+class VariantFilter {
+
+public:
+
+ // the two kinds of filter
+ enum VariantFilterType { SAMPLE = 0,
+ RECORD };
+
+ string spec;
+ queue<RuleToken> tokens; // tokens, infix notation
+ queue<RuleToken> rules; // tokens, prefix notation
+ VariantFilterType type;
+ // NOTE(review): presumably stores filterspec/filtertype and fills the queues
+ // using `variables` -- confirm against the definition
+ VariantFilter(string filterspec, VariantFilterType filtertype, map<string, VariantFieldType>& variables);
+ bool passes(Variant& var, string& sample); // all alts pass
+ // overload that additionally names one allele
+ bool passes(Variant& var, string& sample, string& allele);
+ // NOTE(review): by its name, drops genotypes that fail the filter; the
+ // effect of keepInfo is not visible here -- confirm against the definition
+ void removeFilteredGenotypes(Variant& var, bool keepInfo);
+
+};
+
+
+// genotype manipulation
+
+// TODO
+//map<string, int> decomposeGenotype(string& genotype);
+
+vector<int> decomposePhasedGenotype(const string& genotype);
+map<int, int> decomposeGenotype(const string& genotype);
+
+string genotypeToString(const map<int, int>& genotype);
+
+string phasedGenotypeToString(const vector<int>& genotype);
+
+bool isHet(const map<int, int>& genotype);
+
+bool isHom(const map<int, int>& genotype);
+
+bool hasNonRef(const map<int, int>& genotype);
+
+bool isHomRef(const map<int, int>& genotype);
+
+bool isHomNonRef(const map<int, int>& genotype);
+
+bool isNull(const map<int, int>& genotype);
+
+int ploidy(const map<int, int>& genotype);
+
+string unionInfoHeaderLines(string& s1, string& s2);
+
+// genotype likelihood ordering
+
+list<list<int> > glorder(int ploidy, int alleles);
+list<list<int> > _glorder(int ploidy, int alleles);
+list<int> glsWithAlt(int alt, int ploidy, int numalts);
+map<int, int> glReorder(int ploidy, int numalts, map<int, int>& alleleIndexMapping, vector<int>& altsToRemove);
+
+vector<string>& unique(vector<string>& strings);
+
+string varCigar(vector<VariantAllele>& vav, bool xForMismatch = false);
+string mergeCigar(const string& c1, const string& c2);
+vector<pair<int, string> > splitCigar(const string& cigarStr);
+list<pair<int, string> > splitCigarList(const string& cigarStr);
+int cigarRefLen(const vector<pair<int, char> >& cigar);
+int cigarRefLen(const vector<pair<int, string> >& cigar);
+vector<pair<int, string> > cleanCigar(const vector<pair<int, string> >& cigar);
+string joinCigar(const vector<pair<int, string> >& cigar);
+string joinCigar(const vector<pair<int, char> >& cigar);
+string joinCigarList(const list<pair<int, string> >& cigar);
+bool isEmptyCigarElement(const pair<int, string>& elem);
+
+// Ordering functor for chromosome names, usable as the comparator of sorted
+// containers and sort calls.  Ordering is delegated to filevercmp().
+class ChromNameCompare {
+public:
+    // a sorts before b when filevercmp reports a negative result
+    bool operator()(const string& a, const string& b) const {
+        const int order = filevercmp(a.c_str(), b.c_str());
+        return order < 0;
+    }
+};
+
+/*
+ * Accumulates VCF meta-information lines and header columns and renders them
+ * as one header string.  Copying is disabled.
+ */
+class VCFHeader
+{
+public:
+    VCFHeader();
+    ~VCFHeader() {}
+
+    /*
+     * Adds header_column to this->header_columns if
+     * it doesn't already exist.
+     */
+    void addHeaderColumn(const string& header_column);
+
+    /*
+     * Adds meta_line to either header_lines or header_lists.
+     *
+     * We parse out the ##_type_ from meta_line
+     * - If the meta_line ##_type_ is a key in header_lines then meta_line is added to header_lines
+     * - If the meta_line ##_type_ is a key in header_lists then meta_line is added to header_lists[##_type_] vector<string>
+     *   Unless that header_lists[##_type_] vector already contains the ID that is in meta_line, in that case it is not added
+     */
+    void addMetaInformationLine(const string& meta_line);
+
+    /*
+     * Converts header_lines, header_lists and header_columns to a proper VCF header
+     */
+    string getHeaderString();
+
+private:
+    VCFHeader(const VCFHeader& vcfHeader); // Do not implement the copy constructor, there is no reason to add this functionality
+    VCFHeader& operator=(const VCFHeader& vcfHeader); // Do not implement operator=, there is no reason to add this functionality
+
+    /*
+     * This is a helper function that determines if the ID substring contained in meta_line
+     * exists as an ID substring within the vector<string> meta_lines. Returns true if
+     * the ID exists within the vector and false otherwise.
+     */
+    bool metaInfoIdExistsInVector(const string& meta_line, vector<string>& meta_lines);
+
+    /*
+     * header_line_names_ordered contains all the header lines that
+     * are available and in the expected order for a valid VCF file
+     */
+    vector<string> header_line_names_ordered;
+    /*
+     * header_list_names_ordered contains all the header lists that
+     * are available and in the expected order for a valid VCF file
+     */
+    vector<string> header_list_names_ordered;
+
+    /*
+     * header_columns is set by the constructor to contain the 8 mandatory VCF fields.
+     * Also, unique header_columns for each of the vcf files are added as well.
+     * Duplicates are not allowed, to prevent duplicates use addHeaderColumn when adding header columns
+     */
+    vector<string> header_columns;
+
+    /*
+     * The maps we're going to be using are ordered case-insensitively,
+     * so that "fileFormat" and "fileformat" refer to the same entry.
+     *
+     * The comparators are plain function objects.  They deliberately do not
+     * derive from std::binary_function, which was deprecated in C++11 and
+     * removed in C++17; std::map and std::lexicographical_compare only need
+     * operator().
+     */
+    struct stringcasecmp {
+        // compares two characters after lower-casing; unsigned char keeps
+        // the argument to tolower() in its valid range
+        struct charcasecmp {
+            bool operator() (const unsigned char& c1, const unsigned char& c2) const {
+                return tolower (c1) < tolower (c2);
+            }
+        };
+        bool operator() (const std::string & s1, const std::string & s2) const {
+            return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(), s2.end(), charcasecmp());
+        }
+    };
+
+    // contains all the ##_types_ as keys, the value is either empty or a VCF file has set it
+    map<string, string, stringcasecmp> header_lines;
+
+    // contains all the ##_types_ as keys, the value is a vector of ##_type_ (since there can be duplicate #INFO for example, duplicate ids are not allowed)
+    map<string, vector<string>, stringcasecmp> header_lists;
+
+};
+
+} // end namespace VCF
+
+#endif
diff --git a/src/convert.h b/src/convert.h
new file mode 100644
index 0000000..d73d518
--- /dev/null
+++ b/src/convert.h
@@ -0,0 +1,22 @@
+#ifndef __CONVERT_H
+#define __CONVERT_H
+
+#include <sstream>
+
+// Converts the string into the specified type, setting r to the converted
+// value and returning true/false on success or failure.
+//
+// Success means that the extraction itself succeeded AND consumed the whole
+// string.  Checking eof() alone is not enough: a failed extraction that runs
+// into the end of the input (e.g. an empty string) sets eofbit as well as
+// failbit, and would otherwise be reported as a success.
+template<typename T>
+bool convert(const std::string& s, T& r) {
+    std::istringstream iss(s);
+    iss >> r;
+    return !iss.fail() && iss.eof();
+}
+
+// Formats r with operator<< and returns the resulting text.
+template<typename T>
+std::string convert(const T& r) {
+    std::ostringstream formatted;
+    formatted << r;
+    const std::string text = formatted.str();
+    return text;
+}
+
+#endif
diff --git a/src/join.h b/src/join.h
new file mode 100644
index 0000000..c46a75f
--- /dev/null
+++ b/src/join.h
@@ -0,0 +1,36 @@
+#ifndef __JOIN_H
+#define __JOIN_H
+
+// functions to join the elements of a container into one delimited string
+#include <list>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <string.h>
+
+// join a vector of elements by a delimiter object. ostream<< must be defined
+// for both class S and T and an ostream, as it is e.g. in the case of strings
+// and character arrays.  An empty vector yields an empty string.
+template<class S, class T>
+std::string join(std::vector<T>& elems, S& delim) {
+    std::stringstream ss;
+    typename std::vector<T>::iterator e = elems.begin();
+    // begin() of an empty container must not be dereferenced
+    if (e == elems.end()) {
+        return ss.str();
+    }
+    ss << *e++;
+    for (; e != elems.end(); ++e) {
+        ss << delim << *e;
+    }
+    return ss.str();
+}
+
+// same for lists: joins the elements with delim between neighbours.
+// An empty list yields an empty string.
+template<class S, class T>
+std::string join(std::list<T>& elems, S& delim) {
+    std::stringstream ss;
+    typename std::list<T>::iterator e = elems.begin();
+    // begin() of an empty container must not be dereferenced
+    if (e == elems.end()) {
+        return ss.str();
+    }
+    ss << *e++;
+    for (; e != elems.end(); ++e) {
+        ss << delim << *e;
+    }
+    return ss.str();
+}
+
+#endif
diff --git a/src/mt19937ar.h b/src/mt19937ar.h
new file mode 100644
index 0000000..3f239e1
--- /dev/null
+++ b/src/mt19937ar.h
@@ -0,0 +1,192 @@
+/*
+ A C-program for MT19937, with initialization improved 2002/1/26.
+ Coded by Takuji Nishimura and Makoto Matsumoto.
+
+ Before using, initialize the state by using init_genrand(seed)
+ or init_by_array(init_key, key_length).
+
+ Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ 3. The names of its contributors may not be used to endorse or promote
+ products derived from this software without specific prior written
+ permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+ Any feedback is very welcome.
+ http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+ email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
+*/
+
+#include <stdio.h>
+
+/* Period parameters */
+#define N 624
+#define M 397
+#define MATRIX_A 0x9908b0dfUL /* constant vector a */
+#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
+#define LOWER_MASK 0x7fffffffUL /* least significant r bits */
+
+static unsigned long mt[N]; /* the array for the state vector */
+static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */
+
+/* Seeds the whole state vector mt[] from s and leaves mti == N, so that the
+   next draw regenerates the state. */
+void init_genrand(unsigned long s)
+{
+    /* only the low 32 bits of the seed are used */
+    mt[0] = s & 0xffffffffUL;
+    mti = 1;
+    while (mti < N) {
+        const unsigned long prev = mt[mti-1];
+        /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
+        /* In the previous versions, MSBs of the seed affect */
+        /* only MSBs of the array mt[]. */
+        /* 2002/01/09 modified by Makoto Matsumoto */
+        /* the mask keeps each word to 32 bits on >32 bit machines */
+        mt[mti] = (1812433253UL * (prev ^ (prev >> 30)) + mti) & 0xffffffffUL;
+        mti++;
+    }
+}
+
+/* initialize by an array with array-length */
+/* init_key is the array for initializing keys */
+/* key_length is its length */
+/* slight change for C++, 2004/2/26 */
+/* NOTE(review): init_key[0] is read even when key_length is 0, so callers */
+/* must pass at least one key word. */
+void init_by_array(unsigned long init_key[], int key_length)
+{
+ int i, j, k;
+ /* start from a fixed seed, then fold the key into the state */
+ init_genrand(19650218UL);
+ i=1; j=0;
+ /* first pass runs max(N, key_length) times, cycling through the key */
+ k = (N>key_length ? N : key_length);
+ for (; k; k--) {
+ mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL))
+ + init_key[j] + j; /* non linear */
+ mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
+ i++; j++;
+ if (i>=N) { mt[0] = mt[N-1]; i=1; }
+ if (j>=key_length) j=0;
+ }
+ /* second pass: N-1 further mixing steps that no longer use the key */
+ for (k=N-1; k; k--) {
+ mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL))
+ - i; /* non linear */
+ mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
+ i++;
+ if (i>=N) { mt[0] = mt[N-1]; i=1; }
+ }
+
+ mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */
+}
+
+/* generates a random number on [0,0xffffffff]-interval */
+/* The state is consumed one word per call; once all N words are used up */
+/* the whole vector is regenerated in one go. */
+unsigned long genrand_int32(void)
+{
+ unsigned long y;
+ static unsigned long mag01[2]={0x0UL, MATRIX_A};
+ /* mag01[x] = x * MATRIX_A for x=0,1 */
+
+ if (mti >= N) { /* generate N words at one time */
+ int kk;
+
+ if (mti == N+1) /* if init_genrand() has not been called, */
+ init_genrand(5489UL); /* a default initial seed is used */
+
+ /* words 0 .. N-M-1 combine with the word M places ahead */
+ for (kk=0;kk<N-M;kk++) {
+ y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
+ mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];
+ }
+ /* words N-M .. N-2 wrap around: kk+(M-N) indexes an already updated word */
+ for (;kk<N-1;kk++) {
+ y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
+ mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1UL];
+ }
+ /* the last word pairs with mt[0] */
+ y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);
+ mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL];
+
+ mti = 0;
+ }
+
+ y = mt[mti++];
+
+ /* Tempering */
+ y ^= (y >> 11);
+ y ^= (y << 7) & 0x9d2c5680UL;
+ y ^= (y << 15) & 0xefc60000UL;
+ y ^= (y >> 18);
+
+ return y;
+}
+
+/* generates a random number on [0,0x7fffffff]-interval */
+/* by dropping the lowest bit of a 32-bit draw */
+long genrand_int31(void)
+{
+    const unsigned long word = genrand_int32();
+    return (long)(word >> 1);
+}
+
+/* generates a random number on [0,1]-real-interval */
+double genrand_real1(void)
+{
+    /* divided by 2^32-1 */
+    const double scale = 1.0/4294967295.0;
+    return genrand_int32()*scale;
+}
+
+/* generates a random number on [0,1)-real-interval */
+double genrand_real2(void)
+{
+    /* divided by 2^32 */
+    const double scale = 1.0/4294967296.0;
+    return genrand_int32()*scale;
+}
+
+/* generates a random number on (0,1)-real-interval */
+double genrand_real3(void)
+{
+    /* shift by half a step so neither end point can occur, then divide by 2^32 */
+    const double shifted = ((double)genrand_int32()) + 0.5;
+    return shifted*(1.0/4294967296.0);
+}
+
+/* generates a random number on [0,1) with 53-bit resolution*/
+double genrand_res53(void)
+{
+    /* two draws, taken in this order: 27 high bits, then 26 low bits */
+    const unsigned long a = genrand_int32()>>5;
+    const unsigned long b = genrand_int32()>>6;
+    /* a*2^26 + b is a 53-bit integer; divide by 2^53 */
+    return (a*67108864.0+b)*(1.0/9007199254740992.0);
+}
+/* These real versions are due to Isaku Wada, 2002/01/09 added */
+
+/*
+int main(void)
+{
+ int i;
+ unsigned long init[4]={0x123, 0x234, 0x345, 0x456}, length=4;
+ init_by_array(init, length);
+ printf("1000 outputs of genrand_int32()\n");
+ for (i=0; i<1000; i++) {
+ printf("%10lu ", genrand_int32());
+ if (i%5==4) printf("\n");
+ }
+ printf("\n1000 outputs of genrand_real2()\n");
+ for (i=0; i<1000; i++) {
+ printf("%10.8f ", genrand_real2());
+ if (i%5==4) printf("\n");
+ }
+ return 0;
+}
+*/
diff --git a/src/split.cpp b/src/split.cpp
new file mode 100644
index 0000000..831dfcd
--- /dev/null
+++ b/src/split.cpp
@@ -0,0 +1,23 @@
+#include "split.h"
+
+
+// Splits s on the single character delim, appending the pieces to elems,
+// and returns elems.
+std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
+    // tokenize() takes a set of delimiters; build a one-character set
+    const std::string delimiterSet(1, delim);
+    tokenize(s, elems, delimiterSet);
+    return elems;
+}
+
+// Splits s on the single character delim and returns the pieces by value.
+std::vector<std::string> split(const std::string &s, char delim) {
+    std::vector<std::string> pieces;
+    split(s, delim, pieces);
+    return pieces;
+}
+
+// Splits s on any character contained in delims, appending the pieces to
+// elems, and returns elems.
+std::vector<std::string> &split(const std::string &s, const std::string& delims, std::vector<std::string> &elems) {
+    std::vector<std::string> &out = elems;
+    tokenize(s, out, delims);
+    return out;
+}
+
+// Splits s on any character contained in delims and returns the pieces by value.
+std::vector<std::string> split(const std::string &s, const std::string& delims) {
+    std::vector<std::string> pieces;
+    split(s, delims, pieces);
+    return pieces;
+}
diff --git a/src/split.h b/src/split.h
new file mode 100644
index 0000000..e10ba78
--- /dev/null
+++ b/src/split.h
@@ -0,0 +1,53 @@
+#ifndef __SPLIT_H
+#define __SPLIT_H
+
+// functions to split a string by a specific delimiter
+#include <string>
+#include <vector>
+#include <sstream>
+#include <string.h>
+
+// thanks to Evan Teran, http://stackoverflow.com/questions/236129/how-to-split-a-string/236803#236803
+
+// split a string on a single delimiter character (delim)
+std::vector<std::string>& split(const std::string &s, char delim, std::vector<std::string> &elems);
+std::vector<std::string> split(const std::string &s, char delim);
+
+// split a string on any character found in the string of delimiters (delims)
+std::vector<std::string>& split(const std::string &s, const std::string& delims, std::vector<std::string> &elems);
+std::vector<std::string> split(const std::string &s, const std::string& delims);
+
+// from Marius, http://stackoverflow.com/a/1493195/238609
+//
+// Cuts str at every character found in delimiters and appends each piece to
+// tokens.  Empty pieces are kept unless trimEmpty is true.  An empty str
+// therefore yields one empty token (or none with trimEmpty).
+template < class ContainerT >
+void tokenize(const std::string& str, ContainerT& tokens,
+              const std::string& delimiters = " ", const bool trimEmpty = false)
+{
+    typedef typename ContainerT::value_type Token;
+    typedef typename Token::size_type TokenSize;
+
+    std::string::size_type start = 0;
+    for (;;) {
+        const std::string::size_type hit = str.find_first_of(delimiters, start);
+        const bool lastPiece = (hit == std::string::npos);
+        // the final piece runs to the end of the string
+        const std::string::size_type stop = lastPiece ? str.length() : hit;
+
+        if (stop != start || !trimEmpty) {
+            tokens.push_back(Token(str.data() + start, (TokenSize)stop - start));
+        }
+        if (lastPiece) {
+            break;
+        }
+        // resume just past the delimiter
+        start = stop + 1;
+    }
+}
+
+
+#endif
diff --git a/src/ssw.c b/src/ssw.c
new file mode 100644
index 0000000..69646f1
--- /dev/null
+++ b/src/ssw.c
@@ -0,0 +1,834 @@
+/*
+ * ssw.c
+ *
+ * Created by Mengyao Zhao on 6/22/10.
+ * Copyright 2010 Boston College. All rights reserved.
+ * Version 0.1.4
+ * Last revision by Mengyao Zhao on 07/31/12.
+ *
+ */
+
+#include <emmintrin.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "ssw.h"
+
+#ifdef __GNUC__
+#define LIKELY(x) __builtin_expect((x),1)
+#define UNLIKELY(x) __builtin_expect((x),0)
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
+
+/* Convert the coordinate in the scoring matrix into the coordinate in one line of the band. */
+#define set_u(u, w, i, j) { int x=(i)-(w); x=x>0?x:0; (u)=(j)-x+1; }
+
+/* Convert the coordinate in the direction matrix into the coordinate in one line of the band.
+   u receives ((j) - max((i)-(w), 0)) * 3 + (p).  p is parenthesized so that an
+   expression argument such as `a & b` is added as a whole. */
+#define set_d(u, w, i, j, p) { int x=(i)-(w); x=x>0?x:0; x=(j)-x; (u)=x*3+(p); }
+
+/*! @function
+ @abstract Round an integer to the next closest power-2 integer.
+ @param x integer to be rounded (in place)
+ @discussion x will be modified.
+ */
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+
+/* End point of one alignment, as returned (in pairs) by sw_sse2_byte. */
+typedef struct {
+ uint16_t score; /* alignment score; sw_sse2_byte stores 255 here when its 8-bit score would overflow */
+ int32_t ref; //0-based position
+ int32_t read; //alignment ending position on read, 0-based
+} alignment_end;
+
+/* NOTE(review): presumably one alignment's CIGAR -- seq holding the packed
+   operations and length their count; nothing in this part of the file uses
+   the type, so confirm against the code that fills it. */
+typedef struct {
+ uint32_t* seq;
+ int32_t length;
+} cigar;
+
+/* Query profile bundle.  NOTE(review): the fields mirror the arguments of
+   qP_byte (read, mat, readLen, n, bias); presumably profile_byte is
+   a qP_byte result and profile_word a qP_word result -- confirm where the
+   struct is filled in. */
+struct _profile{
+ __m128i* profile_byte; // 0: none
+ __m128i* profile_word; // 0: none
+ const int8_t* read;
+ const int8_t* mat;
+ int32_t readLen;
+ int32_t n;
+ uint8_t bias;
+};
+
+/* Build the 8-bit striped query profile: for every reference symbol nt, the
+   (biased) score of nt against each read position, laid out so that one
+   128-bit vector holds 16 read positions that are segLen apart.
+   Positions past the end of the read are padded with bias.
+   The profile is malloc'ed and returned to the caller. */
+__m128i* qP_byte (const int8_t* read_num,
+                  const int8_t* mat,
+                  const int32_t readLen,
+                  const int32_t n, /* the edge length of the square matrix mat */
+                  uint8_t bias) {
+
+    /* A 128 bit register is split into 16 pieces of 8 bit each, so the read
+       is split into 16 segments that are calculated in parallel. */
+    int32_t segLen = (readLen + 15) / 16;
+    __m128i* vProfile = (__m128i*)malloc(n * segLen * sizeof(__m128i));
+    int8_t* out = (int8_t*)vProfile;
+    int32_t nt, seg, lane;
+
+    for (nt = 0; LIKELY(nt < n); nt ++) {
+        for (seg = 0; seg < segLen; seg ++) {
+            for (lane = 0; LIKELY(lane < 16) ; lane ++) {
+                /* lane k of vector seg covers read position seg + k*segLen */
+                const int32_t pos = seg + lane * segLen;
+                if (pos >= readLen) {
+                    *out = bias;
+                } else {
+                    *out = mat[nt * n + read_num[pos]] + bias;
+                }
+                out++;
+            }
+        }
+    }
+    return vProfile;
+}
+
+/* Striped Smith-Waterman, 8-bit lanes.
+   Record the highest score of each reference position.
+   Return the alignment score and ending position of the best alignment and a
+   candidate for the 2nd best alignment.
+   Gap begin and gap extension are different.
+   weight_match > 0, all other weights < 0.
+   The returned positions are 0-based.
+
+   Returns a calloc'ed array of two alignment_end records:
+     bests[0]  best score with its ending position on reference and read;
+               the score is 255 when the 8-bit score would overflow
+     bests[1]  highest column score found outside the window of maskLen
+               columns around bests[0].ref (its read field is always 0)
+ */
+alignment_end* sw_sse2_byte (const int8_t* ref,
+                             int8_t ref_dir, // 0: forward ref; 1: reverse ref
+                             int32_t refLen,
+                             int32_t readLen,
+                             const uint8_t weight_gapO, /* will be used as - */
+                             const uint8_t weight_gapE, /* will be used as - */
+                             __m128i* vProfile,
+                             uint8_t terminate, /* the best alignment score: used to terminate
+                                                   the matrix calculation when locating the
+                                                   alignment beginning point. If this score
+                                                   is set to 0, it will not be used */
+                             uint8_t bias, /* Shift 0 point to a positive value. */
+                             int32_t maskLen) {
+
+/* Horizontal maximum of the 16 unsigned bytes of vm, written to m (vm is clobbered). */
+#define max16(m, vm) (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 8)); \
+                     (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 4)); \
+                     (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 2)); \
+                     (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 1)); \
+                     (m) = _mm_extract_epi16((vm), 0)
+
+    uint8_t max = 0; /* the max alignment score */
+    int32_t end_read = readLen - 1;
+    int32_t end_ref = -1; /* 0_based best alignment ending point; Initialized as isn't aligned -1. */
+    int32_t segLen = (readLen + 15) / 16; /* number of segment */
+
+    /* array to record the largest score of each reference position */
+    uint8_t* maxColumn = (uint8_t*) calloc(refLen, 1);
+
+    /* Define 16 byte 0 vector. */
+    __m128i vZero = _mm_set1_epi32(0);
+
+    __m128i* pvHStore = (__m128i*) calloc(segLen, sizeof(__m128i));
+    __m128i* pvHLoad = (__m128i*) calloc(segLen, sizeof(__m128i));
+    __m128i* pvE = (__m128i*) calloc(segLen, sizeof(__m128i));
+    __m128i* pvHmax = (__m128i*) calloc(segLen, sizeof(__m128i));
+
+    int32_t i, j;
+    /* 16 byte insertion begin vector */
+    __m128i vGapO = _mm_set1_epi8(weight_gapO);
+
+    /* 16 byte insertion extension vector */
+    __m128i vGapE = _mm_set1_epi8(weight_gapE);
+
+    /* 16 byte bias vector */
+    __m128i vBias = _mm_set1_epi8(bias);
+
+    __m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
+    __m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */
+    __m128i vTemp;
+    int32_t edge, begin = 0, end = refLen, step = 1;
+
+    /* outer loop to process the reference sequence */
+    if (ref_dir == 1) {
+        begin = refLen - 1;
+        end = -1;
+        step = -1;
+    }
+    for (i = begin; LIKELY(i != end); i += step) {
+        int32_t cmp;
+        __m128i e = vZero, vF = vZero, vMaxColumn = vZero; /* Initialize F value to 0.
+                               Any errors to vH values will be corrected in the Lazy_F loop.
+                             */
+
+        __m128i vH = pvHStore[segLen - 1];
+        vH = _mm_slli_si128 (vH, 1); /* Shift the 128-bit value in vH left by 1 byte. */
+        __m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */
+
+        /* Swap the 2 H buffers. */
+        __m128i* pv = pvHLoad;
+        pvHLoad = pvHStore;
+        pvHStore = pv;
+
+        /* inner loop to process the query sequence */
+        for (j = 0; LIKELY(j < segLen); ++j) {
+            vH = _mm_adds_epu8(vH, _mm_load_si128(vP + j));
+            vH = _mm_subs_epu8(vH, vBias); /* vH will be always > 0 */
+
+            /* Get max from vH, vE and vF. */
+            e = _mm_load_si128(pvE + j);
+            vH = _mm_max_epu8(vH, e);
+            vH = _mm_max_epu8(vH, vF);
+            vMaxColumn = _mm_max_epu8(vMaxColumn, vH);
+
+            /* Save vH values. */
+            _mm_store_si128(pvHStore + j, vH);
+
+            /* Update vE value. */
+            vH = _mm_subs_epu8(vH, vGapO); /* saturation arithmetic, result >= 0 */
+            e = _mm_subs_epu8(e, vGapE);
+            e = _mm_max_epu8(e, vH);
+            _mm_store_si128(pvE + j, e);
+
+            /* Update vF value. */
+            vF = _mm_subs_epu8(vF, vGapE);
+            vF = _mm_max_epu8(vF, vH);
+
+            /* Load the next vH. */
+            vH = _mm_load_si128(pvHLoad + j);
+        }
+
+        /* Lazy_F loop: has been revised to disallow adjacent insertion and then deletion, so don't update E(i, j), learn from SWPS3 */
+        /* reset pointers to the start of the saved data */
+        j = 0;
+        vH = _mm_load_si128 (pvHStore + j);
+
+        /* the computed vF value is for the given column. since */
+        /* we are at the end, we need to shift the vF value over */
+        /* to the next column. */
+        vF = _mm_slli_si128 (vF, 1);
+        /* cmp == 0xffff once no lane of vF exceeds vH - gapO, i.e. F can no longer improve H */
+        vTemp = _mm_subs_epu8 (vH, vGapO);
+        vTemp = _mm_subs_epu8 (vF, vTemp);
+        vTemp = _mm_cmpeq_epi8 (vTemp, vZero);
+        cmp = _mm_movemask_epi8 (vTemp);
+
+        while (cmp != 0xffff)
+        {
+            vH = _mm_max_epu8 (vH, vF);
+            vMaxColumn = _mm_max_epu8(vMaxColumn, vH);
+            _mm_store_si128 (pvHStore + j, vH);
+            vF = _mm_subs_epu8 (vF, vGapE);
+            j++;
+            if (j >= segLen)
+            {
+                j = 0;
+                vF = _mm_slli_si128 (vF, 1);
+            }
+            vH = _mm_load_si128 (pvHStore + j);
+
+            vTemp = _mm_subs_epu8 (vH, vGapO);
+            vTemp = _mm_subs_epu8 (vF, vTemp);
+            vTemp = _mm_cmpeq_epi8 (vTemp, vZero);
+            cmp = _mm_movemask_epi8 (vTemp);
+        }
+
+        vMaxScore = _mm_max_epu8(vMaxScore, vMaxColumn);
+        vTemp = _mm_cmpeq_epi8(vMaxMark, vMaxScore);
+        cmp = _mm_movemask_epi8(vTemp);
+        if (cmp != 0xffff) {
+            /* some lane improved in this column: reduce to a scalar, restoring
+               vMaxScore afterwards because max16 clobbers its vector argument */
+            uint8_t temp;
+            vMaxMark = vMaxScore;
+            max16(temp, vMaxScore);
+            vMaxScore = vMaxMark;
+
+            if (LIKELY(temp > max)) {
+                max = temp;
+                if (max + bias >= 255) break; //overflow
+                end_ref = i;
+
+                /* Store the column with the highest alignment score in order to trace the alignment ending position on read. */
+                for (j = 0; LIKELY(j < segLen); ++j) pvHmax[j] = pvHStore[j];
+            }
+        }
+
+        /* Record the max score of current column. */
+        max16(maxColumn[i], vMaxColumn);
+        /* NOTE(review): with terminate == 0 this also stops at the first column
+           whose maximum is 0, although the parameter comment says 0 disables
+           the check -- confirm whether that is intended. */
+        if (maxColumn[i] == terminate) break;
+    }
+
+    /* Trace the alignment ending position on read: the smallest read position
+       whose score in the best column equals max. */
+    uint8_t *t = (uint8_t*)pvHmax;
+    int32_t column_len = segLen * 16;
+    for (i = 0; LIKELY(i < column_len); ++i, ++t) {
+        int32_t temp;
+        if (*t == max) {
+            /* byte i is lane i%16 of vector i/16, i.e. read position i/16 + (i%16)*segLen */
+            temp = i / 16 + i % 16 * segLen;
+            if (temp < end_read) end_read = temp;
+        }
+    }
+
+    free(pvHmax);
+    free(pvE);
+    free(pvHLoad);
+    free(pvHStore);
+
+    /* Find the most possible 2nd best alignment. */
+    alignment_end* bests = (alignment_end*) calloc(2, sizeof(alignment_end));
+    bests[0].score = max + bias >= 255 ? 255 : max;
+    bests[0].ref = end_ref;
+    bests[0].read = end_read;
+
+    bests[1].score = 0;
+    bests[1].ref = 0;
+    bests[1].read = 0;
+
+    /* columns left of the masked window */
+    edge = (end_ref - maskLen) > 0 ? (end_ref - maskLen) : 0;
+    for (i = 0; i < edge; i ++) {
+        if (maxColumn[i] > bests[1].score) {
+            bests[1].score = maxColumn[i];
+            bests[1].ref = i;
+        }
+    }
+    /* columns right of the masked window */
+    edge = (end_ref + maskLen) > refLen ? refLen : (end_ref + maskLen);
+    for (i = edge + 1; i < refLen; i ++) {
+        if (maxColumn[i] > bests[1].score) {
+            bests[1].score = maxColumn[i];
+            bests[1].ref = i;
+        }
+    }
+
+    free(maxColumn);
+    return bests;
+}
+
+/* Build the 16-bit striped query profile: for every reference symbol nt, the
+   score of nt against each read position, laid out so that one 128-bit
+   vector holds 8 read positions that are segLen apart.  Positions past the
+   end of the read are padded with 0.  The profile is malloc'ed and returned
+   to the caller. */
+__m128i* qP_word (const int8_t* read_num,
+                  const int8_t* mat,
+                  const int32_t readLen,
+                  const int32_t n) {
+
+    /* 8 lanes of 16 bit per register, hence 8 segments */
+    int32_t segLen = (readLen + 7) / 8;
+    __m128i* vProfile = (__m128i*)malloc(n * segLen * sizeof(__m128i));
+    int16_t* out = (int16_t*)vProfile;
+    int32_t nt, seg, lane;
+
+    for (nt = 0; LIKELY(nt < n); nt ++) {
+        for (seg = 0; seg < segLen; seg ++) {
+            for (lane = 0; LIKELY(lane < 8) ; lane ++) {
+                /* lane k of vector seg covers read position seg + k*segLen */
+                const int32_t pos = seg + lane * segLen;
+                if (pos >= readLen) {
+                    *out = 0;
+                } else {
+                    *out = mat[nt * n + read_num[pos]];
+                }
+                out++;
+            }
+        }
+    }
+    return vProfile;
+}
+
+/* Striped Smith-Waterman scan using eight 16-bit score lanes per vector.
+ *
+ * ref         reference residues, used as row indices into vProfile
+ * ref_dir     0: walk ref from 0 to refLen - 1; 1: walk it from refLen - 1 down to 0
+ * refLen      number of reference residues
+ * readLen     number of read residues the profile was built for
+ * weight_gapO gap-open weight, subtracted from scores with unsigned saturation
+ * weight_gapE gap-extension weight, subtracted with unsigned saturation
+ * vProfile    word query profile (see qP_word)
+ * terminate   the scan stops at the first column whose maximum equals this value
+ * maskLen     reference positions within this distance of the best end are
+ *             excluded when searching for the second-best column
+ *
+ * Returns a calloc'ed array of two alignment_end records that the caller must
+ * free: [0] holds the best score with its reference and read end positions,
+ * [1] holds the best column maximum found outside the masked region. */
+alignment_end* sw_sse2_word (const int8_t* ref,
+                             int8_t ref_dir, // 0: forward ref; 1: reverse ref
+                             int32_t refLen,
+                             int32_t readLen,
+                             const uint8_t weight_gapO, /* will be used as - */
+                             const uint8_t weight_gapE, /* will be used as - */
+                             __m128i* vProfile,
+                             uint16_t terminate,
+                             int32_t maskLen) {
+
+/* Horizontal maximum of the eight int16 lanes of vm, stored into m (clobbers vm). */
+#define max8(m, vm) (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 8)); \
+                    (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 4)); \
+                    (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 2)); \
+                    (m) = _mm_extract_epi16((vm), 0)
+
+    uint16_t max = 0;                       /* the max alignment score */
+    int32_t end_read = readLen - 1;
+    int32_t end_ref = 0;                    /* best alignment ending column on the reference; stays 0 when nothing aligned */
+    int32_t segLen = (readLen + 7) / 8;     /* number of segments */
+
+    /* array to record the largest score of each reference position */
+    uint16_t* maxColumn = (uint16_t*) calloc(refLen, 2);
+
+    /* Define 16 byte 0 vector. */
+    __m128i vZero = _mm_set1_epi32(0);
+
+    /* Two H columns that are swapped every reference position, the E column,
+       and a copy of the H column at which the best score was seen. */
+    __m128i* pvHStore = (__m128i*) calloc(segLen, sizeof(__m128i));
+    __m128i* pvHLoad = (__m128i*) calloc(segLen, sizeof(__m128i));
+    __m128i* pvE = (__m128i*) calloc(segLen, sizeof(__m128i));
+    __m128i* pvHmax = (__m128i*) calloc(segLen, sizeof(__m128i));
+
+    int32_t i, j, k;
+    /* 16 byte insertion begin vector */
+    __m128i vGapO = _mm_set1_epi16(weight_gapO);
+
+    /* 16 byte insertion extension vector */
+    __m128i vGapE = _mm_set1_epi16(weight_gapE);
+
+    __m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
+    __m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */
+    __m128i vTemp;
+    int32_t edge, begin = 0, end = refLen, step = 1;
+
+    /* outer loop to process the reference sequence */
+    if (ref_dir == 1) {
+        begin = refLen - 1;
+        end = -1;
+        step = -1;
+    }
+    for (i = begin; LIKELY(i != end); i += step) {
+        int32_t cmp;
+        __m128i e = vZero, vF = vZero; /* Initialize F value to 0.
+                                Any errors to vH values will be corrected in the Lazy_F loop.
+                             */
+        __m128i vH = pvHStore[segLen - 1];
+        vH = _mm_slli_si128 (vH, 2); /* Shift the 128-bit value in vH left by 2 byte. */
+
+        /* Swap the 2 H buffers. */
+        __m128i* pv = pvHLoad;
+
+        __m128i vMaxColumn = vZero; /* vMaxColumn is used to record the max values of column i. */
+
+        __m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */
+        pvHLoad = pvHStore;
+        pvHStore = pv;
+
+        /* inner loop to process the query sequence */
+        for (j = 0; LIKELY(j < segLen); j ++) {
+            vH = _mm_adds_epi16(vH, _mm_load_si128(vP + j));
+
+            /* Get max from vH, vE and vF. */
+            e = _mm_load_si128(pvE + j);
+            vH = _mm_max_epi16(vH, e);
+            vH = _mm_max_epi16(vH, vF);
+            vMaxColumn = _mm_max_epi16(vMaxColumn, vH);
+
+            /* Save vH values. */
+            _mm_store_si128(pvHStore + j, vH);
+
+            /* Update vE value. */
+            vH = _mm_subs_epu16(vH, vGapO); /* saturation arithmetic, result >= 0 */
+            e = _mm_subs_epu16(e, vGapE);
+            e = _mm_max_epi16(e, vH);
+            _mm_store_si128(pvE + j, e);
+
+            /* Update vF value. */
+            vF = _mm_subs_epu16(vF, vGapE);
+            vF = _mm_max_epi16(vF, vH);
+
+            /* Load the next vH. */
+            vH = _mm_load_si128(pvHLoad + j);
+        }
+
+        /* Lazy_F loop: has been revised to disallow adjacent insertion and then deletion, so don't update E(i, j), learn from SWPS3 */
+        for (k = 0; LIKELY(k < 8); ++k) {
+            vF = _mm_slli_si128 (vF, 2);
+            for (j = 0; LIKELY(j < segLen); ++j) {
+                vH = _mm_load_si128(pvHStore + j);
+                vH = _mm_max_epi16(vH, vF);
+                _mm_store_si128(pvHStore + j, vH);
+                vH = _mm_subs_epu16(vH, vGapO);
+                vF = _mm_subs_epu16(vF, vGapE);
+                /* Stop as soon as no lane of vF can still raise H. */
+                if (UNLIKELY(! _mm_movemask_epi8(_mm_cmpgt_epi16(vF, vH)))) goto end;
+            }
+        }
+
+end:
+        vMaxScore = _mm_max_epi16(vMaxScore, vMaxColumn);
+        vTemp = _mm_cmpeq_epi16(vMaxMark, vMaxScore);
+        cmp = _mm_movemask_epi8(vTemp);
+        if (cmp != 0xffff) {
+            /* At least one lane improved in this column. */
+            uint16_t temp;
+            vMaxMark = vMaxScore;
+            max8(temp, vMaxScore);
+            vMaxScore = vMaxMark; /* max8 clobbered vMaxScore; restore it */
+
+            if (LIKELY(temp > max)) {
+                max = temp;
+                end_ref = i;
+                for (j = 0; LIKELY(j < segLen); ++j) pvHmax[j] = pvHStore[j];
+            }
+        }
+
+        /* Record the max score of current column. */
+        max8(maxColumn[i], vMaxColumn);
+        if (maxColumn[i] == terminate) break;
+    }
+
+    /* Trace the alignment ending position on read: the smallest read position
+       whose score in the saved best column equals max. */
+    uint16_t *t = (uint16_t*)pvHmax;
+    int32_t column_len = segLen * 8;
+    for (i = 0; LIKELY(i < column_len); ++i, ++t) {
+        int32_t temp;
+        if (*t == max) {
+            temp = i / 8 + i % 8 * segLen; /* striped slot -> read position */
+            if (temp < end_read) end_read = temp;
+        }
+    }
+
+    free(pvHmax);
+    free(pvE);
+    free(pvHLoad);
+    free(pvHStore);
+
+    /* Find the most possible 2nd best alignment. */
+    alignment_end* bests = (alignment_end*) calloc(2, sizeof(alignment_end));
+    bests[0].score = max;
+    bests[0].ref = end_ref;
+    bests[0].read = end_read;
+
+    bests[1].score = 0;
+    bests[1].ref = 0;
+    bests[1].read = 0;
+
+    /* Columns before the masked window around end_ref. */
+    edge = (end_ref - maskLen) > 0 ? (end_ref - maskLen) : 0;
+    for (i = 0; i < edge; i ++) {
+        if (maxColumn[i] > bests[1].score) {
+            bests[1].score = maxColumn[i];
+            bests[1].ref = i;
+        }
+    }
+    /* Columns after the masked window. */
+    edge = (end_ref + maskLen) > refLen ? refLen : (end_ref + maskLen);
+    for (i = edge; i < refLen; i ++) {
+        if (maxColumn[i] > bests[1].score) {
+            bests[1].score = maxColumn[i];
+            bests[1].ref = i;
+        }
+    }
+
+    free(maxColumn);
+    return bests;
+}
+
+/* Banded Smith-Waterman with trace-back, used to produce the cigar of an
+ * alignment whose score and end points are already known.
+ *
+ * ref, read     the aligned sub-sequences (residue codes indexing mat)
+ * refLen        length of ref
+ * readLen       length of read
+ * score         expected best score; the band is doubled until it is reached
+ * weight_gapO   gap-open weight, subtracted from scores
+ * weight_gapE   gap-extension weight, subtracted from scores
+ * band_width    initial band half-width
+ * mat, n        n * n weight matrix
+ *
+ * Returns a malloc'ed cigar whose seq array is also malloc'ed (each entry is
+ * length << 4 | op, with op 0 = M, 1 = I, 2 = D); the caller owns both.
+ * Returns 0 after printing a message when the trace-back meets an unknown
+ * direction code; every buffer allocated here is released on that path. */
+cigar* banded_sw (const int8_t* ref,
+                  const int8_t* read,
+                  int32_t refLen,
+                  int32_t readLen,
+                  int32_t score,
+                  const uint32_t weight_gapO, /* will be used as - */
+                  const uint32_t weight_gapE, /* will be used as - */
+                  int32_t band_width,
+                  const int8_t* mat, /* pointer to the weight matrix */
+                  int32_t n) {
+
+    uint32_t *c = (uint32_t*)malloc(16 * sizeof(uint32_t)), *c1;
+    int32_t i, j, e, f, temp1, temp2, s = 16, s1 = 8, s2 = 1024, l, max = 0;
+    int32_t width, width_d, *h_b, *e_b, *h_c;
+    int8_t *direction, *direction_line;
+    cigar* result = (cigar*)malloc(sizeof(cigar));
+    h_b = (int32_t*)malloc(s1 * sizeof(int32_t));
+    e_b = (int32_t*)malloc(s1 * sizeof(int32_t));
+    h_c = (int32_t*)malloc(s1 * sizeof(int32_t));
+    direction = (int8_t*)malloc(s2 * sizeof(int8_t));
+
+    /* Fill the banded matrix; widen the band until the known score is reached. */
+    do {
+        width = band_width * 2 + 3, width_d = band_width * 2 + 1;
+        while (width >= s1) {
+            ++s1;
+            kroundup32(s1);
+            h_b = (int32_t*)realloc(h_b, s1 * sizeof(int32_t));
+            e_b = (int32_t*)realloc(e_b, s1 * sizeof(int32_t));
+            h_c = (int32_t*)realloc(h_c, s1 * sizeof(int32_t));
+        }
+        while (width_d * readLen * 3 >= s2) {
+            ++s2;
+            kroundup32(s2);
+            if (s2 < 0) {
+                /* The direction buffer size overflowed int32_t. */
+                fprintf(stderr, "Alignment score and position are not consensus.\n");
+                exit(1);
+            }
+            direction = (int8_t*)realloc(direction, s2 * sizeof(int8_t));
+        }
+        direction_line = direction;
+        for (j = 1; LIKELY(j < width - 1); j ++) h_b[j] = 0;
+        for (i = 0; LIKELY(i < readLen); i ++) {
+            int32_t beg = 0, end = refLen - 1, u = 0, edge;
+            j = i - band_width; beg = beg > j ? beg : j; // band start
+            j = i + band_width; end = end < j ? end : j; // band end
+            edge = end + 1 < width - 1 ? end + 1 : width - 1;
+            f = h_b[0] = e_b[0] = h_b[edge] = e_b[edge] = h_c[0] = 0;
+            direction_line = direction + width_d * i * 3;
+
+            for (j = beg; LIKELY(j <= end); j ++) {
+                int32_t b, e1, f1, d, de, df, dh;
+                set_u(u, band_width, i, j); set_u(e, band_width, i - 1, j);
+                set_u(b, band_width, i, j - 1); set_u(d, band_width, i - 1, j - 1);
+                set_d(de, band_width, i, j, 0);
+                set_d(df, band_width, i, j, 1);
+                set_d(dh, band_width, i, j, 2);
+
+                /* E: gap consuming a read residue; code 3 = opened from H, 2 = extended */
+                temp1 = i == 0 ? -weight_gapO : h_b[e] - weight_gapO;
+                temp2 = i == 0 ? -weight_gapE : e_b[e] - weight_gapE;
+                e_b[u] = temp1 > temp2 ? temp1 : temp2;
+                direction_line[de] = temp1 > temp2 ? 3 : 2;
+
+                /* F: gap consuming a reference residue; code 5 = opened from H, 4 = extended */
+                temp1 = h_c[b] - weight_gapO;
+                temp2 = f - weight_gapE;
+                f = temp1 > temp2 ? temp1 : temp2;
+                direction_line[df] = temp1 > temp2 ? 5 : 4;
+
+                e1 = e_b[u] > 0 ? e_b[u] : 0;
+                f1 = f > 0 ? f : 0;
+                temp1 = e1 > f1 ? e1 : f1;
+                temp2 = h_b[d] + mat[ref[j] * n + read[i]];
+                h_c[u] = temp1 > temp2 ? temp1 : temp2;
+
+                if (h_c[u] > max) max = h_c[u];
+
+                /* H: code 1 = diagonal step, otherwise inherit the E or F code */
+                if (temp1 <= temp2) direction_line[dh] = 1;
+                else direction_line[dh] = e1 > f1 ? direction_line[de] : direction_line[df];
+            }
+            for (j = 1; j <= u; j ++) h_b[j] = h_c[j];
+        }
+        band_width *= 2;
+    } while (LIKELY(max < score));
+    band_width /= 2;
+
+    // trace back
+    i = readLen - 1;
+    j = refLen - 1;
+    e = 0; // Count the number of M, D or I.
+    l = 0; // record length of current cigar
+    f = max = 0; // M
+    temp2 = 2; // h
+    while (LIKELY(i > 0)) {
+        set_d(temp1, band_width, i, j, temp2);
+        switch (direction_line[temp1]) {
+            case 1:
+                --i;
+                --j;
+                temp2 = 2;
+                direction_line -= width_d * 3;
+                f = 0; // M
+                break;
+            case 2:
+                --i;
+                temp2 = 0; // e
+                direction_line -= width_d * 3;
+                f = 1; // I
+                break;
+            case 3:
+                --i;
+                temp2 = 2;
+                direction_line -= width_d * 3;
+                f = 1; // I
+                break;
+            case 4:
+                --j;
+                temp2 = 1;
+                f = 2; // D
+                break;
+            case 5:
+                --j;
+                temp2 = 2;
+                f = 2; // D
+                break;
+            default:
+                /* Report the code that was actually switched on, then release
+                   everything allocated above before giving up. */
+                fprintf(stderr, "Trace back error: %d.\n", direction_line[temp1]);
+                free(direction);
+                free(h_c);
+                free(e_b);
+                free(h_b);
+                free(c);
+                free(result);
+                return 0;
+        }
+        if (f == max) ++e;
+        else {
+            /* Operation changed: flush the finished run (max holds its op). */
+            ++l;
+            while (l >= s) {
+                ++s;
+                kroundup32(s);
+                c = (uint32_t*)realloc(c, s * sizeof(uint32_t));
+            }
+            c[l - 1] = e<<4|max;
+            max = f;
+            e = 1;
+        }
+    }
+    /* Flush the last run and account for the cell at read position 0. */
+    if (f == 0) {
+        ++l;
+        while (l >= s) {
+            ++s;
+            kroundup32(s);
+            c = (uint32_t*)realloc(c, s * sizeof(uint32_t));
+        }
+        c[l - 1] = (e+1)<<4;
+    }else {
+        l += 2;
+        while (l >= s) {
+            ++s;
+            kroundup32(s);
+            c = (uint32_t*)realloc(c, s * sizeof(uint32_t));
+        }
+        c[l - 2] = e<<4|f;
+        c[l - 1] = 16; // 1M
+    }
+
+    // reverse cigar (it was collected from the alignment end backwards)
+    c1 = (uint32_t*)malloc(l * sizeof(uint32_t));
+    s = 0;
+    e = l - 1;
+    while (LIKELY(s <= e)) {
+        c1[s] = c[e];
+        c1[e] = c[s];
+        ++ s;
+        -- e;
+    }
+    result->seq = c1;
+    result->length = l;
+
+    free(direction);
+    free(h_c);
+    free(e_b);
+    free(h_b);
+    free(c);
+    return result;
+}
+
+/* Return a newly calloc'ed copy of seq[0..end] in reverse order; the caller frees it. */
+int8_t* seq_reverse(const int8_t* seq, int32_t end) /* end is 0-based alignment ending position */
+{
+    const int32_t len = end + 1;
+    int8_t* flipped = (int8_t*)calloc(len, sizeof(int8_t));
+    int32_t k;
+    for (k = 0; LIKELY(k < len); ++k) {
+        flipped[k] = seq[end - k];
+    }
+    return flipped;
+}
+
+/* Create the query profile for read.
+ * score_size 0 builds only the byte profile, 1 only the word profile and 2
+ * builds both. The read and mat pointers are stored in the profile, not
+ * copied. Release the result with init_destroy. */
+s_profile* ssw_init (const int8_t* read, const int32_t readLen, const int8_t* mat, const int32_t n, const int8_t score_size) {
+    const int want_byte = (score_size == 0 || score_size == 2);
+    const int want_word = (score_size == 1 || score_size == 2);
+    s_profile* prof = (s_profile*)calloc(1, sizeof(struct _profile));
+
+    prof->profile_byte = 0;
+    prof->profile_word = 0;
+    prof->bias = 0;
+
+    if (want_byte) {
+        /* The bias is the magnitude of the most negative matrix entry. */
+        const int32_t cells = n * n;
+        int32_t lowest = 0, k;
+        for (k = 0; k < cells; k++) {
+            if (mat[k] < lowest) lowest = mat[k];
+        }
+        lowest = abs(lowest);
+
+        prof->bias = lowest;
+        prof->profile_byte = qP_byte (read, mat, readLen, n, lowest);
+    }
+    if (want_word) {
+        prof->profile_word = qP_word (read, mat, readLen, n);
+    }
+
+    prof->read = read;
+    prof->mat = mat;
+    prof->readLen = readLen;
+    prof->n = n;
+    return prof;
+}
+
+/* Release a profile created by ssw_init: both query profiles and the
+ * structure itself. Passing a null pointer is a no-op, like free(). */
+void init_destroy (s_profile* p) {
+    if (p == 0) return;
+    free(p->profile_byte);
+    free(p->profile_word);
+    free(p);
+}
+
+/* Align the profiled read against ref.
+ *
+ * The forward scan yields the best score, its end positions and (when
+ * maskLen >= 15) the second-best score. If flag asks for it, a reverse scan
+ * from the end positions yields the begin positions, and banded_sw then
+ * yields the cigar.
+ *
+ * Returns a calloc'ed s_align to be released with align_destroy, or 0 when
+ * the profile is unusable (no profile built, or the byte profile overflowed
+ * at score 255 and no word profile exists) or when cigar generation fails.
+ * Nothing allocated here is left behind on those failure paths. */
+s_align* ssw_align (const s_profile* prof,
+                    const int8_t* ref,
+                    int32_t refLen,
+                    const uint8_t weight_gapO,
+                    const uint8_t weight_gapE,
+                    const uint8_t flag, // (from high to low) bit 5: return the best alignment beginning position; 6: if (ref_end1 - ref_begin1 <= filterd) && (read_end1 - read_begin1 <= filterd), return cigar; 7: if max score >= filters, return cigar; 8: always return cigar; if 6 & 7 are both setted, only return cigar when both filter fulfilled
+                    const uint16_t filters,
+                    const int32_t filterd,
+                    const int32_t maskLen) {
+
+    alignment_end* bests = 0, *bests_reverse = 0;
+    __m128i* vP = 0;
+    int32_t word = 0, band_width = 0, readLen = prof->readLen;
+    int8_t* read_reverse = 0;
+    cigar* path;
+    s_align* r = (s_align*)calloc(1, sizeof(s_align));
+    r->ref_begin1 = -1;
+    r->read_begin1 = -1;
+    r->cigar = 0;
+    r->cigarLen = 0;
+    if (maskLen < 15) {
+        fprintf(stderr, "When maskLen < 15, the function ssw_align doesn't return 2nd best alignment information.\n");
+    }
+
+    // Find the alignment scores and ending positions
+    if (prof->profile_byte) {
+        bests = sw_sse2_byte(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_byte, -1, prof->bias, maskLen);
+        if (prof->profile_word && bests[0].score == 255) {
+            /* The 8-bit scan saturated: redo it with 16-bit scores. */
+            free(bests);
+            bests = sw_sse2_word(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_word, -1, maskLen);
+            word = 1;
+        } else if (bests[0].score == 255) {
+            fprintf(stderr, "Please set 2 to the score_size parameter of the function ssw_init, otherwise the alignment results will be incorrect.\n");
+            free(bests);
+            free(r);
+            return 0;
+        }
+    }else if (prof->profile_word) {
+        bests = sw_sse2_word(ref, 0, refLen, readLen, weight_gapO, weight_gapE, prof->profile_word, -1, maskLen);
+        word = 1;
+    }else {
+        fprintf(stderr, "Please call the function ssw_init before ssw_align.\n");
+        free(r);
+        return 0;
+    }
+    r->score1 = bests[0].score;
+    r->ref_end1 = bests[0].ref;
+    r->read_end1 = bests[0].read;
+    if (maskLen >= 15) {
+        r->score2 = bests[1].score;
+        r->ref_end2 = bests[1].ref;
+    } else {
+        r->score2 = 0;
+        r->ref_end2 = -1;
+    }
+    free(bests);
+    if (flag == 0 || (flag == 2 && r->score1 < filters)) goto end;
+
+    // Find the beginning position of the best alignment.
+    // Both sequences are scanned backwards from their end positions until
+    // the known best score (the terminate argument) is reached again.
+    read_reverse = seq_reverse(prof->read, r->read_end1);
+    if (word == 0) {
+        vP = qP_byte(read_reverse, prof->mat, r->read_end1 + 1, prof->n, prof->bias);
+        bests_reverse = sw_sse2_byte(ref, 1, r->ref_end1 + 1, r->read_end1 + 1, weight_gapO, weight_gapE, vP, r->score1, prof->bias, maskLen);
+    } else {
+        vP = qP_word(read_reverse, prof->mat, r->read_end1 + 1, prof->n);
+        bests_reverse = sw_sse2_word(ref, 1, r->ref_end1 + 1, r->read_end1 + 1, weight_gapO, weight_gapE, vP, r->score1, maskLen);
+    }
+    free(vP);
+    free(read_reverse);
+    r->ref_begin1 = bests_reverse[0].ref;
+    r->read_begin1 = r->read_end1 - bests_reverse[0].read;
+    free(bests_reverse);
+    if ((7&flag) == 0 || ((2&flag) != 0 && r->score1 < filters) || ((4&flag) != 0 && (r->ref_end1 - r->ref_begin1 > filterd || r->read_end1 - r->read_begin1 > filterd))) goto end;
+
+    // Generate cigar.
+    refLen = r->ref_end1 - r->ref_begin1 + 1;
+    readLen = r->read_end1 - r->read_begin1 + 1;
+    band_width = abs(refLen - readLen) + 1;
+    path = banded_sw(ref + r->ref_begin1, prof->read + r->read_begin1, refLen, readLen, r->score1, weight_gapO, weight_gapE, band_width, prof->mat, prof->n);
+    if (path == 0) {
+        free(r);
+        r = 0;
+    } else {
+        /* Take ownership of the cigar array; only the wrapper is freed. */
+        r->cigar = path->seq;
+        r->cigarLen = path->length;
+        free(path);
+    }
+
+end:
+    return r;
+}
+
+/* Release an alignment returned by ssw_align, including its cigar array.
+ * Passing a null pointer (ssw_align returns 0 on failure) is a no-op. */
+void align_destroy (s_align* a) {
+    if (a == 0) return;
+    free(a->cigar);
+    free(a);
+}
diff --git a/src/ssw.h b/src/ssw.h
new file mode 100644
index 0000000..3cb45c8
--- /dev/null
+++ b/src/ssw.h
@@ -0,0 +1,129 @@
+/*
+ * ssw.h
+ *
+ * Created by Mengyao Zhao on 6/22/10.
+ * Copyright 2010 Boston College. All rights reserved.
+ * Version 0.1.4
+ * Last revision by Mengyao Zhao on 07/31/12.
+ *
+ */
+
+#ifndef SSW_H
+#define SSW_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <emmintrin.h>
+
+/*! @typedef structure of the query profile */
+struct _profile;
+typedef struct _profile s_profile;
+
+/*! @typedef structure of the alignment result
+ @field score1 the best alignment score
+ @field score2 sub-optimal alignment score; 0 when ssw_align is called with maskLen < 15
+ @field ref_begin1 0-based best alignment beginning position on reference; ref_begin1 = -1 when the best alignment beginning
+ position is not available
+ @field ref_end1 0-based best alignment ending position on reference
+ @field read_begin1 0-based best alignment beginning position on read; read_begin1 = -1 when the best alignment beginning
+ position is not available
+ @field read_end1 0-based best alignment ending position on read
+ @field ref_end2 0-based sub-optimal alignment ending position on reference; ref_end2 = -1 when ssw_align is called
+ with maskLen < 15
+ @field cigar best alignment cigar; stored the same as that in BAM format, high 28 bits: length, low 4 bits: M/I/D (0/1/2);
+ cigar = 0 when the best alignment path is not available
+ @field cigarLen length of the cigar string; cigarLen = 0 when the best alignment path is not available
+*/
+typedef struct {
+ uint16_t score1;
+ uint16_t score2;
+ int32_t ref_begin1;
+ int32_t ref_end1;
+ int32_t read_begin1;
+ int32_t read_end1;
+ int32_t ref_end2;
+ uint32_t* cigar;
+ int32_t cigarLen;
+} s_align;
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*! @function Create the query profile using the query sequence.
+ @param read pointer to the query sequence; the query sequence needs to be numbers, each one a row/column index into mat
+ (0 <= value < n); the pointer is stored in the profile, so the sequence must stay valid while the profile is used
+ @param readLen length of the query sequence
+ @param mat pointer to the substitution matrix; mat needs to be corresponding to the read sequence; the pointer is stored
+ in the profile, so the matrix must stay valid while the profile is used
+ @param n the square root of the number of elements in mat (mat has n*n elements)
+ @param score_size estimated Smith-Waterman score; if your estimated best alignment score is surely < 255 please set 0
+ (only the 8-bit profile is built); if your estimated best alignment score >= 255, please set 1 (only the 16-bit
+ profile is built); if you don't know, please set 2 (both profiles are built)
+ @return pointer to the query profile structure; release it with init_destroy
+ @note example for parameter read and mat:
+ If the query sequence is: ACGTATC, with A/C/G/T numbered 0/1/2/3 the sequence that read points to is: 0123031
+ Then if the score for a match is 2 and for a mismatch is -2, the substitution matrix of parameter mat will be:
+ //A C G T
+ 2 -2 -2 -2 //A
+ -2 2 -2 -2 //C
+ -2 -2 2 -2 //G
+ -2 -2 -2 2 //T
+ mat is the pointer to the array {2, -2, -2, -2, -2, 2, -2, -2, -2, -2, 2, -2, -2, -2, -2, 2}
+*/
+s_profile* ssw_init (const int8_t* read, const int32_t readLen, const int8_t* mat, const int32_t n, const int8_t score_size);
+
+/*! @function Release the memory allocated by function ssw_init: the query profiles and the profile structure itself.
+ @param p pointer to the query profile structure
+*/
+void init_destroy (s_profile* p);
+
+// @function ssw alignment.
+/*! @function Do Striped Smith-Waterman alignment.
+ @param prof pointer to the query profile structure
+ @param ref pointer to the target sequence; the target sequence needs to be numbers and corresponding to the mat parameter of
+ function ssw_init
+ @param refLen length of the target sequence
+ @param weight_gapO the absolute value of gap open penalty
+ @param weight_gapE the absolute value of gap extension penalty
+ @param flag bitwise FLAG; (from high to low) bit 5 (0x08): when set to 1, function ssw_align will return the best alignment
+ beginning position; bit 6 (0x04): when set to 1, if (ref_end1 - ref_begin1 <= filterd && read_end1 - read_begin1
+ <= filterd), (whatever bit 5 is set to) the function will return the best alignment beginning position and
+ cigar; bit 7 (0x02): when set to 1, if the best alignment score >= filters, (whatever bit 5 is set to) the function
+ will return the best alignment beginning position and cigar; bit 8 (0x01): when set to 1 and neither bit 6 nor
+ bit 7 is set, the function will always return the best alignment beginning position and cigar
+ @param filters score filter: when bit 7 of flag is set to 1, filters will be used (Please check the
+ description of the flag parameter for detailed usage.)
+ @param filterd distance filter: when bit 6 of flag is set to 1, filterd will be used (Please check
+ the description of the flag parameter for detailed usage.)
+ @param maskLen The distance between the optimal and suboptimal alignment ending position >= maskLen. We suggest to use
+ readLen/2, if you don't have special concerns. Note: maskLen has to be >= 15, otherwise this function will NOT
+ return the suboptimal alignment information. Detailed description of maskLen: After locating the optimal
+ alignment ending position, the suboptimal alignment score can be heuristically found by checking the second
+ largest score in the array that contains the maximal score of each column of the SW matrix. In order to avoid
+ picking the scores that belong to the alignments sharing the partial best alignment, SSW C library masks the
+ reference loci nearby (mask length = maskLen) the best alignment ending position and locates the second largest
+ score from the unmasked elements.
+ @return pointer to the alignment result structure, to be released with align_destroy; 0 when the profile holds no usable
+ query profile or when the cigar could not be generated
+ @note Whatever the parameter flag is set to, this function will at least return the optimal and sub-optimal alignment score,
+ and the optimal alignment ending positions on target and query sequences. If both bit 6 and 7 of the flag are set,
+ the function will return cigar only when both criteria are fulfilled. All returned positions are
+ 0-based coordinate.
+*/
+s_align* ssw_align (const s_profile* prof,
+ const int8_t* ref,
+ int32_t refLen,
+ const uint8_t weight_gapO,
+ const uint8_t weight_gapE,
+ const uint8_t flag,
+ const uint16_t filters,
+ const int32_t filterd,
+ const int32_t maskLen);
+
+/*! @function Release the memory allocated by function ssw_align: the cigar array and the result structure itself.
+ @param a pointer to the alignment result structure
+*/
+void align_destroy (s_align* a);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // SSW_H
diff --git a/src/ssw_cpp.cpp b/src/ssw_cpp.cpp
new file mode 100644
index 0000000..ea260de
--- /dev/null
+++ b/src/ssw_cpp.cpp
@@ -0,0 +1,399 @@
+#include "ssw_cpp.h"
+
+#include <sstream>
+
+extern "C" {
+#include "ssw.h"
+}
+
+namespace {
+
+static int8_t kBaseTranslation[128] = { // indexed by ASCII code; value is the base index passed to the aligner (codes not called out below map to 4)
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ // codes 64-79: 'A' (65) -> 0, 'C' (67) -> 1, 'G' (71) -> 2
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ // codes 80-95: 'T' (84) -> 3; NOTE(review): 'U' (85) maps to 0, the same index as 'A' -- confirm this is intended
+ 4, 4, 4, 4, 3, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ // codes 96-111: 'a' (97) -> 0, 'c' (99) -> 1, 'g' (103) -> 2
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ // codes 112-127: 't' (116) -> 3; 'u' (117) -> 0, mirroring the upper-case row
+ 4, 4, 4, 4, 3, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+};
+
+// Fills the 25-entry (5 x 5, row-major) score matrix over A, C, G, T, N:
+// match_score on the A/C/G/T diagonal, -mismatch_penalty for every other
+// A/C/G/T pair, and 0 for anything involving N. For match 2 / mismatch 2:
+//    //  A,  C,  G,  T,  N
+//        2, -2, -2, -2,  0,  // A
+//       -2,  2, -2, -2,  0,  // C
+//       -2, -2,  2, -2,  0,  // G
+//       -2, -2, -2,  2,  0,  // T
+//        0,  0,  0,  0,  0   // N
+void BuildSwScoreMatrix(const uint8_t& match_score,
+                        const uint8_t& mismatch_penalty,
+                        int8_t* matrix) {
+  const int kSide = 5;   // A, C, G, T plus N
+  const int kBases = 4;  // A, C, G, T
+  const int8_t hit = static_cast<int8_t>(match_score);
+  const int8_t miss = static_cast<int8_t>(-mismatch_penalty);
+
+  for (int row = 0; row < kSide; ++row) {
+    for (int col = 0; col < kSide; ++col) {
+      int8_t cell = 0;  // any pairing that involves N scores 0
+      if (row < kBases && col < kBases) {
+        cell = (row == col) ? hit : miss;
+      }
+      matrix[row * kSide + col] = cell;
+    }
+  }
+}
+
+// Copies a C-level s_align result into the C++ Alignment structure.
+// When the result carries a cigar, the copied cigar is wrapped in soft clips
+// (op 4, 'S') for the query bases before query_begin and after query_end,
+// and the textual cigar string is built alongside.
+void ConvertAlignment(const s_align& s_al,
+                      const int& query_len,
+                      StripedSmithWaterman::Alignment* al) {
+  al->sw_score = s_al.score1;
+  al->sw_score_next_best = s_al.score2;
+  al->ref_begin = s_al.ref_begin1;
+  al->ref_end = s_al.ref_end1;
+  al->query_begin = s_al.read_begin1;
+  al->query_end = s_al.read_end1;
+  al->ref_end_next_best = s_al.ref_end2;
+
+  al->cigar.clear();
+  al->cigar_string.clear();
+
+  // No cigar available: leave both representations empty.
+  if (s_al.cigarLen <= 0) return;
+
+  std::ostringstream text;
+
+  // Leading soft clip.
+  if (al->query_begin > 0) {
+    const uint32_t head_clip = (al->query_begin << 4) | 0x0004;
+    al->cigar.push_back(head_clip);
+    text << al->query_begin << 'S';
+  }
+
+  // Aligned part, copied verbatim.
+  static const char kOpLetters[] = { 'M', 'I', 'D' };
+  for (int k = 0; k < s_al.cigarLen; ++k) {
+    const uint32_t packed = s_al.cigar[k];
+    al->cigar.push_back(packed);
+    text << (packed >> 4);
+    const uint8_t op = packed & 0x000f;
+    if (op <= 2) text << kOpLetters[op];
+  }
+
+  // Trailing soft clip.
+  const int tail = query_len - al->query_end - 1;
+  if (tail > 0) {
+    const uint32_t tail_clip = (tail << 4) | 0x0004;
+    al->cigar.push_back(tail_clip);
+    text << tail << 'S';
+  }
+
+  al->cigar_string = text.str();
+}
+
+// Counts the edit operations of an alignment: every aligned (M) position
+// whose translated reference and query codes differ, plus the full length of
+// every insertion and deletion. Other cigar operations (the soft clips added
+// by ConvertAlignment) are skipped; the leading clip is already accounted for
+// by starting the query at al.query_begin.
+//
+// ref and query are the translated sequences (base indices, not characters).
+// matrix is kept for interface compatibility and is not consulted: looking
+// the codes up in the first row of the score matrix treated any two bases
+// with equal scores against base 0 (for example C and G) as identical.
+int CalculateNumberMismatch(
+    const StripedSmithWaterman::Alignment& al,
+    const int8_t* matrix,
+    int8_t const *ref,
+    int8_t const *query) {
+
+  (void) matrix;
+
+  ref += al.ref_begin;
+  query += al.query_begin;
+  int mismatch_length = 0;
+  for (unsigned int i = 0; i < al.cigar.size(); ++i) {
+    int32_t op = al.cigar[i] & 0x0000000f;
+    int32_t length = (al.cigar[i] >> 4) & 0x0fffffff;
+    if (op == 0) { // M
+      for (int j = 0; j < length; ++j) {
+        if (*ref != *query) ++mismatch_length;
+        ++ref;
+        ++query;
+      }
+    } else if (op == 1) { // I
+      query += length;
+      mismatch_length += length;
+    } else if (op == 2) { // D
+      ref += length;
+      mismatch_length += length;
+    }
+  }
+
+  return mismatch_length;
+}
+
+// Translates the Filter switches into the bitwise flag taken by ssw_align:
+// 0x08 requests the begin position, 0x0f additionally requests the cigar
+// with both the score filter and the distance filter enabled.
+void SetFlag(const StripedSmithWaterman::Filter& filter, uint8_t* flag) {
+  uint8_t bits = *flag;
+  if (filter.report_begin_position) {
+    bits |= 0x08;
+  }
+  if (filter.report_cigar) {
+    bits |= 0x0f;
+  }
+  *flag = bits;
+}
+
+} // namespace
+
+
+
+namespace StripedSmithWaterman {
+
+Aligner::Aligner(void) // Default aligner: match 2, mismatch 2, gap opening 3, gap extending 1
+ : score_matrix_(NULL)
+ , score_matrix_size_(5) // 5 x 5, the size BuildSwScoreMatrix fills (A, C, G, T, N)
+ , translation_matrix_(NULL)
+ , default_matrix_(false) // becomes true in BuildDefaultMatrix()
+ , matrix_built_(false) // becomes true in BuildDefaultMatrix()
+ , match_score_(2)
+ , mismatch_penalty_(2)
+ , gap_opening_penalty_(3)
+ , gap_extending_penalty_(1)
+ , translated_reference_(NULL)
+ , reference_length_(0)
+{
+ BuildDefaultMatrix(); // allocates score_matrix_ and points translation_matrix_ at kBaseTranslation
+}
+
+Aligner::Aligner( // Aligner over A/C/G/T/N with caller-chosen scores and gap penalties
+ const uint8_t& match_score,
+ const uint8_t& mismatch_penalty,
+ const uint8_t& gap_opening_penalty,
+ const uint8_t& gap_extending_penalty)
+
+ : score_matrix_(NULL)
+ , score_matrix_size_(5) // 5 x 5, the size BuildSwScoreMatrix fills (A, C, G, T, N)
+ , translation_matrix_(NULL)
+ , default_matrix_(false) // becomes true in BuildDefaultMatrix()
+ , matrix_built_(false) // becomes true in BuildDefaultMatrix()
+ , match_score_(match_score)
+ , mismatch_penalty_(mismatch_penalty)
+ , gap_opening_penalty_(gap_opening_penalty)
+ , gap_extending_penalty_(gap_extending_penalty)
+ , translated_reference_(NULL)
+ , reference_length_(0)
+{
+ BuildDefaultMatrix(); // builds the score matrix from match_score_ / mismatch_penalty_ set above
+}
+
+// Builds an aligner from caller-supplied matrices. Both matrices are copied,
+// so the caller keeps ownership of its arguments.
+//   score_matrix            score_matrix_size x score_matrix_size scores, row-major
+//   translation_matrix      table indexed by character code giving the row/column
+//                           index into score_matrix
+//   translation_matrix_size number of entries in translation_matrix
+// default_matrix_ is left false: Clear() only skips deleting translation_matrix_
+// when it points at the built-in static table, and here it is a heap copy that
+// must be released.
+Aligner::Aligner(const int8_t* score_matrix,
+                 const int& score_matrix_size,
+                 const int8_t* translation_matrix,
+                 const int& translation_matrix_size)
+
+  : score_matrix_(NULL)
+  , score_matrix_size_(score_matrix_size)
+  , translation_matrix_(NULL)
+  , default_matrix_(false)
+  , matrix_built_(false)
+  , match_score_(2)
+  , mismatch_penalty_(2)
+  , gap_opening_penalty_(3)
+  , gap_extending_penalty_(1)
+  , translated_reference_(NULL)
+  , reference_length_(0)
+{
+  score_matrix_ = new int8_t[score_matrix_size_ * score_matrix_size_];
+  memcpy(score_matrix_, score_matrix, sizeof(int8_t) * score_matrix_size_ * score_matrix_size_);
+  translation_matrix_ = new int8_t[translation_matrix_size];
+  memcpy(translation_matrix_, translation_matrix, sizeof(int8_t) * translation_matrix_size);
+  matrix_built_ = true;
+}
+
+
+Aligner::~Aligner(void){ // all clean-up is delegated to Clear()
+ Clear();
+}
+
+// Stores a translated copy of the reference for later Align() calls.
+// At most `length` characters are used, fewer when seq ends earlier.
+// Returns the number of bases stored; when no matrix is built nothing is
+// translated and the stored length is reset to 0.
+int Aligner::SetReferenceSequence(const char* seq, const int& length) {
+  if (!matrix_built_) {
+    reference_length_ = 0;
+    return 0;
+  }
+
+  // Never read past the terminating NUL of seq.
+  const int actual_length = static_cast<int>(strlen(seq));
+  const int usable_length = (actual_length > length) ? length : actual_length;
+
+  // Replace whatever reference was stored before.
+  CleanReferenceSequence();
+  translated_reference_ = new int8_t[usable_length];
+
+  const int stored = TranslateBase(seq, usable_length, translated_reference_);
+  reference_length_ = stored;
+  return stored;
+}
+
+// Maps `length` characters of bases through translation_matrix_ into
+// translated and returns how many were written.
+// NOTE(review): the character value is used directly as the table index, so
+// a char outside the table's range would index out of bounds -- confirm that
+// callers only pass characters the table covers.
+int Aligner::TranslateBase(const char* bases, const int& length,
+                           int8_t* translated) const {
+
+  int written = 0;
+  while (written < length) {
+    translated[written] = translation_matrix_[static_cast<int>(bases[written])];
+    ++written;
+  }
+
+  return written;
+}
+
+
+// Aligns query against the reference stored by SetReferenceSequence().
+// Returns false, leaving *alignment untouched, when no matrix is built, no
+// reference is stored or the query is empty. Returns false with *alignment
+// cleared when the underlying ssw_align call fails. On success *alignment
+// holds the result and true is returned.
+bool Aligner::Align(const char* query, const Filter& filter,
+                    Alignment* alignment) const
+{
+  if (!matrix_built_) return false;
+  if (reference_length_ == 0) return false;
+
+  int query_len = strlen(query);
+  if (query_len == 0) return false;
+  int8_t* translated_query = new int8_t[query_len];
+  TranslateBase(query, query_len, translated_query);
+
+  // score_size 2: build both the 8-bit and the 16-bit profile.
+  const int8_t score_size = 2;
+  s_profile* profile = ssw_init(translated_query, query_len, score_matrix_,
+                                score_matrix_size_, score_size);
+
+  uint8_t flag = 0;
+  SetFlag(filter, &flag);
+  // The query length is passed as maskLen.
+  s_align* s_al = ssw_align(profile, translated_reference_, reference_length_,
+                            static_cast<int>(gap_opening_penalty_),
+                            static_cast<int>(gap_extending_penalty_),
+                            flag, filter.score_filter, filter.distance_filter, query_len);
+
+  alignment->Clear();
+  // ssw_align returns a null pointer on failure; do not dereference it.
+  const bool aligned = (s_al != NULL);
+  if (aligned) {
+    ConvertAlignment(*s_al, query_len, alignment);
+    alignment->mismatches = CalculateNumberMismatch(*alignment, score_matrix_, translated_reference_, translated_query);
+    align_destroy(s_al);
+  }
+
+  // Free memory. The buffer came from new[], so it needs delete[] whatever
+  // its length.
+  delete [] translated_query;
+  init_destroy(profile);
+
+  return aligned;
+}
+
+
+// Aligns query against ref without touching the stored reference.
+// At most ref_len characters of ref are used, fewer when ref ends earlier.
+// Returns false, leaving *alignment untouched, when no matrix is built or
+// when the query or the usable reference is empty (the same rule the
+// stored-reference overload applies). Returns false with *alignment cleared
+// when the underlying ssw_align call fails. On success *alignment holds the
+// result and true is returned.
+bool Aligner::Align(const char* query, const char* ref, const int& ref_len,
+                    const Filter& filter, Alignment* alignment) const
+{
+  if (!matrix_built_) return false;
+
+  int query_len = strlen(query);
+  if (query_len == 0) return false;
+
+  // calculate the valid length
+  int calculated_ref_length = static_cast<int>(strlen(ref));
+  int valid_ref_len = (calculated_ref_length > ref_len)
+                      ? ref_len : calculated_ref_length;
+  // With an empty reference ssw_align would still read ref[0] while looking
+  // for the alignment start, so refuse it up front.
+  if (valid_ref_len <= 0) return false;
+
+  int8_t* translated_query = new int8_t[query_len];
+  TranslateBase(query, query_len, translated_query);
+
+  int8_t* translated_ref = new int8_t[valid_ref_len];
+  TranslateBase(ref, valid_ref_len, translated_ref);
+
+  // score_size 2: build both the 8-bit and the 16-bit profile.
+  const int8_t score_size = 2;
+  s_profile* profile = ssw_init(translated_query, query_len, score_matrix_,
+                                score_matrix_size_, score_size);
+
+  uint8_t flag = 0;
+  SetFlag(filter, &flag);
+  // The query length is passed as maskLen.
+  s_align* s_al = ssw_align(profile, translated_ref, valid_ref_len,
+                            static_cast<int>(gap_opening_penalty_),
+                            static_cast<int>(gap_extending_penalty_),
+                            flag, filter.score_filter, filter.distance_filter, query_len);
+
+  alignment->Clear();
+  // ssw_align returns a null pointer on failure; do not dereference it.
+  const bool aligned = (s_al != NULL);
+  if (aligned) {
+    ConvertAlignment(*s_al, query_len, alignment);
+    alignment->mismatches = CalculateNumberMismatch(*alignment, score_matrix_, translated_ref, translated_query);
+    align_destroy(s_al);
+  }
+
+  // Free memory. Both buffers came from new[], so they need delete[]
+  // whatever their length.
+  delete [] translated_query;
+  delete [] translated_ref;
+  init_destroy(profile);
+
+  return aligned;
+}
+
+// Releases the score matrix, the translation matrix (unless default_matrix_
+// marks it as the built-in table) and the stored reference, then marks the
+// aligner as not built.
+void Aligner::Clear(void) {
+  delete [] score_matrix_;  // deleting a null pointer is a no-op
+  score_matrix_ = NULL;
+
+  const bool owns_translation = !default_matrix_;
+  if (owns_translation) {
+    delete [] translation_matrix_;
+  }
+  translation_matrix_ = NULL;
+
+  CleanReferenceSequence();
+
+  matrix_built_ = false;
+  default_matrix_ = false;
+}
+
+// Resets the scalar settings to their defaults (5 x 5 matrix, match 2,
+// mismatch 2, gap opening 3, gap extending 1) and marks the aligner as
+// holding neither a built matrix nor a reference. No memory is released here.
+void Aligner::SetAllDefault(void) {
+  // Scoring scheme.
+  match_score_ = 2;
+  mismatch_penalty_ = 2;
+  gap_opening_penalty_ = 3;
+  gap_extending_penalty_ = 1;
+
+  // Matrix and reference state.
+  score_matrix_size_ = 5;
+  matrix_built_ = false;
+  default_matrix_ = false;
+  reference_length_ = 0;
+}
+
+// Rebuilds the aligner with the default scores. Does nothing and returns
+// false while a matrix is still built; returns true after rebuilding.
+bool Aligner::ReBuild(void) {
+  const bool can_rebuild = !matrix_built_;
+  if (can_rebuild) {
+    SetAllDefault();
+    BuildDefaultMatrix();
+  }
+  return can_rebuild;
+}
+
+// Rebuilds the aligner over A/C/G/T/N with the given scores and gap
+// penalties. Does nothing and returns false while a matrix is still built;
+// returns true after rebuilding.
+bool Aligner::ReBuild(
+    const uint8_t& match_score,
+    const uint8_t& mismatch_penalty,
+    const uint8_t& gap_opening_penalty,
+    const uint8_t& gap_extending_penalty) {
+  const bool can_rebuild = !matrix_built_;
+  if (can_rebuild) {
+    SetAllDefault();
+
+    // Override the defaults before the matrix is generated from them.
+    gap_extending_penalty_ = gap_extending_penalty;
+    gap_opening_penalty_ = gap_opening_penalty;
+    mismatch_penalty_ = mismatch_penalty;
+    match_score_ = match_score;
+
+    BuildDefaultMatrix();
+  }
+  return can_rebuild;
+}
+
+// Rebuilds the aligner from caller-supplied matrices; both are copied.
+//   score_matrix            score_matrix_size x score_matrix_size scores, row-major
+//   translation_matrix      table indexed by character code giving the row/column
+//                           index into score_matrix
+//   translation_matrix_size number of entries in translation_matrix
+// The copies are sized from the score_matrix_size argument (which also
+// becomes score_matrix_size_), not from whatever size the aligner held
+// before. Matrices the aligner already owns are released; the built-in
+// translation table is static and is never deleted. Always returns true.
+bool Aligner::ReBuild(
+    const int8_t* score_matrix,
+    const int& score_matrix_size,
+    const int8_t* translation_matrix,
+    const int& translation_matrix_size) {
+
+  // Copy the inputs first so the current state stays intact until both
+  // copies exist.
+  const int cells = score_matrix_size * score_matrix_size;
+  int8_t* new_scores = new int8_t[cells];
+  memcpy(new_scores, score_matrix, sizeof(int8_t) * cells);
+  int8_t* new_translation = new int8_t[translation_matrix_size];
+  memcpy(new_translation, translation_matrix, sizeof(int8_t) * translation_matrix_size);
+
+  // Release what was held before (deleting a null pointer is a no-op).
+  delete [] score_matrix_;
+  if (!default_matrix_) delete [] translation_matrix_;
+
+  score_matrix_ = new_scores;
+  score_matrix_size_ = score_matrix_size;
+  translation_matrix_ = new_translation;
+  default_matrix_ = false;  // translation_matrix_ is now an owned heap copy
+  matrix_built_ = true;
+
+  return true;
+}
+
+// Allocates the score matrix, fills it from match_score_ and
+// mismatch_penalty_, and selects the built-in translation table.
+void Aligner::BuildDefaultMatrix(void) {
+  int8_t* scores = new int8_t[score_matrix_size_ * score_matrix_size_];
+  BuildSwScoreMatrix(match_score_, mismatch_penalty_, scores);
+  score_matrix_ = scores;
+
+  // The translation table is static; default_matrix_ records that it must
+  // not be deleted.
+  translation_matrix_ = kBaseTranslation;
+  default_matrix_ = true;
+  matrix_built_ = true;
+}
+} // namespace StripedSmithWaterman
diff --git a/src/ssw_cpp.h b/src/ssw_cpp.h
new file mode 100644
index 0000000..fb10f4f
--- /dev/null
+++ b/src/ssw_cpp.h
@@ -0,0 +1,216 @@
+#ifndef COMPLETE_STRIPED_SMITH_WATERMAN_CPP_H_
+#define COMPLETE_STRIPED_SMITH_WATERMAN_CPP_H_
+
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+namespace StripedSmithWaterman {
+
+// Result of one alignment. Clear() resets every field to zero/empty.
+struct Alignment {
+  uint16_t sw_score;            // Score of the best alignment
+  uint16_t sw_score_next_best;  // Score of the next-best alignment
+  int32_t ref_begin;            // Best alignment: begin position on the reference
+  int32_t ref_end;              // Best alignment: end position on the reference
+  int32_t query_begin;          // Best alignment: begin position on the query
+  int32_t query_end;            // Best alignment: end position on the query
+  int32_t ref_end_next_best;    // Next-best alignment: end position on the reference
+  int32_t mismatches;           // Number of mismatches in the alignment
+  std::string cigar_string;     // Cigar of the best alignment, as text
+  std::vector<uint32_t> cigar;  // Cigar of the best alignment, BAM-encoded:
+                                //   high 28 bits: length
+                                //   low 4 bits: M/I/D/S/X (0/1/2/4/8)
+
+  // Zeroes all numeric fields and empties both cigar containers.
+  void Clear() {
+    sw_score = sw_score_next_best = 0;
+    ref_begin = ref_end = 0;
+    query_begin = query_end = 0;
+    ref_end_next_best = 0;
+    mismatches = 0;
+    cigar.clear();
+    cigar_string.clear();
+  }
+};
+
+// Selects which optional parts of an Alignment are computed.
+struct Filter {
+  // sw_score, sw_score_next_best, ref_end, query_end and ref_end_next_best
+  // are always reported, whatever the filter settings.
+
+  bool report_begin_position;  // Report ref_begin and query_begin;
+                               // when unset, both are -1.
+  bool report_cigar;           // Report cigar_string and cigar;
+                               // this implies report_begin_position.
+
+  // The cigar is produced only when report_cigar is set and the alignment
+  // passes both filters below.
+  uint16_t score_filter;       // requires score >= score_filter
+  uint16_t distance_filter;    // requires (ref_end - ref_begin) < distance_filter
+                               // and (query_end - query_begin) < distance_filter
+
+  // Defaults: report everything, accept every score, distance limit 32767.
+  Filter() :
+      report_begin_position(true),
+      report_cigar(true),
+      score_filter(0),
+      distance_filter(32767) {
+  }
+};
+
+// Smith-Waterman aligner: holds a score matrix, a base translation table and
+// an optional pre-translated reference sequence. Copying is disabled (the copy
+// constructor and assignment operator are declared private).
+class Aligner {
+ public:
+ // =========
+ // @function Construct an Aligner on default values.
+ // The function will build the {A,C,G,T,N} aligner.
+ // If you target other alphabets, please use the
+ // matrix-based constructor and pass the matrices in.
+ // =========
+ Aligner(void);
+
+ // =========
+ // @function Construct an Aligner by assigning scores.
+ // The function will build the {A,C,G,T,N} aligner.
+ // If you target other alphabets, please use the
+ // matrix-based constructor and pass the matrices in.
+ // =========
+ Aligner(const uint8_t& match_score,
+ const uint8_t& mismatch_penalty,
+ const uint8_t& gap_opening_penalty,
+ const uint8_t& gap_extending_penalty);
+
+ // =========
+ // @function Construct an Aligner from the given matrices.
+ // =========
+ Aligner(const int8_t* score_matrix,
+ const int& score_matrix_size,
+ const int8_t* translation_matrix,
+ const int& translation_matrix_size);
+
+ ~Aligner(void);
+
+ // =========
+ // @function Build the reference sequence so that the
+ // Align(query, filter, alignment) overload can be used;
+ // otherwise the reference must be given when aligning.
+ // [NOTICE] If a sequence already exists, it is deleted
+ // and replaced.
+ // @param seq The reference bases;
+ // [NOTICE] It is not necessarily null terminated.
+ // @param length The number of bases to build.
+ // @return The length of the built bases.
+ // =========
+ int SetReferenceSequence(const char* seq, const int& length);
+
+ // Releases the reference built by SetReferenceSequence, if any.
+ void CleanReferenceSequence(void);
+
+ // =========
+ // @function Set penalties for opening and extending gaps
+ // [NOTICE] The defaults are 3 and 1 respectively.
+ // =========
+ void SetGapPenalty(const uint8_t& opening, const uint8_t& extending) {
+ gap_opening_penalty_ = opening;
+ gap_extending_penalty_ = extending;
+ };
+
+ // =========
+ // @function Set the match score and the mismatch penalty
+ // [NOTICE] The defaults are 2 and 2 respectively.
+ // [NOTICE] Only the two members are assigned here; the
+ // score matrix itself is not rebuilt by this call.
+ // =========
+ void SetMismatchPenalty(const uint8_t& match, const uint8_t& mismatch) {
+ match_score_ = match;
+ mismatch_penalty_ = mismatch;
+ };
+
+ // =========
+ // @function Align the query against the reference that is set by
+ // SetReferenceSequence.
+ // @param query The query sequence.
+ // @param filter The filter for the alignment.
+ // @param alignment The container that receives the result.
+ // @return True: succeed; false: fail.
+ // =========
+ bool Align(const char* query, const Filter& filter, Alignment* alignment) const;
+
+ // =========
+ // @function Align the query against the given reference.
+ // [NOTICE] The reference won't replace the reference
+ // set by SetReferenceSequence.
+ // @param query The query sequence.
+ // @param ref The reference sequence.
+ // [NOTICE] It is not necessarily null terminated.
+ // @param ref_len The length of the reference sequence.
+ // @param filter The filter for the alignment.
+ // @param alignment The container that receives the result.
+ // @return True: succeed; false: fail.
+ // =========
+ bool Align(const char* query, const char* ref, const int& ref_len,
+ const Filter& filter, Alignment* alignment) const;
+
+ // @function Clear up all containers and thus the aligner is disabled.
+ // To rebuild the aligner please use the ReBuild functions.
+ void Clear(void);
+
+ // =========
+ // @function Rebuild the aligner's ability on default values.
+ // [NOTICE] If the aligner is not cleaned, rebuilding will fail.
+ // @return True: succeed; false: fail.
+ // =========
+ bool ReBuild(void);
+
+ // =========
+ // @function Rebuild the aligner's ability with the given scores.
+ // [NOTICE] If the aligner is not cleaned, rebuilding will fail.
+ // @return True: succeed; false: fail.
+ // =========
+ bool ReBuild(
+ const uint8_t& match_score,
+ const uint8_t& mismatch_penalty,
+ const uint8_t& gap_opening_penalty,
+ const uint8_t& gap_extending_penalty);
+
+ // =========
+ // @function Rebuild the aligner's ability from the given matrices.
+ // [NOTICE] If the aligner is not cleaned, rebuilding will fail.
+ // @return True: succeed; false: fail.
+ // =========
+ bool ReBuild(
+ const int8_t* score_matrix,
+ const int& score_matrix_size,
+ const int8_t* translation_matrix,
+ const int& translation_matrix_size);
+
+ private:
+ int8_t* score_matrix_;
+ int score_matrix_size_;
+ int8_t* translation_matrix_;
+ bool default_matrix_;
+ bool matrix_built_;
+
+ uint8_t match_score_; // default: 2
+ uint8_t mismatch_penalty_; // default: 2
+ uint8_t gap_opening_penalty_; // default: 3
+ uint8_t gap_extending_penalty_; // default: 1
+
+ int8_t* translated_reference_;
+ int32_t reference_length_;
+
+ int TranslateBase(const char* bases, const int& length, int8_t* translated) const;
+ void SetAllDefault(void);
+ void BuildDefaultMatrix(void);
+
+ // Declared but private: Aligner objects cannot be copied or assigned.
+ Aligner& operator= (const Aligner&);
+ Aligner (const Aligner&);
+}; // class Aligner
+
+
+// ================
+// inline functions
+// ================
+// Releases the translated reference buffer, if one is loaded, and marks the
+// aligner as holding no reference. Calling it again afterwards is a no-op
+// because reference_length_ is then 0.
+inline void Aligner::CleanReferenceSequence(void) {
+  if (reference_length_ == 0) return;
+
+  // NOTE(review): the scalar/array split below is only correct if a
+  // one-base reference is allocated with scalar new. The allocation site is
+  // not in this header; if it always uses new[], this must always be
+  // delete []. Confirm against SetReferenceSequence.
+  if (reference_length_ > 1) delete [] translated_reference_;
+  else delete translated_reference_;
+
+  // Do not leave a dangling pointer behind once the buffer is gone.
+  translated_reference_ = 0;
+  reference_length_ = 0;
+}
+} // namespace StripedSmithWaterman
+
+#endif // COMPLETE_STRIPED_SMITH_WATERMAN_CPP_H_
diff --git a/src/vcf2dag.cpp b/src/vcf2dag.cpp
new file mode 100644
index 0000000..1257cd0
--- /dev/null
+++ b/src/vcf2dag.cpp
@@ -0,0 +1,168 @@
+#include "Variant.h"
+#include "BedReader.h"
+#include "intervaltree/IntervalTree.h"
+#include <getopt.h>
+#include "fastahack/Fasta.h"
+#include <algorithm>
+#include <list>
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+
+// Prints the usage text to stderr and terminates the process with status 0.
+void printSummary(char** argv) {
+    static const char* const body[] = {
+        "",
+        "options:",
+        " -r, --reference FILE FASTA reference file.",
+        "",
+        "Modify the VCF file so that homozygous regions are included as REF/. calls.",
+        "For each ref and alt allele, assign an index. These steps are sufficient to",
+        "enable use of the VCF as a DAG (specifically a partially-ordered graph)."
+    };
+    cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl;
+    for (size_t k = 0; k < sizeof(body) / sizeof(body[0]); ++k) {
+        cerr << body[k] << endl;
+    }
+    exit(0);
+}
+
+// Emits one reference-only record covering `length` bases of `seqName`
+// starting at 1-based position `start`, tagged with the next unique id.
+// Does nothing when `length` is not positive.
+static void emitReferenceRecord(VariantCallFile& variantFile,
+                                FastaReference& reference,
+                                const string& seqName,
+                                long int start,
+                                long int length,
+                                const string& refTag,
+                                long int& uid) {
+    if (length <= 0) return;
+    Variant refvar(variantFile);
+    refvar.quality = 0;
+    refvar.position = start;
+    refvar.sequenceName = seqName;
+    // Record positions are 1-based; getSubSequence is called with a 0-based
+    // start, as in the original between-variant code (last_end - 1).
+    refvar.ref = reference.getSubSequence(seqName, start - 1, length);
+    refvar.info[refTag].push_back(convert(uid++));
+    cout << refvar << endl;
+}
+
+// Reads a VCF (file argument or stdin) plus a FASTA reference (-r) and writes
+// a VCF in which every stretch of reference between variants is emitted as
+// its own record, and every ref and alt allele carries a unique integer id
+// in INFO (id.ref / id.alt).
+//
+// Returns 0 on success; exits with 1 when the VCF cannot be opened or no
+// reference was given.
+int main(int argc, char** argv) {
+
+    string fastaFileName;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"reference", required_argument, 0, 'r'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hr:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+        case 'r':
+            fastaFileName = string(optarg);
+            break;
+
+        case 'h':
+            printSummary(argv);
+            break;
+
+        case '?':
+            // printSummary() calls exit(0) itself, so the exit(1) below
+            // is not reached.
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open VCF file" << endl;
+        exit(1);
+    }
+
+    FastaReference reference;
+    if (fastaFileName.empty()) {
+        cerr << "a reference is required" << endl;
+        exit(1);
+    } else {
+        reference.open(fastaFileName);
+    }
+
+    string idname = "id";
+    const string refTag = idname + ".ref";
+    const string altTag = idname + ".alt";
+    long int uid = 0;
+
+    variantFile.addHeaderLine("##INFO=<ID="+idname+".alt,Number=A,Type=Integer,Description=\"Unique numerical identifier of alt allele.\">");
+    variantFile.addHeaderLine("##INFO=<ID="+idname+".ref,Number=1,Type=Integer,Description=\"Unique numerical identifier of ref allele.\">");
+    cout << variantFile.header << endl;
+
+    // 1-based position of the first base not yet covered by any record
+    // on the current sequence.
+    long int last_end = 1;
+    string sequenceName;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+
+        if (sequenceName.empty()) {
+            sequenceName = var.sequenceName;
+        } else if (sequenceName != var.sequenceName) {
+            // Finish the previous sequence: emit what is left of it. The
+            // length must come from the previous sequence itself, not from
+            // var.position, which already belongs to the new sequence.
+            long int seqLength = reference.sequenceLength(sequenceName);
+            emitReferenceRecord(variantFile, reference, sequenceName,
+                                last_end, seqLength - last_end + 1, refTag, uid);
+            last_end = 1;
+            sequenceName = var.sequenceName;
+        }
+
+        // Reference between the previous record and this variant, if any.
+        emitReferenceRecord(variantFile, reference, sequenceName,
+                            last_end, var.position - last_end, refTag, uid);
+
+        // Now tag this record's own alleles.
+        vector<string>& refidx = var.info[refTag];
+        refidx.clear(); refidx.push_back(convert(uid++));
+
+        vector<string>& idxs = var.info[altTag];
+        idxs.clear();
+        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+            idxs.push_back(convert(uid++));
+        }
+        cout << var << endl;
+
+        last_end = var.position + var.ref.size();
+
+    }
+
+    // Trailing reference of the last sequence. Skipped when no variant was
+    // read at all, since there is then no sequence name to look up. The
+    // length is computed in signed arithmetic so it cannot wrap.
+    if (!sequenceName.empty()) {
+        long int seqLength = reference.sequenceLength(sequenceName);
+        emitReferenceRecord(variantFile, reference, sequenceName,
+                            last_end, seqLength - last_end + 1, refTag, uid);
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcf2fasta.cpp b/src/vcf2fasta.cpp
new file mode 100644
index 0000000..88de80a
--- /dev/null
+++ b/src/vcf2fasta.cpp
@@ -0,0 +1,264 @@
+#include "Variant.h"
+#include "convert.h"
+#include "join.h"
+#include "split.h"
+#include <set>
+#include <getopt.h>
+#include "fastahack/Fasta.h"
+#include <iostream>
+#include <fstream>
+
+using namespace std;
+using namespace vcf;
+
+#define ALLELE_NULL -1
+
+
+// One FASTA output file holding a single sequence. Sequence text is buffered
+// and written in lines of `linewidth` characters; the remainder is written
+// when the file is re-opened or the object is destroyed.
+class SampleFastaFile {
+
+public:
+
+    ofstream fastafile;
+    long int pos;
+    string linebuffer;  // sequence text not yet written to fastafile
+    string filename;
+    string seqname;
+    int linewidth;      // characters per output line
+
+    // Appends `sequence` and writes out every complete line. Text that does
+    // not exceed one line stays buffered until the file is finished.
+    void write(string sequence) {
+        linebuffer += sequence;
+        // A non-positive width cannot be wrapped; keep everything buffered
+        // rather than looping forever on a zero-length line.
+        if (linewidth <= 0) return;
+        const string::size_type width = linewidth;
+        // Walk the buffer with an offset and trim it once at the end, so a
+        // chromosome-sized write is linear instead of re-copying the
+        // remaining text for every output line.
+        string::size_type offset = 0;
+        while (linebuffer.length() - offset > width) {
+            fastafile.write(linebuffer.data() + offset, width);
+            fastafile << endl;
+            offset += width;
+        }
+        if (offset > 0) linebuffer.erase(0, offset);
+    }
+
+    SampleFastaFile(void) : pos(0), linewidth(80) { }
+
+    // Opens `m_filename` for writing and emits the ">seqname" header line.
+    // A file that is already open is finished first, so buffered text is
+    // neither lost nor carried into the new file. Exits the process with
+    // status 1 when the file cannot be opened.
+    void open(string& m_filename, string& m_seqname, int m_linewidth = 80) {
+        if (fastafile.is_open()) finish();
+        filename = m_filename;
+        seqname = m_seqname;
+        pos = 0;
+        linewidth = m_linewidth;
+        fastafile.open(filename.c_str());
+        if (!fastafile.is_open()) {
+            cerr << "could not open " << filename << " for writing, exiting" << endl;
+            exit(1);
+        }
+        fastafile << ">" << seqname << endl;
+    }
+
+    ~SampleFastaFile(void) {
+        if (fastafile.is_open()) {
+            finish();
+        }
+    }
+
+private:
+
+    // Writes all remaining buffered text as the last line and closes the file.
+    void finish(void) {
+        write(""); // flush complete lines
+        fastafile << linebuffer << endl;
+        linebuffer.clear();
+        fastafile.close();
+    }
+
+};
+
+// Prints the usage text to stderr and terminates the process with status 0.
+// The output-name line matches what initOutputs() builds:
+// PREFIX + sample + "_" + sequence + ":" + copy + ".fasta", copy in 0..ploidy-1.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] [file]" << endl
+         << endl
+         << "options:" << endl
+         << " -f, --reference REF Use this reference when decomposing samples." << endl
+         << " -p, --prefix PREFIX Affix this output prefix to each file, none by default" << endl
+         << " -P, --default-ploidy N Set a default ploidy for samples which do not have information in the first record (2)." << endl
+         << endl
+         << "Outputs PREFIXsample_seq:N.fasta for each sample, reference sequence, and chromosomal copy N in [0,1... ploidy-1]." << endl;
+    //<< "Impossible regions of haplotypes are noted with an error message. The corresponding" << endl
+    //<< "regions of the output FASTA files will be marked as N." << endl
+    exit(0);
+}
+
+// Fills `ploidies` with one entry per sample of `var`: the ploidy of the
+// sample's genotype, or `defaultPloidy` when that ploidy is 0.
+// Returns the same map that was passed in.
+map<string, int>& getPloidies(Variant& var, map<string, int>& ploidies, int defaultPloidy=2) {
+    vector<string>& names = var.sampleNames;
+    for (size_t n = 0; n < names.size(); ++n) {
+        string& sample = names[n];
+        int observed = ploidy(decomposeGenotype(var.getGenotype(sample)));
+        ploidies[sample] = (observed == 0) ? defaultPloidy : observed;
+    }
+    return ploidies;
+}
+
+// Destroys every SampleFastaFile held in `outputs` (each destructor flushes
+// and closes its file) and empties the map, so no deleted pointer remains
+// for a later call to reuse or delete again.
+void closeOutputs(map<string, map<int, SampleFastaFile*> >& outputs) {
+    for (map<string, map<int, SampleFastaFile*> >::iterator f = outputs.begin(); f != outputs.end(); ++f) {
+        for (map<int, SampleFastaFile*>::iterator s = f->second.begin(); s != f->second.end(); ++s) {
+            delete s->second;
+        }
+    }
+    outputs.clear();
+}
+
+// Closes whatever outputs are currently open and opens a fresh set for
+// sequence `seqName`: one file per sample and chromosomal copy, named
+// prefix + sample + "_" + seqName + ":" + copy + ".fasta".
+void initOutputs(map<string, map<int, SampleFastaFile*> >& outputs, vector<string>& sampleNames, string& seqName, map<string, int>& ploidies, string& prefix) {
+    closeOutputs(outputs);
+    // closeOutputs() deletes the file objects; drop the stale entries so the
+    // pointers are never dereferenced or deleted a second time.
+    outputs.clear();
+    for (vector<string>::iterator s = sampleNames.begin(); s != sampleNames.end(); ++s) {
+        map<int, SampleFastaFile*>& outs = outputs[*s];
+        int p = ploidies[*s];
+        for (int i = 0; i < p; ++i) {
+            string name = prefix + *s + "_" + seqName + ":" + convert(i) + ".fasta";
+            SampleFastaFile* fp = new SampleFastaFile;
+            outs[i] = fp;
+            fp->open(name, seqName);
+        }
+    }
+}
+
+// Appends `sequence` to every open per-sample, per-copy output.
+static void writeToAllOutputs(map<string, map<int, SampleFastaFile*> >& outputs, const string& sequence) {
+    for (map<string, map<int, SampleFastaFile*> >::iterator s = outputs.begin(); s != outputs.end(); ++s) {
+        map<int, SampleFastaFile*>& f = s->second;
+        for (map<int, SampleFastaFile*>::iterator o = f.begin(); o != f.end(); ++o) {
+            o->second->write(sequence);
+        }
+    }
+}
+
+// Appends the reference of `seqName` from 0-based offset `from` to the end
+// of the sequence to every open output. Does nothing when nothing is left.
+static void writeSequenceTail(map<string, map<int, SampleFastaFile*> >& outputs, FastaReference& reference, string& seqName, long int from) {
+    // Signed arithmetic: an offset at or past the end must not wrap around.
+    long int seqLength = reference.sequenceLength(seqName);
+    if (seqLength > from) {
+        writeToAllOutputs(outputs, reference.getSubSequence(seqName, from, seqLength - from));
+    }
+}
+
+// Converts a phased VCF into one FASTA file per sample and chromosomal copy.
+// For each record, the reference between the previous record and this one is
+// written to every copy, followed by the allele that copy carries.
+//
+// Exits with status 1 on an unphased record, a mid-sequence ploidy change,
+// overlapping or out-of-order records, or an empty genotype.
+void vcf2fasta(VariantCallFile& variantFile, FastaReference& reference, string& outputPrefix, int defaultPloidy) {
+    string lastSeq;
+    // 0-based, exclusive end of the previous record's REF on lastSeq.
+    long int lastEnd = 0;
+    map<string, map<int, SampleFastaFile*> > outputs;
+    Variant var(variantFile);
+    map<string, int> lastPloidies;
+    while (variantFile.getNextVariant(var)) {
+        if (!var.isPhased()) {
+            cerr << "variant " << var.sequenceName << ":" << var.position << " is not phased, cannot convert to fasta" << endl;
+            exit(1);
+        }
+        map<string, int> ploidies;
+        getPloidies(var, ploidies, defaultPloidy);
+        if (var.sequenceName != lastSeq || lastSeq.empty()) {
+            if (!lastSeq.empty()) {
+                // finish the previous sequence before switching files
+                writeSequenceTail(outputs, reference, lastSeq, lastEnd);
+            }
+            initOutputs(outputs, var.sampleNames, var.sequenceName, ploidies, outputPrefix);
+            lastSeq = var.sequenceName;
+            // Coordinates restart on the new sequence; keeping the previous
+            // end would misreport its first records as overlapping.
+            lastEnd = 0;
+        } else if (!lastPloidies.empty() && lastPloidies != ploidies) {
+            cerr << "cannot handle mid-sequence change of ploidy" << endl;
+            // in principle it should be possible...
+            // it's a matter of representation, GFASTA anyone?
+            exit(1);
+        }
+        lastPloidies = ploidies;
+        // var.position is 1-based and lastEnd is a 0-based exclusive end, so
+        // the record overlaps the previous one when position - 1 < lastEnd.
+        if (var.position - 1 < lastEnd) {
+            cerr << var.position << " vs " << lastEnd << endl;
+            cerr << "overlapping or out-of-order variants at " << var.sequenceName << ":" << var.position << endl;
+            exit(1);
+        }
+        // get reference sequences implied by last->current variant
+        string ref5prime;
+        if (var.position - 1 - lastEnd > 0) {
+            ref5prime = reference.getSubSequence(var.sequenceName, lastEnd, var.position - 1 - lastEnd);
+        }
+        // write alt/ref seqs for current variant based on phased genotypes
+        for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
+            string& sample = *s;
+            vector<int> gt = decomposePhasedGenotype(var.getGenotype(sample));
+            // assume no-call == ref?
+            if (gt.empty()) {
+                cerr << "empty genotype for sample " << *s << " at " << var.sequenceName << ":" << var.position << endl;
+                exit(1);
+            }
+            int i = 0;
+            for (vector<int>::iterator g = gt.begin(); g != gt.end(); ++g, ++i) {
+                outputs[sample].at(i)->write(ref5prime+var.alleles.at(*g));
+            }
+        }
+        lastEnd = var.position - 1 + var.ref.size();
+    }
+    // Write what remains of the last sequence. With an empty VCF there is
+    // no sequence to finish and no output to write to.
+    if (!lastSeq.empty()) {
+        writeSequenceTail(outputs, reference, lastSeq, lastEnd);
+    }
+    closeOutputs(outputs);
+    // outputs are closed by ~SampleFastaFile
+}
+
+// Parses the command line, opens the FASTA reference (-f, required) and the
+// VCF (file argument or stdin), then runs vcf2fasta().
+// Returns 0 on success and 1 when the VCF cannot be opened.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+    string fastaFileName;
+    // Documented default; previously left uninitialized when -P was absent.
+    int defaultPloidy = 2;
+    string outputPrefix;
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"reference", required_argument, 0, 'f'},
+                {"prefix", required_argument, 0, 'p'},
+                {"default-ploidy", required_argument, 0, 'P'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        // Only options with a case below are listed; an unhandled letter
+        // in this string would fall through to abort().
+        c = getopt_long (argc, argv, "hf:p:P:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+        case 'f':
+            fastaFileName = optarg;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            break;
+
+        case 'p':
+            outputPrefix = optarg;
+            break;
+
+        case 'P':
+            defaultPloidy = atoi(optarg);
+            if (defaultPloidy < 1) {
+                cerr << "default ploidy must be a positive integer" << endl;
+                exit(1);
+            }
+            break;
+
+        case '?':
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    FastaReference reference;
+    if (fastaFileName.empty()) {
+        cerr << "a reference is required for haplotype allele generation" << endl;
+        printSummary(argv);
+        exit(1);
+    }
+    reference.open(fastaFileName);
+
+    if (optind < argc) {
+        string filename = argv[optind];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open VCF file" << endl;
+        return 1;
+    }
+
+    vcf2fasta(variantFile, reference, outputPrefix, defaultPloidy);
+
+    return 0;
+
+}
+
diff --git a/src/vcf2tsv.cpp b/src/vcf2tsv.cpp
new file mode 100644
index 0000000..9a1e2dd
--- /dev/null
+++ b/src/vcf2tsv.cpp
@@ -0,0 +1,241 @@
+#include "Variant.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+// Prints the usage text to stderr and terminates the process with status 1.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [-n null_string] [-g] [vcf file]" << endl;
+    cerr << "Converts stdin or given VCF file to tab-delimited format, using null string to replace empty values in the table." << endl;
+    cerr << "Specifying -g will output one line per sample with genotype information." << endl;
+    exit(1);
+}
+
+
+// Writes the seven fixed columns (CHROM..FILTER) of `var` to `out`, then one
+// tab-prefixed column per INFO field and one per INFO flag.
+//
+// altColumn  text for the ALT column.
+// altindex   >= 0: an INFO field with one value per alt contributes the
+//            value at that index; < 0: such a field is comma-joined.
+// A field with exactly one value always prints that value; a field that is
+// absent, or has any other number of values, prints `nullval`.
+// Flags print 1 when present in the record and 0 otherwise.
+static void writeSiteColumns(ostream& out, Variant& var, const string& altColumn, int altindex,
+                             vector<string>& infofields, vector<string>& infoflags, const string& nullval) {
+    out << var.sequenceName << "\t"
+        << var.position << "\t"
+        << var.id << "\t"
+        << var.ref << "\t"
+        << altColumn << "\t"
+        << var.quality << "\t"
+        << var.filter;
+
+    for (vector<string>::iterator i = infofields.begin(); i != infofields.end(); ++i) {
+        out << "\t";
+        map<string, vector<string> >::iterator f = var.info.find(*i);
+        if (f == var.info.end()) {
+            out << nullval;
+            continue;
+        }
+        vector<string>& value = f->second;  // by reference: no per-field copy
+        if (value.size() == 1) {
+            out << value.front();
+        } else if (value.size() == var.alt.size()) {
+            if (altindex < 0) {
+                out << join(value, ",");
+            } else {
+                out << value.at(altindex);
+            }
+        } else {
+            out << nullval;
+        }
+    }
+
+    for (vector<string>::iterator i = infoflags.begin(); i != infoflags.end(); ++i) {
+        out << "\t" << (var.infoFlags.find(*i) != var.infoFlags.end() ? 1 : 0);
+    }
+}
+
+// Converts a VCF (file argument or stdin) to a tab-delimited table on stdout.
+// Without -g there is one row per alt allele; with -g there is one row per
+// sample, with the FORMAT fields appended. -n sets the text printed for
+// missing values.
+int main(int argc, char** argv) {
+
+    string nullval;
+    bool genotypes = false;
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"null-value", required_argument, 0, 'n'},
+                {"genotypes", no_argument, 0, 'g'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hn:g",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+        case 'n':
+            nullval = optarg;
+            break;
+
+        case 'g':
+            genotypes = true;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            break;
+
+        case '?':
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        if (!variantFile.open(std::cin)) {
+            if (argc == 1) {
+                printSummary(argv);
+            } else {
+                cerr << "could not open stdin for reading as VCF" << endl;
+                exit(1);
+            }
+        }
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    // Column sets come from the header: flags (boolean INFO) are listed
+    // after all other INFO fields.
+    vector<string> infofields;
+    vector<string> infoflags;
+
+    for (map<string, VariantFieldType>::iterator i = variantFile.infoTypes.begin(); i != variantFile.infoTypes.end(); ++i) {
+        if (i->second == FIELD_BOOL) {
+            infoflags.push_back(i->first);
+        } else {
+            infofields.push_back(i->first);
+        }
+    }
+
+    vector<string> formatfields;
+    if (genotypes) {
+        for (map<string, VariantFieldType>::iterator f = variantFile.formatTypes.begin(); f != variantFile.formatTypes.end(); ++f) {
+            formatfields.push_back(f->first);
+        }
+    }
+
+    // write header
+    cout << "CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER";
+
+    for (vector<string>::iterator i = infofields.begin(); i != infofields.end(); ++i) {
+        cout << "\t" << *i;
+    }
+    for (vector<string>::iterator i = infoflags.begin(); i != infoflags.end(); ++i) {
+        cout << "\t" << *i;
+    }
+
+    if (genotypes) {
+        cout << "\t" << "SAMPLE";
+        for (vector<string>::iterator f = formatfields.begin(); f != formatfields.end(); ++f) {
+            cout << "\t" << *f;
+        }
+    }
+    cout << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+
+        if (!genotypes) {
+
+            // one row per alt allele
+            int altindex = 0;
+            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a, ++altindex) {
+                writeSiteColumns(cout, var, *a, altindex, infofields, infoflags, nullval);
+                cout << endl;
+            }
+
+        } else {
+
+            // The site columns are identical for every sample of the
+            // record, so they are rendered once and reused.
+            stringstream o;
+            writeSiteColumns(o, var, join(var.alt, ","), -1, infofields, infoflags, nullval);
+            string siteinfo = o.str();
+
+            for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
+                cout << siteinfo;
+                const string& sampleName = s->first;
+                cout << "\t" << sampleName;
+                map<string, vector<string> >& sample = s->second;
+                for (vector<string>::iterator f = formatfields.begin(); f != formatfields.end(); ++f) {
+                    if (sample.find(*f) != sample.end()) {
+                        cout << "\t" << join(sample[*f], ",");
+                    } else {
+                        cout << "\t" << nullval;
+                    }
+                }
+                cout << endl;
+            }
+        }
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfaddinfo.cpp b/src/vcfaddinfo.cpp
new file mode 100644
index 0000000..cff66d6
--- /dev/null
+++ b/src/vcfaddinfo.cpp
@@ -0,0 +1,111 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+// Copies into varA every INFO field of varB whose key varA does not already
+// have. Fields present in both keep varA's value.
+void addInfo(Variant& varA, Variant& varB) {
+    // map::insert leaves existing keys untouched, which is exactly the
+    // "only if absent" rule.
+    varA.info.insert(varB.info.begin(), varB.info.end());
+}
+
+// Merges INFO fields from a second VCF into a first one and writes the
+// result to stdout. Takes exactly two arguments, the two VCF paths; "-"
+// selects stdin for either one. Returns 1 on a usage error, when both
+// arguments are the same, or when either file cannot be opened.
+int main(int argc, char** argv) {
+
+ if (argc != 3) {
+ cerr << "usage: " << argv[0] << " <vcf file> <vcf file>" << endl
+ << "Adds info fields from the second file which are not present in the first vcf file." << endl;
+ return 1;
+ }
+
+ string filenameA = argv[1];
+ string filenameB = argv[2];
+
+ // This also rejects "- -", i.e. reading both inputs from stdin.
+ if (filenameA == filenameB) {
+ cerr << "it won't help to add info data from the same file!" << endl;
+ return 1;
+ }
+
+ VariantCallFile variantFileA;
+ if (filenameA == "-") {
+ variantFileA.open(std::cin);
+ } else {
+ variantFileA.open(filenameA);
+ }
+
+ VariantCallFile variantFileB;
+ if (filenameB == "-") {
+ variantFileB.open(std::cin);
+ } else {
+ variantFileB.open(filenameB);
+ }
+
+ if (!variantFileA.is_open() || !variantFileB.is_open()) {
+ return 1;
+ }
+
+ Variant varA(variantFileA);
+ Variant varB(variantFileB);
+
+ // Both files are walked in parallel, comparing records by
+ // (sequenceName, position): B is advanced while it is behind A, records
+ // of A that are behind B are printed unchanged, and records at the same
+ // position are merged with addInfo() before A's record is printed.
+ // NOTE(review): sequence names are compared as strings, so this presumably
+ // expects both inputs sorted in that same order - confirm.
+
+ // Prime both readers with their first record (return values are not checked).
+ variantFileA.getNextVariant(varA);
+ variantFileB.getNextVariant(varB);
+
+ // The output header is A's header extended with B's INFO header lines.
+ variantFileA.header = unionInfoHeaderLines(variantFileA.header, variantFileB.header);
+
+ cout << variantFileA.header << endl;
+
+ do {
+
+ // Skip records of B that lie before the current record of A.
+ while (!variantFileB.done()
+ && (varB.sequenceName < varA.sequenceName
+ || (varB.sequenceName == varA.sequenceName && varB.position < varA.position))
+ ) {
+ variantFileB.getNextVariant(varB);
+ }
+
+ // Print, unchanged, records of A that lie before the current record of B.
+ while (!variantFileA.done()
+ && (varA.sequenceName < varB.sequenceName
+ || (varA.sequenceName == varB.sequenceName && varA.position < varB.position))
+ ) {
+ cout << varA << endl;
+ variantFileA.getNextVariant(varA);
+ }
+
+ // Same condition as the first loop, applied again after A has advanced.
+ while (!variantFileB.done()
+ && (varB.sequenceName < varA.sequenceName
+ || (varB.sequenceName == varA.sequenceName && varB.position < varA.position))
+ ) {
+ variantFileB.getNextVariant(varB);
+ }
+
+ // Same sequence and position: add B's missing INFO fields to A's
+ // record, print it, and advance both files.
+ while (!variantFileA.done() && varA.sequenceName == varB.sequenceName && varA.position == varB.position) {
+ addInfo(varA, varB);
+ cout << varA << endl;
+ variantFileA.getNextVariant(varA);
+ variantFileB.getNextVariant(varB);
+ }
+
+ } while (!variantFileA.done() && !variantFileB.done());
+
+ // B ran out first: print the current record of A and everything after it.
+ if (!variantFileA.done()) {
+ cout << varA << endl;
+ while (variantFileA.getNextVariant(varA)) {
+ cout << varA << endl;
+ }
+ }
+
+ return 0;
+
+}
+
diff --git a/src/vcfafpath.cpp b/src/vcfafpath.cpp
new file mode 100644
index 0000000..e87ab80
--- /dev/null
+++ b/src/vcfafpath.cpp
@@ -0,0 +1,52 @@
+#include "Variant.h"
+#include <algorithm>
+#include <vector>
+#include <map>
+
+using namespace std;
+using namespace vcf;
+
+// For every record of a VCF (file argument or stdin) prints the ref allele
+// followed by the alt alleles grouped by their AF value, highest AF first:
+//   REF -> alt[, alt...] -> alt...
+// Alt alleles that have no matching AF entry are left out of the path.
+// Returns 1 when the input cannot be opened.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+
+    if (argc > 1) {
+        string filename = argv[1];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        map<double, vector<string> > allelesByAf;
+
+        // Look AF up without inserting an empty entry into the record.
+        map<string, vector<string> >::iterator afField = var.info.find("AF");
+        if (afField != var.info.end()) {
+            vector<string>& afstr = afField->second;
+            vector<string>::iterator af = afstr.begin();
+            // Walk alts and AF values in lockstep and stop at the shorter
+            // list, so a missing or short AF is never read past its end.
+            for (vector<string>::iterator a = var.alt.begin();
+                 a != var.alt.end() && af != afstr.end(); ++a, ++af) {
+                double r = 0;  // defined value even if the text is not a number
+                convert(*af, r);
+                allelesByAf[r].push_back(*a);
+            }
+        }
+
+        cout << var.ref;
+        for (map<double, vector<string> >::reverse_iterator a = allelesByAf.rbegin(); a != allelesByAf.rend(); ++a) {
+            cout << " -> " << join(a->second, ", ");
+        }
+        cout << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfallelicprimitives.cpp b/src/vcfallelicprimitives.cpp
new file mode 100644
index 0000000..95b6333
--- /dev/null
+++ b/src/vcfallelicprimitives.cpp
@@ -0,0 +1,414 @@
+#include "Variant.h"
+#include "convert.h"
+#include "join.h"
+#include "split.h"
+#include <set>
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+#define ALLELE_NULL -1
+
+double convertStrDbl(const string& s) { // parse s as a double through convert()
+ double r = 0; // start from 0 so a convert() that assigns nothing cannot leak an uninitialized value
+ convert(s, r);
+ return r;
+}
+
+void printSummary(char** argv) { // write the usage text to stderr and exit with status 0
+ cerr << "usage: " << argv[0] << " [options] [file]" << endl
+ << endl
+ << "options:" << endl
+ << " -m, --use-mnps Retain MNPs as separate events (default: false)." << endl
+ << " -t, --tag-parsed FLAG Tag records which are split apart of a complex allele with this flag." << endl
+ << " -L, --max-length LEN Do not manipulate records in which either the ALT or" << endl
+ << " REF is longer than LEN (default: 200)." << endl
+ << " -k, --keep-info Maintain site and allele-level annotations when decomposing." << endl
+ << " Note that in many cases, such as multisample VCFs, these won't" << endl
+ << " be valid post-decomposition. For biallelic loci in single-sample" << endl
+ << " VCFs, they should be usable with caution." << endl
+ << " -g, --keep-geno Maintain genotype-level annotations when decomposing. Similar" << endl
+ << " caution should be used for this as for --keep-info." << endl
+ << endl
+ << "If multiple allelic primitives (gaps or mismatches) are specified in" << endl
+ << "a single VCF record, split the record into multiple lines, but drop all" << endl
+ << "INFO fields. Does not handle genotypes (yet). MNPs are split into" << endl
+ << "multiple SNPs unless the -m flag is provided. Records generated by splits have their ID set to '.'." << endl;
+ exit(0);
+}
+
+int main(int argc, char** argv) {
+ // Decompose VCF records into their allelic primitives; the header and all records go to stdout.
+ bool includePreviousBaseForIndels = true; // forwarded to parsedAlternates(); no command-line option changes it
+ bool useMNPs = false;
+ string parseFlag;
+ int maxLength = 200;
+ bool keepInfo = false;
+ bool keepGeno = false;
+
+ VariantCallFile variantFile;
+
+ int c;
+ while (true) {
+ static struct option long_options[] =
+ {
+ /* These options set a flag. */
+ //{"verbose", no_argument, &verbose_flag, 1},
+ {"help", no_argument, 0, 'h'},
+ {"use-mnps", no_argument, 0, 'm'},
+ {"max-length", required_argument, 0, 'L'},
+ {"tag-parsed", required_argument, 0, 't'},
+ {"keep-info", no_argument, 0, 'k'},
+ {"keep-geno", no_argument, 0, 'g'},
+ {0, 0, 0, 0}
+ };
+ /* getopt_long stores the option index here. */
+ int option_index = 0;
+
+ c = getopt_long (argc, argv, "hmkgt:L:",
+ long_options, &option_index);
+
+ if (c == -1)
+ break;
+
+ switch (c) {
+
+ case 'm':
+ useMNPs = true;
+ break;
+
+ case 'k':
+ keepInfo = true;
+ break;
+
+ case 'g':
+ keepGeno = true;
+ break;
+
+ case 'h':
+ printSummary(argv);
+ break;
+
+ case 't':
+ parseFlag = optarg;
+ break;
+
+ case 'L':
+ maxLength = atoi(optarg);
+ break;
+
+ case '?':
+ printSummary(argv);
+ exit(1);
+ break;
+
+ default:
+ abort ();
+ }
+ }
+ // the first non-option argument names the input file; without one, read stdin
+ if (optind < argc) {
+ string filename = argv[optind];
+ variantFile.open(filename);
+ } else {
+ variantFile.open(std::cin);
+ }
+
+ if (!variantFile.is_open()) {
+ return 1;
+ }
+ // declare the INFO fields written on decomposed records, then echo the header
+ variantFile.addHeaderLine("##INFO=<ID=TYPE,Number=A,Type=String,Description=\"The type of allele, either snp, mnp, ins, del, or complex.\">");
+ variantFile.addHeaderLine("##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"allele length\">");
+ if (!parseFlag.empty()) {
+ variantFile.addHeaderLine("##INFO=<ID="+parseFlag+",Number=0,Type=Flag,Description=\"The allele was parsed using vcfallelicprimitives.\">");
+ }
+ cout << variantFile.header << endl;
+
+ Variant var(variantFile);
+ while (variantFile.getNextVariant(var)) {
+
+
+ // we can't decompose *1* bp events, these are already in simplest-form whether SNPs or indels
+ // we also don't handle anything larger than maxLength bp
+ if (var.alt.size() == 1
+ && ( var.alt.front().size() == 1
+ || var.ref.size() == 1
+ || var.alt.front().size() > maxLength // NOTE(review): size_t vs int comparison; a negative -L converts to a huge unsigned value
+ || var.ref.size() > maxLength
+ )) {
+ // nothing to do
+ cout << var << endl;
+ continue;
+ }
+
+ // for each parsedalternate, get the position
+ // build a new vcf record for that position
+ // unless we are already at the position !
+ // take everything which is unique to that allele (records) and append it to the new record
+ // then handle genotypes; determine the mapping between alleleic primitives and convert to phased haplotypes
+ // this means taking all the parsedAlternates and, for each one, generating a pattern of allele indecies corresponding to it
+
+ map<string, vector<VariantAllele> > varAlleles = var.parsedAlternates(includePreviousBaseForIndels, useMNPs);
+ set<VariantAllele> alleles;
+
+ // collect unique alleles
+ for (map<string, vector<VariantAllele> >::iterator a = varAlleles.begin(); a != varAlleles.end(); ++a) {
+ for (vector<VariantAllele>::iterator va = a->second.begin(); va != a->second.end(); ++va) {
+ alleles.insert(*va);
+ }
+ }
+
+ int altcount = 0;
+ for (set<VariantAllele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
+ if (a->ref != a->alt) {
+ ++altcount;
+ }
+ }
+
+ if (altcount == 1 && var.alt.size() == 1 && var.alt.front().size() == 1) { // if biallelic SNP
+ cout << var << endl;
+ continue;
+ }
+
+ // collect variant allele indexed membership
+ map<string, vector<int> > variantAlleleIndexes; // from serialized VariantAllele to indexes
+ for (map<string, vector<VariantAllele> >::iterator a = varAlleles.begin(); a != varAlleles.end(); ++a) {
+ int index = var.altAlleleIndexes[a->first] + 1; // make non-relative
+ for (vector<VariantAllele>::iterator va = a->second.begin(); va != a->second.end(); ++va) {
+ variantAlleleIndexes[va->repr].push_back(index);
+ }
+ }
+
+ map<VariantAllele, double> alleleFrequencies;
+ map<VariantAllele, int> alleleCounts;
+ map<VariantAllele, map<string, string> > alleleInfos;
+ map<VariantAllele, map<string, map<string, string> > > alleleGenos;
+ // accumulate, per primitive, the AF of every original ALT allele it was parsed from
+ bool hasAf = false;
+ if (var.info.find("AF") != var.info.end()) {
+ hasAf = true;
+ for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+ vector<VariantAllele>& vars = varAlleles[*a];
+ for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) {
+ double freq;
+ try {
+ convert(var.info["AF"].at(var.altAlleleIndexes[*a]), freq);
+ alleleFrequencies[*va] += freq;
+ } catch (...) {
+ cerr << "vcfallelicprimitives WARNING: AF does not have count == alts @ "
+ << var.sequenceName << ":" << var.position << endl;
+ }
+ }
+ }
+ }
+ // same accumulation for AC
+ bool hasAc = false;
+ if (var.info.find("AC") != var.info.end()) {
+ hasAc = true;
+ for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+ vector<VariantAllele>& vars = varAlleles[*a];
+ for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) {
+ int freq;
+ try {
+ convert(var.info["AC"].at(var.altAlleleIndexes[*a]), freq);
+ alleleCounts[*va] += freq;
+ } catch (...) {
+ cerr << "vcfallelicprimitives WARNING: AC does not have count == alts @ "
+ << var.sequenceName << ":" << var.position << endl;
+ }
+ }
+ }
+ }
+ // with --keep-info, remember for each primitive one value per INFO key (per-ALT entry or the single site-wide entry)
+ if (keepInfo) {
+ for (map<string, vector<string> >::iterator infoit = var.info.begin();
+ infoit != var.info.end(); ++infoit) {
+ string key = infoit->first;
+ for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+ vector<VariantAllele>& vars = varAlleles[*a];
+ for (vector<VariantAllele>::iterator va = vars.begin(); va != vars.end(); ++va) {
+ string val;
+ vector<string>& vals = var.info[key];
+ if (vals.size() == var.alt.size()) { // allele count for info
+ val = vals.at(var.altAlleleIndexes[*a]);
+ } else if (vals.size() == 1) { // site-wise count
+ val = vals.front();
+ } // don't handle other multiples... how would we do this without going crazy?
+ if (!val.empty()) {
+ alleleInfos[*va][key] = val;
+ }
+ }
+ }
+ }
+ }
+
+ /*
+ if (keepGeno) {
+ for (map<string, map<string, vector<string> > >::iterator sampleit = var.samples.begin();
+ sampleit != var.samples.end(); ++sampleit) {
+ string& sampleName = sampleit->first;
+ map<string, vector<string> >& sampleValues = var.samples[sampleName];
+
+ }
+ }
+ */
+
+ // from old allele index to a new series across the unpacked positions
+ map<int, map<long unsigned int, int> > unpackedAlleleIndexes;
+
+ map<long unsigned int, Variant> variants;
+ //vector<Variant> variants;
+ for (set<VariantAllele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
+ if (a->ref == a->alt) {
+ // ref allele
+ continue;
+ }
+ string type;
+ int len = 0;
+ if (a->ref.at(0) == a->alt.at(0)) { // well-behaved indels
+ if (a->ref.size() > a->alt.size()) {
+ type = "del";
+ len = a->ref.size() - a->alt.size();
+ } else if (a->ref.size() < a->alt.size()) {
+ len = a->alt.size() - a->ref.size();
+ type = "ins";
+ }
+ } else {
+ if (a->ref.size() == a->alt.size()) {
+ len = a->ref.size();
+ if (a->ref.size() == 1) {
+ type = "snp";
+ } else {
+ type = "mnp";
+ }
+ } else {
+ len = abs((int) a->ref.size() - (int) a->alt.size());
+ type = "complex";
+ }
+ }
+ // primitives that start at the same position share one output record
+ if (variants.find(a->position) == variants.end()) {
+ Variant newvar(variantFile);
+ variants[a->position] = newvar;
+ }
+
+ Variant& v = variants[a->position]; // guaranteed to exist
+
+ if (!parseFlag.empty()) {
+ v.infoFlags[parseFlag] = true;
+ }
+ v.quality = var.quality;
+ v.filter = var.filter;
+ v.id = ".";
+ //v.format = var.format;
+ vector<string> gtonlyformat;
+ gtonlyformat.push_back("GT");
+ v.format = gtonlyformat;
+ v.info["TYPE"].push_back(type);
+ v.info["LEN"].push_back(convert(len));
+ if (hasAf) {
+ v.info["AF"].push_back(convert(alleleFrequencies[*a]));
+ }
+ if (hasAc) {
+ v.info["AC"].push_back(convert(alleleCounts[*a]));
+ }
+ if (keepInfo) {
+ for (map<string, vector<string> >::iterator infoit = var.info.begin();
+ infoit != var.info.end(); ++infoit) {
+ string key = infoit->first;
+ if (key != "AF" && key != "AC" && key != "TYPE" && key != "LEN") { // don't clobber previous
+ v.info[key].push_back(alleleInfos[*a][key]);
+ }
+ }
+ }
+
+ // now, keep all the other infos if we are asked to
+
+ v.sequenceName = var.sequenceName;
+ v.position = a->position; // ... by definition, this should be == if the variant was found
+ if (v.ref.size() < a->ref.size()) { // a longer REF arrived: pad the ALTs already stored with the extra reference bases
+ for (vector<string>::iterator va = v.alt.begin(); va != v.alt.end(); ++va) {
+ *va += a->ref.substr(v.ref.size());
+ }
+ v.ref = a->ref;
+ }
+ v.alt.push_back(a->alt);
+ // map each original allele index containing this primitive to its new 1-based ALT index at this position
+ int alleleIndex = v.alt.size();
+ vector<int>& originalIndexes = variantAlleleIndexes[a->repr];
+ for (vector<int>::iterator i = originalIndexes.begin(); i != originalIndexes.end(); ++i) {
+ unpackedAlleleIndexes[*i][v.position] = alleleIndex;
+ }
+ // add null allele
+ unpackedAlleleIndexes[ALLELE_NULL][v.position] = ALLELE_NULL;
+
+ }
+
+ // genotypes
+ for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
+ string& sampleName = *s;
+ if (var.samples.find(sampleName) == var.samples.end()) {
+ continue;
+ }
+ map<string, vector<string> >& sample = var.samples[sampleName];
+ if (sample.find("GT") == sample.end()) {
+ continue;
+ }
+ string& genotype = sample["GT"].front();
+ vector<string> genotypeStrs = split(genotype, "|/");
+ vector<int> genotypeIndexes;
+ for (vector<string>::iterator s = genotypeStrs.begin(); s != genotypeStrs.end(); ++s) { // NOTE(review): this 's' shadows the outer sample-name iterator
+ int i;
+ if (!convert(*s, i)) {
+ genotypeIndexes.push_back(ALLELE_NULL);
+ } else {
+ genotypeIndexes.push_back(i);
+ }
+ }
+ map<long unsigned int, vector<int> > positionIndexes;
+ for (vector<int>::iterator g = genotypeIndexes.begin(); g != genotypeIndexes.end(); ++g) {
+ int oldIndex = *g;
+ for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
+ const long unsigned int& p = v->first;
+ if (oldIndex == 0) { // reference
+ positionIndexes[p].push_back(0);
+ } else {
+ positionIndexes[p].push_back(unpackedAlleleIndexes[oldIndex][p]); // NOTE(review): operator[] yields 0 (reference) when oldIndex has no entry for p
+ }
+ }
+ }
+ for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
+ Variant& variant = v->second;
+ vector<int>& gtints = positionIndexes[v->first];
+ vector<string> gtstrs;
+ for (vector<int>::iterator i = gtints.begin(); i != gtints.end(); ++i) {
+ if (*i != ALLELE_NULL) {
+ gtstrs.push_back(convert(*i));
+ } else {
+ gtstrs.push_back(".");
+ }
+ }
+ string genotype = join(gtstrs, "|"); // the rewritten GT is always joined with '|'
+ // if we are keeping the geno info, pull it over here
+ if (keepGeno) {
+ variant.format = var.format;
+ variant.samples[sampleName] = var.samples[sampleName];
+ }
+ // note that this will replace the old geno, but otherwise it is the same
+ variant.samples[sampleName]["GT"].clear();
+ variant.samples[sampleName]["GT"].push_back(genotype);
+ }
+ }
+
+ //for (vector<Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
+ for (map<long unsigned int, Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
+ cout << v->second << endl;
+ }
+ }
+
+ return 0;
+
+}
+
diff --git a/src/vcfaltcount.cpp b/src/vcfaltcount.cpp
new file mode 100644
index 0000000..756b5fc
--- /dev/null
+++ b/src/vcfaltcount.cpp
@@ -0,0 +1,50 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+
+int main(int argc, char** argv) {
+ // Sum, over all records and samples, the GT alleles that are neither reference ("0") nor missing (".").
+ if (argc != 2) {
+ cerr << "usage: " << argv[0] << " <vcf file>" << endl
+ << "count the number of alternate alleles in all records in the vcf file" << endl;
+ return 1;
+ }
+
+ string filename = argv[1];
+
+ VariantCallFile variantFile;
+ variantFile.open(filename);
+ if (!variantFile.is_open()) {
+ return 1;
+ }
+
+ unsigned int alternateAlleleCount = 0;
+
+ Variant var(variantFile);
+ while (variantFile.getNextVariant(var)) {
+ for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
+ map<string, vector<string> >& sample = s->second;
+ // a sample without a GT value has no calls to count; front() on an empty vector is undefined
+ map<string, vector<string> >::iterator gtField = sample.find("GT");
+ if (gtField == sample.end() || gtField->second.empty())
+ continue;
+ vector<string> gt = split(gtField->second.front(), "|/");
+ for (vector<string>::iterator g = gt.begin(); g != gt.end(); ++g) {
+ // "." marks a missing call and must not be counted as an alternate allele
+ if (*g != "0" && *g != ".")
+ ++alternateAlleleCount;
+ }
+ }
+ }
+
+ cout << alternateAlleleCount << endl;
+
+ return 0;
+
+}
+
diff --git a/src/vcfannotate.cpp b/src/vcfannotate.cpp
new file mode 100644
index 0000000..3499083
--- /dev/null
+++ b/src/vcfannotate.cpp
@@ -0,0 +1,126 @@
+#include "Variant.h"
+#include "BedReader.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+
+void printSummary(char** argv) { // write the usage text to stderr and exit with status 0
+ cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl
+ << endl
+ << "options:" << endl
+ << " -b, --bed use annotations provided by this BED file" << endl
+ << " -k, --key use this INFO field key for the annotations" << endl
+ << " -d, --default use this INFO field value for records without annotations" << endl
+ << endl
+ << "Intersect the records in the VCF file with targets provided in a BED file." << endl
+ << "Intersections are done on the reference sequences in the VCF file." << endl
+ << "If no VCF filename is specified on the command line (last argument) the VCF" << endl
+ << "is read from stdin." << endl;
+ exit(0);
+}
+
+int main(int argc, char** argv) {
+ // Add to each VCF record, under the -k INFO key, the descriptions of the BED targets that overlap it.
+ string bedFileName;
+ string annotationInfoKey;
+ string defaultAnnotationValue;
+
+ if (argc == 1)
+ printSummary(argv);
+
+ int c;
+ while (true) {
+ static struct option long_options[] =
+ {
+ {"help", no_argument, 0, 'h'},
+ {"bed", required_argument, 0, 'b'},
+ {"key", required_argument, 0, 'k'},
+ {"default", required_argument, 0, 'd'},
+ {0, 0, 0, 0}
+ };
+ /* getopt_long stores the option index here. */
+ int option_index = 0;
+ c = getopt_long (argc, argv, "hb:k:d:",
+ long_options, &option_index);
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'b':
+ bedFileName = string(optarg);
+ break;
+
+ case 'k':
+ annotationInfoKey = string(optarg);
+ break;
+
+ case 'd':
+ defaultAnnotationValue = string(optarg);
+ break;
+
+ case 'h':
+ printSummary(argv);
+ break;
+
+ case '?':
+ printSummary(argv);
+ exit(1);
+ break;
+
+ default:
+ abort ();
+ }
+ }
+
+ if (bedFileName.empty()) {
+ cerr << "a BED file is required when intersecting" << endl;
+ exit(1);
+ }
+ if (annotationInfoKey.empty()) { // an empty key would yield a malformed ##INFO line and INFO entry
+ cerr << "an INFO field key (-k) is required for the annotations" << endl;
+ exit(1);
+ }
+
+ BedReader bed(bedFileName);
+
+ VariantCallFile variantFile;
+ string inputFilename;
+ if (optind == argc - 1) {
+ inputFilename = argv[optind];
+ variantFile.open(inputFilename);
+ } else {
+ variantFile.open(std::cin);
+ }
+
+ if (!variantFile.is_open()) {
+ cerr << "could not open VCF file" << endl; // diagnostics go to stderr; stdout carries the VCF
+ return 1;
+ }
+
+ string line = "##INFO=<ID=" + annotationInfoKey + ",Number=1,Type=String,Description=\"Annotation from "
+ + bedFileName + " delimited by ':'\">";
+ variantFile.addHeaderLine(line);
+
+ cout << variantFile.header << endl;
+
+ Variant var(variantFile);
+ while (variantFile.getNextVariant(var)) {
+ BedTarget record(var.sequenceName, var.position, var.position + var.ref.size() - 1, "");
+ vector<BedTarget*> overlaps = bed.targetsOverlapping(record);
+ vector<string> annotations;
+ if (!overlaps.empty()) {
+ for (vector<BedTarget*>::iterator t = overlaps.begin(); t != overlaps.end(); ++t) {
+ annotations.push_back((*t)->desc);
+ }
+ var.info[annotationInfoKey].push_back(join(annotations, ":"));
+ } else if (!defaultAnnotationValue.empty()) {
+ var.info[annotationInfoKey].push_back(defaultAnnotationValue);
+ }
+ cout << var << endl;
+ }
+
+ return 0;
+
+}
diff --git a/src/vcfannotategenotypes.cpp b/src/vcfannotategenotypes.cpp
new file mode 100644
index 0000000..1bb65d3
--- /dev/null
+++ b/src/vcfannotategenotypes.cpp
@@ -0,0 +1,220 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <list>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+void annotateWithBlankGenotypes(Variant& var, string& annotationTag) {
+ // Passes annotationTag to addFormatField(), then leaves every sample with
+ // exactly one value under that tag: the "no genotype" placeholder "./.".
+ var.addFormatField(annotationTag);
+
+ typedef map<string, map<string, vector<string> > > SampleTable;
+ for (SampleTable::iterator entry = var.samples.begin(); entry != var.samples.end(); ++entry) {
+ vector<string>& tagValues = entry->second[annotationTag];
+ // discard whatever was stored under the tag before writing the placeholder
+ tagValues.clear();
+ tagValues.push_back("./.");
+ }
+}
+
+void annotateWithGenotypes(Variant& varA, Variant& varB, string& annotationTag) {
+ // For each sample of varA, store under annotationTag that sample's varB genotype renumbered to varA's alleles, or "./." if varB has none.
+ varA.addFormatField(annotationTag);
+ map<string, map<string, vector<string> > >::iterator s = varA.samples.begin();
+ map<string, map<string, vector<string> > >::iterator sEnd = varA.samples.end();
+ map<string, int> varAAlleleInts;
+ int i = 1;
+ for (vector<string>::iterator a = varA.alt.begin(); a != varA.alt.end(); ++a, ++i) {
+ varAAlleleInts[*a] = i;
+ }
+
+ map<int, int> varBconvertToVarA; // maps alleles in the second file to allele numbers for the first
+ varBconvertToVarA[0] = 0; // reference == reference!
+ i = 1;
+ for (vector<string>::iterator a = varB.alt.begin(); a != varB.alt.end(); ++a, ++i) {
+ map<string, int>::iterator ita = varAAlleleInts.find(*a);
+ if (ita != varAAlleleInts.end()) {
+ varBconvertToVarA[i] = ita->second;
+ }
+ }
+
+ for (; s != sEnd; ++s) {
+ map<string, vector<string> >& sample = s->second;
+ const string& name = s->first;
+ map<string, map<string, vector<string> > >::iterator o = varB.samples.find(name);
+ sample[annotationTag].clear();
+ // varB may lack this sample, or its sample may carry no GT value; front() on an empty vector is undefined
+ vector<string>* otherGt = 0;
+ if (o != varB.samples.end() && o->second.find("GT") != o->second.end()) otherGt = &o->second["GT"];
+ if (otherGt == 0 || otherGt->empty()) {
+ sample[annotationTag].push_back("./."); // means "no genotype"
+ } else {
+ string& otherGenotype = otherGt->front();
+ // XXX this must compare the genotypes in the two files
+ map<int, int> gtB = decomposeGenotype(otherGenotype);
+ map<int, int> gtnew;
+ for (map<int, int>::iterator g = gtB.begin(); g != gtB.end(); ++g) {
+ map<int, int>::iterator f = varBconvertToVarA.find(g->first);
+ if (f != varBconvertToVarA.end()) {
+ gtnew[f->second] += g->second;
+ } else {
+ gtnew[-1] += g->second; // allele of varB that has no counterpart in varA
+ }
+ }
+ sample[annotationTag].push_back(genotypeToString(gtnew));
+ }
+ }
+}
+
+int main(int argc, char** argv) {
+ // Annotate the samples of the first VCF with the genotypes found at the same sequence/position in the second VCF.
+ if (argc != 4) {
+ cerr << "usage: " << argv[0] << " <annotation-tag> <vcf file> <vcf file>" << endl
+ << "annotates genotypes in the first file with genotypes in the second" << endl
+ << "adding the genotype as another flag to each sample filed in the first file." << endl
+ << "annotation-tag is the name of the sample flag which is added to store the annotation." << endl
+ << "also adds a 'has_variant' flag for sites where the second file has a variant." << endl;
+ return 1;
+ }
+
+ string annotag = argv[1];
+ string filenameA = argv[2];
+ string filenameB = argv[3];
+
+ if (filenameA == filenameB) {
+ cerr << "it won't help to annotate samples with their own genotypes!" << endl;
+ return 1;
+ }
+ // a file name of "-" selects standard input
+ VariantCallFile variantFileA;
+ if (filenameA == "-") {
+ variantFileA.open(std::cin);
+ } else {
+ variantFileA.open(filenameA);
+ }
+
+ VariantCallFile variantFileB;
+ if (filenameB == "-") {
+ variantFileB.open(std::cin);
+ } else {
+ variantFileB.open(filenameB);
+ }
+
+ if (!variantFileA.is_open() || !variantFileB.is_open()) {
+ return 1;
+ }
+
+ Variant varA(variantFileA);
+ Variant varB(variantFileB);
+
+ // while the first file doesn't match the second positionally,
+ // step forward, annotating each genotype record with an empty genotype
+ // when the two match, iterate through the genotypes from the first file
+ // and get the genotypes reported in the second file
+ // prime both readers with their first record (this happens before the header is printed)
+ variantFileA.getNextVariant(varA);
+ variantFileB.getNextVariant(varB);
+ // declare the INFO flag and FORMAT field added by this tool, then emit file A's header
+ string line = "##INFO=<ID=" + annotag + ".has_variant,Number=0,Type=Flag,Description=\"True if "
+ + annotag + " has a called alternate among samples under comparison.\">";
+ variantFileA.addHeaderLine(line);
+ line = "##FORMAT=<ID=" + annotag + ",Number=1,Type=String,Description=\"Genotype from "
+ + annotag + ".\">";
+ variantFileA.addHeaderLine(line);
+
+ cout << variantFileA.header << endl;
+
+ do {
+
+ // this is broken. to do it right, it'll be necessary to get reference ids from the fasta reference used to make the alignments...
+ // if B is NOT done, and is less than A, read new B.
+ if (!variantFileB.done()
+ && (varB.sequenceName != varA.sequenceName
+ || (varB.sequenceName == varA.sequenceName && varB.position < varA.position)
+ || variantFileA.done())
+ ) {
+ variantFileB.getNextVariant(varB);
+ }
+
+ // if A is not done- and A is less than B, read A.
+ // should also read if variant B is done.
+ if (!variantFileA.done()
+ && (varA.sequenceName != varB.sequenceName
+ || (varA.sequenceName == varB.sequenceName && varA.position < varB.position)
+ || variantFileB.done())
+ ) {
+ cout << varA << endl;
+ variantFileA.getNextVariant(varA);
+ }
+ // collect the run of consecutive records on which A and B agree in sequence name and position
+ vector<Variant> varsA;
+ vector<Variant> varsB;
+
+ bool hasMultipleAlts = false;
+
+ long int thisPosition = 0;
+ string thisSequenceName;
+ if (varA.position == varB.position
+ && varA.sequenceName == varB.sequenceName) {
+ thisPosition = varA.position;
+ thisSequenceName = varA.sequenceName;
+ }
+ while (!variantFileA.done()
+ && !variantFileB.done()
+ && thisPosition == varA.position
+ && thisSequenceName == varA.sequenceName
+ && varA.sequenceName == varB.sequenceName
+ && varA.position == varB.position) {
+ // accumulate all the alts at the current position
+ varsA.push_back(varA);
+ varsB.push_back(varB);
+ if (varA.alt.size() > 1 || varB.alt.size() > 1)
+ hasMultipleAlts = true;
+ variantFileA.getNextVariant(varA);
+ variantFileB.getNextVariant(varB);
+ }
+
+ // multiple lines per position
+ if (!hasMultipleAlts && (varsA.size() > 1 || varsB.size() > 1)) {
+ // key the collected records by (REF, first ALT) so A's records can be looked up in B
+ map<pair<string, string>, Variant> varsAParsed;
+ map<pair<string, string>, Variant> varsBParsed;
+ for (vector<Variant>::iterator v = varsA.begin(); v != varsA.end(); ++v) {
+ varsAParsed[make_pair(v->ref, v->alt.front())] = *v;
+ }
+ for (vector<Variant>::iterator v = varsB.begin(); v != varsB.end(); ++v) {
+ varsBParsed[make_pair(v->ref, v->alt.front())] = *v;
+ }
+
+ for (map<pair<string, string>, Variant>::iterator vs = varsAParsed.begin(); vs != varsAParsed.end(); ++vs) {
+ Variant& varA = vs->second; // NOTE(review): shadows main's varA inside this loop
+ if (varsBParsed.find(make_pair(varA.ref, varA.alt.front())) != varsBParsed.end()) {
+ Variant& varB = varsBParsed[make_pair(varA.ref, varA.alt.front())]; // TODO cleanup
+ annotateWithGenotypes(varA, varB, annotag);
+ varA.infoFlags[annotag + ".has_variant"] = true;
+ } else {
+ annotateWithBlankGenotypes(varA, annotag);
+ }
+ cout << varA << endl;
+ }
+
+ } else if (!varsA.empty() && !varsB.empty()) { // one line per multi-allelic
+ Variant& varA = varsA.front(); // NOTE(review): shadows main's varA/varB inside this branch
+ Variant& varB = varsB.front();
+ annotateWithGenotypes(varA, varB, annotag);
+ // XXX TODO, and also allow for records with multiple alts
+ // XXX assume that if the other file has a corresponding record, some kind of variation was detected at the same site
+ varA.infoFlags[annotag + ".has_variant"] = true;
+ cout << varA << endl;
+ }
+
+ } while (!variantFileA.done() || !variantFileB.done());
+
+ return 0;
+
+}
+
diff --git a/src/vcfbreakmulti.cpp b/src/vcfbreakmulti.cpp
new file mode 100644
index 0000000..f462111
--- /dev/null
+++ b/src/vcfbreakmulti.cpp
@@ -0,0 +1,114 @@
+#include "Variant.h"
+#include "convert.h"
+#include <set>
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+
+double convertStrDbl(const string& s) { // parse s as a double through convert()
+ double r = 0; // start from 0 so a convert() that assigns nothing cannot leak an uninitialized value
+ convert(s, r);
+ return r;
+}
+
+void printSummary(char** argv) { // usage text goes to stderr, then the process exits with status 0
+ const char* program = argv[0];
+ cerr << "usage: " << program << " [options] [file]" << endl << endl;
+ cerr << "If multiple alleles are specified in a single record, break the record into" << endl;
+ cerr << "multiple lines, preserving allele-specific INFO fields." << endl;
+ exit(0);
+}
+
+int main(int argc, char** argv) {
+ // Echo the VCF header, then every record. A record with several ALT alleles
+ // is written as one copy per ALT; each copy keeps only that ALT, the other
+ // alleles being stripped with removeAlt().
+
+ VariantCallFile variantFile;
+
+ int c;
+ while (true) {
+ static struct option long_options[] =
+ {
+ // --help is the only long option; its flag pointer is null, so
+ // getopt_long returns 'h' for it
+ {"help", no_argument, 0, 'h'},
+ {0, 0, 0, 0}
+ };
+ /* getopt_long stores the option index here. */
+ int option_index = 0;
+
+ c = getopt_long (argc, argv, "h",
+ long_options, &option_index);
+
+ if (c == -1)
+ break;
+
+ switch (c) {
+
+ case 'h':
+ printSummary(argv);
+ break;
+
+ case '?':
+ printSummary(argv);
+ exit(1);
+ break;
+
+ default:
+ abort ();
+ }
+ }
+
+ if (optind < argc) {
+ string filename = argv[optind];
+ variantFile.open(filename);
+ } else {
+ variantFile.open(std::cin);
+ }
+
+ if (!variantFile.is_open()) {
+ return 1;
+ }
+
+ cout << variantFile.header << endl;
+
+ Variant var(variantFile);
+ while (variantFile.getNextVariant(var)) {
+
+ int numalt = var.alt.size();
+ // nothing to break apart; this also keeps a record with no ALT entries instead of dropping it
+ if (numalt <= 1) {
+ cout << var << endl;
+ continue;
+ }
+ // start from one full copy of the record per ALT allele
+ vector<Variant> variants;
+ for (int i = 0; i < numalt; ++i) {
+ variants.push_back(var);
+ }
+ // copy i keeps ALT i: every other ALT is removed from it
+ for (int i = 0; i < numalt; ++i) {
+ Variant& v = variants.at(i);
+ vector<string> altsToRemove;
+ for (int j = 0; j < numalt; ++j) {
+ if (j != i) {
+ altsToRemove.push_back(var.alt.at(j));
+ }
+ }
+ for (vector<string>::iterator a = altsToRemove.begin(); a != altsToRemove.end(); ++a) {
+ v.removeAlt(*a);
+ }
+ }
+
+ for (vector<Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
+ cout << *v << endl;
+ }
+ }
+
+ return 0;
+
+}
+
diff --git a/src/vcfcat.cpp b/src/vcfcat.cpp
new file mode 100644
index 0000000..cf40921
--- /dev/null
+++ b/src/vcfcat.cpp
@@ -0,0 +1,34 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) {
+ // Write the records of every VCF named on the command line to stdout, in
+ // argument order; only the header of the first file is written.
+ if (argc == 1) {
+ cout << "usage: " << argv[0] << " [file1] [file2] ... [fileN]" << endl;
+ cout << "Concatenates VCF files." << endl;
+ return 0;
+ }
+
+ for (int fileIndex = 1; fileIndex < argc; ++fileIndex) {
+ string path = argv[fileIndex];
+ VariantCallFile input;
+ input.open(path);
+ if (!input.is_open()) {
+ cerr << "could not open " << argv[fileIndex] << endl;
+ return 1;
+ }
+ if (fileIndex == 1) {
+ cout << input.header << endl;
+ }
+ Variant record(input);
+ while (input.getNextVariant(record)) {
+ cout << record << endl;
+ }
+ }
+
+ return 0;
+}
+
diff --git a/src/vcfcheck.cpp b/src/vcfcheck.cpp
new file mode 100644
index 0000000..d370ae1
--- /dev/null
+++ b/src/vcfcheck.cpp
@@ -0,0 +1,139 @@
+#include "Variant.h"
+#include "split.h"
+#include "fastahack/Fasta.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+void printSummary(char** argv) { // write the usage text to stderr and exit with status 0
+ cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
+ << endl
+ << "options:" << endl
+ << " -f, --fasta-reference FASTA reference file to compare the REF field against" << endl
+ << " -x, --exclude-failures If a record fails, don't print it. Otherwise do." << endl
+ << " -k, --keep-failures Print if the record fails, otherwise not." << endl
+ << endl
+ << "Verifies that the VCF REF field matches the reference as described." << endl
+ << endl;
+ exit(0);
+}
+
+
+int main(int argc, char** argv) {
+ // Check every VCF record's REF allele against a FASTA reference.
+ // Default output: one "mismatched reference" line per failing record.
+ // With -k the failing records themselves are printed; with -x only the
+ // records whose REF matches are printed. Either option also prints the header.
+ int c;
+ string fastaRef;
+ bool keepFailures = false;
+ bool excludeFailures = false;
+
+ // no arguments at all: show the usage text
+ if (argc == 1)
+ printSummary(argv);
+
+ while (true) {
+ static struct option long_options[] =
+ {
+ // every entry has a null flag pointer, so getopt_long returns the
+ // option's letter and never 0
+ {"help", no_argument, 0, 'h'},
+ {"fasta-reference", required_argument, 0, 'f'},
+ {"exclude-failures", no_argument, 0, 'x'},
+ {"keep-failures", no_argument, 0, 'k'},
+ {0, 0, 0, 0}
+ };
+ /* getopt_long stores the option index here. */
+ int option_index = 0;
+
+ c = getopt_long (argc, argv, "hxkf:",
+ long_options, &option_index);
+
+ /* Detect the end of the options. */
+ if (c == -1)
+ break;
+
+ switch (c)
+ {
+ case 'f':
+ fastaRef = optarg;
+ break;
+
+ case 'x':
+ excludeFailures = true;
+ break;
+
+ case 'k':
+ keepFailures = true;
+ break;
+
+ case 'h':
+ printSummary(argv);
+ break;
+
+ case '?':
+ /* getopt_long already printed an error message. */
+ // NOTE(review): printSummary() ends in exit(0), so the exit(1) below is not reached
+ printSummary(argv);
+ exit(1);
+ break;
+
+ default:
+ abort ();
+ }
+ }
+
+ // the reference is mandatory: without it there is nothing to compare REF with
+ if (fastaRef.empty()) {
+ cerr << "a FASTA reference sequence must be specified" << endl;
+ exit(1);
+ }
+
+ FastaReference ref;
+ ref.open(fastaRef);
+
+ // read the VCF from the last argument when exactly one is left, otherwise from stdin
+ VariantCallFile variantFile;
+ string inputFilename;
+ if (optind == argc - 1) {
+ inputFilename = argv[optind];
+ variantFile.open(inputFilename);
+ } else {
+ variantFile.open(std::cin);
+ }
+
+ if (!variantFile.is_open()) {
+ return 1;
+ }
+
+ // the header is only written when the output consists of VCF records
+ if (keepFailures || excludeFailures) {
+ cout << variantFile.header << endl;
+ }
+
+ // fetch, for each record, as many reference bases as REF is long,
+ // starting at the record's position, and compare them with REF
+ Variant var(variantFile);
+ while (variantFile.getNextVariant(var)) {
+ int refstart = var.position - 1; // convert to 0-based
+ string matchedRef = ref.getSubSequence(var.sequenceName, refstart, var.ref.size());
+ if (var.ref != matchedRef) {
+ // REF disagrees with the reference
+ if (keepFailures) {
+ cout << var << endl;
+ } else if (!excludeFailures) {
+ cout << "mismatched reference " << var.ref << " should be " << matchedRef << " at "
+ << var.sequenceName << ":" << var.position << endl;
+ }
+ } else if (excludeFailures) {
+ // REF agrees with the reference
+ cout << var << endl;
+ }
+ }
+
+ return 0;
+
+}
+
diff --git a/src/vcfclassify.cpp b/src/vcfclassify.cpp
new file mode 100644
index 0000000..42624ee
--- /dev/null
+++ b/src/vcfclassify.cpp
@@ -0,0 +1,162 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <sstream>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+// Returns true when ref -> alt is a transition: a single-base purine swap
+// (A<->G) or pyrimidine swap (C<->T). The comparison is exact and
+// case-sensitive, so any other pair of strings yields false.
+bool isTransition(string& ref, string& alt) {
+    const bool purineSwap = (ref == "A" && alt == "G") || (ref == "G" && alt == "A");
+    const bool pyrimidineSwap = (ref == "C" && alt == "T") || (ref == "T" && alt == "C");
+    return purineSwap || pyrimidineSwap;
+}
+
+// True if at least one ALT allele of var is a transition relative to REF
+// (as decided by isTransition).
+bool hasTransition(Variant& var) {
+    for (size_t i = 0; i < var.alt.size(); ++i) {
+        if (isTransition(var.ref, var.alt[i])) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// True if at least one ALT allele of var is a transversion relative to REF.
+//
+// A transversion is a single-base substitution that is not a transition, so
+// only alleles where both REF and ALT are one base long and differ are
+// considered. Previously every ALT that was not a transition -- including
+// insertions, deletions, MNPs and ALT == REF -- was reported as a
+// transversion, which contradicted the "transversion SNP" header text.
+bool hasTransversion(Variant& var) {
+    string& ref = var.ref;
+    if (ref.size() != 1) {
+        // no ALT can be a single-base substitution of a multi-base REF
+        return false;
+    }
+    for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+        string& alt = *a;
+        if (alt.size() == 1 && alt != ref && !isTransition(ref, alt)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// True if any ALT allele is longer than REF (length comparison only).
+bool hasInsertion(Variant& var) {
+    const size_t refLen = var.ref.size();
+    for (size_t i = 0; i < var.alt.size(); ++i) {
+        if (var.alt[i].size() > refLen) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// True if any ALT allele is shorter than REF (length comparison only).
+bool hasDeletion(Variant& var) {
+    const size_t refLen = var.ref.size();
+    for (size_t i = 0; i < var.alt.size(); ++i) {
+        if (var.alt[i].size() < refLen) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// True if REF is longer than one base and some ALT allele has exactly the
+// same length as REF. Only lengths are compared, not the bases themselves.
+bool hasMNP(Variant& var) {
+    const size_t refLen = var.ref.size();
+    if (refLen <= 1) {
+        // a REF of at most one base can never satisfy the MNP condition
+        return false;
+    }
+    for (size_t i = 0; i < var.alt.size(); ++i) {
+        if (var.alt[i].size() == refLen) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// True if REF is exactly one base long and some ALT allele is also exactly
+// one base long. Only lengths are compared, not the bases themselves.
+bool hasSNP(Variant& var) {
+    if (var.ref.size() != 1) {
+        return false;
+    }
+    for (size_t i = 0; i < var.alt.size(); ++i) {
+        if (var.alt[i].size() == 1) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// vcfclassify entry point.
+//
+// Takes exactly one argument: a VCF file name, or "-" to read standard input.
+// Declares the SNP/TS/TV/INS/DEL/MNP INFO flags in the header, then prints
+// every record to stdout with the flags set for the allele classes found by
+// the has*() predicates. Returns 1 on bad usage or if the input cannot be
+// opened, 0 otherwise.
+int main(int argc, char** argv) {
+
+    if (argc != 2) {
+        cerr << "usage: " << argv[0] << " <vcf file>" << endl
+             << "outputs a VCF stream each variant is tagged by allele class: snp, ts/tv, indel, mnp" << endl;
+        return 1;
+    }
+
+    string filename = argv[1];
+
+    VariantCallFile variantFile;
+    if (filename != "-") {
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open " << filename << endl;
+        return 1;
+    }
+
+    Variant var(variantFile);
+
+    // INFO declarations for the flags this tool may set, in output order
+    static const char* const flagHeaders[] = {
+        "##INFO=<ID=SNP,Number=0,Type=Flag,Description=\"SNP allele\">",
+        "##INFO=<ID=TS,Number=0,Type=Flag,Description=\"transition SNP\">",
+        "##INFO=<ID=TV,Number=0,Type=Flag,Description=\"transversion SNP\">",
+        "##INFO=<ID=INS,Number=0,Type=Flag,Description=\"insertion allele\">",
+        "##INFO=<ID=DEL,Number=0,Type=Flag,Description=\"deletion allele\">",
+        "##INFO=<ID=MNP,Number=0,Type=Flag,Description=\"MNP allele\">"
+    };
+    const size_t headerCount = sizeof(flagHeaders) / sizeof(flagHeaders[0]);
+    for (size_t i = 0; i < headerCount; ++i) {
+        string line = flagHeaders[i];
+        variantFile.addHeaderLine(line);
+    }
+    // TODO handle lengths at poly-allelic sites
+    //line = "##INFO=<ID=LEN,Number=1,Type=Integer,Description=\"allele length\">";
+    //variantFile.addHeaderLine(line);
+
+    // write the new header
+    cout << variantFile.header << endl;
+
+    // flag name paired with the predicate that decides whether to set it
+    struct AlleleClass {
+        const char* flag;
+        bool (*present)(Variant&);
+    };
+    static const AlleleClass classes[] = {
+        { "SNP", hasSNP },
+        { "TS",  hasTransition },
+        { "TV",  hasTransversion },
+        { "INS", hasInsertion },
+        { "DEL", hasDeletion },
+        { "MNP", hasMNP }
+    };
+    const size_t classCount = sizeof(classes) / sizeof(classes[0]);
+
+    while (variantFile.getNextVariant(var)) {
+        for (size_t i = 0; i < classCount; ++i) {
+            if (classes[i].present(var)) {
+                var.infoFlags[classes[i].flag] = true;
+            }
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfcleancomplex.cpp b/src/vcfcleancomplex.cpp
new file mode 100644
index 0000000..c0b3c71
--- /dev/null
+++ b/src/vcfcleancomplex.cpp
@@ -0,0 +1,71 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <sstream>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+
+// vcfcleancomplex entry point.
+//
+// Takes exactly one argument: a VCF file name, or "-" to read standard input.
+// Echoes the header and every record to stdout. For a record with a single
+// ALT whose parsed decomposition contains exactly one real change (ref != alt)
+// preceded by a matching (ref == alt) leading piece, the record's position,
+// REF and ALT are rewritten to describe just that change. Returns 1 on bad
+// usage or if the input cannot be opened, 0 otherwise.
+int main(int argc, char** argv) {
+
+    if (argc != 2) {
+        // the first literal ends in a space so the two halves of the sentence
+        // do not run together ("non-complexalleles") when printed
+        cerr << "usage: " << argv[0] << " <vcf file>" << endl
+             << "outputs a VCF stream in which 'long' non-complex "
+             << "alleles have their position corrected." << endl
+             << "assumes that VCF records can't overlap 5'->3'" << endl;
+        return 1;
+    }
+
+    string filename = argv[1];
+
+    VariantCallFile variantFile;
+    if (filename == "-") {
+        variantFile.open(std::cin);
+    } else {
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open " << filename << endl;
+        return 1;
+    }
+
+    Variant var(variantFile);
+
+    // the header is passed through unchanged
+    cout << variantFile.header << endl;
+
+    // print every record, trimming it first when it qualifies
+    while (variantFile.getNextVariant(var)) {
+        // decompose the alleles into their primitive pieces
+        map<string, vector<VariantAllele> > parsedAlts = var.parsedAlternates(true, true); // use mnps, and previous for indels
+        // only single-ALT records whose decomposition has more than one entry are candidates
+        //cerr << var.alt.size() << " " << parsedAlts.size() << endl;
+        if (var.alt.size() == 1 && parsedAlts.size() > 1) {
+            string& alternate = var.alt.front();
+            vector<VariantAllele>& vs = parsedAlts[alternate];
+            // keep only the pieces that actually change the sequence
+            vector<VariantAllele> valleles;
+            for (vector<VariantAllele>::iterator a = vs.begin(); a != vs.end(); ++a) {
+                if (a->ref != a->alt) {
+                    valleles.push_back(*a);
+                }
+            }
+            if (valleles.size() == 1) {
+                // do we have extra sequence hanging around?
+                VariantAllele& varallele = valleles.front();
+                if (vs.front().ref == vs.front().alt) {
+                    // drop the matching leading piece: move the record to the
+                    // real change and cut REF/ALT down to it
+                    var.position = varallele.position;
+                    var.ref = var.ref.substr(vs.front().ref.size(), varallele.ref.size());
+                    var.alt.front() = varallele.alt;
+                }
+            }
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfcombine.cpp b/src/vcfcombine.cpp
new file mode 100644
index 0000000..09201c4
--- /dev/null
+++ b/src/vcfcombine.cpp
@@ -0,0 +1,207 @@
+#include "Variant.h"
+#include <getopt.h>
+#include <utility>
+
+
+using namespace std;
+using namespace vcf;
+
+// Prints the vcfcombine usage text to stderr and terminates the process with
+// exit status 1. Never returns. argv[0] is used as the program name.
+void printSummary(char** argv) {
+    // everything after the usage line, one entry per output line
+    static const char* const helpLines[] = {
+        "",
+        "Combines VCF files positionally, combining samples when sites and alleles are identical.",
+        "Any number of VCF files may be combined. The INFO field and other columns are taken from",
+        "one of the files which are combined when records in multiple files match. Alleles must",
+        "have identical ordering to be combined into one record. If they do not, multiple records",
+        "will be emitted.",
+        "",
+        "options:",
+        " -h --help This text.",
+        " -r --region REGION A region specifier of the form chrN:x-y to bound the merge"
+    };
+    cerr << "usage: " << argv[0] << " [vcf file] [vcf file] ..." << endl;
+    for (size_t i = 0; i < sizeof(helpLines) / sizeof(helpLines[0]); ++i) {
+        cerr << helpLines[i] << endl;
+    }
+    exit(1);
+}
+
+int main(int argc, char** argv) { // vcfcombine: merges the VCF files named on the command line by chromosome/position/alts and prints the combined VCF to stdout
+
+ if (argc < 2) { // no arguments at all: show usage
+ printSummary(argv); // prints the usage text and calls exit(1)
+ }
+
+ string region; // value of -r/--region; stays empty when no region was requested
+
+ int c; // option character returned by getopt_long
+ while (true) {
+ static struct option long_options[] =
+ {
+ /* Long options and the short option character each one maps to. */
+ //{"verbose", no_argument, &verbose_flag, 1},
+ {"help", no_argument, 0, 'h'},
+ {"region", required_argument, 0, 'r'},
+ {0, 0, 0, 0}
+ };
+ /* getopt_long stores the option index here. */
+ int option_index = 0;
+
+ c = getopt_long (argc, argv, "hr:",
+ long_options, &option_index);
+
+ if (c == -1) // -1 marks the end of the options
+ break;
+
+ switch (c) {
+
+ case 'h':
+ printSummary(argv); // exits with status 1, so the break below is not reached
+ break;
+
+ case 'r':
+ region = optarg;
+ break;
+
+ case '?':
+ printSummary(argv); // already exits with status 1; the exit(1) below is not reached
+ exit(1);
+ break;
+
+ default:
+ abort ();
+ }
+ }
+
+ vector<string> sampleNames; // sample names collected from every input; sorted and de-duplicated below
+ string randomHeader; // NOTE(review): not used anywhere in this function
+ VariantCallFile* vcf; // the input file currently being opened
+
+ // structure to track ordered variants
+ //ChromNameCompare chromCompare;
+
+ typedef
+ map<vector<string>, // alts
+ map<VariantCallFile*, Variant*> >
+ Position; // at one position: alt list -> (input file -> that file's current record)
+
+ typedef
+ map<long int, Position>
+ ChromVariants; // on one chromosome: position -> Position
+
+ typedef
+ map<string, // chrom
+ ChromVariants,
+ ChromNameCompare>
+ VariantsByChromPosAltFile; // chromosome name -> ChromVariants, ordered by ChromNameCompare
+
+ VariantsByChromPosAltFile variantsByChromPosAltFile; // holds the pending (next unread) record of each input file
+
+ VariantCallFile* firstVCF = NULL; // first input that opened; NOTE(review): only assigned, its single reader below is commented out
+ VCFHeader vcf_header; // accumulates header lines and columns from all inputs
+ for (int i = optind; i != argc; ++i) { // every non-option argument is an input file name
+ string inputFilename = argv[i];
+ vcf = new VariantCallFile; // heap-allocated because the merge table is keyed on this pointer; not deleted on the success path
+ vcf->open(inputFilename);
+ if (!region.empty()) { // NOTE(review): setRegion is attempted before is_open() is checked
+ if (!vcf->setRegion(region)) {
+ cerr << "could not set region on " << inputFilename << endl;
+ delete vcf;
+ continue; // this input is skipped, the merge goes on with the others
+ }
+ }
+ if (vcf->is_open()) { // NOTE(review): an input that failed to open is skipped without a message and its VariantCallFile is not deleted
+ Variant* var = new Variant(*vcf); // record buffer for this file, reused for each of its records; NOTE(review): not deleted if the file has no records
+ if (vcf->getNextVariant(*var)) { // seed the table with the file's first record
+ variantsByChromPosAltFile[var->sequenceName][var->position][var->alt][vcf] = var;
+ sampleNames.insert(sampleNames.end(), vcf->sampleNames.begin(), vcf->sampleNames.end());
+ // the first file is tracked for header generation
+ }
+ // populate the vcf_header with header_lines from this vcf file
+ vector<string> header_lines = split(vcf->vcf_header, "\n");
+ if (header_lines.size() > 0)
+ {
+ // populate the meta information lines
+ string column_headers_line; // the line containing "#CHROM", if one is seen
+ for (vector<string>::const_iterator meta_iter = header_lines.begin(); meta_iter != header_lines.end(); ++meta_iter)
+ {
+ vcf_header.addMetaInformationLine(*meta_iter); // every header line is passed here, including the #CHROM line
+ if ((*meta_iter).find("#CHROM") != string::npos) // store the header column position
+ {
+ column_headers_line = (*meta_iter);
+ }
+ }
+ if (column_headers_line.size() > 0) // if there are header columns then add them
+ {
+ vector<string> header_columns = split(column_headers_line, "\t");
+ for (vector<string>::const_iterator column_iter = header_columns.begin(); column_iter != header_columns.end(); ++column_iter)
+ {
+ vcf_header.addHeaderColumn(*column_iter);
+ }
+ }
+ }
+
+ if (firstVCF == NULL) firstVCF = vcf;
+ }
+ }
+
+ // get sorted, unique samples in all files
+ sort(sampleNames.begin(), sampleNames.end());
+ sampleNames.erase(unique(sampleNames.begin(), sampleNames.end()), sampleNames.end());
+
+ // now that we've accumulated the sample information we can generate the combined header
+ VariantCallFile outputCallFile; // never reads input; carries the combined header for the output records
+// string header = firstVCF->headerWithSampleNames(sampleNames);
+ string header = vcf_header.getHeaderString();
+
+ outputCallFile.openForOutput(header);
+
+ cout << outputCallFile.header << endl;
+
+ while (!variantsByChromPosAltFile.empty()) { // runs until every input is exhausted
+ // get lowest variant(s)
+ // if they have identical alts and position, combine
+ // otherwise just output, but with the same sample names
+
+ ChromVariants& chrom = variantsByChromPosAltFile.begin()->second; // first chromosome in ChromNameCompare order
+ if (chrom.empty()) { // all of its positions were emitted: drop the chromosome and move on
+ variantsByChromPosAltFile.erase(variantsByChromPosAltFile.begin());
+ continue;
+ }
+
+ Position& pos = chrom.begin()->second; // lowest pending position on that chromosome
+ Position::iterator s = pos.begin();
+ for ( ; s != pos.end(); ++s) { // one output record per distinct alt list at this position
+ Variant variant(outputCallFile);
+ map<VariantCallFile*, Variant*>& vars = s->second; // the files that have a record with this position and alt list
+ map<VariantCallFile*, Variant*>::iterator v = vars.begin();
+ for ( ; v != vars.end(); ++v) {
+ VariantCallFile* vcf = v->first; // shadows the outer 'vcf' declared above
+ Variant* var = v->second;
+ //if (variant.info.empty()) {
+ if (v == vars.begin()) { // set these using the first matching variant
+ variant.sequenceName = var->sequenceName;
+ variant.position = var->position;
+ variant.id = var->id;
+ variant.ref = var->ref;
+ variant.alt = var->alt;
+ variant.filter = var->filter;
+ variant.quality = var->quality;
+ variant.info = var->info;
+ variant.format = var->format;
+ }
+ // add samples to output variant
+ for (Samples::iterator sample = var->samples.begin(); sample != var->samples.end(); ++sample) {
+ variant.samples[sample->first] = sample->second; // a later file overwrites an earlier one for the same sample name
+ }
+ if (vcf->getNextVariant(*var)) { // refill: read this file's next record into the same buffer and re-insert it
+ variantsByChromPosAltFile[var->sequenceName][var->position][var->alt][vcf] = var; // NOTE(review): if the next record has the same chrom/position as the current one, it looks like it is inserted into the entry erased below and the rest of that file is dropped - confirm
+ }
+ }
+ // what was this check for?
+ //if (!variant.info.empty())
+ cout << variant << endl;
+ }
+ // pop the position that was just emitted (the lowest one)
+ chrom.erase(chrom.begin()); // NOTE(review): presumably assumes each input is position-sorted so chrom.begin() is still the emitted position - confirm
+ }
+
+ return 0;
+
+}
+
diff --git a/src/vcfcommonsamples.cpp b/src/vcfcommonsamples.cpp
new file mode 100644
index 0000000..ee594cb
--- /dev/null
+++ b/src/vcfcommonsamples.cpp
@@ -0,0 +1,85 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+// Returns the elements that occur in both a and b. The result contains each
+// common element once and is ordered by T's operator< (it is read back out
+// of a std::map). Neither input is modified.
+template<class T>
+vector<T> intersection(vector<T>& a, vector<T>& b) {
+    // membership table for a
+    map<T, bool> seenInA;
+    for (size_t i = 0; i < a.size(); ++i) {
+        seenInA[a[i]] = true;
+    }
+
+    // elements of b that are also in a; the map de-duplicates and orders them
+    map<T, bool> shared;
+    for (size_t j = 0; j < b.size(); ++j) {
+        if (seenInA.count(b[j]) != 0) {
+            shared[b[j]] = true;
+        }
+    }
+
+    vector<T> result;
+    for (typename map<T, bool>::const_iterator it = shared.begin(); it != shared.end(); ++it) {
+        result.push_back(it->first);
+    }
+    return result;
+}
+
+// vcfcommonsamples entry point.
+//
+// Takes two VCF file names (either may be "-" for standard input, but they
+// must differ). Prints the first file's header and records restricted to the
+// samples that also appear in the second file. Returns 1 on bad usage, when
+// both names are the same, or when either file cannot be opened; 0 otherwise.
+int main(int argc, char** argv) {
+
+    if (argc != 3) {
+        cerr << "usage: " << argv[0] << " <vcf file> <vcf file>" << endl
+             << "outputs each record in the first file, removing samples not present in the second" << endl;
+        return 1;
+    }
+
+    string filenameA = argv[1];
+    string filenameB = argv[2];
+
+    if (filenameA == filenameB) {
+        cerr << "you're just spinning your wheels matching the samples in "
+             << filenameA << " to the samples in " << filenameB << endl;
+        return 1;
+    }
+
+    // both inputs are opened before either is checked
+    VariantCallFile variantFileA;
+    VariantCallFile variantFileB;
+
+    if (filenameA != "-") {
+        variantFileA.open(filenameA);
+    } else {
+        variantFileA.open(std::cin);
+    }
+
+    if (filenameB != "-") {
+        variantFileB.open(filenameB);
+    } else {
+        variantFileB.open(std::cin);
+    }
+
+    const bool bothOpen = variantFileA.is_open() && variantFileB.is_open();
+    if (!bothOpen) {
+        return 1;
+    }
+
+    Variant varA(variantFileA);
+    Variant varB(variantFileB);
+
+    // samples shared by the two files
+    vector<string> commonSamples = intersection(variantFileA.sampleNames, variantFileB.sampleNames);
+
+    // the header of A lists only the shared samples ...
+    variantFileA.updateSamples(commonSamples);
+
+    // ... and so does every record printed through varA
+    varA.setOutputSampleNames(commonSamples);
+
+    cout << variantFileA.header << endl;
+
+    // the sample filtering happens inside varA's output, configured above
+    for (;;) {
+        if (!variantFileA.getNextVariant(varA)) {
+            break;
+        }
+        cout << varA << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfcountalleles.cpp b/src/vcfcountalleles.cpp
new file mode 100644
index 0000000..9ce6aae
--- /dev/null
+++ b/src/vcfcountalleles.cpp
@@ -0,0 +1,33 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+// vcfcountalleles entry point.
+//
+// Reads a VCF from the file named by the first argument, or from standard
+// input when no argument is given, and prints the sum of var.alleles.size()
+// over all records. Returns 1 if the input cannot be opened, 0 otherwise.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+
+    if (argc <= 1) {
+        variantFile.open(std::cin);
+    } else {
+        string filename = argv[1];
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    // running total of the per-record allele counts
+    int uniqueAlleles = 0;
+
+    Variant var(variantFile);
+    for (;;) {
+        if (!variantFile.getNextVariant(var)) {
+            break;
+        }
+        uniqueAlleles += var.alleles.size();
+    }
+
+    cout << uniqueAlleles << endl;
+
+    return 0;
+
+}
+
diff --git a/src/vcfcreatemulti.cpp b/src/vcfcreatemulti.cpp
new file mode 100644
index 0000000..d4ac13c
--- /dev/null
+++ b/src/vcfcreatemulti.cpp
@@ -0,0 +1,197 @@
+#include "Variant.h"
+#include "convert.h"
+#include <set>
+#include <sstream>
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+
+// Converts s to a double using convert() and returns the result.
+//
+// The status of convert() is not checked here. r is initialised so that the
+// function never returns an indeterminate value when convert() leaves r
+// untouched; presumably that can happen for non-numeric input -- confirm
+// against convert.h.
+double convertStrDbl(const string& s) {
+    double r = 0.0;
+    convert(s, r);
+    return r;
+}
+
+// Prints the vcfcreatemulti usage text to stderr and terminates the process
+// with exit status 0. Never returns. argv[0] is used as the program name.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] [file]" << endl;
+    cerr << endl;
+    cerr << "If overlapping alleles are represented across multiple records, merge" << endl;
+    cerr << "them into a single record. Currently only for indels." << endl;
+    exit(0);
+}
+
+// Collapses a run of overlapping records into one multi-allelic record.
+//
+// vars must be non-empty; the caller in this file passes records on one
+// sequence, in input order, each starting before the end of the run so far.
+// A single record is returned unchanged. Otherwise the result is a copy of
+// the first record whose REF is extended to cover the whole run and whose ALT
+// list holds every input ALT padded with the REF bases before and after it.
+// An INFO entry "combined" of the form "<first position>-<last position>" is
+// appended to the result.
+Variant createMultiallelic(vector<Variant>& vars) {
+
+    if (vars.size() == 1) {
+        return vars.front();
+    }
+
+    // get REF
+    // use start position to extend all other alleles
+    int start = vars.front().position;
+    string ref = vars.front().ref;
+
+    for (vector<Variant>::iterator v = vars.begin() + 1; v != vars.end(); ++v) {
+        // sdiff: how far this record's REF reaches past the REF built so far
+        // pdiff: offset inside this record's REF where the new bases begin
+        int sdiff = (v->position + v->ref.size()) - (start + ref.size());
+        int pdiff = (start + ref.size()) - v->position;
+        if (sdiff > 0) {
+            ref.append(v->ref.substr(pdiff, sdiff));
+        }
+    }
+
+    Variant var = vars.front();
+    var.alt.clear();
+    var.ref = ref;
+
+    for (vector<Variant>::iterator v = vars.begin(); v != vars.end(); ++v) {
+        // add alternates and splice them into the reference
+        int p5diff = v->position - var.position;
+        int p3diff = (var.position + var.ref.size()) - (v->position + v->ref.size());
+        string before;
+        string after;
+        if (p5diff > 0) {
+            before = var.ref.substr(0, p5diff);
+        }
+        if (p3diff > 0 && p3diff < var.ref.size()) {
+            after = var.ref.substr(var.ref.size() - p3diff);
+        }
+        // when the record already spans the merged REF both flanks are empty,
+        // so the ALT is copied through unchanged
+        for (vector<string>::iterator a = v->alt.begin(); a != v->alt.end(); ++a) {
+            var.alt.push_back(before + *a + after);
+        }
+    }
+
+    stringstream s;
+    s << vars.front().position << "-" << vars.back().position;
+    var.info["combined"].push_back(s.str());
+
+    return var;
+
+}
+
+int main(int argc, char** argv) { // vcfcreatemulti: streams a VCF and merges runs of overlapping records on one sequence into single records via createMultiallelic
+
+ VariantCallFile variantFile;
+
+ int c; // option character returned by getopt_long
+ while (true) {
+ static struct option long_options[] =
+ {
+ /* Long options and the short option character each one maps to. */
+ //{"verbose", no_argument, &verbose_flag, 1},
+ {"help", no_argument, 0, 'h'},
+ {0, 0, 0, 0}
+ };
+ /* getopt_long stores the option index here. */
+ int option_index = 0;
+
+ c = getopt_long (argc, argv, "h",
+ long_options, &option_index);
+
+ if (c == -1) // -1 marks the end of the options
+ break;
+
+ switch (c) {
+
+ case 'h':
+ printSummary(argv); // exits with status 0, so the break below is not reached
+ break;
+
+ case '?':
+ printSummary(argv); // NOTE(review): this exits with status 0 first, so the exit(1) below is never reached
+ exit(1);
+ break;
+
+ default:
+ abort ();
+ }
+ }
+
+ if (optind < argc) { // first non-option argument is the input file; otherwise read stdin
+ string filename = argv[optind];
+ variantFile.open(filename);
+ } else {
+ variantFile.open(std::cin);
+ }
+
+ if (!variantFile.is_open()) {
+ return 1;
+ }
+
+ variantFile.addHeaderLine("##INFO=<ID=combined,Number=1,Type=String,Description=\"Range of overlapping variants which were combined into this one using vcfcreatemulti.\">");
+
+ cout << variantFile.header << endl;
+
+ bool first = true; // NOTE(review): never read
+ bool already = false; // NOTE(review): never read
+ Variant var(variantFile); // record buffer reused for every input record
+ vector<Variant> vars; // the pending run of overlapping records, in input order
+ string lastSeq; // sequence name of the pending run
+
+ while (variantFile.getNextVariant(var)) {
+
+ if (lastSeq.empty()) { // first record seen
+ lastSeq = var.sequenceName;
+ }
+
+ if (vars.empty()) { // nothing pending yet: this record starts a run
+ vars.push_back(var);
+ continue;
+ } else {
+ int maxpos = vars.front().position + vars.front().ref.size(); // largest position + REF length over the pending run
+ for (vector<Variant>::iterator v = vars.begin(); v != vars.end(); ++v) {
+ if (maxpos < v->position + v->ref.size()) {
+ maxpos = v->position + v->ref.size();
+ }
+ }
+ if (var.sequenceName != lastSeq) { // new sequence: flush the pending run and start a new one
+ Variant result = createMultiallelic(vars);
+ cout << result << endl;
+ vars.clear();
+ lastSeq = var.sequenceName;
+ vars.push_back(var);
+ } else if (var.position < maxpos) { // starts before the pending run ends: it joins the run
+ vars.push_back(var);
+ } else { // no overlap: flush the pending run and start a new one
+ Variant result = createMultiallelic(vars);
+ cout << result << endl;
+ vars.clear();
+ vars.push_back(var);
+ }
+ }
+
+ }
+
+ if (!vars.empty()) { // flush whatever is still pending at end of input
+ Variant result = createMultiallelic(vars);
+ cout << result << endl;
+ }
+
+ return 0;
+
+}
+
diff --git a/src/vcfdistance.cpp b/src/vcfdistance.cpp
new file mode 100644
index 0000000..61e798e
--- /dev/null
+++ b/src/vcfdistance.cpp
@@ -0,0 +1,92 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) { // vcfdistance: reads a VCF from stdin and tags records with the distance to the nearest neighbouring record on the same sequence
+
+ if (argc > 1) { // the tool takes no arguments; any argument prints the usage and fails
+ cerr << "usage: " << argv[0] << " <[vcf file]" << endl
+ << "adds a tag (BasesToClosestVariant) to each variant record which indicates" << endl
+ << "the distance to the nearest variant" << endl;
+ return 1;
+ }
+
+ VariantCallFile variantFile;
+ variantFile.open(std::cin);
+
+ if (!variantFile.is_open()) {
+ return 1;
+ }
+
+ Variant varA(variantFile); // three record buffers that form a sliding window: previous, current, next
+ Variant varB(variantFile);
+ Variant varC(variantFile);
+
+ vector<Variant*> vars; // window order; the pointers are rotated as the window advances
+ vars.push_back(&varA);
+ vars.push_back(&varB);
+ vars.push_back(&varC);
+
+ for (vector<Variant*>::iterator v = vars.begin(); v != vars.end(); ++v) { // try to read the first three records
+ variantFile.getNextVariant(**v); // result ignored; the code below treats an empty sequenceName as "no record" - presumably a failed read leaves it empty, confirm
+ }
+
+ string tag = "BasesToClosestVariant";
+ string line = "##INFO=<ID=" + tag + ",Number=1,Type=Integer,Description=\"" \
+ + "Number of bases to the closest variant in the file.\">";
+ variantFile.addHeaderLine(line);
+
+ cout << variantFile.header << endl;
+
+ if (!vars.at(0)->sequenceName.empty()) { // at least one record
+ if (!vars.at(1)->sequenceName.empty()) {
+ // at least two variants, so calculate the first distance
+ if (vars.at(0)->sequenceName == vars.at(1)->sequenceName) { // the first record only has a right-hand neighbour
+ vars.at(0)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position));
+ }
+ cout << *vars.at(0) << endl;
+
+ if (!vars.at(2)->sequenceName.empty()) {
+ // at least three variants, so starting with the first three,
+ // calculate the middle variant's closest distance, and then
+ // slide the window forward one.
+ do {
+ if (vars.at(1)->sequenceName == vars.at(0)->sequenceName &&
+ vars.at(1)->sequenceName == vars.at(2)->sequenceName) { // neighbours on both sides: take the smaller gap
+ vars.at(1)->info[tag].push_back(convert(min(vars.at(1)->position - vars.at(0)->position,
+ vars.at(2)->position - vars.at(1)->position)));
+ } else if (vars.at(1)->sequenceName == vars.at(0)->sequenceName) { // only the previous record is on the same sequence
+ vars.at(1)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position));
+ } else if (vars.at(2)->sequenceName == vars.at(1)->sequenceName) { // only the next record is on the same sequence
+ vars.at(1)->info[tag].push_back(convert(vars.at(2)->position - vars.at(1)->position));
+ } else {
+ // don't add the tag
+ }
+ cout << *vars.at(1) << endl;
+ // rotate
+ Variant* v = vars.at(0); // the oldest buffer becomes the slot for the next read
+ vars.at(0) = vars.at(1);
+ vars.at(1) = vars.at(2);
+ vars.at(2) = v;
+ } while (variantFile.getNextVariant(*vars.back())); // stop when the input is exhausted
+ }
+
+ // assign the last distance and output the last variant
+ if (vars.at(0)->sequenceName == vars.at(1)->sequenceName) { // the last record only has a left-hand neighbour
+ vars.at(1)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position));
+ }
+ cout << *vars.at(1) << endl;
+ } else {
+ // output the lone variant line untouched
+ cout << *vars.at(0) << endl;
+ }
+ }
+
+ return 0;
+
+}
+
diff --git a/src/vcfecho.cpp b/src/vcfecho.cpp
new file mode 100644
index 0000000..b850440
--- /dev/null
+++ b/src/vcfecho.cpp
@@ -0,0 +1,31 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+// vcfecho entry point.
+//
+// Reads a VCF from the file named by the first argument, or from standard
+// input when no argument is given, and prints its header and every record to
+// stdout. Returns 1 if the input cannot be opened, 0 otherwise.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+
+    if (argc <= 1) {
+        variantFile.open(std::cin);
+    } else {
+        string filename = argv[1];
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    for (;;) {
+        if (!variantFile.getNextVariant(var)) {
+            break;
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfentropy.cpp b/src/vcfentropy.cpp
new file mode 100644
index 0000000..4f92691
--- /dev/null
+++ b/src/vcfentropy.cpp
@@ -0,0 +1,159 @@
+#include "Variant.h"
+#include "split.h"
+#include "fastahack/Fasta.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+// Prints the vcfentropy usage text to stderr and terminates the process with
+// exit status 0. Never returns. argv[0] is used as the program name.
+// The help text previously misspelled "Annotates" as "Anotates".
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
+         << endl
+         << "options:" << endl
+         << " -f, --fasta-reference FASTA reference file to use to obtain flanking sequences" << endl
+         << " -w, --window-size Size of the window over which to calculate entropy" << endl
+         << endl
+         << "Annotates the output VCF file with, for each record, EntropyLeft, EntropyRight," << endl
+         << "EntropyCenter, which are the entropies of the sequence of the given window size to the" << endl
+         << "left, right, and center of the record. Also adds EntropyRef and EntropyAlt for each alt." << endl
+         << endl;
+    exit(0);
+}
+
+
+// vcfentropy entry point.
+//
+// Options: -f/--fasta-reference (required), -w/--window-size (required,
+// positive), -h/--help. The VCF is read from the last argument when exactly
+// one non-option argument is given, otherwise from standard input.
+//
+// For every record the INFO fields EntropyLeft, EntropyRight, EntropyCenter,
+// EntropyRef and EntropyAlt (one value per ALT) are replaced with the
+// shannon_H values of the corresponding sequences and the record is printed.
+// Exits with status 1 when a required option is missing; returns 1 if the
+// VCF cannot be opened, 0 otherwise.
+//
+// Fixes relative to the previous version:
+//  - the center-window call had a mis-encoded "&centerseq" and did not compile
+//  - entropies are computed over the number of bases actually fetched, not
+//    over windowSize, so a short fetch is never read past its end
+//  - the left and centered windows are clamped at the start of the sequence
+//    instead of requesting a negative offset from the reference
+//  - a negative window size is rejected
+int main(int argc, char** argv) {
+
+    int c;
+    string fastaRef;
+    int windowSize = 0;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    while (true) {
+        static struct option long_options[] =
+        {
+            /* Long options and the short option character each one maps to. */
+            //{"verbose", no_argument, &verbose_flag, 1},
+            {"help", no_argument, 0, 'h'},
+            {"fasta-reference", required_argument, 0, 'f'},
+            {"window-size", required_argument, 0, 'w'},
+            //{"length", no_argument, &printLength, true},
+            {0, 0, 0, 0}
+        };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hf:w:",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+            case 0:
+                /* If this option set a flag, do nothing else now. */
+                if (long_options[option_index].flag != 0)
+                    break;
+                printf ("option %s", long_options[option_index].name);
+                if (optarg)
+                    printf (" with arg %s", optarg);
+                printf ("\n");
+                break;
+
+            case 'f':
+                fastaRef = optarg;
+                break;
+
+            case 'w':
+                windowSize = atoi(optarg);
+                break;
+
+            case 'h':
+                printSummary(argv);
+                exit(0);
+                break;
+
+            case '?':
+                /* getopt_long already printed an error message. */
+                // NOTE(review): printSummary() exits with status 0 itself, so
+                // the exit(1) below is not reached
+                printSummary(argv);
+                exit(1);
+                break;
+
+            default:
+                abort ();
+        }
+    }
+
+    if (windowSize <= 0) {
+        cerr << "a positive window size must be specified" << endl;
+        exit(1);
+    }
+    if (fastaRef.empty()) {
+        cerr << "a FASTA reference sequence must be specified" << endl;
+        exit(1);
+    }
+
+    FastaReference ref;
+    ref.open(fastaRef);
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    variantFile.addHeaderLine("##INFO=<ID=EntropyLeft,Number=1,Type=Float,Description=\"Entropy of left-flanking sequence of "+ convert(windowSize) +"bp\">");
+    variantFile.addHeaderLine("##INFO=<ID=EntropyCenter,Number=1,Type=Float,Description=\"Entropy of centered sequence of "+ convert(windowSize) +"bp\">");
+    variantFile.addHeaderLine("##INFO=<ID=EntropyRight,Number=1,Type=Float,Description=\"Entropy of right-flanking sequence of "+ convert(windowSize) +"bp\">");
+    variantFile.addHeaderLine("##INFO=<ID=EntropyRef,Number=1,Type=Float,Description=\"Entropy of REF allele\">");
+    variantFile.addHeaderLine("##INFO=<ID=EntropyAlt,Number=A,Type=Float,Description=\"Entropy of each ALT allele\">");
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+
+        // get the ref start and end positions
+        int refstart = var.position - 1; // convert to 0-based
+        int refend = var.position + var.ref.size() - 1;
+
+        // left window: the windowSize bases before the record, shortened when
+        // the record is closer than that to the start of the sequence
+        int leftstart = refstart - windowSize;
+        int leftlen = windowSize;
+        if (leftstart < 0) {
+            leftlen += leftstart;
+            leftstart = 0;
+        }
+
+        // centered window: starts half a window before the record, clamped at 0
+        int centerstart = refstart - windowSize/2;
+        if (centerstart < 0) {
+            centerstart = 0;
+        }
+
+        string leftseq;
+        if (leftlen > 0) {
+            leftseq = ref.getSubSequence(var.sequenceName, leftstart, leftlen);
+        }
+        string rightseq = ref.getSubSequence(var.sequenceName, refend, windowSize);
+        string centerseq = ref.getSubSequence(var.sequenceName, centerstart, windowSize);
+
+        // use the real length of each fetched sequence, which may be shorter
+        // than windowSize
+        double entropyLeft = shannon_H((char*) leftseq.c_str(), leftseq.size());
+        double entropyRight = shannon_H((char*) rightseq.c_str(), rightseq.size());
+        double entropyCenter = shannon_H((char*) centerseq.c_str(), centerseq.size());
+        double entropyRef = shannon_H((char*) var.ref.c_str(), var.ref.size());
+
+        // drop any values already present so each tag is written exactly once
+        var.info["EntropyLeft"].clear();
+        var.info["EntropyRight"].clear();
+        var.info["EntropyCenter"].clear();
+        var.info["EntropyRef"].clear();
+        var.info["EntropyAlt"].clear();
+
+        var.info["EntropyLeft"].push_back(convert(entropyLeft));
+        var.info["EntropyRight"].push_back(convert(entropyRight));
+        var.info["EntropyCenter"].push_back(convert(entropyCenter));
+        var.info["EntropyRef"].push_back(convert(entropyRef));
+
+        // one EntropyAlt value per ALT allele, in ALT order
+        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+            double entropyAlt = shannon_H((char*) a->c_str(), a->size());
+            var.info["EntropyAlt"].push_back(convert(entropyAlt));
+        }
+
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfevenregions.cpp b/src/vcfevenregions.cpp
new file mode 100644
index 0000000..5888c98
--- /dev/null
+++ b/src/vcfevenregions.cpp
@@ -0,0 +1,202 @@
+#include "Variant.h"
+#include "split.h"
+#include "fastahack/Fasta.h"
+#include <getopt.h>
+#include <cmath>
+
+using namespace std;
+using namespace vcf;
+
+// Prints the vcfevenregions usage text to stderr and terminates the process
+// with exit status 0. Never returns. argv[0] is used as the program name.
+// The help text previously read "This can be use to reduce".
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
+         << endl
+         << "options:" << endl
+         << " -f, --fasta-reference REF FASTA reference file to use to obtain primer sequences." << endl
+         << " -n, --number-of-regions N The number of desired regions." << endl
+         << " -p, --number-of-positions N The number of positions per region." << endl
+         << " -o, --offset N Add an offset to region positioning, to avoid boundary" << endl
+         << " related artifacts in downstream processing." << endl
+         << " -l, --overlap N The number of sites to overlap between regions. Default 0." << endl
+         << " -s, --separator SEQ Specify string to use to separate region output. Default '-'" << endl
+         << endl
+         << "Generates a list of regions, e.g. chr20:10..30 using the variant" << endl
+         << "density information provided in the VCF file to ensure that the regions have" << endl
+         << "even numbers of variants. This can be used to reduce the variance in runtime" << endl
+         << "when dividing variant detection or genotyping by genomic coordinates." << endl;
+    exit(0);
+}
+
+
+// A coordinate interval plus a count of the variant positions assigned to it.
+// Both constructors leave 'positions' at zero.
+struct Region {
+    long int start;   // first coordinate of the interval
+    long int end;     // end coordinate of the interval
+    int positions;    // number of positions accumulated into this region
+
+    // empty region at 0..0
+    Region() {
+        start = 0;
+        end = 0;
+        positions = 0;
+    }
+
+    // region spanning s..e with no positions counted yet
+    Region(long int s, long int e) {
+        start = s;
+        end = e;
+        positions = 0;
+    }
+};
+
+
+int main(int argc, char** argv) { // vcfevenregions: reads a VCF and prints regions (seq:start<sep>end) that each hold about the same number of records
+
+ int c; // option character returned by getopt_long
+ string fastaRef; // -f: FASTA file, used below only for sequence lengths
+ bool keepFailures = false; // NOTE(review): never read in this function
+ bool excludeFailures = false; // NOTE(review): never read in this function
+ int number_of_regions = 1; // -n
+ int number_of_positions = 0; // -p; 0 means "derive the region size from -n"
+ int offset = 0; // -o: added to every record's start and end
+ int overlap = 0; // -l
+ string regionSplitSeq = "-"; // -s: printed between a region's start and end
+
+ if (argc == 1)
+ printSummary(argv); // prints the usage and exits with status 0
+
+ while (true) {
+ static struct option long_options[] =
+ {
+ /* Long options and the short option character each one maps to. */
+ //{"verbose", no_argument, &verbose_flag, 1},
+ {"help", no_argument, 0, 'h'},
+ {"fasta-reference", required_argument, 0, 'f'},
+ {"number-of-regions", required_argument, 0, 'n'},
+ {"number-of-positions", required_argument, 0, 'p'},
+ {"offset", required_argument, 0, 'o'},
+ {"overlap", required_argument, 0, 'l'},
+ {"separator", required_argument, 0, 's'},
+ //{"length", no_argument, &printLength, true},
+ {0, 0, 0, 0}
+ };
+ /* getopt_long stores the option index here. */
+ int option_index = 0;
+
+ c = getopt_long (argc, argv, "hf:n:o:l:s:p:",
+ long_options, &option_index);
+
+ /* Detect the end of the options. */
+ if (c == -1)
+ break;
+
+ switch (c)
+ {
+ case 0:
+ /* If this option set a flag, do nothing else now. */
+ if (long_options[option_index].flag != 0)
+ break;
+ printf ("option %s", long_options[option_index].name);
+ if (optarg)
+ printf (" with arg %s", optarg);
+ printf ("\n");
+ break;
+
+ case 'f':
+ fastaRef = optarg;
+ break;
+
+ case 'n':
+ number_of_regions = atoi(optarg);
+ break;
+
+ case 'p':
+ number_of_positions = atoi(optarg);
+ break;
+
+ case 'o':
+ offset = atoi(optarg);
+ break;
+
+ case 'l':
+ overlap = atoi(optarg);
+ break;
+
+ case 's':
+ regionSplitSeq = optarg;
+ break;
+
+ case 'h':
+ printSummary(argv); // exits with status 0 before the exit(0) below
+ exit(0);
+ break;
+
+ case '?':
+ /* getopt_long already printed an error message. */
+ printSummary(argv); // NOTE(review): this exits with status 0 first, so the exit(1) below is never reached
+ exit(1);
+ break;
+
+ default:
+ abort ();
+ }
+ }
+
+ if (fastaRef.empty()) {
+ cerr << "a FASTA reference sequence must be specified" << endl;
+ exit(1);
+ }
+
+ FastaReference ref;
+ ref.open(fastaRef);
+
+ VariantCallFile variantFile;
+ string inputFilename;
+ if (optind == argc - 1) { // exactly one non-option argument: read that file; otherwise read stdin
+ inputFilename = argv[optind];
+ variantFile.open(inputFilename);
+ } else {
+ variantFile.open(std::cin);
+ }
+
+ if (!variantFile.is_open()) {
+ return 1;
+ }
+
+ map<string, vector<Region> > positions_by_chrom; // per sequence: one Region per record, in input order
+ int total_positions = 0; // number of records read
+
+ Variant var(variantFile);
+ while (variantFile.getNextVariant(var)) { // first pass: load every record's span into memory
+ int refstart = var.position - 1; // convert to 0-based
+ positions_by_chrom[var.sequenceName].push_back(Region(refstart + offset, refstart + offset + var.ref.size()));
+ ++total_positions;
+ }
+
+ int positions_per_region; // records per region, not counting the overlap
+ if (number_of_positions) {
+ if (number_of_positions - overlap < 0) {
+ cerr << "overlap is greater than the number of positions per region!" << endl;
+ exit(1);
+ } else {
+ positions_per_region = number_of_positions - overlap;
+ }
+ } else {
+ positions_per_region = ceil((double) total_positions / (double) number_of_regions); // NOTE(review): -n 0 makes this a division by zero - confirm intended handling
+ }
+ //cerr << positions_per_region << "=" << total_positions << "/" << number_of_regions << "+" << overlap << endl;
+
+ // todo, update routine to allow overlaps
+
+ for (map<string, vector<Region> >::iterator s = positions_by_chrom.begin();
+ s != positions_by_chrom.end(); ++s) { // sequences are visited in the map's key order
+ //pair<long int, long int> current_region;
+ Region current_region; // default-constructed, so the first region of every sequence starts at 0
+ for (vector<Region>::iterator p = s->second.begin(); p != s->second.end(); ++p) {
+ if (current_region.positions < positions_per_region + overlap) { // still room: extend the current region over this record
+ current_region.end = p->end;
+ current_region.positions++;
+ } else { // full: print it and start the next region
+ cout << s->first << ":" << current_region.start << regionSplitSeq << current_region.end << endl;
+ vector<Region>::iterator l = max(s->second.begin(), p-overlap-1); // NOTE(review): p-overlap-1 may step before begin() when fewer than overlap+1 records precede p - confirm
+ current_region.start = l->end; // the new region starts at the end of the record 'overlap + 1' places back
+ current_region.end = p->end;
+ current_region.positions = overlap + 1;
+ }
+ }
+ // get refseq size, use as end coordinate for last region in target
+ current_region.end = ref.sequenceLength(s->first);
+ cout << s->first << ":" << current_region.start << regionSplitSeq << current_region.end << endl;
+ }
+
+ return 0;
+
+}
+
diff --git a/src/vcffilter.cpp b/src/vcffilter.cpp
new file mode 100644
index 0000000..9150ff1
--- /dev/null
+++ b/src/vcffilter.cpp
@@ -0,0 +1,402 @@
+#include "Variant.h"
+#include "split.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+// Print the vcffilter usage text to stderr and terminate the process.
+//
+// argv: the program's argument vector; only argv[0] (program name) is read.
+//
+// This function never returns: it finishes with exit(0).
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
+         << endl
+         << "options:" << endl
+         << " -f, --info-filter specifies a filter to apply to the info fields of records," << endl
+         << " removes alleles which do not pass the filter" << endl
+         << " -g, --genotype-filter specifies a filter to apply to the genotype fields of records" << endl
+         << " -k, --keep-info used in conjunction with '-g', keeps variant info, but removes genotype" << endl
+         << " -s, --filter-sites filter entire records, not just alleles" << endl
+         << " -t, --tag-pass tag vcf records as positively filtered with this tag, print all records" << endl
+         << " -F, --tag-fail tag vcf records as negatively filtered with this tag, print all records" << endl
+         << " -A, --append-filter append the existing filter tag, don't just replace it" << endl
+         << " -a, --allele-tag apply -t on a per-allele basis. adds or sets the corresponding INFO field tag" << endl
+         << " -v, --invert inverts the filter, e.g. grep -v" << endl
+         << " -o, --or use logical OR instead of AND to combine filters" << endl
+         << " -r, --region specify a region on which to target the filtering, requires a BGZF" << endl
+         << " compressed file which has been indexed with tabix. any number of" << endl
+         << " regions may be specified." << endl
+         << endl
+         << "Filter the specified vcf file using the set of filters." << endl
+         // the closing quote after <value> was missing in the original text
+         << "Filters are specified in the form \"<ID> <operator> <value>\":" << endl
+         << " -f \"DP > 10\" # for info fields" << endl
+         << " -g \"GT = 1|1\" # for genotype fields" << endl
+         << " -f \"CpG\" # for 'flag' fields" << endl
+         << endl
+         << "Operators can be any of: =, !, <, >, |, &" << endl
+         << endl
+         << "Any number of filters may be specified. They are combined via logical AND" << endl
+         << "unless --or is specified on the command line. Obtain logical negation through" << endl
+         << "the use of parentheses, e.g. \"! ( DP = 10 )\"" << endl
+         << endl
+         << "For convenience, you can specify \"QUAL\" to refer to the quality of the site, even" << endl
+         << "though it does not appear in the INFO fields." << endl
+         << endl;
+    exit(0);
+}
+
+// Evaluate every filter in `filters` against `var` and combine the results.
+//
+// var:       the record being tested
+// filters:   the filters to evaluate, in order
+// logicalOr: true  -> the record passes as soon as any filter passes
+//            false -> the record passes only if every filter passes
+// alt:       when non-empty, each filter is evaluated for this alternate
+//            allele; when empty, the two-argument form of passes() is used
+//
+// With an empty filter list the result is false in OR mode and true in AND
+// mode. Evaluation stops at the first filter that decides the outcome.
+bool passesFilters(Variant& var, vector<VariantFilter>& filters, bool logicalOr, string alt = "") {
+    const bool perAllele = !alt.empty();
+    vector<VariantFilter>::iterator filter = filters.begin();
+    while (filter != filters.end()) {
+        string sampleName = "";  // passes() is always called with an empty sample name here
+        const bool ok = perAllele ? filter->passes(var, sampleName, alt)
+                                  : filter->passes(var, sampleName);
+        // In OR mode a pass settles the answer (true); in AND mode a
+        // failure settles it (false). Either way the answer equals logicalOr.
+        if (ok == logicalOr) {
+            return logicalOr;
+        }
+        ++filter;
+    }
+    return !logicalOr;
+}
+
+
+// vcffilter entry point.
+//
+// Parses the command line, builds INFO (-f) and genotype (-g) filters, writes
+// the (annotated) VCF header to stdout and then streams records from the input
+// file or stdin, dropping or tagging records/alleles according to the filters.
+//
+// Returns 0 on success and 1 when the input cannot be opened.
+int main(int argc, char** argv) {
+
+    int c;
+    bool invert = false;
+    bool logicalOr = false;
+    bool filterSites = false;
+    bool keepInfo = false;
+    vector<string> infofilterStrs;
+    vector<VariantFilter> infofilters;
+    vector<string> genofilterStrs;
+    vector<VariantFilter> genofilters;
+    string tagPass = "";
+    string tagFail = "";
+    string filterSpec;
+    string alleleTag;
+    vector<string> regions;
+    bool replaceFilter = true;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"filter-sites", no_argument, 0, 's'},
+                {"info-filter", required_argument, 0, 'f'},
+                {"genotype-filter", required_argument, 0, 'g'},
+                {"tag-pass", required_argument, 0, 't'},
+                // was a second "tag-pass" entry, which made --tag-fail unusable
+                {"tag-fail", required_argument, 0, 'F'},
+                {"append-filter", no_argument, 0, 'A'},
+                {"allele-tag", required_argument, 0, 'a'},
+                {"invert", no_argument, 0, 'v'},
+                {"or", no_argument, 0, 'o'},
+                {"region", required_argument, 0, 'r'},
+                {"keep-info", no_argument, 0, 'k'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hvAsof:g:kt:F:r:a:",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+            case 0:
+                /* If this option set a flag, do nothing else now. */
+                if (long_options[option_index].flag != 0)
+                    break;
+                printf ("option %s", long_options[option_index].name);
+                if (optarg)
+                    printf (" with arg %s", optarg);
+                printf ("\n");
+                break;
+
+            case 'f':
+                filterSpec += " " + string(optarg);
+                infofilterStrs.push_back(string(optarg));
+                break;
+
+            case 's':
+                filterSites = true;
+                break;
+
+            case 'a':
+                alleleTag = optarg;
+                break;
+
+            case 'g':
+                filterSpec += " genotypes filtered with: " + string(optarg);
+                genofilterStrs.push_back(string(optarg));
+                break;
+
+            case 't':
+                tagPass = optarg;
+                break;
+
+            case 'F':
+                tagFail = optarg;
+                break;
+
+            case 'A':
+                replaceFilter = false;
+                break;
+
+            case 'h':
+                printSummary(argv);
+                exit(0);
+                break;
+
+            case 'v':
+                invert = true;
+                break;
+
+            case 'o':
+                logicalOr = true;
+                break;
+
+            case 'r':
+                regions.push_back(optarg);
+                break;
+
+            case 'k':
+                keepInfo = true;
+                break;
+
+            case '?':
+                /* getopt_long already printed an error message. */
+                printSummary(argv);
+                exit(1);
+                break;
+
+            default:
+                abort ();
+        }
+    }
+
+    if (genofilterStrs.size() == 0 && keepInfo) {
+        // diagnostics belong on stderr so they never mix with VCF output
+        cerr << "argument '-k' (--keep-info) requires a Genotype filter: ('-g')" << endl
+             << "i.e.: -g \"GT = 1|1\" -k" << endl;
+        exit(1);
+    }
+
+    // Every -f/-g appended a leading " "; strip it. Guarded because substr(1)
+    // on an empty string throws std::out_of_range (e.g. when only -r is given).
+    if (!filterSpec.empty()) {
+        filterSpec = filterSpec.substr(1);
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    for (vector<string>::iterator f = infofilterStrs.begin(); f != infofilterStrs.end(); ++f) {
+        infofilters.push_back(VariantFilter(*f, VariantFilter::RECORD, variantFile.infoTypes));
+    }
+
+    for (vector<string>::iterator f = genofilterStrs.begin(); f != genofilterStrs.end(); ++f) {
+        genofilters.push_back(VariantFilter(*f, VariantFilter::SAMPLE, variantFile.formatTypes));
+    }
+
+    // Rebuild the header, inserting one ##filter line before the first line
+    // mentioning INFO (or before the last header line if there is none).
+    // A separate flag is used instead of clearing filterSpec, because
+    // filterSpec is still needed for the FILTER/INFO descriptions below.
+    vector<string> headerlines = split(variantFile.header, "\n");
+    variantFile.header.clear();
+    bool filterLineWritten = filterSpec.empty();
+    for (vector<string>::iterator l = headerlines.begin(); l != headerlines.end(); ++l) {
+        if (!filterLineWritten && (l->find("INFO") != string::npos || l + 1 == headerlines.end())) {
+            variantFile.header += "##filter=\"" + filterSpec + "\"\n";
+            filterLineWritten = true;
+        }
+        variantFile.header += *l + ((l + 1 == headerlines.end()) ? "" : "\n");
+    }
+
+    if (!tagPass.empty()) {
+        variantFile.addHeaderLine("##FILTER=<ID="+ tagPass +",Description=\"Record passes the filters: " + filterSpec + ".\">");
+    }
+
+    if (!tagFail.empty()) {
+        variantFile.addHeaderLine("##FILTER=<ID="+ tagFail +",Description=\"Record fails the filters: " + filterSpec + ".\">");
+    }
+
+    if (!alleleTag.empty()) {
+        // per-allele tagging needs a pass label; the fail label may stay empty
+        if (tagPass.empty()) {
+            tagPass = "PASS";
+        }
+        variantFile.addHeaderLine("##INFO=<ID="+ alleleTag +",Number=A,Type=String,Description=\"" + tagPass + " if this allele passes the filters, " + tagFail + " if not, filters are: " + filterSpec + ".\">");
+    }
+
+    cout << variantFile.header << endl;
+
+    if (filterSites) {
+        variantFile.parseSamples = false;
+    }
+
+    Variant var(variantFile);
+
+    vector<string>::iterator regionItr = regions.begin();
+
+    do {
+
+        // regions can only be applied to a named file, never to stdin
+        if (!inputFilename.empty() && !regions.empty()) {
+            string regionStr = *regionItr++;
+            variantFile.setRegion(regionStr);
+        }
+
+        while (variantFile.getNextVariant(var)) {
+            if (!genofilters.empty()) {
+                for (vector<VariantFilter>::iterator f = genofilters.begin(); f != genofilters.end(); ++f) {
+                    f->removeFilteredGenotypes(var, keepInfo);
+                }
+            }
+            if (!infofilters.empty()) {
+                if (filterSites) {
+                    bool passes = passesFilters(var, infofilters, logicalOr);
+                    if (invert) {
+                        passes = !passes;
+                    }
+                    if (passes) {
+                        if (!tagPass.empty()) {
+                            if (alleleTag.empty()) {
+                                if (replaceFilter) {
+                                    var.filter.clear();
+                                }
+                                var.addFilter(tagPass);
+                            } else {
+                                var.info[alleleTag].clear();
+                                for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+                                    var.info[alleleTag].push_back(tagPass);
+                                }
+                            }
+                        } else {
+                            // untagged passing record: echo it unchanged
+                            if (!var.originalLine.empty()) {
+                                cout << var.originalLine << endl;
+                            } else {
+                                cout << var << endl;
+                            }
+                        }
+                    } else {
+                        if (!tagFail.empty()) {
+                            if (alleleTag.empty()) {
+                                if (replaceFilter) {
+                                    var.filter.clear();
+                                }
+                                var.addFilter(tagFail);
+                            } else {
+                                var.info[alleleTag].clear();
+                                for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+                                    var.info[alleleTag].push_back(tagFail);
+                                }
+                            }
+                        }
+                    }
+                    // Print tagged records. The fail branch is restricted to
+                    // failing records: previously a passing record was printed
+                    // a second time here whenever -F was given without -t.
+                    if (passes && !tagPass.empty()) {
+                        cout << var << endl;
+                    } else if (!passes && !tagFail.empty()) {
+                        cout << var << endl;
+                    }
+                } else { // per-allele filtering: classify each alternate allele
+                    vector<string> failingAlts;
+                    vector<string> passingAlts;
+                    vector<bool> passes;
+                    for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+                        if (!passesFilters(var, infofilters, logicalOr, *a)) {
+                            failingAlts.push_back(*a);
+                            passes.push_back(false);
+                        } else {
+                            passingAlts.push_back(*a);
+                            passes.push_back(true);
+                        }
+                    }
+                    if (tagPass.empty()) { // if there is no specified tag, just remove the failing alts
+                        // records whose alternates all fail are dropped entirely
+                        if (failingAlts.size() < var.alt.size()) {
+                            for (vector<string>::iterator a = failingAlts.begin(); a != failingAlts.end(); ++a) {
+                                var.removeAlt(*a);
+                            }
+                            cout << var << endl;
+                        }
+                    } else { // otherwise, apply the tag
+                        if (alleleTag.empty()) {
+                            if (replaceFilter) {
+                                var.filter.clear();
+                            }
+                            if (!passingAlts.empty()) {
+                                var.addFilter(tagPass);
+                            } else if (!tagFail.empty()) {
+                                var.addFilter(tagFail);
+                            }
+                        } else {
+                            var.info[alleleTag].clear();
+                            for (vector<bool>::iterator p = passes.begin(); p != passes.end(); ++p) {
+                                if (*p) {
+                                    var.info[alleleTag].push_back(tagPass);
+                                } else {
+                                    var.info[alleleTag].push_back(tagFail);
+                                }
+                            }
+                        }
+                        // TODO
+                        // here, if we don't use genotype filters, we shouldn't re-print the samples
+                        // we haven't done anything to this part of the input.
+                        cout << var << endl;
+                    }
+                }
+            } else {
+                if (genofilters.empty()) {
+                    cout << variantFile.line << endl;
+                } else {
+                    cout << var << endl;
+                }
+            }
+        }
+
+    // When reading stdin the iterator is never advanced, so it must not be
+    // part of the loop condition (it used to spin forever with -r on stdin).
+    } while (!inputFilename.empty() && regionItr != regions.end());
+
+    return 0;
+
+}
+
diff --git a/src/vcffixup.cpp b/src/vcffixup.cpp
new file mode 100644
index 0000000..be79d36
--- /dev/null
+++ b/src/vcffixup.cpp
@@ -0,0 +1,117 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <sstream>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+// Sum, over all samples that carry a GT field, the number of copies of the
+// allele with genotype index `alleleIndex`.
+//
+// var:         record whose samples are inspected
+// alleleIndex: allele index as it appears in the GT string
+// returns:     total copy count across samples (0 if no sample carries it)
+int countAlts(Variant& var, int alleleIndex) {
+    typedef map<string, vector<string> > SampleFields;
+
+    int total = 0;
+    map<string, SampleFields>::iterator entry = var.samples.begin();
+    while (entry != var.samples.end()) {
+        SampleFields& fields = entry->second;
+        SampleFields::iterator gtField = fields.find("GT");
+        if (gtField != fields.end()) {
+            // allele index -> copy count for this sample's genotype
+            map<int, int> copies = decomposeGenotype(gtField->second.front());
+            map<int, int>::iterator hit = copies.find(alleleIndex);
+            if (hit != copies.end()) {
+                total += hit->second;
+            }
+        }
+        ++entry;
+    }
+    return total;
+}
+
+// Count the called (non-null) alleles over all samples that carry a GT field.
+//
+// var:     record whose samples are inspected
+// returns: the sum of copy counts of every allele other than NULL_ALLELE
+int countAlleles(Variant& var) {
+    typedef map<string, vector<string> > SampleFields;
+
+    int called = 0;
+    map<string, SampleFields>::iterator entry = var.samples.begin();
+    while (entry != var.samples.end()) {
+        SampleFields& fields = entry->second;
+        SampleFields::iterator gtField = fields.find("GT");
+        if (gtField != fields.end()) {
+            // allele index -> copy count for this sample's genotype
+            map<int, int> copies = decomposeGenotype(gtField->second.front());
+            map<int, int>::iterator allele = copies.begin();
+            for (; allele != copies.end(); ++allele) {
+                if (allele->first == NULL_ALLELE) {
+                    continue;  // uncalled alleles do not count
+                }
+                called += allele->second;
+            }
+        }
+        ++entry;
+    }
+    return called;
+}
+
+// vcffixup entry point.
+//
+// Reads a VCF from the file named by argv[1] (or from stdin when argv[1] is
+// "-"), regenerates the NS, AN, AC and AF INFO fields of every record from the
+// sample genotypes, and writes the result to stdout.
+//
+// Returns 0 on success; 1 after printing usage or when the input cannot be
+// opened.
+int main(int argc, char** argv) {
+
+    // argc == 1 short-circuits, so argv[1] is only read when it exists
+    if (argc == 1 || strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
+        cerr << "usage: " << argv[0] << " <vcf file>" << endl
+             << "outputs a VCF stream where AC and NS have been generated for each record using sample genotypes" << endl;
+        return 1;
+    }
+
+    VariantCallFile variantFile;
+    // argc == 1 was already handled above, so stdin is selected only by "-"
+    if ((argc == 2) && strcmp(argv[1], "-") == 0) {
+        variantFile.open(std::cin);
+        if (!variantFile.is_open()) {
+            cerr << "vcffixup: could not open stdin" << endl;
+            return 1;
+        }
+    } else {
+        string filename = argv[1];
+        variantFile.open(filename);
+        if (!variantFile.is_open()) {
+            cerr << "vcffixup: could not open " << filename << endl;
+            return 1;
+        }
+    }
+
+    Variant var(variantFile);
+
+    // remove header lines we're going to add
+    variantFile.removeInfoHeaderLine("AC");
+    variantFile.removeInfoHeaderLine("AF");
+    variantFile.removeInfoHeaderLine("NS");
+    variantFile.removeInfoHeaderLine("AN");
+
+    // and add them back, so as not to duplicate them if they are already there
+    variantFile.addHeaderLine("##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Total number of alternate alleles in called genotypes\">");
+    variantFile.addHeaderLine("##INFO=<ID=AF,Number=A,Type=Float,Description=\"Estimated allele frequency in the range (0,1]\">");
+    variantFile.addHeaderLine("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">");
+    variantFile.addHeaderLine("##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
+
+    // write the new header
+    cout << variantFile.header << endl;
+
+    // rewrite NS/AN/AC/AF on every record and print it
+    while (variantFile.getNextVariant(var)) {
+        // NS is the number of entries in var.samples
+        stringstream ns;
+        ns << var.samples.size();
+        var.info["NS"].clear();
+        var.info["NS"].push_back(ns.str());
+
+        var.info["AC"].clear();
+        var.info["AF"].clear();
+        var.info["AN"].clear();
+
+        int allelecount = countAlleles(var);
+        stringstream an;
+        an << allelecount;
+        var.info["AN"].push_back(an.str());
+
+        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+            string& allele = *a;
+            // genotype indexes are 1-based for alternates (0 is the reference)
+            int altcount = countAlts(var, var.getAltAlleleIndex(allele) + 1);
+            stringstream ac;
+            ac << altcount;
+            var.info["AC"].push_back(ac.str());
+            // With no called alleles the ratio would be 0/0 and print as
+            // "nan"; report a frequency of 0 instead.
+            double frequency = 0;
+            if (allelecount > 0) {
+                frequency = (double) altcount / (double) allelecount;
+            }
+            stringstream af;
+            af << frequency;
+            var.info["AF"].push_back(af.str());
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfflatten.cpp b/src/vcfflatten.cpp
new file mode 100644
index 0000000..f528360
--- /dev/null
+++ b/src/vcfflatten.cpp
@@ -0,0 +1,178 @@
+#include "Variant.h"
+#include "convert.h"
+
+using namespace std;
+using namespace vcf;
+
+
+// Convert the textual number `s` to a double using convert().
+//
+// The result is initialised to 0 so that a string convert() cannot parse
+// yields a defined value rather than an indeterminate one.
+double convertStrDbl(const string& s) {
+    double r = 0;
+    convert(s, r);
+    return r;
+}
+
+// Replace `vals` with only those entries whose position is a key of `keep`.
+static void retainIndexes(vector<string>& vals, const map<int, bool>& keep) {
+    vector<string> kept;
+    for (size_t i = 0; i < vals.size(); ++i) {
+        if (keep.find((int) i) != keep.end()) {
+            kept.push_back(vals[i]);
+        }
+    }
+    vals = kept;
+}
+
+// Replace `vals` with only the entry at position `index` (empty if out of range).
+static void retainIndex(vector<string>& vals, int index) {
+    vector<string> kept;
+    if (index >= 0 && (size_t) index < vals.size()) {
+        kept.push_back(vals[index]);
+    }
+    vals = kept;
+}
+
+// vcfflatten entry point.
+//
+// Reads a VCF from argv[1] or stdin and, for every record with more than two
+// alleles, keeps only the alternate with the highest AF, pruning the INFO and
+// per-sample fields declared with per-genotype or per-allele counts.
+//
+// Returns 0 on success; 1 when the input cannot be opened or a multi-allelic
+// record does not provide one AF value per alternate allele.
+int main(int argc, char** argv) {
+
+    const size_t maxAlleles = 2;
+
+    VariantCallFile variantFile;
+
+    if (argc > 1) {
+        string filename = argv[1];
+        if (filename == "--help" || filename == "-h") {
+            cerr << "usage: vcfflatten [file]" << endl
+                 << endl
+                 << "Removes multi-allelic sites by picking the most common alternate. Requires" << endl
+                 << "allele frequency specification 'AF' and use of 'G' and 'A' to specify the" << endl
+                 << "fields which vary according to the Allele or Genotype. VCF file may be" << endl
+                 << "specified on the command line or piped as stdin." << endl;
+            exit(1);
+        }
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        // count the number of alternates
+        // if we have more than N, strip the lowest-frequency ones
+        if (var.alleles.size() > maxAlleles) {
+
+            // Look AF up with find(): operator[] would insert an empty AF
+            // entry, and a missing or short AF list would make the frequency
+            // iterator below run past the end of its vector.
+            map<string, vector<string> >::iterator afField = var.info.find("AF");
+            if (afField == var.info.end() || afField->second.size() < var.alt.size()) {
+                cerr << "vcfflatten: record at " << var.sequenceName << ":" << var.position
+                     << " does not provide an AF value for every alternate allele" << endl;
+                return 1;
+            }
+
+            multimap<double, string> alleleFrequencies;
+
+            vector<string>& freqsstr = afField->second;
+            vector<double> freqs;
+            freqs.resize(freqsstr.size());
+            transform(freqsstr.begin(), freqsstr.end(), freqs.begin(), convertStrDbl);
+
+            vector<double>::iterator f = freqs.begin();
+            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a, ++f) {
+                alleleFrequencies.insert(pair<double, string>(*f, *a));
+            }
+
+            // pick the highest frequency alternate
+            string bestalt = alleleFrequencies.rbegin()->second;
+            // and get its index
+            int bestaltIndex = var.getAltAlleleIndex(bestalt);
+            int bestaltGenotypeIndex = bestaltIndex + 1; // per VCF spec
+
+            // keep the RR, RA, and AA alleles for this alternate
+            // generate the genotype index table for this variant
+            map<pair<int, int>, int> genotypeIndexes = var.getGenotypeIndexesDiploid();
+
+            // now get the genotype indexes we want to keep
+            vector<int> alleleIndexes;
+            alleleIndexes.push_back(0);
+            alleleIndexes.push_back(bestaltGenotypeIndex);
+
+            // add the reference allele index for generating genotype indexes
+            int ploidy = 2;
+            vector<vector<int> > genotypesToKeep = multichoose(ploidy, alleleIndexes);
+            map<int, bool> genotypeIndexesToKeep;
+            for (vector<vector<int> >::iterator k = genotypesToKeep.begin(); k != genotypesToKeep.end(); ++k) {
+                pair<int, int> genotype = make_pair(k->front(), k->back()); // vectors are guaranteed to be diploid per multichoose
+                genotypeIndexesToKeep[genotypeIndexes[genotype]] = true;
+            }
+            // we are diploid, so there should be exactly 3 genotypes
+            assert(genotypeIndexesToKeep.size() == 3);
+
+            // INFO fields: prune those declared per-genotype or per-allele
+            for (map<string, int>::iterator c = variantFile.infoCounts.begin(); c != variantFile.infoCounts.end(); ++c) {
+                int count = c->second;
+                if (count != GENOTYPE_NUMBER && count != ALLELE_NUMBER) {
+                    continue;
+                }
+                map<string, vector<string> >::iterator v = var.info.find(c->first);
+                if (v == var.info.end()) {
+                    continue;
+                }
+                if (count == GENOTYPE_NUMBER) {
+                    retainIndexes(v->second, genotypeIndexesToKeep);
+                } else {
+                    retainIndex(v->second, bestaltIndex);
+                }
+            }
+
+            // sample (FORMAT) fields: the same pruning, applied to every sample
+            for (map<string, int>::iterator c = variantFile.formatCounts.begin(); c != variantFile.formatCounts.end(); ++c) {
+                int count = c->second;
+                if (count != GENOTYPE_NUMBER && count != ALLELE_NUMBER) {
+                    continue;
+                }
+                for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
+                    map<string, vector<string> >& sample = s->second;
+                    map<string, vector<string> >::iterator v = sample.find(c->first);
+                    if (v == sample.end()) {
+                        continue;
+                    }
+                    if (count == GENOTYPE_NUMBER) {
+                        retainIndexes(v->second, genotypeIndexesToKeep);
+                    } else {
+                        retainIndex(v->second, bestaltIndex);
+                    }
+                }
+            }
+
+            var.alt.clear();
+            var.alt.push_back(bestalt);
+
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfgeno2alleles.cpp b/src/vcfgeno2alleles.cpp
new file mode 100644
index 0000000..9fd66f2
--- /dev/null
+++ b/src/vcfgeno2alleles.cpp
@@ -0,0 +1,54 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+
+// vcfgeno2alleles entry point.
+//
+// Reads a VCF from stdin and rewrites each sample's GT value so that allele
+// indexes are replaced by the literal allele strings, joined with "/".
+// Null alleles (".") and indexes outside the record's allele list are written
+// as "."; samples without a GT value are left untouched.
+//
+// Returns 0 on success; 1 after printing usage (any argument given) or when
+// stdin cannot be opened.
+int main(int argc, char** argv) {
+
+    if (argc > 1) {
+        cerr << "usage: " << argv[0] << " <[vcf file]" << endl
+             << "modifies the genotypes field to provide the literal alleles rather than indexes" << endl;
+        return 1;
+    }
+
+    VariantCallFile variantFile;
+
+    variantFile.open(std::cin);
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
+        map<string, map<string, vector<string> > >::iterator sEnd = var.samples.end();
+
+        for (; s != sEnd; ++s) {
+            map<string, vector<string> >& sample = s->second;
+            // find() rather than operator[]: do not invent a GT entry, and
+            // never call front() on an empty value list
+            map<string, vector<string> >::iterator gtField = sample.find("GT");
+            if (gtField == sample.end() || gtField->second.empty()) {
+                continue;
+            }
+            vector<string>& gtstrs = gtField->second;
+            vector<string> gt = split(gtstrs.front(), "|/");
+
+            // build the literal-allele genotype for this sample
+            stringstream o;
+            for (vector<string>::iterator g = gt.begin(); g != gt.end(); ++g) {
+                if (g != gt.begin()) o << "/";
+                if (*g == ".") {
+                    // atoi(".") is 0, which would silently report the
+                    // reference allele for an uncalled one
+                    o << ".";
+                    continue;
+                }
+                int index = atoi(g->c_str());
+                if (index < 0 || (size_t) index >= var.alleles.size()) {
+                    o << ".";
+                } else {
+                    o << var.alleles[index];
+                }
+            }
+            gtstrs.clear();
+            gtstrs.push_back(o.str());
+        }
+        cout << var << endl;
+    }
+    return 0;
+
+}
+
diff --git a/src/vcfgeno2haplo.cpp b/src/vcfgeno2haplo.cpp
new file mode 100644
index 0000000..f821c2d
--- /dev/null
+++ b/src/vcfgeno2haplo.cpp
@@ -0,0 +1,391 @@
+#include "Variant.h"
+#include <getopt.h>
+#include "fastahack/Fasta.h"
+#include <algorithm>
+#include <list>
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+
+// Print the vcfgeno2haplo usage text to stderr and terminate the process.
+//
+// argv: the program's argument vector; only argv[0] (program name) is read.
+//
+// This function never returns: it finishes with exit(0).
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl
+         << endl
+         << "options:" << endl
+         // The text used to say "required with -i and -u", but this tool has
+         // no such options and main() refuses to run without a reference.
+         << " -r, --reference FILE FASTA reference file (required)" << endl
+         << " -w, --window-size N Merge variants at most this many bp apart (default 30)" << endl
+         << " -o, --only-variants Don't output the entire haplotype, just concatenate" << endl
+         << " REF/ALT strings (delimited by \":\")" << endl
+         << endl
+         << "Convert genotype-based phased alleles within --window-size into haplotype alleles." << endl
+         << "Will break haplotype construction when encountering non-phased genotypes on input." << endl
+         << endl;
+    exit(0);
+}
+
+// Report whether every genotype in `var` is phased.
+//
+// A sample counts as unphased when it has a GT value longer than one
+// character that contains no "|". Samples without a GT field are ignored,
+// so a record with no genotypes at all is reported as phased.
+bool isPhased(Variant& var) {
+    typedef map<string, vector<string> > SampleFields;
+
+    map<string, SampleFields>::iterator entry = var.samples.begin();
+    for (; entry != var.samples.end(); ++entry) {
+        SampleFields& fields = entry->second;
+        SampleFields::iterator gtField = fields.find("GT");
+        if (gtField == fields.end()) {
+            continue;
+        }
+        const string& call = gtField->second.front();
+        const bool moreThanOneChar = call.size() > 1;
+        const bool hasPhaseBar = call.find("|") != string::npos;
+        if (moreThanOneChar && !hasPhaseBar) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// vcfgeno2haplo entry point.
+//
+// Streams a VCF (file argument or stdin), groups consecutive phased records
+// on the same sequence that fall within the window into clusters, and
+// replaces each multi-record cluster by one record whose alleles are the
+// haplotypes observed across the samples. Unphased records end the current
+// cluster and are written through unchanged.
+//
+// Exits with status 0 on success and 1 when the VCF cannot be opened, no
+// reference was given, or an unknown option is seen.
+int main(int argc, char** argv) {
+
+    string fastaFileName;
+    int windowsize = 30;
+    bool onlyVariants = false;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"window-size", required_argument, 0, 'w'},
+                {"reference", required_argument, 0, 'r'},
+                {"only-variants", no_argument, 0, 'o'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "how:r:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+            case 'o':
+                onlyVariants = true;
+                break;
+
+            case 'w':
+                windowsize = atoi(optarg);
+                break;
+
+            case 'r':
+                fastaFileName = string(optarg);
+                break;
+
+            case 'h':
+                printSummary(argv);
+                break;
+
+            case '?':
+                printSummary(argv);
+                exit(1);
+                break;
+
+            default:
+                abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open VCF file" << endl;
+        exit(1);
+    }
+
+    FastaReference reference;
+    if (fastaFileName.empty()) {
+        cerr << "a reference is required for haplotype allele generation" << endl;
+        exit(1);
+    }
+    reference.open(fastaFileName);
+
+    // pattern
+    // when variants are within windowSize from each other, build up local haplotypes
+    // establish all the haplotypes which exist within the window using genotypes+allele#+position map
+    // generate a haplotype allele string for each unique haplotype
+    // for completeness retain phasing information in the genotypes
+    // write a new VCF record in which there are haplotype alleles and correctly described genotypes for each sample
+    // if the variants are outside of the windowSize, just write out the record
+
+    Variant var(variantFile);
+    Variant outputVar(variantFile);
+
+    cout << variantFile.header << endl;
+
+    // records collected for the haplotype currently being assembled
+    vector<Variant> cluster;
+
+    while (variantFile.getNextVariant(var) || !cluster.empty()) {
+
+        bool haplotypeCluster = false;
+        // set when `var` is unphased and must be written after the pending
+        // cluster instead of becoming the first member of the next one
+        bool emitVarAfterCluster = false;
+
+        if (variantFile.done()) {
+            // Input exhausted: the loop condition guarantees the cluster is
+            // non-empty here, so flush it (single-record clusters are
+            // written through unchanged further down).
+            haplotypeCluster = true;
+        } else if (isPhased(var)) {
+            if (cluster.empty()
+                || (cluster.back().sequenceName == var.sequenceName
+                    && var.position - cluster.back().position + cluster.back().ref.size() - 1 <= windowsize)) {
+                cluster.push_back(var);
+            } else {
+                if (cluster.size() == 1) {
+                    cout << cluster.front() << endl;
+                    cluster.clear();
+                    if (!variantFile.done()) {
+                        cluster.push_back(var);
+                    }
+                } else {
+                    haplotypeCluster = true;
+                }
+            }
+        } else { // not phased
+            if (cluster.empty()) {
+                cout << var << endl;
+            } else if (cluster.size() == 1) {
+                cout << cluster.front() << endl;
+                // the pending record has now been written; without this it
+                // stayed in the cluster and was emitted a second time
+                cluster.clear();
+                cout << var << endl;
+            } else {
+                haplotypeCluster = true;
+                // an unphased record cannot take part in haplotype
+                // construction, so write it out after the cluster
+                emitVarAfterCluster = true;
+            }
+        }
+
+        // we need to deal with the current cluster, as our next var is outside of bounds
+        // process the last cluster if it's more than 1 var
+        if (haplotypeCluster) {
+
+            // generate haplotype alleles and genotypes!
+            // get the reference sequence across the haplotype in question
+            string referenceHaplotype = reference.getSubSequence(cluster.front().sequenceName,
+                                                                 cluster.front().position - 1,
+                                                                 cluster.back().position
+                                                                 + cluster.back().ref.size() - cluster.front().position);
+
+            // establish what haplotypes there are by parsing the (phased) genotypes across the samples over these records
+            map<string, vector<vector<int> > > sampleHaplotypes;
+            for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
+                // build the haplotype using the genotype fields in the variant cluster
+                // only build haplotypes for samples with complete information
+                string& sampleName = *s;
+                vector<vector<int> >& haplotypes = sampleHaplotypes[sampleName];
+
+                bool completeCoverage = true;
+                // ensure complete genotype coverage over the haplotype cluster
+                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
+                    if (v->samples.find(sampleName) == v->samples.end()
+                        || v->samples[sampleName].find("GT") == v->samples[sampleName].end()) {
+                        completeCoverage = false;
+                        break;
+                    }
+                }
+                if (!completeCoverage) {
+                    continue; // skip samples without complete coverage
+                }
+
+                // what's the ploidy?
+                {
+                    string& gt = cluster.front().samples[sampleName]["GT"].front();
+                    vector<string> gtspec = split(gt, "|");
+                    for (vector<string>::iterator g = gtspec.begin(); g != gtspec.end(); ++g) {
+                        vector<int> haplotype;
+                        haplotypes.push_back(haplotype);
+                    }
+                }
+
+                // NOTE(review): assumes every record in the cluster has the
+                // same ploidy for this sample as the first one -- confirm
+                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
+                    string& gt = v->samples[sampleName]["GT"].front();
+                    vector<string> gtspec = split(gt, "|");
+                    vector<string>::iterator g = gtspec.begin();
+                    for (vector<vector<int> >::iterator h = haplotypes.begin(); h != haplotypes.end(); ++h, ++g) {
+                        int j;
+                        convert(*g, j);
+                        h->push_back(j);
+                    }
+                }
+            }
+
+            set<vector<int> > uniqueHaplotypes;
+            for (map<string, vector<vector<int> > >::iterator hs = sampleHaplotypes.begin();
+                 hs != sampleHaplotypes.end(); ++hs) {
+                vector<vector<int> >& haps = hs->second;
+                for (vector<vector<int> >::iterator h = haps.begin(); h != haps.end(); ++h) {
+                    uniqueHaplotypes.insert(*h);
+                }
+            }
+
+            // write new haplotypes
+            map<vector<int>, string> haplotypeSeqs;
+            map<vector<int>, int> haplotypeIndexes;
+            map<int, string> alleles;
+
+            int impossibleHaplotypes = 0;
+
+            // always include the reference haplotype as 0
+            // when we come to it in the haplotypes, we'll ignore it
+            int alleleIndex = 1;
+            for (set<vector<int> >::iterator u = uniqueHaplotypes.begin(); u != uniqueHaplotypes.end(); ++u) {
+
+                string haplotype;
+                if (!onlyVariants) {
+                    haplotype = referenceHaplotype;
+                }
+                bool isreference = true;
+                bool impossibleHaplotype = false;
+                // net length change applied to `haplotype` so far
+                int referenceInsertOffset = 0;
+                int j = 0; // index into variant cluster
+                // one past the reference end of the last alternate applied
+                int lastrefend = 0;
+                for (vector<int>::const_iterator z = u->begin(); z != u->end(); ++z, ++j) {
+                    int i = *z;
+                    Variant& vartoInsert = cluster.at(j);
+                    if (i == 0) {
+                        if (onlyVariants) {
+                            if (!haplotype.empty()) haplotype.append(":");
+                            haplotype.append(vartoInsert.ref);
+                        }
+                    }
+                    if (i != 0) {
+                        isreference = false;
+                        string& alternate = vartoInsert.alleles.at(i);
+                        if (vartoInsert.position < lastrefend) {
+                            cerr << "impossible haplotype, overlapping alleles at " << vartoInsert.sequenceName << ":" << vartoInsert.position << endl;
+                            impossibleHaplotype = true;
+                            break;
+                        } else {
+                            if (onlyVariants) {
+                                if (!haplotype.empty()) haplotype.append(":");
+                                haplotype.append(alternate);
+                            } else {
+                                haplotype.replace(vartoInsert.position - cluster.front().position + referenceInsertOffset,
+                                                  vartoInsert.ref.size(), alternate);
+                                if (alternate.size() != vartoInsert.ref.size()) {
+                                    referenceInsertOffset += alternate.size() - vartoInsert.ref.size();
+                                }
+                                lastrefend = vartoInsert.position + vartoInsert.ref.size();
+                            }
+                        }
+                    }
+                }
+
+                if (impossibleHaplotype) {
+                    ++impossibleHaplotypes;
+                    haplotypeIndexes[*u] = -1; // indicates impossible haplotype
+                    impossibleHaplotype = false;
+                } else if (isreference) {
+                    alleles[0] = haplotype;
+                    haplotypeIndexes[*u] = 0;
+                } else {
+                    alleles[alleleIndex] = haplotype;
+                    haplotypeIndexes[*u] = alleleIndex;
+                    ++alleleIndex;
+                }
+                haplotypeSeqs[*u] = haplotype;
+                // if there's not a reference allele, add it
+                if (alleles.find(0) == alleles.end()) {
+                    alleles[0] = referenceHaplotype;
+                    // nb, there is no reference haplotype among
+                    // the samples, so we don't have to add it to
+                    // the haplotypeIndexes
+                }
+            }
+
+            if (onlyVariants) {
+                string newRef;
+                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
+                    if (!newRef.empty()) newRef.append(":");
+                    newRef.append(v->ref);
+                }
+                outputVar.ref = newRef;
+            } else {
+                outputVar.ref = alleles[0];
+            }
+            outputVar.alt.clear();
+            for (int i = 1; i < alleleIndex; ++i) {
+                outputVar.alt.push_back(alleles[i]);
+            }
+
+            outputVar.sequenceName = cluster.front().sequenceName;
+            outputVar.position = cluster.front().position;
+            outputVar.filter = ".";
+            outputVar.id = ".";
+            outputVar.info = cluster.front().info;
+            outputVar.samples.clear();
+            outputVar.format = cluster.front().format;
+
+            // now the genotypes
+            for (vector<string>::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) {
+                string& sampleName = *s;
+                vector<string> gt;
+                vector<vector<int> > & hs = sampleHaplotypes[sampleName];
+                for (vector<vector<int> >::iterator h = hs.begin(); h != hs.end(); ++h) {
+                    int hi = haplotypeIndexes[*h];
+                    if (hi != -1) {
+                        gt.push_back(convert(hi));
+                    } else {
+                        // nonexistent or impossible haplotype
+                        gt.push_back(".");
+                    }
+                }
+                if (gt.size() != 0) {
+                    outputVar.samples[sampleName]["GT"].push_back(join(gt, "|"));
+                }
+            }
+            if (cluster.size() - impossibleHaplotypes < 2) {
+                // nothing to merge: write the original records through
+                for (vector<Variant>::iterator v = cluster.begin(); v != cluster.end(); ++v) {
+                    cout << *v << endl;
+                }
+            } else {
+                if (!outputVar.alt.empty()) {
+                    cout << outputVar << endl;
+                } else {
+                    cerr << "no alternate alleles remain at " << outputVar.sequenceName << ":" << outputVar.position << " after haplotype validation" << endl;
+                }
+            }
+            cluster.clear();
+            if (emitVarAfterCluster) {
+                cout << var << endl;
+            } else if (!variantFile.done()) {
+                // `var` is the phased record that closed the cluster; it
+                // starts the next one
+                cluster.push_back(var);
+            }
+        }
+    }
+
+    exit(0); // why?
+    return 0;
+
+}
+
diff --git a/src/vcfgenosamplenames.cpp b/src/vcfgenosamplenames.cpp
new file mode 100644
index 0000000..32e065a
--- /dev/null
+++ b/src/vcfgenosamplenames.cpp
@@ -0,0 +1,39 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+// Reads a VCF from argv[1] (stdin when no argument is given), declares an SN
+// FORMAT field in the header and sets SN to the sample's own name in every
+// sample of every record. The result is written to stdout.
+// Returns 1 if the input is not open after the open() call, 0 otherwise.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+
+    if (argc > 1) {
+        string filename = argv[1];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    variantFile.addHeaderLine("##FORMAT=<ID=SN,Number=1,Type=String,Description=\"The name of the sample.\">");
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        // list SN in FORMAT only once, even when the input record already has it
+        bool hasSN = false;
+        for (vector<string>::iterator f = var.format.begin(); f != var.format.end(); ++f) {
+            if (*f == "SN") {
+                hasSN = true;
+                break;
+            }
+        }
+        if (!hasSN) {
+            var.format.push_back("SN");
+        }
+
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
+             s != var.samples.end(); ++s) {
+            vector<string>& sn = s->second["SN"];
+            sn.clear();
+            sn.push_back(s->first);
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfgenosummarize.cpp b/src/vcfgenosummarize.cpp
new file mode 100644
index 0000000..526bca1
--- /dev/null
+++ b/src/vcfgenosummarize.cpp
@@ -0,0 +1,107 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+
+// Reads a VCF (the last argument, or stdin when no argument is given) and
+// writes it to stdout with the INFO fields QR, AQR, QA, AQA and RQA computed
+// from the per-sample RO, QR, AO and QA values.
+// Returns 1 after printing usage or when the input file cannot be opened.
+int main(int argc, char** argv) {
+
+    // compare as std::string: argv[1] == "-h" would only compare two pointers
+    if (argc > 1 && (string(argv[1]) == "-h" || string(argv[1]) == "--help")) {
+        cerr << "usage: " << argv[0] << " <[input file] >[output vcf]" << endl
+             << "Adds summary statistics to each record summarizing qualities reported in" << endl
+             << "called genotypes. Uses:" << endl
+             << "RO (reference observation count), QR (quality sum reference observations)" << endl
+             << "AO (alternate observation count), QA (quality sum alternate observations)" << endl;
+        return 1;
+    }
+
+    VariantCallFile variantFile;
+    if (argc == 1) {
+        variantFile.open(std::cin);
+    } else {
+        string filename = argv[argc-1];
+        variantFile.open(filename);
+        if (!variantFile.is_open()) {
+            cerr << "could not open " << filename << endl;
+            return 1;
+        }
+    }
+
+    Variant var(variantFile);
+
+    variantFile.removeInfoHeaderLine("AQR");
+    variantFile.addHeaderLine("##INFO=<ID=AQR,Number=1,Type=Float,Description=\"Mean reference observation quality calculated by RO and QR in called samples.\">");
+    variantFile.removeInfoHeaderLine("AQA");
+    variantFile.addHeaderLine("##INFO=<ID=AQA,Number=A,Type=Float,Description=\"Mean alternate observation quality calculated by AO and QA in called samples.\">");
+    variantFile.removeInfoHeaderLine("QR");
+    variantFile.addHeaderLine("##INFO=<ID=QR,Number=1,Type=Float,Description=\"Quality sum of reference observations calculated by QR in called samples.\">");
+    variantFile.removeInfoHeaderLine("QA");
+    variantFile.addHeaderLine("##INFO=<ID=QA,Number=A,Type=Float,Description=\"Quality sum of alternate observations calculated by QA in called samples.\">");
+    variantFile.removeInfoHeaderLine("RQA");
+    variantFile.addHeaderLine("##INFO=<ID=RQA,Number=A,Type=Float,Description=\"Ratio of mean alternate observation quality to mean reference observation quality (MQA/MQR).\">");
+
+    // write the new header
+    cout << variantFile.header << endl;
+
+    while (variantFile.getNextVariant(var)) {
+        const size_t numAlts = var.alt.size();
+        int refobs = 0;
+        int refqual = 0;
+        vector<int> altobs(numAlts, 0);
+        vector<int> altqual(numAlts, 0);
+
+        // sum the observation counts and quality sums over all samples
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
+             s != var.samples.end(); ++s) {
+            map<string, vector<string> >& sample = s->second;
+
+            map<string, vector<string> >::iterator ro = sample.find("RO");
+            if (ro != sample.end() && !ro->second.empty()) {
+                int x = 0; // stays 0 when the value does not parse (e.g. ".")
+                convert(ro->second.front(), x);
+                refobs += x;
+                map<string, vector<string> >::iterator qr = sample.find("QR");
+                if (qr != sample.end() && !qr->second.empty()) {
+                    x = 0;
+                    convert(qr->second.front(), x);
+                    refqual += x;
+                }
+            }
+
+            map<string, vector<string> >::iterator ao = sample.find("AO");
+            if (ao != sample.end()) {
+                // a sample may list fewer values than there are alternates; never index past them
+                vector<string>& aos = ao->second;
+                for (size_t i = 0; i < numAlts && i < aos.size(); ++i) {
+                    int x = 0;
+                    convert(aos[i], x);
+                    altobs[i] += x;
+                }
+                map<string, vector<string> >::iterator qa = sample.find("QA");
+                if (qa != sample.end()) {
+                    vector<string>& qas = qa->second;
+                    for (size_t i = 0; i < numAlts && i < qas.size(); ++i) {
+                        int x = 0;
+                        convert(qas[i], x);
+                        altqual[i] += x;
+                    }
+                }
+            }
+        }
+
+        // replace, rather than append to, values the record already carried
+        var.info["QR"].clear();
+        var.info["AQR"].clear();
+        if (numAlts > 0) {
+            var.info["QA"].clear();
+            var.info["AQA"].clear();
+            var.info["RQA"].clear();
+        }
+
+        const bool noRef = (refobs == 0 || refqual == 0);
+        var.info["QR"].push_back(convert(refqual));
+        if (noRef) {
+            var.info["AQR"].push_back(convert(0));
+        } else {
+            var.info["AQR"].push_back(convert((double)refqual/(double)refobs));
+        }
+
+        for (size_t i = 0; i != numAlts; ++i) {
+            // an allele without observations gets mean quality 0 instead of 0/0
+            double meanAltQual = 0;
+            if (altobs[i] != 0) {
+                meanAltQual = (double)altqual[i]/(double)altobs[i];
+            }
+            var.info["QA"].push_back(convert(altqual[i]));
+            var.info["AQA"].push_back(convert(meanAltQual));
+            if (noRef) {
+                var.info["RQA"].push_back(convert(1));
+            } else {
+                var.info["RQA"].push_back(convert(meanAltQual / ((double)refqual/(double)refobs)));
+            }
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfgenotypecompare.cpp b/src/vcfgenotypecompare.cpp
new file mode 100644
index 0000000..c81043f
--- /dev/null
+++ b/src/vcfgenotypecompare.cpp
@@ -0,0 +1,327 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+#include <sstream>
+
+using namespace std;
+using namespace vcf;
+
+// Maps a decomposed genotype to a two-letter class: "NN" when isNull() holds,
+// otherwise "AA" or "RR" when isHom() holds (with or without hasNonRef()),
+// and "AR" in every remaining case.
+// TODO fix this for multi-allelic!!!!
+string genotypeSpec(map<int, int>& genotype) {
+    if (isNull(genotype)) {
+        return "NN";
+    }
+    if (!isHom(genotype)) {
+        return "AR";
+    }
+    return hasNonRef(genotype) ? "AA" : "RR";
+}
+
+// Stores `value` as the only entry of INFO field `key` in `var`, discarding
+// any values the field already held.
+template<typename T>
+static void setInfoValue(Variant& var, const string& key, const T& value) {
+    stringstream out;
+    out << value;
+    vector<string>& field = var.info[key];
+    field.clear();
+    field.push_back(out.str());
+}
+
+// Compares, for every sample of every record, the GT genotype with the
+// genotype stored under <other-genotype-tag> (argv[1]) and writes the record
+// to stdout with the resulting tallies added to INFO. argv[2] names the VCF
+// file, "-" meaning stdin.
+// Returns 1 on bad usage or when the input is not open, 0 otherwise.
+int main(int argc, char** argv) {
+
+    if (argc != 3) {
+        cerr << "usage: " << argv[0] << " <other-genotype-tag> <vcf file>" << endl
+             << "adds statistics to the INFO field of the vcf file describing the" << endl
+             << "amount of discrepancy between the genotypes (GT) in the vcf file and the" << endl
+             << "genotypes reported in the <other-genotype-tag>. use this after" << endl
+             << "vcfannotategenotypes to get correspondence statistics for two vcfs." << endl;
+        return 1;
+    }
+
+    string otherGenoTag = argv[1];
+    string filename = argv[2];
+
+    VariantCallFile variantFile;
+    if (filename == "-") {
+        variantFile.open(std::cin);
+    } else {
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    // every ordered pair of genotype classes, e.g. "AA_AR": class of GT, then class of the other tag
+    const char* classes[] = { "AA", "AR", "RR", "NN" };
+    vector<string> specs;
+    for (int a = 0; a < 4; ++a) {
+        for (int b = 0; b < 4; ++b) {
+            specs.push_back(string(classes[a]) + "_" + classes[b]);
+        }
+    }
+
+    for (vector<string>::iterator spec = specs.begin(); spec != specs.end(); ++spec) {
+        string line = "##INFO=<ID=" + otherGenoTag + ".genotypes." + *spec
+            + ",Number=1,Type=Integer,Description=\"Number of genotypes with "
+            + *spec + " relationship with " + otherGenoTag + "\">";
+        variantFile.addHeaderLine(line);
+    }
+
+    string line;
+
+    line = "##INFO=<ID=" + otherGenoTag + ".genotypes.count,Number=1,Type=Integer,Description=\"Count of genotypes under comparison.\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag + ".genotypes.alternate_count,Number=1,Type=Integer,Description=\"Count of alternate genotypes in the first file.\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.alternate_positive_discrepancy,Number=1,Type=Integer,Description=\"Estimated positive discrepancy rate of "
+        + otherGenoTag + " genotypes, where positive discrepancies are all cases where an alternate allele is called GT "
+        + " but none is represented in " + otherGenoTag + " or " + otherGenoTag + " is null/no-call\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.alternate_negative_discrepancy,Number=1,Type=Integer,Description=\"Estimated negative discrepancy rate of "
+        + otherGenoTag + " genotypes, where negative discrepancies are all cases where no alternate allele is called in "
+        + " GT but an alternate is represented in " + otherGenoTag + ", including no-calls or partly null genotypes\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.alternate_null_discrepancy,Number=1,Type=Integer,Description=\"Estimated null discrepancy rate of "
+        + otherGenoTag + " genotypes, where null discrepancies are all cases where GT is specified and contains an alternate but "
+        + otherGenoTag + " is null. Cases where GT is null or partly null are excluded.\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.call_discrepancy,Number=1,Type=Integer,Description=\"Estimated call discrepancy rate of "
+        + otherGenoTag + " genotypes (het->hom, hom->het) between " + otherGenoTag + " and GT\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.call_concordance,Number=1,Type=Integer,Description=\"Estimated call concordance rate of "
+        + otherGenoTag + " genotypes between " + otherGenoTag + " and GT\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.non_reference_discrepancy,Number=1,Type=Float,Description=\"Estimated non-reference discrepancy relative to "
+        + otherGenoTag + " genotypes,\">";
+    variantFile.addHeaderLine(line);
+
+    // VCF has no "Int" type; integer-valued fields are declared Type=Integer
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.non_reference_discrepancy.count,Number=1,Type=Integer,Description=\"non-reference discrepancy count relative to "
+        + otherGenoTag + " genotypes,\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.non_reference_discrepancy.normalizer,Number=1,Type=Integer,Description=\"non-reference discrepancy normalizer relative to "
+        + otherGenoTag + " genotypes,\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.non_reference_sensitivity,Number=1,Type=Float,Description=\"Estimated non-reference sensitivity relative to "
+        + otherGenoTag + " genotypes,\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.non_reference_sensitivity.count,Number=1,Type=Integer,Description=\"non-reference sensitivity count relative to "
+        + otherGenoTag + " genotypes,\">";
+    variantFile.addHeaderLine(line);
+
+    line = "##INFO=<ID=" + otherGenoTag
+        + ".site.non_reference_sensitivity.normalizer,Number=1,Type=Integer,Description=\"non-reference sensitivity normalizer relative to "
+        + otherGenoTag + " genotypes,\">";
+    variantFile.addHeaderLine(line);
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+
+    while (variantFile.getNextVariant(var)) {
+
+        // for each sample, check GT against <other-genotype-tag>
+        // tally stats, and append to info
+        map<string, int> genotypeComparisonCounts;
+        int gtCount = var.samples.size();
+        int gtAltCount = 0; // number of alternate-containing genotypes in the first file
+        int pdCount = 0; // positive discrepancy count
+        int ndCount = 0; // negative discrepancy count
+        int nnCount = 0; // null discrepancy count
+        int cdCount = 0; // call discrepancy count
+        int ccCount = 0; // call concordance count
+        int nrdCount = 0; // non-reference discrepancy count
+        int nrdNormalizer = 0; // divisor for nrd rate
+        int nrsCount = 0; // non-reference sensitivity count
+        int nrsNormalizer = 0; // divisor for nrs rate
+
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
+             s != var.samples.end(); ++s) {
+            map<string, vector<string> >& sample = s->second;
+
+            // a missing (or empty) genotype field is treated as the null genotype "./."
+            map<string, vector<string> >::iterator fieldA = sample.find("GT");
+            string gtA = "./.";
+            if (fieldA != sample.end() && !fieldA->second.empty()) {
+                gtA = fieldA->second.front();
+            }
+
+            map<string, vector<string> >::iterator fieldB = sample.find(otherGenoTag);
+            string gtB = "./.";
+            if (fieldB != sample.end() && !fieldB->second.empty()) {
+                gtB = fieldB->second.front();
+            }
+
+            // decompose genotypes into counts of strings
+            // to facilitate comparison
+            map<int, int> genotypeA = decomposeGenotype(gtA);
+            map<int, int> genotypeB = decomposeGenotype(gtB);
+
+            string gtspecA = genotypeSpec(genotypeA);
+            string gtspecB = genotypeSpec(genotypeB);
+            ++genotypeComparisonCounts[gtspecA + "_" + gtspecB];
+
+            if (hasNonRef(genotypeA)) {
+                ++gtAltCount;
+            }
+
+            if (genotypeA != genotypeB) {
+                if (isNull(genotypeA)) {
+                    // TODO handle this somehow, maybe via a different flag?
+                    if (!isNull(genotypeB)) {
+                        ++nnCount; // null discrepancy, the second set makes a call, this one does not
+                    }
+                } else if (hasNonRef(genotypeA)) {
+                    if (!isNull(genotypeB) && hasNonRef(genotypeB)) { // they cannot be the same, but they both represent an alternate
+                        ++cdCount; // the calls are discrepant
+                    } else { // the other call does not have an alternate
+                        ++pdCount;
+                        // it is also null
+                        if (isNull(genotypeB)) {
+                            ++nnCount;
+                        }
+                    }
+                } else { // the current genotype has no non-ref alternate
+                    if (!isNull(genotypeB) && hasNonRef(genotypeB)) {
+                        ++ndCount;
+                    }
+                    if (isNull(genotypeB)) {
+                        ++nnCount;
+                    }
+                }
+            } else {
+                if (!isNull(genotypeA)) {
+                    ++ccCount;
+                }
+            }
+
+            if (!(isNull(genotypeA) || isNull(genotypeB))
+                && !(isHomRef(genotypeA) && isHomRef(genotypeB))) {
+                ++nrdNormalizer;
+                if (genotypeA != genotypeB) {
+                    ++nrdCount;
+                }
+            }
+
+            if (!(isNull(genotypeB) || isHomRef(genotypeB))) {
+                ++nrsNormalizer;
+                if (!(isNull(genotypeA) || isHomRef(genotypeA))) {
+                    ++nrsCount;
+                }
+            }
+
+        }
+
+        // every tally replaces whatever the record already stored under the same key
+        for (map<string, int>::iterator g = genotypeComparisonCounts.begin();
+             g != genotypeComparisonCounts.end(); ++g) {
+            setInfoValue(var, otherGenoTag + ".genotypes." + g->first, g->second);
+        }
+
+        setInfoValue(var, otherGenoTag + ".genotypes.count", gtCount);
+        setInfoValue(var, otherGenoTag + ".genotypes.alternate_count", gtAltCount);
+        setInfoValue(var, otherGenoTag + ".site.alternate_positive_discrepancy", pdCount);
+        setInfoValue(var, otherGenoTag + ".site.alternate_negative_discrepancy", ndCount);
+        setInfoValue(var, otherGenoTag + ".site.alternate_null_discrepancy", nnCount);
+        setInfoValue(var, otherGenoTag + ".site.call_discrepancy", cdCount);
+        setInfoValue(var, otherGenoTag + ".site.call_concordance", ccCount);
+
+        setInfoValue(var, otherGenoTag + ".site.non_reference_discrepancy.count", nrdCount);
+        setInfoValue(var, otherGenoTag + ".site.non_reference_discrepancy.normalizer", nrdNormalizer);
+        // the rate is only defined when at least one genotype pair qualified
+        if (nrdNormalizer > 0) {
+            setInfoValue(var, otherGenoTag + ".site.non_reference_discrepancy",
+                         (double) nrdCount / (double) nrdNormalizer);
+        }
+
+        setInfoValue(var, otherGenoTag + ".site.non_reference_sensitivity.count", nrsCount);
+        setInfoValue(var, otherGenoTag + ".site.non_reference_sensitivity.normalizer", nrsNormalizer);
+        if (nrsNormalizer > 0) {
+            setInfoValue(var, otherGenoTag + ".site.non_reference_sensitivity",
+                         (double) nrsCount / (double) nrsNormalizer);
+        }
+
+        cout << var << endl;
+
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfgenotypes.cpp b/src/vcfgenotypes.cpp
new file mode 100644
index 0000000..4fe7965
--- /dev/null
+++ b/src/vcfgenotypes.cpp
@@ -0,0 +1,66 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+
+// Prints one tab-separated line per record: sequence name, position, ref, the
+// output of printAlt() and printAlleles(), then "<sample>:<allele>/<allele>"
+// for each sample, where the GT allele indexes are replaced by the entries of
+// var.alleles. Null alleles are printed as ".".
+// argv[1] names the VCF file, "-" meaning stdin.
+// Returns 1 on bad usage or when the input is not open, 0 otherwise.
+int main(int argc, char** argv) {
+
+    if (argc != 2) {
+        cerr << "usage: " << argv[0] << " <vcf file>" << endl
+             << "report the genotypes for each sample, for each variant in the vcf file" << endl;
+        return 1;
+    }
+
+    string filename = argv[1];
+
+    VariantCallFile variantFile;
+
+    if (filename == "-") {
+        variantFile.open(std::cin);
+    } else {
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        cout << var.sequenceName << "\t"
+             << var.position << "\t"
+             << var.ref << "\t";
+        var.printAlt(cout); cout << "\t";
+        var.printAlleles(cout); cout << "\t";
+
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
+             s != var.samples.end(); ++s) {
+            map<string, vector<string> >& sample = s->second;
+
+            // a sample without a GT value is reported as a single null allele
+            string genotype = ".";
+            map<string, vector<string> >::iterator gtField = sample.find("GT");
+            if (gtField != sample.end() && !gtField->second.empty()) {
+                genotype = gtField->second.front(); // XXX assumes we can only have one GT value
+            }
+            vector<string> gt = split(genotype, "|/");
+
+            // report the sample and its genotype
+            cout << s->first << ":";
+            for (vector<string>::iterator g = gt.begin(); g != gt.end(); ++g) {
+                // compare the string contents; g->c_str() == "." would compare pointers
+                if (*g == ".") {
+                    cout << ".";
+                } else {
+                    int index = atoi(g->c_str());
+                    // an index outside the allele list cannot be resolved to a sequence
+                    if (index >= 0 && index < (int) var.alleles.size()) {
+                        cout << var.alleles[index];
+                    } else {
+                        cout << ".";
+                    }
+                }
+                if (g != (gt.end()-1)) cout << "/";
+            }
+            cout << "\t";
+        }
+        cout << endl;
+    }
+    return 0;
+
+}
+
diff --git a/src/vcfglbound.cpp b/src/vcfglbound.cpp
new file mode 100644
index 0000000..0b42f22
--- /dev/null
+++ b/src/vcfglbound.cpp
@@ -0,0 +1,178 @@
+#include "Variant.h"
+#include "split.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+// Writes the usage text to stderr. Does not terminate the program: the caller
+// picks the exit status, so a rejected option can exit non-zero while -h
+// exits with 0.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
+         << endl
+         << "options:" << endl
+         << " -b, --bound N Bound GLs to this limit." << endl
+         << " -x, --exclude-broken If GLs are > 0, remove site." << endl
+         << endl
+         << "Adjust GLs so that the maximum GL is 0 by dividing all GLs for each sample by the max." << endl
+         << "Then cap (bound) at N (e.g. -10)." << endl;
+}
+
+
+// Rescales the GL values of every sample so that the largest becomes 0 (the
+// maximum is subtracted from each value) and none is smaller than the bound
+// given with -b, then writes the VCF to stdout. Records without GL in FORMAT
+// are passed through unchanged. A sample holding a GL > 0 is left untouched,
+// or, with -x, causes the whole record to be dropped.
+// The input is the single positional argument, or stdin when there is none.
+// Exits with 1 on a bad option or a missing/non-negative bound.
+int main(int argc, char** argv) {
+
+    bool excludeBroken = false;
+    double glBound = 0;
+    int c;
+
+    while (true) {
+        static struct option long_options[] =
+        {
+            {"help", no_argument, 0, 'h'},
+            {"bound", required_argument, 0, 'b'},
+            {"exclude-broken", no_argument, 0, 'x'},
+            {0, 0, 0, 0}
+        };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hxb:",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+        case 0:
+            /* If this option set a flag, do nothing else now. */
+            if (long_options[option_index].flag != 0)
+                break;
+            printf ("option %s", long_options[option_index].name);
+            if (optarg)
+                printf (" with arg %s", optarg);
+            printf ("\n");
+            break;
+
+        case 'b':
+            glBound = atof(optarg);
+            break;
+
+        case 'x':
+            excludeBroken = true;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            exit(0);
+            break;
+
+        case '?':
+            /* getopt_long already printed an error message. */
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    // the bound is a floor for values whose maximum is 0: a bound >= 0 would
+    // overwrite every GL with the bound itself
+    if (glBound >= 0) {
+        cerr << "a negative bound is required when running vcfglbound (try -10)" << endl;
+        exit(1);
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        if (find(var.format.begin(), var.format.end(), "GL") == var.format.end()) {
+            cout << var << endl;
+            continue;
+        }
+        // NOTE(review): this adds GT to FORMAT and reverses the whole FORMAT order although
+        // no GT value is written here; kept as is -- confirm whether it is intended.
+        if (find(var.format.begin(), var.format.end(), "GT") == var.format.end()) {
+            var.format.push_back("GT");
+            reverse(var.format.begin(), var.format.end());
+        }
+        bool isbroken = false;
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
+             s != var.samples.end(); ++s) {
+            map<string, vector<string> >& sample = s->second;
+            map<string, vector<string> >::iterator l = sample.find("GL");
+            if (l == sample.end()) {
+                continue;
+            }
+
+            // parse the sample's GLs
+            vector<string>& glstrs = l->second;
+            vector<double> gls;
+            for (vector<string>::iterator gl = glstrs.begin(); gl != glstrs.end(); ++gl) {
+                double d = 0; // defined value even if the string does not parse
+                convert(*gl, d);
+                gls.push_back(d);
+            }
+
+            isbroken = false; // reset every iteration
+            for (vector<double>::iterator g = gls.begin(); g != gls.end(); ++g) {
+                if (*g > 0) {
+                    isbroken = true;
+                    break;
+                }
+            }
+            if (isbroken) {
+                if (excludeBroken) {
+                    cerr << var.sequenceName << ":" << var.position << ", sample " << s->first << " has GL > 0" << endl;
+                    break; // leaves isbroken set so the record is dropped below
+                } else {
+                    cerr << "VCF record @ " << var.sequenceName << ":" << var.position << ", sample " << s->first << " has GL > 0, not processing, but outputting" << endl;
+                    continue;
+                }
+            }
+
+            // shift so that the largest GL is 0; all values are <= 0 at this point
+            double maxGL = gls.empty() ? 0 : gls.front();
+            for (vector<double>::iterator g = gls.begin(); g != gls.end(); ++g) {
+                if (*g > maxGL) maxGL = *g;
+            }
+            for (vector<double>::iterator g = gls.begin(); g != gls.end(); ++g) {
+                *g = max(glBound, *g - maxGL);
+            }
+
+            // and pack back into GL field
+            glstrs.clear();
+            for (vector<double>::iterator g = gls.begin(); g != gls.end(); ++g) {
+                glstrs.push_back(convert(*g));
+            }
+        }
+        if (excludeBroken && isbroken) {
+            cerr << "excluding VCF record @ " << var.sequenceName << ":" << var.position << " due to GLs > 0" << endl;
+        } else {
+            cout << var << endl;
+        }
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfglxgt.cpp b/src/vcfglxgt.cpp
new file mode 100644
index 0000000..5909bdc
--- /dev/null
+++ b/src/vcfglxgt.cpp
@@ -0,0 +1,171 @@
+#include "Variant.h"
+#include "split.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+// Writes the usage text to stderr. Does not terminate the program: the caller
+// picks the exit status, so a rejected option can exit non-zero while -h
+// exits with 0.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
+         << endl
+         << "options:" << endl
+         << " -n, --fix-null-genotypes only apply to null and partly-null genotypes" << endl
+         << endl
+         << "Set genotypes using the maximum genotype likelihood for each sample." << endl
+         << endl;
+}
+
+
+// Sets each sample's GT to the genotype whose GL is largest and writes the
+// VCF to stdout. With -n only genotypes containing "." are rewritten.
+// Records without GL in FORMAT are passed through unchanged; samples whose GL
+// is empty or missing (".") keep their genotype.
+// The input is the single positional argument, or stdin when there is none.
+int main(int argc, char** argv) {
+
+    bool fixNull = false;
+    int c;
+
+    while (true) {
+        static struct option long_options[] =
+        {
+            {"help", no_argument, 0, 'h'},
+            {"fix-null-genotypes", no_argument, 0, 'n'},
+            {0, 0, 0, 0}
+        };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hn",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+        case 0:
+            /* If this option set a flag, do nothing else now. */
+            if (long_options[option_index].flag != 0)
+                break;
+            printf ("option %s", long_options[option_index].name);
+            if (optarg)
+                printf (" with arg %s", optarg);
+            printf ("\n");
+            break;
+
+        case 'n':
+            fixNull = true;
+            break;
+
+        case 'h':
+            printSummary(argv);
+            exit(0);
+            break;
+
+        case '?':
+            /* getopt_long already printed an error message. */
+            printSummary(argv);
+            exit(1);
+            break;
+
+        default:
+            abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    // genotype orderings returned by glorder(), keyed by (ploidy, number of alleles)
+    map<pair<int, int>, list<list<int> > > glOrderCache;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        if (find(var.format.begin(), var.format.end(), "GL") == var.format.end()) {
+            cout << var << endl;
+            continue;
+        }
+        // GT is appended and FORMAT reversed when the record has no GT yet
+        if (find(var.format.begin(), var.format.end(), "GT") == var.format.end()) {
+            var.format.push_back("GT");
+            reverse(var.format.begin(), var.format.end());
+        }
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
+             s != var.samples.end(); ++s) {
+            map<string, vector<string> >& sample = s->second;
+            map<string, vector<string> >::iterator l = sample.find("GL");
+            if (l == sample.end()) {
+                continue;
+            }
+
+            // make sure there is a GT value to inspect and overwrite
+            map<string, vector<string> >::iterator g = sample.find("GT");
+            if (g == sample.end()) {
+                sample["GT"].push_back("./.");
+                g = sample.find("GT");
+            } else if (g->second.empty()) {
+                g->second.push_back("./.");
+            }
+
+            string& gt = g->second.front();
+            // if we are fixing null but the genotype is fully specified, continue
+            if (fixNull && gt.find(".") == string::npos) continue;
+            string splitter = "/";
+            if (gt.find("|") != string::npos) splitter = "|";
+            int samplePloidy = split(gt, splitter).size();
+            int numAlleles = var.alt.size() + 1; // including reference
+
+            // get the gt GL ordering
+            pair<int, int> pa = make_pair(samplePloidy, numAlleles);
+            if (glOrderCache.find(pa) == glOrderCache.end()) {
+                glOrderCache[pa] = glorder(samplePloidy, numAlleles);
+            }
+            list<list<int> >& glOrdering = glOrderCache[pa];
+
+            // nothing to choose from when the sample has no usable likelihoods
+            vector<string>& gls = l->second;
+            if (gls.empty() || gls.front() == ".") continue;
+
+            // find the gl max
+            double maxGl = 0;
+            convert(gls.front(), maxGl);
+            int maxindex = 0;
+            for (size_t i = 1; i < gls.size(); ++i) {
+                if (gls[i] == ".") continue; // a missing value can never be the maximum
+                double cgl = 0;
+                convert(gls[i], cgl);
+                if (cgl > maxGl) {
+                    maxGl = cgl;
+                    maxindex = (int) i; // prefers == gls in order of listing
+                }
+            }
+
+            // more GLs than genotypes for this ploidy/allele count: no genotype to map to
+            if (maxindex >= (int) glOrdering.size()) continue;
+
+            // determine which genotype it represents and store it
+            list<list<int> >::iterator b = glOrdering.begin();
+            advance(b, maxindex);
+
+            vector<string>& gtv = g->second;
+            gtv.clear();
+            gtv.push_back(join(*b, "/"));
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfhetcount.cpp b/src/vcfhetcount.cpp
new file mode 100644
index 0000000..3dd4561
--- /dev/null
+++ b/src/vcfhetcount.cpp
@@ -0,0 +1,72 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+
+// For every sample, sums the number of non-reference alleles over all
+// genotypes that are not made up of non-reference alleles only, and prints a
+// line of sample names followed by a line of the corresponding counts.
+// Genotypes that are missing or contain a null allele (".") are not counted.
+// The input is argv[1] when exactly one argument is given, stdin otherwise.
+// Returns 1 after printing usage or when the input is not open.
+int main(int argc, char** argv) {
+
+    // compare as std::string: argv[1] == "-h" would only compare two pointers
+    if (argc == 2 && (string(argv[1]) == "-h" || string(argv[1]) == "--help")) {
+        cerr << "usage: " << argv[0] << " <vcf file>" << endl
+             << "count the number of alternate alleles in heterozygous genotypes in all records in the vcf file" << endl
+             << "outputs a count for each individual in the file" << endl;
+        return 1;
+    }
+
+    // no option parsing takes place, so the file name can only be argv[1]
+    string inputFilename;
+    VariantCallFile variantFile;
+    if (argc == 2) {
+        inputFilename = argv[1];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    map<string, unsigned int> hetCounts;
+    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
+        hetCounts[*s] = 0;
+    }
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
+            const string& name = s->first;
+            map<string, vector<string> >& sample = s->second;
+
+            map<string, vector<string> >::iterator gtField = sample.find("GT");
+            if (gtField == sample.end() || gtField->second.empty()) {
+                continue; // no genotype recorded for this sample
+            }
+            vector<string> gt = split(gtField->second.front(), "|/");
+
+            size_t alt = 0;
+            bool hasNull = false;
+            for (vector<string>::iterator g = gt.begin(); g != gt.end(); ++g) {
+                if (*g == ".") {
+                    hasNull = true;
+                } else if (*g != "0") {
+                    ++alt;
+                }
+            }
+            // a null allele is unknown, not alternate: skip such genotypes entirely
+            if (!hasNull && alt != gt.size()) {
+                hetCounts[name] += alt;
+            }
+        }
+    }
+
+    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
+        cout << (s == variantFile.sampleNames.begin() ? "" : "\t") << *s;
+    }
+    cout << endl;
+    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
+        cout << (s == variantFile.sampleNames.begin() ? "" : "\t") << hetCounts[*s];
+    }
+    cout << endl;
+
+    return 0;
+
+}
+
diff --git a/src/vcfhethomratio.cpp b/src/vcfhethomratio.cpp
new file mode 100644
index 0000000..10e39f8
--- /dev/null
+++ b/src/vcfhethomratio.cpp
@@ -0,0 +1,66 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+
+// Counts, per sample, the genotypes for which isHet() holds and those for
+// which isHomNonRef() holds, and prints a line of sample names followed by a
+// line of het/hom ratios. Samples without a GT value in a record are skipped
+// for that record. argv[1] names the VCF file, "-" meaning stdin.
+// Returns 1 on bad usage or when the input is not open, 0 otherwise.
+int main(int argc, char** argv) {
+
+    if (argc != 2) {
+        cerr << "usage: " << argv[0] << " <vcf file>" << endl
+             << "outputs the het/hom ratio for each individual in the file" << endl;
+        return 1;
+    }
+
+    string filename = argv[1];
+
+    VariantCallFile variantFile;
+    if (filename == "-") {
+        variantFile.open(std::cin);
+    } else {
+        variantFile.open(filename);
+    }
+    if (!variantFile.is_open()) {
+        cerr << "could not open " << filename << endl;
+        return 1;
+    }
+
+    map<string, unsigned int> hetCounts;
+    map<string, unsigned int> homCounts;
+    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
+        hetCounts[*s] = 0;
+        homCounts[*s] = 0;
+    }
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
+            const string& name = s->first;
+            map<string, vector<string> >& sample = s->second;
+
+            // look GT up without operator[]: front() on a freshly inserted empty vector is undefined
+            map<string, vector<string> >::iterator gtField = sample.find("GT");
+            if (gtField == sample.end() || gtField->second.empty()) {
+                continue;
+            }
+            string& gt = gtField->second.front();
+            map<int, int> genotype = decomposeGenotype(gt);
+            if (isHet(genotype)) {
+                ++hetCounts[name];
+            } else if (isHomNonRef(genotype)) {
+                ++homCounts[name];
+            }
+        }
+    }
+
+    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
+        cout << (s == variantFile.sampleNames.begin() ? "" : "\t") << *s;
+    }
+    cout << endl;
+    // floating-point division: a sample with a hom count of 0 yields inf or nan
+    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
+        cout << (s == variantFile.sampleNames.begin() ? "" : "\t") << (double) hetCounts[*s] / (double) homCounts[*s];
+    }
+    cout << endl;
+
+    return 0;
+
+}
+
diff --git a/src/vcfindex.cpp b/src/vcfindex.cpp
new file mode 100644
index 0000000..24a9401
--- /dev/null
+++ b/src/vcfindex.cpp
@@ -0,0 +1,42 @@
+#include "Variant.h"
+#include "convert.h"
+#include <vector>
+
+using namespace std;
+using namespace vcf;
+
+// Numbers the alternate alleles of a VCF: every alternate allele of every
+// record receives the next value of a counter that starts at 0 and runs over
+// the whole file, stored in the INFO field "id". Input is argv[1], or stdin
+// when no argument is given; output goes to stdout.
+// Returns 1 when the input is not open, 0 otherwise.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+
+    if (argc <= 1) {
+        variantFile.open(std::cin);
+    } else {
+        string filename = argv[1];
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    string idname = "id";
+    long int uid = 0;
+
+    variantFile.addHeaderLine("##INFO=<ID="+idname+",Number=A,Type=Integer,Description=\"Unique numerical identifier of allele in file.\">");
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        // start from an empty field, then add one identifier per alternate allele
+        vector<string>& alleleIds = var.info[idname];
+        alleleIds.clear();
+        for (size_t remaining = var.alt.size(); remaining > 0; --remaining) {
+            alleleIds.push_back(convert(uid));
+            ++uid;
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfinfo2qual.cpp b/src/vcfinfo2qual.cpp
new file mode 100644
index 0000000..2c4b961
--- /dev/null
+++ b/src/vcfinfo2qual.cpp
@@ -0,0 +1,50 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) {
+    // Sets each record's QUAL to the mean of the values stored under INFO[key]; returns 1 on usage or open failure.
+    VariantCallFile variantFile;
+
+    if (argc == 1) {
+        cerr << "usage: " << argv[0] << " [key] [vcf_file]" << endl
+             << "Sets QUAL from info field tag keyed by [key]." << endl
+             << "The VCF file may be omitted and read from stdin." << endl
+             << "The average of the field is used if it contains multiple values." << endl;
+        return 1;
+    }
+
+    string key = argv[1];
+
+    if (argc > 2) {
+        string filename = argv[2];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        // find() rather than operator[]: a lookup must not insert an empty INFO entry for a missing key
+        map<string, vector<string> >::iterator f = var.info.find(key);
+        if (f != var.info.end() && !f->second.empty()) {
+            double vs = 0;
+            for (vector<string>::iterator i = f->second.begin(); i != f->second.end(); ++i) {
+                double v = 0;   // defined value even if convert() cannot parse the text
+                convert(*i, v);
+                vs += v;
+            }
+            var.quality = vs / (double) f->second.size();
+        }   // otherwise QUAL stays as read; averaging zero values would divide by zero
+        cout << var << endl;
+    }
+    return 0;
+}
+
diff --git a/src/vcfinfosummarize.cpp b/src/vcfinfosummarize.cpp
new file mode 100644
index 0000000..3e0f0f9
--- /dev/null
+++ b/src/vcfinfosummarize.cpp
@@ -0,0 +1,212 @@
+#include "Variant.h"
+#include "split.h"
+#include "fastahack/Fasta.h"
+#include <getopt.h>
+#include <algorithm>
+#include <numeric>
+
+using namespace std;
+using namespace vcf;
+
+void printSummary(char** argv) {
+    // Writes the usage text to stderr, naming the program as argv[0], and then
+    // terminates the process with exit status 0; control never returns to the caller.
+    const char* program = argv[0];
+    cerr << "usage: " << program << " [options] <vcf file>" << endl << endl;
+    cerr << "options:" << endl
+         << " -f, --field Summarize this field in the INFO column" << endl
+         << " -i, --info Store the computed statistic in this info field" << endl
+         << " -a, --average Take the mean for field (default)" << endl
+         << " -m, --median Use the median" << endl
+         << " -n, --min Use the min" << endl
+         << " -x, --max Use the max" << endl << endl;
+    cerr << "Take annotations given in the per-sample fields and add the mean, median, min, or max" << endl
+         << "to the site-level INFO." << endl << endl;
+    exit(0);
+}
+
+double median(vector<double> &v)
+{   // Returns the element at index size()/2 of v in sorted order (upper middle for even sizes); reorders v; v must be non-empty.
+    vector<double>::iterator pivot = v.begin() + v.size() / 2;
+    nth_element(v.begin(), pivot, v.end());
+    return *pivot;
+}
+
+double mean(vector<double> &v)
+{   // Returns the sum of v divided by v.size(); no check is made for an empty vector.
+    const size_t count = v.size();
+    return accumulate(v.begin(), v.end(), 0.0) / count;
+}
+
+enum StatType { MEAN, MEDIAN, MIN, MAX }; // summary statistic selected in main() by -a / -m / -n / -x
+
+int main(int argc, char** argv) {
+    // Adds INFO[infoField] = mean/median/min/max of the numeric values found under INFO[sitewideField].
+    int c;
+    string sitewideField;   // -f: INFO key whose values are summarised
+    string infoField;       // -i: INFO key that receives the statistic
+    StatType statType = MEAN;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    while (true) {
+        static struct option long_options[] =
+            {
+                /* These options set a flag. */
+                {"help", no_argument, 0, 'h'},
+                {"field", required_argument, 0, 'f'},
+                {"info", required_argument, 0, 'i'},
+                {"average", no_argument, 0, 'a'},
+                {"median", no_argument, 0, 'm'},
+                {"min", no_argument, 0, 'n'},
+                {"max", no_argument, 0, 'x'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hamnxf:i:",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+            {
+            case 0:
+                /* If this option set a flag, do nothing else now. */
+                if (long_options[option_index].flag != 0)
+                    break;
+                printf ("option %s", long_options[option_index].name);
+                if (optarg)
+                    printf (" with arg %s", optarg);
+                printf ("\n");
+                break;
+
+            case 'f':
+                sitewideField = optarg;
+                break;
+
+            case 'i':
+                infoField = optarg;
+                break;
+
+            case 'a':
+                statType = MEAN;
+                break;
+
+            case 'm':
+                statType = MEDIAN;
+                break;
+
+            case 'n':
+                statType = MIN;
+                break;
+
+            case 'x':
+                statType = MAX;
+                break;
+
+            case 'h':
+                printSummary(argv);
+                exit(0);
+
+            case '?':
+                /* getopt_long already printed an error message. */
+                printSummary(argv);
+                exit(1);
+                break;
+
+            default:
+                abort ();
+            }
+    }
+
+    if (infoField.empty() || sitewideField.empty()) {
+        cerr << "Error: both a sample field and an info field are required." << endl;
+        return 1;
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    string statTypeStr;
+
+    switch (statType) {
+    case MEAN:
+        statTypeStr = "mean";
+        break;
+    case MEDIAN:
+        statTypeStr = "median";
+        break;
+    case MIN:
+        statTypeStr = "min";
+        break;
+    case MAX:
+        statTypeStr = "max";
+        break;
+    default:
+        cerr << "Error: failure to convert stat type to string" << endl;
+        return 1;
+        break;
+    }
+
+    variantFile.addHeaderLine("##INFO=<ID="+infoField+",Number=1,Type=Float,Description=\"Summary statistic generated by "+statTypeStr+" of site-wide values of "+sitewideField+" \">");
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        vector<double> vals;
+        map<string, vector<string> >::iterator i = var.info.find(sitewideField);
+        if (i != var.info.end()) {
+            for (vector<string>::iterator s = i->second.begin(); s != i->second.end(); ++s) {
+                double d = 0;   // defined value even if convert() cannot parse the text
+                convert(*s, d);
+                vals.push_back(d);
+            }
+        }
+        if (vals.empty()) {   // nothing to summarise: min/max/median would dereference end(), mean would be 0/0
+            cout << var << endl;   // pass the record through without adding the summary field
+            continue;
+        }
+        double result = 0;
+        switch (statType) {
+        case MEAN:
+            result = mean(vals);
+            break;
+        case MEDIAN:
+            result = median(vals);
+            break;
+        case MIN:
+            result = *min_element(vals.begin(), vals.end());
+            break;
+        case MAX:
+            result = *max_element(vals.begin(), vals.end());
+            break;
+        default:
+            cerr << "Error: unrecognized StatType" << endl;
+            return 1;
+            break;
+        }
+        var.info[infoField].clear();
+        var.info[infoField].push_back(convert(result));
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfintersect.cpp b/src/vcfintersect.cpp
new file mode 100644
index 0000000..27f272e
--- /dev/null
+++ b/src/vcfintersect.cpp
@@ -0,0 +1,577 @@
+#include "Variant.h"
+#include "BedReader.h"
+#include "intervaltree/IntervalTree.h"
+#include <getopt.h>
+#include "fastahack/Fasta.h"
+#include <algorithm>
+#include <list>
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+
+void printSummary(char** argv) {
+    // Prints the vcfintersect usage text to stderr, naming the program as argv[0], and then
+    // terminates the process with exit status 0; control never returns to the caller.
+    cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl
+         << endl
+         << "options:" << endl
+         << " -b, --bed FILE use intervals provided by this BED file" << endl
+         << " -R, --region REGION use 1-based tabix-style region (e.g. chrZ:10-20), multiples allowed" << endl
+         << " -S, --start-only don't use the reference length information in the record to determine" << endl
+         << " overlap status, just use the start position" << endl
+         << " -v, --invert invert the selection, printing only records which would" << endl
+         << " not have been printed out" << endl
+         << " -i, --intersect-vcf FILE use this VCF for set intersection generation" << endl
+         << " -u, --union-vcf FILE use this VCF for set union generation" << endl
+         << " -w, --window-size N compare records up to this many bp away (default 30)" << endl
+         << " -r, --reference FILE FASTA reference file, required with -i and -u" << endl
+         << " -l, --loci output whole loci when one alternate allele matches" << endl
+         << " -m, --ref-match intersect on the basis of record REF string" << endl
+         << " -t, --tag TAG attach TAG to each record's info field if it would intersect" << endl
+         << " -V, --tag-value VAL use this value to indicate that the allele is passing" << endl
+         << " '.' will be used otherwise. default: 'PASS'" << endl
+         << " -M, --merge-from FROM-TAG" << endl
+         << " -T, --merge-to TO-TAG merge from FROM-TAG used in the -i file, setting TO-TAG" << endl
+         << " in the current file." << endl
+         << endl
+         << "For bed-vcf intersection, alleles which fall into the targets are retained." << endl
+         << endl
+         << "For vcf-vcf intersection and union, unify on equivalent alleles within window-size bp" << endl
+         << "as determined by haplotype comparison alleles." << endl;
+    // NOTE: main() also accepts -c/--contained and -o/--overlapping, which are not listed above.
+    // The exit status is 0 even when main() calls this after an option-parsing error.
+    exit(0);
+}
+
+
+int main(int argc, char** argv) {
+    // Filters, tags, intersects or unions the input VCF against BED targets/regions or a second VCF; output goes to stdout.
+    string bedFileName;
+    string vcfFileName;
+    string fastaFileName;
+    bool intersecting = false;
+    bool unioning = false;
+    bool invert = false;
+    bool contained = true;        // set by -c; not read anywhere below
+    bool overlapping = false;     // set by -o; not read anywhere below
+    bool startPositionOnly = false;
+    int windowsize = 30;
+    bool loci = false;
+    bool refmatch = false;
+    string tag;
+    string tagValue = "PASS";
+    string mergeFromTag;
+    string mergeToTag;
+    vector<BedTarget> regions;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                /* These options set a flag. */
+                //{"verbose", no_argument, &verbose_flag, 1},
+                {"help", no_argument, 0, 'h'},
+                {"bed", required_argument, 0, 'b'},
+                {"region", required_argument, 0, 'R'},
+                {"invert", no_argument, 0, 'v'},
+                {"intersect-vcf", required_argument, 0, 'i'},
+                {"union-vcf", required_argument, 0, 'u'},
+                {"contained", no_argument, 0, 'c'},
+                {"overlapping", no_argument, 0, 'o'},
+                {"window-size", required_argument, 0, 'w'},
+                {"reference", required_argument, 0, 'r'},
+                {"loci", no_argument, 0, 'l'},
+                {"ref-match", no_argument, 0, 'm'},
+                {"tag", required_argument, 0, 't'},
+                {"tag-value", required_argument, 0, 'V'},
+                {"merge-from", required_argument, 0, 'M'},
+                {"merge-to", required_argument, 0, 'T'},
+                {"start-only", no_argument, 0, 'S'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hvcSlmob:i:u:w:r:t:V:M:T:R:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+            case 'w':
+                windowsize = atoi(optarg);
+                break;
+
+            case 'b':
+                bedFileName = string(optarg);
+                break;
+
+            case 'i':
+                intersecting = true;
+                vcfFileName = string(optarg);
+                break;
+
+            case 'u':
+                unioning = true;
+                vcfFileName = string(optarg);
+                break;
+
+            case 'r':
+                fastaFileName = string(optarg);
+                break;
+
+            case 'v':
+                invert = true;
+                break;
+
+            case 'c':
+                contained = true;
+                break;
+
+            case 'o':
+                overlapping = true;
+                break;
+
+            case 'l':
+                loci = true;
+                break;
+
+            case 'm':
+                refmatch = true;
+                break;
+
+            case 't':
+                tag = optarg;
+                break;
+
+            case 'R':
+                regions.push_back(BedTarget(optarg));
+                regions.back().left -= 1;
+                break;
+
+            case 'S':
+                startPositionOnly = true;
+                break;
+
+            case 'V':
+                tagValue = optarg;
+                break;
+
+            case 'M':
+                mergeFromTag = optarg;
+                break;
+
+            case 'T':
+                mergeToTag = optarg;
+                break;
+
+            case 'h':
+                printSummary(argv);
+                break;
+
+            case '?':
+                printSummary(argv);
+                exit(1);
+                break;
+
+            default:
+                abort ();
+        }
+    }
+
+    // Primary input: the last positional argument when exactly one remains, otherwise stdin.
+    VariantCallFile variantFile;
+    bool usingstdin = false;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+        usingstdin = true;
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open VCF file" << endl;
+        exit(1);
+    }
+
+    // BED mode is active when a BED file and/or at least one -R region was given.
+    bool usingBED = false;
+    if (!bedFileName.empty()) {
+        usingBED = true;
+    }
+
+    if (usingBED || !regions.empty()) {
+        variantFile.parseSamples = false;
+    }
+
+    // it runs much faster to do this first. then downstream processes don't block!
+
+    BedReader bed;
+    if (usingBED) {
+        bed.open(bedFileName);
+    }
+    if (!regions.empty()) {
+        // add to the bed
+        bed.addTargets(regions);
+        usingBED = true;
+    }
+
+    VariantCallFile otherVariantFile;
+    if (!vcfFileName.empty()) {
+        if (vcfFileName == "-") {
+            if (usingstdin) {
+                cerr << "cannot open both VCF file streams from stdin" << endl;
+                exit(1);
+            } else {
+                otherVariantFile.open(std::cin);
+            }
+        } else {
+            otherVariantFile.open(vcfFileName);
+        }
+        if (!otherVariantFile.is_open()) {
+            cerr << "could not open VCF file " << vcfFileName << endl;
+            exit(1);
+        }
+    }
+
+    // Declare any INFO fields this run adds before the header is written out.
+    if (!tag.empty()) {
+        variantFile.addHeaderLine("##INFO=<ID="+ tag +",Number=A,Type=String,Description=\"" + tagValue + " if this allele intersects with one in " + vcfFileName + ", '.' if not.\">");
+    }
+
+    if (!mergeToTag.empty()) {
+        if (mergeFromTag.empty()) {
+            cerr << "must specify a tag to merge from" << endl;
+            exit(1);
+        }
+        // get mergeFromTag type
+        map<string, VariantFieldType>::iterator f = otherVariantFile.infoTypes.find(mergeFromTag);
+        if (f == otherVariantFile.infoTypes.end()) {
+            cerr << "vcfintersect: ERROR could not find " << mergeFromTag << " in header" << endl;
+            exit(1);
+        }
+        VariantFieldType mergeFromType = f->second;
+        stringstream s;
+        s << mergeFromType;
+
+        variantFile.addHeaderLine("##INFO=<ID="+ mergeToTag +",Number=A,Type=" + s.str() + ",Description=\"The value of " + mergeFromTag + " in " + vcfFileName + " '.' if the tag does not exist for the given allele in the other file, or if there is no corresponding allele.\">");
+    }
+
+    cout << variantFile.header << endl;
+
+
+    FastaReference reference;
+    if (unioning || intersecting) {
+        if (fastaFileName.empty()) {
+            cerr << "a reference is required for haplotype-based intersection and unioning" << endl;
+            exit(1);
+        }
+        reference.open(fastaFileName);
+    }
+
+    if (!unioning && !intersecting) {
+        variantFile.parseSamples = false; // faster, as when we are
+                                          // only bed-intersecting we
+                                          // can do position-only
+                                          // output and don't have to
+                                          // manipulate specific
+                                          // alleles
+    }
+
+    // read the VCF file for union or intersection into an interval tree
+    // indexed using some proximity window
+
+    map<string, IntervalTree<Variant*> > variantIntervals;
+    map<string, list<Variant> > otherVariants;
+    map<string, vector<Interval<Variant*> > > otherVariantIntervals;
+
+    if (unioning || intersecting) {
+
+        Variant ovar(otherVariantFile);
+        while (otherVariantFile.getNextVariant(ovar)) {
+            long int left = ovar.position;
+            long int right = left + ovar.ref.size(); // this should be 1-past the end
+            otherVariants[ovar.sequenceName].push_back(ovar);
+            Variant* v = &otherVariants[ovar.sequenceName].back();
+            otherVariantIntervals[ovar.sequenceName].push_back(Interval<Variant*>(left, right, v));
+        }
+
+        for (map<string, vector<Interval<Variant*> > >::iterator j = otherVariantIntervals.begin(); j != otherVariantIntervals.end(); ++j) {
+            variantIntervals[j->first] = IntervalTree<Variant*>(j->second);
+        }
+
+    }
+
+    set<Variant*> outputVariants;
+
+    long int lastOutputPosition = 0;
+    string lastSequenceName;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+
+        if (lastSequenceName.empty()) {
+            lastSequenceName = var.sequenceName;
+        } else if (lastSequenceName != var.sequenceName) {
+            if (unioning) {
+                vector<Interval<Variant*> > previousRecords;
+                long int lastSeqLength = reference.sequenceLength(lastSequenceName);
+                variantIntervals[lastSequenceName].findContained(lastOutputPosition, lastSeqLength, previousRecords);
+                for (vector<Interval<Variant*> >::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) {
+                    Variant* v = r->value;
+                    if (outputVariants.find(v) == outputVariants.end()) {
+                        outputVariants.insert(v);
+                        cout << *v << endl; // Q: does this output everything in correct order?.... A: No.
+                    }
+                }
+                lastSequenceName = var.sequenceName;
+                lastOutputPosition = 0;
+            }
+        }
+
+        if (usingBED) {
+            vector<BedTarget*> overlaps;
+            if (startPositionOnly) {
+                // only intersect if start position (not end) is in target
+                BedTarget record(var.sequenceName, var.position, var.position, "");
+                overlaps = bed.targetsOverlapping(record);
+            } else {
+                // default behavior
+                BedTarget record(var.sequenceName, var.position, var.position + var.ref.size() - 1, "");
+                overlaps = bed.targetsOverlapping(record);
+            }
+
+            if (!invert && !overlaps.empty()) {
+                cout << variantFile.line << endl;
+            } else if (invert && overlaps.empty()) {
+                cout << variantFile.line << endl;
+            }
+
+        } else if (unioning || intersecting) {
+
+            // TODO check overlaps with union/intersection
+            // hmm... for unioning, you might need to step through the original VCF records
+            // but the idea is to exclude the haplotype-based duplicates
+
+            vector<Interval<Variant*> > results;
+
+            variantIntervals[var.sequenceName].findContained(var.position - windowsize, var.position + var.ref.size() + windowsize, results);
+
+            vector<Variant*> overlapping;
+
+            for (vector<Interval<Variant*> >::iterator r = results.begin(); r != results.end(); ++r) {
+                overlapping.push_back(r->value);
+            }
+
+
+            if (unioning) {
+
+                // unioning strategy
+
+                // write out all the records from the last file
+                // between the last one printed out and the first
+                // one we're about to print out
+
+                vector<Interval<Variant*> > previousRecords;
+
+                variantIntervals[var.sequenceName].findOverlapping(lastOutputPosition, var.position - windowsize, previousRecords);
+
+                map<long int, vector<Variant*> > variants;
+
+                for (vector<Interval<Variant*> >::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) {
+                    Variant* v = r->value;
+                    if (outputVariants.find(v) == outputVariants.end()) {
+                        outputVariants.insert(v);
+                        variants[v->position].push_back(v);
+                    }
+                }
+
+                for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) {
+                    for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) {
+                        cout << **o << endl;
+                        lastOutputPosition = max(lastOutputPosition, (*o)->position);
+                    }
+                }
+
+                // TODO find the duplicates for the other file
+            }
+
+
+            if (overlapping.empty()) {
+
+                if (unioning || (intersecting && invert)) {
+                    cout << var << endl;
+                    lastOutputPosition = max(lastOutputPosition, var.position);
+                } else if (intersecting && (!tag.empty() || !mergeToTag.empty())) {
+                    for (size_t i = 0; i < var.alt.size(); ++i) {
+                        if (!tag.empty()) {
+                            var.info[tag].push_back(".");
+                        }
+                        if (!mergeToTag.empty()) {
+                            var.info[mergeToTag].push_back(".");
+                        }
+                    }
+                    cout << var << endl;
+                    lastOutputPosition = max(lastOutputPosition, var.position);
+                }
+
+            } else {
+
+                // get the min and max of the overlaps
+
+                int haplotypeStart = var.position;
+                int haplotypeEnd = var.position + var.ref.size();
+
+                for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
+                    haplotypeStart = min((*v)->position, (long int) haplotypeStart);
+                    haplotypeEnd = max((*v)->position + (*v)->ref.size(), (long unsigned int) haplotypeEnd);
+                }
+
+                // for everything overlapping and the current variant, construct the local haplotype within the bounds
+                // if there is an exact match, the allele in the current VCF does intersect
+
+                string referenceHaplotype = reference.getSubSequence(var.sequenceName, haplotypeStart - 1, haplotypeEnd - haplotypeStart);
+                map<string, vector<pair<Variant*, int> > > haplotypes; // map to variant and alt index
+
+                for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
+                    Variant& variant = **v;
+                    int altindex = 0;
+                    for (vector<string>::iterator a = variant.alt.begin(); a != variant.alt.end(); ++a, ++altindex) {
+                        string haplotype = referenceHaplotype;
+                        // get the relative start and end coordinates for the variant alternate allele
+                        int relativeStart = variant.position - haplotypeStart;
+                        haplotype.replace(relativeStart, variant.ref.size(), *a);
+                        haplotypes[haplotype].push_back(make_pair(*v, altindex));
+                    }
+                }
+
+                Variant originalVar = var;
+
+                // determine the non-intersecting alts
+                vector<string> altsToRemove;
+                vector<int> altIndexesToRemove;
+                for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+                    string haplotype = referenceHaplotype;
+                    int relativeStart = var.position - haplotypeStart;
+                    haplotype.replace(relativeStart, var.ref.size(), *a);
+                    map<string, vector<pair<Variant*, int> > >::iterator h = haplotypes.find(haplotype);
+                    if ((intersecting && !invert && h == haplotypes.end())
+                        || (intersecting && invert && h != haplotypes.end())
+                        || (unioning && h != haplotypes.end())) {
+                        if (tag.empty() && mergeToTag.empty()) {
+                            altsToRemove.push_back(*a);
+                        } else {
+                            if (!tag.empty()) {
+                                var.info[tag].push_back(".");
+                            }
+                            if (!mergeToTag.empty()) {
+                                var.info[mergeToTag].push_back(".");
+                            }
+                        }
+                    } else {
+                        if (!tag.empty()) {
+                            var.info[tag].push_back(tagValue);
+                        }
+                        // NB: just take the first value for the mergeFromTag; h is end() here in invert/union mode, so it is checked first
+                        if (!mergeToTag.empty() && h == haplotypes.end()) {
+                            var.info[mergeToTag].push_back(".");
+                        } else if (!mergeToTag.empty()) {
+                            Variant* v = h->second.front().first;
+                            int index = h->second.front().second;
+                            if (v->info.find(mergeFromTag) != v->info.end()) {
+                                var.info[mergeToTag].push_back(v->info[mergeFromTag].at(index)); // value of the exact allele
+                            } else if (mergeFromTag == "QUAL") {
+                                var.info[mergeToTag].push_back(convert(v->quality));
+                            } else {
+                                var.info[mergeToTag].push_back(".");
+                            }
+                        }
+                    }
+                }
+
+                // remove the non-overlapping (intersecting) or overlapping (unioning) alts
+                if (intersecting && loci && altsToRemove.size() != var.alt.size()) {
+                    // we have a match in loci mode, so we should output the whole loci, not just the matching sequence
+                } else {
+                    for (vector<string>::iterator a = altsToRemove.begin(); a != altsToRemove.end(); ++a) {
+                        var.removeAlt(*a);
+                    }
+                }
+
+                if (unioning) {
+
+                    // somehow sort the records and combine them?
+                    map<long int, vector<Variant*> > variants;
+                    for (vector<Variant*>::iterator o = overlapping.begin(); o != overlapping.end(); ++o) {
+                        if ((*o)->position <= var.position && // check ensures proper ordering of variants on output
+                            outputVariants.find(*o) == outputVariants.end()) {
+                            outputVariants.insert(*o);
+                            variants[(*o)->position].push_back(*o);
+                        }
+                    }
+                    // add in the current variant, if it has alts left
+                    if (!var.alt.empty()) {
+                        vector<Variant*>& vars = variants[var.position];
+                        int numalts = 0;
+                        for (vector<Variant*>::iterator v = vars.begin(); v != vars.end(); ++v) {
+                            numalts += (*v)->alt.size();
+                        }
+                        if (numalts + var.alt.size() == originalVar.alt.size()) {
+                            variants[var.position].clear();
+                            variants[var.position].push_back(&originalVar);
+                        } else {
+                            variants[var.position].push_back(&var);
+                        }
+                    }
+
+                    for (map<long int, vector<Variant*> >::iterator v = variants.begin(); v != variants.end(); ++v) {
+                        for (vector<Variant*>::iterator o = v->second.begin(); o != v->second.end(); ++o) {
+                            cout << **o << endl;
+                            lastOutputPosition = max(lastOutputPosition, (*o)->position);
+                        }
+                    }
+                } else {
+                    // if any alts remain, output the variant record
+                    if (!var.alt.empty()) {
+                        cout << var << endl;
+                        lastOutputPosition = max(lastOutputPosition, var.position);
+                    }
+                }
+
+            }
+
+        }
+
+    }
+
+
+    // if unioning, and any variants remain, output them
+    if (unioning) {
+        for (map<string, list<Variant> >::iterator chrom = otherVariants.find(lastSequenceName);
+             chrom != otherVariants.end();
+             ++chrom) {
+            for (list<Variant>::iterator v = chrom->second.begin(); v != chrom->second.end(); ++v) {
+                Variant* variant = &*v;
+                if (outputVariants.find(variant) == outputVariants.end()) {
+                    outputVariants.insert(variant);
+                    cout << *variant << endl;
+                    // TODO guarantee sorting
+                }
+            }
+        }
+    }
+
+    exit(0); // why?
+    return 0;
+
+}
+
diff --git a/src/vcfkeepgeno.cpp b/src/vcfkeepgeno.cpp
new file mode 100644
index 0000000..5616b22
--- /dev/null
+++ b/src/vcfkeepgeno.cpp
@@ -0,0 +1,62 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+
+int main(int argc, char** argv) {
+    // Rewrites a VCF so that samples carry only the FORMAT fields named on the command line ("-" reads stdin).
+    if (argc < 3) {
+        cerr << "usage: " << argv[0] << " <vcf file> [FIELD1] [FIELD2] ..." << endl
+             << "outputs each record in the vcf file, removing FORMAT fields not listed "
+             << "on the command line from sample specifications in the output"
+             << endl;
+        return 1;
+    }
+
+    string filename = argv[1];
+
+    vector<string> newFormat;
+    set<string> fieldsToKeep;
+    for (int i = 2; i < argc; ++i) {
+        fieldsToKeep.insert(argv[i]);
+        newFormat.push_back(argv[i]);
+    }
+
+    VariantCallFile variantFile;
+    if (filename == "-") {
+        variantFile.open(std::cin);
+    } else {
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    Variant var(variantFile);
+
+    vector<string> formatIds = variantFile.formatIds();
+    for (vector<string>::iterator i = formatIds.begin(); i != formatIds.end(); ++i) {
+        if (!fieldsToKeep.count(*i)) {
+            variantFile.removeGenoHeaderLine(*i);
+        }
+    }
+
+    // write the header
+    cout << variantFile.header << endl;
+
+    // print the records; each record's format list is replaced by the requested fields, in command-line order
+    while (variantFile.getNextVariant(var)) {
+        var.format = newFormat;
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfkeepinfo.cpp b/src/vcfkeepinfo.cpp
new file mode 100644
index 0000000..916ca89
--- /dev/null
+++ b/src/vcfkeepinfo.cpp
@@ -0,0 +1,68 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) {
+    // Rewrites a VCF keeping only the INFO fields named on the command line ("-" reads stdin).
+    if (argc < 3) {
+        cerr << "usage: " << argv[0] << " <vcf file> [FIELD1] [FIELD2] ..." << endl
+             << "outputs each record in the vcf file, removing INFO fields not listed on the command line" << endl;
+        return 1;
+    }
+
+    string filename = argv[1];
+
+    set<string> fieldsToKeep;
+    for (int i = 2; i < argc; ++i) {
+        fieldsToKeep.insert(argv[i]);
+    }
+
+    VariantCallFile variantFile;
+    if (filename == "-") {
+        variantFile.open(std::cin);
+    } else {
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    Variant var(variantFile);
+
+    // drop the header declaration of every INFO field that was not requested
+    vector<string> infoIds = variantFile.infoIds();
+    for (vector<string>::iterator i = infoIds.begin(); i != infoIds.end(); ++i) {
+        if (!fieldsToKeep.count(*i)) {
+            variantFile.removeInfoHeaderLine(*i);
+        }
+    }
+
+    // write the header
+    cout << variantFile.header << endl;
+
+    // print the records after stripping unrequested keys from both INFO maps;
+    // the iterator is advanced before its node is erased, so it is never used while invalid
+    while (variantFile.getNextVariant(var)) {
+        for (map<string, vector<string> >::iterator i = var.info.begin(); i != var.info.end(); ) {
+            if (!fieldsToKeep.count(i->first)) {
+                var.info.erase(i++);
+            } else { ++i; }
+        }
+        for (map<string, bool>::iterator i = var.infoFlags.begin(); i != var.infoFlags.end(); ) {
+            if (!fieldsToKeep.count(i->first)) {
+                var.infoFlags.erase(i++);
+            } else { ++i; }
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfkeepsamples.cpp b/src/vcfkeepsamples.cpp
new file mode 100644
index 0000000..935c8a1
--- /dev/null
+++ b/src/vcfkeepsamples.cpp
@@ -0,0 +1,54 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) {
+    // Copies a VCF to stdout, keeping only the samples named on the command line.
+
+    if (argc < 3) {
+        cerr << "usage: " << argv[0] << " <vcf file> [SAMPLE1] [SAMPLE2] ..." << endl
+             << "outputs each record in the vcf file, removing samples not listed on the command line" << endl;
+        return 1;
+    }
+
+    string inputPath = argv[1];
+
+    // Every argument after the file name is a sample to retain, in the order given.
+    vector<string> keptSamples(argv + 2, argv + argc);
+
+    // "-" selects standard input instead of a named file.
+    VariantCallFile vcfInput;
+    if (inputPath != "-") {
+        vcfInput.open(inputPath);
+    } else {
+        vcfInput.open(std::cin);
+    }
+
+    if (!vcfInput.is_open()) {
+        return 1;
+    }
+
+    Variant record(vcfInput);
+
+    // First bring the header's sample list in line with the retained samples ...
+    vcfInput.updateSamples(keptSamples);
+
+    // ... then limit the sample names the record writes out to the same list.
+    record.setOutputSampleNames(keptSamples);
+
+    // Emit the adjusted header.
+    cout << vcfInput.header << endl;
+
+    // Records are streamed unchanged; sample filtering relies on the output
+    // sample names configured on `record` above.
+    while (vcfInput.getNextVariant(record)) {
+        cout << record << endl;
+    }
+
+    return 0;
+}
+
diff --git a/src/vcfleftalign.cpp b/src/vcfleftalign.cpp
new file mode 100644
index 0000000..f4b992e
--- /dev/null
+++ b/src/vcfleftalign.cpp
@@ -0,0 +1,781 @@
+#include "Variant.h"
+#include "convert.h"
+#include "join.h"
+#include "split.h"
+#include "fastahack/Fasta.h"
+#include <set>
+#include <vector>
+#include <getopt.h>
+#include <cmath>
+
+using namespace std;
+using namespace vcf;
+
+
+// Attempts to left-realign all the indels represented by the alignment cigar.
+//
+// This is done by shifting all indels as far left as they can go without
+// mismatch, then merging neighboring indels of the same class. leftAlign
+// updates the alignment cigar with changes, and returns true if realignment
+// changed the alignment cigar.
+//
+// To left-align, we move multi-base indels left by their own length as long as
+// the preceding bases match the inserted or deleted sequence. After this
+// step, we handle multi-base homopolymer indels by shifting them one base to
+// the left until they mismatch the reference.
+//
+// To merge neighboring indels, we iterate through the set of left-stabilized
+// indels. For each indel we add a new cigar element to the new cigar. If a
+// deletion follows a deletion, or an insertion occurs at the same place as
+// another insertion, we merge the events by extending the previous cigar
+// element.
+//
+// In practice, we must call this function until the alignment is stabilized.
+
+#define VCFLEFTALIGN_DEBUG(msg) \
+ do { if (false) { cerr << msg; } } while (0) /* compiled-out debug trace; do/while(0) keeps it a single statement, safe before an else */
+
+class VCFIndelAllele { // one insertion or deletion, as collected from a cigar by leftAlign
+public:
+ bool insertion; // true for an insertion, false for a deletion
+ int length; // number of bases inserted or deleted
+ int position; // offset of the indel in the reference sequence
+ int readPosition; // offset of the indel in the alternate (read) sequence
+ string sequence; // the inserted or deleted bases
+
+ // stores the five values unchanged
+ VCFIndelAllele(bool isInsertion, int indelLength, int refOffset, int readOffset, string bases)
+ : insertion(isInsertion), length(indelLength), position(refOffset), readPosition(readOffset), sequence(bases)
+ { }
+ bool homopolymer(void); // true when all bases of sequence are identical
+ friend ostream& operator<<(ostream&, const VCFIndelAllele&);
+ friend bool operator==(const VCFIndelAllele&, const VCFIndelAllele&);
+ friend bool operator!=(const VCFIndelAllele&, const VCFIndelAllele&);
+ friend bool operator<(const VCFIndelAllele&, const VCFIndelAllele&);
+};
+
+bool FBhomopolymer(string sequence); // same uniform-character test as VCFIndelAllele::homopolymer, for a plain string
+ostream& operator<<(ostream& out, const VCFIndelAllele& indel); // writes <i|d>:position:readPosition:sequence
+bool operator==(const VCFIndelAllele& a, const VCFIndelAllele& b); // compares insertion, length, position and sequence
+bool operator!=(const VCFIndelAllele& a, const VCFIndelAllele& b); // negation of operator==
+bool operator<(const VCFIndelAllele& a, const VCFIndelAllele& b); // orders by the operator<< text of each indel
+
+// Returns true when every base in this indel's sequence is the same character.
+bool VCFIndelAllele::homopolymer(void) {
+ // An empty sequence has no differing bases; returning early also keeps
+ // the sequence[0] access below in bounds.
+ if (sequence.empty()) return true;
+ // find_first_not_of yields npos when no character differs from the first one
+ return sequence.find_first_not_of(sequence[0]) == string::npos;
+}
+
+// Returns true when every character of sequence is the same.
+bool FBhomopolymer(string sequence) {
+ // An empty string has no differing characters; returning early also keeps
+ // the sequence[0] access below in bounds.
+ if (sequence.empty()) return true;
+ // find_first_not_of yields npos when no character differs from the first one
+ return sequence.find_first_not_of(sequence[0]) == string::npos;
+}
+
+ostream& operator<<(ostream& out, const VCFIndelAllele& indel) {
+ // format is <i|d>:<position>:<readPosition>:<sequence>
+ const string kind(indel.insertion ? "i" : "d");
+ return out << kind << ":" << indel.position << ":" << indel.readPosition << ":" << indel.sequence;
+}
+
+bool operator==(const VCFIndelAllele& a, const VCFIndelAllele& b) {
+ // readPosition takes no part in equality
+ if (a.insertion != b.insertion) return false;
+ if (a.length != b.length || a.position != b.position) return false;
+ return a.sequence == b.sequence;
+}
+
+bool operator!=(const VCFIndelAllele& a, const VCFIndelAllele& b) {
+ return (a == b) ? false : true; // exact negation of operator==
+}
+
+bool operator<(const VCFIndelAllele& a, const VCFIndelAllele& b) { // orders by the operator<< text of each indel
+ ostringstream textA, textB;
+ textA << a;
+ textB << b;
+ return textA.str().compare(textB.str()) < 0;
+}
+
+
+class AltAlignment { // alignment of one alternate allele against the padded reference window
+public:
+ unsigned int pos; // position reported by the aligner
+ string seq; // the aligned alternate sequence
+ vector<pair<int, string> > cigar; // cigar as (length, operation) pairs
+
+ // copies the position and sequence and parses the cigar text with splitCigar
+ AltAlignment(unsigned int& alignedPos,
+ string& alignedSeq,
+ string& cigarText)
+ : pos(alignedPos), seq(alignedSeq), cigar(splitCigar(cigarText))
+ { }
+};
+
+double entropy(const string& st) {
+ // Shannon entropy of the characters of st, in bits: -sum(p * log2(p)) over the distinct characters.
+ // The set iterates in sorted order, which fixes the order of the floating-point summation.
+ set<char> symbols(st.begin(), st.end());
+ const double total = (double) st.size();
+ const double ln2 = log(2);
+
+ double sum = 0;
+ for (set<char>::const_iterator sym = symbols.begin(); sym != symbols.end(); ++sym) {
+ // count how often this symbol occurs in st
+ int occurrences = 0;
+ for (string::const_iterator ch = st.begin(); ch != st.end(); ++ch) {
+ if (*ch == *sym) {
+ ++occurrences;
+ }
+ }
+ const double freq = (double) occurrences / total;
+ sum += freq * log(freq)/ln2; // log2(p) written as ln(p)/ln(2)
+ }
+ return -sum; // an empty st has no symbols, so the result is -0.0
+}
+
+void getAlignment(Variant& var, FastaReference& reference, string& ref, vector<AltAlignment>& alignments, int window) {
+ // Writes the padded reference window into ref and appends one AltAlignment per ALT allele.
+ const int flank = window/2;
+ // default alignment params
+ float matchScore = 10.0f, mismatchScore = -9.0f;
+ float gapOpenPenalty = 25.0f, gapExtendPenalty = 3.33f;
+
+ // 'Z' padding plus window/2 reference bases on each side of the REF allele
+ const string pad(flank, 'Z');
+ const string leftFlank = reference.getSubSequence(var.sequenceName, var.zeroBasedPosition() - flank, flank);
+ const string rightFlank = reference.getSubSequence(var.sequenceName, var.zeroBasedPosition() + var.ref.size(), flank);
+ const string prefix = pad + leftFlank;
+ const string suffix = rightFlank + pad;
+ ref = prefix + var.ref + suffix;
+ // align every alternate, embedded in the same flanks, against that window
+ for (vector<string>::size_type n = 0; n < var.alt.size(); ++n) {
+ string alt = prefix + var.alt[n] + suffix;
+ unsigned int referencePos;
+ string cigar;
+ CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
+ sw.Align(referencePos, cigar, ref, alt);
+ alignments.push_back(AltAlignment(referencePos, alt, cigar));
+ }
+}
+
+
+bool stablyLeftAlign(string& alternateSequence, string referenceSequence, Cigar& cigar, int maxiterations = 50, bool debug = false); // same parameters as the definition further down, so the defaults apply to it
+int countMismatches(string& alternateSequence, string referenceSequence);
+
+bool leftAlign(string& alternateSequence, Cigar& cigar, string& referenceSequence, bool debug = false) {
+ // Runs one left-alignment pass over the indels described by cigar; rewrites cigar in place and returns true when its text changed.
+ int arsOffset = 0; // pointer to insertion point in aligned reference sequence
+ string alignedReferenceSequence = referenceSequence;
+ int aabOffset = 0; // pointer to insertion point in the gapped copy of the alternate sequence
+ string alignmentAlignedBases = alternateSequence;
+
+ // store information about the indels
+ vector<VCFIndelAllele> indels;
+
+ int rp = 0; // read position, 0-based relative to read
+ int sp = 0; // sequence position
+
+ string softBegin;
+ string softEnd;
+
+ stringstream cigar_before, cigar_after; // cigar text before and after this pass, compared at the end
+ for (vector<pair<int, string> >::const_iterator c = cigar.begin();
+ c != cigar.end(); ++c) {
+ unsigned int l = c->first;
+ char t = c->second.at(0);
+
+ cigar_before << l << t;
+ if (t == 'M') { // match or mismatch
+ sp += l;
+ rp += l;
+ } else if (t == 'D') { // deletion
+ indels.push_back(VCFIndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l)));
+ alignmentAlignedBases.insert(rp + aabOffset, string(l, '-'));
+ aabOffset += l;
+ sp += l; // update reference sequence position
+ } else if (t == 'I') { // insertion
+ indels.push_back(VCFIndelAllele(true, l, sp, rp, alternateSequence.substr(rp, l)));
+ alignedReferenceSequence.insert(sp + softBegin.size() + arsOffset, string(l, '-'));
+ arsOffset += l;
+ rp += l;
+ } else if (t == 'S') { // soft clip, clipped sequence present in the read not matching the reference
+ // remove these bases from the refseq and read seq, but don't modify the alignment sequence
+ if (rp == 0) {
+ alignedReferenceSequence = string(l, '*') + alignedReferenceSequence;
+ softBegin = alignmentAlignedBases.substr(0, l);
+ } else {
+ alignedReferenceSequence = alignedReferenceSequence + string(l, '*');
+ softEnd = alignmentAlignedBases.substr(alignmentAlignedBases.size() - l, l);
+ }
+ rp += l;
+ } else if (t == 'H') { // hard clip on the read, clipped sequence is not present in the read
+ } else if (t == 'N') { // skipped region in the reference not present in read, aka splice
+ sp += l;
+ }
+ }
+
+ // sp now holds the number of reference bases consumed by the M, D and N operations
+ int alignedLength = sp;
+
+ VCFLEFTALIGN_DEBUG("| " << cigar_before.str() << endl
+ << "| " << alignedReferenceSequence << endl
+ << "| " << alignmentAlignedBases << endl);
+
+ // if no indels, return the alignment
+ if (indels.empty()) { return false; }
+
+ // for each indel, from left to right
+ // while the indel sequence repeated to the left and we're not matched up with the left-previous indel
+ // move the indel left
+
+ vector<VCFIndelAllele>::iterator previous = indels.begin();
+ for (vector<VCFIndelAllele>::iterator id = indels.begin(); id != indels.end(); ++id) {
+
+ // left shift by repeats
+ //
+ // from 1 base to the length of the indel, attempt to shift left
+ // if the move would cause no change in alignment optimality (no
+ // introduction of mismatches, and by definition no change in gap
+ // length), move to the new position.
+ // in practice this moves the indel left when we reach the size of
+ // the repeat unit.
+ //
+ int steppos, readsteppos; // used by the single-base shift further down; the loop below declares its own pair
+ VCFIndelAllele& indel = *id;
+ int i = 1;
+ while (i <= indel.length) {
+
+ int steppos = indel.position - i;
+ int readsteppos = indel.readPosition - i;
+
+#ifdef VERBOSE_DEBUG
+ if (debug) {
+ if (steppos >= 0 && readsteppos >= 0) {
+ cerr << referenceSequence.substr(steppos, indel.length) << endl;
+ cerr << alternateSequence.substr(readsteppos, indel.length) << endl;
+ cerr << indel.sequence << endl;
+ }
+ }
+#endif
+ while (steppos >= 0 && readsteppos >= 0
+ && indel.sequence == referenceSequence.substr(steppos, indel.length)
+ && indel.sequence == alternateSequence.substr(readsteppos, indel.length)
+ && (id == indels.begin()
+ || (previous->insertion && steppos >= previous->position)
+ || (!previous->insertion && steppos >= previous->position + previous->length))) {
+ VCFLEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " shifting " << i << "bp left" << endl);
+ indel.position -= i;
+ indel.readPosition -= i;
+ steppos = indel.position - i;
+ readsteppos = indel.readPosition - i;
+ }
+ do { // advance i to the next step size that divides the indel length
+ ++i;
+ } while (i <= indel.length && indel.length % i != 0);
+ }
+
+ // left shift indels with exchangeable flanking sequence
+ //
+ // for example:
+ //
+ // GTTACGTT GTTACGTT
+ // GT-----T ----> G-----TT
+ //
+ // GTGTGACGTGT GTGTGACGTGT
+ // GTGTG-----T ----> GTG-----TGT
+ //
+ // GTGTG-----T GTG-----TGT
+ // GTGTGACGTGT ----> GTGTGACGTGT
+ //
+ //
+ steppos = indel.position - 1;
+ readsteppos = indel.readPosition - 1;
+ while (steppos >= 0 && readsteppos >= 0
+ && alternateSequence.at(readsteppos) == referenceSequence.at(steppos)
+ && alternateSequence.at(readsteppos) == indel.sequence.at(indel.sequence.size() - 1)
+ && (id == indels.begin()
+ || (previous->insertion && indel.position - 1 >= previous->position)
+ || (!previous->insertion && indel.position - 1 >= previous->position + previous->length))) {
+ VCFLEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " exchanging bases " << 1 << "bp left" << endl);
+ indel.sequence = indel.sequence.at(indel.sequence.size() - 1) + indel.sequence.substr(0, indel.sequence.size() - 1);
+ indel.position -= 1;
+ indel.readPosition -= 1;
+ steppos = indel.position - 1;
+ readsteppos = indel.readPosition - 1;
+ }
+ // tracks previous indel, so we don't run into it with the next shift
+ previous = id;
+ }
+
+ // bring together floating indels
+ // from left to right
+ // check if we could merge with the next indel
+ // if so, adjust so that we will merge in the next step
+ if (indels.size() > 1) {
+ previous = indels.begin();
+ for (vector<VCFIndelAllele>::iterator id = (indels.begin() + 1); id != indels.end(); ++id) {
+ VCFIndelAllele& indel = *id;
+ // parsimony: could we shift right and merge with the previous indel?
+ // if so, do it
+ int prev_end_ref = previous->insertion ? previous->position : previous->position + previous->length;
+ int prev_end_read = !previous->insertion ? previous->readPosition : previous->readPosition + previous->length;
+ if (previous->insertion == indel.insertion
+ && ((previous->insertion
+ && (previous->position < indel.position
+ && previous->readPosition + previous->length < indel.readPosition)) // an insertion ends at readPosition + length in the read
+ ||
+ (!previous->insertion
+ && (previous->position + previous->length < indel.position)
+ && (previous->readPosition < indel.readPosition)
+ ))) {
+ if (previous->homopolymer()) {
+ string seq = referenceSequence.substr(prev_end_ref, indel.position - prev_end_ref);
+ string readseq = alternateSequence.substr(prev_end_read, indel.position - prev_end_ref);
+ VCFLEFTALIGN_DEBUG("seq: " << seq << endl << "readseq: " << readseq << endl);
+ if (previous->sequence.at(0) == seq.at(0)
+ && FBhomopolymer(seq)
+ && FBhomopolymer(readseq)) {
+ VCFLEFTALIGN_DEBUG("moving " << *previous << " right to "
+ << (indel.insertion ? indel.position : indel.position - previous->length) << endl);
+ previous->position = indel.insertion ? indel.position : indel.position - previous->length;
+ }
+ }
+ else {
+ int pos = previous->position;
+ while (pos < (int) referenceSequence.length() &&
+ ((previous->insertion && pos + previous->length <= indel.position)
+ ||
+ (!previous->insertion && pos + previous->length < indel.position))
+ && previous->sequence
+ == referenceSequence.substr(pos + previous->length, previous->length)) {
+ pos += previous->length;
+ }
+ if (pos < previous->position && // NOTE(review): pos only grows from previous->position, so this looks unreachable - confirm whether '>' was intended
+ ((previous->insertion && pos + previous->length == indel.position)
+ ||
+ (!previous->insertion && pos == indel.position - previous->length))
+ ) {
+ VCFLEFTALIGN_DEBUG("right-merging tandem repeat: moving " << *previous << " right to " << pos << endl);
+ previous->position = pos;
+ }
+ }
+ }
+ previous = id;
+ }
+ }
+
+ // for each indel
+ // if ( we're matched up to the previous insertion (or deletion)
+ // and it's also an insertion or deletion )
+ // merge the indels
+ //
+ // and simultaneously reconstruct the cigar
+
+ Cigar newCigar;
+
+ if (!softBegin.empty()) {
+ newCigar.push_back(make_pair(softBegin.size(), "S"));
+ }
+
+ vector<VCFIndelAllele>::iterator id = indels.begin();
+ VCFIndelAllele last = *id++;
+ if (last.position > 0) {
+ newCigar.push_back(make_pair(last.position, "M"));
+ newCigar.push_back(make_pair(last.length, (last.insertion ? "I" : "D")));
+ } else {
+ newCigar.push_back(make_pair(last.length, (last.insertion ? "I" : "D")));
+ }
+ int lastend = last.insertion ? last.position : (last.position + last.length);
+ VCFLEFTALIGN_DEBUG(last << ",");
+
+ for (; id != indels.end(); ++id) {
+ VCFIndelAllele& indel = *id;
+ VCFLEFTALIGN_DEBUG(indel << ",");
+ if (indel.position < lastend) {
+ cerr << "impossibility?: indel realigned left of another indel" << endl
+ << referenceSequence << endl << alternateSequence << endl;
+ exit(1);
+ } else if (indel.position == lastend && indel.insertion == last.insertion) {
+ pair<int, string>& op = newCigar.back();
+ op.first += indel.length;
+ } else if (indel.position >= lastend) { // also catches differential indels, but with the same position
+ newCigar.push_back(make_pair(indel.position - lastend, "M"));
+ newCigar.push_back(make_pair(indel.length, (indel.insertion ? "I" : "D")));
+ }
+ last = *id;
+ lastend = last.insertion ? last.position : (last.position + last.length);
+ }
+
+ if (lastend < alignedLength) {
+ newCigar.push_back(make_pair(alignedLength - lastend, "M"));
+ }
+
+ if (!softEnd.empty()) {
+ newCigar.push_back(make_pair(softEnd.size(), "S"));
+ }
+
+ VCFLEFTALIGN_DEBUG(endl);
+
+ cigar = newCigar;
+
+ for (vector<pair<int, string> >::const_iterator c = cigar.begin();
+ c != cigar.end(); ++c) {
+ unsigned int l = c->first;
+ char t = c->second.at(0);
+ cigar_after << l << t;
+ }
+
+ //cerr << cigar_before.str() << " changes to " << cigar_after.str() << endl;
+ VCFLEFTALIGN_DEBUG(cigar_after.str() << endl);
+
+ // check if we're realigned
+ if (cigar_after.str() == cigar_before.str()) {
+ return false;
+ } else {
+ return true;
+ }
+
+}
+
+// Iteratively left-aligns the indels in the alignment until we have a stable
+// realignment. Returns true on realignment success or non-realignment.
+// Returns false if we exceed the maximum number of realignment iterations.
+//
+bool stablyLeftAlign(string& alternateSequence, string referenceSequence, Cigar& cigar, int maxiterations, bool debug) {
+ // Repeats leftAlign until a pass leaves the cigar unchanged.
+ // Returns true once stable, false if maxiterations runs out first.
+
+ // a first pass that changes nothing means the alignment was already stable
+ const bool changedOnFirstPass = leftAlign(alternateSequence, cigar, referenceSequence, debug);
+ if (!changedOnFirstPass) {
+ return true;
+ }
+
+ // each further pass that still changes the cigar uses up one iteration of the budget
+ for (;;) {
+ if (!leftAlign(alternateSequence, cigar, referenceSequence, debug)) {
+ break;
+ }
+ if (--maxiterations <= 0) {
+ break;
+ }
+ }
+ return maxiterations > 0;
+}
+
+
+void printSummary(char** argv) { // writes the usage text to stderr, then ends the process with status 0
+ const char* program = argv[0];
+ cerr << "usage: " << program << " [options] [file]" << endl << endl;
+ cerr << "options:" << endl;
+ cerr << " -r, --reference FILE Use this reference as a basis for realignment." << endl;
+ cerr << " -w, --window N Use a window of this many bp when left aligning (150)." << endl;
+ cerr << endl;
+ cerr << "Left-aligns variants in the specified input file or stdin. Window size is determined" << endl;
+ cerr << "dynamically according to the entropy of the regions flanking the indel. These must have" << endl;
+ cerr << "entropy > 1 bit/bp, or be shorter than ~5kb." << endl;
+ exit(0);
+}
+
+int main(int argc, char** argv) {
+ // Left-aligns the indel records of a VCF against a FASTA reference and writes the resulting VCF to stdout.
+ int window = 150;
+ VariantCallFile variantFile;
+ string fastaFileName;
+
+ int c;
+ while (true) {
+ static struct option long_options[] =
+ {
+ /* These options set a flag. */
+ //{"verbose", no_argument, &verbose_flag, 1},
+ {"help", no_argument, 0, 'h'},
+ {"reference", required_argument, 0, 'r'},
+ {"window", required_argument, 0, 'w'},
+ {0, 0, 0, 0}
+ };
+ /* getopt_long stores the option index here. */
+ int option_index = 0;
+
+ c = getopt_long (argc, argv, "hw:r:",
+ long_options, &option_index);
+
+ if (c == -1)
+ break;
+
+ switch (c) {
+
+ case 'r':
+ fastaFileName = optarg;
+ break;
+
+ case 'w':
+ window = atoi(optarg);
+ if (window < 0) { cerr << "window must not be negative" << endl; exit(1); } // a negative window would be used as a subsequence length below
+ break;
+ case '?':
+ printSummary(argv);
+ exit(1); // NOTE(review): printSummary itself calls exit(0), so this status is never reached
+ break;
+
+ case 'h':
+ printSummary(argv);
+ break;
+
+ default:
+ abort ();
+ }
+ }
+
+ if (optind < argc) {
+ string filename = argv[optind];
+ variantFile.open(filename);
+ } else {
+ variantFile.open(std::cin);
+ }
+
+ if (!variantFile.is_open()) {
+ cerr << "could not open VCF file" << endl;
+ exit(1);
+ }
+
+ FastaReference fastaReference;
+ if (fastaFileName.empty()) {
+ cerr << "a reference is required" << endl;
+ exit(1);
+ } else {
+ fastaReference.open(fastaFileName);
+ }
+
+ /*
+ variantFile.addHeaderLine("##INFO=<ID=TYPE,Number=A,Type=String,Description=\"The type of allele, either snp, mnp, ins, del, or complex.\">");
+ variantFile.addHeaderLine("##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"allele length\">");
+ if (!parseFlag.empty()) {
+ variantFile.addHeaderLine("##INFO=<ID="+parseFlag+",Number=0,Type=Flag,Description=\"The allele was parsed using vcfallelicprimitives.\">");
+ }
+ */
+ cout << variantFile.header << endl;
+
+ Variant var(variantFile);
+ while (variantFile.getNextVariant(var)) {
+
+ // if there is no indel, there is nothing to realign
+ bool hasIndel = false;
+ for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+ if (a->size() != var.ref.size()) {
+ hasIndel = true;
+ break;
+ }
+ }
+ if (!hasIndel) {
+ cout << var << endl;
+ continue;
+ }
+
+ vector<AltAlignment> alignments;
+ string ref;
+
+ // determine window size to prevent mismapping with SW algorithm
+ int currentWindow = window;
+ int scale = 2;
+ if (var.ref.size()*scale > currentWindow) currentWindow = var.ref.size()*scale;
+ for (vector<string>::iterator a = var.alleles.begin(); a != var.alleles.end(); ++a) {
+ if (a->size()*scale > currentWindow) {
+ currentWindow = a->size()*scale;
+ }
+ }
+
+ // while the entropy of either flank is < some target entropy (~1 is fine), increase the flank sizes
+ while (currentWindow < 2000) { // limit to one step > than this
+ string refTarget = fastaReference.getSubSequence(var.sequenceName, var.position - 1 - currentWindow/2, currentWindow);
+ if (entropy(refTarget.substr(0, refTarget.size()/2)) < 1 ||
+ entropy(refTarget.substr(refTarget.size()/2)) < 1) {
+ currentWindow *= scale;
+ } else {
+ break;
+ }
+ }
+
+ // do the alignments
+ getAlignment(var, fastaReference, ref, alignments, currentWindow);
+
+ // stably left align the alignments
+ for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end(); ++a) {
+ Cigar cigarBefore = a->cigar;
+ //cerr << a->seq << endl;
+ //cerr << "before : " << a->pos << " " << joinCigar(a->cigar) << endl;
+ long int prev = a->pos;
+ stablyLeftAlign(a->seq, ref, a->cigar, 20, false); // only seq and cigar are handed over; pos is not passed
+ //cerr << "after : " << a->pos << " " << joinCigar(a->cigar) << endl;
+ if (a->pos != prev) cerr << "modified alignment @ " << var << endl;
+ }
+ //cout << var << endl;
+
+ // transform the mappings
+ // chop off leading matching bases
+ // find the range of bp in the alleles
+ // make the new ref allele
+ // make the new alt alleles
+ // emit the var
+
+ long int newPosition = var.position+currentWindow/2;
+ long int newEndPosition = var.position-currentWindow/2;
+ // check for no-indel case
+ int newLength = var.ref.size();
+ bool giveUp = false;
+ for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end() && !giveUp; ++a) {
+ // get the first mismatching position
+ Cigar::iterator c = a->cigar.begin();
+
+ int rp = 0;
+ int sp = 0;
+ bool hitMismatch = false;
+
+ int matchingBpAtStart = 0;
+ int matchingBpAtEnd = 0;
+ // will be set to true if the first reference position match is broken by a SNP, not an indel
+ bool leadingSNP = false;
+
+ while (c != a->cigar.end()) {
+ char op = c->second[0];
+ if (c == a->cigar.begin()) {
+ if (op != 'M') {
+ cerr << "alignment does not start on matched sequence" << endl;
+ cerr << var << endl;
+ exit(1);
+ }
+ int i = 0;
+ for ( ; i < c->first; ++i) {
+ if (ref[i] != a->seq[i]) {
+ leadingSNP = true;
+ break;
+ }
+ }
+ matchingBpAtStart = i;
+ }
+ if (!leadingSNP && c == (a->cigar.begin()+1)) {
+ // if the first thing we run into is an indel, step back, per VCF spec
+ if (op == 'D' || op == 'I') {
+ --matchingBpAtStart;
+ }
+ }
+ if (c == (a->cigar.end()-1)) {
+ if (op != 'M') {
+ // soft clip at end
+ // it'll be hard to interpret this
+ // the alignments sometimes generate this
+ // best thing to do is to move on
+ //cerr << "alignment does not end on matched sequence" << endl;
+ //cout << var << endl;
+ //exit(1);
+ giveUp = true;
+ break;
+ }
+ int i = 0;
+ for ( ; i < c->first; ++i) {
+ if (ref[ref.size()-1-i] != a->seq[a->seq.size()-1-i]) {
+ break;
+ }
+ }
+ matchingBpAtEnd = i;
+ }
+ ++c;
+ }
+
+ int altMismatchLength = a->seq.size() - matchingBpAtEnd - matchingBpAtStart;
+ int refMismatchLength = (var.ref.size() + currentWindow) - matchingBpAtEnd - matchingBpAtStart;
+ //cerr << "alt mismatch length " << altMismatchLength << endl
+ // << "ref mismatch length " << refMismatchLength << endl;
+ long int newStart = var.position - currentWindow/2 + matchingBpAtStart;
+ long int newEnd = newStart + refMismatchLength;
+ //cerr << "ref should run from " << newStart << " to " << newStart + refMismatchLength << endl;
+ newPosition = min(newStart, newPosition);
+ newEndPosition = max(newEnd, newEndPosition);
+ //cerr << newPosition << " " << newEndPosition << endl;
+ //if (newRefSize < refMismatchLength) newRefSize = refMismatchLength;
+ }
+
+ // the alignment failed for some reason, continue
+ if (giveUp) {
+ cout << var << endl;
+ continue;
+ }
+
+ //cerr << "new ref start " << newPosition << " and end " << newEndPosition << " was " << var.position << "," << var.position + var.ref.size() << endl;
+ int newRefSize = newEndPosition - newPosition;
+ string newRef = fastaReference.getSubSequence(var.sequenceName, newPosition-1, newRefSize);
+ // get the number of bp to strip from the alts
+ int stripFromStart = currentWindow/2 - (var.position - newPosition);
+ int stripFromEnd = (currentWindow + newRefSize) - (stripFromStart + newRefSize) + (var.ref.size() - newRefSize);
+
+ //cerr << "strip from start " << stripFromStart << endl;
+ //cerr << "strip from end " << stripFromEnd << endl;
+
+ vector<string> newAlt;
+ vector<string>::iterator l = var.alt.begin();
+ bool failedAlt = false;
+ for (vector<AltAlignment>::iterator a = alignments.begin(); a != alignments.end();
+ ++a, ++l) {
+ int diff = newRef.size() - l->size();
+ string alt = a->seq.substr(stripFromStart, a->seq.size() - (stripFromEnd + stripFromStart));
+ newAlt.push_back(alt);
+ if (alt.empty()) failedAlt = true;
+ }
+
+ // check the before/after haplotypes
+ bool brokenRealignment = false;
+ if (!newRef.empty() && !failedAlt) {
+ int slop = 50; // 50 extra bp!
+ int haplotypeStart = min(var.position, newPosition) - slop;
+ int haplotypeEnd = max(var.position + var.ref.size(), newPosition + newRef.size()) + slop;
+ string referenceHaplotype = fastaReference.getSubSequence(var.sequenceName, haplotypeStart - 1,
+ haplotypeEnd - haplotypeStart);
+ vector<string>::iterator o = var.alt.begin();
+ vector<string>::iterator n = newAlt.begin();
+ for ( ; o != var.alt.end() ; ++o, ++n) {
+ // map the haplotypes
+ string oldHaplotype = referenceHaplotype;
+ string newHaplotype = referenceHaplotype;
+ oldHaplotype.replace(var.position - haplotypeStart, var.ref.size(), *o);
+ newHaplotype.replace(newPosition - haplotypeStart, newRef.size(), *n);
+ if (oldHaplotype != newHaplotype) {
+ cerr << "broken left alignment!" << endl
+ << "old " << oldHaplotype << endl
+ << "new " << newHaplotype << endl;
+ cerr << "was: " << var << endl;
+ brokenRealignment = true;
+ }
+ }
+ }
+
+ // *if* everything is OK, update the variant
+ if (!brokenRealignment && !newRef.empty() && !failedAlt) {
+ var.ref = newRef;
+ var.alt = newAlt;
+ var.position = newPosition;
+ }
+
+ cout << var << endl;
+
+ // for each parsedalternate, get the position
+ // build a new vcf record for that position
+ // unless we are already at the position !
+ // take everything which is unique to that allele (records) and append it to the new record
+ // then handle genotypes; determine the mapping between alleleic primitives and convert to phased haplotypes
+ // this means taking all the parsedAlternates and, for each one, generating a pattern of allele indecies corresponding to it
+
+
+
+ //for (vector<Variant>::iterator v = variants.begin(); v != variants.end(); ++v) {
+ }
+
+ return 0;
+
+}
+
diff --git a/src/vcflength.cpp b/src/vcflength.cpp
new file mode 100644
index 0000000..ebcc1a2
--- /dev/null
+++ b/src/vcflength.cpp
@@ -0,0 +1,49 @@
+#include "Variant.h"
+#include "convert.h"
+#include <vector>
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) {
+ // Adds length, length.ref and length.alt INFO fields to every record of a VCF (file argument or stdin).
+ VariantCallFile variantFile;
+
+ if (argc > 1) {
+ string filename = argv[1];
+ variantFile.open(filename);
+ } else {
+ variantFile.open(std::cin);
+ }
+
+ if (!variantFile.is_open()) {
+ cerr << "could not open VCF file" << endl; // say why we stop instead of exiting without a message
+ return 1;
+ }
+ variantFile.addHeaderLine("##INFO=<ID=length,Number=A,Type=Integer,Description=\"length(ALT) - length(REF) for each ALT\">");
+ variantFile.addHeaderLine("##INFO=<ID=length.ref,Number=1,Type=Integer,Description=\"length(REF)\">");
+ variantFile.addHeaderLine("##INFO=<ID=length.alt,Number=A,Type=Integer,Description=\"length(ALT) for each ALT\">");
+ cout << variantFile.header << endl;
+
+ Variant var(variantFile);
+ while (variantFile.getNextVariant(var)) {
+ vector<string>& lengths = var.info["length"]; // one entry per ALT: length(ALT) - length(REF)
+ lengths.clear();
+ for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+ lengths.push_back(convert((int) a->size() - (int) var.ref.size()));
+ }
+ vector<string>& lengthsRef = var.info["length.ref"]; // single entry: length(REF)
+ lengthsRef.clear();
+ lengthsRef.push_back(convert(var.ref.size()));
+ vector<string>& lengthsAlt = var.info["length.alt"]; // one entry per ALT: length(ALT)
+ lengthsAlt.clear();
+ for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+ lengthsAlt.push_back(convert((int) a->size()));
+ }
+ cout << var << endl;
+ }
+
+ return 0;
+
+}
+
diff --git a/src/vcfnumalt.cpp b/src/vcfnumalt.cpp
new file mode 100644
index 0000000..a7c66cb
--- /dev/null
+++ b/src/vcfnumalt.cpp
@@ -0,0 +1,55 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <sstream>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+int main(int argc, char** argv) {
+ // Rewrites a VCF so that every record carries NUMALT, set to the number of ALT alleles listed on it.
+ if (argc != 2) {
+ cerr << "usage: " << argv[0] << " <vcf file>" << endl;
+ cerr << "outputs a VCF stream where NUMALT has been generated for each record using sample genotypes" << endl;
+ return 1;
+ }
+
+ // "-" means read from standard input
+ string filename = argv[1];
+ VariantCallFile variantFile;
+ if (filename != "-") {
+ variantFile.open(filename);
+ } else {
+ variantFile.open(std::cin);
+ }
+
+ if (!variantFile.is_open()) {
+ cerr << "could not open " << filename << endl;
+ return 1;
+ }
+
+ Variant var(variantFile);
+
+ // drop any existing NUMALT definition, then add ours, so the header never holds two copies
+ variantFile.removeInfoHeaderLine("NUMALT");
+ variantFile.addHeaderLine("##INFO=<ID=NUMALT,Number=1,Type=Integer,Description=\"Total number of segregating alternate alleles at the loci\">");
+
+ // write the new header
+ cout << variantFile.header << endl;
+
+ // annotate and print each record
+ while (variantFile.getNextVariant(var)) {
+ // the value written is the size of the record's ALT list
+ stringstream na;
+ na << var.alt.size();
+
+ vector<string>& numalt = var.info["NUMALT"];
+ numalt.clear();
+ numalt.push_back(na.str());
+ cout << var << endl;
+ }
+
+ return 0;
+}
+
diff --git a/src/vcfoverlay.cpp b/src/vcfoverlay.cpp
new file mode 100644
index 0000000..1bb415c
--- /dev/null
+++ b/src/vcfoverlay.cpp
@@ -0,0 +1,109 @@
+#include "Variant.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+
+void printSummary(char** argv) { // prints the usage text to stderr and exits with status 0
+ const char* program = argv[0];
+ cerr << "usage: " << program << " [options] [<vcf file> ...]" << endl;
+ cerr << endl;
+ cerr << "options:" << endl;
+ cerr << " -h, --help this dialog" << endl;
+ cerr << endl << "Overlays records in the input vcf files in the order in which they appear." << endl;
+ exit(0);
+}
+
+int main(int argc, char** argv) {
+ // Overlays the records of several VCF files onto stdout; printSummary holds the usage text.
+ if (argc == 1) {
+ printSummary(argv);
+ }
+
+ // option parsing: only -h/--help is recognised
+ for (;;) {
+ static struct option long_options[] =
+ {
+ {"help", no_argument, 0, 'h'},
+ {0, 0, 0, 0}
+ };
+ int option_index = 0;
+
+ const int c = getopt_long (argc, argv, "h", long_options, &option_index);
+ if (c == -1) {
+ break;
+ }
+
+ if (c == 'h') {
+ printSummary(argv);
+ } else if (c == '?') {
+ printSummary(argv);
+ exit(1);
+ } else {
+ abort ();
+ }
+ }
+
+ // idea here is to shadow-merge
+ // records from the VCF files, which are provided in order of desired merge
+
+ // nested index: chromosome -> position -> var.vrepr() -> file index -> raw record line
+ typedef map<int, string> LinesByFile;
+ typedef map<string, LinesByFile> LinesByRepr;
+ typedef map<long int, LinesByRepr> LinesByPosition;
+ typedef map<string, LinesByPosition> LinesByChrom;
+
+ map<int, pair<VariantCallFile*, Variant > > variantFiles;
+ LinesByChrom linesByPrecedence;
+ int i = optind;
+
+ if (optind >= argc - 1) {
+ cerr << "more than one input file must be specified" << endl;
+ exit(1);
+ }
+
+ // read every input completely, filing each record line under the index of the file it came from
+ while (i < argc) {
+ const int index = i++;
+ pair<VariantCallFile*, Variant >& slot = variantFiles[index];
+ Variant& var = slot.second;
+ string inputFilename = argv[optind++];
+ slot.first = new VariantCallFile;
+ VariantCallFile* variantFile = slot.first;
+ try {
+ if (variantFile->open(inputFilename)) {
+ var.setVariantCallFile(variantFile);
+ while (variantFile->getNextVariant(var)) {
+ linesByPrecedence[var.sequenceName][var.position][var.vrepr()][index] = variantFile->line;
+ }
+ } else {
+ cerr << "vcfoverlay could not open VCF file " << inputFilename << endl;
+ }
+ } catch (...) {
+ cerr << "vcfoverlay encountered errors when opening " << inputFilename << endl;
+ }
+ }
+
+ // the header printed is that of the lowest-indexed input
+ cout << variantFiles.begin()->second.first->header << endl;
+
+ // drain the index in map order: take the first position of the first chromosome,
+ // and for each record representation there print the line with the lowest file index
+ while (!linesByPrecedence.empty()) {
+ LinesByChrom::iterator chrom = linesByPrecedence.begin();
+ LinesByPosition::iterator lowest = chrom->second.begin();
+ LinesByRepr& pos = lowest->second;
+ for (LinesByRepr::iterator m = pos.begin(); m != pos.end(); ++m) {
+ cout << m->second.begin()->second << endl;
+ }
+
+ // remove the position just printed, and the chromosome once it has no positions left
+ chrom->second.erase(lowest);
+ if (chrom->second.empty()) {
+ linesByPrecedence.erase(chrom);
+ }
+ }
+
+ return 0;
+}
+
diff --git a/src/vcfparsealts.cpp b/src/vcfparsealts.cpp
new file mode 100644
index 0000000..5b4e508
--- /dev/null
+++ b/src/vcfparsealts.cpp
@@ -0,0 +1,42 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+// Print every record of a VCF file (argv[1], or stdin when no argument is
+// given), each followed by one line listing the result of
+// Variant::parsedAlternates() for that record.
+// Returns 1 when the input cannot be opened, 0 otherwise.
+int main(int argc, char** argv) {
+
+    typedef map<string, vector<VariantAllele> > AlleleMap;
+
+    VariantCallFile vcfIn;
+    if (argc <= 1) {
+        vcfIn.open(std::cin);
+    } else {
+        string path = argv[1];
+        vcfIn.open(path);
+    }
+
+    // nothing to do when the input could not be opened
+    if (!vcfIn.is_open())
+        return 1;
+
+    cout << vcfIn.header << endl;
+
+    Variant record(vcfIn);
+    while (vcfIn.getNextVariant(record)) {
+        AlleleMap parsed = record.parsedAlternates();
+        cout << record << endl;
+        // one " ( key :: allele; allele;  ) " group per map entry, all on one line
+        for (AlleleMap::iterator entry = parsed.begin(); entry != parsed.end(); ++entry) {
+            cout << " ( " << entry->first << " :: ";
+            vector<VariantAllele>& alleles = entry->second;
+            for (vector<VariantAllele>::iterator allele = alleles.begin();
+                 allele != alleles.end(); ++allele) {
+                cout << *allele << "; ";
+            }
+            cout << " ) ";
+        }
+        cout << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfprimers.cpp b/src/vcfprimers.cpp
new file mode 100644
index 0000000..2a5c46a
--- /dev/null
+++ b/src/vcfprimers.cpp
@@ -0,0 +1,140 @@
+#include "Variant.h"
+#include "split.h"
+#include "fastahack/Fasta.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+// Write the usage text of vcfprimers to stderr, then terminate the process
+// with exit status 0. argv[0] is used as the program name.
+void printSummary(char** argv) {
+    // synopsis and option list
+    cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
+         << endl;
+    cerr << "options:" << endl
+         << " -f, --fasta-reference FASTA reference file to use to obtain primer sequences" << endl
+         << " -l, --primer-length The length of the primer sequences on each side of the variant" << endl
+         << endl;
+    // description of the output
+    cerr << "For each VCF record, extract the flanking sequences, and write them to stdout as FASTA" << endl
+         << "records suitable for alignment. This tool is intended for use in designing validation" << endl
+         << "experiments. Primers extracted which would flank all of the alleles at multi-allelic" << endl
+         << "sites. The name of the FASTA \"reads\" indicates the VCF record which they apply to." << endl
+         << "The form is >CHROM_POS_LEFT for the 3' primer and >CHROM_POS_RIGHT for the 5' primer," << endl
+         << "for example:" << endl
+         << endl;
+    // worked example
+    cerr << ">20_233255_LEFT" << endl
+         << "CCATTGTATATATAGACCATAATTTCTTTATCCAATCATCTGTTGATGGA" << endl
+         << ">20_233255_RIGHT" << endl
+         << "ACTCAGTTGATTCCATACCTTTGCCATCATGAATCATGTTGTAATAAACA" << endl
+         << endl;
+    exit(0);
+}
+
+
+// Entry point of vcfprimers.
+//
+// For every record of the input VCF (the single positional argument, or stdin
+// when none is given) two FASTA records are written to stdout: the reference
+// bases immediately left of the REF allele (">CHROM_POS_LEFT") and the bases
+// immediately right of it (">CHROM_POS_RIGHT"), each up to primerLength long.
+//
+// Both -f/--fasta-reference and -l/--primer-length (a positive integer) are
+// mandatory; the process exits with status 1 when either is missing.
+// Returns 1 when the VCF input cannot be opened, 0 otherwise.
+int main(int argc, char** argv) {
+
+    int c;
+    string fastaRef;
+    int primerLength = 0;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"fasta-reference", required_argument, 0, 'f'},
+                {"primer-length", required_argument, 0, 'l'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hf:l:",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+            case 0:
+                /* If this option set a flag, do nothing else now. */
+                if (long_options[option_index].flag != 0)
+                    break;
+                printf ("option %s", long_options[option_index].name);
+                if (optarg)
+                    printf (" with arg %s", optarg);
+                printf ("\n");
+                break;
+
+            case 'f':
+                fastaRef = optarg;
+                break;
+
+            case 'l':
+                primerLength = atoi(optarg);
+                break;
+
+            case 'h':
+                printSummary(argv);
+                exit(0);
+                break;
+
+            case '?':
+                /* getopt_long already printed an error message. */
+                printSummary(argv);
+                exit(1);
+                break;
+
+            default:
+                abort ();
+        }
+    }
+
+    // atoi() yields 0 for non-numeric input and may yield a negative value;
+    // neither is a usable flank length.
+    if (primerLength <= 0) {
+        cerr << "a positive primer length must be specified" << endl;
+        exit(1);
+    }
+    if (fastaRef.empty()) {
+        cerr << "a FASTA reference sequence must be specified" << endl;
+        exit(1);
+    }
+
+    FastaReference ref;
+    ref.open(fastaRef);
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        // 0-based start of the REF allele and 0-based position just past it
+        int refstart = var.position - 1;
+        int refend = var.position + var.ref.size() - 1;
+
+        // The left flank must not start before the first base of the
+        // sequence: for variants closer than primerLength to the start,
+        // shorten the flank instead of requesting a negative offset.
+        int leftstart = refstart - primerLength;
+        int leftlength = primerLength;
+        if (leftstart < 0) {
+            leftlength += leftstart;
+            leftstart = 0;
+        }
+
+        string leftprimer;
+        if (leftlength > 0) {
+            leftprimer = ref.getSubSequence(var.sequenceName, leftstart, leftlength);
+        }
+        string rightprimer = ref.getSubSequence(var.sequenceName, refend, primerLength);
+
+        cout << ">" << var.sequenceName << "_" << var.position << "_LEFT" << endl
+             << leftprimer << endl
+             << ">" << var.sequenceName << "_" << var.position << "_RIGHT" << endl
+             << rightprimer << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfqual2info.cpp b/src/vcfqual2info.cpp
new file mode 100644
index 0000000..71c3335
--- /dev/null
+++ b/src/vcfqual2info.cpp
@@ -0,0 +1,44 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+// Copy the QUAL column of every record into an INFO field.
+//
+// argv[1] is the INFO key to write; argv[2], when present, names the VCF file
+// (stdin is read otherwise). The header gains a matching ##INFO line and each
+// record is printed with info[key] replaced by its QUAL value.
+// Returns 1 on missing arguments or unreadable input, 0 otherwise.
+int main(int argc, char** argv) {
+
+    VariantCallFile vcfIn;
+
+    if (argc == 1) {
+        cerr << "usage: " << argv[0] << " [key] [vcf_file]" << endl
+             << "Puts QUAL into an info field tag keyed by [key]." << endl
+             << "The VCF file may be omitted and read from stdin." << endl;
+        return 1;
+    }
+
+    string key = argv[1];
+
+    if (argc <= 2) {
+        vcfIn.open(std::cin);
+    } else {
+        string path = argv[2];
+        vcfIn.open(path);
+    }
+
+    if (!vcfIn.is_open())
+        return 1;
+
+    // declare the new INFO field before the header is emitted
+    string infoLine = "##INFO=<ID=" + key
+        + ",Number=1,Type=Float,Description=\"QUAL value of site field.\">";
+    vcfIn.addHeaderLine(infoLine);
+
+    cout << vcfIn.header << endl;
+
+    Variant record(vcfIn);
+    while (vcfIn.getNextVariant(record)) {
+        // overwrite any existing values stored under the key
+        vector<string>& qualField = record.info[key];
+        qualField.clear();
+        qualField.push_back(convert(record.quality));
+        cout << record << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfrandom.cpp b/src/vcfrandom.cpp
new file mode 100644
index 0000000..debab84
--- /dev/null
+++ b/src/vcfrandom.cpp
@@ -0,0 +1,70 @@
+#include <sstream>
+#include <stdlib.h>
+#include <time.h>
+#include "Variant.h"
+#include <cmath>
+
+using namespace std;
+using namespace vcf;
+
+// Emit a small VCF with randomly generated content: a fixed header with one
+// sample column ("bill") followed by nine records at positions 1..9 of
+// sequence "one". Random values come from rand(), seeded with the current
+// time. Command-line arguments are ignored.
+int main(int argc, char** argv) {
+
+    VariantCallFile vcfOut;
+
+    // fixed header text; the final (#CHROM) line carries no trailing newline
+    string header =
+        "##fileformat=VCFv4.0\n"
+        "##source=vcfrandom\n"
+        "##reference=/d2/data/references/build_37/human_reference_v37.fa\n"
+        "##phasing=none\n"
+        "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">\n"
+        "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total read depth at the locus\">\n"
+        "##INFO=<ID=AC,Number=1,Type=Integer,Description=\"Total number of alternate alleles in called genotypes\">\n"
+        "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">\n"
+        "##INFO=<ID=AF,Number=1,Type=Float,Description=\"Estimated allele frequency in the range (0,1]\">\n"
+        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
+        "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype\">\n"
+        "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n"
+        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tbill";
+
+    vcfOut.openForOutput(header);
+
+    cout << vcfOut.header << endl;
+
+    srand(time(NULL));
+
+    // the four bases a random allele is drawn from
+    const char* baseLetters[] = { "A", "T", "G", "C" };
+    vector<string> bases(baseLetters, baseLetters + 4);
+
+    for (int pos = 1; pos < 10; ++pos) {
+        Variant record(vcfOut);
+        record.sequenceName = "one";
+        record.id = ".";
+        record.filter = ".";
+        record.ref = bases.at(rand() % 4);
+        record.quality = 100;
+
+        // site-level depth
+        stringstream siteDepth;
+        siteDepth << rand() % 100;
+        record.info["DP"].push_back(siteDepth.str());
+
+        record.format.push_back("GT");
+        record.format.push_back("DP");
+        record.position = pos;
+
+        // per-sample fields; the ALT list is regenerated for every sample
+        for (vector<string>::iterator sampleName = record.sampleNames.begin();
+             sampleName != record.sampleNames.end(); ++sampleName) {
+            record.alt.clear();
+            record.alt.push_back(bases.at(rand() % 4));
+            record.alt.push_back(bases.at(rand() % 4));
+
+            map<string, vector<string> >& fields = record.samples[*sampleName];
+            fields["GT"].push_back("0/1");
+
+            stringstream sampleDepth;
+            sampleDepth << floor(rand() % 100);
+            fields["DP"].push_back(sampleDepth.str());
+        }
+        cout << record << endl;
+    }
+
+    return 0;
+
+}
diff --git a/src/vcfrandomsample.cpp b/src/vcfrandomsample.cpp
new file mode 100644
index 0000000..3cc565b
--- /dev/null
+++ b/src/vcfrandomsample.cpp
@@ -0,0 +1,174 @@
+#include "Variant.h"
+#include "BedReader.h"
+#include <getopt.h>
+#include "mt19937ar.h"
+#include <sstream>
+#include <iostream>
+#include "convert.h"
+
+using namespace std;
+using namespace vcf;
+
+
+// Write the usage text of vcfrandomsample to stderr, then terminate the
+// process with exit status 0. argv[0] is used as the program name.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl
+         << endl
+         << "options:" << endl
+         << " -r, --rate RATE base sampling probability per locus" << endl
+         << " -s, --scale-by KEY scale sampling likelihood by this Float info field" << endl
+         << " -p, --random-seed N use this random seed (by default read from /dev/random)" << endl
+         // -q switches the seed source; it does not take a value
+         << " -q, --pseudorandom-seed read the random seed from /dev/urandom instead of /dev/random" << endl
+         << endl
+         << "Randomly sample sites from an input VCF file, which may be provided as stdin." << endl
+         << "Scale the sampling probability by the field specified in KEY. This may be" << endl
+         << "used to provide uniform sampling across allele frequencies, for instance." << endl;
+    exit(0);
+}
+
+// Entry point of vcfrandomsample.
+//
+// Reads a VCF (single positional argument, or stdin) and prints each record
+// whose random draw in [0,1] falls below the sampling rate (-r, default 1.0).
+// With -s KEY the draw is first multiplied by the mean of the record's KEY
+// INFO values, which must be declared as Float and must not exceed 1.
+// The seed is taken from -p, or else read from /dev/random (/dev/urandom with
+// -q); it is recorded in a ##sampling header line together with the command
+// line.
+// Returns 1 when the input cannot be opened, 0 otherwise.
+int main(int argc, char** argv) {
+
+    double rate = 1.0;
+    int seed = 0;
+    bool useprng = false;
+    string scaleByKey;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"rate", required_argument, 0, 'r'},
+                {"scale-by", required_argument, 0, 's'},
+                {"random-seed", required_argument, 0, 'p'},
+                // plain flag, matching the short form "q" (no ':') below;
+                // declaring it required_argument made the long form swallow
+                // the next command-line word
+                {"pseudorandom-seed", no_argument, 0, 'q'},
+                {0, 0, 0, 0}
+            };
+
+        int option_index = 0;
+        c = getopt_long (argc, argv, "hqr:s:p:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+            case 'r':
+                rate = atof(optarg);
+                break;
+
+            case 's':
+                scaleByKey = optarg;
+                break;
+
+            case 'p':
+                seed = atoi(optarg);
+                break;
+
+            case 'q':
+                useprng = true;
+                break;
+
+            case 'h':
+                printSummary(argv);
+                break;
+
+            case '?':
+                printSummary(argv);
+                exit(1);
+                break;
+
+            default:
+                abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        // stdout carries the VCF output, so diagnostics go to stderr
+        cerr << "could not open VCF file" << endl;
+        return 1;
+    }
+
+    // no (or zero) seed given: fill it with raw bytes from the random device
+    if (!seed) {
+        fstream random;
+        if (useprng) {
+            random.open("/dev/urandom", fstream::in);
+        } else {
+            random.open("/dev/random", fstream::in);
+        }
+        // read() copies exactly sizeof(int) raw bytes; get(char*, n) is a
+        // text read that stops at '\n', takes only n-1 bytes and appends NUL
+        random.read((char*) &seed, sizeof(int));
+        random.close();
+    }
+
+    init_genrand(seed);
+
+    vector<string> args;
+    for (int i = 0; i < argc; ++i) {
+        args.push_back(argv[i]);
+    }
+
+    // record how the sample was produced so that it can be reproduced
+    stringstream liness;
+    liness << "##sampling=\"random sampling using "
+           << join(args, " ")
+           << " using random seed "
+           << seed << "\"";
+    variantFile.addHeaderLine(liness.str());
+
+    cout << variantFile.header << endl;
+
+    // check that we can use the scaling key
+    if (!scaleByKey.empty()) {
+        if (variantFile.infoTypes.find(scaleByKey) == variantFile.infoTypes.end()) {
+            cerr << "could not find info key " << scaleByKey << endl;
+            exit(1);
+        } else {
+            if (variantFile.infoTypes[scaleByKey] != FIELD_FLOAT) {
+                cerr << "cannot use " << scaleByKey << " as a scaling factor, as it is not of type Float" << endl;
+                exit(1);
+            }
+        }
+    }
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        double randN = genrand_real1();
+        if (!scaleByKey.empty()) {
+            if (var.info.find(scaleByKey) != var.info.end()) {
+                vector<string>& vals = var.info[scaleByKey];
+                // an empty value list gives nothing to scale by (and would
+                // divide by zero below), so the draw is left unscaled
+                if (!vals.empty()) {
+                    // hack, average the values of interest if we have multiple values
+                    // really, this is only suitable for AF stuff
+                    double val = 0; // accumulator must start at zero
+                    for (vector<string>::iterator b = vals.begin(); b != vals.end(); ++b) {
+                        double f = 0;
+                        convert(*b, f);
+                        val += f;
+                    }
+                    val /= vals.size();
+
+                    if (val > 1) {
+                        cerr << "cannot scale by " << scaleByKey << "=" << val << " as it is > 1" << endl;
+                        exit(1);
+                    }
+                    randN *= val;
+                }
+            }
+        }
+        if (randN < rate) {
+            cout << var << endl;
+        }
+    }
+
+    return 0;
+
+}
diff --git a/src/vcfremap.cpp b/src/vcfremap.cpp
new file mode 100644
index 0000000..82c9997
--- /dev/null
+++ b/src/vcfremap.cpp
@@ -0,0 +1,350 @@
+#include "Variant.h"
+#include "BedReader.h"
+#include "intervaltree/IntervalTree.h"
+#include <getopt.h>
+#include "fastahack/Fasta.h"
+#include <algorithm>
+#include <list>
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+
+// Write the usage text of vcfremap to stderr, then terminate the process with
+// exit status 0. argv[0] is used as the program name.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl
+         << endl
+         << "options:" << endl
+         << " -w, --ref-window-size N align using this many bases flanking each side of the reference allele" << endl
+         << " -s, --alt-window-size N align using this many flanking bases from the reference around each alternate allele" << endl
+         // this tool has no -i/-u options and always needs the reference
+         << " -r, --reference FILE FASTA reference file (required)" << endl
+         << " -m, --match-score N match score for SW algorithm" << endl
+         << " -x, --mismatch-score N mismatch score for SW algorithm" << endl
+         << " -o, --gap-open-penalty N gap open penalty for SW algorithm" << endl
+         << " -e, --gap-extend-penalty N gap extension penalty for SW algorithm" << endl
+         << " -z, --entropy-gap-open use entropy scaling for the gap open penalty" << endl
+         << " -R, --repeat-gap-extend N penalize non-repeat-unit gaps in repeat sequence" << endl
+         << " -a, --adjust-vcf TAG supply a new cigar as TAG in the output VCF" << endl
+         << endl
+         << "For each alternate allele, attempt to realign against the reference with lowered gap open penalty." << endl
+         << "If realignment is possible, adjust the cigar and reference/alternate alleles." << endl;
+    exit(0);
+}
+
+// Entry point of vcfremap.
+//
+// For every alternate allele of every input record, a query is built from the
+// allele plus altwindowsize reference bases on each side and aligned with
+// Smith-Waterman-Gotoh against the reference window around the record
+// (windowsize bases on each side of REF). The raw CIGAR and the gapped
+// reference/alternate strings are printed. With -a TAG the per-allele CIGARs
+// are additionally trimmed, stored in INFO field TAG, REF is re-fetched from
+// the reference and the record is printed again.
+//
+// A FASTA reference (-r) is mandatory. Exits with status 1 when the VCF input
+// cannot be opened or no reference was given; returns 0 otherwise.
+int main(int argc, char** argv) {
+
+    string fastaFileName;
+    int windowsize = 100;
+    int altwindowsize = 50;
+
+    // constants for SmithWaterman algorithm
+    float matchScore = 10.0f;
+    float mismatchScore = -9.0f;
+    float gapOpenPenalty = 15.0f;
+    float gapExtendPenalty = 6.66f;
+
+    bool useEntropy = false;
+    bool useRepeatGapExtendPenalty = false;
+    float repeatGapExtendPenalty = 1;
+
+    bool adjustVcf = false;
+    string adjustedTag = "remappedCIGAR";
+
+    if (argc == 1)
+        printSummary(argv);
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"ref-window-size", required_argument, 0, 'w'},
+                {"reference", required_argument, 0, 'r'},
+                {"match-score", required_argument, 0, 'm'},
+                {"mismatch-score", required_argument, 0, 'x'},
+                {"gap-open-penalty", required_argument, 0, 'o'},
+                {"gap-extend-penalty", required_argument, 0, 'e'},
+                {"alt-window-size", required_argument, 0, 's'},
+                {"entropy-gap-open", no_argument, 0, 'z'},
+                // takes a value, like the short form "R:"; with no_argument
+                // the long form left optarg NULL for the atof() call below
+                {"repeat-gap-extend", required_argument, 0, 'R'},
+                {"adjust-vcf", required_argument, 0, 'a'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hza:w:r:m:x:o:e:s:R:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+            case 'w':
+                windowsize = atoi(optarg);
+                break;
+
+            case 'a':
+                adjustVcf = true;
+                adjustedTag = optarg;
+                break;
+
+            case 'r':
+                fastaFileName = string(optarg);
+                break;
+
+            case 'h':
+                printSummary(argv);
+                break;
+
+            case 'm':
+                matchScore = atof(optarg);
+                break;
+
+            case 'x':
+                mismatchScore = atof(optarg);
+                break;
+
+            case 'o':
+                gapOpenPenalty = atof(optarg);
+                break;
+
+            case 'e':
+                gapExtendPenalty = atof(optarg);
+                break;
+
+            case 's':
+                altwindowsize = atoi(optarg);
+                break;
+
+            case 'z':
+                useEntropy = true;
+                break;
+
+            case 'R':
+                useRepeatGapExtendPenalty = true;
+                repeatGapExtendPenalty = atof(optarg);
+                break;
+
+            case '?':
+                printSummary(argv);
+                exit(1);
+                break;
+
+            default:
+                abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open VCF file" << endl;
+        exit(1);
+    }
+
+    FastaReference freference;
+    if (fastaFileName.empty()) {
+        cerr << "a reference is required" << endl;
+        exit(1);
+    } else {
+        freference.open(fastaFileName);
+    }
+
+    // declare the INFO tag that will carry the remapped CIGARs
+    if (adjustVcf) {
+        vector<string> commandline;
+        for (int i = 0; i < argc; ++i)
+            commandline.push_back(argv[i]);
+        variantFile.addHeaderLine("##INFO=<ID=" + adjustedTag + ",Number=A,Type=String,Description=\"CIGAR when remapped using " + join(commandline, " ") + "\">");
+    }
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        // NOTE(review): the original record and the alignment dumps below are
+        // printed even with -a, interleaved with the adjusted records.
+        cout << endl;
+        cout << var << endl;
+
+        map<string, vector<VariantAllele> > variantAlleles;
+        vector<vector<pair<int, char> > > cigars;   // one parsed CIGAR per alternate (filled with -a only)
+        vector<int> positionDiffs;                  // alignment start offset per alternate
+        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+            cout << endl;
+
+            // try to remap locally
+
+            // reference window: windowsize bases on each side of the REF allele
+            string reference = freference.getSubSequence(var.sequenceName, var.position - 1 - windowsize, windowsize * 2 + var.ref.size());
+
+            // passed to sw align
+            unsigned int referencePos;
+            string cigar;
+
+            string& alternate = *a;
+
+            vector<VariantAllele>& variants = variantAlleles[alternate];
+
+            // query: left flank + alternate allele + right flank, each flank
+            // altwindowsize bases taken from the reference window
+            string alternateQuery = reference.substr(windowsize - altwindowsize, altwindowsize) + alternate + reference.substr(reference.size() - windowsize, altwindowsize);
+
+            CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
+            if (useEntropy) sw.EnableEntropyGapPenalty(1);
+            if (useRepeatGapExtendPenalty) sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty);
+            sw.Align(referencePos, cigar, reference, alternateQuery);
+
+            int altpos = 0;   // offset consumed in alternateQuery
+            int refpos = 0;   // offset consumed in ref
+            int len;
+            string slen;      // digits of the CIGAR length currently being read
+            vector<pair<int, char> > cigarData;
+
+            string ref = reference.substr(referencePos);
+            positionDiffs.push_back(referencePos); // TODO this... is borked
+
+            stringstream refss;   // gapped reference side of the alignment
+            stringstream altss;   // gapped alternate side of the alignment
+
+            // NOTE(review): without -a the CIGAR is printed twice.
+            if (!adjustVcf) cout << cigar << endl;
+            cout << cigar << endl;
+
+            // Walk the CIGAR text: digits accumulate in slen, an operation
+            // letter consumes them as its length.
+            for (string::iterator c = cigar.begin(); c != cigar.end(); ++c) {
+                switch (*c) {
+                    case 'I':
+                        len = atoi(slen.c_str());
+                        slen.clear();
+                        // an insertion inside the left flank is recorded as a match
+                        if (altpos < altwindowsize) {
+                            cigarData.push_back(make_pair(len, 'M'));
+                        } else {
+                            cigarData.push_back(make_pair(len, *c));
+                        }
+                        altss << alternateQuery.substr(altpos, len);
+                        refss << string(len, '-');
+                        altpos += len;
+                        break;
+                    case 'D':
+                        len = atoi(slen.c_str());
+                        slen.clear();
+                        // a deletion inside the left flank is not recorded
+                        if (altpos >= altwindowsize) {
+                            cigarData.push_back(make_pair(len, *c));
+                        }
+                        refss << ref.substr(refpos, len);
+                        altss << string(len, '-');
+                        refpos += len;
+                        break;
+                    case 'M':
+                        len = atoi(slen.c_str());
+                        slen.clear();
+                        {
+                            // split the aligned run into runs of true matches
+                            // ('M') and mismatches ('X'), extending the
+                            // previous element when it is of the same kind
+                            for (int i = 0; i < len; ++i) {
+                                if (ref.at(refpos + i) == alternateQuery.at(altpos + i)) {
+                                    if (!cigarData.empty() && cigarData.back().second == 'M') {
+                                        cigarData.back().first++;
+                                    } else {
+                                        cigarData.push_back(make_pair(1, 'M'));
+                                    }
+                                } else {
+                                    if (!cigarData.empty() && cigarData.back().second == 'X') {
+                                        cigarData.back().first++;
+                                    } else {
+                                        cigarData.push_back(make_pair(1, 'X'));
+                                    }
+                                }
+                            }
+                        }
+                        refss << ref.substr(refpos, len);
+                        altss << alternateQuery.substr(altpos, len);
+                        refpos += len;
+                        altpos += len;
+                        break;
+                    case 'S':
+                        len = atoi(slen.c_str());
+                        slen.clear();
+                        cigarData.push_back(make_pair(len, *c));
+                        refss << ref.substr(refpos, len);
+                        // TODO deal with soft clipping, weird behavior:
+                        // the clipped query bases are not written to altss
+                        refpos += len;
+                        altpos += len;
+                        break;
+                    default:
+                        // not an operation letter: part of the length
+                        len = 0;
+                        slen += *c;
+                        break;
+                }
+            }
+
+            if (!adjustVcf) {
+                cout << "ref:\t" << refss.str() << endl;
+                cout << "alt:\t" << altss.str() << endl;
+            } else {
+                cout << "ref:\t" << refss.str() << endl;
+                cout << "alt:\t" << altss.str() << endl;
+                cigars.push_back(cigarData);
+            }
+
+        }
+
+        if (adjustVcf) {
+            // NOTE(review): front()/back() below assume at least one alternate
+            // allele and a non-empty parsed CIGAR for each - confirm upstream.
+            int substart = cigars.front().front().first;
+            int subend = cigars.front().back().first;
+
+            // find the min and max match
+            for (vector<vector<pair<int, char> > >::iterator c = cigars.begin(); c != cigars.end(); ++c) {
+                if (c->front().second == 'M' && c->front().first <= substart) {
+                    substart = c->front().first;
+                    if (c->size() > 1 && c->at(1).second != 'X') {
+                        --substart;
+                    }
+                }
+                if (c->back().second == 'M' && c->back().first <= subend) {
+                    subend = c->back().first;
+                }
+            }
+
+            // adjust the cigars and get the new reference length
+            int reflen = 0;
+            for (vector<vector<pair<int, char> > >::iterator c = cigars.begin(); c != cigars.end(); ++c) {
+                c->front().first -= substart;
+                c->back().first -= subend;
+                int crf = cigarRefLen(*c);
+                if (crf > reflen)
+                    reflen = crf;
+                var.info[adjustedTag].push_back(joinCigar(*c));
+            }
+
+            // find the lowest positional difference
+            // NOTE(review): pdiff starts at 0 and is only lowered by a negative
+            // sum, so for non-negative offsets it stays 0 (see TODO above).
+            int pdiff = 0;
+            for (vector<int>::iterator d = positionDiffs.begin(); d != positionDiffs.end(); ++d) {
+                if (*d + altwindowsize < pdiff)
+                    pdiff = *d + altwindowsize;
+            }
+
+            // adjust the variant position
+            var.position += pdiff;
+
+            // adjust the reference string
+            var.ref = freference.getSubSequence(var.sequenceName, var.position - 1, reflen);
+
+            cout << var << endl;
+        }
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfremoveaberrantgenotypes.cpp b/src/vcfremoveaberrantgenotypes.cpp
new file mode 100644
index 0000000..75ebc32
--- /dev/null
+++ b/src/vcfremoveaberrantgenotypes.cpp
@@ -0,0 +1,75 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <sstream>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+// Remove from var.samples every sample whose homozygous genotype is
+// contradicted by its observation counts:
+//   - homozygous non-reference with a reference observation count (RO) > 0
+//   - homozygous reference with an alternate observation count (AO) > 0 for
+//     any alternate allele
+// Samples without a GT value are left untouched; a missing RO or AO field is
+// treated as "no contradicting observations".
+void stripAberrant(Variant& var) {
+    typedef map<string, vector<string> > SampleFields;
+
+    map<string, SampleFields>::iterator s = var.samples.begin();
+    while (s != var.samples.end()) {
+        SampleFields& sample = s->second;
+        bool aberrant = false;
+
+        // look fields up with find() so that absent keys are neither
+        // dereferenced nor inserted as empty entries
+        SampleFields::iterator gt = sample.find("GT");
+        if (gt != sample.end() && !gt->second.empty()) {
+            map<int, int> genotype = decomposeGenotype(gt->second.front());
+            if (isHomNonRef(genotype)) {
+                int refobs = 0;
+                SampleFields::iterator ro = sample.find("RO");
+                if (ro != sample.end() && !ro->second.empty()) {
+                    convert(ro->second.front(), refobs);
+                }
+                aberrant = refobs > 0;
+            } else if (isHomRef(genotype)) {
+                SampleFields::iterator ao = sample.find("AO");
+                if (ao != sample.end()) {
+                    for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+                        int alleleIndex = var.altAlleleIndexes[*a];
+                        int altobs = 0;
+                        convert(ao->second.at(alleleIndex), altobs);
+                        if (altobs > 0) {
+                            aberrant = true;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+
+        if (aberrant) {
+            // erase() invalidates the erased iterator only; the post-increment
+            // moves s to the next element before the erase takes place
+            var.samples.erase(s++);
+        } else {
+            ++s;
+        }
+    }
+}
+
+// Entry point of vcfremoveaberrantgenotypes.
+//
+// Reads the VCF named by argv[1] ("-" selects stdin), adds a ##filter header
+// line describing the operation, and prints every record after passing it
+// through stripAberrant().
+// Returns 1 on wrong usage or unreadable input, 0 otherwise.
+int main(int argc, char** argv) {
+
+    if (argc != 2) {
+        cerr << "usage: " << argv[0] << " <vcf file>" << endl
+             << "strips samples which are homozygous but have observations implying heterozygosity" << endl;
+        return 1;
+    }
+
+    string filename = argv[1];
+
+    VariantCallFile variantFile;
+    if (filename == "-") {
+        variantFile.open(std::cin);
+    } else {
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open " << filename << endl;
+        return 1;
+    }
+
+    Variant var(variantFile);
+
+    // record the filtering step in the header; the value is a plain quoted
+    // string, so there is no closing '>' to emit
+    string line = "##filter=\"removed homozygous genotypes which have observations implying heterozygosity\"";
+    variantFile.addHeaderLine(line);
+
+    // write the new header
+    cout << variantFile.header << endl;
+
+    // print the records after removing the aberrant samples from each
+    while (variantFile.getNextVariant(var)) {
+        stripAberrant(var);
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfremovesamples.cpp b/src/vcfremovesamples.cpp
new file mode 100644
index 0000000..b4b31df
--- /dev/null
+++ b/src/vcfremovesamples.cpp
@@ -0,0 +1,76 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+
+using namespace std;
+using namespace vcf;
+
+// Return the elements of b that do not occur in a.
+// The result holds each surviving value once, in ascending key order (the
+// values are collected through a std::map), not in the order they had in b.
+template<class T>
+vector<T> removeElems(vector<T>& a, vector<T>& b) {
+    // values that must be dropped
+    map<T, bool> excluded;
+    for (size_t k = 0; k < a.size(); ++k) {
+        excluded[a[k]] = true;
+    }
+
+    // values of b that survive, keyed so that duplicates collapse
+    map<T, bool> kept;
+    for (size_t k = 0; k < b.size(); ++k) {
+        if (excluded.count(b[k]) == 0) {
+            kept[b[k]] = true;
+        }
+    }
+
+    vector<T> result;
+    for (typename map<T, bool>::iterator entry = kept.begin(); entry != kept.end(); ++entry) {
+        result.push_back(entry->first);
+    }
+    return result;
+}
+
+// Entry point of vcfremovesamples.
+//
+// argv[1] names the VCF file ("-" selects stdin); every further argument is a
+// sample name to drop. The header and all records are printed with only the
+// remaining samples.
+// Returns 1 on wrong usage or unreadable input, 0 otherwise.
+int main(int argc, char** argv) {
+
+    if (argc < 3) {
+        cerr << "usage: " << argv[0] << " <vcf file> [SAMPLE1] [SAMPLE2] ..." << endl
+             << "outputs each record in the vcf file, removing samples listed on the command line" << endl;
+        return 1;
+    }
+
+    string vcfPath = argv[1];
+
+    // everything after the file name is a sample to drop
+    vector<string> dropped(argv + 2, argv + argc);
+
+    VariantCallFile vcfIn;
+    if (vcfPath == "-")
+        vcfIn.open(std::cin);
+    else
+        vcfIn.open(vcfPath);
+
+    if (!vcfIn.is_open())
+        return 1;
+
+    // the record object is created before the sample list is rewritten
+    Variant record(vcfIn);
+
+    vector<string> retained = removeElems(dropped, vcfIn.sampleNames);
+
+    // rewrite the header's sample columns, then limit record output to match
+    vcfIn.updateSamples(retained);
+    record.setOutputSampleNames(retained);
+
+    cout << vcfIn.header << endl;
+
+    // the output sample names set above do the filtering while printing
+    while (vcfIn.getNextVariant(record))
+        cout << record << endl;
+
+    return 0;
+
+}
+
diff --git a/src/vcfroc.cpp b/src/vcfroc.cpp
new file mode 100644
index 0000000..e77562e
--- /dev/null
+++ b/src/vcfroc.cpp
@@ -0,0 +1,469 @@
+#include "Variant.h"
+#include "BedReader.h"
+#include "intervaltree/IntervalTree.h"
+#include <getopt.h>
+#include "fastahack/Fasta.h"
+#include <algorithm>
+#include <list>
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+
+// Writes the vcfroc usage text to stderr and terminates the process with status 0.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] [<vcf file>]" << endl;
+    // remaining lines of the help text, one array entry per output line
+    static const char* const helpLines[] = {
+        "",
+        "options:",
+        " -t, --truth-vcf FILE use this VCF as ground truth for ROC generation",
+        " -w, --window-size N compare records up to this many bp away (default 30)",
+        " -c, --complex directly compare complex alleles, don't parse into primitives",
+        " -r, --reference FILE FASTA reference file",
+        "",
+        "Generates a pseudo-ROC curve using sensitivity and specificity estimated against",
+        "a putative truth set. Thresholding is provided by successive QUAL cutoffs."
+    };
+    const size_t lineCount = sizeof(helpLines) / sizeof(helpLines[0]);
+    for (size_t k = 0; k < lineCount; ++k) {
+        cerr << helpLines[k] << endl;
+    }
+    exit(0);
+}
+
+// Reads every record from variantFile, appends a copy to `variants`, and
+// fills `variantIntervals` with one interval tree per sequence name. Each
+// interval runs from the record position to position + ref length and
+// carries a pointer to the copy stored in `variants`.
+void buildVariantIntervalTree(VariantCallFile& variantFile,
+        map<string, IntervalTree<Variant*> >& variantIntervals,
+        list<Variant>& variants) {
+
+    typedef vector<Interval<Variant*> > IntervalVec;
+    map<string, IntervalVec> perSequence;
+
+    Variant record(variantFile);
+    while (variantFile.getNextVariant(record)) {
+        // std::list does not relocate elements, so this address stays valid
+        variants.push_back(record);
+        Variant* stored = &variants.back();
+        long int begin = record.position;
+        long int end = begin + record.ref.size(); // this should be 1-past the end
+        perSequence[record.sequenceName].push_back(Interval<Variant*>(begin, end, stored));
+    }
+
+    map<string, IntervalVec>::iterator seq = perSequence.begin();
+    for (; seq != perSequence.end(); ++seq) {
+        variantIntervals[seq->first] = IntervalTree<Variant*>(seq->second);
+    }
+}
+
+
+// Classifies each alternate allele of `var` as shared with, or absent from,
+// the variants stored in `variantIntervals` for the same sequence.
+//
+// var              record whose alts are classified
+// variantIntervals per-sequence interval trees of the other call set
+// commonAlleles    receives pointers to alts of `var` whose local haplotype
+//                  equals a haplotype built from a nearby variant
+// uniqueAlleles    receives pointers to the remaining alts of `var`
+// reference        FASTA reference used to fetch the local reference haplotype
+// windowsize       padding added on both sides of `var` when querying the tree
+//
+// The pointers pushed into the output vectors point into var.alt, so they are
+// only valid while `var` (and its alt vector) is alive and unmodified.
+void intersectVariant(Variant& var,
+ map<string, IntervalTree<Variant*> >& variantIntervals,
+ vector<string*>& commonAlleles,
+ vector<string*>& uniqueAlleles,
+ FastaReference& reference,
+ int windowsize = 50) {
+
+ vector<Interval<Variant*> > results;
+
+ // operator[] inserts an empty tree when the sequence name is not present yet
+ // NOTE(review): findContained presumably returns only intervals lying fully inside the window -- confirm against IntervalTree.h
+ variantIntervals[var.sequenceName].findContained(var.position - windowsize, var.position + var.ref.size() + windowsize, results);
+
+ vector<Variant*> overlapping;
+
+ for (vector<Interval<Variant*> >::iterator r = results.begin(); r != results.end(); ++r) {
+ overlapping.push_back(r->value);
+ }
+
+
+ if (overlapping.empty()) {
+ // nothing nearby in the other set: every alt is unique
+ for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+ uniqueAlleles.push_back(&*a);
+ }
+ } else {
+
+ // get the min and max of the overlaps
+
+ int haplotypeStart = var.position;
+ int haplotypeEnd = var.position + var.ref.size();
+
+ for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
+ haplotypeStart = min((*v)->position, (long int) haplotypeStart);
+ haplotypeEnd = max((*v)->position + (*v)->ref.size(), (long unsigned int) haplotypeEnd);
+ }
+
+ // for everything overlapping and the current variant, construct the local haplotype within the bounds
+ // if there is an exact match, the allele in the current VCF does intersect
+
+ // NOTE(review): the "- 1" presumably converts a 1-based VCF position to a 0-based FASTA offset -- confirm
+ string referenceHaplotype = reference.getSubSequence(var.sequenceName, haplotypeStart - 1, haplotypeEnd - haplotypeStart);
+ map<string, vector<pair<Variant*, int> > > haplotypes; // map to variant and alt index
+
+ // one haplotype per (nearby variant, alt): the reference window with that variant's ref span replaced by the alt
+ for (vector<Variant*>::iterator v = overlapping.begin(); v != overlapping.end(); ++v) {
+ Variant& variant = **v;
+ int altindex = 0;
+ for (vector<string>::iterator a = variant.alt.begin(); a != variant.alt.end(); ++a, ++altindex) {
+ string haplotype = referenceHaplotype;
+ // get the relative start and end coordinates for the variant alternate allele
+ int relativeStart = variant.position - haplotypeStart;
+ haplotype.replace(relativeStart, variant.ref.size(), *a);
+ haplotypes[haplotype].push_back(make_pair(*v, altindex));
+ }
+ }
+
+
+ // determine the non-intersecting alts
+ // only the presence of the haplotype string is tested; the stored (variant, alt index) pairs are not read here
+ for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+ string haplotype = referenceHaplotype;
+ int relativeStart = var.position - haplotypeStart;
+ haplotype.replace(relativeStart, var.ref.size(), *a);
+ map<string, vector<pair<Variant*, int> > >::iterator h = haplotypes.find(haplotype);
+ if (h == haplotypes.end()) {
+ uniqueAlleles.push_back(&*a);
+ } else {
+ commonAlleles.push_back(&*a);
+ }
+ }
+
+ }
+}
+
+
+// Adds `delta` to the counter that matches the class of `allele`:
+//   SNP     - ref and alt are both a single base
+//   indel   - lengths differ and one side is a single base
+//   complex - everything else
+static void tallyAllele(const VariantAllele& allele, int delta,
+                        int& snps, int& indels, int& complexes) {
+    if (allele.ref.size() == 1 && allele.ref.size() == allele.alt.size()) {
+        snps += delta;
+    } else if (allele.ref.size() != allele.alt.size()
+               && (allele.ref.size() == 1 || allele.alt.size() == 1)) {
+        indels += delta;
+    } else {
+        complexes += delta;
+    }
+}
+
+// Applies tallyAllele to every allele pointer in `alleles`.
+static void tallyAlleles(const vector<VariantAllele*>& alleles, int delta,
+                         int& snps, int& indels, int& complexes) {
+    for (vector<VariantAllele*>::const_iterator va = alleles.begin(); va != alleles.end(); ++va) {
+        tallyAllele(**va, delta, snps, indels, complexes);
+    }
+}
+
+// Entry point of vcfroc.
+//
+// Reads a test VCF (file argument or stdin) and a truth VCF (-t), intersects
+// them allele by allele using local haplotypes built from the reference (-r),
+// and prints one tab-separated row per distinct QUAL value in the test set
+// with the running totals, false positives and false negatives for SNPs,
+// indels and complex alleles. A first row labelled -1 holds the counts before
+// any cutoff is applied.
+//
+// Exits with status 1 when an input cannot be opened or a required option is
+// missing.
+int main(int argc, char** argv) {
+
+    string truthVcfFileName;
+    string fastaFileName;
+    bool complex = false;
+    int windowsize = 30;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    int c;
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"window-size", required_argument, 0, 'w'},
+                {"reference", required_argument, 0, 'r'},
+                // a plain switch, in agreement with "c" (no colon) in the short option string
+                {"complex", no_argument, 0, 'c'},
+                {"truth-vcf", required_argument, 0, 't'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hcw:r:t:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+
+            case 'w':
+                windowsize = atoi(optarg);
+                break;
+
+            case 'r':
+                fastaFileName = string(optarg);
+                break;
+
+            case 't':
+                truthVcfFileName = optarg;
+                break;
+
+            case 'c':
+                complex = true;
+                break;
+
+            case 'h':
+                printSummary(argv);
+                break;
+
+            case '?':
+                // NOTE: printSummary() itself calls exit(0), so the exit(1) below is not reached
+                printSummary(argv);
+                exit(1);
+                break;
+
+            default:
+                abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    bool usingstdin = false;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+        usingstdin = true;
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open VCF file" << endl;
+        exit(1);
+    }
+
+    // without a truth set there is nothing to compare against
+    if (truthVcfFileName.empty()) {
+        cerr << "a truth VCF (-t, --truth-vcf) is required" << endl;
+        exit(1);
+    }
+
+    VariantCallFile truthVariantFile;
+    if (truthVcfFileName == "-") {
+        if (usingstdin) {
+            cerr << "cannot open both VCF file streams from stdin" << endl;
+            exit(1);
+        } else {
+            truthVariantFile.open(std::cin);
+        }
+    } else {
+        truthVariantFile.open(truthVcfFileName);
+    }
+    if (!truthVariantFile.is_open()) {
+        cerr << "could not open VCF file " << truthVcfFileName << endl;
+        exit(1);
+    }
+
+    FastaReference reference;
+    if (fastaFileName.empty()) {
+        cerr << "a reference is required for the haplotype-based intersection used by vcfroc" << endl;
+        exit(1);
+    }
+    reference.open(fastaFileName);
+
+    // read both VCF files into interval trees indexed by sequence name
+
+    map<string, IntervalTree<Variant*> > truthVariantIntervals;
+    list<Variant> truthVariants;
+    buildVariantIntervalTree(truthVariantFile, truthVariantIntervals, truthVariants);
+
+    map<string, IntervalTree<Variant*> > testVariantIntervals;
+    list<Variant> testVariants;
+    buildVariantIntervalTree(variantFile, testVariantIntervals, testVariants);
+
+    map<long double, vector<VariantAllele*> > falseNegativeAllelesAtCutoff; // false negative after this cutoff
+    map<long double, vector<VariantAllele*> > falsePositiveAllelesAtCutoff; // false positive until this cutoff
+    list<VariantAllele*> allFalsePositiveAlleles;
+    map<long double, vector<VariantAllele*> > allelesAtCutoff;
+    // owns every VariantAllele the pointer containers above refer to
+    map<Variant*, map<string, vector<VariantAllele> > > parsedAlleles;
+    map<long double, vector<Variant*> > callsByCutoff;
+
+    // replicate this method, where Q is for each unique Q in the set
+    //vcfintersect -r $reference -v -i $results.$Q.vcf $answers_primitives | vcfstats >false_negatives.$Q.stats
+    //vcfintersect -r $reference -v -i $answers_primitives $results.$Q.vcf | vcfstats >false_positives.$Q.stats
+
+    for (list<Variant>::iterator v = testVariants.begin(); v != testVariants.end(); ++v) {
+        // TODO allow different cutoff sources
+        callsByCutoff[v->quality].push_back(&*v);
+    }
+
+    // add false negatives at any cutoff
+    for (list<Variant>::iterator v = truthVariants.begin(); v != truthVariants.end(); ++v) {
+        Variant& variant = *v;
+        vector<string*> commonAlleles;
+        vector<string*> uniqueAlleles;
+        // honour the -w option instead of the callee's built-in default
+        intersectVariant(variant, testVariantIntervals,
+                         commonAlleles, uniqueAlleles, reference, windowsize);
+        if (complex) {
+            parsedAlleles[&*v] = variant.flatAlternates();
+        } else {
+            parsedAlleles[&*v] = variant.parsedAlternates();
+        }
+        // unique alleles are false negatives regardless of cutoff
+        for (vector<string*>::iterator a = uniqueAlleles.begin(); a != uniqueAlleles.end(); ++a) {
+            vector<VariantAllele>& alleles = parsedAlleles[&*v][**a];
+            for (vector<VariantAllele>::iterator va = alleles.begin(); va != alleles.end(); ++va) {
+                if (va->ref != va->alt) { // use only non-reference alleles
+                    // false negatives at threshold 0 XXX --- may not apply if threshold is generalized
+                    falseNegativeAllelesAtCutoff[-1].push_back(&*va);
+                }
+            }
+        }
+    }
+
+    for (map<long double, vector<Variant*> >::iterator q = callsByCutoff.begin(); q != callsByCutoff.end(); ++q) {
+        long double threshold = q->first;
+        vector<Variant*>& variants = q->second;
+        for (vector<Variant*>::iterator v = variants.begin(); v != variants.end(); ++v) {
+            Variant& variant = **v;
+            vector<string*> commonAlleles;
+            vector<string*> uniqueAlleles;
+            intersectVariant(variant, truthVariantIntervals,
+                             commonAlleles, uniqueAlleles, reference, windowsize);
+            if (complex) {
+                parsedAlleles[*v] = variant.flatAlternates();
+            } else {
+                parsedAlleles[*v] = variant.parsedAlternates();
+            }
+
+            map<string, vector<VariantAllele> >& parsedAlts = parsedAlleles[*v];
+            // alleles shared with the truth set turn into false negatives once the cutoff passes this threshold
+            for (vector<string*>::iterator a = commonAlleles.begin(); a != commonAlleles.end(); ++a) {
+                vector<VariantAllele>& alleles = parsedAlts[**a];
+                for (vector<VariantAllele>::iterator va = alleles.begin(); va != alleles.end(); ++va) {
+                    if (va->ref != va->alt) { // use only non-reference alleles
+                        allelesAtCutoff[threshold].push_back(&*va);
+                        falseNegativeAllelesAtCutoff[threshold].push_back(&*va);
+                    }
+                }
+            }
+            // alleles absent from the truth set are false positives until the cutoff passes this threshold
+            for (vector<string*>::iterator a = uniqueAlleles.begin(); a != uniqueAlleles.end(); ++a) {
+                vector<VariantAllele>& alleles = parsedAlts[**a];
+                for (vector<VariantAllele>::iterator va = alleles.begin(); va != alleles.end(); ++va) {
+                    if (va->ref != va->alt) { // use only non-reference alleles
+                        allelesAtCutoff[threshold].push_back(&*va);
+                        allFalsePositiveAlleles.push_back(&*va);
+                        falsePositiveAllelesAtCutoff[threshold].push_back(&*va);
+                    }
+                }
+            }
+        }
+    }
+
+
+    // output results
+    int totalSNPs = 0;
+    int falsePositiveSNPs = 0;
+    int falseNegativeSNPs = 0;
+    int totalIndels = 0;
+    int falsePositiveIndels = 0;
+    int falseNegativeIndels = 0;
+    int totalComplex = 0;
+    int falsePositiveComplex = 0;
+    int falseNegativeComplex = 0;
+
+    // write header
+
+    cout << "threshold" << "\t"
+         << "num_snps" << "\t"
+         << "false_positive_snps" << "\t"
+         << "false_negative_snps" << "\t"
+         << "num_indels" << "\t"
+         << "false_positive_indels" << "\t"
+         << "false_negative_indels" << "\t"
+         << "num_complex" << "\t"
+         << "false_positive_complex" << "\t"
+         << "false_negative_complex" << endl;
+
+    // count total alleles in set
+    for (map<long double, vector<VariantAllele*> >::iterator a = allelesAtCutoff.begin(); a != allelesAtCutoff.end(); ++a) {
+        tallyAlleles(a->second, +1, totalSNPs, totalIndels, totalComplex);
+    }
+
+    // tally total false positives
+    for (list<VariantAllele*>::iterator va = allFalsePositiveAlleles.begin(); va != allFalsePositiveAlleles.end(); ++va) {
+        tallyAllele(**va, +1, falsePositiveSNPs, falsePositiveIndels, falsePositiveComplex);
+    }
+
+    // get categorical false negatives
+    tallyAlleles(falseNegativeAllelesAtCutoff[-1], +1,
+                 falseNegativeSNPs, falseNegativeIndels, falseNegativeComplex);
+
+    cout << -1 << "\t"
+         << totalSNPs << "\t"
+         << falsePositiveSNPs << "\t"
+         << falseNegativeSNPs << "\t"
+         << totalIndels << "\t"
+         << falsePositiveIndels << "\t"
+         << falseNegativeIndels << "\t"
+         << totalComplex << "\t"
+         << falsePositiveComplex << "\t"
+         << falseNegativeComplex << endl;
+
+    // walk the thresholds in ascending order, retiring the calls made at each one
+    for (map<long double, vector<VariantAllele*> >::iterator a = allelesAtCutoff.begin(); a != allelesAtCutoff.end(); ++a) {
+        long double threshold = a->first;
+
+        // calls at this threshold leave the call set ...
+        tallyAlleles(a->second, -1, totalSNPs, totalIndels, totalComplex);
+        // ... the true ones among them become false negatives ...
+        tallyAlleles(falseNegativeAllelesAtCutoff[threshold], +1,
+                     falseNegativeSNPs, falseNegativeIndels, falseNegativeComplex);
+        // ... and the false ones stop counting as false positives
+        tallyAlleles(falsePositiveAllelesAtCutoff[threshold], -1,
+                     falsePositiveSNPs, falsePositiveIndels, falsePositiveComplex);
+
+        cout << threshold << "\t"
+             << totalSNPs << "\t"
+             << falsePositiveSNPs << "\t"
+             << falseNegativeSNPs << "\t"
+             << totalIndels << "\t"
+             << falsePositiveIndels << "\t"
+             << falseNegativeIndels << "\t"
+             << totalComplex << "\t"
+             << falsePositiveComplex << "\t"
+             << falseNegativeComplex << endl;
+
+    }
+
+    // NOTE(review): exit() ends the process without running the destructors of the locals above;
+    // the original author questioned this too ("why?") -- kept as found
+    exit(0);
+    return 0;
+
+}
+
diff --git a/src/vcfsample2info.cpp b/src/vcfsample2info.cpp
new file mode 100644
index 0000000..2beab59
--- /dev/null
+++ b/src/vcfsample2info.cpp
@@ -0,0 +1,218 @@
+#include "Variant.h"
+#include "split.h"
+#include "fastahack/Fasta.h"
+#include <getopt.h>
+#include <algorithm>
+#include <numeric>
+
+using namespace std;
+using namespace vcf;
+
+// Writes the vcfsample2info usage text to stderr and terminates the process with status 0.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl;
+    // remaining lines of the help text, one array entry per output line
+    static const char* const helpLines[] = {
+        "",
+        "options:",
+        " -f, --field Add information about this field in samples to INFO column",
+        " -i, --info Store the computed statistic in this info field",
+        " -a, --average Take the mean of samples for field (default)",
+        " -m, --median Use the median",
+        " -n, --min Use the min",
+        " -x, --max Use the max",
+        "",
+        "Take annotations given in the per-sample fields and add the mean, median, min, or max",
+        "to the site-level INFO.",
+        ""
+    };
+    const size_t lineCount = sizeof(helpLines) / sizeof(helpLines[0]);
+    for (size_t k = 0; k < lineCount; ++k) {
+        cerr << helpLines[k] << endl;
+    }
+    exit(0);
+}
+
+// Returns the element that would sit at index size()/2 if v were sorted
+// (the upper of the two middle values for an even count).
+// v is partially reordered in place; it must not be empty.
+double median(vector<double> &v)
+{
+    vector<double>::iterator middle = v.begin() + v.size() / 2;
+    nth_element(v.begin(), middle, v.end());
+    return *middle;
+}
+
+// Arithmetic mean of v: the sum of the elements divided by their count.
+double mean(vector<double> &v)
+{
+    double total = 0.0;
+    for (vector<double>::iterator it = v.begin(); it != v.end(); ++it) {
+        total = total + *it;
+    }
+    return total / v.size();
+}
+
+// Summary statistic that main() computes over the per-sample values;
+// selected with -a (MEAN), -m (MEDIAN), -n (MIN) or -x (MAX).
+enum StatType { MEAN, MEDIAN, MIN, MAX };
+
+// Entry point of vcfsample2info.
+//
+// For every record of the input VCF (file argument or stdin), collects the
+// value of the per-sample field given with -f across all samples, reduces the
+// values with the selected statistic (mean by default) and stores the result
+// in the INFO field given with -i. The matching ##INFO header line is added.
+//
+// Samples that lack the field, or whose value cannot be parsed as a number,
+// are skipped. A record with no usable value is written unchanged. A sample
+// field with more than one value is an error.
+//
+// Returns 1 on missing options, unreadable input or multi-valued fields.
+int main(int argc, char** argv) {
+
+    int c;
+    string sampleField;
+    string infoField;
+    StatType statType = MEAN;
+
+    if (argc == 1)
+        printSummary(argv);
+
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"field", required_argument, 0, 'f'},
+                {"info", required_argument, 0, 'i'},
+                {"average", no_argument, 0, 'a'},
+                {"median", no_argument, 0, 'm'},
+                {"min", no_argument, 0, 'n'},
+                {"max", no_argument, 0, 'x'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hamnxf:i:",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+            case 'f':
+                sampleField = optarg;
+                break;
+
+            case 'i':
+                infoField = optarg;
+                break;
+
+            case 'a':
+                statType = MEAN;
+                break;
+
+            case 'm':
+                statType = MEDIAN;
+                break;
+
+            case 'n':
+                statType = MIN;
+                break;
+
+            case 'x':
+                statType = MAX;
+                break;
+
+            case 'h':
+                printSummary(argv);
+                exit(0);
+
+            case '?':
+                /* getopt_long already printed an error message. */
+                // NOTE: printSummary() itself calls exit(0), so the exit(1) below is not reached
+                printSummary(argv);
+                exit(1);
+                break;
+
+            default:
+                abort ();
+        }
+    }
+
+    if (infoField.empty() || sampleField.empty()) {
+        cerr << "Error: both a sample field and an info field are required." << endl;
+        return 1;
+    }
+
+    VariantCallFile variantFile;
+    string inputFilename;
+    if (optind == argc - 1) {
+        inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    string statTypeStr;
+
+    switch (statType) {
+        case MEAN:
+            statTypeStr = "mean";
+            break;
+        case MEDIAN:
+            statTypeStr = "median";
+            break;
+        case MIN:
+            statTypeStr = "min";
+            break;
+        case MAX:
+            statTypeStr = "max";
+            break;
+        default:
+            cerr << "Error: failure to convert stat type to string" << endl;
+            return 1;
+            break;
+    }
+
+    // "generated by " needs its trailing blank, otherwise the text reads e.g. "bymean"
+    variantFile.addHeaderLine("##INFO=<ID="+infoField+",Number=1,Type=Float,Description=\"Summary statistic generated by "+statTypeStr+" of per-sample values of "+sampleField+" \">");
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        vector<double> vals;
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin();
+             s != var.samples.end(); ++s) {
+            map<string, vector<string> >& sample = s->second;
+            map<string, vector<string> >::iterator field = sample.find(sampleField);
+            // absent or value-less field: nothing to contribute
+            if (field == sample.end() || field->second.empty()) {
+                continue;
+            }
+            if (field->second.size() > 1) {
+                cerr << "Error: cannot handle sample fields with multiple values" << endl;
+                return 1;
+            }
+            double val;
+            // keep only values that parse as numbers
+            if (convert(field->second.front(), val)) {
+                vals.push_back(val);
+            }
+        }
+
+        // no usable value: min/max/median would read an empty range, so pass the record through untouched
+        if (vals.empty()) {
+            cout << var << endl;
+            continue;
+        }
+
+        double result;
+        switch (statType) {
+            case MEAN:
+                result = mean(vals);
+                break;
+            case MEDIAN:
+                result = median(vals);
+                break;
+            case MIN:
+                result = *min_element(vals.begin(), vals.end());
+                break;
+            case MAX:
+                result = *max_element(vals.begin(), vals.end());
+                break;
+            default:
+                cerr << "Error: unrecognized StatType" << endl;
+                return 1;
+                break;
+        }
+
+        var.info[infoField].clear();
+        var.info[infoField].push_back(convert(result));
+
+        cout << var << endl;
+
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfsamplediff.cpp b/src/vcfsamplediff.cpp
new file mode 100644
index 0000000..09ca242
--- /dev/null
+++ b/src/vcfsamplediff.cpp
@@ -0,0 +1,200 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+// True when two of the listed samples carry different GT strings in `var`.
+// Samples missing from the record, or without a GT field, are ignored.
+bool samplesDiffer(vector<string>& samples, Variant& var) {
+
+    typedef map<string, vector<string> > SampleFields;
+
+    string firstSeen; // GT of the first sample that has one; empty until then
+
+    for (size_t i = 0; i < samples.size(); ++i) {
+        map<string, SampleFields>::iterator entry = var.samples.find(samples[i]);
+        if (entry == var.samples.end()) {
+            continue;
+        }
+        SampleFields::iterator gt = entry->second.find("GT");
+        if (gt == entry->second.end()) {
+            continue;
+        }
+        string& current = gt->second.front();
+        if (firstSeen.empty()) {
+            firstSeen = current;
+        } else if (firstSeen != current) {
+            return true;
+        }
+    }
+
+    return false;
+
+}
+
+
+// Writes the vcfsamplediff usage text to stderr. Does not exit; callers choose the exit status.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] <tag> <sample> <sample> [ <sample> ... ] <vcf file>" << endl;
+    // remaining lines of the help text, one array entry per output line
+    static const char* const helpLines[] = {
+        "Tags each record where the listed sample genotypes differ with <tag>.",
+        "The first sample is assumed to be germline, the second somatic.",
+        "Each record is tagged with <tag>={germline,somatic,loh} to specify the type of",
+        "variant given the genotype difference between the two samples.",
+        "",
+        "options:",
+        " -s --strict Require that no observations in the germline support the somatic alternate.",
+        ""
+    };
+    const size_t lineCount = sizeof(helpLines) / sizeof(helpLines[0]);
+    for (size_t k = 0; k < lineCount; ++k) {
+        cerr << helpLines[k] << endl;
+    }
+}
+
+
+// Entry point of vcfsamplediff.
+//
+// Positional arguments: <tag> <germline sample> <somatic sample> <vcf file>
+// ("-" reads stdin). Every record is written out; when both samples are
+// present and carry a GT value, INFO[<tag>] is set to "germline", "somatic"
+// or "reversion" according to how the two genotypes relate (it is left empty
+// when no rule matches). When both samples also have GQ, INFO[SSC] is set to
+// the minimum of QUAL and the two GQ values.
+//
+// With -s/--strict a record is only tagged "somatic" through the first rule
+// when the germline AO count is 0.
+//
+// Returns 1 when the file cannot be opened or the number of samples is not two.
+int main(int argc, char** argv) {
+
+    bool strict = false;
+    int c;
+
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"strict", no_argument, 0, 's'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hs",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+            case 's':
+                strict = true;
+                break;
+
+            case 'h':
+                printSummary(argv);
+                exit(0);
+                break;
+
+            case '?':
+                /* getopt_long already printed an error message. */
+                printSummary(argv);
+                exit(1);
+                break;
+
+            default:
+                abort ();
+        }
+    }
+
+    if(argc - optind < 4) {
+        printSummary(argv);
+        exit(0);
+    }
+
+    string tag = argv[optind];
+
+    vector<string> samples;
+    for (int i = optind+1; i < argc - 1; ++i) {
+        samples.push_back(argv[i]);
+    }
+
+    // the classification below is defined for exactly one germline and one somatic sample;
+    // checked at run time so the guard does not vanish in NDEBUG builds
+    if (samples.size() != 2) {
+        cerr << "error: exactly two samples (germline, somatic) are required, got "
+             << samples.size() << endl;
+        return 1;
+    }
+
+    string filename = argv[argc-1];
+
+    VariantCallFile variantFile;
+    if (filename == "-") {
+        variantFile.open(std::cin);
+    } else {
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        cerr << "could not open " << filename << endl;
+        return 1;
+    }
+
+    Variant var(variantFile);
+
+    // describe the tag in the header
+    string line = "##INFO=<ID=" + tag + ",Number=1,Type=String,Description=\"Samples";
+    for (vector<string>::iterator s = samples.begin(); s != samples.end(); ++s) {
+        line += " " + *s;
+    }
+    line += " have different genotypes\">";
+    variantFile.addHeaderLine(line);
+
+    variantFile.addHeaderLine("##INFO=<ID=SSC,Number=1,Type=Float,Description=\"Somatic variant score (phred-scaled probability that the somatic variant call is correct).\">");
+
+    // write the new header
+    cout << variantFile.header << endl;
+
+    // every record is printed; only those with both samples genotyped get tagged
+    while (variantFile.getNextVariant(var)) {
+        map<string, map<string, vector<string> > >::iterator germlineEntry = var.samples.find(samples.front());
+        map<string, map<string, vector<string> > >::iterator somaticEntry = var.samples.find(samples.back());
+        if (germlineEntry != var.samples.end() && somaticEntry != var.samples.end()) {
+            map<string, vector<string> >& germline = germlineEntry->second;
+            map<string, vector<string> >& somatic = somaticEntry->second;
+            map<string, vector<string> >::iterator germlineGT = germline.find("GT");
+            map<string, vector<string> >::iterator somaticGT = somatic.find("GT");
+            // a sample without a GT value cannot be classified; front() on an empty vector is undefined
+            if (germlineGT != germline.end() && !germlineGT->second.empty()
+                && somaticGT != somatic.end() && !somaticGT->second.empty()) {
+                map<int, int> gtGermline = decomposeGenotype(germlineGT->second.front());
+                map<int, int> gtSomatic = decomposeGenotype(somaticGT->second.front());
+                int germlineAltCount = 0;
+                map<string, vector<string> >::iterator ao = germline.find("AO");
+                if (ao != germline.end() && !ao->second.empty()) {
+                    convert(ao->second.front(), germlineAltCount);
+                }
+                var.info[tag].clear(); // remove previous
+                if (gtGermline == gtSomatic) {
+                    var.info[tag].push_back("germline");
+                } else {
+                    if ((isHet(gtGermline) && isHomNonRef(gtSomatic)) ||
+                        (isHomRef(gtGermline) && (isHet(gtSomatic) || isHomNonRef(gtSomatic)))) {
+                        if (!strict || germlineAltCount == 0) {
+                            var.info[tag].push_back("somatic");
+                        }
+                    } else if (isHom(gtGermline) && isHet(gtSomatic)) {
+                        if (var.alt.size() == 1) {
+                            var.info[tag].push_back("reversion");
+                        } else {
+                            var.info[tag].push_back("somatic");
+                        }
+                    }
+                }
+                map<string, vector<string> >::iterator germlineGQField = germline.find("GQ");
+                map<string, vector<string> >::iterator somaticGQField = somatic.find("GQ");
+                if (germlineGQField != germline.end() && !germlineGQField->second.empty()
+                    && somaticGQField != somatic.end() && !somaticGQField->second.empty()) {
+                    double germlineGQ;
+                    convert(germlineGQField->second.front(), germlineGQ);
+                    double somaticGQ;
+                    convert(somaticGQField->second.front(), somaticGQ);
+                    double somaticScore = min(var.quality, min(germlineGQ, somaticGQ));
+                    var.info["SSC"].clear();
+                    var.info["SSC"].push_back(convert(somaticScore));
+                }
+            }
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfsamplenames.cpp b/src/vcfsamplenames.cpp
new file mode 100644
index 0000000..23f68f7
--- /dev/null
+++ b/src/vcfsamplenames.cpp
@@ -0,0 +1,29 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+// Entry point: prints the sample names of a VCF, one per line.
+// Reads the file named by argv[1], or stdin when no argument is given.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+
+    if (argc <= 1) {
+        variantFile.open(std::cin);
+    } else {
+        string filename = argv[1];
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    const vector<string>& names = variantFile.sampleNames;
+    for (size_t i = 0; i < names.size(); ++i) {
+        cout << names[i] << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfsamplestats.cpp b/src/vcfsamplestats.cpp
new file mode 100644
index 0000000..ceceb98
--- /dev/null
+++ b/src/vcfsamplestats.cpp
@@ -0,0 +1,193 @@
+#include "Variant.h"
+#include "split.h"
+#include <string>
+#include <iostream>
+#include <getopt.h>
+
+using namespace vcf;
+
+
+// Writes the vcfsamplestats usage text to stderr and terminates the process with status 0.
+void printSummary(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl;
+    // remaining lines of the help text, one array entry per output line
+    static const char* const helpLines[] = {
+        "options:",
+        "",
+        " -h, --help this dialog",
+        "",
+        "By default, output a table of this form:",
+        "sample sitecount refcount altcount homcount hetcount avg_gq avg_dp",
+        "",
+        "for each sample in the VCF file.",
+        "Reads from stdin if no file is specified on the command line.",
+        ""
+    };
+    const size_t lineCount = sizeof(helpLines) / sizeof(helpLines[0]);
+    for (size_t k = 0; k < lineCount; ++k) {
+        cerr << helpLines[k] << endl;
+    }
+    exit(0);
+}
+
+
+// Entry point of vcfsamplestats.
+//
+// Reads a VCF (file argument or stdin) and prints one tab-separated row per
+// sample: number of sites, reference and alternate allele counts, number of
+// homozygous sites, alternate alleles seen in non-homozygous-alt genotypes
+// ("hetcount"), and the per-site averages of GQ and DP.
+//
+// GQ, DP and GT are each optional per sample and record; a missing or
+// unparsable GQ/DP adds nothing to its sum, and a record without GT adds
+// nothing to the allele counters.
+int main(int argc, char** argv) {
+
+    int c;
+
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "h",
+                         long_options, &option_index);
+
+        /* Detect the end of the options. */
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+            case 'h':
+                printSummary(argv);
+                exit(0);
+                break;
+
+            case '?':
+                /* getopt_long already printed an error message. */
+                // NOTE: printSummary() itself calls exit(0), so the exit(1) below is not reached
+                printSummary(argv);
+                exit(1);
+                break;
+
+            default:
+                abort ();
+        }
+    }
+
+    VariantCallFile variantFile;
+    if (optind == argc - 1) {
+        string inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+
+    map<string, int> sitecount;
+    map<string, int> refcount;
+    map<string, int> altcount;
+    map<string, int> homcount;
+    map<string, int> hetcount;
+    map<string, int> gqsum;
+    map<string, int> dpsum;
+
+    // make sure every sample of the header has a row, even if it never shows up in a record
+    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
+        string& sample = *s;
+        sitecount[sample] = 0;
+        refcount[sample] = 0;
+        altcount[sample] = 0;
+        homcount[sample] = 0;
+        hetcount[sample] = 0;
+        gqsum[sample] = 0;
+        dpsum[sample] = 0;
+    }
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+
+        for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
+
+            string name = s->first;
+            map<string, vector<string> >& sample = s->second;
+            map<string, vector<string> >::iterator field;
+
+            sitecount[name] += 1;
+
+            // look fields up with find(): operator[] followed by front() is undefined when the key is absent
+            int gq;
+            field = sample.find("GQ");
+            if (field != sample.end() && !field->second.empty() && convert(field->second.front(), gq)) {
+                gqsum[name] += gq;
+            }
+
+            int dp;
+            field = sample.find("DP");
+            if (field != sample.end() && !field->second.empty() && convert(field->second.front(), dp)) {
+                dpsum[name] += dp;
+            }
+
+            field = sample.find("GT");
+            if (field == sample.end() || field->second.empty()) {
+                continue; // no genotype, nothing to count
+            }
+            string& genotype = field->second.front();
+            vector<string> gt = split(genotype, "|/");
+
+            size_t alt = 0;
+            size_t ref = 0;
+
+            for (vector<string>::iterator g = gt.begin(); g != gt.end(); ++g) {
+                if (*g != "0") {
+                    ++alt;
+                } else {
+                    ++ref;
+                }
+            }
+
+            if (alt != gt.size()) {
+                hetcount[name] += alt;
+            }
+
+            if (alt == gt.size() || ref == gt.size()) {
+                homcount[name] += 1;
+            }
+
+            refcount[name] += ref;
+            altcount[name] += alt;
+
+        }
+    }
+
+    cout << "sample" << "\t"
+         << "sitecount" << "\t"
+         << "refcount" << "\t"
+         << "altcount" << "\t"
+         << "homcount" << "\t"
+         << "hetcount" << "\t"
+         << "avg_gq" << "\t"
+         << "avg_dp" << endl;
+    for (vector<string>::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) {
+        string& sample = *s;
+        cout << sample << "\t"
+
+             << sitecount[sample] << "\t"
+             << refcount[sample] << "\t"
+             << altcount[sample] << "\t"
+             << homcount[sample] << "\t"
+             << hetcount[sample] << "\t"
+             << (float) gqsum[sample] / (float) sitecount[sample] << "\t"
+             << (float) dpsum[sample] / (float) sitecount[sample]
+             << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfsitesummarize.cpp b/src/vcfsitesummarize.cpp
new file mode 100644
index 0000000..067d0d4
--- /dev/null
+++ b/src/vcfsitesummarize.cpp
@@ -0,0 +1,94 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+// Entry point of vcfsitesummarize.
+//
+// Prints a tab-separated table with one row per record: CHROM, POS, ID, REF,
+// QUAL, FILTER, then one column per INFO field declared in the header whose
+// count is not ALLELE_NUMBER (value fields first, flag fields last).
+// A value column is left empty unless the record holds exactly one value for
+// it; a flag column prints 1 when the flag is present and 0 otherwise.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+
+    if (argc <= 1) {
+        variantFile.open(std::cin);
+    } else {
+        string filename = argv[1];
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    // split the declared INFO fields into value columns and flag columns
+    vector<string> valueColumns;
+    vector<string> flagColumns;
+
+    typedef map<string, VariantFieldType>::iterator TypeIter;
+    for (TypeIter t = variantFile.infoTypes.begin(); t != variantFile.infoTypes.end(); ++t) {
+        if (variantFile.infoCounts[t->first] == ALLELE_NUMBER) {
+            continue;
+        }
+        vector<string>& target = (t->second == FIELD_BOOL) ? flagColumns : valueColumns;
+        target.push_back(t->first);
+    }
+
+    // header row: fixed columns, then the configurable ones
+    cout << "CHROM\tPOS\tID\tREF\tQUAL\tFILTER";
+    for (size_t k = 0; k < valueColumns.size(); ++k) {
+        cout << "\t" << valueColumns[k];
+    }
+    for (size_t k = 0; k < flagColumns.size(); ++k) {
+        cout << "\t" << flagColumns[k];
+    }
+    cout << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+
+        cout << var.sequenceName << "\t"
+             << var.position << "\t"
+             << var.id << "\t"
+             << var.ref << "\t"
+             << var.quality << "\t"
+             << var.filter;
+
+        for (size_t k = 0; k < valueColumns.size(); ++k) {
+            cout << "\t";
+            map<string, vector<string> >::iterator found = var.info.find(valueColumns[k]);
+            // anything other than exactly one value is reported as null (empty cell)
+            if (found != var.info.end() && found->second.size() == 1) {
+                cout << found->second.front();
+            }
+        }
+
+        for (size_t k = 0; k < flagColumns.size(); ++k) {
+            map<string, bool>::iterator found = var.infoFlags.find(flagColumns[k]);
+            cout << "\t" << (found != var.infoFlags.end() ? 1 : 0);
+        }
+
+        cout << endl;
+
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfsom.cpp b/src/vcfsom.cpp
new file mode 100644
index 0000000..1e53ec1
--- /dev/null
+++ b/src/vcfsom.cpp
@@ -0,0 +1,626 @@
+#include "Variant.h"
+#include "split.h"
+#include "convert.h"
+#include <string>
+#include <iostream>
+#include <set>
+#include <sys/time.h>
+#include "fsom/fsom.h"
+#include <getopt.h>
+#include <cmath>
+
+using namespace std;
+using namespace vcf;
+
+ // Arithmetic mean of `data`.
+ // The sum is divided by data.size() without a guard, so an empty vector
+ // yields NaN (0.0 / 0).
+ double mean(const vector<double>& data) {
+     double sum = 0;
+     const size_t count = data.size();
+     for (size_t k = 0; k < count; ++k) {
+         sum += data[k];
+     }
+     return sum / count;
+ }
+
+ // Median of `data`.
+ //
+ // Side effect: the caller's vector is sorted in place (ascending).
+ // Returns 0 for an empty vector; the previous code indexed data[size/2-1]
+ // on an empty vector, which is out of range.
+ double median(vector <double>& data) {
+     const size_t size = data.size();
+     if (size == 0) {
+         return 0;
+     }
+     // ascending order
+     sort(data.begin(), data.end());
+     if (size % 2 == 0) {
+         // even count: average the two central values
+         return (data[size/2-1] + data[size/2]) / 2;
+     }
+     // odd count: the single middle value
+     return data[size/2];
+ }
+
+ // Population variance of `data` around the supplied `mean`: the sum of
+ // squared deviations divided by data.size() (not size - 1).
+ // An empty vector yields NaN, as the division is unguarded.
+ double variance(const vector <double>& data, const double mean) {
+     double sum_sq = 0;
+     for (size_t k = 0; k < data.size(); ++k) {
+         const double delta = data[k] - mean;
+         sum_sq += delta * delta;
+     }
+     return sum_sq / (data.size());
+ }
+
+ // Population standard deviation: the square root of variance(data, mean).
+ double standard_deviation(const vector <double>& data, const double mean) {
+     const double var = variance(data, mean);
+     return sqrt(var);
+ }
+
+ // Per-field normalization constants.
+ // Defaults (mean 0, stdev 1) make (v - mean) / stdev the identity.
+ struct Stats {
+     double mean;   // value subtracted from each input
+     double stdev;  // value each centered input is divided by
+
+     Stats()
+         : mean(0)
+         , stdev(1)
+     {}
+ };
+
+ // Reads SOM metadata written by save_som_metadata().
+ //
+ // File layout: first line "<x> <y>" (tab or space separated), then one line
+ // per field: "<field_name> <mean> <stdev>".
+ //
+ // On success sets x and y, appends each field name to `fields` (in file
+ // order) and stores its mean/stdev in `stats`, then returns true.
+ // Returns false if the file cannot be opened or is malformed (missing
+ // dimensions line, a field line with fewer than three columns, or a value
+ // that convert() rejects). Blank lines are skipped. Outputs may be partially
+ // filled when false is returned.
+ bool load_som_metadata(string& som_metadata_file, int& x, int& y, vector<string>& fields, map<string, Stats>& stats) {
+     ifstream in(som_metadata_file.c_str());
+     if (!in.is_open()) {
+         return false;
+     }
+     string linebuf;
+     if (!getline(in, linebuf)) {
+         return false; // empty file: no dimensions line
+     }
+     vector<string> xy = split(linebuf, "\t ");
+     if (xy.size() < 2 || !convert(xy.front(), x) || !convert(xy.back(), y)) {
+         return false;
+     }
+     while (getline(in, linebuf)) {
+         if (linebuf.empty()) {
+             continue; // tolerate blank / trailing empty lines
+         }
+         // format is: field_name, mean, stdev
+         vector<string> m = split(linebuf, "\t ");
+         if (m.size() < 3) {
+             return false; // previously indexed m[1] / m[2] out of range
+         }
+         Stats parsed;
+         if (!convert(m[1], parsed.mean) || !convert(m[2], parsed.stdev)) {
+             return false;
+         }
+         fields.push_back(m[0]);
+         stats[m[0]] = parsed;
+     }
+     in.close();
+     return true;
+ }
+
+ // Writes SOM metadata in the layout load_som_metadata() reads: a first line
+ // "<x>\t<y>", then "<field>\t<mean>\t<stdev>" for every entry of `fields`.
+ //
+ // Note: stats[*f] default-constructs an entry for any field missing from
+ // `stats`, so such fields are written with the Stats defaults.
+ //
+ // Returns false if the file cannot be opened or if any write (including the
+ // final flush on close) failed; previously write errors were reported as
+ // success.
+ bool save_som_metadata(string& som_metadata_file, int x, int y, vector<string>& fields, map<string, Stats>& stats) {
+     ofstream out(som_metadata_file.c_str());
+     if (!out.is_open()) {
+         return false;
+     }
+     out << x << "\t" << y << endl;
+     for (vector<string>::iterator f = fields.begin(); f != fields.end(); ++f) {
+         Stats& s = stats[*f];
+         out << *f << "\t" << s.mean << "\t" << s.stdev << endl;
+     }
+     out.close();
+     return !out.fail();
+ }
+
+ // Normalizes `record` in place: element k becomes
+ // (record[k] - mean) / stdev using the Stats of fields[k].
+ //
+ // The walk stops at the shorter of `record` and `fields`, so a short record
+ // no longer advances the iterator past its end. A field whose stdev is 0 is
+ // only centered, which avoids producing inf/NaN from a division by zero.
+ void normalize_inputs(vector<double>& record, vector<string>& fields, map<string, Stats>& stats) {
+     vector<double>::iterator r = record.begin();
+     vector<string>::iterator f = fields.begin();
+     for (; f != fields.end() && r != record.end(); ++f, ++r) {
+         const Stats& s = stats[*f];
+         const double centered = *r - s.mean;
+         *r = (s.stdev != 0) ? centered / s.stdev : centered;
+     }
+ }
+
+ // Appends one numeric value per entry of `fields` to `record`, read from
+ // `var` for the alternate allele at index `ai`.
+ //
+ //  - "QUAL" is taken from var.quality.
+ //  - Any other name is looked up in var.info. If the header declares the
+ //    field with a count of 1 the single value is used, otherwise the value
+ //    at index `ai` is used.
+ //  - A field that is absent, has no value at the required index, or does not
+ //    parse contributes 0 unless convert() stored a partial value.
+ //    Previously a missing index was an out-of-range access and a failed
+ //    conversion could reuse the previous field's value.
+ void read_fields(Variant& var, int ai, vector<string>& fields, vector<double>& record) {
+     for (vector<string>::iterator j = fields.begin(); j != fields.end(); ++j) {
+         double td = 0;
+         if (*j == "QUAL") { // special handling...
+             td = var.quality;
+         } else {
+             map<string, vector<string> >::iterator found = var.info.find(*j);
+             if (found != var.info.end()) {
+                 const vector<string>& values = found->second;
+                 // for non Allele-variant fields use the single value
+                 const size_t idx = (var.vcf->infoCounts[*j] == 1) ? 0 : (size_t) ai;
+                 if (idx < values.size()) {
+                     convert(values[idx], td);
+                 }
+             }
+         }
+         record.push_back(td);
+     }
+ }
+
+ // Per-node tally used when "painting" the map with labelled variants.
+ struct SomPaint {
+     int true_count;    // true-set alleles whose best neuron is this node
+     int false_count;   // false-set alleles whose best neuron is this node
+     double prob_true;  // true_count / (true_count + false_count); 0 until computed
+
+     SomPaint()
+         : true_count(0)
+         , false_count(0)
+         , prob_true(0)
+     {}
+ };
+
+ // Timestamp (microseconds) recorded by the last start_timer() call.
+ static unsigned long prev_uticks = 0;
+
+ // Current wall-clock time from gettimeofday(), in microseconds.
+ // The multiplication is done in unsigned long so that it cannot trigger
+ // signed overflow on platforms where tv_sec is a 32-bit signed type; there
+ // the result wraps, which is harmless for the differences taken by
+ // print_timing().
+ static unsigned long get_uticks(){
+     struct timeval ts;
+     gettimeofday(&ts,0);
+     return ((unsigned long) ts.tv_sec * 1000000UL) + (unsigned long) ts.tv_usec;
+ }
+
+ // Records the current time as the reference point for print_timing().
+ static void start_timer(){
+     const unsigned long now = get_uticks();
+     prev_uticks = now;
+ }
+
+ // Prints to stderr the time elapsed since the last start_timer() /
+ // print_timing() call, labelled with `msg` and scaled to the largest unit
+ // (us, ms, s, m, h) that fits, then restarts the timer.
+ static void print_timing( const char *msg ){
+ #define MS_DELTA (1000.0)
+ #define SS_DELTA (MS_DELTA * 1000.0)
+ #define MM_DELTA (SS_DELTA * 60.0)
+ #define HH_DELTA (MM_DELTA * 60.0)
+
+     const double elapsed = get_uticks() - prev_uticks; // microseconds
+
+     // test the largest unit first; each branch mirrors one unit
+     if( elapsed >= HH_DELTA ){
+         fprintf(stderr, "%s\t : %lf h\n", msg, elapsed / HH_DELTA );
+     }
+     else if( elapsed >= MM_DELTA ){
+         fprintf(stderr, "%s\t : %lf m\n", msg, elapsed / MM_DELTA );
+     }
+     else if( elapsed >= SS_DELTA ){
+         fprintf(stderr, "%s\t : %lf s\n", msg, elapsed / SS_DELTA );
+     }
+     else if( elapsed >= MS_DELTA ){
+         fprintf(stderr, "%s\t : %lf ms\n", msg, elapsed / MS_DELTA );
+     }
+     else{
+         fprintf(stderr, "%s\t : %lf us\n", msg, elapsed );
+     }
+
+     start_timer();
+ }
+
+
+ // Prints the vcfsom usage text to stderr. argv[0] is used as the program name.
+ //
+ // NOTE(review): -R/--paint-tag and -N/--false-negative are listed below but
+ // are not present in the getopt_long() option string or long_options table
+ // used by main() in this file, so they are currently not accepted.
+ void printSummary(char** argv) {
+     cerr << "usage: " << argv[0] << " [options] [vcf file]" << endl
+          << endl
+          << "training: " << endl
+          << " " << argv[0] << " -s output.som -x 20 -y 20 -f \"AF DP ABP\" training.vcf" << endl
+          << endl
+          << "application: " << endl
+          << " " << argv[0] << " -a output.som test.vcf >results.vcf" << endl
+          << endl
+          // fix: a separating space was missing after the program name
+          << argv[0] << " trains and/or applies a self-organizing map to the input VCF data" << endl
+          << "on stdin, adding two columns for the x and y coordinates of the winning" << endl
+          << "neuron in the network and an optional euclidean distance from a given" << endl
+          << "node (--center)." << endl
+          << endl
+          << "If a map is provided via --apply, it will be applied to input without" << endl
+          << "training. A .meta file describing network parameters and input parameter" << endl
+          << "distributions is used to automatically setup the network." << endl
+          << endl
+          << "options:" << endl
+          << endl
+          << " -h, --help this dialog" << endl
+          << endl
+          << "training:" << endl
+          << endl
+          << " -f, --fields \"FIELD ...\" INFO fields to provide to the SOM" << endl
+          << " -a, --apply FILE apply the saved map to input data to FILE" << endl
+          << " -s, --save FILE train on input data and save the map to FILE" << endl
+          << " -p, --print-training-results" << endl
+          << " print results of SOM on training input" << endl
+          << " (you can also just use --apply on the same input)" << endl
+          << " -x, --width X width in columns of the output array" << endl
+          // fix: height was described as "in columns"
+          << " -y, --height Y height in rows of the output array" << endl
+          << " -i, --iterations N number of training iterations or epochs" << endl
+          << " -d, --debug print timing information" << endl
+          << endl
+          << "recalibration:" << endl
+          << endl
+          << " -c, --center X,Y annotate with euclidean distance from center" << endl
+          << " -T, --paint-true VCF use VCF file to annotate true variants (multiple)" << endl
+          << " -F, --paint-false VCF use VCF file to annotate false variants (multiple)" << endl
+          << " -R, --paint-tag TAG provide estimated FDR% in TAG in variant INFO" << endl
+          << " -N, --false-negative replace FDR% (false detection) with FNR% (false negative)" << endl;
+
+ }
+
+
+ // vcfsom entry point: trains a self-organizing map on INFO fields of the
+ // input VCF (--save) or applies a previously saved map (--apply), annotating
+ // each alternate allele with the coordinates of its best-matching neuron.
+ //
+ // Changes relative to the previous version:
+ //  - training without any --fields, or building a network with no input
+ //    vectors, is reported as an error instead of indexing empty vectors;
+ //  - --center is validated (previously a value without a comma threw, and
+ //    centerx/centery were left uninitialized);
+ //  - the --paint-true / --paint-false files are checked after opening;
+ //  - a field with zero spread gets stdev 1 so normalization cannot emit NaN;
+ //  - the network-creation error goes to stderr, not into the VCF on stdout.
+ //
+ // Returns 0 on success and 1 on any setup error.
+ int main(int argc, char** argv) {
+
+     int width = 100;
+     int height = 100;
+     int iterations = 1000;
+     string som_file;
+     string som_metadata_file;
+     bool apply = false;
+     bool train = false;
+     bool apply_to_training_data = false; // print results against training data
+     bool debug = false;
+     vector<string> fields;
+     vector<string> centerv;
+     int centerx = 0;
+     int centery = 0;
+     string trueVCF;
+     string falseVCF;
+     bool normalize = true;
+
+     int c;
+
+     if (argc == 1) {
+         printSummary(argv);
+         exit(1);
+     }
+
+     while (true) {
+         static struct option long_options[] =
+         {
+             /* These options set a flag. */
+             //{"verbose", no_argument, &verbose_flag, 1},
+             {"help", no_argument, 0, 'h'},
+             {"iterations", required_argument, 0, 'i'},
+             {"width", required_argument, 0, 'x'},
+             {"height", required_argument, 0, 'y'},
+             {"apply", required_argument, 0, 'a'},
+             {"save", required_argument, 0, 's'},
+             {"fields", required_argument, 0, 'f'},
+             {"print-training-results", no_argument, 0, 'p'},
+             {"center", required_argument, 0, 'c'},
+             {"paint-true", required_argument, 0, 'T'},
+             {"paint-false", required_argument, 0, 'F'},
+             {"debug", no_argument, 0, 'd'},
+             {0, 0, 0, 0}
+         };
+         /* getopt_long stores the option index here. */
+         int option_index = 0;
+
+         c = getopt_long (argc, argv, "hpdi:x:y:a:s:f:c:T:F:",
+                          long_options, &option_index);
+
+         if (c == -1)
+             break;
+
+         switch (c)
+         {
+
+         case 'x':
+             if (!convert(optarg, width)) {
+                 cerr << "could not parse --width, -x" << endl;
+                 exit(1);
+             }
+             break;
+
+         case 'y':
+             if (!convert(optarg, height)) {
+                 cerr << "could not parse --height, -y" << endl;
+                 exit(1);
+             }
+             break;
+
+         case 'i':
+             if (!convert(optarg, iterations)) {
+                 cerr << "could not parse --iterations, -i" << endl;
+                 exit(1);
+             }
+             break;
+
+         case 'p':
+             apply_to_training_data = true;
+             break;
+
+         case 'T':
+             trueVCF = optarg;
+             break;
+
+         case 'F':
+             falseVCF = optarg;
+             break;
+
+         case 'd':
+             debug = true;
+             break;
+
+         case 'a':
+             som_file = optarg;
+             apply = true;
+             break;
+
+         case 's':
+             som_file = optarg;
+             train = true;
+             break;
+
+         case 'f':
+             fields = split(string(optarg), ' ');
+             break;
+
+         case 'c':
+             centerv = split(string(optarg), ',');
+             // require both coordinates; the first two parts are used
+             if (centerv.size() < 2
+                 || !convert(centerv[0], centerx)
+                 || !convert(centerv[1], centery)) {
+                 cerr << "could not parse --center, -c" << endl;
+                 exit(1);
+             }
+             break;
+
+         case 'h':
+             printSummary(argv);
+             exit(0);
+             break;
+
+         default:
+             break;
+         }
+     }
+
+     if (train && fields.empty()) {
+         // every training record would be empty and &record[0] invalid
+         cerr << "no fields specified for training, use --fields" << endl;
+         return 1;
+     }
+
+     som_network_t *net = NULL;
+     vector<vector<double> > data;  // one input vector per alternate allele
+     map<string, Stats> stats;      // per-field normalization constants
+
+     VariantCallFile variantFile;
+     string inputFilename;
+     if (optind == argc - 1) {
+         inputFilename = argv[optind];
+         variantFile.open(inputFilename);
+     } else {
+         variantFile.open(std::cin);
+     }
+
+     if (!variantFile.is_open()) {
+         cerr << "could not open VCF file" << endl;
+         return 1;
+     }
+
+     som_metadata_file = som_file + ".meta";
+
+     Variant var(variantFile);
+
+     variantFile.addHeaderLine("##INFO=<ID=SOMX,Number=A,Type=Integer,Description=\"X position of best neuron for variant in self-ordering map defined in " + som_file + "\">");
+     variantFile.addHeaderLine("##INFO=<ID=SOMY,Number=A,Type=Integer,Description=\"Y position of best neuron for variant in self-ordering map defined in " + som_file + "\">");
+     if (!centerv.empty()) {
+         variantFile.addHeaderLine("##INFO=<ID=SOMD,Number=A,Type=Float,Description=\"Euclidean distance from "
+                                   + convert(centerx) + "," + convert(centery) + " as defined by " + som_file + "\">");
+     }
+     if (!trueVCF.empty() && !falseVCF.empty()) {
+         variantFile.addHeaderLine("##INFO=<ID=SOMP,Number=A,Type=Float,Description=\"Estimated probability the variant is true using som "
+                                   + som_file + ", true variants from " + trueVCF + ", and false variants from " + falseVCF + "\">");
+     }
+
+     if (debug) start_timer();
+
+     // kept for the (currently disabled) --print-training-results path below
+     vector<Variant> variants;
+     if (train) {
+         while (variantFile.getNextVariant(var)) {
+             variants.push_back(var);
+             int ai = 0;
+             for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a, ++ai) {
+                 vector<double> record;
+                 // same per-field extraction as used when applying the map
+                 read_fields(var, ai, fields, record);
+                 data.push_back(record);
+             }
+         }
+         // normalize inputs
+         if (normalize) {
+             // get normalization vector
+             // goal is normalization at 0, sd=1
+             size_t col = 0;
+             for (vector<string>::iterator f = fields.begin(); f != fields.end(); ++f, ++col) {
+                 vector<double> fv;
+                 for (vector<vector<double> >::iterator d = data.begin(); d != data.end(); ++d) {
+                     fv.push_back(d->at(col));
+                 }
+                 Stats& s = stats[*f];
+                 // get normalization constants
+                 s.mean = mean(fv);
+                 s.stdev = standard_deviation(fv, s.mean);
+                 if (!(s.stdev > 0)) {
+                     // constant (or empty) column: dividing would give NaN/inf
+                     s.stdev = 1;
+                 }
+                 // normalize
+                 for (vector<vector<double> >::iterator d = data.begin(); d != data.end(); ++d) {
+                     double v = d->at(col);
+                     d->at(col) = (v - s.mean) / s.stdev;
+                 }
+             }
+         }
+     }
+
+     // raw pointers into each record, as required by the fsom C interface;
+     // records are non-empty here because training requires fields
+     vector<double*> dataptrs (data.size());
+     for (unsigned i=0, e=dataptrs.size(); i<e; ++i) {
+         dataptrs[i] = &(data[i][0]);
+     }
+
+     if (debug) print_timing( "Input Processing" );
+
+     if (apply) {
+         if (! (net = som_deserialize(som_file.c_str()))) {
+             cerr << "could not load SOM from " << som_file << endl;
+             return 1;
+         }
+         if (!fields.empty()) {
+             cerr << "fields specified, but a SOM is to be applied, and metadata should be stored at " << som_metadata_file << endl;
+             return 1;
+         }
+         if (!load_som_metadata(som_metadata_file, width, height, fields, stats)) {
+             cerr << "could not load SOM metadata from " << som_metadata_file << endl;
+             return 1;
+         }
+         if (fields.empty()) {
+             // input vectors would be empty and &record[0] invalid
+             cerr << "no fields defined in SOM metadata " << som_metadata_file << endl;
+             return 1;
+         }
+     } else {
+
+         if (data.empty()) {
+             // neither a saved map nor any training vector: data[0] does not exist
+             cerr << "no input vectors available to build a network; use --save with --fields to train, or --apply to load a map" << endl;
+             return 1;
+         }
+
+         net = som_network_new(data[0].size(), height, width);
+
+         if ( !net ) {
+             cerr << "ERROR: som_network_new failed." << endl;
+             return 1;
+         }
+     }
+
+     if (debug) print_timing( "Network Creation" );
+
+     if (train) {
+         if (debug) cerr << "Training using " << data.size() << " input vectors" << endl;
+         som_init_weights ( net, &dataptrs[0], data.size() );
+         som_train ( net, &dataptrs[0], data.size(), iterations );
+     }
+
+     if (debug) print_timing( "Network Training" );
+
+     // open and calibrate using the true and false datasets
+
+     if (train && apply_to_training_data) {
+         // currently disabled
+         /*
+         cout << variantFile.header << endl;
+         vector<Variant>::iterator v = variants.begin(); int di = 0;
+         for ( ; v != variants.end() && di < data.size(); ++v) {
+         var.info["SOMX"].clear();
+         var.info["SOMY"].clear();
+         var.info["SOMP"].clear();
+         var.info["SOMD"].clear();
+         for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a, ++di) {
+         som_set_inputs ( net, dataptrs[di] );
+         size_t x=0, y=0;
+         som_get_best_neuron_coordinates ( net, &x, &y );
+         v->info["SOMX"].push_back(convert(x));
+         v->info["SOMY"].push_back(convert(y));
+         if (!centerv.empty()) {
+         float distance = sqrt(pow(abs((float)centerx - (float)x), 2)
+         + pow(abs((float)centery - (float)y), 2));
+         var.info["SOMD"].clear();
+         var.info["SOMD"].push_back(convert(distance));
+         }
+         }
+         cout << *v << endl;
+         }
+         */
+     } else if (apply) {
+
+         // if we have true and false sets, use them to "paint" the map
+         // NOTE(review): the grid is indexed [x][y] with dimensions
+         // width x height; confirm this matches the coordinate ranges
+         // reported by fsom when width != height.
+         vector<vector<SomPaint> > paintedSOM;
+         paintedSOM.resize(width);
+         for (vector<vector<SomPaint> >::iterator t = paintedSOM.begin();
+              t != paintedSOM.end(); ++t) {
+             t->resize(height);
+         }
+
+         // handle trues
+         if (!trueVCF.empty()) {
+             VariantCallFile trueVariantFile;
+             trueVariantFile.open(trueVCF);
+             if (!trueVariantFile.is_open()) {
+                 cerr << "could not open VCF file " << trueVCF << endl;
+                 return 1;
+             }
+             Variant v(trueVariantFile);
+             while (trueVariantFile.getNextVariant(v)) {
+                 int ai = 0;
+                 vector<string>::iterator a = v.alt.begin();
+                 for ( ; a != v.alt.end(); ++a, ++ai) {
+                     vector<double> record;
+                     read_fields(v, ai, fields, record);
+                     if (normalize) {
+                         normalize_inputs(record, fields, stats);
+                     }
+                     som_set_inputs ( net, &record[0] );
+                     size_t x=0, y=0;
+                     som_get_best_neuron_coordinates ( net, &x, &y );
+                     paintedSOM[x][y].true_count += 1;
+                 }
+             }
+         }
+
+         // get falses
+         if (!falseVCF.empty()) {
+             VariantCallFile falseVariantFile;
+             falseVariantFile.open(falseVCF);
+             if (!falseVariantFile.is_open()) {
+                 cerr << "could not open VCF file " << falseVCF << endl;
+                 return 1;
+             }
+             Variant v(falseVariantFile);
+             while (falseVariantFile.getNextVariant(v)) {
+                 int ai = 0;
+                 vector<string>::iterator a = v.alt.begin();
+                 for ( ; a != v.alt.end(); ++a, ++ai) {
+                     vector<double> record;
+                     read_fields(v, ai, fields, record);
+                     if (normalize) {
+                         normalize_inputs(record, fields, stats);
+                     }
+                     som_set_inputs ( net, &record[0] );
+                     size_t x=0, y=0;
+                     som_get_best_neuron_coordinates ( net, &x, &y );
+                     paintedSOM[x][y].false_count += 1;
+                 }
+             }
+         }
+
+         // estimate probability of each node using true and false set
+         for (vector<vector<SomPaint> >::iterator t = paintedSOM.begin();
+              t != paintedSOM.end(); ++t) {
+             for (vector<SomPaint>::iterator p = t->begin(); p != t->end(); ++p) {
+                 //cout << "count at node " << t - paintedSOM.begin() << "," << p - t->begin()
+                 // << " is " << p->true_count << " true, " << p->false_count << " false" << endl;
+                 if (p->true_count + p->false_count > 0) {
+                     p->prob_true = (double) p->true_count / (double) (p->true_count + p->false_count);
+                 } else {
+                     // for nodes without training data, could we estimate from surrounding nodes?
+                     // yes, TODO, but for now we can be conservative and say "0"
+                     p->prob_true = 0;
+                 }
+             }
+         }
+
+         cout << variantFile.header << endl;
+         while (variantFile.getNextVariant(var)) {
+             var.info["SOMX"].clear();
+             var.info["SOMY"].clear();
+             var.info["SOMP"].clear();
+             var.info["SOMD"].clear();
+             int ai = 0;
+             vector<string>::iterator a = var.alt.begin();
+             for ( ; a != var.alt.end(); ++a, ++ai) {
+                 vector<double> record;
+                 read_fields(var, ai, fields, record);
+                 if (normalize) {
+                     normalize_inputs(record, fields, stats);
+                 }
+                 som_set_inputs ( net, &record[0] );
+                 size_t x=0, y=0;
+                 som_get_best_neuron_coordinates ( net, &x, &y );
+                 if (!trueVCF.empty() && !falseVCF.empty()) {
+                     SomPaint& paint = paintedSOM[x][y];
+                     var.info["SOMP"].push_back(convert(paint.prob_true));
+                 }
+                 var.info["SOMX"].push_back(convert(x));
+                 var.info["SOMY"].push_back(convert(y));
+                 if (!centerv.empty()) {
+                     float distance = sqrt(pow(abs((float)centerx - (float)x), 2)
+                                           + pow(abs((float)centery - (float)y), 2));
+                     var.info["SOMD"].push_back(convert(distance));
+                 }
+             }
+             cout << var << endl;
+         }
+     }
+
+     if (debug) print_timing( "Input Recognition" );
+
+     if (train) {
+         if (!save_som_metadata(som_metadata_file, width, height, fields, stats)) {
+             cerr << "could not save metadata to " << som_metadata_file << endl;
+         }
+         som_serialize(net, som_file.c_str());
+     }
+
+     som_network_destroy ( net );
+
+     if (debug) print_timing( "Network Destruction" );
+
+     return 0;
+
+ }
diff --git a/src/vcfstats.cpp b/src/vcfstats.cpp
new file mode 100644
index 0000000..da8137b
--- /dev/null
+++ b/src/vcfstats.cpp
@@ -0,0 +1,570 @@
+#include "Variant.h"
+#include "split.h"
+#include "convert.h"
+#include <getopt.h>
+
+using namespace std;
+using namespace vcf;
+
+ // True when ref -> alt is a transition: A<->G or C<->T.
+ // The comparison is exact: single upper-case base strings only.
+ bool isTransition(const string& ref, const string& alt) {
+     const bool purineSwap = (ref == "A" && alt == "G") || (ref == "G" && alt == "A");
+     const bool pyrimidineSwap = (ref == "C" && alt == "T") || (ref == "T" && alt == "C");
+     return purineSwap || pyrimidineSwap;
+ }
+
+ // True for the substitutions this file counts as deaminations:
+ // G -> A and C -> T (exact, upper-case single-base strings).
+ bool isDeamination(const string& ref, const string& alt) {
+     if (ref == "G") {
+         return alt == "A";
+     }
+     if (ref == "C") {
+         return alt == "T";
+     }
+     return false;
+ }
+
+ // True for the substitutions this file counts as aminations:
+ // A -> G and T -> C (exact, upper-case single-base strings).
+ bool isAmination(const string& ref, const string& alt) {
+     if (ref == "A") {
+         return alt == "G";
+     }
+     if (ref == "T") {
+         return alt == "C";
+     }
+     return false;
+ }
+
+ // Per-allele tallies of the primitive differences found between an
+ // alternate allele and the reference. All counters start at zero.
+ class AlleleStats {
+ public:
+     int transitions;    // substituted bases that are transitions
+     int transversions;  // substituted bases that are not transitions
+     int deaminations;   // G->A / C->T substitutions
+     int aminations;     // A->G / T->C substitutions
+     int mismatches;     // mismatching bases
+     int insertedbases;  // total inserted bases
+     int insertions;     // number of insertion events
+     int deletedbases;   // total deleted bases
+     int deletions;      // number of deletion events
+     //AlleleStats(int ts, int tv, int da, int am, int mm)
+     // Initializers are listed in declaration order, which is the order in
+     // which members are actually initialized (avoids -Wreorder).
+     AlleleStats(void)
+         : transitions(0)
+         , transversions(0)
+         , deaminations(0)
+         , aminations(0)
+         , mismatches(0)
+         , insertedbases(0)
+         , insertions(0)
+         , deletedbases(0)
+         , deletions(0)
+     { }
+ };
+
+ // Prints the vcfstats usage text to stderr. argv[0] is used as the program name.
+ void printSummary(char** argv) {
+     cerr << "usage: " << argv[0] << " [options] <vcf file>" << endl
+          << endl
+          << " -r, --region specify a region on which to target the stats, requires a BGZF" << endl
+          << " compressed file which has been indexed with tabix. any number of" << endl
+          << " regions may be specified." << endl
+          << " -a, --add-info add the statistics intermediate information to the VCF file," << endl
+          << " writing out VCF records instead of summary statistics" << endl
+          << " -t, --add-type only add the type= field to the info (faster than -a)" << endl
+          // fix: "don't out the" -> "don't output the"
+          << " -l, --no-length-frequency don't output the indel and mnp length-frequency spectra" << endl
+          << " -m, --match-score N match score for SW algorithm" << endl
+          << " -x, --mismatch-score N mismatch score for SW algorithm" << endl
+          << " -o, --gap-open-penalty N gap open penalty for SW algorithm" << endl
+          << " -e, --gap-extend-penalty N gap extension penalty for SW algorithm" << endl
+          << endl
+          << "Prints statistics about variants in the input VCF file." << endl;
+ }
+
+
+ // vcfstats entry point: tallies variant classes (snps, mnps, indels, complex),
+ // ts/tv and indel length spectra over the input VCF and prints a summary, or,
+ // with -a / -t, writes the VCF back out with per-allele annotations.
+ //
+ // Changes relative to the previous version:
+ //  - regions given together with stdin input no longer cause an endless
+ //    loop (the region iterator was never advanced in that case); they are
+ //    ignored and the stream is read once;
+ //  - an unrecognized option prints the usage and exits with status 1
+ //    instead of calling abort();
+ //  - inner loop variables that shadowed outer ones were renamed.
+ //
+ // Ratios in the summary are plain floating-point divisions and print
+ // inf/nan when the denominator is zero.
+ int main(int argc, char** argv) {
+
+     vector<string> regions;
+     bool addTags = false;
+     bool addType = false;
+     bool lengthFrequency = true;
+
+     // constants for SmithWaterman algorithm
+     float matchScore = 10.0f;
+     float mismatchScore = -9.0f;
+     float gapOpenPenalty = 15.0f;
+     float gapExtendPenalty = 6.66f;
+
+     int c;
+     while (true) {
+         static struct option long_options[] =
+         {
+             /* These options set a flag. */
+             //{"verbose", no_argument, &verbose_flag, 1},
+             {"help", no_argument, 0, 'h'},
+             {"region", required_argument, 0, 'r'},
+             {"add-info", no_argument, 0, 'a'},
+             {"add-type", no_argument, 0, 't'},
+             {"no-length-frequency", no_argument, 0, 'l'},
+             {"match-score", required_argument, 0, 'm'},
+             {"mismatch-score", required_argument, 0, 'x'},
+             {"gap-open-penalty", required_argument, 0, 'o'},
+             {"gap-extend-penalty", required_argument, 0, 'e'},
+             //{"length", no_argument, &printLength, true},
+             {0, 0, 0, 0}
+         };
+         /* getopt_long stores the option index here. */
+         int option_index = 0;
+
+         c = getopt_long (argc, argv, "hlatr:m:x:o:e:",
+                          long_options, &option_index);
+
+         /* Detect the end of the options. */
+         if (c == -1)
+             break;
+
+         switch (c)
+         {
+         case 0:
+             /* If this option set a flag, do nothing else now. */
+             if (long_options[option_index].flag != 0)
+                 break;
+             printf ("option %s", long_options[option_index].name);
+             if (optarg)
+                 printf (" with arg %s", optarg);
+             printf ("\n");
+             break;
+
+         case 'h':
+             printSummary(argv);
+             exit(0);
+             break;
+
+         case 'r':
+             regions.push_back(optarg);
+             break;
+
+         case 'l':
+             lengthFrequency = false;
+             break;
+
+         case 'a':
+             addTags = true;
+             break;
+
+         case 't':
+             addType = true;
+             break;
+
+         case 'm':
+             matchScore = atof(optarg);
+             break;
+
+         case 'x':
+             mismatchScore = atof(optarg);
+             break;
+
+         case 'o':
+             gapOpenPenalty = atof(optarg);
+             break;
+
+         case 'e':
+             gapExtendPenalty = atof(optarg);
+             break;
+
+         default:
+             // unknown option or missing argument: getopt_long has already
+             // printed a diagnostic, so show the usage and fail cleanly
+             printSummary(argv);
+             exit(1);
+         }
+     }
+
+     VariantCallFile variantFile;
+     string inputFilename;
+     if (optind == argc - 1) {
+         inputFilename = argv[optind];
+         variantFile.open(inputFilename);
+     } else {
+         variantFile.open(std::cin);
+     }
+
+     if (!variantFile.is_open()) {
+         return 1;
+     }
+
+     if (addType && !addTags) {
+         variantFile.addHeaderLine("##INFO=<ID=type,Number=A,Type=String,Description=\"The type of the allele, either snp, ins, del, complex, or ref.\">");
+         variantFile.addHeaderLine("##INFO=<ID=cigar,Number=A,Type=String,Description=\"The CIGAR-style representation of the alternate allele as aligned to the reference\">");
+         cout << variantFile.header << endl;
+     }
+
+     if (addTags) {
+         variantFile.addHeaderLine("##INFO=<ID=transitions,Number=A,Type=Integer,Description=\"Total number of transitions in the alternate allele\">");
+         variantFile.addHeaderLine("##INFO=<ID=transversions,Number=A,Type=Integer,Description=\"Total number of transversions in the alternate allele\">");
+         variantFile.addHeaderLine("##INFO=<ID=deaminations,Number=A,Type=Integer,Description=\"Total number of deaminations in the alternate allele\">");
+         variantFile.addHeaderLine("##INFO=<ID=aminations,Number=A,Type=Integer,Description=\"Total number of aminations in the alternate allele\">");
+         variantFile.addHeaderLine("##INFO=<ID=mismatches,Number=A,Type=Integer,Description=\"Total number of mismatches in the alternate allele\">");
+         variantFile.addHeaderLine("##INFO=<ID=insertions,Number=A,Type=Integer,Description=\"Total number of inserted bases in the alternate allele\">");
+         variantFile.addHeaderLine("##INFO=<ID=deletions,Number=A,Type=Integer,Description=\"Total number of deleted bases in the alternate allele\">");
+         variantFile.addHeaderLine("##INFO=<ID=cigar,Number=A,Type=String,Description=\"The CIGAR-style representation of the alternate allele as aligned to the reference\">");
+         variantFile.addHeaderLine("##INFO=<ID=type,Number=A,Type=String,Description=\"The type of the allele, either snp, ins, del, complex, or ref.\">");
+         variantFile.addHeaderLine("##INFO=<ID=reflen,Number=1,Type=Integer,Description=\"The length of the reference allele\">");
+         variantFile.addHeaderLine("##INFO=<ID=altlen,Number=A,Type=Integer,Description=\"The length of the alternate allele\">");
+         cout << variantFile.header << endl;
+     }
+
+     Variant var(variantFile);
+
+     vector<string>::iterator regionItr = regions.begin();
+
+     int variantAlleles = 0;
+     int uniqueVariantAlleles = 0;
+     int variantSites = 0;
+     int snps = 0;
+     int transitions = 0;
+     int transversions = 0;
+     int deaminations = 0;
+     int aminations = 0;
+     int totalinsertions = 0;
+     int totaldeletions = 0;
+     int insertedbases = 0;
+     int deletedbases = 0;
+     int totalmnps = 0;
+     int totalcomplex = 0;
+     int mismatchbases = 0;
+     int mnpbases = 0;
+     int biallelics = 0;
+     int multiallelics = 0;
+     map<int, int> insertions;   // insertion length -> count
+     map<int, int> deletions;    // deletion length -> count
+     map<int, int> mnps;         // mnp length -> count
+     map<int, int> complexsubs;  // only referenced by the disabled report at the end
+
+     bool includePreviousBaseForIndels = false;
+     bool useMNPs = true;
+     bool useEntropy = false;
+
+     AlleleStats biallelicSNPs;
+
+     // todo, add biallelic snp dialog to output and ts/tv for snps and mnps
+
+     // Regions can only be applied to a named file. When reading stdin they
+     // are ignored and the input is scanned exactly once.
+     const bool useRegions = !inputFilename.empty() && !regions.empty();
+
+     do {
+
+         if (useRegions) {
+             string regionStr = *regionItr++;
+             variantFile.setRegion(regionStr);
+         }
+
+         while (variantFile.getNextVariant(var)) {
+             ++variantSites;
+             if (var.alt.size() > 1) {
+                 ++multiallelics;
+             } else {
+                 ++biallelics;
+             }
+             map<string, vector<VariantAllele> > alternates = var.parsedAlternates(includePreviousBaseForIndels,
+                                                                                  useMNPs,
+                                                                                  useEntropy,
+                                                                                  matchScore,
+                                                                                  mismatchScore,
+                                                                                  gapOpenPenalty,
+                                                                                  gapExtendPenalty);
+             // primitive allele -> alternate alleles that contain it
+             map<VariantAllele, vector<string> > uniqueVariants;
+
+             vector<string> cigars;
+
+             for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+                 string& alternate = *a;
+                 if (addTags)
+                     var.info["altlen"].push_back(convert(alternate.size()));
+                 vector<VariantAllele>& vav = alternates[alternate];
+                 if (vav.size() > 1) {
+                     // check that there are actually multiple non-reference alleles
+                     int nonRefAlleles = 0;
+                     for (vector<VariantAllele>::iterator z = vav.begin(); z != vav.end(); ++z) {
+                         if (z->ref != z->alt)
+                             ++nonRefAlleles;
+                     }
+                     if (nonRefAlleles > 1)
+                         ++totalcomplex;
+                 }
+                 for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) {
+                     uniqueVariants[*v].push_back(alternate);
+                 }
+
+                 if (addTags || addType) {
+                     // build a CIGAR-style string; runs of matching
+                     // primitives are merged into a single M element
+                     string cigar;
+                     pair<int, string> element;
+                     for (vector<VariantAllele>::iterator v = vav.begin(); v != vav.end(); ++v) {
+                         VariantAllele& va = *v;
+                         if (va.ref != va.alt) {
+                             if (element.second == "M") {
+                                 cigar += convert(element.first) + element.second;
+                                 element.second = ""; element.first = 0;
+                             }
+                             if (va.ref.size() == va.alt.size()) {
+                                 cigar += convert(va.ref.size()) + "X";
+                             } else if (va.ref.size() > va.alt.size()) {
+                                 cigar += convert(va.ref.size() - va.alt.size()) + "D";
+                             } else {
+                                 cigar += convert(va.alt.size() - va.ref.size()) + "I";
+                             }
+                         } else {
+                             if (element.second == "M") {
+                                 element.first += va.ref.size();
+                             } else {
+                                 element = make_pair(va.ref.size(), "M");
+                             }
+                         }
+                     }
+                     if (element.second == "M") {
+                         cigar += convert(element.first) + element.second;
+                     }
+                     element.second = ""; element.first = 0;
+                     cigars.push_back(cigar);
+                 }
+             }
+
+             if (addTags) {
+                 var.info["cigar"] = cigars;
+                 var.info["reflen"].push_back(convert(var.ref.size()));
+             } else if (addType) {
+                 var.info["cigar"] = cigars;
+             }
+
+             variantAlleles += var.alt.size();
+             map<string, AlleleStats> alleleStats;
+
+             for (map<VariantAllele, vector<string> >::iterator v = uniqueVariants.begin(); v != uniqueVariants.end(); ++v) {
+                 const VariantAllele& va = v->first;
+                 // alternate alleles this primitive contributes to
+                 vector<string>& sources = v->second;
+
+                 if (!(addTags || addType)) { // don't add any tag information if we're not going to output it
+                     sources.clear();
+                 }
+
+                 if (va.ref != va.alt) {
+                     ++uniqueVariantAlleles;
+                     if (va.ref.size() == va.alt.size()) {
+                         if (va.ref.size() == 1) {
+                             ++snps;
+                             ++mismatchbases;
+                             for (vector<string>::iterator s = sources.begin(); s != sources.end(); ++s) {
+                                 ++alleleStats[*s].mismatches;
+                             }
+                             if (isTransition(va.ref, va.alt)) {
+                                 ++transitions;
+                                 for (vector<string>::iterator s = sources.begin(); s != sources.end(); ++s) {
+                                     ++alleleStats[*s].transitions;
+                                 }
+                             } else {
+                                 ++transversions;
+                                 for (vector<string>::iterator s = sources.begin(); s != sources.end(); ++s) {
+                                     ++alleleStats[*s].transversions;
+                                 }
+                             }
+                             if (isAmination(va.ref, va.alt)) {
+                                 ++aminations;
+                                 for (vector<string>::iterator s = sources.begin(); s != sources.end(); ++s) {
+                                     ++alleleStats[*s].aminations;
+                                 }
+                             }
+                             if (isDeamination(va.ref, va.alt)) {
+                                 ++deaminations;
+                                 for (vector<string>::iterator s = sources.begin(); s != sources.end(); ++s) {
+                                     ++alleleStats[*s].deaminations;
+                                 }
+                             }
+                         } else {
+                             ++totalmnps;
+                             ++mnps[va.alt.size()]; // not entirely correct
+                             for (vector<string>::iterator s = sources.begin(); s != sources.end(); ++s) {
+                                 alleleStats[*s].mismatches += va.alt.size();
+                             }
+                             // classify the mnp base by base
+                             string::const_iterator r = va.ref.begin();
+                             for (string::const_iterator b = va.alt.begin(); b != va.alt.end(); ++b, ++r) {
+                                 string rstr = string(1, *r);
+                                 string astr = string(1, *b);
+                                 if (rstr == astr) {
+                                     continue;
+                                 }
+                                 if (isTransition(rstr, astr)) {
+                                     ++transitions;
+                                     for (vector<string>::iterator s = sources.begin(); s != sources.end(); ++s) {
+                                         ++alleleStats[*s].transitions;
+                                     }
+                                 } else {
+                                     ++transversions;
+                                     for (vector<string>::iterator s = sources.begin(); s != sources.end(); ++s) {
+                                         ++alleleStats[*s].transversions;
+                                     }
+                                 }
+                                 if (isAmination(rstr, astr)) {
+                                     ++aminations;
+                                     for (vector<string>::iterator s = sources.begin(); s != sources.end(); ++s) {
+                                         ++alleleStats[*s].aminations;
+                                     }
+                                 }
+                                 if (isDeamination(rstr, astr)) {
+                                     ++deaminations;
+                                     for (vector<string>::iterator s = sources.begin(); s != sources.end(); ++s) {
+                                         ++alleleStats[*s].deaminations;
+                                     }
+                                 }
+                                 ++mismatchbases;
+                                 ++mnpbases;
+                             }
+                         }
+                     } else if (va.ref.size() > va.alt.size()) {
+                         int diff = va.ref.size() - va.alt.size();
+                         deletedbases += diff;
+                         ++totaldeletions;
+                         ++deletions[diff];
+                         for (vector<string>::iterator s = sources.begin(); s != sources.end(); ++s) {
+                             alleleStats[*s].deletedbases += diff;
+                             alleleStats[*s].deletions += 1;
+                         }
+                     } else {
+                         int diff = va.alt.size() - va.ref.size();
+                         insertedbases += diff;
+                         ++totalinsertions;
+                         ++insertions[diff];
+                         for (vector<string>::iterator s = sources.begin(); s != sources.end(); ++s) {
+                             alleleStats[*s].insertedbases += diff;
+                             alleleStats[*s].insertions += 1;
+                         }
+                     }
+                 }
+             }
+             if (addTags || addType) {
+                 for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+                     string vartype;
+                     if (alleleStats[*a].insertions + alleleStats[*a].deletions == 0) {
+                         if (alleleStats[*a].mismatches == 1) {
+                             vartype = "snp";
+                         } else if (alleleStats[*a].mismatches > 1) {
+                             vartype = "complex";
+                         } else {
+                             vartype = "ref";
+                         }
+                     } else if (alleleStats[*a].insertions + alleleStats[*a].deletions == 1) {
+                         if (alleleStats[*a].insertions == 1) {
+                             vartype = "ins";
+                         } else {
+                             vartype = "del";
+                         }
+                     } else {
+                         vartype = "complex";
+                     }
+                     if (addTags) {
+                         var.info["mismatches"].push_back(convert(alleleStats[*a].mismatches));
+                         var.info["insertions"].push_back(convert(alleleStats[*a].insertions));
+                         var.info["deletions"].push_back(convert(alleleStats[*a].deletions));
+                         var.info["transitions"].push_back(convert(alleleStats[*a].transitions));
+                         var.info["transversions"].push_back(convert(alleleStats[*a].transversions));
+                         var.info["deaminations"].push_back(convert(alleleStats[*a].deaminations));
+                         var.info["aminations"].push_back(convert(alleleStats[*a].aminations));
+                     }
+                     var.info["type"].push_back(vartype);
+                 }
+                 cout << var << endl;
+             }
+             // biallelic SNP case
+             if (var.alt.size() == 1 && var.ref.size() == 1 && var.alt.front().size() == 1) {
+                 if (isTransition(var.ref, var.alt.front())) {
+                     biallelicSNPs.transitions++;
+                 } else {
+                     biallelicSNPs.transversions++;
+                 }
+                 biallelicSNPs.mismatches++;
+             }
+         }
+
+     } while (useRegions && regionItr != regions.end());
+
+
+     // find the maximum indel size
+     int maxindel = 0;
+     for (map<int, int>::iterator i = insertions.begin(); i != insertions.end(); ++i) {
+         if (i->first > maxindel) {
+             maxindel = i->first;
+         }
+     }
+     for (map<int, int>::iterator i = deletions.begin(); i != deletions.end(); ++i) {
+         if (i->first > maxindel) {
+             maxindel = i->first;
+         }
+     }
+
+     // and maximum mnp
+     int maxmnp = 0;
+     for (map<int, int>::iterator i = mnps.begin(); i != mnps.end(); ++i) {
+         if (i->first > maxmnp) {
+             maxmnp = i->first;
+         }
+     }
+
+     // now print the results
+
+     if (!addTags && !addType) {
+         cout << "total variant sites:\t" << variantSites << endl
+              << "of which " << biallelics << " (" << (double) biallelics / variantSites << ") are biallelic and "
+              << multiallelics << " (" << (double) multiallelics / variantSites << ") are multiallelic" << endl
+              << "total variant alleles:\t" << variantAlleles << endl
+              << "unique variant alleles:\t" << uniqueVariantAlleles << endl
+              << endl
+              << "snps:\t" << snps << endl
+              << "mnps:\t" << totalmnps << endl
+              << "indels:\t" << totalinsertions + totaldeletions << endl
+              << "complex:\t" << totalcomplex << endl
+              << endl
+              << "mismatches:\t" << mismatchbases << endl
+              << endl
+              << "ts/tv ratio:\t" << (double) transitions / (double) transversions << endl
+              << "deamination ratio:\t" << (double) deaminations / aminations << endl
+              << "biallelic snps:\t" << biallelicSNPs.mismatches << " @ "
+              << (double) biallelicSNPs.transitions / (double) biallelicSNPs.transversions << endl;
+
+         if (lengthFrequency) {
+             cout << endl
+                  << "ins/del length frequency distribution" << endl
+                  << "length\tins\tdel\tins/del" << endl;
+             for (int i = 1; i <= maxindel; ++i) {
+                 int ins = insertions[i];
+                 int del = deletions[i];
+                 cout << i << "\t"
+                      << (ins > 0 ? convert(ins) : "" ) << "\t"
+                      << (del > 0 ? convert(del) : "") << "\t"
+                      << (ins > 0 && del > 0 ? convert((double) ins / (double) del) : "")
+                      << endl;
+             }
+         }
+
+         cout << endl
+              << "insertion alleles / deletion alleles:\t"
+              << (double) totalinsertions / (double) totaldeletions << endl
+              << "inserted bases / deleted bases:\t"
+              << (double) insertedbases / (double) deletedbases << endl
+              << endl;
+
+         if (lengthFrequency) {
+             cout << "mnp length frequency distribution" << endl
+                  << "length\tcount" << endl;
+             for (int i = 2; i <= maxmnp; ++i) {
+                 int mnp = mnps[i];
+                 cout << i << "\t"
+                      << (mnp > 0 ? convert(mnp) : "")
+                      << endl;
+             }
+         }
+
+         cout << "total bases in mnps:\t" << mnpbases << endl;
+
+         /*
+         cout << "complex event frequency distribution" << endl
+         << "length\tcount" << endl;
+         for (map<int, int>::iterator i = complexsubs.begin(); i != complexsubs.end(); ++i) {
+         cout << i->first << "\t" << i->second << endl;
+         }
+         */
+     }
+
+     return 0;
+
+ }
+
diff --git a/src/vcfstreamsort.cpp b/src/vcfstreamsort.cpp
new file mode 100644
index 0000000..834de64
--- /dev/null
+++ b/src/vcfstreamsort.cpp
@@ -0,0 +1,143 @@
+#include "Variant.h"
+#include <algorithm>
+#include <getopt.h>
+#include "convert.h"
+
+using namespace std;
+using namespace vcf;
+
+// Returns true when some element of l compares equal to v, false otherwise
+// (including when l is empty).
+bool listContains(list<string>& l, string& v) {
+    return std::find(l.begin(), l.end(), v) != l.end();
+}
+
+// Writes the usage text to stderr; argv[0] is shown as the program name.
+void printSummary(char** argv) {
+    // everything after the usage line is fixed text, one entry per output line
+    static const char* const body[] = {
+        "",
+        "Sorts the input (either stdin or file) using a streaming sort algorithm.",
+        "options:",
+        "",
+        " -h, --help this dialog",
+        " -w, --window N number of sites to sort (default 10000)",
+        " -a, --all load all sites and then sort in memory"
+    };
+    const size_t bodyLines = sizeof(body) / sizeof(body[0]);
+
+    cerr << "usage: " << argv[0] << " [options] [vcf file]" << endl;
+    for (size_t k = 0; k < bodyLines; ++k) {
+        cerr << body[k] << endl;
+    }
+}
+
+// Entry point of vcfstreamsort.
+//
+// Reads VCF records from the file named by the single positional argument, or
+// from stdin otherwise, prints the header, and then prints the records'
+// original lines ordered by position within each sequence. Sequences are
+// emitted in the order in which they were first seen in the input.
+//
+// Unless -a/--all is given, at most sortSitesWindow distinct sites are held
+// in memory: once that limit is exceeded, the lowest buffered position of the
+// oldest sequence is written out.
+//
+// Returns 0 on success and 1 when the input cannot be opened.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+    int sortSitesWindow = 10000;
+    bool sortAll = false;
+
+    int c;
+
+    while (true) {
+        static struct option long_options[] =
+            {
+                {"help", no_argument, 0, 'h'},
+                {"window", required_argument, 0, 'w'},
+                // --all is a plain flag, consistent with "a" (no colon) in the
+                // short option string and with the usage text
+                {"all", no_argument, 0, 'a'},
+                {0, 0, 0, 0}
+            };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "haw:",
+                         long_options, &option_index);
+
+        if (c == -1)
+            break;
+
+        switch (c)
+            {
+
+            case 'w':
+                if (!convert(optarg, sortSitesWindow)) {
+                    cerr << "could not parse --window, -w" << endl;
+                    exit(1);
+                }
+                break;
+
+            case 'a':
+                sortAll = true;
+                break;
+
+            case 'h':
+                printSummary(argv);
+                exit(0);
+                break;
+
+            default:
+                break;
+            }
+    }
+
+    if (optind == argc - 1) {
+        string inputFilename = argv[optind];
+        variantFile.open(inputFilename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    // sequence name -> position -> var.vrepr() key -> records with that key
+    map<string, map<long int, map<string, vector<Variant> > > > records;
+    // number of distinct (sequence, position) sites currently buffered
+    int numrecords = 0;
+    // sequence names in order of first appearance
+    list<string> sequenceNames;
+
+    variantFile.parseSamples = false;
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        if (!listContains(sequenceNames, var.sequenceName)) {
+            sequenceNames.push_back(var.sequenceName);
+        }
+
+        // count a site only when it is first created, so that repeated
+        // records at an already-buffered position do not inflate the window
+        map<long int, map<string, vector<Variant> > >& sites = records[var.sequenceName];
+        const bool newSite = sites.find(var.position) == sites.end();
+        sites[var.position][var.vrepr()].push_back(var);
+        if (newSite) ++numrecords;
+
+        if (!sortAll && numrecords > sortSitesWindow) {
+            // the oldest sequence has been fully written out; move to the next
+            if (records[sequenceNames.front()].empty()) {
+                sequenceNames.pop_front();
+            }
+            map<long int, map<string, vector<Variant> > >& frecords = records[sequenceNames.front()];
+            // std::map is ordered, so begin() is the lowest buffered position
+            map<string, vector<Variant> >& vars = frecords.begin()->second;
+            for (map<string, vector<Variant> >::iterator v = vars.begin(); v != vars.end(); ++v) {
+                for (vector<Variant>::iterator rec = v->second.begin(); rec != v->second.end(); ++rec) {
+                    cout << rec->originalLine << endl;
+                }
+            }
+            frecords.erase(frecords.begin());
+            --numrecords;
+        }
+    }
+
+    // end of input: write out everything still buffered
+    for (list<string>::iterator s = sequenceNames.begin(); s != sequenceNames.end(); ++s) {
+        map<long int, map<string, vector<Variant> > >& q = records[*s];
+        for (map<long int, map<string, vector<Variant> > >::iterator r = q.begin(); r != q.end(); ++r) {
+            for (map<string, vector<Variant> >::iterator v = r->second.begin(); v != r->second.end(); ++v) {
+                for (vector<Variant>::iterator rec = v->second.begin(); rec != v->second.end(); ++rec) {
+                    cout << rec->originalLine << endl;
+                }
+            }
+        }
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfuniq.cpp b/src/vcfuniq.cpp
new file mode 100644
index 0000000..30ad21b
--- /dev/null
+++ b/src/vcfuniq.cpp
@@ -0,0 +1,49 @@
+#include "Variant.h"
+
+using namespace std;
+using namespace vcf;
+
+// Entry point of vcfuniq.
+//
+// Reads VCF records from argv[1], or from stdin when no argument is given,
+// prints the header, and prints each record's original line unless its
+// sequence name, position, ref and alt all equal those of the most recently
+// printed record. Returns 1 when the input cannot be opened, 0 otherwise.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+
+    if (argc <= 1) {
+        variantFile.open(std::cin);
+    } else {
+        string filename = argv[1];
+        variantFile.open(filename);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    // key fields of the last record that was written out; an empty prevSeq
+    // means nothing has been written yet
+    string prevSeq;
+    long int prevPos = 0;
+    string prevRef;
+    vector<string> prevAlt;
+
+    variantFile.parseSamples = false;
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        const bool sameAsPrevious = !prevSeq.empty()
+            && prevSeq == var.sequenceName
+            && prevPos == var.position
+            && prevRef == var.ref
+            && prevAlt == var.alt;
+        if (sameAsPrevious) {
+            continue;
+        }
+
+        prevSeq = var.sequenceName;
+        prevPos = var.position;
+        prevRef = var.ref;
+        prevAlt = var.alt;
+        cout << var.originalLine << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/src/vcfuniqalleles.cpp b/src/vcfuniqalleles.cpp
new file mode 100644
index 0000000..3c1c7e2
--- /dev/null
+++ b/src/vcfuniqalleles.cpp
@@ -0,0 +1,54 @@
+#include "Variant.h"
+#include <set>
+
+using namespace std;
+using namespace vcf;
+
+// Entry point of vcfuniqalleles.
+//
+// Reads VCF records from argv[1], or from stdin when no argument is given,
+// and prints the header. For every record, each alternate allele that equals
+// the reference allele or repeats an earlier alternate allele of the same
+// record is reported on stderr and passed to var.removeAlt(); the record is
+// then printed. Returns 1 when the input cannot be opened, 0 otherwise.
+int main(int argc, char** argv) {
+
+    VariantCallFile variantFile;
+
+    if (argc > 1) {
+        string filename = argv[1];
+        variantFile.open(filename);
+    } else {
+        variantFile.open(std::cin);
+    }
+
+    if (!variantFile.is_open()) {
+        return 1;
+    }
+
+    cout << variantFile.header << endl;
+
+    Variant var(variantFile);
+    while (variantFile.getNextVariant(var)) {
+        // distinct non-reference alternate alleles seen so far in this record
+        set<string> alleles;
+        // collected first so that var.alt is not modified while iterating it
+        vector<string> alleles_to_remove;
+        for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
+            // insert().second is false when the allele was already present
+            if (*a == var.ref || !alleles.insert(*a).second) {
+                alleles_to_remove.push_back(*a);
+            }
+        }
+        for (vector<string>::iterator a = alleles_to_remove.begin(); a != alleles_to_remove.end(); ++a) {
+            cerr << "removing " << *a << " from " << var.sequenceName << ":" << var.position << endl;
+            var.removeAlt(*a);
+        }
+        cout << var << endl;
+    }
+
+    return 0;
+
+}
+
diff --git a/tests/lib/Local/vcflib/Test.pm b/tests/lib/Local/vcflib/Test.pm
new file mode 100644
index 0000000..761c22a
--- /dev/null
+++ b/tests/lib/Local/vcflib/Test.pm
@@ -0,0 +1,32 @@
+use strict;
+use warnings;
+
+package Local::vcflib::Test;
+use base 'Exporter';
+
+use File::Basename qw< dirname >;
+use IPC::Run3 qw< run3 >;
+use Test::More;
+
+# Functions exported into every test script that uses this module.
+our @EXPORT = qw( run run_ok );
+# Directory holding the executables under test, resolved relative to the
+# location of this module file.
+our $BIN = dirname(__FILE__) . "/../../../../bin";
+
+# run([$command, @options], $stdin)
+#
+# Runs $BIN/$command with the given options, feeding it $stdin, and returns
+# the list (stdout, stderr, $?).
+sub run {
+    my ($invocation, $stdin) = @_;
+    my ($command, @opts) = @$invocation;
+    my ($stdout, $stderr);
+    run3(["$BIN/$command", @opts], \$stdin, \$stdout, \$stderr);
+    return ($stdout, $stderr, $?);
+}
+
+# run_ok([$command, @options], $stdin)
+#
+# Runs the command via run() and records one test that passes only when the
+# child's wait status ($?) is 0, i.e. it exited with code 0 and was not
+# terminated by a signal. On failure the command, its input, the exit code,
+# the signal number and stderr are reported with diag.
+# Returns the list (stdout, stderr).
+sub run_ok {
+    # report a failure at the caller's location rather than inside this helper
+    local $Test::Builder::Level = $Test::Builder::Level + 1;
+    my ($stdout, $stderr, $exit) = run(@_);
+    my $code   = $exit >> 8;     # high byte of the wait status: exit code
+    my $signal = $exit & 127;    # low seven bits: terminating signal, if any
+    # compare the whole status: a child killed by a signal has exit code 0
+    # in the high byte and would otherwise be reported as a success
+    ok $exit == 0, "exit code"
+        or diag "error running command: " . join(" ", @{$_[0]}) . "\n"
+               ."with input:\n$_[1]\n--\n"
+               ."exit code = $code, signal = $signal (system() return value = $exit)\n"
+               ."stderr = \n$stderr";
+    return ($stdout, $stderr);
+}
+
+1;
diff --git a/tests/vcfdistance.t b/tests/vcfdistance.t
new file mode 100644
index 0000000..233ddfa
--- /dev/null
+++ b/tests/vcfdistance.t
@@ -0,0 +1,98 @@
+use strict;
+use warnings;
+use Test::More;
+use Local::vcflib::Test;
+
+# Input fixture, one array element per line: two header lines followed by
+# five variant records that all lie on the same reference sequence.
+my @vcf = split /\n/, <<'';
+##fileformat=VCFv4.0
+#CHROM POS ID REF ALT QUAL FILTER INFO
+refseq 502 . G A 38553 PASS
+refseq 552 . G A 24044 PASS
+refseq 660 . G A 38553 PASS
+refseq 678 . G A 24044 PASS
+refseq 684 . G A 24044 PASS
+
+# variants($n): returns elements 0 .. $n + 1 of @vcf (the two header lines
+# plus the first $n variant lines) joined with newlines.
+sub variants {
+    my ($count) = @_;
+    my $last_index = $count + 1;
+    return join "\n", @vcf[0 .. $last_index];
+}
+
+# $output receives the stdout of each command run below. $header is the
+# header expected in that output: the fixture's header lines plus the
+# BasesToClosestVariant INFO definition, which the input does not contain.
+my ($output, $header) = ('', <<'');
+##fileformat=VCFv4.0
+##INFO=<ID=BasesToClosestVariant,Number=1,Type=Integer,Description="Number of bases to the closest variant in the file.">
+#CHROM POS ID REF ALT QUAL FILTER INFO
+
+#
+# Various numbers of variant lines
+#
+# Each case passes the header plus the first N variant lines of @vcf to
+# vcfdistance on stdin and compares its stdout with the expected records.
+# With a single variant, or none, no BasesToClosestVariant value is expected.
+($output) = run_ok(["vcfdistance"], variants(5));
+is $output, $header . <<'', "distances for 5 variant lines";
+refseq 502 . G A 38553 PASS BasesToClosestVariant=50;
+refseq 552 . G A 24044 PASS BasesToClosestVariant=50;
+refseq 660 . G A 38553 PASS BasesToClosestVariant=18;
+refseq 678 . G A 24044 PASS BasesToClosestVariant=6;
+refseq 684 . G A 24044 PASS BasesToClosestVariant=6;
+
+($output) = run_ok(["vcfdistance"], variants(4));
+is $output, $header . <<'', "distances for 4 variant lines";
+refseq 502 . G A 38553 PASS BasesToClosestVariant=50;
+refseq 552 . G A 24044 PASS BasesToClosestVariant=50;
+refseq 660 . G A 38553 PASS BasesToClosestVariant=18;
+refseq 678 . G A 24044 PASS BasesToClosestVariant=18;
+
+($output) = run_ok(["vcfdistance"], variants(3));
+is $output, $header . <<'', "distances for 3 variant lines";
+refseq 502 . G A 38553 PASS BasesToClosestVariant=50;
+refseq 552 . G A 24044 PASS BasesToClosestVariant=50;
+refseq 660 . G A 38553 PASS BasesToClosestVariant=108;
+
+($output) = run_ok(["vcfdistance"], variants(2));
+is $output, $header . <<'', "distances for 2 variant lines";
+refseq 502 . G A 38553 PASS BasesToClosestVariant=50;
+refseq 552 . G A 24044 PASS BasesToClosestVariant=50;
+
+($output) = run_ok(["vcfdistance"], variants(1));
+is $output, $header . <<'', "distances for 1 variant line";
+refseq 502 . G A 38553 PASS
+
+($output) = run_ok(["vcfdistance"], variants(0));
+is $output, $header, "distances for 0 variant lines";
+
+#
+# Various combinations of reference sequences (obviously non-comparable)
+#
+# Replace the input fixture with five variant records spread over three
+# reference sequences (ref1, ref2, ref3).
+@vcf = split /\n/, <<'';
+##fileformat=VCFv4.0
+#CHROM POS ID REF ALT QUAL FILTER INFO
+ref1 502 . G A 38553 PASS
+ref2 552 . G A 24044 PASS
+ref2 660 . G A 38553 PASS
+ref2 678 . G A 24044 PASS
+ref3 684 . G A 24044 PASS
+
+# Expected output below: a record that is the only one on its reference
+# sequence carries no BasesToClosestVariant value, and the values that are
+# present match gaps between records on the same reference.
+($output) = run_ok(["vcfdistance"], variants(5));
+is $output, $header . <<'', "distances for 5 variant lines; three references";
+ref1 502 . G A 38553 PASS
+ref2 552 . G A 24044 PASS BasesToClosestVariant=108;
+ref2 660 . G A 38553 PASS BasesToClosestVariant=18;
+ref2 678 . G A 24044 PASS BasesToClosestVariant=18;
+ref3 684 . G A 24044 PASS
+
+($output) = run_ok(["vcfdistance"], variants(4));
+is $output, $header . <<'', "distances for 4 variant lines, two references";
+ref1 502 . G A 38553 PASS
+ref2 552 . G A 24044 PASS BasesToClosestVariant=108;
+ref2 660 . G A 38553 PASS BasesToClosestVariant=18;
+ref2 678 . G A 24044 PASS BasesToClosestVariant=18;
+
+($output) = run_ok(["vcfdistance"], variants(3));
+is $output, $header . <<'', "distances for 3 variant lines, two references";
+ref1 502 . G A 38553 PASS
+ref2 552 . G A 24044 PASS BasesToClosestVariant=108;
+ref2 660 . G A 38553 PASS BasesToClosestVariant=108;
+
+($output) = run_ok(["vcfdistance"], variants(2));
+is $output, $header . <<'', "distances for 2 variant lines, two references";
+ref1 502 . G A 38553 PASS
+ref2 552 . G A 24044 PASS
+
+# Tell Test::More that all planned assertions have been issued.
+done_testing;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/libvcflib.git
More information about the debian-med-commit
mailing list