[med-svn] [bcftools] 01/08: Imported Upstream version 1.3.1
Afif Elghraoui
afif at moszumanska.debian.org
Tue Apr 26 05:29:44 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository bcftools.
commit c7f7f8b28968400ce3d0418f9141cd3384cfe2b6
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Mon Apr 25 21:57:06 2016 -0700
Imported Upstream version 1.3.1
---
Makefile | 8 +-
bcftools.h | 3 +-
consensus.c | 2 +-
doc/bcftools.1 | 96 ++++-
doc/bcftools.html | 101 +++++-
doc/bcftools.txt | 87 ++++-
khash_str2str.h | 19 +-
main.c | 4 +-
peakfit.c | 6 +
ploidy.c | 30 +-
plot-vcfstats | 6 +-
plugins/GTisec.c | 470 +++++++++++++++++++++++++
plugins/GTisec.mk | 2 +
plugins/color-chrs.c | 2 +-
plugins/fill-tags.c | 46 ++-
plugins/impute-info.c | 10 +-
plugins/mendelian.c | 2 +-
plugins/tag2tag.c | 4 +-
plugins/vcf2sex.c | 11 +-
polysomy.c | 3 +-
reheader.c | 62 ++--
test/annotate10.hdr | 3 +
test/annotate10.out | 18 +
test/annotate10.vcf | 17 +
test/annots10.tab | 4 +
test/concat.1.a.vcf | 2 -
test/concat.1.b.vcf | 2 -
test/fill-tags.2.out | 49 +++
test/norm.fa | 2 +
test/norm.fa.fai | 1 +
test/norm.merge.out | 2 +-
test/norm.merge.strict.out | 2 +-
test/norm.merge.vcf | 4 +-
test/norm.telomere.out | 5 +
test/norm.telomere.vcf | 4 +
test/{norm.merge.strict.out => reheader.3.out} | 43 +--
test/{norm.merge.strict.out => reheader.4.out} | 43 +--
test/reheader.empty.hdr | 7 +
test/reheader.empty.out | 8 +
test/reheader.samples3 | 2 +
test/reheader.samples4 | 1 +
test/test.pl | 96 ++---
test/view.GTisec.H.out | 20 ++
test/view.GTisec.Hm.out | 25 ++
test/view.GTisec.Hmv.out | 25 ++
test/view.GTisec.Hv.out | 20 ++
test/view.GTisec.m.out | 20 ++
test/view.GTisec.mv.out | 20 ++
test/view.GTisec.out | 15 +
test/view.GTisec.v.out | 15 +
test/view.filter.annovar.vcf | 1 +
vcfannotate.c | 212 +++++++++--
vcfcall.c | 43 ++-
vcfconcat.c | 139 +++++++-
vcfconvert.c | 18 +-
vcffilter.c | 8 +-
vcfindex.c | 3 +-
vcfisec.c | 12 +-
vcfmerge.c | 10 +-
vcfnorm.c | 20 +-
vcfplugin.c | 41 +--
vcfroh.c | 43 ++-
vcfview.c | 8 +-
63 files changed, 1706 insertions(+), 301 deletions(-)
diff --git a/Makefile b/Makefile
index 4357d7c..7dec9dd 100644
--- a/Makefile
+++ b/Makefile
@@ -82,7 +82,7 @@ MISC_PROGRAMS = plot-vcfstats vcfutils.pl plugins/color-chrs.pl
all:$(PROG) plugins
# See htslib/Makefile
-PACKAGE_VERSION = 1.3
+PACKAGE_VERSION = 1.3.1
ifneq "$(wildcard .git)" ""
PACKAGE_VERSION := $(shell git describe --always --dirty)
DOC_VERSION := $(shell git describe --always)+
@@ -129,7 +129,7 @@ endif
plugins: $(PLUGINS)
-bcftools_h = bcftools.h $(htslib_vcf_h)
+bcftools_h = bcftools.h $(htslib_hts_defs_h) $(htslib_vcf_h)
call_h = call.h $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) vcmp.h
convert_h = convert.h $(htslib_vcf_h)
tsv2vcf_h = tsv2vcf.h $(htslib_vcf_h)
@@ -143,7 +143,7 @@ main.o: main.c $(htslib_hts_h) version.h $(bcftools_h)
vcfannotate.o: vcfannotate.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(HTSDIR)/htslib/kseq.h $(bcftools_h) vcmp.h $(filter_h)
vcfplugin.o: vcfplugin.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(HTSDIR)/htslib/kseq.h $(bcftools_h) vcmp.h $(filter_h)
vcfcall.o: vcfcall.c $(htslib_vcf_h) $(HTSDIR)/htslib/kfunc.h $(htslib_synced_bcf_reader_h) $(HTSDIR)/htslib/khash_str2int.h $(bcftools_h) $(call_h) $(prob1_h) $(ploidy_h)
-vcfconcat.o: vcfconcat.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(HTSDIR)/htslib/kseq.h $(bcftools_h)
+vcfconcat.o: vcfconcat.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(HTSDIR)/htslib/kseq.h $(htslib_bgzf_h) $(htslib_tbx_h) $(bcftools_h)
vcfconvert.o: vcfconvert.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) $(tsv2vcf_h)
vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) rbuf.h
vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h)
@@ -157,7 +157,7 @@ vcfcnv.o: vcfcnv.c $(cnv_h)
vcfsom.o: vcfsom.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h)
vcfstats.o: vcfstats.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(bcftools_h)
vcfview.o: vcfview.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h)
-reheader.o: reheader.c $(htslib_vcf_h) $(htslib_bgzf_h) $(HTSDIR)/htslib/kseq.h $(bcftools_h)
+reheader.o: reheader.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(HTSDIR)/htslib/kseq.h $(bcftools_h)
tabix.o: tabix.c $(htslib_bgzf_h) $(htslib_tbx_h)
ccall.o: ccall.c $(HTSDIR)/htslib/kfunc.h $(call_h) kmin.h $(prob1_h)
convert.o: convert.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(convert_h)
diff --git a/bcftools.h b/bcftools.h
index 6f22272..d4e856d 100644
--- a/bcftools.h
+++ b/bcftools.h
@@ -26,6 +26,7 @@ THE SOFTWARE. */
#define BCFTOOLS_H
#include <stdarg.h>
+#include <htslib/hts_defs.h>
#include <htslib/vcf.h>
#include <math.h>
@@ -37,7 +38,7 @@ THE SOFTWARE. */
#define FT_STDIN (1<<3)
char *bcftools_version(void);
-void error(const char *format, ...);
+void error(const char *format, ...) HTS_NORETURN;
void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
const char *hts_bcf_wmode(int file_type);
diff --git a/consensus.c b/consensus.c
index 7a615fe..051f353 100644
--- a/consensus.c
+++ b/consensus.c
@@ -623,7 +623,7 @@ int main_consensus(int argc, char *argv[])
{"chain",1,0,'c'},
{0,0,0,0}
};
- char c;
+ int c;
while ((c = getopt_long(argc, argv, "h?s:1iH:f:o:m:c:",loptions,NULL)) >= 0)
{
switch (c)
diff --git a/doc/bcftools.1 b/doc/bcftools.1
index d2783d4..a85c9a1 100644
--- a/doc/bcftools.1
+++ b/doc/bcftools.1
@@ -2,12 +2,12 @@
.\" Title: bcftools
.\" Author: [see the "AUTHORS" section]
.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
-.\" Date: 2015-12-15 14:02 GMT
+.\" Date: 2016-04-18 14:18 BST
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
-.TH "BCFTOOLS" "1" "2015\-12\-15 14:02 GMT" "\ \&" "\ \&"
+.TH "BCFTOOLS" "1" "2016\-04\-18 14:18 BST" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Define some portability stuff
.\" -----------------------------------------------------------------
@@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica
BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&.
.SS "VERSION"
.sp
-This manual page was last updated \fB2015\-12\-15 14:02 GMT\fR and refers to bcftools git version \fB1\&.2\-191\-g6737c5c+\fR\&.
+This manual page was last updated \fB2016\-04\-18 14:18 BST\fR and refers to bcftools git version \fB1\&.3\-36\-g47e811c+\fR\&.
.SS "BCF1"
.sp
The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&.
@@ -393,6 +393,11 @@ Skip sites where FILTER column does not contain any of the strings listed in
\fI\&.,PASS\fR\&.
.RE
.PP
+\fB\-\-no\-version\fR
+.RS 4
+Do not append version and command line information to the output VCF header\&.
+.RE
+.PP
\fB\-o, \-\-output\fR \fIFILE\fR
.RS 4
When output consists of a single stream, write it to
@@ -402,7 +407,7 @@ rather than to standard output, where it is written by default\&.
.PP
\fB\-O, \-\-output\-type\fR \fIb\fR|\fIu\fR|\fIz\fR|\fIv\fR
.RS 4
-Output compressed BCF (\fIb\fR), uncompressed BCF (\fIu\fR), compressed VCF (\fIz\fR), uncompressed VCF (\fIv\fR)\&. Use the \-Ou option when piping between bcftools subcommands to speed up performance by removing unecessary compression/decompression and VCF←→BCF conversion\&.
+Output compressed BCF (\fIb\fR), uncompressed BCF (\fIu\fR), compressed VCF (\fIz\fR), uncompressed VCF (\fIv\fR)\&. Use the \-Ou option when piping between bcftools subcommands to speed up performance by removing unnecessary compression/decompression and VCF←→BCF conversion\&.
.RE
.PP
\fB\-r, \-\-regions\fR \fIchr\fR|\fIchr:pos\fR|\fIchr:from\-to\fR|\fIchr:from\-\fR[,\&...]
@@ -416,7 +421,7 @@ cannot be used in combination with
.PP
\fB\-R, \-\-regions\-file\fR \fIFILE\fR
.RS 4
-Regions can be specified either on command line or in a VCF, BED, or tab\-delimited file (the default)\&. The columns of the tab\-delimited file are: CHROM, POS, and, optionally, POS_TO, where positions are 1\-based and inclusive\&. Uncompressed files are stored in memory, while bgzip\-compressed and tabix\-indexed region files are streamed\&. Note that sequence names must match exactly, "chr20" is not the same as "20"\&. Also note that chromosome ordering in
+Regions can be specified either on command line or in a VCF, BED, or tab\-delimited file (the default)\&. The columns of the tab\-delimited file are: CHROM, POS, and, optionally, POS_TO, where positions are 1\-based and inclusive\&. The columns of the tab\-delimited BED file are also CHROM, POS and POS_TO (trailing columns are ignored), but coordinates are 0\-based, half\-open\&. To indicate that a file be treated as BED rather than the 1\-based tab\-delimited file, the file must have th [...]
\fIFILE\fR
will be respected, the VCF will be processed in the order in which chromosomes first appear in
\fIFILE\fR\&. However, within chromosomes, the VCF will always be processed in ascending genomic coordinate order no matter what order they appear in
@@ -546,7 +551,7 @@ or
.RE
.SS "bcftools annotate \fI[OPTIONS]\fR \fIFILE\fR"
.sp
-This command allows to add or remove annotations\&.
+Add or remove annotations\&.
.PP
\fB\-a, \-\-annotations\fR \fIfile\fR
.RS 4
@@ -637,6 +642,12 @@ annotate sites which are present ("+") or absent ("\-") in the
file with a new INFO/TAG flag
.RE
.PP
+\fB\-\-no\-version\fR
+.RS 4
+see
+\fBCommon Options\fR
+.RE
+.PP
\fB\-o, \-\-output\fR \fIFILE\fR
.RS 4
see
@@ -861,6 +872,12 @@ This command replaces the former \fBbcftools view\fR caller\&. Some of the origi
\fBFile format options:\fR
.RS 4
.PP
+\fB\-\-no\-version\fR
+.RS 4
+see
+\fBCommon Options\fR
+.RE
+.PP
\fB\-o, \-\-output\fR \fIFILE\fR
.RS 4
see
@@ -1041,7 +1058,7 @@ calling model (conflicts with
likelihood of novel mutation for constrained
\fB\-C\fR
\fItrio\fR
-calling\&. The trio genotype calling maximizes likelihood of a particular combination of genotypes for father, mother and the child P(F=i,M=j,C=k) = P(unconstrained) * Pn + P(constrained) * (1\-Pn)\&. By providing three values, the mutation rate Pn is set explictly for SNPs, deletions and insertions, respectively\&. If two values are given, the first is interpreted as the mutation rate of SNPs and the second is used to calculate the mutation rate of indels according to their length as Pn [...]
+calling\&. The trio genotype calling maximizes likelihood of a particular combination of genotypes for father, mother and the child P(F=i,M=j,C=k) = P(unconstrained) * Pn + P(constrained) * (1\-Pn)\&. By providing three values, the mutation rate Pn is set explicitly for SNPs, deletions and insertions, respectively\&. If two values are given, the first is interpreted as the mutation rate of SNPs and the second is used to calculate the mutation rate of indels according to their length as P [...]
.RE
.PP
\fB\-p, \-\-pval\-threshold\fR \fIfloat\fR
@@ -1076,7 +1093,7 @@ haploid output for males and skips females (requires PED file with
.RE
.SS "bcftools concat \fI[OPTIONS]\fR \fIFILE1\fR \fIFILE2\fR [\&...]"
.sp
-Concatenate or combine VCF/BCF files\&. All source files must have the same sample columns appearing in the same order\&. Can be used, for example, to concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel VCF into one\&. The input files must be sorted by chr and position\&. The files must be given in the correct order to produce sorted VCF on output unless the \fB\-a, \-\-allow\-overlaps\fR option is specified\&.
+Concatenate or combine VCF/BCF files\&. All source files must have the same sample columns appearing in the same order\&. Can be used, for example, to concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel VCF into one\&. The input files must be sorted by chr and position\&. The files must be given in the correct order to produce sorted VCF on output unless the \fB\-a, \-\-allow\-overlaps\fR option is specified\&. With the \-\-naive option, the files are concatenated [...]
.PP
\fB\-a, \-\-allow\-overlaps\fR
.RS 4
@@ -1110,6 +1127,17 @@ Read the list of files from a file\&.
Ligate phased VCFs by matching phase at overlapping haplotypes
.RE
.PP
+\fB\-\-no\-version\fR
+.RS 4
+see
+\fBCommon Options\fR
+.RE
+.PP
+\fB\-n, \-\-naive\fR
+.RS 4
+Concatenate BCF files without recompression\&. This is very fast but requires that all files have the same headers\&. This is because all tags and chromosome names in the BCF body rely on the implicit order of the contig and tag definitions in the header\&. Currently no sanity checks are in place and only works for compressed BCF files\&. Dangerous, use with caution\&.
+.RE
+.PP
\fB\-o, \-\-output\fR \fIFILE\fR
.RS 4
see
@@ -1273,6 +1301,12 @@ see
\fBVCF output options:\fR
.RS 4
.PP
+\fB\-\-no\-version\fR
+.RS 4
+see
+\fBCommon Options\fR
+.RE
+.PP
\fB\-o, \-\-output\fR \fIFILE\fR
.RS 4
see
@@ -1593,6 +1627,12 @@ is true\&. For valid expressions see
define behaviour at sites with existing FILTER annotations\&. The default mode replaces existing filters of failed sites with a new FILTER string while leaving sites which pass untouched when non\-empty and setting to "PASS" when the FILTER string is absent\&. The "+" mode appends new FILTER strings of failed sites instead of replacing them\&. The "x" mode resets filters of sites which pass to "PASS"\&. Modes "+" and "x" can both be set\&.
.RE
.PP
+\fB\-\-no\-version\fR
+.RS 4
+see
+\fBCommon Options\fR
+.RE
+.PP
\fB\-o, \-\-output\fR \fIFILE\fR
.RS 4
see
@@ -1641,6 +1681,12 @@ see
see
\fBCommon Options\fR
.RE
+.PP
+\fB\-\-threads\fR \fIINT\fR
+.RS 4
+see
+\fBCommon Options\fR
+.RE
.SS "bcftools gtcheck [\fIOPTIONS\fR] [\-g \fIgenotypes\&.vcf\&.gz\fR] \fIquery\&.vcf\&.gz\fR"
.sp
Checks sample identity or, without \fB\-g\fR, multi\-sample cross\-check is performed\&.
@@ -2066,6 +2112,12 @@ The option controls what types of multiallelic records can be created:
.RE
.\}
.PP
+\fB\-\-no\-version\fR
+.RS 4
+see
+\fBCommon Options\fR
+.RE
+.PP
\fB\-o, \-\-output\fR \fIFILE\fR
.RS 4
see
@@ -2142,6 +2194,12 @@ split multiallelic sites into biallelic records (\fI\-\fR) or join biallelic sit
\fIany\fR\&.
.RE
.PP
+\fB\-\-no\-version\fR
+.RS 4
+see
+\fBCommon Options\fR
+.RE
+.PP
\fB\-N, \-\-do\-not\-normalize\fR
.RS 4
the
@@ -2264,6 +2322,12 @@ see
\fBVCF output options:\fR
.RS 4
.PP
+\fB\-\-no\-version\fR
+.RS 4
+see
+\fBCommon Options\fR
+.RE
+.PP
\fB\-o, \-\-output\fR \fIFILE\fR
.RS 4
see
@@ -2512,7 +2576,7 @@ const char *about(void);
// Longer description used by \*(Aqbcftools +name \-h\*(Aq
const char *usage(void);
-// Called once at startup, allows to initialize local variables\&.
+// Called once at startup, allows initialization of local variables\&.
// Return 1 to suppress normal VCF/BCF header output, \-1 on critical
// errors, 0 otherwise\&.
int init(int argc, char **argv, bcf_hdr_t *in_hdr, bcf_hdr_t *out_hdr);
@@ -2558,7 +2622,7 @@ see
.PP
\fB\-s, \-\-sample\fR \fIstring\fR
.RS 4
-samply name
+sample name
.RE
.PP
\fB\-t, \-\-targets\fR \fILIST\fR
@@ -2781,7 +2845,7 @@ see
.PP
\fB\-s, \-\-samples\fR \fIFILE\fR
.RS 4
-new sample names, one name per line, in the same order as they appear in the VCF file\&. Alternatively, only samples which need to be renamed can be listed as "old_name new_name\en" pairs separated by whitespaces, each on separate line\&.
+new sample names, one name per line, in the same order as they appear in the VCF file\&. Alternatively, only samples which need to be renamed can be listed as "old_name new_name\en" pairs separated by whitespaces, each on a separate line\&. If a sample name contains spaces, the spaces can be escaped using the backslash character, for example "Not\e a\e good\e sample\e name"\&.
.RE
.SS "bcftools roh [\fIOPTIONS\fR] \fIfile\&.vcf\&.gz\fR"
.sp
@@ -2841,7 +2905,7 @@ in case allele frequency is not known, use the
.RS 4
use the specified INFO tag
\fITAG\fR
-as an allele frequency estimate instead of the defaul AC and AN tags\&. Sites which do not have
+as an allele frequency estimate instead of the default AC and AN tags\&. Sites which do not have
\fITAG\fR
will be skipped\&.
.RE
@@ -3098,6 +3162,12 @@ suppress the header in VCF output
compression level\&. 0 stands for uncompressed, 1 for best speed and 9 for best compression\&.
.RE
.PP
+\fB\-\-no\-version\fR
+.RS 4
+see
+\fBCommon Options\fR
+.RE
+.PP
\fB\-O, \-\-output\-type\fR \fIb\fR|\fIu\fR|\fIz\fR|\fIv\fR
.RS 4
see
@@ -3180,6 +3250,8 @@ see
.ps +1
\fBFilter options:\fR
.RS 4
+.sp
+Note that filter options below dealing with counting the number of alleles will, for speed, first check for the values of AC and AN in the INFO column to avoid parsing all the genotype (FORMAT/GT) fields in the VCF\&. This means that a filter like \fI\-\-min\-af 0\&.1\fR will be based \(oqAC/AN\(cq where AC and AN come from either INFO/AC and INFO/AN if available or FORMAT/GT if not\&. It will not filter on another field like INFO/AF\&. The \fI\-\-include\fR and \fI\-\-exclude\fR filter [...]
.PP
\fB\-c, \-\-min\-ac\fR \fIINT\fR[\fI:nref\fR|\fI:alt1\fR|\fI:minor\fR|\fI:major\fR|:\*(Aqnonmajor\*(Aq]
.RS 4
diff --git a/doc/bcftools.html b/doc/bcftools.html
index 012d670..d089aec 100644
--- a/doc/bcftools.html
+++ b/doc/bcftools.html
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp1931920"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipul [...]
+<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp25197440"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipu [...]
Call Format (VCF) and its binary counterpart BCF. All commands work
transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.</p><p>Most commands accept VCF, bgzipped VCF and BCF with filetype detected
automatically even when streaming from a pipe. Indexed VCF and BCF
@@ -8,7 +8,7 @@ will work in all situations. Un-indexed VCF and BCF and streams will
work in most, but not all situations. In general, whenever multiple VCFs are
read simultaneously, they must be indexed and therefore also compressed.</p><p>BCFtools is designed to work on a stream. It regards an input file "-" as the
standard input (stdin) and outputs to the standard output (stdout). Several
-commands can thus be combined with Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2015-12-15 14:02 GMT</strong></span> and refers to bcftools git version <span class="strong"><strong>1.2-191-g6737c5c+</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools <= 0.1.19 is <span class="strong"> [...]
+commands can thus be combined with Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2016-04-18 14:18 BST</strong></span> and refers to bcftools git version <span class="strong"><strong>1.3-36-g47e811c+</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools <= 0.1.19 is <span class="strong">< [...]
compatible with this version of bcftools. To read BCF1 files one can use
the view command from old versions of bcftools packaged with samtools
versions <= 0.1.19 to convert to VCF, which can then be read by
@@ -117,6 +117,10 @@ specific commands to see if they apply.</p><div class="variablelist"><dl><dt><sp
in <span class="emphasis"><em>LIST</em></span>. For example, to include only sites which have no filters set,
use <span class="strong"><strong>-f</strong></span> <span class="emphasis"><em>.,PASS</em></span>.
</dd><dt><span class="term">
+<span class="strong"><strong>--no-version</strong></span>
+</span></dt><dd>
+ Do not append version and command line information to the output VCF header.
+</dd><dt><span class="term">
<span class="strong"><strong>-o, --output</strong></span> <span class="emphasis"><em>FILE</em></span>
</span></dt><dd>
When output consists of a single stream, write it to <span class="emphasis"><em>FILE</em></span> rather than
@@ -126,7 +130,7 @@ specific commands to see if they apply.</p><div class="variablelist"><dl><dt><sp
</span></dt><dd>
Output compressed BCF (<span class="emphasis"><em>b</em></span>), uncompressed BCF (<span class="emphasis"><em>u</em></span>), compressed VCF (<span class="emphasis"><em>z</em></span>), uncompressed VCF (<span class="emphasis"><em>v</em></span>).
Use the -Ou option when piping between bcftools subcommands to speed up
- performance by removing unecessary compression/decompression and
+ performance by removing unnecessary compression/decompression and
VCF←→BCF conversion.
</dd><dt><span class="term">
<span class="strong"><strong>-r, --regions</strong></span> <span class="emphasis"><em>chr</em></span>|<span class="emphasis"><em>chr:pos</em></span>|<span class="emphasis"><em>chr:from-to</em></span>|<span class="emphasis"><em>chr:from-</em></span>[,…]
@@ -138,11 +142,15 @@ specific commands to see if they apply.</p><div class="variablelist"><dl><dt><sp
</span></dt><dd>
Regions can be specified either on command line or in a VCF, BED, or
tab-delimited file (the default). The columns of the tab-delimited file
- are: CHROM, POS, and, optionally, POS_TO, where positions are 1-based
- and inclusive. Uncompressed files are stored in memory, while
- bgzip-compressed and tabix-indexed region files are streamed. Note that
- sequence names must match exactly, "chr20" is not the same as "20".
- Also note that chromosome ordering in <span class="emphasis"><em>FILE</em></span> will be respected,
+ are: CHROM, POS, and, optionally, POS_TO, where positions are 1-based
+ and inclusive. The columns of the tab-delimited BED file are also
+ CHROM, POS and POS_TO (trailing columns are ignored), but coordinates
+ are 0-based, half-open. To indicate that a file be treated as BED rather
+ than the 1-based tab-delimited file, the file must have the ".bed" or
+ ".bed.gz" suffix (case-insensitive). Uncompressed files are stored in
+ memory, while bgzip-compressed and tabix-indexed region files are streamed.
+ Note that sequence names must match exactly, "chr20" is not the same as
+ "20". Also note that chromosome ordering in <span class="emphasis"><em>FILE</em></span> will be respected,
the VCF will be processed in the order in which chromosomes first appear
in <span class="emphasis"><em>FILE</em></span>. However, within chromosomes, the VCF will always be
processed in ascending genomic coordinate order no matter what order they
@@ -217,7 +225,7 @@ specific commands to see if they apply.</p><div class="variablelist"><dl><dt><sp
</span></dt><dd>
Number of output compression threads to use in addition to main thread.
Only used when <span class="emphasis"><em>--output-type</em></span> is <span class="emphasis"><em>b</em></span> or <span class="emphasis"><em>z</em></span>. Default: 0.
-</dd></dl></div></div><div class="refsect2" title="bcftools annotate [OPTIONS] FILE"><a id="annotate"></a><h3>bcftools annotate <span class="emphasis"><em>[OPTIONS]</em></span> <span class="emphasis"><em>FILE</em></span></h3><p>This command allows to add or remove annotations.</p><div class="variablelist"><dl><dt><span class="term">
+</dd></dl></div></div><div class="refsect2" title="bcftools annotate [OPTIONS] FILE"><a id="annotate"></a><h3>bcftools annotate <span class="emphasis"><em>[OPTIONS]</em></span> <span class="emphasis"><em>FILE</em></span></h3><p>Add or remove annotations.</p><div class="variablelist"><dl><dt><span class="term">
<span class="strong"><strong>-a, --annotations</strong></span> <span class="emphasis"><em>file</em></span>
</span></dt><dd>
Bgzip-compressed and tabix-indexed file with annotations. The file
@@ -288,6 +296,10 @@ specific commands to see if they apply.</p><div class="variablelist"><dl><dt><sp
</span></dt><dd>
annotate sites which are present ("+") or absent ("-") in the <span class="strong"><strong>-a</strong></span> file with a new INFO/TAG flag
</dd><dt><span class="term">
+<span class="strong"><strong>--no-version</strong></span>
+</span></dt><dd>
+ see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
+</dd><dt><span class="term">
<span class="strong"><strong>-o, --output</strong></span> <span class="emphasis"><em>FILE</em></span>
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
@@ -450,6 +462,10 @@ loss), 0 (complete loss), 3 (single-copy gain).</p><div class="refsect3" title="
functionality has been temporarily lost in the process of transition under
<a class="ulink" href="http://github.com/samtools/htslib" target="_top">htslib</a>, but will be added back on popular
demand. The original calling model can be invoked with the <span class="strong"><strong>-c</strong></span> option.</p><div class="refsect3" title="File format options:"><a id="_file_format_options"></a><h4>File format options:</h4><div class="variablelist"><dl><dt><span class="term">
+<span class="strong"><strong>--no-version</strong></span>
+</span></dt><dd>
+ see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
+</dd><dt><span class="term">
<span class="strong"><strong>-o, --output</strong></span> <span class="emphasis"><em>FILE</em></span>
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
@@ -566,7 +582,7 @@ demand. The original calling model can be invoked with the <span class="strong">
genotype calling maximizes likelihood of a particular combination of
genotypes for father, mother and the child
P(F=i,M=j,C=k) = P(unconstrained) * Pn + P(constrained) * (1-Pn).
- By providing three values, the mutation rate Pn is set explictly for SNPs,
+ By providing three values, the mutation rate Pn is set explicitly for SNPs,
deletions and insertions, respectively. If two values are given, the first
is interpreted as the mutation rate of SNPs and the second is used to
calculate the mutation rate of indels according to their length as
@@ -598,7 +614,9 @@ columns appearing in the same order. Can be used, for example, to
concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel
VCF into one. The input files must be sorted by chr and position. The files
must be given in the correct order to produce sorted VCF on output unless
-the <span class="strong"><strong>-a, --allow-overlaps</strong></span> option is specified.</p><div class="variablelist"><dl><dt><span class="term">
+the <span class="strong"><strong>-a, --allow-overlaps</strong></span> option is specified. With the --naive option, the files
+are concatenated without being recompressed, which is very fast but dangerous
+if the BCF headers differ.</p><div class="variablelist"><dl><dt><span class="term">
<span class="strong"><strong>-a, --allow-overlaps</strong></span>
</span></dt><dd>
First coordinate of the next file can precede last record of the current file.
@@ -624,6 +642,18 @@ the <span class="strong"><strong>-a, --allow-overlaps</strong></span> option is
</span></dt><dd>
Ligate phased VCFs by matching phase at overlapping haplotypes
</dd><dt><span class="term">
+<span class="strong"><strong>--no-version</strong></span>
+</span></dt><dd>
+ see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
+</dd><dt><span class="term">
+<span class="strong"><strong>-n, --naive</strong></span>
+</span></dt><dd>
+ Concatenate BCF files without recompression. This is very fast but requires
+ that all files have the same headers. This is because all tags and
+ chromosome names in the BCF body rely on the implicit order of the contig
+ and tag definitions in the header. Currently no sanity checks
+ are in place and only works for compressed BCF files. Dangerous, use with caution.
+</dd><dt><span class="term">
<span class="strong"><strong>-o, --output</strong></span> <span class="emphasis"><em>FILE</em></span>
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
@@ -714,6 +744,10 @@ the <span class="strong"><strong>-a, --allow-overlaps</strong></span> option is
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
</dd></dl></div></div><div class="refsect3" title="VCF output options:"><a id="_vcf_output_options"></a><h4>VCF output options:</h4><div class="variablelist"><dl><dt><span class="term">
+<span class="strong"><strong>--no-version</strong></span>
+</span></dt><dd>
+ see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
+</dd><dt><span class="term">
<span class="strong"><strong>-o, --output</strong></span> <span class="emphasis"><em>FILE</em></span>
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
@@ -907,6 +941,10 @@ And similarly here, the second is filtered:
strings of failed sites instead of replacing them. The "x" mode resets
filters of sites which pass to "PASS". Modes "+" and "x" can both be set.
</dd><dt><span class="term">
+<span class="strong"><strong>--no-version</strong></span>
+</span></dt><dd>
+ see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
+</dd><dt><span class="term">
<span class="strong"><strong>-o, --output</strong></span> <span class="emphasis"><em>FILE</em></span>
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
@@ -939,6 +977,10 @@ And similarly here, the second is filtered:
<span class="strong"><strong>-T, --targets-file</strong></span> <span class="emphasis"><em>file</em></span>
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
+</dd><dt><span class="term">
+<span class="strong"><strong>--threads</strong></span> <span class="emphasis"><em>INT</em></span>
+</span></dt><dd>
+ see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
</dd></dl></div></div><div class="refsect2" title="bcftools gtcheck [OPTIONS] [-g genotypes.vcf.gz] query.vcf.gz"><a id="gtcheck"></a><h3>bcftools gtcheck [<span class="emphasis"><em>OPTIONS</em></span>] [<span class="strong"><strong>-g</strong></span> <span class="emphasis"><em>genotypes.vcf.gz</em></span>] <span class="emphasis"><em>query.vcf.gz</em></span></h3><p>Checks sample identity or, without <span class="strong"><strong>-g</strong></span>, multi-sample cross-check is performed.< [...]
<span class="strong"><strong>-a, --all-sites</strong></span>
</span></dt><dd>
@@ -1165,6 +1207,10 @@ For "vertical" merge take a look at <span class="strong"><strong><a class="link"
-m both .. both SNP and indel records can be multiallelic
-m all .. SNP records can be merged with indel records
-m id .. merge by ID</pre><div class="variablelist"><dl><dt><span class="term">
+<span class="strong"><strong>--no-version</strong></span>
+</span></dt><dd>
+ see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
+</dd><dt><span class="term">
<span class="strong"><strong>-o, --output</strong></span> <span class="emphasis"><em>FILE</em></span>
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
@@ -1223,6 +1269,10 @@ the <span class="strong"><strong><a class="link" href="#fasta_ref">--fasta-ref</
<span class="emphasis"><em>both</em></span>; if SNPs and indels should be merged into a single record, specify
<span class="emphasis"><em>any</em></span>.
</dd><dt><span class="term">
+<span class="strong"><strong>--no-version</strong></span>
+</span></dt><dd>
+ see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
+</dd><dt><span class="term">
<span class="strong"><strong>-N, --do-not-normalize</strong></span><a id="do_not_normalize"></a>
</span></dt><dd>
the <span class="emphasis"><em>-c s</em></span> option can be used to fix or set the REF allele from the
@@ -1292,6 +1342,10 @@ the <span class="strong"><strong><a class="link" href="#fasta_ref">--fasta-ref</
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
</dd></dl></div></div><div class="refsect3" title="VCF output options:"><a id="_vcf_output_options_2"></a><h4>VCF output options:</h4><div class="variablelist"><dl><dt><span class="term">
+<span class="strong"><strong>--no-version</strong></span>
+</span></dt><dd>
+ see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
+</dd><dt><span class="term">
<span class="strong"><strong>-o, --output</strong></span> <span class="emphasis"><em>FILE</em></span>
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
@@ -1400,7 +1454,7 @@ const char *about(void);
// Longer description used by 'bcftools +name -h'
const char *usage(void);
-// Called once at startup, allows to initialize local variables.
+// Called once at startup, allows initialization of local variables.
// Return 1 to suppress normal VCF/BCF header output, -1 on critical
// errors, 0 otherwise.
int init(int argc, char **argv, bcf_hdr_t *in_hdr, bcf_hdr_t *out_hdr);
@@ -1427,7 +1481,7 @@ file for help.</p><div class="refsect3" title="General options:"><a id="_general
</dd><dt><span class="term">
<span class="strong"><strong>-s, --sample</strong></span> <span class="emphasis"><em>string</em></span>
</span></dt><dd>
- samply name
+ sample name
</dd><dt><span class="term">
<span class="strong"><strong>-t, --targets</strong></span> <span class="emphasis"><em>LIST</em></span>
</span></dt><dd>
@@ -1561,8 +1615,10 @@ bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%SAMPLE=%GT]\n' file.vcf.gz</pre><
</span></dt><dd>
new sample names, one name per line, in the same order as they appear
in the VCF file. Alternatively, only samples which need to be renamed
- can be listed as "old_name new_name\n" pairs separated by
- whitespaces, each on separate line.
+ can be listed as "old_name new_name\n" pairs separated by whitespaces,
+ each on a separate line. If a sample name contains spaces, the
+ spaces can be escaped using the backslash character, for example
+ "Not\ a\ good\ sample\ name".
</dd></dl></div></div><div class="refsect2" title="bcftools roh [OPTIONS] file.vcf.gz"><a id="roh"></a><h3>bcftools roh [<span class="emphasis"><em>OPTIONS</em></span>] <span class="emphasis"><em>file.vcf.gz</em></span></h3><p>A program for detecting runs of homo/autozygosity. Only bi-allelic sites
are considered.</p><div class="refsect3" title="The HMM model:"><a id="_the_hmm_model"></a><h4>The HMM model:</h4><pre class="screen">Notation:
D = Data, AZ = autozygosity, HW = Hardy-Weinberg (non-autozygosity),
@@ -1590,7 +1646,7 @@ Transition probabilities:
<span class="strong"><strong>--AF-tag</strong></span> <span class="emphasis"><em>TAG</em></span>
</span></dt><dd>
use the specified INFO tag <span class="emphasis"><em>TAG</em></span> as an allele frequency estimate
- instead of the defaul AC and AN tags. Sites which do not have <span class="emphasis"><em>TAG</em></span>
+ instead of the default AC and AN tags. Sites which do not have <span class="emphasis"><em>TAG</em></span>
will be skipped.
</dd><dt><span class="term">
<span class="strong"><strong>--AF-file</strong></span> <span class="emphasis"><em>FILE</em></span>
@@ -1759,6 +1815,10 @@ Convert between VCF and BCF. Former <span class="strong"><strong>bcftools subset
compression level. 0 stands for uncompressed, 1 for best speed and 9 for
best compression.
</dd><dt><span class="term">
+<span class="strong"><strong>--no-version</strong></span>
+</span></dt><dd>
+ see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
+</dd><dt><span class="term">
<span class="strong"><strong>-O, --output-type</strong></span> <span class="emphasis"><em>b</em></span>|<span class="emphasis"><em>u</em></span>|<span class="emphasis"><em>z</em></span>|<span class="emphasis"><em>v</em></span>
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
@@ -1803,7 +1863,14 @@ Convert between VCF and BCF. Former <span class="strong"><strong>bcftools subset
<span class="strong"><strong>-S, --samples-file</strong></span> <span class="emphasis"><em>FILE</em></span>
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
-</dd></dl></div></div><div class="refsect3" title="Filter options:"><a id="_filter_options"></a><h4>Filter options:</h4><div class="variablelist"><dl><dt><span class="term">
+</dd></dl></div></div><div class="refsect3" title="Filter options:"><a id="_filter_options"></a><h4>Filter options:</h4><p>Note that filter options below dealing with counting the number of alleles
+will, for speed, first check for the values of AC and AN in the INFO column to
+avoid parsing all the genotype (FORMAT/GT) fields in the VCF. This means
+that a filter like <span class="emphasis"><em>--min-af 0.1</em></span> will be based ‘AC/AN’ where AC and AN come
+from either INFO/AC and INFO/AN if available or FORMAT/GT if not. It will not
+filter on another field like INFO/AF. The <span class="emphasis"><em>--include</em></span> and <span class="emphasis"><em>--exclude</em></span> filter
+expressions should instead be used to explicitly filter based on fields in
+the INFO column, e.g. <span class="emphasis"><em>--exclude AF<0.1</em></span>.</p><div class="variablelist"><dl><dt><span class="term">
<span class="strong"><strong>-c, --min-ac</strong></span> <span class="emphasis"><em>INT</em></span>[<span class="emphasis"><em>:nref</em></span>|<span class="emphasis"><em>:alt1</em></span>|<span class="emphasis"><em>:minor</em></span>|<span class="emphasis"><em>:major</em></span>|:'nonmajor']
</span></dt><dd>
minimum allele count (INFO/AC) of sites to be printed.
diff --git a/doc/bcftools.txt b/doc/bcftools.txt
index e26692d..c91bb9c 100644
--- a/doc/bcftools.txt
+++ b/doc/bcftools.txt
@@ -159,6 +159,9 @@ specific commands to see if they apply.
in 'LIST'. For example, to include only sites which have no filters set,
use *-f* '.,PASS'.
+*--no-version*::
+ Do not append version and command line information to the output VCF header.
+
*-o, --output* 'FILE'::
When output consists of a single stream, write it to 'FILE' rather than
to standard output, where it is written by default.
@@ -166,7 +169,7 @@ specific commands to see if they apply.
*-O, --output-type* 'b'|'u'|'z'|'v'::
Output compressed BCF ('b'), uncompressed BCF ('u'), compressed VCF ('z'), uncompressed VCF ('v').
Use the -Ou option when piping between bcftools subcommands to speed up
- performance by removing unecessary compression/decompression and
+ performance by removing unnecessary compression/decompression and
VCF<-->BCF conversion.
*-r, --regions* 'chr'|'chr:pos'|'chr:from-to'|'chr:from-'[,...]::
@@ -176,11 +179,15 @@ specific commands to see if they apply.
*-R, --regions-file* 'FILE'::
Regions can be specified either on command line or in a VCF, BED, or
tab-delimited file (the default). The columns of the tab-delimited file
- are: CHROM, POS, and, optionally, POS_TO, where positions are 1-based
- and inclusive. Uncompressed files are stored in memory, while
- bgzip-compressed and tabix-indexed region files are streamed. Note that
- sequence names must match exactly, "chr20" is not the same as "20".
- Also note that chromosome ordering in 'FILE' will be respected,
+ are: CHROM, POS, and, optionally, POS_TO, where positions are 1-based
+ and inclusive. The columns of the tab-delimited BED file are also
+ CHROM, POS and POS_TO (trailing columns are ignored), but coordinates
+ are 0-based, half-open. To indicate that a file be treated as BED rather
+ than the 1-based tab-delimited file, the file must have the ".bed" or
+ ".bed.gz" suffix (case-insensitive). Uncompressed files are stored in
+ memory, while bgzip-compressed and tabix-indexed region files are streamed.
+ Note that sequence names must match exactly, "chr20" is not the same as
+ "20". Also note that chromosome ordering in 'FILE' will be respected,
the VCF will be processed in the order in which chromosomes first appear
in 'FILE'. However, within chromosomes, the VCF will always be
processed in ascending genomic coordinate order no matter what order they
@@ -263,7 +270,7 @@ specific commands to see if they apply.
[[annotate]]
=== bcftools annotate '[OPTIONS]' 'FILE'
-This command allows to add or remove annotations.
+Add or remove annotations.
*-a, --annotations* 'file'::
Bgzip-compressed and tabix-indexed file with annotations. The file
@@ -337,6 +344,9 @@ This command allows to add or remove annotations.
*-m, --mark-sites* [+-]'TAG'::
annotate sites which are present ("+") or absent ("-") in the *-a* file with a new INFO/TAG flag
+*--no-version*::
+ see *<<common_options,Common Options>>*
+
*-o, --output* 'FILE'::
see *<<common_options,Common Options>>*
@@ -498,6 +508,9 @@ demand. The original calling model can be invoked with the *-c* option.
==== File format options:
+*--no-version*::
+ see *<<common_options,Common Options>>*
+
*-o, --output* 'FILE'::
see *<<common_options,Common Options>>*
@@ -598,7 +611,7 @@ demand. The original calling model can be invoked with the *-c* option.
genotype calling maximizes likelihood of a particular combination of
genotypes for father, mother and the child
P(F=i,M=j,C=k) = P(unconstrained) * Pn + P(constrained) * (1-Pn).
- By providing three values, the mutation rate Pn is set explictly for SNPs,
+ By providing three values, the mutation rate Pn is set explicitly for SNPs,
deletions and insertions, respectively. If two values are given, the first
is interpreted as the mutation rate of SNPs and the second is used to
calculate the mutation rate of indels according to their length as
@@ -630,7 +643,10 @@ columns appearing in the same order. Can be used, for example, to
concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel
VCF into one. The input files must be sorted by chr and position. The files
must be given in the correct order to produce sorted VCF on output unless
-the *-a, --allow-overlaps* option is specified.
+the *-a, --allow-overlaps* option is specified. With the --naive option, the files
+are concatenated without being recompressed, which is very fast but dangerous
+if the BCF headers differ.
+
*-a, --allow-overlaps*::
First coordinate of the next file can precede last record of the current file.
@@ -651,6 +667,16 @@ the *-a, --allow-overlaps* option is specified.
*-l, --ligate*::
Ligate phased VCFs by matching phase at overlapping haplotypes
+*--no-version*::
+ see *<<common_options,Common Options>>*
+
+*-n, --naive*::
+ Concatenate BCF files without recompression. This is very fast but requires
+ that all files have the same headers. This is because all tags and
+ chromosome names in the BCF body rely on the implicit order of the contig
+ and tag definitions in the header. Currently no sanity checks
+ are in place and only works for compressed BCF files. Dangerous, use with caution.
+
*-o, --output* 'FILE'::
see *<<common_options,Common Options>>*
@@ -739,6 +765,9 @@ Create consensus sequence by applying VCF variants to a reference fasta file.
==== VCF output options:
+*--no-version*::
+ see *<<common_options,Common Options>>*
+
*-o, --output* 'FILE'::
see *<<common_options,Common Options>>*
@@ -936,6 +965,9 @@ And similarly here, the second is filtered:
strings of failed sites instead of replacing them. The "x" mode resets
filters of sites which pass to "PASS". Modes "+" and "x" can both be set.
+*--no-version*::
+ see *<<common_options,Common Options>>*
+
*-o, --output* 'FILE'::
see *<<common_options,Common Options>>*
@@ -961,7 +993,8 @@ And similarly here, the second is filtered:
*-T, --targets-file* 'file'::
see *<<common_options,Common Options>>*
-
+*--threads* 'INT'::
+ see *<<common_options,Common Options>>*
@@ -1209,6 +1242,9 @@ For "vertical" merge take a look at *<<norm,bcftools norm>>* instead.
-m id .. merge by ID
----
+*--no-version*::
+ see *<<common_options,Common Options>>*
+
*-o, --output* 'FILE'::
see *<<common_options,Common Options>>*
@@ -1262,6 +1298,9 @@ the *<<fasta_ref,--fasta-ref>>* option is supplied.
'both'; if SNPs and indels should be merged into a single record, specify
'any'.
+*--no-version*::
+ see *<<common_options,Common Options>>*
+
*-N, --do-not-normalize*[[do_not_normalize]]::
the '-c s' option can be used to fix or set the REF allele from the
reference '-f'. The '-N' option will not turn on indel normalisation
@@ -1323,6 +1362,9 @@ the *<<fasta_ref,--fasta-ref>>* option is supplied.
==== VCF output options:
+*--no-version*::
+ see *<<common_options,Common Options>>*
+
*-o, --output* 'FILE'::
see *<<common_options,Common Options>>*
@@ -1434,7 +1476,7 @@ const char *about(void);
// Longer description used by 'bcftools +name -h'
const char *usage(void);
-// Called once at startup, allows to initialize local variables.
+// Called once at startup, allows initialization of local variables.
// Return 1 to suppress normal VCF/BCF header output, -1 on critical
// errors, 0 otherwise.
int init(int argc, char **argv, bcf_hdr_t *in_hdr, bcf_hdr_t *out_hdr);
@@ -1467,7 +1509,7 @@ file for help.
see *<<common_options,Common Options>>*
*-s, --sample* 'string'::
- samply name
+ sample name
*-t, --targets* 'LIST'::
see *<<common_options,Common Options>>*
@@ -1594,9 +1636,10 @@ Modify header of VCF/BCF files, change sample names.
*-s, --samples* 'FILE'::
new sample names, one name per line, in the same order as they appear
in the VCF file. Alternatively, only samples which need to be renamed
- can be listed as "old_name new_name\n" pairs separated by
- whitespaces, each on separate line.
-
+ can be listed as "old_name new_name\n" pairs separated by whitespaces,
+ each on a separate line. If a sample name contains spaces, the
+ spaces can be escaped using the backslash character, for example
+ "Not\ a\ good\ sample\ name".
[[roh]]
@@ -1635,7 +1678,7 @@ Transition probabilities:
*--AF-tag* 'TAG'::
use the specified INFO tag 'TAG' as an allele frequency estimate
- instead of the defaul AC and AN tags. Sites which do not have 'TAG'
+ instead of the default AC and AN tags. Sites which do not have 'TAG'
will be skipped.
*--AF-file* 'FILE'::
@@ -1790,6 +1833,9 @@ Convert between VCF and BCF. Former *bcftools subset*.
compression level. 0 stands for uncompressed, 1 for best speed and 9 for
best compression.
+*--no-version*::
+ see *<<common_options,Common Options>>*
+
*-O, --output-type* 'b'|'u'|'z'|'v'::
see *<<common_options,Common Options>>*
@@ -1830,6 +1876,15 @@ Convert between VCF and BCF. Former *bcftools subset*.
==== Filter options:
+Note that filter options below dealing with counting the number of alleles
+will, for speed, first check for the values of AC and AN in the INFO column to
+avoid parsing all the genotype (FORMAT/GT) fields in the VCF. This means
+that a filter like '--min-af 0.1' will be based `AC/AN' where AC and AN come
+from either INFO/AC and INFO/AN if available or FORMAT/GT if not. It will not
+filter on another field like INFO/AF. The '--include' and '--exclude' filter
+expressions should instead be used to explicitly filter based on fields in
+the INFO column, e.g. '--exclude AF<0.1'.
+
*-c, --min-ac* 'INT'[':nref'|':alt1'|':minor'|':major'|:'nonmajor']::
minimum allele count (INFO/AC) of sites to be printed.
Specifying the type of allele is optional and can be set to
diff --git a/khash_str2str.h b/khash_str2str.h
index ecf4e0b..4a5bd12 100644
--- a/khash_str2str.h
+++ b/khash_str2str.h
@@ -1,6 +1,6 @@
/* khash_str2str.h -- C-string to C-string hash table.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014,2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -61,6 +61,23 @@ static inline void khash_str2str_destroy_free(void *_hash)
}
/*
+ * Destroys the hash structure, the keys and the values
+ */
+static inline void khash_str2str_destroy_free_all(void *_hash)
+{
+ khash_t(str2str) *hash = (khash_t(str2str)*)_hash;
+ khint_t k;
+ if (hash == 0) return;
+ for (k = 0; k < kh_end(hash); ++k)
+ if (kh_exist(hash, k))
+ {
+ free((char*)kh_key(hash, k));
+ free((char*)kh_val(hash, k));
+ }
+ kh_destroy(str2str, hash);
+}
+
+/*
* Returns value if key exists or NULL if not
*/
static inline char *khash_str2str_get(void *_hash, const char *str)
diff --git a/main.c b/main.c
index f08b5c7..1892c1d 100644
--- a/main.c
+++ b/main.c
@@ -1,6 +1,6 @@
/* main.c -- main bcftools command front-end.
- Copyright (C) 2012-2015 Genome Research Ltd.
+ Copyright (C) 2012-2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -219,7 +219,7 @@ int main(int argc, char *argv[])
if (argc < 2) { usage(stderr); return 1; }
if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
- printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2015 Genome Research Ltd.\n", bcftools_version(), hts_version());
+ printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2016 Genome Research Ltd.\n", bcftools_version(), hts_version());
#if USE_GPL
printf("License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n");
#else
diff --git a/peakfit.c b/peakfit.c
index 2bc8492..8094b5d 100644
--- a/peakfit.c
+++ b/peakfit.c
@@ -26,6 +26,7 @@
#include "peakfit.h"
#include <stdio.h>
+#include <gsl/gsl_version.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_multifit_nlin.h>
#include <htslib/hts.h>
@@ -550,9 +551,14 @@ double peakfit_run(peakfit_t *pkf, int nvals, double *xvals, double *yvals)
}
if ( ret ) break;
+#if GSL_MAJOR_VERSION >= 2
+ int info;
+ test1 = gsl_multifit_fdfsolver_test(solver, 1e-8,1e-8, 0.0, &info);
+#else
gsl_multifit_gradient(solver->J, solver->f, grad);
test1 = gsl_multifit_test_gradient(grad, 1e-8);
test2 = gsl_multifit_test_delta(solver->dx, solver->x, 1e-8, 1e-8);
+#endif
}
while ((test1==GSL_CONTINUE || test2==GSL_CONTINUE) && ++niter<niter_max);
if ( pkf->verbose >1 )
diff --git a/ploidy.c b/ploidy.c
index 160bc3e..719e175 100644
--- a/ploidy.c
+++ b/ploidy.c
@@ -1,5 +1,5 @@
-/*
- Copyright (C) 2014 Genome Research Ltd.
+/*
+ Copyright (C) 2014-2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -98,7 +98,7 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, v
ploidy->id2sex[ploidy->nsex-1] = strdup(ploidy->tmp_str.s);
sp->sex = khash_str2int_inc(ploidy->sex2id, ploidy->id2sex[ploidy->nsex-1]);
ploidy->sex2dflt = (int*) realloc(ploidy->sex2dflt,sizeof(int)*ploidy->nsex);
- ploidy->sex2dflt[ploidy->nsex-1] = ploidy->dflt;
+ ploidy->sex2dflt[ploidy->nsex-1] = -1;
}
ss = se;
@@ -106,8 +106,8 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, v
if ( !*se ) error("Could not parse: %s\n", line);
sp->ploidy = strtol(ss,&se,10);
if ( ss==se ) error("Could not parse: %s\n", line);
- if ( sp->ploidy < ploidy->min ) ploidy->min = sp->ploidy;
- if ( sp->ploidy > ploidy->max ) ploidy->max = sp->ploidy;
+ if ( ploidy->min<0 || sp->ploidy < ploidy->min ) ploidy->min = sp->ploidy;
+ if ( ploidy->max<0 || sp->ploidy > ploidy->max ) ploidy->max = sp->ploidy;
// Special case, chr="*" stands for a default value
if ( default_ploidy_def )
@@ -119,19 +119,32 @@ int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, v
return 0;
}
+static void _set_defaults(ploidy_t *ploidy, int dflt)
+{
+ int i;
+ if ( khash_str2int_get(ploidy->sex2id, "*", &i) == 0 ) dflt = ploidy->sex2dflt[i];
+ for (i=0; i<ploidy->nsex; i++)
+ if ( ploidy->sex2dflt[i]==-1 ) ploidy->sex2dflt[i] = dflt;
+
+ ploidy->dflt = dflt;
+ if ( ploidy->min<0 || dflt < ploidy->min ) ploidy->min = dflt;
+ if ( ploidy->max<0 || dflt > ploidy->max ) ploidy->max = dflt;
+}
+
ploidy_t *ploidy_init(const char *fname, int dflt)
{
ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
if ( !pld ) return NULL;
- pld->dflt = pld->min = pld->max = dflt;
+ pld->min = pld->max = -1;
pld->sex2id = khash_str2int_init();
pld->idx = regidx_init(fname,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
if ( !pld->idx )
{
ploidy_destroy(pld);
- pld = NULL;
+ return NULL;
}
+ _set_defaults(pld,dflt);
return pld;
}
@@ -140,7 +153,7 @@ ploidy_t *ploidy_init_string(const char *str, int dflt)
ploidy_t *pld = (ploidy_t*) calloc(1,sizeof(ploidy_t));
if ( !pld ) return NULL;
- pld->dflt = pld->min = pld->max = dflt;
+ pld->min = pld->max = -1;
pld->sex2id = khash_str2int_init();
pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
@@ -160,6 +173,7 @@ ploidy_t *ploidy_init_string(const char *str, int dflt)
regidx_insert(pld->idx,NULL);
free(tmp.s);
+ _set_defaults(pld,dflt);
return pld;
}
diff --git a/plot-vcfstats b/plot-vcfstats
index 5989062..74a42ca 100755
--- a/plot-vcfstats
+++ b/plot-vcfstats
@@ -174,12 +174,12 @@ sub parse_params
{
id=>'GCsS',
header=>'GCsS, Genotype concordance by sample (SNPs)',
- exp=>"# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches"
+ exp=>"# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared"
},
{
id=>'GCiS',
header=>'GCiS, Genotype concordance by sample (indels)',
- exp=>"# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches"
+ exp=>"# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared"
},
{
id=>'PSC',
@@ -189,7 +189,7 @@ sub parse_params
{
id=>'PSI',
header=>'PSI, Per-sample Indels',
- exp=>"# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio"
+ exp=>"# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nHets\t[9]nAA"
},
{
id=>'DP',
diff --git a/plugins/GTisec.c b/plugins/GTisec.c
new file mode 100644
index 0000000..eaa0b15
--- /dev/null
+++ b/plugins/GTisec.c
@@ -0,0 +1,470 @@
+/* plugins/GTisec.c -- collect genotype intersection counts of all possible
+ subsets of the present samples and output in banker's
+ sequence order (in this sequence, the number of contained
+ samples increases monotonically, a property that is e.g.
+ useful for programatically creating plotting files for the
+ R package VennDiagram or the plotting tool circos from the
+ counts, as in the command line tools bankers2VennDiagram and
+ bankers2circos at htpps://github.com/dlaehnemann/bankers2)
+
+ Copyright (C) 2016 Computational Biology of Infection Research,
+ Helmholtz Centre for Infection Research, Braunschweig,
+ Germany
+
+ Author: David Laehnemann <david.laehnemann at helmholtz-hzi.de>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <math.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/khash.h>
+KHASH_MAP_INIT_INT(gts2smps, uint32_t)
+
+#include "bcftools.h"
+
+/*!
+ * Flag definitions for args.flag
+ */
+#define MISSING (1<<0)
+#define VERBOSE (1<<1)
+#define SMPORDER (1<<2)
+
+typedef struct _args_t
+{
+ bcf_srs_t *file; /*! multi-sample VCF file */
+ bcf_hdr_t *hdr; /*! VCF file header */
+ FILE *out; /*! output file pointer */
+ int nsmp; /*! number of samples, can be determined from header but is needed in multiple contexts */
+ int nsmpp2; /*! 2^(nsmp) (is needed multiple times) */
+ int *gt_arr; /*! temporary array, to store GTs of current line/record */
+ int ngt_arr; /*! hold the number of current GT array entries */
+ uint32_t *bankers; /*! array to store banker's sequence for all possible sample subsets for
+ programmatic indexing into smp_is for output printing, e.g. for three
+ samples A, B and C this would be the following order:
+ [ C, B, A, CB, CA, BA, CBA ]
+ [ 100, 010, 001, 110, 101, 011, 111 ]
+ */
+ uint64_t *quick; /*! array to store n choose k lookup table of choose() function */
+ uint8_t flag; /*! several flags, for positions see above*/
+ uint64_t *missing_gts; /*! array to count missing genotypes of each sample */
+ uint64_t *smp_is; /*! array to track all possible intersections between
+ samples, with each bit in the index integer belonging to one
+ sample. E.g. for three samples A, B and C, count would be in
+ the following order:
+ [ A, B, AB, C, AC, BC, ABC ]
+ [ 001, 010, 011, 100, 101, 110, 111 ]
+ */
+}
+args_t;
+
+static args_t args;
+uint32_t compute_bankers(unsigned long a);
+
+const char *about(void)
+{
+ return "Count genotype intersections across all possible sample subsets in a vcf file.\n";
+}
+
+
+const char *usage(void)
+{
+ return
+ "\n"
+ "About: Count genotype intersections across all possible sample subsets in a vcf file.\n"
+ "Usage: bcftools +GTisec <multisample.bcf/.vcf.gz> [General Options] -- [Plugin Options] \n"
+ "\n"
+ "Options:\n"
+ " run \"bcftools plugin\" for a list of common options\n"
+ "\n"
+ "Plugin options:\n"
+ " -m, --missing if set, include count of missing genotypes per sample in output\n"
+ " -v, --verbose if set, annotate count rows with corresponding sample subset lists\n"
+ " -H, --human-readable if set, create human readable output; i.e. sort output by sample and\n"
+ " print each subset's intersection count once for each sample contained\n"
+ " in the subset; implies verbose output (-v)\n"
+ "\n"
+ "Example:\n"
+ " bcftools +GTisec in.vcf -- -v # for verbose output\n"
+ " bcftools +GTisec in.vcf -- -H # for human readable output\n"
+ "\n";
+}
+
+
+int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
+{
+ memset(&args,0,sizeof(args_t));
+ args.flag = 0;
+
+ static struct option loptions[] =
+ {
+ {"help", no_argument, 0,'h'},
+ {"missing", no_argument, 0,'m'},
+ {"verbose", no_argument, 0,'v'},
+ {"human-readable", no_argument, 0,'H'},
+ {0,0,0,0}
+ };
+
+ int c;
+ while ((c = getopt_long(argc, argv, "?mvHh",loptions,NULL)) >= 0)
+ {
+ switch (c)
+ {
+ case 'm': args.flag |= MISSING; break;
+ case 'v': args.flag |= VERBOSE; break;
+ case 'H': args.flag |= ( SMPORDER | VERBOSE ); break;
+ case 'h': usage(); break;
+ case '?':
+ default: error("%s", usage()); break;
+ }
+ }
+ if ( optind != argc ) usage(); // too many files given
+
+
+ args.hdr = in;
+
+ if ( !bcf_hdr_nsamples(args.hdr) )
+ {
+ error("No samples in input file.\n");
+ }
+
+ args.nsmp = bcf_hdr_nsamples(args.hdr);
+ if ( args.nsmp > 32 ) error("Too many samples. A maximum of 32 is supported.\n");
+ args.nsmpp2 = pow( 2, args.nsmp);
+ args.bankers = (uint32_t*) calloc( args.nsmpp2, sizeof(uint32_t) );
+ args.quick = (uint64_t*) calloc((args.nsmp * (args.nsmp + 1)) / 4, sizeof(unsigned long));
+ if ( args.flag & MISSING ) args.missing_gts = (uint64_t*) calloc( args.nsmp, sizeof(uint64_t));
+ args.smp_is = (uint64_t*) calloc( args.nsmpp2, sizeof(uint64_t));
+ if ( bcf_hdr_id2int(args.hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header\n", __func__);
+
+ args.gt_arr = NULL;
+ args.ngt_arr = 0;
+
+ args.out = stdout;
+
+ /*! Header printing */
+ FILE *fp = args.out;
+ fprintf(fp, "# This file was produced by bcftools +GTisec (%s+htslib-%s)\n", bcftools_version(), hts_version());
+ fprintf(fp, "# The command line was:\tbcftools +GTisec %s ", argv[0]);
+ int i;
+ for (i=1; i < argc; i++)
+ {
+ fprintf(fp, " %s", argv[i]);
+ }
+ fprintf(fp,"\n");
+ fprintf(fp,"# This file can be used as input to the subset plotting tools at:\n"
+ "# https://github.com/dlaehnemann/bankers2\n");
+ fprintf(fp,"# Genotype intersections across samples:\n");
+ fprintf(fp,"@SMPS");
+ for (i = args.nsmp-1; i >= 0; i--)
+ {
+ fprintf(fp," %s", bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, i));
+ }
+ fprintf(fp,"\n");
+ if ( args.flag & MISSING )
+ {
+ if ( args.flag & SMPORDER )
+ {
+ fprintf(fp, "# The first line of each sample contains its count of missing genotypes, with a '-' appended\n"
+ "# to the sample name.\n");
+ }
+ else
+ {
+ fprintf(fp, "# The first %i lines contain the counts for missing values of each sample in the order provided\n"
+ "# in the SMPS-line above. Intersection counts only start afterwards.\n", args.nsmp);
+ }
+ }
+ if ( args.flag & SMPORDER )
+ {
+ fprintf(fp, "# Human readable output (-H) was requested. Subset intersection counts are therefore sorted by\n"
+ "# sample and repeated for each contained sample. For each sample, counts are in banker's \n"
+ "# sequence order regarding all other samples.\n");
+ }
+ else
+ {
+ fprintf(fp, "# Subset intersection counts are in global banker's sequence order.\n");
+ if ( args.nsmp > 2 )
+ {
+ fprintf(fp, "# After exclusive sample counts in order of the SMPS-line, banker's sequence continues with:\n"
+ "# %s,%s %s,%s ...\n",
+ bcf_hdr_int2id(in, BCF_DT_SAMPLE, args.nsmp-1 ),
+ bcf_hdr_int2id(in, BCF_DT_SAMPLE, args.nsmp-2 ),
+ bcf_hdr_int2id(in, BCF_DT_SAMPLE, args.nsmp-1 ),
+ bcf_hdr_int2id(in, BCF_DT_SAMPLE, args.nsmp-3 )
+ );
+ }
+ }
+ if (args.flag & VERBOSE )
+ {
+ fprintf(fp,"# [1] Number of shared non-ref genotypes \t[2] Samples sharing non-ref genotype (GT)\n");
+ }
+ else
+ {
+ fprintf(fp,"# [1] Number of shared non-ref genotypes\n");
+ }
+
+ /* Compute banker's sequence for following printing by sample and
+ * with increasing subset size.
+ */
+ uint32_t j;
+ for ( j = 0; j < args.nsmpp2; j++ )
+ {
+ args.bankers[j] = compute_bankers(j);
+ }
+
+ return 1;
+}
+
+
+/* ADAPTED CODE FROM CORIN LAWSON (START)
+ * https://github.com/au-phiware/bankers/blob/master/c/bankers.c
+ * who implemented ideas of Eric Burnett:
+ * http://www.thelowlyprogrammer.com/2010/04/indexing-and-enumerating-subsets-of.html
+ */
+
+/*
+ * Compute the binomial coefficient of `n choose k'.
+ * Use the fact that binom(n, k) = binom(n, n - k).
+ * Use a lookup table (triangle, actually) for speed.
+ * Otherwise it's dumb (heart) recursion.
+ * Added relative to Corin Lawson:
+ * * Passing in of sample number through pointer to args struct
+ * * Make quick lookup table external to keep it persistent with clean allocation
+ * and freeing
+ */
+uint64_t choose(unsigned int n, unsigned int k) {
+ if (n == 0)
+ return 0;
+ if (n == k || k == 0)
+ return 1;
+ if (k > n / 2)
+ k = n - k;
+
+ unsigned int i = (n * (n - 1)) / 4 + k - 1;
+ if (args.quick[i] == 0)
+ args.quick[i] = choose(n - 1, k - 1) + choose(n - 1, k);
+
+ return args.quick[i];
+}
+
+/*
+ * Returns the Banker's number at the specified position, a.
+ * Derived from the recursive bit flip method.
+ * Added relative to Corin Lawson:
+ * * Uses same lookup table solution as choose function, just
+ * maintained externally to persist across separate function calls.
+ * * Uses bitwise symmetry of banker's sequence to use bitwise inversion
+ * instead of recursive bit flip for second half of sequence.
+ */
+uint32_t compute_bankers(unsigned long a)
+{
+ if (a == 0)
+ return 0;
+
+ if ( args.bankers[a] == 0 )
+ {
+ if ( a >= (args.nsmpp2 / 2) )
+ return args.bankers[a] = ( compute_bankers(args.nsmpp2 - (a+1)) ^ (args.nsmpp2 - 1) ); // use bitwise symmetry of bankers sequence
+ unsigned int c = 0;
+ uint32_t n = args.nsmp;
+ uint64_t e = a, binom;
+ binom = choose(n, c);
+ do {
+ e -= binom;
+ } while ((binom = choose(n, ++c)) <= e);
+
+ do {
+ if (e == 0 || (binom = choose(n - 1, c - 1)) > e)
+ c--, args.bankers[a] |= 1;
+ else
+ e -= binom;
+ } while (--n && c && ((args.bankers[a] <<= 1) || 1));
+ args.bankers[a] <<= n;
+ }
+
+ return args.bankers[a];
+}
+
+// ADAPTED CODE FROM CORIN LAWSON END
+
+
+/*
+ * GT field (genotype) comparison function.
+ */
+bcf1_t *process(bcf1_t *rec)
+{
+ uint64_t i;
+ bcf_unpack(rec, BCF_UN_FMT); // unpack the Format fields, including the GT field
+ int gte_smp = 0; // number GT array entries per sample (should be 2, one entry per allele)
+ if ( (gte_smp = bcf_get_genotypes(args.hdr, rec, &(args.gt_arr), &(args.ngt_arr) ) ) <= 0 )
+ {
+ error("GT not present at %s: %d\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos+1);
+ }
+
+ gte_smp /= args.nsmp; // divide total number of genotypes array entries (= args.ngt_arr) by number of samples
+ int ret;
+
+ // stick all genotypes in a hash as keys and store up to 32 samples in a corresponding flag as its value
+ khiter_t bucket;
+ khash_t(gts2smps) *gts = kh_init(gts2smps); // create hash
+ for ( i = 0; i < args.nsmp; i++ )
+ {
+ int *gt_ptr = args.gt_arr + gte_smp * i;
+
+ if (bcf_gt_is_missing(gt_ptr[0]) || ( gte_smp == 2 && bcf_gt_is_missing(gt_ptr[1]) ) )
+ {
+ if ( args.flag & MISSING ) args.missing_gts[i]++; // count missing genotypes, if requested
+ continue; // don't do anything else for missing genotypes, their "sharing" gives no info...
+ }
+
+ int a = bcf_gt_allele(gt_ptr[0]);
+ int b;
+ if ( gte_smp == 2 ) // two entries available per sample, padded with missing values for haploid genotypes
+ {
+ b = bcf_gt_allele(gt_ptr[1]);
+ }
+ else if (gte_smp == 1 ) // use missing value for second entry in hash key generation below, if only one is available
+ {
+ b = bcf_gt_allele(bcf_int32_vector_end);
+ }
+ else
+ {
+ error("gtisec does not support ploidy higher than 2.\n");
+ }
+
+ int idx = bcf_alleles2gt(a,b); // generate genotype specific hash key
+
+ bucket = kh_get(gts2smps, gts, idx); // get the genotype's hash bucket
+
+ if ( bucket == kh_end(gts) ) { // means that key does not exist
+ bucket = kh_put(gts2smps, gts, idx, &ret); // create bucket with genotype index as key and return its iterator
+ kh_val(gts, bucket) = 0; // initialize the bucket with all sample bits unset
+ }
+ kh_value(gts, bucket) |= (1<<i); // set the sample's bit to 1 in this genotype's bucket
+ }
+
+ // iterate over genotypes and for each genotype increment the appropriate smp_is entry
+ for ( bucket = kh_begin(gts); bucket != kh_end(gts); ++bucket ) // iterate over all genotypes at this position
+ {
+ if ( kh_exist(gts, bucket) ) // for existing genotype buckets
+ {
+ uint32_t s = kh_val(gts, bucket); // get the 32 bit flag
+ args.smp_is[s]++; // add to the corresponding subset
+ }
+ }
+ kh_destroy(gts2smps, gts); // destroy hash
+
+ return NULL;
+}
+
+void destroy(void)
+{
+ int32_t i;
+ int s;
+
+ FILE *fp = args.out;
+
+ /* Printing to File */
+ if ( args.flag & SMPORDER )
+ {
+ /* Iterate over samples, printing out all subsets including
+ * the current sample, with the current sample first. This
+ * includes multiple printouts of the same sample but makes
+ * output more readable and is also needed for circos files
+ * printing.
+ */
+ for ( s = args.nsmp-1; s >= 0; s--)
+ {
+ if ( args.flag & MISSING ) // if missing genotype counts are requested, print them to standard output
+ {
+ fprintf(fp, "%"PRIu64"\t%s-\n", args.missing_gts[s], bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, s));
+ }
+ for ( i = 1; i < args.nsmpp2; i++ )
+ {
+ if ( (args.bankers[i]>>s) & 1 )
+ {
+ fprintf(fp, "%"PRIu64"\t", args.smp_is[ args.bankers[i] ]); // print out count of genotypes shared by samples in current banker's sequence position
+ int j;
+ /* Print sample list */
+ fprintf(fp, "%s", bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, s)); // print current sample first
+ for ( j = args.nsmp-1; j >= 0; j-- )
+ {
+ if ( (args.bankers[i] ^ (1<<s)) & (1<<j) ) // exclude current sample from printing again
+ {
+ fprintf(fp, ",%s", bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, j ) ); // print out sample list, starting with our current major sample
+ }
+ }
+ fprintf(fp, "\n" );
+ }
+ }
+ }
+ }
+ else if ( args.flag & VERBOSE )
+ {
+ if ( args.flag & MISSING ) // if missing genotype counts are requested, print them to standard output
+ {
+ for ( s = args.nsmp-1; s >= 0; s--)
+ {
+ fprintf(fp, "%"PRIu64"\t%s-\n", args.missing_gts[s], bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, s));
+ }
+ }
+ for ( i = 1; i < args.nsmpp2; i++ )
+ {
+ fprintf(fp, "%"PRIu64"\t", args.smp_is[ args.bankers[i] ]); // print out count of genotypes shared by samples in current banker's sequence position
+ int j = 0;
+ for ( s = args.nsmp-1; s >= 0; s--)
+ {
+ if ( (args.bankers[i]>>s) & 1 )
+ {
+ fprintf(fp, "%s%s", j ? "," : "", bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, s) ); // samples in specified order
+ j = 1;
+ }
+ }
+ fprintf(fp, "\n" );
+ }
+ }
+ else
+ {
+ if ( args.flag & MISSING ) // if missing genotype counts are requested, print them to standard output
+ {
+ for ( s = args.nsmp-1; s >= 0; s--)
+ {
+ fprintf(fp, "%"PRIu64"\n", args.missing_gts[s]);
+ }
+ }
+ for ( i = 1; i < args.nsmpp2; i++ )
+ {
+ fprintf(fp, "%"PRIu64"\n", args.smp_is[ args.bankers[i] ]); // print out count of genotypes shared by samples in current banker's sequence position
+ }
+ }
+ fclose(fp);
+
+ /* freeing up args */
+ free(args.gt_arr);
+ free(args.bankers);
+ free(args.quick);
+ if (args.flag & MISSING) free(args.missing_gts);
+ free(args.smp_is);
+}
diff --git a/plugins/GTisec.mk b/plugins/GTisec.mk
new file mode 100644
index 0000000..58d1cc7
--- /dev/null
+++ b/plugins/GTisec.mk
@@ -0,0 +1,2 @@
+plugins/GTisec.so: plugins/GTisec.c version.h version.c
+ $(CC) $(PLUGIN_FLAGS) $(CFLAGS) $(EXTRA_CPPFLAGS) $(CPPFLAGS) $(LDFLAGS) -o $@ version.c $< $(LIBS)
diff --git a/plugins/color-chrs.c b/plugins/color-chrs.c
index 6a9175e..909065e 100644
--- a/plugins/color-chrs.c
+++ b/plugins/color-chrs.c
@@ -159,7 +159,7 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{"unrelated",1,0,'u'},
{0,0,0,0}
};
- char c;
+ int c;
while ((c = getopt_long(argc, argv, "?ht:u:p:",loptions,NULL)) >= 0)
{
switch (c)
diff --git a/plugins/fill-tags.c b/plugins/fill-tags.c
index 96f5e33..025ac32 100644
--- a/plugins/fill-tags.c
+++ b/plugins/fill-tags.c
@@ -37,6 +37,8 @@
#define SET_AC_Hom (1<<2)
#define SET_AC_Het (1<<3)
#define SET_AC_Hemi (1<<4)
+#define SET_AF (1<<5)
+#define SET_NS (1<<6)
typedef struct
{
@@ -47,8 +49,9 @@ counts_t;
typedef struct
{
bcf_hdr_t *in_hdr, *out_hdr;
- int tags, marr, mcounts, gt_id;
+ int tags, marr, mfarr, mcounts, gt_id;
int32_t *arr;
+ float *farr;
counts_t *counts;
}
args_t;
@@ -57,14 +60,14 @@ static args_t args;
const char *about(void)
{
- return "Set INFO tags AN, AC, AC_Hom, AC_Het, AC_Hemi.\n";
+ return "Set INFO tags AF, AN, AC, NS, AC_Hom, AC_Het, AC_Hemi.\n";
}
const char *usage(void)
{
return
"\n"
- "About: Set INFO tags AN, AC, AC_Hom, AC_Het, AC_Hemi.\n"
+ "About: Set INFO tags AF, AN, AC, NS, AC_Hom, AC_Het, AC_Hemi.\n"
"Usage: bcftools +fill-tags [General Options] -- [Plugin Options]\n"
"Options:\n"
" run \"bcftools plugin\" for a list of common options\n"
@@ -86,9 +89,11 @@ int parse_tags(args_t *args, const char *str)
{
if ( !strcasecmp(tags[i],"AN") ) flag |= SET_AN;
else if ( !strcasecmp(tags[i],"AC") ) flag |= SET_AC;
+ else if ( !strcasecmp(tags[i],"NS") ) flag |= SET_NS;
else if ( !strcasecmp(tags[i],"AC_Hom") ) flag |= SET_AC_Hom;
else if ( !strcasecmp(tags[i],"AC_Het") ) flag |= SET_AC_Het;
else if ( !strcasecmp(tags[i],"AC_Hemi") ) flag |= SET_AC_Hemi;
+ else if ( !strcasecmp(tags[i],"AF") ) flag |= SET_AF;
else
{
fprintf(stderr,"Error parsing \"--tags %s\": the tag \"%s\" is not supported\n", str,tags[i]);
@@ -112,7 +117,7 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{"tags",1,0,'t'},
{0,0,0,0}
};
- char c;
+ int c;
while ((c = getopt_long(argc, argv, "?ht:T:l:cd",loptions,NULL)) >= 0)
{
switch (c)
@@ -124,23 +129,25 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
}
}
if ( optind != argc ) error(usage());
- if ( !args.tags ) args.tags |= SET_AN|SET_AC|SET_AC_Hom|SET_AC_Het|SET_AC_Hemi;
+ if ( !args.tags ) args.tags |= SET_AN|SET_AC|SET_NS|SET_AC_Hom|SET_AC_Het|SET_AC_Hemi|SET_AF;
args.gt_id = bcf_hdr_id2int(args.in_hdr,BCF_DT_ID,"GT");
if ( args.gt_id<0 ) error("Error: GT field is not present\n");
if ( args.tags&SET_AN ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
if ( args.tags&SET_AC ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">");
+ if ( args.tags&SET_NS ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">");
if ( args.tags&SET_AC_Hom ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AC_Hom,Number=A,Type=Integer,Description=\"Allele counts in homozygous genotypes\">");
if ( args.tags&SET_AC_Het ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AC_Het,Number=A,Type=Integer,Description=\"Allele counts in heterozygous genotypes\">");
if ( args.tags&SET_AC_Hemi ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AC_Hemi,Number=A,Type=Integer,Description=\"Allele counts in hemizygous genotypes\">");
+ if ( args.tags&SET_AF ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele frequency\">");
return 0;
}
bcf1_t *process(bcf1_t *rec)
{
- int i;
+ int i, ns = 0;
bcf_unpack(rec, BCF_UN_FMT);
bcf_fmt_t *fmt_gt = NULL;
@@ -149,6 +156,7 @@ bcf1_t *process(bcf1_t *rec)
if ( !fmt_gt ) return rec; // no GT tag
hts_expand(int32_t,rec->n_allele,args.marr,args.arr);
+ hts_expand(float,rec->n_allele,args.mfarr,args.farr);
hts_expand(counts_t,rec->n_allele,args.mcounts,args.counts);
memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
memset(args.counts,0,sizeof(*args.counts)*rec->n_allele);
@@ -169,6 +177,7 @@ bcf1_t *process(bcf1_t *rec)
als |= (1<<idx); /* this breaks with too many alleles */ \
} \
if ( ial==0 ) continue; /* missing alleles */ \
+ ns++; \
int is_hom = als && !(als & (als-1)); /* only one bit is set */ \
int is_hemi = ial==1; \
for (ial=0; als; ial++) \
@@ -193,6 +202,11 @@ bcf1_t *process(bcf1_t *rec)
default: error("The GT type is not recognised: %d at %s:%d\n",fmt_gt->type, bcf_seqname(args.in_hdr,rec),rec->pos+1); break;
}
#undef BRANCH_INT
+ if ( args.tags&SET_NS )
+ {
+ if ( bcf_update_info_int32(args.out_hdr,rec,"NS",&ns,1)!=0 )
+ error("Error occurred while updating NS at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
+ }
if ( args.tags&SET_AN )
{
args.arr[0] = 0;
@@ -201,6 +215,23 @@ bcf1_t *process(bcf1_t *rec)
if ( bcf_update_info_int32(args.out_hdr,rec,"AN",args.arr,1)!=0 )
error("Error occurred while updating AN at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
}
+ if ( args.tags&SET_AF )
+ {
+ int n = rec->n_allele-1;
+ if ( n>0 )
+ {
+ args.arr[0] = 0;
+ for (i=0; i<rec->n_allele; i++)
+ args.arr[0] += args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi;
+ for (i=1; i<rec->n_allele; i++)
+ args.farr[i] = (args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi)*1.0/args.arr[0];
+ }
+ if ( args.arr[0] )
+ {
+ if ( bcf_update_info_float(args.out_hdr,rec,"AF",args.farr+1,n)!=0 )
+ error("Error occurred while updating AF at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
+ }
+ }
if ( args.tags&SET_AC )
{
int n = rec->n_allele-1;
@@ -208,7 +239,7 @@ bcf1_t *process(bcf1_t *rec)
{
memset(args.arr,0,sizeof(*args.arr)*rec->n_allele);
for (i=1; i<rec->n_allele; i++)
- args.arr[i] += args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi;
+ args.arr[i] = args.counts[i].nhet + args.counts[i].nhom + args.counts[i].nhemi;
}
if ( bcf_update_info_int32(args.out_hdr,rec,"AC",args.arr+1,n)!=0 )
error("Error occurred while updating AC at %s:%d\n", bcf_seqname(args.in_hdr,rec),rec->pos+1);
@@ -256,6 +287,7 @@ void destroy(void)
{
free(args.counts);
free(args.arr);
+ free(args.farr);
}
diff --git a/plugins/impute-info.c b/plugins/impute-info.c
index c3c74ec..ad5a0d1 100644
--- a/plugins/impute-info.c
+++ b/plugins/impute-info.c
@@ -1,6 +1,6 @@
/* plugins/impute-info.c -- adds info metrics to a VCF file.
- Copyright (C) 2015 Genome Research Ltd.
+ Copyright (C) 2015-2016 Genome Research Ltd.
Author: Shane McCarthy <sm15 at sanger.ac.uk>
@@ -121,13 +121,13 @@ bcf1_t *process(bcf1_t *rec)
return rec; // require biallelic diploid, return site unchanged
}
- float esum = 0, e2sum = 0, fsum = 0;
+ double esum = 0, e2sum = 0, fsum = 0;
#define BRANCH(type_t,is_missing,is_vector_end) \
{ \
type_t *ptr = (type_t*) buf; \
for (i=0; i<rec->n_sample; i++) \
{ \
- float vals[3] = {0,0,0}; \
+ double vals[3] = {0,0,0}; \
for (j=0; j<nret; j++) \
{ \
if ( is_missing || is_vector_end ) break; \
@@ -147,8 +147,8 @@ bcf1_t *process(bcf1_t *rec)
}
#undef BRANCH
- float theta = esum / (2 * (float)nval);
- float info = (theta>0 && theta<1) ? 1 - (fsum - e2sum) / (2 * (float)nval * theta * (1.0 - theta)) : 1;
+ double theta = esum / (2 * (double)nval);
+ float info = (theta>0 && theta<1) ? (float)(1 - (fsum - e2sum) / (2 * (double)nval * theta * (1.0 - theta))) : 1;
bcf_update_info_float(out_hdr, rec, "INFO", &info, 1);
nrec++;
diff --git a/plugins/mendelian.c b/plugins/mendelian.c
index 186e3e5..304a27a 100644
--- a/plugins/mendelian.c
+++ b/plugins/mendelian.c
@@ -100,7 +100,7 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{"count",0,0,'c'},
{0,0,0,0}
};
- char c;
+ int c;
while ((c = getopt_long(argc, argv, "?ht:T:l:cd",loptions,NULL)) >= 0)
{
switch (c)
diff --git a/plugins/tag2tag.c b/plugins/tag2tag.c
index ec89565..8178326 100644
--- a/plugins/tag2tag.c
+++ b/plugins/tag2tag.c
@@ -83,7 +83,7 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{"gl-to-pl",0,0,2},
{0,0,0,0}
};
- char c;
+ int c;
while ((c = getopt_long(argc, argv, "?hr",loptions,NULL)) >= 0)
{
switch (c)
@@ -118,7 +118,7 @@ bcf1_t *process(bcf1_t *rec)
for (i=0; i<n; i++)
{
if ( bcf_float_is_missing(farr[i]) || bcf_float_is_vector_end(farr[i]) ) continue;
- farr[i] = farr[i] ? log(farr[i]) : -99;
+ farr[i] = farr[i] ? log10(farr[i]) : -99;
}
bcf_update_format_float(out_hdr,rec,"GL",farr,n);
if ( drop_source_tag )
diff --git a/plugins/vcf2sex.c b/plugins/vcf2sex.c
index b9ce69d..9cb0ed5 100644
--- a/plugins/vcf2sex.c
+++ b/plugins/vcf2sex.c
@@ -105,8 +105,8 @@ const char *usage(void)
" Y 1 59373566 F 0\n"
" \n"
" bcftools +vcf2sex in.vcf.gz\n"
- " bcftools +vcf2sex in.vcf.gz -- -n 10\n"
- " bcftools +vcf2sex in.vcf.gz -- -g GT\n"
+ " bcftools +vcf2sex -n 10 in.vcf.gz\n"
+ " bcftools +vcf2sex -g GT in.vcf.gz\n"
"\n";
}
@@ -389,7 +389,8 @@ int run(int argc, char **argv)
{"background",1,0,'b'},
{0,0,0,0}
};
- char c, *tmp, *ploidy_fname = NULL;
+ char *tmp, *ploidy_fname = NULL;
+ int c;
while ((c = getopt_long(argc, argv, "p:n:g:m:vb:",loptions,NULL)) >= 0)
{
switch (c) {
@@ -420,8 +421,8 @@ int run(int argc, char **argv)
args->sr = bcf_sr_init();
args->sr->require_index = 1;
- if ( !argv[0] ) error("%s", usage());
- if ( !bcf_sr_add_reader(args->sr,argv[0]) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
+ if ( optind==argc ) error("%s", usage());
+ if ( !bcf_sr_add_reader(args->sr,argv[optind]) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
args->hdr = args->sr->readers[0].header;
args->nsample = bcf_hdr_nsamples(args->hdr);
diff --git a/polysomy.c b/polysomy.c
index c5a5394..93f4184 100644
--- a/polysomy.c
+++ b/polysomy.c
@@ -675,7 +675,8 @@ int main_polysomy(int argc, char *argv[])
{"regions-file",1,0,'R'},
{0,0,0,0}
};
- char c, *tmp;
+ char *tmp;
+ int c;
while ((c = getopt_long(argc, argv, "h?o:vt:T:r:R:s:f:p:c:im:b:n:S:",loptions,NULL)) >= 0)
{
switch (c)
diff --git a/reheader.c b/reheader.c
index f325188..0466c16 100644
--- a/reheader.c
+++ b/reheader.c
@@ -1,6 +1,6 @@
/* reheader.c -- reheader subcommand.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014,2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -34,6 +34,7 @@ THE SOFTWARE. */
#include <math.h>
#include <htslib/vcf.h>
#include <htslib/bgzf.h>
+#include <htslib/tbx.h> // for hts_get_bgzfp()
#include <htslib/kseq.h>
#include "bcftools.h"
#include "khash_str2str.h"
@@ -69,22 +70,41 @@ static void read_header_file(char *fname, kstring_t *hdr)
static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int idx)
{
int i, j, n;
+ kstring_t key = {0,0,0};
+ kstring_t val = {0,0,0};
// Are these samples "old-name new-name" pairs?
void *hash = khash_str2str_init();
for (i=0; i<nsamples; i++)
{
- char *key, *value;
- key = value = samples[i];
- while ( *value && !isspace(*value) ) value++;
- if ( !*value ) break;
- *value = 0; value++;
- while ( isspace(*value) ) value++;
- khash_str2str_set(hash,key,value);
+ char *ptr = samples[i];
+ key.l = val.l = 0;
+ int escaped = 0;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &key);
+ escaped = 0;
+ ptr++;
+ }
+ if ( !*ptr ) break;
+ ptr++;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &val);
+ escaped = 0;
+ ptr++;
+ }
+ khash_str2str_set(hash,strdup(key.s),strdup(val.s));
}
+ free(key.s);
+ free(val.s);
if ( i!=nsamples ) // not "old-name new-name" pairs
{
- khash_str2str_destroy(hash);
+ khash_str2str_destroy_free_all(hash);
return 0;
}
@@ -117,7 +137,7 @@ static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int id
char *ori = khash_str2str_get(hash,hdr->s+idx+j);
kputs(ori ? ori : hdr->s+idx+j, &tmp);
- if ( hash ) khash_str2str_destroy(hash);
+ khash_str2str_destroy_free_all(hash);
hdr->l = idx;
kputs(tmp.s, hdr);
@@ -183,7 +203,8 @@ static void reheader_vcf_gz(args_t *args)
if ( skip_until>=fp->block_length )
{
kputsn(buffer,skip_until,&hdr);
- if ( bgzf_read_block(fp) != 0 || !fp->block_length ) error("FIXME: No body in the file: %s\n", args->fname);
+ if ( bgzf_read_block(fp) != 0 ) error("Error reading %s\n", args->fname);
+ if ( !fp->block_length ) break;
skip_until = 0;
}
// The header has finished
@@ -197,7 +218,8 @@ static void reheader_vcf_gz(args_t *args)
if ( skip_until>=fp->block_length )
{
kputsn(buffer,fp->block_length,&hdr);
- if (bgzf_read_block(fp) != 0 || !fp->block_length) error("FIXME: No body in the file: %s\n", args->fname);
+ if ( bgzf_read_block(fp) != 0 ) error("Error reading %s\n", args->fname);
+ if ( !fp->block_length ) break;
skip_until = 0;
}
}
@@ -220,12 +242,8 @@ static void reheader_vcf_gz(args_t *args)
}
// Output the modified header
- BGZF *bgzf_out;
- if ( args->output_fname )
- bgzf_out = bgzf_open(args->output_fname,"w");
- else
- bgzf_out = bgzf_dopen(fileno(stdout), "w");
- bgzf_write(bgzf_out, hdr.s, hdr.l);
+ BGZF *bgzf_out = bgzf_open(args->output_fname ? args->output_fname : "-","w");;
+ if ( bgzf_write(bgzf_out, hdr.s, hdr.l) < 0 ) error("Can't write BGZF header (code %d)\n", bgzf_out->errcode);
free(hdr.s);
// Output all remainig data read with the header block
@@ -237,8 +255,8 @@ static void reheader_vcf_gz(args_t *args)
// Stream the rest of the file without as it is, without decompressing
ssize_t nread;
- int page_size = getpagesize();
- char *buf = (char*) valloc(page_size);
+ const size_t page_size = 32768;
+ char *buf = (char*) malloc(page_size);
while (1)
{
nread = bgzf_raw_read(fp, buf, page_size);
@@ -247,8 +265,8 @@ static void reheader_vcf_gz(args_t *args)
int count = bgzf_raw_write(bgzf_out, buf, nread);
if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
}
- if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
- if (bgzf_close(fp) < 0) error("Error: %d\n",fp->errcode);
+ if (bgzf_close(bgzf_out) < 0) error("Error closing %s: %d\n",args->output_fname ? args->output_fname : "-",bgzf_out->errcode);
+ if (hts_close(args->fp)) error("Error closing %s: %d\n",args->fname,fp->errcode);
free(buf);
}
static void reheader_vcf(args_t *args)
diff --git a/test/annotate10.hdr b/test/annotate10.hdr
new file mode 100644
index 0000000..2162e9d
--- /dev/null
+++ b/test/annotate10.hdr
@@ -0,0 +1,3 @@
+##FORMAT=<ID=FINT,Number=.,Type=Integer,Description="Test type">
+##FORMAT=<ID=FFLT,Number=.,Type=Float,Description="Test type">
+##FORMAT=<ID=FSTR,Number=.,Type=String,Description="Test type">
diff --git a/test/annotate10.out b/test/annotate10.out
new file mode 100644
index 0000000..c7f393b
--- /dev/null
+++ b/test/annotate10.out
@@ -0,0 +1,18 @@
+##fileformat=VCFv4.1
+##FILTER=<ID=PASS,Description="All filters passed">
+##FILTER=<ID=q99,Description="Quality below 10">
+##contig=<ID=1,assembly=b37,length=249250621>
+##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
+##INFO=<ID=FLAG,Number=0,Type=Flag,Description="Test type">
+##INFO=<ID=IINT,Number=.,Type=Integer,Description="Test type">
+##INFO=<ID=IFLT,Number=.,Type=Float,Description="Test type">
+##INFO=<ID=ISTR,Number=.,Type=String,Description="Test type">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=FINT,Number=.,Type=Integer,Description="Test type">
+##FORMAT=<ID=FFLT,Number=.,Type=Float,Description="Test type">
+##FORMAT=<ID=FSTR,Number=.,Type=String,Description="Test type">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B A
+1 3000001 . C T . . . GT:FINT:FFLT:FSTR .:.:.:. .:.:.:.
+1 3000002 id C T 99 q99 FLAG;IINT=88,99;IFLT=8.8,9.9;ISTR=888,999 GT:FINT:FFLT:FSTR 1|1:88,99:8.8,9.9:888,999 0|1:77:7.7:77
+1 3000003 id C T 99 q99 FLAG;IINT=88,99;IFLT=8.8,9.9;ISTR=888,999 GT:FINT:FFLT:FSTR 1|1:88,99:8.8,9.9:888,999 0|1:77:7.7:77
+1 3000004 id C T 99 q99 FLAG;IINT=88,99;IFLT=8.8,9.9;ISTR=888,999 GT:FINT:FFLT:FSTR 1|1:88,99:8.8,9.9:888,999 0|1:77:7.7:77
diff --git a/test/annotate10.vcf b/test/annotate10.vcf
new file mode 100644
index 0000000..0557da6
--- /dev/null
+++ b/test/annotate10.vcf
@@ -0,0 +1,17 @@
+##fileformat=VCFv4.1
+##FILTER=<ID=PASS,Description="All filters passed">
+##FILTER=<ID=q99,Description="Quality below 10">
+##contig=<ID=1,assembly=b37,length=249250621>
+##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
+##INFO=<ID=FLAG,Number=0,Type=Flag,Description="Test type">
+##INFO=<ID=IINT,Number=.,Type=Integer,Description="Test type">
+##INFO=<ID=IFLT,Number=.,Type=Float,Description="Test type">
+##INFO=<ID=ISTR,Number=.,Type=String,Description="Test type">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##bcftools_annotateVersion=1.2-207-g2038312-dirty+htslib-1.2.1-158-g6876a07-dirty
+##bcftools_annotateCommand=annotate -x FMT test/annots2.vcf
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B A
+1 3000001 . C T . . . GT . .
+1 3000002 id C T 99 q99 FLAG;IINT=88,99;IFLT=8.8,9.9;ISTR=888,999 GT 1|1 0|1
+1 3000003 id C T 99 q99 FLAG;IINT=88,99;IFLT=8.8,9.9;ISTR=888,999 GT 1|1 0|1
+1 3000004 id C T 99 q99 FLAG;IINT=88,99;IFLT=8.8,9.9;ISTR=888,999 GT 1|1 0|1
diff --git a/test/annots10.tab b/test/annots10.tab
new file mode 100644
index 0000000..5128f3c
--- /dev/null
+++ b/test/annots10.tab
@@ -0,0 +1,4 @@
+1 3000001 . . . . . .
+1 3000002 88,99 77 8.8,9.9 7.7 888,999 77
+1 3000003 88,99 77 8.8,9.9 7.7 888,999 77
+1 3000004 88,99 77 8.8,9.9 7.7 888,999 77
diff --git a/test/concat.1.a.vcf b/test/concat.1.a.vcf
index 4e9f2f0..77912a6 100644
--- a/test/concat.1.a.vcf
+++ b/test/concat.1.a.vcf
@@ -11,8 +11,6 @@
##samtoolsVersion=0.2.0-rc10+htslib-0.2.0-rc10
##samtoolsCommand=samtools mpileup -t INFO/DPR -C50 -pm3 -F0.2 -d10000 -ug -r 1:1-1000000 -b mpileup.2014-07-03//lists/chr1-pooled.list -f human_g1k_v37.fasta
##ALT=<ID=X,Description="Represents allele(s) other than observed.">
-##bcftools_callVersion=0.2.0-rc10-2-gcd94fde+htslib-0.2.0-rc10
-##bcftools_callCommand=call -vm -f GQ -S mpileup.2014-07-03//pooled/1/1:1-1000000.samples -
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A
1 100 . GTTT G 1806 q10 XX=11;DP=35 GT:GQ:DP 0/1:409:35
1 110 . C T,G 1792 Fail DP=32 GT:GQ:DP 0/1:245:32
diff --git a/test/concat.1.b.vcf b/test/concat.1.b.vcf
index f21547e..4bb26e4 100644
--- a/test/concat.1.b.vcf
+++ b/test/concat.1.b.vcf
@@ -2,8 +2,6 @@
##samtoolsVersion=0.2.0-rc10+htslib-0.2.0-rc10
##samtoolsCommand=samtools mpileup -t INFO/DPR -C50 -pm3 -F0.2 -d10000 -ug -r 1:1-1000000 -b mpileup.2014-07-03//lists/chr1-pooled.list -f human_g1k_v37.fasta
##ALT=<ID=X,Description="Represents allele(s) other than observed.">
-##bcftools_callVersion=0.2.0-rc10-2-gcd94fde+htslib-0.2.0-rc10
-##bcftools_callCommand=call -vm -f GQ -S mpileup.2014-07-03//pooled/1/1:1-1000000.samples -
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
diff --git a/test/fill-tags.2.out b/test/fill-tags.2.out
new file mode 100644
index 0000000..01fe9c9
--- /dev/null
+++ b/test/fill-tags.2.out
@@ -0,0 +1,49 @@
+##fileformat=VCFv4.1
+##FILTER=<ID=PASS,Description="All filters passed">
+##reference=file:///seq/references/1000Genomes-NCBI37.fasta
+##contig=<ID=11,length=135006516>
+##contig=<ID=20,length=63025520>
+##contig=<ID=X,length=155270560>
+##contig=<ID=Y,length=59373566>
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
+##INFO=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
+##INFO=<ID=Dels,Number=1,Type=Float,Description="Fraction of reads containing spanning deletions">
+##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
+##INFO=<ID=HRun,Number=1,Type=Integer,Description="Largest contiguous homopolymer run of variant allele in either direction">
+##INFO=<ID=HWE,Number=1,Type=Float,Description="Hardy-Weinberg equilibrium test (PMID:15789306)">
+##INFO=<ID=ICF,Number=1,Type=Float,Description="Inbreeding coefficient F">
+##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
+##INFO=<ID=IS,Number=2,Type=Float,Description="Maximum number of reads supporting an indel and fraction of indel reads">
+##INFO=<ID=MQ,Number=1,Type=Integer,Description="Root-mean-square mapping quality of covering reads">
+##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total mapping quality zero reads">
+##INFO=<ID=PV4,Number=4,Type=Float,Description="P-values for strand bias, baseQ bias, mapQ bias and tail distance bias">
+##INFO=<ID=QD,Number=1,Type=Float,Description="Variant confidence/quality by depth">
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="# high-quality bases">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
+##FILTER=<ID=StrandBias,Description="Min P-value for strand bias (INFO/PV4) [0.0001]">
+##FILTER=<ID=BaseQualBias,Description="Min P-value for baseQ bias (INFO/PV4) [1e-100]">
+##FILTER=<ID=MapQualBias,Description="Min P-value for mapQ bias (INFO/PV4) [0]">
+##FILTER=<ID=EndDistBias,Description="Min P-value for end distance bias (INFO/PV4) [0.0001]">
+##FILTER=<ID=MinAB,Description="Minimum number of alternate bases (INFO/DP4) [2]">
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele frequency">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
+11 2343543 . A . 999 PASS DP=100223;NS=3;AN=6 GT:PL:DP:GQ 0/0:0,255,255:193:99 0/0:0,255,255:211:99 0/0:0,255,255:182:99
+11 5464562 . C T 999 PASS DP=0;NS=0;AN=0;AC=0 GT:PL:DP:GQ ./.:0,0,0:.:. ./.:0,0,0:.:. ./.:0,0,0:.:.
+20 76962 rs6111385 T C 999 PASS DP4=110138,70822,421911,262673;DP=911531;Dels=0;FS=21.447;HWE=0.491006;ICF=-0.01062;MQ0=1;MQ=46;PV4=2.5e-09,0,0,1;QD=22.31;NS=3;AN=6;AF=0.833333;AC=5 GT:PL:DP:GQ 0/1:255,0,255:193:99 1/1:255,255,0:211:99 1/1:255,255,0:182:99
+20 126310 . ACC A 999 StrandBias;EndDistBias DP4=125718,95950,113812,80890;DP=461867;HWE=0.24036;ICF=0.01738;INDEL;IS=374,0.937343;MQ=49;PV4=9e-30,1,0,3.8e-13;QD=0.0172;AN=6;AC=4;NS=3;AF=0.666667 GT:DP:GQ:PL 0/1:117:99:255,0,132 0/1:111:99:255,0,139 1/1:78:99:255,213,0
+20 138125 rs2298108 G T 999 PASS DP4=174391,20849,82080,4950;DP=286107;Dels=0;FS=3200;HWE=0.199462;ICF=0.01858;MQ0=0;MQ=46;PV4=0,0,0,1;QD=17.22;AN=6;AC=4;NS=3;AF=0.666667 GT:PL:DP:GQ 0/1:135,0,163:66:99 0/1:140,0,255:71:99 1/1:255,199,0:66:99
+20 138148 rs2298109 C T 999 PASS DP4=194136,45753,94945,14367;DP=356657;Dels=0;FS=3200;HWE=0.177865;ICF=0.0198;MQ0=0;MQ=47;PV4=0,0,0,1;QD=14.57;AN=6;AC=4;NS=3;AF=0.666667 GT:PL:DP:GQ 0/1:195,0,255:87:99 0/1:192,0,255:82:99 1/1:255,235,0:78:99
+20 271225 . T TTTA,TA 999 StrandBias DP4=29281,42401,27887,29245;DP=272732;INDEL;IS=95,0.748031;MQ=47;PV4=0,1,0,1;QD=0.0948;AN=6;AC=2,2;NS=3;AF=0.333333,0.333333 GT:DP:GQ:PL 0/2:33:49:151,53,203,0,52,159 0/1:51:99:255,0,213,255,255,255 1/2:47:99:255,255,255,255,0,241
+20 304568 . C T 999 PASS DP4=16413,4543,945,156;DP=43557;Dels=0;FS=3200;HWE=0.076855;ICF=0.0213;MQ0=0;MQ=50;PV4=0,0,0,1;QD=15.45;AN=6;AC=4;NS=3;AF=0.666667 GT:PL:DP:GQ 0|1:95,0,255:90:99 0|1:192,0,255:13:99 1|1:255,95,0:60:99
+20 326891 . A AC 999 PASS DP4=125718,95950,113812,80890;DP=461867;HWE=0.24036;ICF=0.01738;INDEL;IS=374,0.937343;MQ=49;PV4=9e-30,1,0,3.8e-13;QD=0.0172;AN=4;AC=2;NS=2;AF=0.5 GT:DP:GQ:PL 0|1:117:99:255,0,132 0|1:111:99:255,0,139 ./.:.:.:.,.,.
+X 2928329 rs62584840 C T 999 PASS DP4=302,9137,32,1329;DP=11020;Dels=0;FS=13.38;HWE=0.284332;ICF=0.0253;MQ0=0;MQ=49;PV4=0.094,0,0,1;QD=18.61;AN=4;AC=1;NS=3;AF=0.25 GT:PL:DP:GQ 0:0,56:2:73 0:0,81:3:98 0/1:73,0,19:4:30
+X 2933066 rs61746890 G C 999 PASS DP4=69865,100561,461,783;DP=173729;Dels=0;FS=10.833;MQ0=0;MQ=50;PV4=0.005,3.6e-14,0,1;QD=15.33;AN=4;AC=1;NS=3;AF=0.25 GT:PL:DP:GQ 0:0,255:39:99 0:0,255:37:99 0/1:255,255,255:62:99
+X 2942109 rs5939407 T C 999 PASS DP4=23273,27816,40128,48208;DP=146673;Dels=0;FS=43.639;HWE=0.622715;ICF=-0.01176;MQ0=1;MQ=46;PV4=0.65,1,0,1;QD=14.81;AN=4;AC=3;NS=3;AF=0.75 GT:PL:DP:GQ 0:0,255:20:99 1:255,0:33:99 1/1:255,157,0:52:99
+X 3048719 . T C 999 PASS DP4=13263,27466,40128,48208;DP=146673;Dels=0;FS=43.639;HWE=0.622715;ICF=-0.01176;MQ0=1;MQ=46;PV4=0.65,1,0,1;QD=14.81;AN=4;AC=2;NS=3;AF=0.5 GT:PL:DP:GQ 0:0,255:20:99 1:255,0:33:99 0|1:255,0,157:52:99
+Y 8657215 . C A 999 PASS DP4=74915,114274,1948,2955;DP=195469;Dels=0;FS=3.181;MQ0=0;MQ=50;PV4=0.86,1,0,1;QD=33.77;AN=2;AC=1;NS=2;AF=0.5 GT:PL:DP:GQ 0:0,255:47:99 1:255,0:64:99 .:.:.:.
+Y 10011673 rs78249411 G A 999 MinAB DP4=47351,30839,178796,279653;DP=550762;Dels=0;FS=41.028;MQ0=37362;MQ=26;PV4=0,0,0,1;QD=17.45;AN=2;AC=2;NS=2;AF=1 GT:PL:DP:GQ 1:126,101:146:37 1:95,0:130:99 .:.:.:.
diff --git a/test/norm.fa b/test/norm.fa
index 53e6b16..2eecd7f 100644
--- a/test/norm.fa
+++ b/test/norm.fa
@@ -24,3 +24,5 @@ TCCCCTCTTGACCTCTCTCTATTTTTTTTTTTTTTTCTGAGATGGATTTTTGCTCTTGTT
GTCTCAAAAAAAAAAAAAAAAAAAAGAAAAG
>21
TTTATTATTATTATTATTAAATTGAATTTATTTAGTGTACATACATTCATGTGTATTGTG
+>22
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
diff --git a/test/norm.fa.fai b/test/norm.fa.fai
index 40b06b7..da3fb85 100644
--- a/test/norm.fa.fai
+++ b/test/norm.fa.fai
@@ -5,3 +5,4 @@
4 60 985 60 61
5 31 1070 31 32
21 60 1106 60 61
+22 30 1171 30 31
diff --git a/test/norm.merge.out b/test/norm.merge.out
index f5b1092..449daba 100644
--- a/test/norm.merge.out
+++ b/test/norm.merge.out
@@ -36,7 +36,7 @@
1 105 . TAAACCCTAAA TAA,TAACCCTAAA 999 PASS INDEL;AN=4;AC=2,2;DP=19;ISTR=SomeString;XRF=1e+06,2e+06,500000;XRI=1111,2222,5555;XRS=AAA,BBB,DDD;XAF=1e+06,500000;XAI=1111,5555;XAS=AAA,DDD;XGF=1e+06,2e+06,3e+06,500000,.,9e+09;XGI=1111,2222,3333,5555,.,9999;XGS=A,B,C,E,.,F GT:PL:DP:FRF:FRI:FRS:FAF:FAI:FAS:FGF:FGI:FGS 1/2:1,2,3,4,.,6:1:1e+06,2e+06,500000:1111,2222,5555:AAAA,BBB,CC:1e+06,500000:1111,5555:A,BB:1e+06,2e+06,3e+06,500000,.,9e+09:1111,2222,3333,5555,.,9999:A,BB,CCC,EEEE,.,FFFFF 1/2: [...]
2 1 . GGGCGTCTCATAGCTGGAGCAATGGCGAGCGCCTGGACAAGGGAGGGGAAGGGGTTCTTATTACTGACGCGGGTAGCCCCTACTGCTGTGTGGTTCCCCTATTTTTTTTTTTTTCTTTTTGAGACGGAGTCTCGCTCTGTCACCCAGGCTGGAGTGCAGTGGCACAATCTCGGCTCACTGCAAGCTCCACCT ACGT 999 PASS INDEL;AN=4;AC=2 GT:DP 1/0:1 1/0:1
2 101 . ATTTTTTTTTTTTT ATTTTTTTTTTTTTTT 999 PASS INDEL;AN=4;AC=4 GT:DP 1/1:1 1/1:1
-2 114 . TC TTCC,TTC 999 FAIL1 INDEL;AN=4;AC=2,2 GT:DP 1/2:1 1/2:1
+2 114 . TC TTCC,TTC 999 FAIL1 INDEL;AN=4;AC=2,2 GT:DP:FGS 1/2:1:A,BB,CCC,EEEE,.,FFFFF 1/2:1:AA,BB,CCC,EEEE,.,FFFFF
2 115 . C T 999 PASS INDEL;AN=4;AC=4 GT:DP 1/1:1 1/1:1
20 3 . GATG CTATG,GACT 999 PASS INDEL;AN=4;AC=2,2 GT 2/1 2/1
20 5 id0001;id0002 TGGG TAC,TG,TGGGG,AC . PASS INDEL;AN=4;AC=2,2,0,0 GT:PL:DP 1/2:1,2,3,4,.,6,7,.,.,10,11,.,.,.,15:1 1/2:1,2,3,4,.,6,7,.,.,10,11,.,.,.,15:1
diff --git a/test/norm.merge.strict.out b/test/norm.merge.strict.out
index 1395d22..92f6f88 100644
--- a/test/norm.merge.strict.out
+++ b/test/norm.merge.strict.out
@@ -36,7 +36,7 @@
1 105 . TAAACCCTAAA TAA,TAACCCTAAA 999 PASS INDEL;AN=4;AC=2,2;DP=19;ISTR=SomeString;XRF=1e+06,2e+06,500000;XRI=1111,2222,5555;XRS=AAA,BBB,DDD;XAF=1e+06,500000;XAI=1111,5555;XAS=AAA,DDD;XGF=1e+06,2e+06,3e+06,500000,.,9e+09;XGI=1111,2222,3333,5555,.,9999;XGS=A,B,C,E,.,F GT:PL:DP:FRF:FRI:FRS:FAF:FAI:FAS:FGF:FGI:FGS 1/2:1,2,3,4,.,6:1:1e+06,2e+06,500000:1111,2222,5555:AAAA,BBB,CC:1e+06,500000:1111,5555:A,BB:1e+06,2e+06,3e+06,500000,.,9e+09:1111,2222,3333,5555,.,9999:A,BB,CCC,EEEE,.,FFFFF 1/2: [...]
2 1 . GGGCGTCTCATAGCTGGAGCAATGGCGAGCGCCTGGACAAGGGAGGGGAAGGGGTTCTTATTACTGACGCGGGTAGCCCCTACTGCTGTGTGGTTCCCCTATTTTTTTTTTTTTCTTTTTGAGACGGAGTCTCGCTCTGTCACCCAGGCTGGAGTGCAGTGGCACAATCTCGGCTCACTGCAAGCTCCACCT ACGT 999 PASS INDEL;AN=4;AC=2 GT:DP 1/0:1 1/0:1
2 101 . ATTTTTTTTTTTTT ATTTTTTTTTTTTTTT 999 PASS INDEL;AN=4;AC=4 GT:DP 1/1:1 1/1:1
-2 114 . TC TTCC,TTC 999 PASS INDEL;AN=4;AC=2,2 GT:DP 1/2:1 1/2:1
+2 114 . TC TTCC,TTC 999 PASS INDEL;AN=4;AC=2,2 GT:DP:FGS 1/2:1:A,BB,CCC,EEEE,.,FFFFF 1/2:1:AA,BB,CCC,EEEE,.,FFFFF
2 115 . C T 999 PASS INDEL;AN=4;AC=4 GT:DP 1/1:1 1/1:1
20 3 . GATG CTATG,GACT 999 PASS INDEL;AN=4;AC=2,2 GT 2/1 2/1
20 5 id0001;id0002 TGGG TAC,TG,TGGGG,AC . PASS INDEL;AN=4;AC=2,2,0,0 GT:PL:DP 1/2:1,2,3,4,.,6,7,.,.,10,11,.,.,.,15:1 1/2:1,2,3,4,.,6,7,.,.,10,11,.,.,.,15:1
diff --git a/test/norm.merge.vcf b/test/norm.merge.vcf
index e89178e..c702f6d 100644
--- a/test/norm.merge.vcf
+++ b/test/norm.merge.vcf
@@ -37,8 +37,8 @@
1 105 . TAAACCCTAAA TAACCCTAAA 999 PASS INDEL;AN=4;AC=2;DP=19;ISTR=SomeString;XRF=1e+06,500000;XRI=1111,5555;XRS=AAA,DDD;XAF=500000;XAI=5555;XAS=DDD;XGF=1e+06,500000,9e+09;XGI=1111,5555,9999;XGS=A,E,F GT:PL:DP:FRF:FRI:FRS:FAF:FAI:FAS:FGF:FGI:FGS 0/1:1,4,6:1:1e+06,500000:1111,5555:AAAA,CC:500000:5555:BB:1e+06,500000,9e+09:1111,5555,9999:A,EEEE,FFFFF 0/1:1,4,6:1:1e+06,500000:1111,5555:AAAA,CC:500000:5555:BB:1e+06,500000,9e+09:1111,5555,9999:A,EEEE,FFFFF
2 1 . GGGCGTCTCATAGCTGGAGCAATGGCGAGCGCCTGGACAAGGGAGGGGAAGGGGTTCTTATTACTGACGCGGGTAGCCCCTACTGCTGTGTGGTTCCCCTATTTTTTTTTTTTTCTTTTTGAGACGGAGTCTCGCTCTGTCACCCAGGCTGGAGTGCAGTGGCACAATCTCGGCTCACTGCAAGCTCCACCT ACGT 999 PASS INDEL;AN=4;AC=2 GT:DP 1/0:1 1/0:1
2 101 . ATTTTTTTTTTTTT ATTTTTTTTTTTTTTT 999 PASS INDEL;AN=4;AC=4 GT:DP 1/1:1 1/1:1
-2 114 . TC TTCC 999 FAIL1 INDEL;AN=4;AC=2 GT:DP 1/0:1 1/0:1
-2 114 . TC TTC 999 PASS INDEL;AN=4;AC=2 GT:DP 0/1:1 0/1:1
+2 114 . TC TTCC 999 FAIL1 INDEL;AN=4;AC=2 GT:DP:FGS 1/0:1:A,BB,CCC 1/0:1:AA,BB,CCC
+2 114 . TC TTC 999 PASS INDEL;AN=4;AC=2 GT:DP:FGS 0/1:1:A,EEEE,FFFFF 0/1:1:AA,EEEE,FFFFF
2 115 . C T 999 PASS INDEL;AN=4;AC=4 GT:DP 1/1:1 1/1:1
20 3 . G CT 999 PASS INDEL;AN=4;AC=2 GT 0/1 0/1
20 3 . GATG GACT 999 PASS INDEL;AN=4;AC=2 GT 1/0 1/0
diff --git a/test/norm.telomere.out b/test/norm.telomere.out
new file mode 100644
index 0000000..59431fa
--- /dev/null
+++ b/test/norm.telomere.out
@@ -0,0 +1,5 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=22,length=30>
+#CHROM POS ID REF ALT QUAL FILTER INFO
+22 1 . NN N . . .
diff --git a/test/norm.telomere.vcf b/test/norm.telomere.vcf
new file mode 100644
index 0000000..55ff94b
--- /dev/null
+++ b/test/norm.telomere.vcf
@@ -0,0 +1,4 @@
+##fileformat=VCFv4.2
+##contig=<ID=22,length=30>
+#CHROM POS ID REF ALT QUAL FILTER INFO
+22 17 . NN N . . .
diff --git a/test/norm.merge.strict.out b/test/reheader.3.out
similarity index 63%
copy from test/norm.merge.strict.out
copy to test/reheader.3.out
index 1395d22..c520746 100644
--- a/test/norm.merge.strict.out
+++ b/test/reheader.3.out
@@ -1,12 +1,5 @@
-##fileformat=VCFv4.2
+##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
-##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
-##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
-##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled likelihood">
-##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Depth">
-##contig=<ID=1,length=2147483647>
-##contig=<ID=2,length=2147483647>
-##contig=<ID=20,length=2147483647>
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
@@ -30,23 +23,33 @@
##FORMAT=<ID=FGS,Number=G,Type=String,Description="Test Number=AGR in FORMAT">
##FORMAT=<ID=FSTR,Number=1,Type=String,Description="Test String in FORMAT">
##INFO=<ID=ISTR,Number=1,Type=String,Description="Test String in INFO">
-##FILTER=<ID=FAIL1,Description="Failed filter 1">
-##FILTER=<ID=FAIL2,Description="Failed filter 2">
-#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT XY00001 XY00002
-1 105 . TAAACCCTAAA TAA,TAACCCTAAA 999 PASS INDEL;AN=4;AC=2,2;DP=19;ISTR=SomeString;XRF=1e+06,2e+06,500000;XRI=1111,2222,5555;XRS=AAA,BBB,DDD;XAF=1e+06,500000;XAI=1111,5555;XAS=AAA,DDD;XGF=1e+06,2e+06,3e+06,500000,.,9e+09;XGI=1111,2222,3333,5555,.,9999;XGS=A,B,C,E,.,F GT:PL:DP:FRF:FRI:FRS:FAF:FAI:FAS:FGF:FGI:FGS 1/2:1,2,3,4,.,6:1:1e+06,2e+06,500000:1111,2222,5555:AAAA,BBB,CC:1e+06,500000:1111,5555:A,BB:1e+06,2e+06,3e+06,500000,.,9e+09:1111,2222,3333,5555,.,9999:A,BB,CCC,EEEE,.,FFFFF 1/2: [...]
-2 1 . GGGCGTCTCATAGCTGGAGCAATGGCGAGCGCCTGGACAAGGGAGGGGAAGGGGTTCTTATTACTGACGCGGGTAGCCCCTACTGCTGTGTGGTTCCCCTATTTTTTTTTTTTTCTTTTTGAGACGGAGTCTCGCTCTGTCACCCAGGCTGGAGTGCAGTGGCACAATCTCGGCTCACTGCAAGCTCCACCT ACGT 999 PASS INDEL;AN=4;AC=2 GT:DP 1/0:1 1/0:1
+##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled likelihood">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Depth">
+##contig=<ID=1,length=2147483647>
+##contig=<ID=2,length=2147483647>
+##contig=<ID=3,length=2147483647>
+##contig=<ID=4,length=2147483647>
+##contig=<ID=5,length=2147483647>
+##contig=<ID=20,length=2147483647>
+##FILTER=<ID=Test,Description="Test Filter">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A AA BB B
2 101 . ATTTTTTTTTTTTT ATTTTTTTTTTTTTTT 999 PASS INDEL;AN=4;AC=4 GT:DP 1/1:1 1/1:1
2 114 . TC TTCC,TTC 999 PASS INDEL;AN=4;AC=2,2 GT:DP 1/2:1 1/2:1
2 115 . C T 999 PASS INDEL;AN=4;AC=4 GT:DP 1/1:1 1/1:1
-20 3 . GATG CTATG,GACT 999 PASS INDEL;AN=4;AC=2,2 GT 2/1 2/1
-20 5 id0001;id0002 TGGG TAC,TG,TGGGG,AC . PASS INDEL;AN=4;AC=2,2,0,0 GT:PL:DP 1/2:1,2,3,4,.,6,7,.,.,10,11,.,.,.,15:1 1/2:1,2,3,4,.,6,7,.,.,10,11,.,.,.,15:1
-20 59 id0003 AG . 999 PASS AN=4 GT:PL:DP 0/0:0:4 0/0:0:4
+20 3 . G CT 999 PASS INDEL;AN=4;AC=2 GT 0/1 0/1
+20 3 . GATG GACT 999 PASS INDEL;AN=4;AC=2 GT 1/0 1/0
+20 5 . TGGG TAC,TG,TGGGG,AC . PASS INDEL;AN=4;AC=2,2,0,0 GT:PL:DP 1/2:1,2,3,4,5,6,7,8,9,10,11,12,13,14,15:1 1/2:1,2,3,4,5,6,7,8,9,10,11,12,13,14,15:1
+20 59 . AG . 999 PASS AN=4 GT:PL:DP 0/0:0:4 0/0:0:4
20 80 . CACAG CACAT 999 PASS AN=4;AC=2 GT:PL:DP 0/1:255,0,255:13 0/1:255,0,255:13
20 81 . A C 999 PASS AN=4;AC=2 GT:PL:DP 0/1:255,0,255:13 0/1:255,0,255:13
20 95 . TCACCG ACACCG 999 PASS AN=4;AC=2 GT:PL:DP 0/1:255,0,255:13 0/1:255,0,255:13
-20 95 . TCACCG AAAAAA 999 PASS AN=4;AC=2 GT:PL:DP 0/1:255,0,255:13 0/1:255,0,255:13
-20 273 . CAAAAAAAAAAAAAAAAAAAAA CAAAAAAAAAAAAAAAAAAAAAAA,CAAAAAAAAAAAAAAAAAAAAAAAA 999 PASS INDEL;AN=4;AC=2,2 GT:PL:DP 1/2:0,3,5,3,.,5:1 1/2:0,3,5,3,.,5:1
+20 95 . TCACCG AAAAAA 999 Test AN=4;AC=2 GT:PL:DP 0/1:255,0,255:13 0/1:255,0,255:13
+20 273 . CAAAAAAAAAAAAAAAAAAAAA CAAAAAAAAAAAAAAAAAAAAAAA,CAAAAAAAAAAAAAAAAAAAAAAAA 999 PASS INDEL;AN=4;AC=2,2 GT:PL:DP 1/2:0,3,5,3,5,5:1 1/2:0,3,5,3,5,5:1
20 274 . AAAAAAAAA AAAAAAAAAAAAAAAAAAA 999 PASS INDEL;AN=0;AC=0 GT:PL:DP ./.:0,0,0:0 ./.:0,0,0:0
-20 275 . A C,G 0 FAIL1;FAIL2 INDEL;AN=2;AC=0,2 GT:PL:DP:FGF:FGI:FGS:FSTR 2:0,0,0:0:1e+06,2e+06,3e+06:1111,2222,3333:A,BB,CCC:WORD 2:0,0,0:0:1e+06,2e+06,3e+06:1111,2222,3333:A,BB,CCC:WORD
20 278 . AAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAA 999 PASS INDEL;AN=0;AC=0 GT:PL:DP ./.:0,0,0:0 ./.:0,0,0:0
-20 300 . A C,G 999 PASS INDEL;AN=0;AC=0,0 GT:PL:DP ./.:0,0,0,0,.,0:0 ./.:0,0,0,0,.,0:0
+3 10 . GTGGAC GTGGACACAC,GTGGACAC,GTGGACACACAC,GTGG,GTGGACACACACAC,ATGGACACACAC 999 PASS INDEL;AN=0 GT:DP ./.:0 ./.:0
+3 15 . CACA CAC 999 PASS INDEL;AN=0 GT:DP ./.:0 ./.:0
+4 21 . ATTTTTTTTTTTTTTTC ATTTTTTTTTTTTTTC,ATTTTTTTTTTTTTTTT,ATTTTTTTTTTTTTTTTC 999 PASS INDEL;AN=0 GT:DP ./.:0 ./.:0
+5 22 . A AGA 999 PASS INDEL;AN=0 GT:DP ./.:0 ./.:0
diff --git a/test/norm.merge.strict.out b/test/reheader.4.out
similarity index 63%
copy from test/norm.merge.strict.out
copy to test/reheader.4.out
index 1395d22..8b7c1ed 100644
--- a/test/norm.merge.strict.out
+++ b/test/reheader.4.out
@@ -1,12 +1,5 @@
-##fileformat=VCFv4.2
+##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
-##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
-##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
-##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled likelihood">
-##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Depth">
-##contig=<ID=1,length=2147483647>
-##contig=<ID=2,length=2147483647>
-##contig=<ID=20,length=2147483647>
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
@@ -30,23 +23,33 @@
##FORMAT=<ID=FGS,Number=G,Type=String,Description="Test Number=AGR in FORMAT">
##FORMAT=<ID=FSTR,Number=1,Type=String,Description="Test String in FORMAT">
##INFO=<ID=ISTR,Number=1,Type=String,Description="Test String in INFO">
-##FILTER=<ID=FAIL1,Description="Failed filter 1">
-##FILTER=<ID=FAIL2,Description="Failed filter 2">
-#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT XY00001 XY00002
-1 105 . TAAACCCTAAA TAA,TAACCCTAAA 999 PASS INDEL;AN=4;AC=2,2;DP=19;ISTR=SomeString;XRF=1e+06,2e+06,500000;XRI=1111,2222,5555;XRS=AAA,BBB,DDD;XAF=1e+06,500000;XAI=1111,5555;XAS=AAA,DDD;XGF=1e+06,2e+06,3e+06,500000,.,9e+09;XGI=1111,2222,3333,5555,.,9999;XGS=A,B,C,E,.,F GT:PL:DP:FRF:FRI:FRS:FAF:FAI:FAS:FGF:FGI:FGS 1/2:1,2,3,4,.,6:1:1e+06,2e+06,500000:1111,2222,5555:AAAA,BBB,CC:1e+06,500000:1111,5555:A,BB:1e+06,2e+06,3e+06,500000,.,9e+09:1111,2222,3333,5555,.,9999:A,BB,CCC,EEEE,.,FFFFF 1/2: [...]
-2 1 . GGGCGTCTCATAGCTGGAGCAATGGCGAGCGCCTGGACAAGGGAGGGGAAGGGGTTCTTATTACTGACGCGGGTAGCCCCTACTGCTGTGTGGTTCCCCTATTTTTTTTTTTTTCTTTTTGAGACGGAGTCTCGCTCTGTCACCCAGGCTGGAGTGCAGTGGCACAATCTCGGCTCACTGCAAGCTCCACCT ACGT 999 PASS INDEL;AN=4;AC=2 GT:DP 1/0:1 1/0:1
+##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled likelihood">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Depth">
+##contig=<ID=1,length=2147483647>
+##contig=<ID=2,length=2147483647>
+##contig=<ID=3,length=2147483647>
+##contig=<ID=4,length=2147483647>
+##contig=<ID=5,length=2147483647>
+##contig=<ID=20,length=2147483647>
+##FILTER=<ID=Test,Description="Test Filter">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT XY00001 BB
2 101 . ATTTTTTTTTTTTT ATTTTTTTTTTTTTTT 999 PASS INDEL;AN=4;AC=4 GT:DP 1/1:1 1/1:1
2 114 . TC TTCC,TTC 999 PASS INDEL;AN=4;AC=2,2 GT:DP 1/2:1 1/2:1
2 115 . C T 999 PASS INDEL;AN=4;AC=4 GT:DP 1/1:1 1/1:1
-20 3 . GATG CTATG,GACT 999 PASS INDEL;AN=4;AC=2,2 GT 2/1 2/1
-20 5 id0001;id0002 TGGG TAC,TG,TGGGG,AC . PASS INDEL;AN=4;AC=2,2,0,0 GT:PL:DP 1/2:1,2,3,4,.,6,7,.,.,10,11,.,.,.,15:1 1/2:1,2,3,4,.,6,7,.,.,10,11,.,.,.,15:1
-20 59 id0003 AG . 999 PASS AN=4 GT:PL:DP 0/0:0:4 0/0:0:4
+20 3 . G CT 999 PASS INDEL;AN=4;AC=2 GT 0/1 0/1
+20 3 . GATG GACT 999 PASS INDEL;AN=4;AC=2 GT 1/0 1/0
+20 5 . TGGG TAC,TG,TGGGG,AC . PASS INDEL;AN=4;AC=2,2,0,0 GT:PL:DP 1/2:1,2,3,4,5,6,7,8,9,10,11,12,13,14,15:1 1/2:1,2,3,4,5,6,7,8,9,10,11,12,13,14,15:1
+20 59 . AG . 999 PASS AN=4 GT:PL:DP 0/0:0:4 0/0:0:4
20 80 . CACAG CACAT 999 PASS AN=4;AC=2 GT:PL:DP 0/1:255,0,255:13 0/1:255,0,255:13
20 81 . A C 999 PASS AN=4;AC=2 GT:PL:DP 0/1:255,0,255:13 0/1:255,0,255:13
20 95 . TCACCG ACACCG 999 PASS AN=4;AC=2 GT:PL:DP 0/1:255,0,255:13 0/1:255,0,255:13
-20 95 . TCACCG AAAAAA 999 PASS AN=4;AC=2 GT:PL:DP 0/1:255,0,255:13 0/1:255,0,255:13
-20 273 . CAAAAAAAAAAAAAAAAAAAAA CAAAAAAAAAAAAAAAAAAAAAAA,CAAAAAAAAAAAAAAAAAAAAAAAA 999 PASS INDEL;AN=4;AC=2,2 GT:PL:DP 1/2:0,3,5,3,.,5:1 1/2:0,3,5,3,.,5:1
+20 95 . TCACCG AAAAAA 999 Test AN=4;AC=2 GT:PL:DP 0/1:255,0,255:13 0/1:255,0,255:13
+20 273 . CAAAAAAAAAAAAAAAAAAAAA CAAAAAAAAAAAAAAAAAAAAAAA,CAAAAAAAAAAAAAAAAAAAAAAAA 999 PASS INDEL;AN=4;AC=2,2 GT:PL:DP 1/2:0,3,5,3,5,5:1 1/2:0,3,5,3,5,5:1
20 274 . AAAAAAAAA AAAAAAAAAAAAAAAAAAA 999 PASS INDEL;AN=0;AC=0 GT:PL:DP ./.:0,0,0:0 ./.:0,0,0:0
-20 275 . A C,G 0 FAIL1;FAIL2 INDEL;AN=2;AC=0,2 GT:PL:DP:FGF:FGI:FGS:FSTR 2:0,0,0:0:1e+06,2e+06,3e+06:1111,2222,3333:A,BB,CCC:WORD 2:0,0,0:0:1e+06,2e+06,3e+06:1111,2222,3333:A,BB,CCC:WORD
20 278 . AAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAA 999 PASS INDEL;AN=0;AC=0 GT:PL:DP ./.:0,0,0:0 ./.:0,0,0:0
-20 300 . A C,G 999 PASS INDEL;AN=0;AC=0,0 GT:PL:DP ./.:0,0,0,0,.,0:0 ./.:0,0,0,0,.,0:0
+3 10 . GTGGAC GTGGACACAC,GTGGACAC,GTGGACACACAC,GTGG,GTGGACACACACAC,ATGGACACACAC 999 PASS INDEL;AN=0 GT:DP ./.:0 ./.:0
+3 15 . CACA CAC 999 PASS INDEL;AN=0 GT:DP ./.:0 ./.:0
+4 21 . ATTTTTTTTTTTTTTTC ATTTTTTTTTTTTTTC,ATTTTTTTTTTTTTTTT,ATTTTTTTTTTTTTTTTC 999 PASS INDEL;AN=0 GT:DP ./.:0 ./.:0
+5 22 . A AGA 999 PASS INDEL;AN=0 GT:DP ./.:0 ./.:0
diff --git a/test/reheader.empty.hdr b/test/reheader.empty.hdr
new file mode 100644
index 0000000..cd5a018
--- /dev/null
+++ b/test/reheader.empty.hdr
@@ -0,0 +1,7 @@
+##fileformat=VCFv4.1
+##contig=<ID=11,length=135006516>
+##contig=<ID=20,length=63025520>
+##contig=<ID=X,length=155270560>
+##contig=<ID=Y,length=59373566>
+##contig=<ID=Z,length=59373566>
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
diff --git a/test/reheader.empty.out b/test/reheader.empty.out
new file mode 100644
index 0000000..cbc236b
--- /dev/null
+++ b/test/reheader.empty.out
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.1
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=11,length=135006516>
+##contig=<ID=20,length=63025520>
+##contig=<ID=X,length=155270560>
+##contig=<ID=Y,length=59373566>
+##contig=<ID=Z,length=59373566>
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
diff --git a/test/reheader.samples3 b/test/reheader.samples3
new file mode 100644
index 0000000..b8f8700
--- /dev/null
+++ b/test/reheader.samples3
@@ -0,0 +1,2 @@
+XY00002 BB\ B
+XY00001 A\ AA
diff --git a/test/reheader.samples4 b/test/reheader.samples4
new file mode 100644
index 0000000..dcda975
--- /dev/null
+++ b/test/reheader.samples4
@@ -0,0 +1 @@
+XY00002 BB
diff --git a/test/test.pl b/test/test.pl
index 899ea16..a897afd 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -1,6 +1,6 @@
#!/usr/bin/env perl
#
-# Copyright (C) 2012-2015 Genome Research Ltd.
+# Copyright (C) 2012-2016 Genome Research Ltd.
#
# Author: Petr Danecek <pd3 at sanger.ac.uk>
#
@@ -103,6 +103,7 @@ test_vcf_norm($opts,in=>'norm.merge.2',out=>'norm.merge.2.out',args=>'-m+');
test_vcf_norm($opts,in=>'norm.merge.3',out=>'norm.merge.3.out',args=>'-m+');
test_vcf_norm($opts,in=>'norm.merge',out=>'norm.merge.strict.out',args=>'-m+ -s');
test_vcf_norm($opts,in=>'norm.setref',out=>'norm.setref.out',args=>'-Nc s',fai=>'norm');
+test_vcf_norm($opts,in=>'norm.telomere',out=>'norm.telomere.out',fai=>'norm');
test_vcf_view($opts,in=>'view',out=>'view.1.out',args=>'-aUc1 -C1 -s NA00002 -v snps',reg=>'');
test_vcf_view($opts,in=>'view',out=>'view.2.out',args=>'-f PASS -Xks NA00003',reg=>'-r20,Y');
test_vcf_view($opts,in=>'view',out=>'view.3.out',args=>'-xs NA00003',reg=>'');
@@ -175,17 +176,27 @@ test_vcf_annotate($opts,in=>'annotate3',out=>'annotate6.out',args=>'-x ID,QUAL,^
test_vcf_annotate($opts,in=>'annotate3',out=>'annotate7.out',args=>'-x FORMAT');
test_vcf_annotate($opts,in=>'annotate4',vcf=>'annots4',out=>'annotate8.out',args=>'-c +INFO');
test_vcf_annotate($opts,in=>'annotate4',tab=>'annots4',out=>'annotate8.out',args=>'-c CHROM,POS,REF,ALT,+FA,+FR,+IA,+IR,+SA,+SR');
-test_vcf_plugin($opts,in=>'plugin1',out=>'missing2ref.out',cmd=>'+missing2ref');
-test_vcf_plugin($opts,in=>'plugin1',out=>'missing2ref.out',cmd=>'+setGT',args=>'-- -t . -n 0');
+test_vcf_annotate($opts,in=>'annotate10',tab=>'annots10',out=>'annotate10.out',args=>'-c CHROM,POS,FMT/FINT,FMT/FFLT,FMT/FSTR');
+test_vcf_plugin($opts,in=>'plugin1',out=>'missing2ref.out',cmd=>'+missing2ref --no-version');
+test_vcf_plugin($opts,in=>'plugin1',out=>'missing2ref.out',cmd=>'+setGT --no-version',args=>'-- -t . -n 0');
test_vcf_annotate($opts,in=>'annotate9',tab=>'annots9',out=>'annotate9.out',args=>'-c CHROM,POS,REF,ALT,+ID');
-test_vcf_plugin($opts,in=>'plugin1',out=>'fill-AN-AC.out',cmd=>'+fill-AN-AC');
+test_vcf_plugin($opts,in=>'plugin1',out=>'fill-AN-AC.out',cmd=>'+fill-AN-AC --no-version');
test_vcf_plugin($opts,in=>'plugin1',out=>'dosage.out',cmd=>'+dosage');
-test_vcf_plugin($opts,in=>'fixploidy',out=>'fixploidy.out',cmd=>'+fixploidy',args=>'-- -s {PATH}/fixploidy.samples -p {PATH}/fixploidy.ploidy');
+test_vcf_plugin($opts,in=>'fixploidy',out=>'fixploidy.out',cmd=>'+fixploidy --no-version',args=>'-- -s {PATH}/fixploidy.samples -p {PATH}/fixploidy.ploidy');
test_vcf_plugin($opts,in=>'vcf2sex',out=>'vcf2sex.out',cmd=>'+vcf2sex',args=>'-- -n 5');
test_vcf_plugin($opts,in=>'vcf2sex',out=>'vcf2sex.out',cmd=>'+vcf2sex',args=>'-- -g GT');
test_vcf_plugin($opts,in=>'vcf2sex',out=>'vcf2sex.out',cmd=>'+vcf2sex',args=>'-- -g GT -n 5');
-test_vcf_plugin($opts,in=>'view.GL',out=>'view.PL.vcf',cmd=>'+tag2tag',args=>'-- -r --gl-to-pl');
-test_vcf_plugin($opts,in=>'merge.a',out=>'fill-tags.out',cmd=>'+fill-tags');
+test_vcf_plugin($opts,in=>'view.GL',out=>'view.PL.vcf',cmd=>'+tag2tag --no-version',args=>'-- -r --gl-to-pl');
+test_vcf_plugin($opts,in=>'merge.a',out=>'fill-tags.out',cmd=>'+fill-tags --no-version',args=>'-- -t AN,AC,AC_Hom,AC_Het,AC_Hemi');
+test_vcf_plugin($opts,in=>'view',out=>'fill-tags.2.out',cmd=>'+fill-tags --no-version',args=>'-- -t AC,AN,AF,NS');
+test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.out',cmd=>'+GTisec',args=>' | grep -v bcftools');
+test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.H.out',cmd=>'+GTisec',args=>'-- -H | grep -v bcftools');
+test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.Hm.out',cmd=>'+GTisec',args=>'-- -Hm | grep -v bcftools');
+test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.Hmv.out',cmd=>'+GTisec',args=>'-- -Hmv | grep -v bcftools');
+test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.Hv.out',cmd=>'+GTisec',args=>'-- -Hv | grep -v bcftools');
+test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.m.out',cmd=>'+GTisec',args=>'-- -m | grep -v bcftools');
+test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.mv.out',cmd=>'+GTisec',args=>'-- -mv | grep -v bcftools');
+test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.v.out',cmd=>'+GTisec',args=>'-- -v | grep -v bcftools');
test_vcf_concat($opts,in=>['concat.1.a','concat.1.b'],out=>'concat.1.vcf.out',do_bcf=>0,args=>'');
test_vcf_concat($opts,in=>['concat.1.a','concat.1.b'],out=>'concat.1.bcf.out',do_bcf=>1,args=>'');
test_vcf_concat($opts,in=>['concat.2.a','concat.2.b'],out=>'concat.2.vcf.out',do_bcf=>0,args=>'-a');
@@ -197,6 +208,9 @@ test_vcf_concat($opts,in=>['concat.3.a','concat.3.b','concat.3.0','concat.3.c','
test_vcf_reheader($opts,in=>'reheader',out=>'reheader.1.out',header=>'reheader.hdr');
test_vcf_reheader($opts,in=>'reheader',out=>'reheader.2.out',samples=>'reheader.samples');
test_vcf_reheader($opts,in=>'reheader',out=>'reheader.2.out',samples=>'reheader.samples2');
+test_vcf_reheader($opts,in=>'reheader',out=>'reheader.3.out',samples=>'reheader.samples3');
+test_vcf_reheader($opts,in=>'reheader',out=>'reheader.4.out',samples=>'reheader.samples4');
+test_vcf_reheader($opts,in=>'empty',out=>'reheader.empty.out',header=>'reheader.empty.hdr');
test_rename_chrs($opts,in=>'annotate');
test_vcf_convert($opts,in=>'convert',out=>'convert.gs.gt.gen',args=>'-g -,.');
test_vcf_convert($opts,in=>'convert',out=>'convert.gs.gt.samples',args=>'-g .,-');
@@ -431,8 +445,10 @@ sub test_vcf_idxstats
cmd("$$opts{bin}/bcftools view -Oz $$opts{path}/$args{in}.vcf > $$opts{tmp}/$args{in}.vcf.gz");
cmd("$$opts{bin}/bcftools index --tbi -f $$opts{tmp}/$args{in}.vcf.gz");
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools index $args{args} $$opts{tmp}/$args{in}.vcf.gz");
+ unlink("$$opts{tmp}/$args{in}.vcf.gz.tbi");
cmd("$$opts{bin}/bcftools index --csi -f $$opts{tmp}/$args{in}.vcf.gz");
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools index $args{args} $$opts{tmp}/$args{in}.vcf.gz");
+ unlink("$$opts{tmp}/$args{in}.vcf.gz.csi");
cmd("$$opts{bin}/bcftools view -Ob $$opts{path}/$args{in}.vcf > $$opts{tmp}/$args{in}.bcf");
cmd("$$opts{bin}/bcftools index -f $$opts{tmp}/$args{in}.bcf");
@@ -455,7 +471,7 @@ sub test_vcf_check_merge
cmd("$$opts{bin}/bcftools stats -r 2 $$opts{tmp}/$args{in}.vcf.gz > $$opts{tmp}/$args{in}.2.chk");
cmd("$$opts{bin}/bcftools stats -r 3 $$opts{tmp}/$args{in}.vcf.gz > $$opts{tmp}/$args{in}.3.chk");
cmd("$$opts{bin}/bcftools stats -r 4 $$opts{tmp}/$args{in}.vcf.gz > $$opts{tmp}/$args{in}.4.chk");
- test_cmd($opts,%args,cmd=>"$$opts{bin}/plot-vcfstats -m $$opts{tmp}/$args{in}.1.chk $$opts{tmp}/$args{in}.2.chk $$opts{tmp}/$args{in}.3.chk $$opts{tmp}/$args{in}.4.chk | grep -v 'plot-vcfstats' | grep -v '^# The command' | grep -v '^# This' | grep -v '^ID\t'");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/plot-vcfstats -m $$opts{tmp}/$args{in}.1.chk $$opts{tmp}/$args{in}.2.chk $$opts{tmp}/$args{in}.3.chk $$opts{tmp}/$args{in}.4.chk 2>/dev/null | grep -v 'plot-vcfstats' | grep -v '^# The command' | grep -v '^# This' | grep -v '^ID\t'");
}
sub test_vcf_stats
@@ -480,7 +496,7 @@ sub test_vcf_merge
}
my $args = exists($args{args}) ? $args{args} : '';
my $files = join(' ', at files);
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools merge $args $files | grep -v ^##bcftools_");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools merge --no-version $args $files");
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools merge -Ob $args $files | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
}
sub test_vcf_isec
@@ -493,8 +509,8 @@ sub test_vcf_isec
push @files, "$$opts{tmp}/$file.vcf.gz";
}
my $files = join(' ', at files);
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools isec $args{args} $files");
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools isec -Ob $args{args} $files");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools isec $args{args} $files 2>/dev/null");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools isec -Ob $args{args} $files 2>/dev/null");
}
sub test_vcf_isec2
{
@@ -507,7 +523,7 @@ sub test_vcf_isec2
}
my $files = join(' ', at files);
bgzip_tabix($opts,file=>$args{tab_in},suffix=>'tab',args=>'-s 1 -b 2 -e 3');
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools isec $args{args} -T $$opts{tmp}/$args{tab_in}.tab.gz $files 2>/dev/null | grep -v ^##bcftools_");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools isec --no-version $args{args} -T $$opts{tmp}/$args{tab_in}.tab.gz $files 2>/dev/null");
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools isec -Ob $args{args} -T $$opts{tmp}/$args{tab_in}.tab.gz $files 2>/dev/null | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
}
sub test_vcf_query
@@ -521,29 +537,29 @@ sub test_vcf_convert
{
my ($opts,%args) = @_;
bgzip_tabix_vcf($opts,$args{in});
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools convert $args{args} $$opts{tmp}/$args{in}.vcf.gz");
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools view -Ob $$opts{tmp}/$args{in}.vcf.gz | $$opts{bin}/bcftools convert $args{args}");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools convert $args{args} $$opts{tmp}/$args{in}.vcf.gz 2>/dev/null");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools view -Ob $$opts{tmp}/$args{in}.vcf.gz | $$opts{bin}/bcftools convert $args{args} 2>/dev/null");
}
sub test_vcf_convert_hls2vcf
{
my ($opts,%args) = @_;
my $hls = join(',', map { "$$opts{path}/$_" }( $args{h}, $args{l}, $args{s} ) );
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools convert $args{args} $hls | grep -v ^##");
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools convert $args{args} $hls -Ou | $$opts{bin}/bcftools view | grep -v ^##");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools convert $args{args} $hls 2>/dev/null | grep -v ^##");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools convert $args{args} $hls -Ou 2>/dev/null | $$opts{bin}/bcftools view | grep -v ^##");
}
sub test_vcf_convert_hs2vcf
{
my ($opts,%args) = @_;
my $hs = join(',', map { "$$opts{path}/$_" }( $args{h}, $args{s} ) );
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools convert $args{args} $hs | grep -v ^##");
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools convert $args{args} $hs -Ou | $$opts{bin}/bcftools view | grep -v ^##");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools convert $args{args} $hs 2>/dev/null | grep -v ^##");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools convert $args{args} $hs -Ou 2>/dev/null | $$opts{bin}/bcftools view | grep -v ^##");
}
sub test_vcf_convert_gvcf
{
my ($opts,%args) = @_;
bgzip_tabix_vcf($opts,$args{in});
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools convert $args{args} -f $$opts{path}/$args{fa} $$opts{tmp}/$args{in}.vcf.gz | grep -v ^##bcftools");
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools view -Ob $$opts{tmp}/$args{in}.vcf.gz | $$opts{bin}/bcftools convert $args{args} -f $$opts{path}/$args{fa} | grep -v ^##bcftools");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools convert --no-version $args{args} -f $$opts{path}/$args{fa} $$opts{tmp}/$args{in}.vcf.gz 2>/dev/null");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools view -Ob $$opts{tmp}/$args{in}.vcf.gz | $$opts{bin}/bcftools convert $args{args} -f $$opts{path}/$args{fa} 2>/dev/null | grep -v ^##bcftools");
}
sub test_vcf_convert_tsv2vcf
{
@@ -551,8 +567,8 @@ sub test_vcf_convert_tsv2vcf
my $params = '';
if ( exists($args{args}) ) { $params .= " $args{args}"; }
if ( exists($args{fai} ) ) { $params .= " -f $$opts{path}/$args{fai}.fa"; }
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools convert $params --tsv2vcf $$opts{path}/$args{in} | grep -v ^##bcftools_");
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools convert -Ou $params --tsv2vcf $$opts{path}/$args{in} | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools convert --no-version $params --tsv2vcf $$opts{path}/$args{in} 2>/dev/null");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools convert -Ou $params --tsv2vcf $$opts{path}/$args{in} 2>/dev/null | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
}
sub test_vcf_norm
{
@@ -561,8 +577,8 @@ sub test_vcf_norm
my $params = '';
if ( exists($args{args}) ) { $params .= " $args{args}"; }
if ( exists($args{fai} ) ) { $params .= " -f $$opts{path}/$args{fai}.fa"; }
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools norm $params $$opts{tmp}/$args{in}.vcf.gz | grep -v ^##bcftools_");
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools norm -Ob $params $$opts{tmp}/$args{in}.vcf.gz | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools norm --no-version $params $$opts{tmp}/$args{in}.vcf.gz 2>/dev/null");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools norm -Ob $params $$opts{tmp}/$args{in}.vcf.gz 2>/dev/null | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
}
sub test_vcf_view
{
@@ -571,7 +587,7 @@ sub test_vcf_view
if ( !exists($args{args}) ) { $args{args} = ''; }
if ( exists($args{tgts}) ) { $args{args} .= "-T $$opts{path}/$args{tgts}"; }
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools view $args{args} $$opts{tmp}/$args{in}.vcf.gz $args{reg} | grep -v ^##bcftools_");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools view --no-version $args{args} $$opts{tmp}/$args{in}.vcf.gz $args{reg}");
unless ($args{args} =~ /-H/) {
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools view -Ob $args{args} $$opts{tmp}/$args{in}.vcf.gz $args{reg} | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
}
@@ -580,15 +596,15 @@ sub test_vcf_call
{
my ($opts,%args) = @_;
$args{args} =~ s/{PATH}/$$opts{path}/g;
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools call $args{args} $$opts{path}/$args{in}.vcf | grep -v ^##bcftools_");
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools call -Ob $args{args} $$opts{path}/$args{in}.vcf | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools call --no-version $args{args} $$opts{path}/$args{in}.vcf 2>/dev/null");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools call -Ob $args{args} $$opts{path}/$args{in}.vcf 2>/dev/null | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
}
sub test_vcf_call_cAls
{
my ($opts,%args) = @_;
bgzip_tabix($opts,file=>$args{tab},suffix=>'tab',args=>'-s1 -b2 -e2');
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools call -mA -C alleles -T $$opts{tmp}/$args{tab}.tab.gz $$opts{path}/$args{in}.vcf | grep -v ^##bcftools_");
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools call -Ob -mA -C alleles -T $$opts{tmp}/$args{tab}.tab.gz $$opts{path}/$args{in}.vcf | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools call --no-version -mA -C alleles -T $$opts{tmp}/$args{tab}.tab.gz $$opts{path}/$args{in}.vcf 2>/dev/null");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools call -Ob -mA -C alleles -T $$opts{tmp}/$args{tab}.tab.gz $$opts{path}/$args{in}.vcf 2>/dev/null | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
}
sub test_vcf_filter
{
@@ -753,8 +769,8 @@ sub test_vcf_annotate
$annot_fname = '';
$hdr = '';
}
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools annotate $annot_fname $hdr $args{args} $in_fname | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools annotate -Ob $annot_fname $hdr $args{args} $in_fname | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools annotate $annot_fname $hdr $args{args} $in_fname 2>/dev/null | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools annotate -Ob $annot_fname $hdr $args{args} $in_fname 2>/dev/null | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
}
sub test_vcf_plugin
{
@@ -764,7 +780,7 @@ sub test_vcf_plugin
if ( !exists($args{args}) ) { $args{args} = ''; }
$args{args} =~ s/{PATH}/$$opts{path}/g;
bgzip_tabix_vcf($opts,"$args{in}");
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools $args{cmd} $$opts{tmp}/$args{in}.vcf.gz $args{args} | grep -v ^##bcftools_");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools $args{cmd} $$opts{tmp}/$args{in}.vcf.gz $args{args}");
cmd("$$opts{bin}/bcftools view -Ob $$opts{tmp}/$args{in}.vcf.gz > $$opts{tmp}/$args{in}.bcf");
cmd("$$opts{bin}/bcftools index -f $$opts{tmp}/$args{in}.bcf");
@@ -778,7 +794,7 @@ sub test_vcf_concat
{
if ( $args{do_bcf} )
{
- cmd("$$opts{bin}/bcftools view -Ob $$opts{tmp}/$file.vcf.gz > $$opts{tmp}/$file.bcf");
+ cmd("$$opts{bin}/bcftools view --no-version -Ob $$opts{tmp}/$file.vcf.gz > $$opts{tmp}/$file.bcf");
cmd("$$opts{bin}/bcftools index -f $$opts{tmp}/$file.bcf");
$files .= " $$opts{tmp}/$file.bcf";
}
@@ -788,14 +804,14 @@ sub test_vcf_concat
$files .= " $$opts{tmp}/$file.vcf.gz";
}
}
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools concat $args{args} $files | grep -v ^##bcftools_");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools concat --no-version $args{args} $files");
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools concat -Ob $args{args} $files | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
}
sub test_vcf_reheader
{
my ($opts,%args) = @_;
- cmd("$$opts{bin}/bcftools view -Ob $$opts{path}/$args{in}.vcf > $$opts{tmp}/$args{in}.bcf");
- cmd("$$opts{bin}/bcftools view -Oz $$opts{path}/$args{in}.vcf > $$opts{tmp}/$args{in}.vcf.gz");
+ cmd("$$opts{bin}/bcftools view --no-version -Ob $$opts{path}/$args{in}.vcf > $$opts{tmp}/$args{in}.bcf");
+ cmd("$$opts{bin}/bcftools view --no-version -Oz $$opts{path}/$args{in}.vcf > $$opts{tmp}/$args{in}.vcf.gz");
my $arg = exists($args{header}) ? "-h $$opts{path}/$args{header}" : "-s $$opts{path}/$args{samples}";
for my $file ("$$opts{path}/$args{in}.vcf","$$opts{tmp}/$args{in}.bcf","$$opts{tmp}/$args{in}.vcf.gz")
@@ -803,8 +819,8 @@ sub test_vcf_reheader
# bcf header lines can come in different order
my %bcf_args = ();
if ( $file=~/\.bcf$/ && -e "$$opts{path}/$args{out}.bcf" ) { %bcf_args = ( out=>"$args{out}.bcf" ); }
- test_cmd($opts,%args,%bcf_args,cmd=>"$$opts{bin}/bcftools reheader $arg $file | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
- test_cmd($opts,%args,%bcf_args,cmd=>"cat $file | $$opts{bin}/bcftools reheader $arg | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
+ test_cmd($opts,%args,%bcf_args,cmd=>"$$opts{bin}/bcftools reheader $arg $file | $$opts{bin}/bcftools view --no-version");
+ test_cmd($opts,%args,%bcf_args,cmd=>"cat $file | $$opts{bin}/bcftools reheader $arg | $$opts{bin}/bcftools view --no-version");
}
}
sub test_rename_chrs
@@ -827,7 +843,7 @@ sub test_vcf_consensus
bgzip_tabix_vcf($opts,$args{in});
my $mask = $args{mask} ? "-m $$opts{path}/$args{mask}" : '';
my $chain = $args{chain} ? "-c $$opts{tmp}/$args{chain}" : '';
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools consensus $$opts{tmp}/$args{in}.vcf.gz -f $$opts{path}/$args{fa} $args{args} $mask $chain");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools consensus $$opts{tmp}/$args{in}.vcf.gz -f $$opts{path}/$args{fa} $args{args} $mask $chain 2>/dev/null");
}
sub test_vcf_consensus_chain
{
@@ -835,6 +851,6 @@ sub test_vcf_consensus_chain
bgzip_tabix_vcf($opts,$args{in});
my $mask = $args{mask} ? "-m $$opts{path}/$args{mask}" : '';
my $chain = $args{chain} ? "-c $$opts{tmp}/$args{chain}.new" : '';
- test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools consensus $$opts{tmp}/$args{in}.vcf.gz -f $$opts{path}/$args{fa} $args{args} $mask $chain > /dev/null; cat $$opts{tmp}/$args{chain}.new");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools consensus $$opts{tmp}/$args{in}.vcf.gz -f $$opts{path}/$args{fa} $args{args} $mask $chain > /dev/null 2>/dev/null; cat $$opts{tmp}/$args{chain}.new");
}
diff --git a/test/view.GTisec.H.out b/test/view.GTisec.H.out
new file mode 100644
index 0000000..5cf831b
--- /dev/null
+++ b/test/view.GTisec.H.out
@@ -0,0 +1,20 @@
+# This file can be used as input to the subset plotting tools at:
+# https://github.com/dlaehnemann/bankers2
+# Genotype intersections across samples:
+ at SMPS NA00003 NA00002 NA00001
+# Human readable output (-H) was requested. Subset intersection counts are therefore sorted by
+# sample and repeated for each contained sample. For each sample, counts are in banker's
+# sequence order regarding all other samples.
+# [1] Number of shared non-ref genotypes [2] Samples sharing non-ref genotype (GT)
+9 NA00003
+1 NA00003,NA00002
+0 NA00003,NA00001
+1 NA00003,NA00002,NA00001
+4 NA00002
+1 NA00002,NA00003
+8 NA00002,NA00001
+1 NA00002,NA00003,NA00001
+5 NA00001
+0 NA00001,NA00003
+8 NA00001,NA00002
+1 NA00001,NA00003,NA00002
diff --git a/test/view.GTisec.Hm.out b/test/view.GTisec.Hm.out
new file mode 100644
index 0000000..e70f86b
--- /dev/null
+++ b/test/view.GTisec.Hm.out
@@ -0,0 +1,25 @@
+# This file can be used as input to the subset plotting tools at:
+# https://github.com/dlaehnemann/bankers2
+# Genotype intersections across samples:
+ at SMPS NA00003 NA00002 NA00001
+# The first line of each sample contains its count of missing genotypes, with a '-' appended
+# to the sample name.
+# Human readable output (-H) was requested. Subset intersection counts are therefore sorted by
+# sample and repeated for each contained sample. For each sample, counts are in banker's
+# sequence order regarding all other samples.
+# [1] Number of shared non-ref genotypes [2] Samples sharing non-ref genotype (GT)
+4 NA00003-
+9 NA00003
+1 NA00003,NA00002
+0 NA00003,NA00001
+1 NA00003,NA00002,NA00001
+1 NA00002-
+4 NA00002
+1 NA00002,NA00003
+8 NA00002,NA00001
+1 NA00002,NA00003,NA00001
+1 NA00001-
+5 NA00001
+0 NA00001,NA00003
+8 NA00001,NA00002
+1 NA00001,NA00003,NA00002
diff --git a/test/view.GTisec.Hmv.out b/test/view.GTisec.Hmv.out
new file mode 100644
index 0000000..e70f86b
--- /dev/null
+++ b/test/view.GTisec.Hmv.out
@@ -0,0 +1,25 @@
+# This file can be used as input to the subset plotting tools at:
+# https://github.com/dlaehnemann/bankers2
+# Genotype intersections across samples:
+ at SMPS NA00003 NA00002 NA00001
+# The first line of each sample contains its count of missing genotypes, with a '-' appended
+# to the sample name.
+# Human readable output (-H) was requested. Subset intersection counts are therefore sorted by
+# sample and repeated for each contained sample. For each sample, counts are in banker's
+# sequence order regarding all other samples.
+# [1] Number of shared non-ref genotypes [2] Samples sharing non-ref genotype (GT)
+4 NA00003-
+9 NA00003
+1 NA00003,NA00002
+0 NA00003,NA00001
+1 NA00003,NA00002,NA00001
+1 NA00002-
+4 NA00002
+1 NA00002,NA00003
+8 NA00002,NA00001
+1 NA00002,NA00003,NA00001
+1 NA00001-
+5 NA00001
+0 NA00001,NA00003
+8 NA00001,NA00002
+1 NA00001,NA00003,NA00002
diff --git a/test/view.GTisec.Hv.out b/test/view.GTisec.Hv.out
new file mode 100644
index 0000000..5cf831b
--- /dev/null
+++ b/test/view.GTisec.Hv.out
@@ -0,0 +1,20 @@
+# This file can be used as input to the subset plotting tools at:
+# https://github.com/dlaehnemann/bankers2
+# Genotype intersections across samples:
+ at SMPS NA00003 NA00002 NA00001
+# Human readable output (-H) was requested. Subset intersection counts are therefore sorted by
+# sample and repeated for each contained sample. For each sample, counts are in banker's
+# sequence order regarding all other samples.
+# [1] Number of shared non-ref genotypes [2] Samples sharing non-ref genotype (GT)
+9 NA00003
+1 NA00003,NA00002
+0 NA00003,NA00001
+1 NA00003,NA00002,NA00001
+4 NA00002
+1 NA00002,NA00003
+8 NA00002,NA00001
+1 NA00002,NA00003,NA00001
+5 NA00001
+0 NA00001,NA00003
+8 NA00001,NA00002
+1 NA00001,NA00003,NA00002
diff --git a/test/view.GTisec.m.out b/test/view.GTisec.m.out
new file mode 100644
index 0000000..8d217a5
--- /dev/null
+++ b/test/view.GTisec.m.out
@@ -0,0 +1,20 @@
+# This file can be used as input to the subset plotting tools at:
+# https://github.com/dlaehnemann/bankers2
+# Genotype intersections across samples:
+ at SMPS NA00003 NA00002 NA00001
+# The first 3 lines contain the counts for missing values of each sample in the order provided
+# in the SMPS-line above. Intersection counts only start afterwards.
+# Subset intersection counts are in global banker's sequence order.
+# After exclusive sample counts in order of the SMPS-line, banker's sequence continues with:
+# NA00003,NA00002 NA00003,NA00001 ...
+# [1] Number of shared non-ref genotypes
+4
+1
+1
+9
+4
+5
+1
+0
+8
+1
diff --git a/test/view.GTisec.mv.out b/test/view.GTisec.mv.out
new file mode 100644
index 0000000..1772a3b
--- /dev/null
+++ b/test/view.GTisec.mv.out
@@ -0,0 +1,20 @@
+# This file can be used as input to the subset plotting tools at:
+# https://github.com/dlaehnemann/bankers2
+# Genotype intersections across samples:
+ at SMPS NA00003 NA00002 NA00001
+# The first 3 lines contain the counts for missing values of each sample in the order provided
+# in the SMPS-line above. Intersection counts only start afterwards.
+# Subset intersection counts are in global banker's sequence order.
+# After exclusive sample counts in order of the SMPS-line, banker's sequence continues with:
+# NA00003,NA00002 NA00003,NA00001 ...
+# [1] Number of shared non-ref genotypes [2] Samples sharing non-ref genotype (GT)
+4 NA00003-
+1 NA00002-
+1 NA00001-
+9 NA00003
+4 NA00002
+5 NA00001
+1 NA00003,NA00002
+0 NA00003,NA00001
+8 NA00002,NA00001
+1 NA00003,NA00002,NA00001
diff --git a/test/view.GTisec.out b/test/view.GTisec.out
new file mode 100644
index 0000000..2b2b0f6
--- /dev/null
+++ b/test/view.GTisec.out
@@ -0,0 +1,15 @@
+# This file can be used as input to the subset plotting tools at:
+# https://github.com/dlaehnemann/bankers2
+# Genotype intersections across samples:
+ at SMPS NA00003 NA00002 NA00001
+# Subset intersection counts are in global banker's sequence order.
+# After exclusive sample counts in order of the SMPS-line, banker's sequence continues with:
+# NA00003,NA00002 NA00003,NA00001 ...
+# [1] Number of shared non-ref genotypes
+9
+4
+5
+1
+0
+8
+1
diff --git a/test/view.GTisec.v.out b/test/view.GTisec.v.out
new file mode 100644
index 0000000..8ec3aa3
--- /dev/null
+++ b/test/view.GTisec.v.out
@@ -0,0 +1,15 @@
+# This file can be used as input to the subset plotting tools at:
+# https://github.com/dlaehnemann/bankers2
+# Genotype intersections across samples:
+ at SMPS NA00003 NA00002 NA00001
+# Subset intersection counts are in global banker's sequence order.
+# After exclusive sample counts in order of the SMPS-line, banker's sequence continues with:
+# NA00003,NA00002 NA00003,NA00001 ...
+# [1] Number of shared non-ref genotypes [2] Samples sharing non-ref genotype (GT)
+9 NA00003
+4 NA00002
+5 NA00001
+1 NA00003,NA00002
+0 NA00003,NA00001
+8 NA00002,NA00001
+1 NA00003,NA00002,NA00001
diff --git a/test/view.filter.annovar.vcf b/test/view.filter.annovar.vcf
index 7613667..c19aa25 100644
--- a/test/view.filter.annovar.vcf
+++ b/test/view.filter.annovar.vcf
@@ -18,6 +18,7 @@
##INFO=<ID=ANNOVAR_DATE,Number=1,Type=String,Description="Flag the start of ANNOVAR annotation for one alternative allele">
##INFO=<ID=Gene.refGene,Number=.,Type=String,Description="Gene.refGene annotation provided by ANNOVAR">
##INFO=<ID=GeneDetail.refGene,Number=.,Type=String,Description="GeneDetail.refGene annotation provided by ANNOVAR">
+##INFO=<ID=Func.refGene,Number=.,Type=String,Description="Func.refGene annotation provided by ANNOVAR">
##INFO=<ID=ExonicFunc.refGene,Number=.,Type=String,Description="ExonicFunc.refGene annotation provided by ANNOVAR">
##INFO=<ID=AAChange.refGene,Number=.,Type=String,Description="AAChange.refGene annotation provided by ANNOVAR">
##INFO=<ID=Func.ensGene,Number=.,Type=String,Description="Func.ensGene annotation provided by ANNOVAR">
diff --git a/vcfannotate.c b/vcfannotate.c
index 96a1649..d5164f3 100644
--- a/vcfannotate.c
+++ b/vcfannotate.c
@@ -1,6 +1,6 @@
/* vcfannotate.c -- Annotate and edit VCF/BCF files.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -120,7 +120,7 @@ typedef struct _args_t
char **argv, *output_fname, *targets_fname, *regions_list, *header_fname;
char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites;
- int argc, drop_header, tgts_is_vcf, mark_sites_logic;
+ int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic;
}
args_t;
@@ -809,6 +809,135 @@ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, vo
return bcf_update_genotypes(args->hdr_out,line,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out));
}
}
+static int count_vals(annot_line_t *tab, int icol_beg, int icol_end)
+{
+ int i, nmax = 0;
+ for (i=icol_beg; i<icol_end; i++)
+ {
+ char *str = tab->cols[i], *end = str;
+ if ( str[0]=='.' && !str[1] )
+ {
+ // missing value
+ if ( !nmax ) nmax = 1;
+ continue;
+ }
+ int n = 1;
+ while ( *end )
+ {
+ if ( *end==',' ) n++;
+ end++;
+ }
+ if ( nmax<n ) nmax = n;
+ }
+ return nmax;
+}
+static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ int nsmpl = bcf_hdr_nsamples(args->hdr_out);
+ assert( col->icol+nsmpl <= tab->ncols );
+ int nvals = count_vals(tab,col->icol,col->icol+nsmpl);
+ assert( nvals>0 );
+ hts_expand(int32_t,nvals*nsmpl,args->mtmpi,args->tmpi);
+
+ int icol = col->icol, ismpl;
+ for (ismpl=0; ismpl<nsmpl; ismpl++)
+ {
+ int32_t *ptr = args->tmpi + ismpl*nvals;
+ int ival = 0;
+
+ char *str = tab->cols[icol];
+ while ( *str )
+ {
+ if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
+ {
+ ptr[ival++] = bcf_int32_missing;
+ str += str[1] ? 2 : 1;
+ continue;
+ }
+
+ char *end = str;
+ ptr[ival] = strtol(str, &end, 10);
+ if ( end==str )
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+
+ ival++;
+ str = *end ? end+1 : end;
+ }
+ while ( ival<nvals ) ptr[ival++] = bcf_int32_vector_end;
+ icol++;
+ }
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi,nsmpl*nvals);
+}
+static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ int nsmpl = bcf_hdr_nsamples(args->hdr_out);
+ assert( col->icol+nsmpl <= tab->ncols );
+ int nvals = count_vals(tab,col->icol,col->icol+nsmpl);
+ assert( nvals>0 );
+ hts_expand(float,nvals*nsmpl,args->mtmpf,args->tmpf);
+
+ int icol = col->icol, ismpl;
+ for (ismpl=0; ismpl<nsmpl; ismpl++)
+ {
+ float *ptr = args->tmpf + ismpl*nvals;
+ int ival = 0;
+
+ char *str = tab->cols[icol];
+ while ( *str )
+ {
+ if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
+ {
+ bcf_float_set_missing(ptr[ival]);
+ ival++;
+ str += str[1] ? 2 : 1;
+ continue;
+ }
+
+ char *end = str;
+ ptr[ival] = strtod(str, &end);
+ if ( end==str )
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+
+ ival++;
+ str = *end ? end+1 : end;
+ }
+ while ( ival<nvals ) { bcf_float_set_vector_end(ptr[ival]); ival++; }
+ icol++;
+ }
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf,nsmpl*nvals);
+}
+static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ int nsmpl = bcf_hdr_nsamples(args->hdr_out);
+ assert( col->icol+nsmpl <= tab->ncols );
+
+ int i, max_len = 0;
+ for (i=col->icol; i<col->icol+nsmpl; i++)
+ {
+ int len = strlen(tab->cols[i]);
+ if ( max_len < len ) max_len = len;
+ }
+ hts_expand(char,max_len*nsmpl,args->mtmps,args->tmps);
+
+ int icol = col->icol, ismpl;
+ for (ismpl=0; ismpl<nsmpl; ismpl++)
+ {
+ char *ptr = args->tmps + ismpl*max_len;
+ char *str = tab->cols[icol];
+ i = 0;
+ while ( str[i] )
+ {
+ ptr[i] = str[i];
+ i++;
+ }
+ while ( i<max_len ) ptr[i++] = 0;
+ icol++;
+ }
+ return bcf_update_format_char(args->hdr_out,line,col->hdr_key,args->tmps,nsmpl*max_len);
+}
static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
@@ -1127,7 +1256,7 @@ static void init_columns(args_t *args)
kstring_t str = {0,0,0}, tmp = {0,0,0};
char *ss = args->columns, *se = ss;
args->ncols = 0;
- int i = -1, has_fmt_str = 0, force_samples = -1;
+ int icol = -1, has_fmt_str = 0, force_samples = -1;
while ( *ss )
{
if ( *se && *se!=',' ) { se++; continue; }
@@ -1135,22 +1264,22 @@ static void init_columns(args_t *args)
if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; }
else if ( *ss=='-' ) { replace = REPLACE_EXISTING; ss++; }
else if ( *ss=='=' ) { replace = SET_OR_APPEND; ss++; }
- i++;
+ icol++;
str.l = 0;
kputsn(ss, se-ss, &str);
if ( !str.s[0] || !strcasecmp("-",str.s) ) ;
- else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = i;
- else if ( !strcasecmp("POS",str.s) ) args->from_idx = i;
- else if ( !strcasecmp("FROM",str.s) ) args->from_idx = i;
- else if ( !strcasecmp("TO",str.s) ) args->to_idx = i;
- else if ( !strcasecmp("REF",str.s) ) args->ref_idx = i;
- else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = i;
+ else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = icol;
+ else if ( !strcasecmp("POS",str.s) ) args->from_idx = icol;
+ else if ( !strcasecmp("FROM",str.s) ) args->from_idx = icol;
+ else if ( !strcasecmp("TO",str.s) ) args->to_idx = icol;
+ else if ( !strcasecmp("REF",str.s) ) args->ref_idx = icol;
+ else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = icol;
else if ( !strcasecmp("ID",str.s) )
{
if ( replace==REPLACE_EXISTING ) error("Apologies, the -ID feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = i;
+ col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
col->hdr_key = strdup(str.s);
@@ -1160,7 +1289,7 @@ static void init_columns(args_t *args)
if ( replace==REPLACE_EXISTING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = i;
+ col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter;
col->hdr_key = strdup(str.s);
@@ -1187,7 +1316,7 @@ static void init_columns(args_t *args)
if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = i;
+ col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual;
col->hdr_key = strdup(str.s);
@@ -1262,30 +1391,38 @@ static void init_columns(args_t *args)
}
else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) )
{
- if ( !args->tgts_is_vcf )
- error("Error: FORMAT fields can be carried over from a VCF file only.\n");
-
char *key = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7);
if ( force_samples<0 ) force_samples = replace;
- if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;;
- bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL);
- tmp.l = 0;
- bcf_hrec_format(hrec, &tmp);
- bcf_hdr_append(args->hdr_out, tmp.s);
- bcf_hdr_sync(args->hdr_out);
+ if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;
+ if ( args->tgts_is_vcf )
+ {
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL);
+ tmp.l = 0;
+ bcf_hrec_format(hrec, &tmp);
+ bcf_hdr_append(args->hdr_out, tmp.s);
+ bcf_hdr_sync(args->hdr_out);
+ }
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key);
+ if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) )
+ error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = -1;
+ if ( !args->tgts_is_vcf )
+ {
+ col->icol = icol;
+ icol += bcf_hdr_nsamples(args->hdr_out) - 1;
+ }
+ else
+ col->icol = -1;
col->replace = replace;
col->hdr_key = strdup(key);
if ( !strcasecmp("GT",key) ) col->setter = vcf_setter_format_gt;
else
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
{
- case BCF_HT_INT: col->setter = vcf_setter_format_int; break;
- case BCF_HT_REAL: col->setter = vcf_setter_format_real; break;
- case BCF_HT_STR: col->setter = vcf_setter_format_str; has_fmt_str = 1; break;
+ case BCF_HT_INT: col->setter = args->tgts_is_vcf ? vcf_setter_format_int : setter_format_int; break;
+ case BCF_HT_REAL: col->setter = args->tgts_is_vcf ? vcf_setter_format_real : setter_format_real; break;
+ case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_format_str : setter_format_str; has_fmt_str = 1; break;
default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id));
}
}
@@ -1314,7 +1451,7 @@ static void init_columns(args_t *args)
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = i;
+ col->icol = icol;
col->replace = replace;
col->hdr_key = strdup(str.s);
col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
@@ -1338,11 +1475,12 @@ static void init_columns(args_t *args)
if ( skip_fmt ) khash_str2int_destroy_free(skip_fmt);
if ( has_fmt_str )
{
- int n = bcf_hdr_nsamples(args->hdr_out) > bcf_hdr_nsamples(args->files->readers[1].header) ? bcf_hdr_nsamples(args->hdr_out) : bcf_hdr_nsamples(args->files->readers[1].header);
+ int n = bcf_hdr_nsamples(args->hdr_out);
+ if ( args->tgts_is_vcf && n<bcf_hdr_nsamples(args->files->readers[1].header) ) n = bcf_hdr_nsamples(args->files->readers[1].header);
args->tmpp = (char**)malloc(sizeof(char*)*n);
args->tmpp2 = (char**)malloc(sizeof(char*)*n);
}
- if ( force_samples>=0 )
+ if ( force_samples>=0 && args->tgts_is_vcf )
set_samples(args, args->files->readers[1].header, args->hdr, force_samples==REPLACE_ALL ? 0 : 1);
}
@@ -1419,7 +1557,7 @@ static void init_data(args_t *args)
args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
}
- bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
if ( !args->drop_header )
{
if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs);
@@ -1517,8 +1655,10 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en
}
if ( args->ref_idx != -1 )
{
- assert( args->ref_idx < tmp->ncols );
- assert( args->alt_idx < tmp->ncols );
+ if ( args->ref_idx >= tmp->ncols )
+ error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,args->tgts->line.s);
+ if ( args->alt_idx >= tmp->ncols )
+ error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,args->tgts->line.s);
tmp->nals = 2;
hts_expand(char*,tmp->nals,tmp->mals,tmp->als);
tmp->als[0] = tmp->cols[args->ref_idx];
@@ -1624,9 +1764,10 @@ static void usage(args_t *args)
fprintf(stderr, " -c, --columns <list> list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -h, --header-lines <file> lines which should be appended to the VCF header\n");
- fprintf(stderr, " -I, --set-id [+]<format> set ID column, see man pagee for details\n");
- fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man pagee for details)\n");
+ fprintf(stderr, " -I, --set-id [+]<format> set ID column, see man page for details\n");
+ fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -m, --mark-sites [+-]<tag> add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
@@ -1649,6 +1790,7 @@ int main_vcfannotate(int argc, char *argv[])
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1;
args->set_ids_replace = 1;
int regions_is_file = 0;
@@ -1671,6 +1813,7 @@ int main_vcfannotate(int argc, char *argv[])
{"header-lines",required_argument,NULL,'h'},
{"samples",required_argument,NULL,'s'},
{"samples-file",required_argument,NULL,'S'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:",loptions,NULL)) >= 0)
@@ -1705,6 +1848,7 @@ int main_vcfannotate(int argc, char *argv[])
case 'h': args->header_fname = optarg; break;
case 1 : args->rename_chrs = optarg; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
diff --git a/vcfcall.c b/vcfcall.c
index a28caee..e5bbf11 100644
--- a/vcfcall.c
+++ b/vcfcall.c
@@ -1,6 +1,6 @@
/* vcfcall.c -- SNP/indel variant calling from VCF/BCF.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -68,7 +68,7 @@ void error(const char *format, ...);
typedef struct
{
int flag; // combination of CF_* flags above
- int output_type, n_threads;
+ int output_type, n_threads, record_cmd_line;
htsFile *bcf_in, *out_fh;
char *bcf_fname, *output_fname;
char **samples; // for subsampling and ploidy
@@ -175,6 +175,11 @@ static ploidy_predef_t ploidy_predefs[] =
"* * * M 1\n"
"* * * F 0\n"
},
+ { .alias = "1",
+ .about = "Treat all samples as haploid",
+ .ploidy =
+ "* * * * 1\n"
+ },
{
.alias = NULL,
.about = NULL,
@@ -381,7 +386,7 @@ static void init_data(args_t *args)
if ( args->regions )
{
if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
- error("Failed to read the targets: %s\n", args->regions);
+ error("Failed to read the regions: %s\n", args->regions);
}
if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open %s: %s\n", args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum));
@@ -396,9 +401,21 @@ static void init_data(args_t *args)
if ( 3*args->aux.nfams!=args->nsamples ) error("Expected only trios in %s, sorry!\n", args->samples_fname);
fprintf(stderr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams);
}
+ }
+ if ( args->ploidy )
+ {
args->nsex = ploidy_nsex(args->ploidy);
args->sex2ploidy = (int*) calloc(args->nsex,sizeof(int));
args->sex2ploidy_prev = (int*) calloc(args->nsex,sizeof(int));
+ if ( !args->nsamples )
+ {
+ args->nsamples = bcf_hdr_nsamples(args->aux.hdr);
+ args->sample2sex = (int*) malloc(sizeof(int)*args->nsamples);
+ for (i=0; i<args->nsamples; i++) args->sample2sex[i] = 0;
+ }
+ }
+ if ( args->nsamples )
+ {
args->aux.ploidy = (uint8_t*) malloc(args->nsamples);
for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = 2;
for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = 2;
@@ -418,9 +435,12 @@ static void init_data(args_t *args)
else
{
args->aux.hdr = bcf_hdr_dup(bcf_sr_get_header(args->aux.srs,0));
- for (i=0; i<args->nsamples; i++)
- if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
- error("No such sample: %s\n", args->samples[i]);
+ if ( args->samples )
+ {
+ for (i=0; i<args->nsamples; i++)
+ if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
+ error("No such sample: %s\n", args->samples[i]);
+ }
}
args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
@@ -439,7 +459,7 @@ static void init_data(args_t *args)
bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS");
bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16");
- bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
bcf_hdr_write(args->out_fh, args->aux.hdr);
if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
@@ -451,7 +471,10 @@ static void destroy_data(args_t *args)
else if ( args->flag & CF_MCALL ) mcall_destroy(&args->aux);
else if ( args->flag & CF_QCALL ) qcall_destroy(&args->aux);
int i;
- for (i=0; i<args->nsamples; i++) free(args->samples[i]);
+ if ( args->samples )
+ {
+ for (i=0; i<args->nsamples; i++) free(args->samples[i]);
+ }
if ( args->aux.fams )
{
for (i=0; i<args->aux.nfams; i++) free(args->aux.fams[i].name);
@@ -579,6 +602,7 @@ static void usage(args_t *args)
fprintf(stderr, "Usage: bcftools call [options] <in.vcf.gz>\n");
fprintf(stderr, "\n");
fprintf(stderr, "File format options:\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
fprintf(stderr, " --ploidy <assembly>[?] predefined ploidy, 'list' to print available settings, append '?' for details\n");
@@ -634,6 +658,7 @@ int main_vcfcall(int argc, char *argv[])
args.output_fname = "-";
args.output_type = FT_VCF;
args.n_threads = 0;
+ args.record_cmd_line = 1;
args.aux.trio_Pm_SNPs = 1 - 1e-8;
args.aux.trio_Pm_ins = args.aux.trio_Pm_del = 1 - 1e-9;
@@ -668,6 +693,7 @@ int main_vcfcall(int argc, char *argv[])
{"ploidy-file",required_argument,NULL,2},
{"chromosome-X",no_argument,NULL,'X'},
{"chromosome-Y",no_argument,NULL,'Y'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
@@ -727,6 +753,7 @@ int main_vcfcall(int argc, char *argv[])
case 's': args.samples_fname = optarg; break;
case 'S': args.samples_fname = optarg; args.samples_is_file = 1; break;
case 9 : args.n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args.record_cmd_line = 0; break;
default: usage(&args);
}
}
diff --git a/vcfconcat.c b/vcfconcat.c
index cfec7c0..bd6a00a 100644
--- a/vcfconcat.c
+++ b/vcfconcat.c
@@ -31,13 +31,15 @@ THE SOFTWARE. */
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/kseq.h>
+#include <htslib/bgzf.h>
+#include <htslib/tbx.h> // for hts_get_bgzfp()
#include "bcftools.h"
typedef struct _args_t
{
bcf_srs_t *files;
htsFile *out_fh;
- int output_type, n_threads;
+ int output_type, n_threads, record_cmd_line;
bcf_hdr_t *out_hdr;
int *seen_seq;
@@ -50,7 +52,7 @@ typedef struct _args_t
char **argv, *output_fname, *file_list, **fnames, *remove_dups, *regions_list;
int argc, nfnames, allow_overlaps, phased_concat, regions_is_file;
- int compact_PS, phase_set_changed;
+ int compact_PS, phase_set_changed, naive_concat;
}
args_t;
@@ -106,7 +108,7 @@ static void init_data(args_t *args)
bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PQ,Number=1,Type=Integer,Description=\"Phasing Quality (bigger is better)\">");
bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
}
- bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
@@ -176,8 +178,11 @@ static void destroy_data(args_t *args)
for (i=0; i<args->nfnames; i++) free(args->fnames[i]);
free(args->fnames);
if ( args->files ) bcf_sr_destroy(args->files);
- if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n");
- bcf_hdr_destroy(args->out_hdr);
+ if ( args->out_fh )
+ {
+ if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n");
+ }
+ if ( args->out_hdr ) bcf_hdr_destroy(args->out_hdr);
free(args->seen_seq);
free(args->start_pos);
free(args->swap_phase);
@@ -550,6 +555,108 @@ static void concat(args_t *args)
}
}
+static void naive_concat(args_t *args)
+{
+ // only compressed BCF atm
+ BGZF *bgzf_out = bgzf_open(args->output_fname,"w");;
+
+ const size_t page_size = 32768;
+ char *buf = (char*) malloc(page_size);
+ kstring_t tmp = {0,0,0};
+ int i;
+ for (i=0; i<args->nfnames; i++)
+ {
+ htsFile *hts_fp = hts_open(args->fnames[i],"r");
+ if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]);
+ htsFormat type = *hts_get_format(hts_fp);
+
+ if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
+ if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
+
+ BGZF *fp = hts_get_bgzfp(hts_fp);
+ if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
+ error("Failed to read %s: %s\n", args->fnames[i], strerror(errno));
+
+ uint8_t magic[5];
+ if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+ if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]);
+
+ if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+ hts_expand(char,tmp.l,tmp.m,tmp.s);
+ if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+
+ // write only the first header
+ if ( i==0 )
+ {
+ if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
+ if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
+ if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname);
+ }
+
+ // Output all non-header data that were read together with the header block
+ int nskip = fp->block_offset;
+ if ( fp->block_length - nskip > 0 )
+ {
+ if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode);
+ }
+ if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);
+
+
+ // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks
+ ssize_t nread, ncached = 0, nwr;
+ const int neof = 28;
+ char cached[neof];
+ while (1)
+ {
+ nread = bgzf_raw_read(fp, buf, page_size);
+
+ // page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends
+ if ( nread<=0 ) break;
+ if ( nread<=neof ) // last block
+ {
+ if ( ncached )
+ {
+ // flush the part of the cache that won't be needed
+ nwr = bgzf_raw_write(bgzf_out, cached, nread);
+ if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread);
+
+ // make space in the cache so that we can append to the end
+ if ( nread!=neof ) memmove(cached,cached+nread,neof-nread);
+ }
+
+ // fill the cache and check for eof outside this loop
+ memcpy(cached+neof-nread,buf,nread);
+ break;
+ }
+
+ // not the last block, flush the cache if full
+ if ( ncached )
+ {
+ nwr = bgzf_raw_write(bgzf_out, cached, ncached);
+ if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)ncached);
+ ncached = 0;
+ }
+
+ // fill the cache
+ nread -= neof;
+ memcpy(cached,buf+nread,neof);
+ ncached = neof;
+
+ nwr = bgzf_raw_write(bgzf_out, buf, nread);
+ if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread);
+ }
+ if ( ncached && memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) )
+ {
+ nwr = bgzf_raw_write(bgzf_out, cached, neof);
+ if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)neof);
+ }
+ if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]);
+ }
+ free(buf);
+ free(tmp.s);
+ if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
+}
+
static void usage(args_t *args)
{
fprintf(stderr, "\n");
@@ -558,7 +665,9 @@ static void usage(args_t *args)
fprintf(stderr, " concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel\n");
fprintf(stderr, " VCF into one. The input files must be sorted by chr and position. The files\n");
fprintf(stderr, " must be given in the correct order to produce sorted VCF on output unless\n");
- fprintf(stderr, " the -a, --allow-overlaps option is specified.\n");
+ fprintf(stderr, " the -a, --allow-overlaps option is specified. With the --naive option, the files\n");
+ fprintf(stderr, " are concatenated without being recompressed, which is very fast but dangerous\n");
+ fprintf(stderr, " if the BCF headers differ.\n");
fprintf(stderr, "Usage: bcftools concat [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
@@ -568,6 +677,8 @@ static void usage(args_t *args)
fprintf(stderr, " -D, --remove-duplicates Alias for -d none\n");
fprintf(stderr, " -f, --file-list <file> Read the list of files from a file.\n");
fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
+ fprintf(stderr, " -n, --naive Concatenate BCF files without recompression (dangerous, use with caution)\n");
fprintf(stderr, " -o, --output <file> Write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(stderr, " -q, --min-PQ <int> Break phase set if phasing quality is lower than <int> [30]\n");
@@ -586,10 +697,12 @@ int main_vcfconcat(int argc, char *argv[])
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
args->min_PQ = 30;
static struct option loptions[] =
{
+ {"naive",no_argument,NULL,'n'},
{"compact-PS",no_argument,NULL,'c'},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
@@ -602,10 +715,11 @@ int main_vcfconcat(int argc, char *argv[])
{"threads",required_argument,NULL,9},
{"file-list",required_argument,NULL,'f'},
{"min-PQ",required_argument,NULL,'q'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:c",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cn",loptions,NULL)) >= 0)
{
switch (c) {
case 'c': args->compact_PS = 1; break;
@@ -617,6 +731,7 @@ int main_vcfconcat(int argc, char *argv[])
args->min_PQ = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
break;
+ case 'n': args->naive_concat = 1; break;
case 'a': args->allow_overlaps = 1; break;
case 'l': args->phased_concat = 1; break;
case 'f': args->file_list = optarg; break;
@@ -631,6 +746,7 @@ int main_vcfconcat(int argc, char *argv[])
};
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
@@ -654,6 +770,15 @@ int main_vcfconcat(int argc, char *argv[])
if ( !args->nfnames ) usage(args);
if ( args->remove_dups && !args->allow_overlaps ) error("The -D option is supported only with -a\n");
if ( args->regions_list && !args->allow_overlaps ) error("The -r/-R option is supported only with -a\n");
+ if ( args->naive_concat )
+ {
+ if ( args->allow_overlaps ) error("The option --naive cannot be combined with --allow-overlaps\n");
+ if ( args->phased_concat ) error("The option --naive cannot be combined with --ligate\n");
+ naive_concat(args);
+ destroy_data(args);
+ free(args);
+ return 0;
+ }
init_data(args);
concat(args);
destroy_data(args);
diff --git a/vcfconvert.c b/vcfconvert.c
index 26166df..1e60d30 100644
--- a/vcfconvert.c
+++ b/vcfconvert.c
@@ -66,7 +66,7 @@ struct _args_t
int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
char *outfname, *infname, *ref_fname;
- int argc, n_threads;
+ int argc, n_threads, record_cmd_line;
};
static void destroy_data(args_t *args)
@@ -369,7 +369,7 @@ static void gensample_to_vcf(args_t *args)
bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
bcf_hdr_append(args->header, "##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Genotype Probabilities\">");
bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
- bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
int i, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nsamples);
@@ -489,7 +489,7 @@ static void haplegendsample_to_vcf(args_t *args)
bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
- bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
int i, nrows, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nrows);
@@ -606,7 +606,7 @@ static void hapsample_to_vcf(args_t *args)
bcf_hdr_append(args->header, "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
bcf_hdr_printf(args->header, "##contig=<ID=%s,length=%d>", args->str.s,0x7fffffff); // MAX_CSI_COOR
- bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
int i, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nsamples);
@@ -1143,7 +1143,7 @@ static void tsv_to_vcf(args_t *args)
args->header = bcf_hdr_init("w");
bcf_hdr_set_chrs(args->header, args->ref);
bcf_hdr_append(args->header, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
- bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert");
int i, n;
char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
@@ -1241,7 +1241,7 @@ static void gvcf_to_vcf(args_t *args)
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
- bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
+ if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert");
bcf_hdr_write(out_fh,hdr);
int32_t *itmp = NULL, nitmp = 0;
@@ -1304,11 +1304,12 @@ static void usage(void)
fprintf(stderr, " -S, --samples-file <file> file of samples to include\n");
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "VCF output options:\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output <file> output file name [stdout]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
fprintf(stderr, " -G, --gensample2vcf <...> <prefix>|<gen-file>,<sample-file>\n");
@@ -1359,6 +1360,7 @@ int main_vcfconvert(int argc, char *argv[])
args->outfname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
static struct option loptions[] =
{
@@ -1387,6 +1389,7 @@ int main_vcfconvert(int argc, char *argv[])
{"haplegendsample2vcf",required_argument,NULL,'H'},
{"columns",required_argument,NULL,'c'},
{"fasta-ref",required_argument,NULL,'f'},
+ {"no-version",no_argument,NULL,10},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) {
@@ -1424,6 +1427,7 @@ int main_vcfconvert(int argc, char *argv[])
break;
case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 10 : args->record_cmd_line = 0; break;
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
}
diff --git a/vcffilter.c b/vcffilter.c
index ac4c3a3..f979d77 100644
--- a/vcffilter.c
+++ b/vcffilter.c
@@ -71,7 +71,7 @@ typedef struct _args_t
int output_type, n_threads;
char **argv, *output_fname, *targets_list, *regions_list;
- int argc;
+ int argc, record_cmd_line;
}
args_t;
@@ -149,7 +149,7 @@ static void init_data(args_t *args)
}
}
- bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_filter");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_filter");
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
@@ -408,6 +408,7 @@ static void usage(args_t *args)
fprintf(stderr, " -G, --IndelGap <int> filter clusters of indels separated by <int> or fewer base pairs allowing only one to pass\n");
fprintf(stderr, " -i, --include <expr> include only sites for which the expression is true (see man page for details\n");
fprintf(stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
@@ -430,6 +431,7 @@ int main_vcffilter(int argc, char *argv[])
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
int regions_is_file = 0, targets_is_file = 0;
static struct option loptions[] =
@@ -448,6 +450,7 @@ int main_vcffilter(int argc, char *argv[])
{"threads",required_argument,NULL,9},
{"SnpGap",required_argument,NULL,'g'},
{"IndelGap",required_argument,NULL,'G'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
char *tmp;
@@ -488,6 +491,7 @@ int main_vcffilter(int argc, char *argv[])
else error("The argument to -S not recognised: %s\n", optarg);
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case 'h':
case '?': usage(args);
default: error("Unknown argument: %s\n", optarg);
diff --git a/vcfindex.c b/vcfindex.c
index e40fab5..d1e9179 100644
--- a/vcfindex.c
+++ b/vcfindex.c
@@ -1,7 +1,7 @@
/* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014-2016 Genome Research Ltd.
Author: Shane McCarthy <sm15 at sanger.ac.uk>
@@ -177,6 +177,7 @@ int main_vcfindex(int argc, char *argv[])
if (stats) return vcf_index_stats(fname, stats);
htsFile *fp = hts_open(fname,"r");
+ if ( !fp ) error("Failed to read %s\n", fname);
htsFormat type = *hts_get_format(fp);
hts_close(fp);
diff --git a/vcfisec.c b/vcfisec.c
index 6115146..9afe620 100644
--- a/vcfisec.c
+++ b/vcfisec.c
@@ -58,7 +58,7 @@ typedef struct
htsFile **fh_out;
char **argv, *prefix, *output_fname, **fnames, *write_files, *targets_list, *regions_list;
char *isec_exact;
- int argc;
+ int argc, record_cmd_line;
}
args_t;
@@ -143,7 +143,7 @@ void isec_vcf(args_t *args)
out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
- bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
+ if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
bcf_hdr_write(out_fh, files->readers[args->iwrite].header);
}
if ( !args->nwrite && !out_std && !args->prefix )
@@ -351,7 +351,7 @@ static void init_data(args_t *args)
args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode(args->output_type)); \
if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \
if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \
- bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \
+ if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \
bcf_hdr_write(args->fh_out[i], args->files->readers[j].header); \
}
if ( !args->nwrite || args->write[0] )
@@ -456,6 +456,7 @@ static void usage(void)
fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
fprintf(stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
fprintf(stderr, " -i, --include <expr> include only sites for which the expression is true\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -n, --nfiles [+-=~]<int> output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
@@ -464,8 +465,8 @@ static void usage(void)
fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(stderr, " -w, --write <list> list of files to write with -p given as 1-based indexes. By default, all files are written\n");
fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, " -w, --write <list> list of files to write with -p given as 1-based indexes. By default, all files are written\n");
fprintf(stderr, "\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " # Create intersection and complements of two sets saving the output in dir/*\n");
@@ -492,6 +493,7 @@ int main_vcfisec(int argc, char *argv[])
args->output_fname = NULL;
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
int targets_is_file = 0, regions_is_file = 0;
static struct option loptions[] =
@@ -512,6 +514,7 @@ int main_vcfisec(int argc, char *argv[])
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
{"threads",required_argument,NULL,9},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:",loptions,NULL)) >= 0) {
@@ -560,6 +563,7 @@ int main_vcfisec(int argc, char *argv[])
}
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case 'h':
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
diff --git a/vcfmerge.c b/vcfmerge.c
index 0517bd5..02fac6b 100644
--- a/vcfmerge.c
+++ b/vcfmerge.c
@@ -118,7 +118,7 @@ typedef struct
htsFile *out_fh;
bcf_hdr_t *out_hdr;
char **argv;
- int argc, n_threads;
+ int argc, n_threads, record_cmd_line;
}
args_t;
@@ -858,7 +858,7 @@ int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst
}
if ( ith_src!=isrc ) return -1; // requested field not found
int end_src = start_src;
- while ( end_src<src_len && src[end_src]!=',' ) end_src++;
+ while ( end_src<src_len && src[end_src] && src[end_src]!=',' ) end_src++;
int nsrc_cpy = end_src - start_src;
if ( nsrc_cpy==1 && src[start_src]=='.' ) return 0; // don't write missing values, dst is already initialized
@@ -1913,7 +1913,7 @@ void merge_vcf(args_t *args)
char buf[10]; snprintf(buf,10,"%d",i+1);
merge_headers(args->out_hdr, args->files->readers[i].header,buf,args->force_samples);
}
- bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge");
bcf_hdr_sync(args->out_hdr);
}
info_rules_init(args);
@@ -1962,6 +1962,7 @@ static void usage(void)
fprintf(stderr, " -i, --info-rules <tag:method,..> rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
fprintf(stderr, " -l, --file-list <file> read file names from the file\n");
fprintf(stderr, " -m, --merge <string> allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
@@ -1980,6 +1981,7 @@ int main_vcfmerge(int argc, char *argv[])
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
args->collapse = COLLAPSE_BOTH;
int regions_is_file = 0;
@@ -1998,6 +2000,7 @@ int main_vcfmerge(int argc, char *argv[])
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
{"info-rules",required_argument,NULL,'i'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:",loptions,NULL)) >= 0) {
@@ -2032,6 +2035,7 @@ int main_vcfmerge(int argc, char *argv[])
case 2 : args->header_only = 1; break;
case 3 : args->force_samples = 1; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case 'h':
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
diff --git a/vcfnorm.c b/vcfnorm.c
index 732eca9..781833c 100644
--- a/vcfnorm.c
+++ b/vcfnorm.c
@@ -1,6 +1,6 @@
/* vcfnorm.c -- Left-align and normalize indels.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -76,6 +76,7 @@ typedef struct
char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels;
int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious;
+ int record_cmd_line;
}
args_t;
@@ -295,17 +296,19 @@ static int realign(args_t *args, bcf1_t *line)
if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE;
}
-
// trim from right
int ori_pos = line->pos;
while (1)
{
// is the rightmost base identical in all alleles?
+ int min_len = als[0].l;
for (i=1; i<line->n_allele; i++)
{
if ( als[0].s[ als[0].l-1 ]!=als[i].s[ als[i].l-1 ] ) break;
+ if ( als[i].l < min_len ) min_len = als[i].l;
}
if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
+ if ( min_len<=1 && line->pos==0 ) break;
int pad_from_left = 0;
for (i=0; i<line->n_allele; i++) // trim all alleles
@@ -343,7 +346,7 @@ static int realign(args_t *args, bcf1_t *line)
if ( als[0].s[ntrim_left]!=als[i].s[ntrim_left] ) break;
if ( min_len > als[i].l - ntrim_left ) min_len = als[i].l - ntrim_left;
}
- if ( i!=line->n_allele || min_len==1 ) break; // there are differences, cannot be trimmed
+ if ( i!=line->n_allele || min_len<=1 ) break; // there are differences, cannot be trimmed
ntrim_left++;
}
if ( ntrim_left )
@@ -1287,7 +1290,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm
{
kstring_t *tmp = &args->tmp_str[i];
kputsn(tmp->s,tmp->l,&str);
- for (j=tmp->l; j<max_len; j++) kputc(0,tmp);
+ for (j=tmp->l; j<max_len; j++) kputc('\0',&str);
}
args->ntmp_arr2 = str.m;
args->tmp_arr2 = (uint8_t*)str.s;
@@ -1581,7 +1584,7 @@ static void normalize_vcf(args_t *args)
htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out, args->n_threads);
- bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm");
bcf_hdr_write(out, args->hdr);
int prev_rid = -1, prev_pos = -1, prev_type = 0;
@@ -1641,7 +1644,6 @@ static void normalize_vcf(args_t *args)
if ( args->lines[ilast]->pos - args->lines[i]->pos < args->buf_win ) break;
j++;
}
- if ( args->rbuf.n==args->rbuf.m ) j = 1;
if ( j>0 ) flush_buffer(args, out, j);
}
flush_buffer(args, out, args->rbuf.n);
@@ -1666,6 +1668,7 @@ static void usage(void)
fprintf(stderr, " -d, --rm-dup <type> remove duplicate snps|indels|both|any\n");
fprintf(stderr, " -f, --fasta-ref <file> reference sequence\n");
fprintf(stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <type> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
@@ -1674,8 +1677,8 @@ static void usage(void)
fprintf(stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n");
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(stderr, " -w, --site-win <int> buffer for sorting lines which changed position during realignment [1000]\n");
fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, " -w, --site-win <int> buffer for sorting lines which changed position during realignment [1000]\n");
fprintf(stderr, "\n");
exit(1);
}
@@ -1689,6 +1692,7 @@ int main_vcfnorm(int argc, char *argv[])
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
args->aln_win = 100;
args->buf_win = 1000;
args->mrows_collapse = COLLAPSE_BOTH;
@@ -1714,6 +1718,7 @@ int main_vcfnorm(int argc, char *argv[])
{"threads",required_argument,NULL,9},
{"check-ref",required_argument,NULL,'c'},
{"strict-filter",no_argument,NULL,'s'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
char *tmp;
@@ -1771,6 +1776,7 @@ int main_vcfnorm(int argc, char *argv[])
if ( *tmp ) error("Could not parse argument: --site-win %s\n", optarg);
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case 'h':
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
diff --git a/vcfplugin.c b/vcfplugin.c
index e2ca04a..87a773f 100644
--- a/vcfplugin.c
+++ b/vcfplugin.c
@@ -140,7 +140,7 @@ typedef struct _args_t
char **plugin_paths;
char **argv, *output_fname, *regions_list, *targets_list;
- int argc, drop_header, verbose;
+ int argc, drop_header, verbose, record_cmd_line;
}
args_t;
@@ -239,13 +239,6 @@ static void print_plugin_usage_hint(void)
fprintf(stderr,
" in\n\tBCFTOOLS_PLUGINS=\"%s\".\n\n"
"- Is the plugin path correct?\n\n"
- "- Are all shared libraries, namely libhts.so, accessible? Verify with\n"
- " on Mac OS X: `otool -L your/plugin.so` and set DYLD_LIBRARY_PATH if they are not\n"
- " on Linux: `ldd your/plugin.so` and set LD_LIBRARY_PATH if they are not\n"
- "\n"
- "- If not installed systemwide, set the environment variable LD_LIBRARY_PATH (linux) or\n"
- "DYLD_LIBRARY_PATH (mac) to include directory where *libhts.so* is located.\n"
- "\n"
"- Run \"bcftools plugin -lv\" for more detailed error output.\n"
"\n",
getenv("BCFTOOLS_PLUGINS")
@@ -418,7 +411,7 @@ static void init_data(args_t *args)
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
- bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin");
if ( !args->drop_header )
{
args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
@@ -460,6 +453,7 @@ static void usage(args_t *args)
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
fprintf(stderr, "VCF output options:\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <type> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
@@ -480,12 +474,27 @@ int main_plugin(int argc, char *argv[])
args->output_fname = "-";
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
args->nplugin_paths = -1;
int regions_is_file = 0, targets_is_file = 0, plist_only = 0, usage_only = 0, version_only = 0;
if ( argc==1 ) usage(args);
+
char *plugin_name = NULL;
- if ( argv[1][0]!='-' ) { plugin_name = argv[1]; argc--; argv++; }
+ if ( argv[1][0]!='-' )
+ {
+ plugin_name = argv[1];
+ argc--;
+ argv++;
+ load_plugin(args, plugin_name, 1, &args->plugin);
+ if ( args->plugin.run )
+ {
+ int ret = args->plugin.run(argc, argv);
+ destroy_data(args);
+ free(args);
+ return ret;
+ }
+ }
static struct option loptions[] =
{
@@ -502,6 +511,7 @@ int main_plugin(int argc, char *argv[])
{"regions-file",required_argument,NULL,'R'},
{"targets",required_argument,NULL,'t'},
{"targets-file",required_argument,NULL,'T'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vV",loptions,NULL)) >= 0)
@@ -527,6 +537,7 @@ int main_plugin(int argc, char *argv[])
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
case 'l': plist_only = 1; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case '?':
case 'h': usage_only = 1; break;
default: error("Unknown argument: %s\n", optarg);
@@ -535,7 +546,6 @@ int main_plugin(int argc, char *argv[])
if ( plist_only ) return list_plugins(args);
if ( usage_only && ! plugin_name ) usage(args);
- load_plugin(args, plugin_name, 1, &args->plugin);
if ( version_only )
{
const char *bver, *hver;
@@ -554,15 +564,6 @@ int main_plugin(int argc, char *argv[])
return 0;
}
- if ( args->plugin.run )
- {
- int iopt = optind; optind = 0;
- int ret = args->plugin.run(argc-iopt, argv+iopt);
- destroy_data(args);
- free(args);
- return ret;
- }
-
char *fname = NULL;
if ( optind>=argc || argv[optind][0]=='-' )
{
diff --git a/vcfroh.c b/vcfroh.c
index fa64b79..9560559 100644
--- a/vcfroh.c
+++ b/vcfroh.c
@@ -368,14 +368,31 @@ static void flush_viterbi(args_t *args)
}
}
- // update the transition matrix tprob
+ // update the transition matrix
+ int n = 1;
for (i=0; i<2; i++)
{
- int n = 0;
for (j=0; j<2; j++) n += MAT(tcounts,2,i,j);
- if ( !n) error("fixme: state %d not observed\n", i+1);
- for (j=0; j<2; j++) MAT(tcounts,2,i,j) /= n;
}
+ for (i=0; i<2; i++)
+ {
+ for (j=0; j<2; j++)
+ {
+ // no transition to i-th state was observed, set to a small number
+ if ( !MAT(tcounts,2,i,j) ) MAT(tcounts,2,i,j) = 0.1/n;
+ else MAT(tcounts,2,i,j) /= n;
+ }
+ }
+
+ // normalize
+ for (i=0; i<2; i++)
+ {
+ double norm = 0;
+ for (j=0; j<2; j++) norm += MAT(tcounts,2,j,i);
+ assert( norm!=0 );
+ for (j=0; j<2; j++) MAT(tcounts,2,j,i) /= norm;
+ }
+
if ( args->genmap_fname || args->rec_rate > 0 )
hmm_set_tprob(args->hmm, tcounts, 0);
else
@@ -385,14 +402,16 @@ static void flush_viterbi(args_t *args)
deltaz = fabs(MAT(tprob_arr,2,1,0)-t2az_prev);
delthw = fabs(MAT(tprob_arr,2,0,1)-t2hw_prev);
niter++;
-
- fprintf(stderr,"%d: %f %f\n", niter,deltaz,delthw);
+ fprintf(stderr,"Viterbi training, iteration %d: dAZ=%e dHW=%e\tP(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n",
+ niter,deltaz,delthw,
+ MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW),
+ MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ));
}
while ( deltaz > 0.0 || delthw > 0.0 );
- fprintf(stderr, "Viterbi training converged in %d iterations to", niter);
double *tprob_arr = hmm_get_tprob(args->hmm);
- for (i=0; i<2; i++) for (j=0; j<2; j++) fprintf(stderr, " %f", MAT(tprob_arr,2,i,j));
- fprintf(stderr, "\n");
+ fprintf(stderr, "Viterbi training converged in %d iterations to P(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n", niter,
+ MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW),
+ MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ));
// output the results
for (i=0; i<args->nrids; i++)
@@ -400,12 +419,16 @@ static void flush_viterbi(args_t *args)
int ioff = args->rid_offs[i];
int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff;
hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
+ hmm_run_fwd_bwd(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
+ double *fwd = hmm_get_fwd_bwd_prob(args->hmm);
const char *chr = bcf_hdr_id2name(args->hdr,args->rids[i]);
for (j=0; j<nsites; j++)
{
- printf("%s\t%d\t%d\t..\n", chr,args->sites[ioff+j]+1,vpath[j*2]==STATE_AZ ? 1 : 0);
+ int state = vpath[j*2];
+ double pval = fwd[j*2 + state];
+ printf("%s\t%d\t%d\t%e\n", chr,args->sites[ioff+j]+1,state==STATE_AZ ? 1 : 0, pval);
}
}
}
diff --git a/vcfview.c b/vcfview.c
index ed41595..c14075d 100644
--- a/vcfview.c
+++ b/vcfview.c
@@ -72,6 +72,7 @@ typedef struct _args_t
int sample_is_file, force_samples;
char *include_types, *exclude_types;
int include, exclude;
+ int record_cmd_line;
htsFile *out;
}
args_t;
@@ -86,7 +87,8 @@ static void init_data(args_t *args)
bcf_hdr_append(args->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">");
bcf_hdr_append(args->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
}
- bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view");
+ else bcf_hdr_sync(args->hdr);
// setup sample data
if (args->sample_names)
@@ -485,6 +487,7 @@ static void usage(args_t *args)
fprintf(stderr, " -G, --drop-genotypes drop individual genotype information (after subsetting if -s option set)\n");
fprintf(stderr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n");
fprintf(stderr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output-file <file> output file name [stdout]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
@@ -529,6 +532,7 @@ int main_vcfview(int argc, char *argv[])
args->update_info = 1;
args->output_type = FT_VCF;
args->n_threads = 0;
+ args->record_cmd_line = 1;
int targets_is_file = 0, regions_is_file = 0;
static struct option loptions[] =
@@ -569,6 +573,7 @@ int main_vcfview(int argc, char *argv[])
{"max-af",required_argument,NULL,'Q'},
{"phased",no_argument,NULL,'p'},
{"exclude-phased",no_argument,NULL,'P'},
+ {"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
char *tmp;
@@ -678,6 +683,7 @@ int main_vcfview(int argc, char *argv[])
break;
}
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 8 : args->record_cmd_line = 0; break;
case '?': usage(args);
default: error("Unknown argument: %s\n", optarg);
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/bcftools.git
More information about the debian-med-commit
mailing list