[med-svn] [bcftools] 01/05: New upstream version 1.6
Andreas Tille
tille at debian.org
Mon Dec 11 14:37:20 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository bcftools.
commit 01bdfe5461861d96114f4312b341e37e3d1006e0
Author: Andreas Tille <tille at debian.org>
Date: Mon Dec 11 13:22:00 2017 +0100
New upstream version 1.6
---
Makefile | 4 +-
NEWS | 33 +++
config.mk.in | 1 +
consensus.c | 129 ++++++++--
csq.c | 133 +++++-----
doc/bcftools.1 | 179 ++++++++++++--
doc/bcftools.html | 116 +++++++--
doc/bcftools.txt | 98 +++++++-
filter.c | 652 +++++++++++++++++++++++++++++++++++--------------
kheap.h | 5 +-
main.c | 5 +
misc/plot-roh.py | 38 +--
misc/plot-vcfstats | 21 +-
misc/run-roh.pl | 2 +-
mpileup.c | 9 +-
plugins/check-ploidy.c | 165 +++++++++++++
plugins/mendelian.c | 110 +++++++--
plugins/setGT.c | 264 +++++++++++++++-----
test/check.chk | 22 +-
test/convert.gvcf.out | 2 +-
test/convert.gvcf.vcf | 2 +-
test/filter.12.out | 1 +
test/filter.13.out | 2 +
test/filter.14.out | 8 +
test/filter.15.out | 2 +
test/filter.16.out | 3 +
test/filter.17.out | 3 +
test/filter.18.out | 11 +
test/filter.19.out | 1 +
test/mendelian.1.out | 40 ++-
test/mendelian.2.out | 29 ++-
test/mendelian.3.out | 23 +-
test/mendelian.vcf | 40 ++-
test/query.34.out | 1 +
test/query.35.out | 1 +
test/query.36.out | 1 +
test/sort.out | 15 ++
test/sort.vcf | 21 ++
test/stats.B.chk | 6 +-
test/stats.chk | 18 +-
test/stats.counts.chk | 37 +++
test/stats.counts.vcf | 18 ++
test/test.pl | 28 ++-
vcfcnv.c | 6 +
vcfconvert.c | 5 +-
vcfindex.c | 7 +
vcfisec.c | 14 +-
vcfmerge.c | 14 +-
vcfnorm.c | 18 ++
vcfquery.c | 17 ++
vcfsort.c | 306 +++++++++++++++++++++++
vcfstats.c | 63 +++--
52 files changed, 2245 insertions(+), 504 deletions(-)
diff --git a/Makefile b/Makefile
index 7e87b08..8e0f04d 100644
--- a/Makefile
+++ b/Makefile
@@ -39,6 +39,7 @@ OBJS = main.o vcfindex.o tabix.o \
vcfcnv.o HMM.o vcfplugin.o consensus.o ploidy.o bin.o hclust.o version.o \
regidx.o smpl_ilist.o csq.o vcfbuf.o \
mpileup.o bam2bcf.o bam2bcf_indel.o bam_sample.o \
+ vcfsort.o \
ccall.o em.o prob1.o kmin.o # the original samtools calling
prefix = /usr/local
@@ -92,7 +93,7 @@ endif
include config.mk
-PACKAGE_VERSION = 1.5
+PACKAGE_VERSION = 1.6
# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
# description of the working tree: either a release tag with the same value
@@ -202,6 +203,7 @@ vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vc
vcfroh.o: vcfroh.c $(roh_h)
vcfcnv.o: vcfcnv.c $(cnv_h)
vcfsom.o: vcfsom.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h)
+vcfsort.o: vcfsort.c $(htslib_vcf_h) $(bcftools_h)
vcfstats.o: vcfstats.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) $(bin_h)
vcfview.o: vcfview.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h)
reheader.o: reheader.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_kseq_h) $(bcftools_h)
diff --git a/NEWS b/NEWS
index 5ae6521..c05420b 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,36 @@
+## Release 1.6 (September 2017)
+
+* New `sort` command.
+
+* New options added to the `consensus` command. Note that the `-i, --iupac`
+ option has been renamed to `-I, --iupac`, in favor of the standard
+ `-i, --include`.
+
+* Filtering expressions (`-i/-e`): support for `GT=<type>` expressions and
+ for lists and ranges (#639) - see the man page for details.
+
+* `csq`: relax some GFF3 parsing restrictions to enable using Ensembl
+ GFF3 files for plants (#667)
+
+* `stats`: add further documentation to output stats files (#316) and
+ include haploid counts in per-sample output (#671).
+
+* `plot-vcfstats`: further fixes for Python3 (@nsoranzo, #645, #666).
+
+* `query` bugfix (#632)
+
+* `+setGT` plugin: new option to set genotypes based on a two-tailed binomial
+ distribution test. Also, allow combining `-i/-e` with `-t q`.
+
+* `mpileup`: fix typo (#636)
+
+* `convert --gvcf2vcf` bugfix (#641)
+
+* `+mendelian`: recognize some mendelian inconsistencies that were
+ being missed (@oronnavon, #660), also add support for multiallelic
+ sites and sex chromosomes.
+
+
## Release 1.5 (June 2017)
* Added autoconf support to bcftools. See `INSTALL` for more details.
diff --git a/config.mk.in b/config.mk.in
index df70d55..8b54368 100644
--- a/config.mk.in
+++ b/config.mk.in
@@ -31,6 +31,7 @@
prefix = @prefix@
exec_prefix = @exec_prefix@
bindir = @bindir@
+libexecdir = @libexecdir@
datarootdir = @datarootdir@
mandir = @mandir@
diff --git a/consensus.c b/consensus.c
index 258ef14..544eca6 100644
--- a/consensus.c
+++ b/consensus.c
@@ -1,6 +1,6 @@
/* The MIT License
- Copyright (c) 2014 Genome Research Ltd.
+ Copyright (c) 2014-2017 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -39,6 +39,16 @@
#include "regidx.h"
#include "bcftools.h"
#include "rbuf.h"
+#include "filter.h"
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+#define PICK_REF 1
+#define PICK_ALT 2
+#define PICK_LONG 4
+#define PICK_SHORT 8
typedef struct
{
@@ -75,12 +85,16 @@ typedef struct
chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
// Note that the chain is re-initialised for each chromosome/seq_region
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+
bcf_srs_t *files;
bcf_hdr_t *hdr;
FILE *fp_out;
FILE *fp_chain;
char **argv;
- int argc, output_iupac, haplotype, isample;
+ int argc, output_iupac, haplotype, allele, isample;
char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname;
}
args_t;
@@ -195,7 +209,7 @@ static void init_data(args_t *args)
args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample);
if ( args->isample<0 ) error("No such sample: %s\n", args->sample);
}
- if ( args->haplotype && args->isample<0 )
+ if ( (args->haplotype || args->allele) && args->isample<0 )
{
if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
args->isample = 0;
@@ -220,10 +234,14 @@ static void init_data(args_t *args)
if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
}
else args->fp_out = stdout;
+ if ( args->isample<0 ) fprintf(stderr,"Note: the --sample option not given, applying all records\n");
+ if ( args->filter_str )
+ args->filter = filter_init(args->hdr, args->filter_str);
}
static void destroy_data(args_t *args)
{
+ if (args->filter) filter_destroy(args->filter);
bcf_sr_destroy(args->files);
int i;
for (i=0; i<args->vcf_rbuf.m; i++)
@@ -287,9 +305,16 @@ static bcf1_t **next_vcf_line(args_t *args)
int i = rbuf_shift(&args->vcf_rbuf);
return &args->vcf_buf[i];
}
- else if ( bcf_sr_next_line(args->files) )
+ while ( bcf_sr_next_line(args->files) )
+ {
+ if ( args->filter )
+ {
+ int is_ok = filter_test(args->filter, bcf_sr_get_line(args->files,0), NULL);
+ if ( args->filter_logic & FLT_EXCLUDE ) is_ok = is_ok ? 0 : 1;
+ if ( !is_ok ) continue;
+ }
return &args->files->readers[0].buffer[0];
-
+ }
return NULL;
}
static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr)
@@ -358,33 +383,36 @@ static void apply_variant(args_t *args, bcf1_t *rec)
int i, ialt = 1;
if ( args->isample >= 0 )
{
+ bcf_unpack(rec, BCF_UN_FMT);
bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
if ( !fmt ) return;
+
+ if ( fmt->type!=BCF_BT_INT8 )
+ error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%d?\n",bcf_seqname(args->hdr,rec),rec->pos+1);
+ uint8_t *ptr = fmt->p + fmt->size*args->isample;
+
if ( args->haplotype )
{
if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
- uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + args->haplotype - 1;
- ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ ialt = ptr[args->haplotype-1];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
ialt = bcf_gt_allele(ialt);
}
else if ( args->output_iupac )
{
- uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample;
- ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ ialt = ptr[0];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
ialt = bcf_gt_allele(ialt);
int jalt;
if ( fmt->n>1 )
{
- ptr = fmt->p + fmt->size*args->isample + 1;
- jalt = bcf_dec_int1(ptr, fmt->type, &ignore);
+ jalt = ptr[1];
if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt;
else jalt = bcf_gt_allele(jalt);
}
else jalt = ialt;
- if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
{
char ial = rec->d.allele[ialt][0];
@@ -394,13 +422,40 @@ static void apply_variant(args_t *args, bcf1_t *rec)
}
else
{
+ int is_hom = 1;
for (i=0; i<fmt->n; i++)
{
- uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + i;
- ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
- if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
- ialt = bcf_gt_allele(ialt);
- if ( ialt ) break;
+ if ( bcf_gt_is_missing(ptr[i]) ) return; // ignore missing or half-missing genotypes
+ if ( ptr[i]==bcf_int32_vector_end ) break;
+ ialt = bcf_gt_allele(ptr[i]);
+ if ( i>0 && ialt!=bcf_gt_allele(ptr[i-1]) ) { is_hom = 0; break; }
+ }
+ if ( !is_hom )
+ {
+ int prev_len = 0, jalt;
+ for (i=0; i<fmt->n; i++)
+ {
+ if ( ptr[i]==bcf_int32_vector_end ) break;
+ jalt = bcf_gt_allele(ptr[i]);
+ if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ if ( args->allele & (PICK_LONG|PICK_SHORT) )
+ {
+ int len = jalt==0 ? rec->rlen : strlen(rec->d.allele[jalt]);
+ if ( i==0 ) ialt = jalt, prev_len = len;
+ else if ( len == prev_len )
+ {
+ if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt, prev_len = len;
+ else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt, prev_len = len;
+ }
+ else if ( args->allele & PICK_LONG && len > prev_len ) ialt = jalt, prev_len = len;
+ else if ( args->allele & PICK_SHORT && len < prev_len ) ialt = jalt, prev_len = len;
+ }
+ else
+ {
+ if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt;
+ else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt;
+ }
+ }
}
}
if ( !ialt ) return; // ref allele
@@ -623,12 +678,21 @@ static void usage(args_t *args)
fprintf(stderr, " information, such as INFO/AD or FORMAT/AD.\n");
fprintf(stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf>\n");
fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -c, --chain <file> write a chain file for liftover\n");
+ fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
- fprintf(stderr, " -H, --haplotype <1|2> apply variants for the given haplotype\n");
- fprintf(stderr, " -i, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
+ fprintf(stderr, " -H, --haplotype <which> choose which allele to use from the FORMAT/GT field, note\n");
+ fprintf(stderr, " the codes are case-insensitive:\n");
+ fprintf(stderr, " 1: first allele from GT\n");
+ fprintf(stderr, " 2: second allele\n");
+ fprintf(stderr, " R: REF allele in het genotypes\n");
+ fprintf(stderr, " A: ALT allele\n");
+ fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n");
+ fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n");
+ fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
fprintf(stderr, " -m, --mask <file> replace regions with N\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(stderr, " -c, --chain <file> write a chain file for liftover\n");
fprintf(stderr, " -s, --sample <name> apply variants of the given sample\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " # Get the consensus for one region. The fasta header lines are then expected\n");
@@ -645,8 +709,10 @@ int main_consensus(int argc, char *argv[])
static struct option loptions[] =
{
+ {"exclude",required_argument,NULL,'e'},
+ {"include",required_argument,NULL,'i'},
{"sample",1,0,'s'},
- {"iupac-codes",0,0,'i'},
+ {"iupac-codes",0,0,'I'},
{"haplotype",1,0,'H'},
{"output",1,0,'o'},
{"fasta-ref",1,0,'f'},
@@ -655,19 +721,32 @@ int main_consensus(int argc, char *argv[])
{0,0,0,0}
};
int c;
- while ((c = getopt_long(argc, argv, "h?s:1iH:f:o:m:c:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:",loptions,NULL)) >= 0)
{
switch (c)
{
case 's': args->sample = optarg; break;
case 'o': args->output_fname = optarg; break;
- case 'i': args->output_iupac = 1; break;
+ case 'I': args->output_iupac = 1; break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'f': args->ref_fname = optarg; break;
case 'm': args->mask_fname = optarg; break;
case 'c': args->chain_fname = optarg; break;
case 'H':
- args->haplotype = optarg[0] - '0';
- if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n");
+ if ( !strcasecmp(optarg,"R") ) args->allele |= PICK_REF;
+ else if ( !strcasecmp(optarg,"A") ) args->allele |= PICK_ALT;
+ else if ( !strcasecmp(optarg,"L") ) args->allele |= PICK_LONG|PICK_REF;
+ else if ( !strcasecmp(optarg,"S") ) args->allele |= PICK_SHORT|PICK_REF;
+ else if ( !strcasecmp(optarg,"LR") ) args->allele |= PICK_LONG|PICK_REF;
+ else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT;
+ else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF;
+ else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT;
+ else
+ {
+ args->haplotype = optarg[0] - '0';
+ if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n");
+ }
break;
default: usage(args); break;
}
diff --git a/csq.c b/csq.c
index b1db103..94ac442 100644
--- a/csq.c
+++ b/csq.c
@@ -164,17 +164,6 @@
#define N_SPLICE_REGION_EXON 3
#define N_SPLICE_REGION_INTRON 8
-// Ensembl ID format, e.g.
-// ENST00000423372 for human .. ENST%011d
-// ENSMUST00000120394 for mouse .. ENSMUST%011d
-char ENSID_BUF[32], *ENSID_FMT = NULL;
-static inline char *ENSID(uint32_t id)
-{
- sprintf(ENSID_BUF,ENSID_FMT,id);
- return ENSID_BUF;
-}
-
-
#define N_REF_PAD 10 // number of bases to avoid boundary effects
#define STRAND_REV 0
@@ -509,7 +498,6 @@ hap_t;
temporary list of all exons, CDS, UTRs
*/
KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
-KHASH_MAP_INIT_INT(int2int, int)
KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
typedef struct
{
@@ -522,25 +510,41 @@ typedef struct
uint32_t iseq:29;
}
ftr_t;
+/*
+ Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
+ to integer id. To keep the memory requirements low, the original version
+ relied on IDs in the form of a string prefix and a numerical id. However,
+ it turns out that this assumption is not valid for some ensembl GFFs, see
+ for example Zea_mays.AGPv4.36.gff3.gz
+ */
+typedef struct
+{
+ void *str2id; // khash_str2int
+ int nstr, mstr;
+ char **str; // numeric id to string
+}
+id_tbl_t;
typedef struct
{
// all exons, CDS, UTRs
ftr_t *ftr;
int nftr, mftr;
- // mapping from transcript ensembl id to gene id
+ // mapping from gene id to gf_gene_t
kh_int2gene_t *gid2gene;
// mapping from transcript id to tscript, for quick CDS anchoring
kh_int2tscript_t *id2tr;
// sequences
- void *seq2int;
+ void *seq2int; // str2int hash
char **seq;
int nseq, mseq;
// ignored biotypes
void *ignored_biotypes;
+
+ id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
}
aux_t;
@@ -590,6 +594,7 @@ typedef struct _args_t
int nrm_tr, mrm_tr;
csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
int ncsq_buf, mcsq_buf;
+ id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
faidx_t *fai;
kstring_t str, str2;
@@ -694,33 +699,38 @@ static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg,
if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
return se+1;
}
-static inline uint32_t gff_parse_id(const char *line, const char *needle, char *ss)
+static void gff_id_init(id_tbl_t *tbl)
{
- ss = strstr(ss,needle);
- if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
- ss += strlen(needle);
- while ( *ss && !isdigit(*ss) ) ss++;
- if ( !ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line);
- char *se;
- uint32_t id = strtol(ss, &se, 10);
- if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line);
- if ( *se && *se!=';' && *se!='\t' ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
- assert( id <= 0xffffff ); // see gf_gene_t.id. Ensembl IDs are never that big in practice
- return id;
+ memset(tbl, 0, sizeof(*tbl));
+ tbl->str2id = khash_str2int_init();
+}
+static void gff_id_destroy(id_tbl_t *tbl)
+{
+ khash_str2int_destroy_free(tbl->str2id);
+ free(tbl->str);
}
-static void gff_parse_ensid_fmt(const char *line, const char *needle, char *ss)
+static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char *needle, char *ss)
{
- ss = strstr(ss,needle);
+ ss = strstr(ss,needle); // e.g. "ID=transcript:"
if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
ss += strlen(needle);
+
char *se = ss;
- while ( *se && !isdigit(*se) ) se++;
- kstring_t str = {0,0,0};
- kputsn(ss,se-ss,&str);
- ss = se;
- while ( *se && isdigit(*se) ) se++;
- ksprintf(&str,"%%0%dd",(int)(se-ss));
- ENSID_FMT = str.s;
+ while ( *se && *se!=';' && !isspace(*se) ) se++;
+ char tmp = *se;
+ *se = 0;
+
+ int id;
+ if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 )
+ {
+ id = tbl->nstr++;
+ hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
+ tbl->str[id] = strdup(ss);
+ int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id);
+ }
+ *se = tmp;
+
+ return id;
}
static inline int gff_parse_type(char *line)
{
@@ -880,10 +890,8 @@ void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr)
}
// create a mapping from transcript_id to gene_id
- uint32_t trid = gff_parse_id(line, "ID=transcript:", ss);
- uint32_t gene_id = gff_parse_id(line, "Parent=gene:", ss);
-
- if ( !ENSID_FMT ) gff_parse_ensid_fmt(line, "ID=transcript:", ss); // id prefix different across species
+ uint32_t trid = gff_id_parse(&args->tscript_ids, line, "ID=transcript:", ss);
+ uint32_t gene_id = gff_id_parse(&args->init.gene_ids, line, "Parent=gene:", ss);
tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
tr->id = trid;
@@ -910,7 +918,7 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha
aux_t *aux = &args->init;
// substring search for "ID=gene:ENSG00000437963"
- uint32_t gene_id = gff_parse_id(line, "ID=gene:", ss);
+ uint32_t gene_id = gff_id_parse(&aux->gene_ids, line, "ID=gene:", ss);
gf_gene_t *gene = gene_init(aux, gene_id);
assert( !gene->name ); // the gene_id should be unique
@@ -918,13 +926,17 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha
// substring search for "Name=OR4F5"
ss = strstr(chr_end+2,"Name=");
- if ( !ss ) error("Could not parse the line, \"Name=\" not present: %s\n", line);
- ss += 5;
- char *se = ss;
- while ( *se && *se!=';' && !isspace(*se) ) se++;
- gene->name = (char*) malloc(se-ss+1);
- memcpy(gene->name,ss,se-ss);
- gene->name[se-ss] = 0;
+ if ( ss )
+ {
+ ss += 5;
+ char *se = ss;
+ while ( *se && *se!=';' && !isspace(*se) ) se++;
+ gene->name = (char*) malloc(se-ss+1);
+ memcpy(gene->name,ss,se-ss);
+ gene->name[se-ss] = 0;
+ }
+ else
+ gene->name = strdup(aux->gene_ids.str[gene_id]); // Name=<GeneName> field is not present, use the gene ID instead
}
int gff_parse(args_t *args, char *line, ftr_t *ftr)
{
@@ -999,7 +1011,7 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr)
ss += 2;
// substring search for "Parent=transcript:ENST00000437963"
- ftr->trid = gff_parse_id(line, "Parent=transcript:", ss);
+ ftr->trid = gff_id_parse(&args->tscript_ids, line, "Parent=transcript:", ss);
ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
return 0;
}
@@ -1104,7 +1116,7 @@ void tscript_init_cds(args_t *args)
{
int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
if ( phase!=len%3)
- error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len);
+ error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
assert( phase == len%3 );
len += tr->cds[i]->len;
}
@@ -1132,7 +1144,7 @@ void tscript_init_cds(args_t *args)
{
int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
if ( phase!=len%3)
- error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len);
+ error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
len += tr->cds[i]->len;
}
}
@@ -1205,6 +1217,8 @@ void init_gff(args_t *args)
aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t
args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL);
aux->ignored_biotypes = khash_str2int_init();
+ gff_id_init(&aux->gene_ids);
+ gff_id_init(&args->tscript_ids);
// parse gff
kstring_t str = {0,0,0};
@@ -1252,7 +1266,7 @@ void init_gff(args_t *args)
else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr);
else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr);
else
- error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,ENSID(ftr->trid),gf_type2gff_string(ftr->type));
+ error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
}
tscript_init_cds(args);
@@ -1270,6 +1284,7 @@ void init_gff(args_t *args)
// keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
kh_destroy(int2tscript,aux->id2tr);
free(aux->seq);
+ gff_id_destroy(&aux->gene_ids);
if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) )
{
@@ -1409,7 +1424,7 @@ void destroy_data(args_t *args)
free(args->gt_arr);
free(args->str.s);
free(args->str2.s);
- free(ENSID_FMT);
+ gff_id_destroy(&args->tscript_ids);
}
/*
@@ -2491,7 +2506,7 @@ exit_duplicate:
#define node2rend(i) (hap->stack[i].node->sbeg + hap->stack[i].node->rlen)
#define node2rpos(i) (hap->stack[i].node->rec->pos)
-void kput_vcsq(vcsq_t *csq, kstring_t *str)
+void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
{
// Remove start/stop from incomplete CDS, but only if there is another
// consequence as something must be reported
@@ -2520,7 +2535,7 @@ void kput_vcsq(vcsq_t *csq, kstring_t *str)
if ( csq->gene ) kputs(csq->gene , str);
kputc_('|', str);
- if ( csq->type & CSQ_PRN_TSCRIPT ) ksprintf(str, "%s",ENSID(csq->trid));
+ if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
kputc_('|', str);
kputs(gf_type2gff_string(csq->biotype), str);
@@ -2889,7 +2904,7 @@ static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap)
fprintf(args->out,"-");
args->str.l = 0;
- kput_vcsq(&csq->type, &args->str);
+ kput_vcsq(args, &csq->type, &args->str);
fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
}
static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
@@ -2913,7 +2928,7 @@ static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ih
fprintf(args->out,"-");
args->str.l = 0;
- kput_vcsq(&csq->type, &args->str);
+ kput_vcsq(args, &csq->type, &args->str);
fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
}
}
@@ -3057,11 +3072,11 @@ void vbuf_flush(args_t *args)
}
args->str.l = 0;
- kput_vcsq(&vrec->vcsq[0], &args->str);
+ kput_vcsq(args, &vrec->vcsq[0], &args->str);
for (j=1; j<vrec->nvcsq; j++)
{
kputc_(',', &args->str);
- kput_vcsq(&vrec->vcsq[j], &args->str);
+ kput_vcsq(args, &vrec->vcsq[j], &args->str);
}
bcf_update_info_string(args->hdr, vrec->line, args->bcsq_tag, args->str.s);
if ( args->hdr_nsmpl )
@@ -3665,7 +3680,7 @@ void process(args_t *args, bcf1_t **rec_ptr)
return;
}
-const char *usage(void)
+static const char *usage(void)
{
return
"\n"
diff --git a/doc/bcftools.1 b/doc/bcftools.1
index 0df9450..80312cd 100644
--- a/doc/bcftools.1
+++ b/doc/bcftools.1
@@ -2,12 +2,12 @@
.\" Title: bcftools
.\" Author: [see the "AUTHORS" section]
.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
-.\" Date: 2017-06-21
+.\" Date: 2017-09-28
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
-.TH "BCFTOOLS" "1" "2017\-06\-21" "\ \&" "\ \&"
+.TH "BCFTOOLS" "1" "2017\-09\-28" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Define some portability stuff
.\" -----------------------------------------------------------------
@@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica
BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&.
.SS "VERSION"
.sp
-This manual page was last updated \fB2017\-06\-21\fR and refers to bcftools git version \fB1\&.5\fR\&.
+This manual page was last updated \fB2017\-09\-28\fR and refers to bcftools git version \fB1\&.6\fR\&.
.SS "BCF1"
.sp
The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&.
@@ -318,6 +318,19 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F
.IP \(bu 2.3
.\}
+\fBsort\fR
+\&.\&. sort VCF/BCF files
+.RE
+.sp
+.RS 4
+.ie n \{\
+\h'-04'\(bu\h'+03'\c
+.\}
+.el \{\
+.sp -1
+.IP \(bu 2.3
+.\}
+
\fBstats\fR
\&.\&. produce VCF/BCF stats (former vcfcheck)
.RE
@@ -461,7 +474,7 @@ cannot be used in combination with
.PP
\fB\-s, \-\-samples\fR [^]\fILIST\fR
.RS 4
-Comma\-separated list of samples to include or exclude if prefixed with "^"\&. Note that in general tags such as INFO/AC, INFO/AN, etc are not updated to correspond to the subset samples\&.
+Comma\-separated list of samples to include or exclude if prefixed with "^"\&. The sample order is updated to reflect that given on the command line\&. Note that in general tags such as INFO/AC, INFO/AN, etc are not updated to correspond to the subset samples\&.
\fBbcftools view\fR
is the exception where some tags will be updated (unless the
\fB\-I, \-\-no\-update\fR
@@ -486,7 +499,7 @@ into that command\&. For example:
.RS 4
File of sample names to include or exclude if prefixed with "^"\&. One sample per line\&. See also the note above for the
\fB\-s, \-\-samples\fR
-option\&. The command
+option\&. The sample order is updated to reflect that given in the input file\&. The command
\fBbcftools call\fR
accepts an optional second column indicating ploidy (0, 1 or 2) or sex (as defined by
\fB\-\-ploidy\fR, for example "F" or "M"), and can parse also PED files\&. If the second column is not present, the sex "F" is assumed\&. With
@@ -1062,12 +1075,12 @@ option can be used
read allele frequencies from a tab\-delimited file with the columns CHR,POS,REF,ALT,AF
.RE
.PP
-*\-o, \-\-output\-dir \fIpath\fR
+\fB\-o, \-\-output\-dir\fR \fIpath\fR
.RS 4
output directory
.RE
.PP
-*\-p, \-\-plot\-threshold \fIfloat\fR
+\fB\-p, \-\-plot\-threshold\fR \fIfloat\fR
.RS 4
call
\fBmatplotlib\fR
@@ -1252,18 +1265,78 @@ see
.sp
Create consensus sequence by applying VCF variants to a reference fasta file\&. By default, the program will apply all ALT variants to the reference fasta to obtain the consensus sequence\&. Using the \fB\-\-sample\fR (and, optionally, \fB\-\-haplotype\fR) option will apply genotype (haplotype) calls from FORMAT/GT\&. Note that the program does not act as a primitive variant caller and ignores allelic depth information, such as INFO/AD or FORMAT/AD\&. For that, consider using the \fBsetG [...]
.PP
+\fB\-c, \-\-chain\fR \fIFILE\fR
+.RS 4
+write a chain file for liftover
+.RE
+.PP
+\fB\-e, \-\-exclude\fR \fIEXPRESSION\fR
+.RS 4
+exclude sites for which
+\fIEXPRESSION\fR
+is true\&. For valid expressions see
+\fBEXPRESSIONS\fR\&.
+.RE
+.PP
\fB\-f, \-\-fasta\-ref\fR \fIFILE\fR
.RS 4
reference sequence in fasta format
.RE
.PP
-\fB\-H, \-\-haplotype\fR \fI1\fR|\fI2\fR
+\fB\-H, \-\-haplotype\fR \fI1\fR|\fI2\fR|\fIR\fR|\fIA\fR|\fILR\fR|\fILA\fR|\fISR\fR|\fISA\fR
+.RS 4
+choose which allele from the FORMAT/GT field to use (the codes are case\-insensitive):
+.PP
+\fI1\fR
+.RS 4
+the first allele
+.RE
+.PP
+\fI2\fR
+.RS 4
+the second allele
+.RE
+.PP
+\fIR\fR
+.RS 4
+the REF allele (in heterozygous genotypes)
+.RE
+.PP
+\fIA\fR
+.RS 4
+the ALT allele (in heterozygous genotypes)
+.RE
+.PP
+\fILR, LA\fR
+.RS 4
+the longer allele\&. If both have the same length, use the REF allele (LR), or the ALT allele (LA)
+.RE
+.PP
+\fISR, SA\fR
+.RS 4
+the shorter allele\&. If both have the same length, use the REF allele (SR), or the ALT allele (SA)
+.sp
+.if n \{\
+.RS 4
+.\}
+.nf
+This option requires *\-s*, unless exactly one sample is present in the VCF
+.fi
+.if n \{\
+.RE
+.\}
+.RE
+.RE
+.PP
+\fB\-i, \-\-include\fR \fIEXPRESSION\fR
.RS 4
-apply variants for the given haplotype\&. This option requires
-\fB\-s\fR, unless exactly one sample is present in the VCF
+include only sites for which
+\fIEXPRESSION\fR
+is true\&. For valid expressions see
+\fBEXPRESSIONS\fR\&.
.RE
.PP
-\fB\-i, \-\-iupac\-codes\fR
+\fB\-I, \-\-iupac\-codes\fR
.RS 4
output variants in the form of IUPAC ambiguity codes
.RE
@@ -2899,13 +2972,12 @@ and
can swap alleles and will update genotypes (GT) and AC counts, but will not attempt to fix PL or other fields\&.
.RE
.PP
-\fB\-d, \-\-rm\-dup\fR \fIsnps\fR|\fIindels\fR|\fIboth\fR|\fIall\fR|\fInone\fR
+\fB\-d, \-\-rm\-dup\fR \fIsnps\fR|\fIindels\fR|\fIboth\fR|\fIany\fR
.RS 4
If a record is present multiple times, output only the first instance, see
\fB\-\-collapse\fR
in
-\fBCommon Options\fR\&. Requires
-\fB\-a, \-\-allow\-overlaps\fR\&.
+\fBCommon Options\fR\&.
.RE
.PP
\fB\-D, \-\-remove\-duplicates\fR
@@ -3144,6 +3216,16 @@ find positions with wildly varying ALT allele frequency (Fisher test on FMT/AD)
collect AF deviation stats and GT probability distribution given AF and assuming HWE
.RE
.PP
+\fBcheck\-ploidy\fR
+.RS 4
+check if ploidy of samples is consistent for all sites
+.RE
+.PP
+\fBcheck\-sparsity\fR
+.RS 4
+print samples without genotypes in a region or chromosome
+.RE
+.PP
\fBcolor\-chrs\fR
.RS 4
color shared chromosomal segments, requires trio VCF with phased GTs
@@ -3171,7 +3253,7 @@ fill INFO or REF field based on values in a fasta file
.PP
\fBfill\-tags\fR
.RS 4
-set INFO tags AF, AN, AC, NS, AC_Hom, AC_Het, AC_Hemi
+set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, HWE, MAF, NS
.RE
.PP
\fBfix\-ploidy\fR
@@ -3199,6 +3281,11 @@ determine sample sex by checking genotype likelihoods (GL,PL) or genotypes (GT)
add imputation information metrics to the INFO field based on selected FORMAT tags
.RE
.PP
+\fBisecGT\fR
+.RS 4
+compare two files and set non\-identical genotypes to missing
+.RE
+.PP
\fBmendelian\fR
.RS 4
count Mendelian consistent / inconsistent genotypes\&.
@@ -3209,6 +3296,11 @@ count Mendelian consistent / inconsistent genotypes\&.
sets missing genotypes ("\&./\&.") to ref allele ("0/0" or "0|0")
.RE
.PP
+\fBprune\fR
+.RS 4
+prune sites by missingness or linkage disequilibrium
+.RE
+.PP
\fBsetGT\fR
.RS 4
general tool to set genotypes according to rules requested by the user
@@ -3845,6 +3937,29 @@ estimate HMM parameters using Baum\-Welch algorithm, using the convergence thres
\fIFLOAT\fR, e\&.g\&. 1e\-10 (experimental)
.RE
.RE
+.SS "bcftools sort [\fIOPTIONS\fR] file\&.bcf"
+.PP
+\fB\-m, \-\-max\-mem\fR \fIFLOAT\fR[\fIkMG\fR]
+.RS 4
+Maximum memory to use\&. Approximate, affects the number of temporary files written to the disk\&. Note that if the command fails at this step because of too many open files, your system limit on the number of open files ("ulimit") may need to be increased\&.
+.RE
+.PP
+\fB\-o, \-\-output\fR \fIFILE\fR
+.RS 4
+see
+\fBCommon Options\fR
+.RE
+.PP
+\fB\-O, \-\-output\-type\fR \fIb\fR|\fIu\fR|\fIz\fR|\fIv\fR
+.RS 4
+see
+\fBCommon Options\fR
+.RE
+.PP
+\fB\-T, \-\-temp\-dir\fR \fIDIR\fR
+.RS 4
+Use this directory to store temporary files
+.RE
.SS "bcftools stats [\fIOPTIONS\fR] \fIA\&.vcf\&.gz\fR [\fIB\&.vcf\&.gz\fR]"
.sp
Parses VCF or BCF and produces text file stats which is suitable for machine processing and can be plotted using \fBplot\-vcfstats\fR\&. When two files are given, the program generates separate stats for intersection and the complements\&. By default only sites are compared, \fB\-s\fR/\fB\-S\fR must given to include also sample columns\&. When one VCF file is specified on the command line, then stats by non\-reference allele frequency, depth distribution, stats by quality and per\-sample [...]
@@ -4479,6 +4594,35 @@ GT="\&.|\&.", GT="\&./\&.", GT="\&."
.sp -1
.IP \(bu 2.3
.\}
+sample genotype: homozygous, heterozygous, haploid, ref\-ref hom, alt\-alt hom, ref\-alt het, alt\-alt het, haploid ref, haploid alt (case\-insensitive)
+.sp
+.if n \{\
+.RS 4
+.\}
+.nf
+GT="hom"
+GT="het"
+GT="hap"
+GT="RR"
+GT="AA"
+GT="RA" or GT="AR"
+GT="Aa" or GT="aA"
+GT="R"
+GT="A"
+.fi
+.if n \{\
+.RE
+.\}
+.RE
+.sp
+.RS 4
+.ie n \{\
+\h'-04'\(bu\h'+03'\c
+.\}
+.el \{\
+.sp -1
+.IP \(bu 2.3
+.\}
TYPE for variant type in REF,ALT columns (indel,snp,mnp,ref,bnd,other)\&. Use the regex operator "\e~" to require at least one allele of the given type or the equal sign "=" to require that all alleles are of the given type\&. Compare
.sp
.if n \{\
@@ -4503,7 +4647,7 @@ TYPE!~"snp"
.sp -1
.IP \(bu 2.3
.\}
-array subscripts, "*" for any field
+array subscripts (0\-based), "*" for any field, "\-" to indicate a range
.sp
.if n \{\
.RS 4
@@ -4512,6 +4656,9 @@ array subscripts, "*" for any field
(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0\&.3
DP4[*] == 0
CSQ[*] ~ "missense_variant\&.*deleterious"
+FORMAT/DP[1\-3] > 10
+FORMAT/DP[1\-] < 7
+FORMAT/DP[0,2\-4] > 20
.fi
.if n \{\
.RE
diff --git a/doc/bcftools.html b/doc/bcftools.html
index 5104307..ac22e4b 100644
--- a/doc/bcftools.html
+++ b/doc/bcftools.html
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp25137632"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipu [...]
+<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp53408"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulat [...]
Call Format (VCF) and its binary counterpart BCF. All commands work
transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.</p><p>Most commands accept VCF, bgzipped VCF and BCF with filetype detected
automatically even when streaming from a pipe. Indexed VCF and BCF
@@ -8,7 +8,7 @@ will work in all situations. Un-indexed VCF and BCF and streams will
work in most, but not all situations. In general, whenever multiple VCFs are
read simultaneously, they must be indexed and therefore also compressed.</p><p>BCFtools is designed to work on a stream. It regards an input file "-" as the
standard input (stdin) and outputs to the standard output (stdout). Several
-commands can thus be combined with Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2017-06-21</strong></span> and refers to bcftools git version <span class="strong"><strong>1.5</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools <= 0.1.19 is <span class="strong"><strong>not</strong></span>
+commands can thus be combined with Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2017-09-28</strong></span> and refers to bcftools git version <span class="strong"><strong>1.6</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools <= 0.1.19 is <span class="strong"><strong>not</strong></span>
compatible with this version of bcftools. To read BCF1 files one can use
the view command from old versions of bcftools packaged with samtools
versions <= 0.1.19 to convert to VCF, which can then be read by
@@ -57,6 +57,8 @@ list of available options, run <span class="strong"><strong>bcftools</strong></s
</li><li class="listitem">
<span class="strong"><strong><a class="link" href="#roh" title="bcftools roh [OPTIONS] file.vcf.gz">roh</a></strong></span> .. identify runs of homo/auto-zygosity
</li><li class="listitem">
+<span class="strong"><strong><a class="link" href="#sort" title="bcftools sort [OPTIONS] file.bcf">sort</a></strong></span> .. sort VCF/BCF files
+</li><li class="listitem">
<span class="strong"><strong><a class="link" href="#stats" title="bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz]">stats</a></strong></span> .. produce VCF/BCF stats (former vcfcheck)
</li><li class="listitem">
<span class="strong"><strong><a class="link" href="#view" title="bcftools view [OPTIONS] file.vcf.gz [REGION […]]">view</a></strong></span> .. subset, filter and convert VCF and BCF files
@@ -167,6 +169,7 @@ specific commands to see if they apply.</p><div class="variablelist"><dl><dt><sp
</span></dt><dd>
Comma-separated list of samples to include or exclude if prefixed
with "^".
+ The sample order is updated to reflect that given on the command line.
Note that in general tags such as INFO/AC, INFO/AN, etc are not updated
to correspond to the subset samples. <span class="strong"><strong><a class="link" href="#view" title="bcftools view [OPTIONS] file.vcf.gz [REGION […]]">bcftools view</a></strong></span> is the
exception where some tags will be updated (unless the <span class="strong"><strong>-I, --no-update</strong></span>
@@ -179,6 +182,7 @@ specific commands to see if they apply.</p><div class="variablelist"><dl><dt><sp
File of sample names to include or exclude if prefixed with "^".
One sample per line. See also the note above for the <span class="strong"><strong>-s, --samples</strong></span>
option.
+ The sample order is updated to reflect that given in the input file.
The command <span class="strong"><strong><a class="link" href="#call" title="bcftools call [OPTIONS] FILE">bcftools call</a></strong></span> accepts an optional second
column indicating ploidy (0, 1 or 2) or sex (as defined by
<span class="strong"><strong><a class="link" href="#ploidy">--ploidy</a></strong></span>, for example "F" or "M"), and can parse also PED
@@ -564,11 +568,11 @@ loss), 0 (complete loss), 3 (single-copy gain).</p><div class="refsect3" title="
</span></dt><dd>
read allele frequencies from a tab-delimited file with the columns CHR,POS,REF,ALT,AF
</dd><dt><span class="term">
-*-o, --output-dir <span class="emphasis"><em>path</em></span>
+<span class="strong"><strong>-o, --output-dir</strong></span> <span class="emphasis"><em>path</em></span>
</span></dt><dd>
output directory
</dd><dt><span class="term">
-*-p, --plot-threshold <span class="emphasis"><em>float</em></span>
+<span class="strong"><strong>-p, --plot-threshold</strong></span> <span class="emphasis"><em>float</em></span>
</span></dt><dd>
call <span class="strong"><strong>matplotlib</strong></span> to produce plots for chromosomes with quality at least <span class="emphasis"><em>float</em></span>,
useful for visual inspection of the calls. With <span class="strong"><strong>-p 0</strong></span>, plots for all chromosomes will be
@@ -717,16 +721,53 @@ obtain the consensus sequence. Using the <span class="strong"><strong>--sample</
Note that the program does not act as a primitive variant caller and ignores allelic
depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
<span class="strong"><strong>setGT</strong></span> plugin.</p><div class="variablelist"><dl><dt><span class="term">
+<span class="strong"><strong>-c, --chain</strong></span> <span class="emphasis"><em>FILE</em></span>
+</span></dt><dd>
+ write a chain file for liftover
+</dd><dt><span class="term">
+<span class="strong"><strong>-e, --exclude</strong></span> <span class="emphasis"><em>EXPRESSION</em></span>
+</span></dt><dd>
+ exclude sites for which <span class="emphasis"><em>EXPRESSION</em></span> is true. For valid expressions see
+ <span class="strong"><strong><a class="link" href="#expressions" title="EXPRESSIONS">EXPRESSIONS</a></strong></span>.
+</dd><dt><span class="term">
<span class="strong"><strong>-f, --fasta-ref</strong></span> <span class="emphasis"><em>FILE</em></span>
</span></dt><dd>
reference sequence in fasta format
</dd><dt><span class="term">
-<span class="strong"><strong>-H, --haplotype</strong></span> <span class="emphasis"><em>1</em></span>|<span class="emphasis"><em>2</em></span>
+<span class="strong"><strong>-H, --haplotype</strong></span> <span class="emphasis"><em>1</em></span>|<span class="emphasis"><em>2</em></span>|<span class="emphasis"><em>R</em></span>|<span class="emphasis"><em>A</em></span>|<span class="emphasis"><em>LR</em></span>|<span class="emphasis"><em>LA</em></span>|<span class="emphasis"><em>SR</em></span>|<span class="emphasis"><em>SA</em></span>
+</span></dt><dd><p class="simpara">
+ choose which allele from the FORMAT/GT field to use (the codes are case-insensitive):
+</p><div class="variablelist"><dl><dt><span class="term">
+<span class="emphasis"><em>1</em></span>
+</span></dt><dd>
+ the first allele
+</dd><dt><span class="term">
+<span class="emphasis"><em>2</em></span>
+</span></dt><dd>
+ the second allele
+</dd><dt><span class="term">
+<span class="emphasis"><em>R</em></span>
+</span></dt><dd>
+ the REF allele (in heterozygous genotypes)
+</dd><dt><span class="term">
+<span class="emphasis"><em>A</em></span>
+</span></dt><dd>
+ the ALT allele (in heterozygous genotypes)
+</dd><dt><span class="term">
+<span class="emphasis"><em>LR, LA</em></span>
+</span></dt><dd>
+ the longer allele. If both have the same length, use the REF allele (LR), or the ALT allele (LA)
+</dd><dt><span class="term">
+<span class="emphasis"><em>SR, SA</em></span>
+</span></dt><dd><p class="simpara">
+ the shorter allele. If both have the same length, use the REF allele (SR), or the ALT allele (SA)
+</p><pre class="literallayout">This option requires *-s*, unless exactly one sample is present in the VCF</pre></dd></dl></div></dd><dt><span class="term">
+<span class="strong"><strong>-i, --include</strong></span> <span class="emphasis"><em>EXPRESSION</em></span>
</span></dt><dd>
- apply variants for the given haplotype. This option requires <span class="strong"><strong>-s</strong></span>, unless
- exactly one sample is present in the VCF
+ include only sites for which <span class="emphasis"><em>EXPRESSION</em></span> is true. For valid expressions see
+ <span class="strong"><strong><a class="link" href="#expressions" title="EXPRESSIONS">EXPRESSIONS</a></strong></span>.
</dd><dt><span class="term">
-<span class="strong"><strong>-i, --iupac-codes</strong></span>
+<span class="strong"><strong>-I, --iupac-codes</strong></span>
</span></dt><dd>
output variants in the form of IUPAC ambiguity codes
</dd><dt><span class="term">
@@ -1732,11 +1773,10 @@ the <span class="strong"><strong><a class="link" href="#fasta_ref">--fasta-ref</
can swap alleles and will update genotypes (GT) and AC counts,
but will not attempt to fix PL or other fields.
</dd><dt><span class="term">
-<span class="strong"><strong>-d, --rm-dup</strong></span> <span class="emphasis"><em>snps</em></span>|<span class="emphasis"><em>indels</em></span>|<span class="emphasis"><em>both</em></span>|<span class="emphasis"><em>all</em></span>|<span class="emphasis"><em>none</em></span>
+<span class="strong"><strong>-d, --rm-dup</strong></span> <span class="emphasis"><em>snps</em></span>|<span class="emphasis"><em>indels</em></span>|<span class="emphasis"><em>both</em></span>|<span class="emphasis"><em>any</em></span>
</span></dt><dd>
If a record is present multiple times, output only the first instance,
see <span class="strong"><strong>--collapse</strong></span> in <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>.
- Requires <span class="strong"><strong>-a, --allow-overlaps</strong></span>.
</dd><dt><span class="term">
<span class="strong"><strong>-D, --remove-duplicates</strong></span>
</span></dt><dd>
@@ -1891,6 +1931,14 @@ the usage examples that each plugin comes with.</p><div class="refsect3" title="
</span></dt><dd>
collect AF deviation stats and GT probability distribution given AF and assuming HWE
</dd><dt><span class="term">
+<span class="strong"><strong>check-ploidy</strong></span>
+</span></dt><dd>
+ check if ploidy of samples is consistent for all sites
+</dd><dt><span class="term">
+<span class="strong"><strong>check-sparsity</strong></span>
+</span></dt><dd>
+ print samples without genotypes in a region or chromosome
+</dd><dt><span class="term">
<span class="strong"><strong>color-chrs</strong></span>
</span></dt><dd>
color shared chromosomal segments, requires trio VCF with phased GTs
@@ -1914,7 +1962,7 @@ the usage examples that each plugin comes with.</p><div class="refsect3" title="
</dd><dt><span class="term">
<span class="strong"><strong>fill-tags</strong></span>
</span></dt><dd>
- set INFO tags AF, AN, AC, NS, AC_Hom, AC_Het, AC_Hemi
+ set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, HWE, MAF, NS
</dd><dt><span class="term">
<span class="strong"><strong>fix-ploidy</strong></span>
</span></dt><dd>
@@ -1937,6 +1985,10 @@ the usage examples that each plugin comes with.</p><div class="refsect3" title="
</span></dt><dd>
add imputation information metrics to the INFO field based on selected FORMAT tags
</dd><dt><span class="term">
+<span class="strong"><strong>isecGT</strong></span>
+</span></dt><dd>
+ compare two files and set non-identical genotypes to missing
+</dd><dt><span class="term">
<span class="strong"><strong>mendelian</strong></span>
</span></dt><dd>
count Mendelian consistent / inconsistent genotypes.
@@ -1945,6 +1997,10 @@ the usage examples that each plugin comes with.</p><div class="refsect3" title="
</span></dt><dd>
sets missing genotypes ("./.") to ref allele ("0/0" or "0|0")
</dd><dt><span class="term">
+<span class="strong"><strong>prune</strong></span>
+</span></dt><dd>
+ prune sites by missingness or linkage disequilibrium
+</dd><dt><span class="term">
<span class="strong"><strong>setGT</strong></span>
</span></dt><dd>
general tool to set genotypes according to rules requested by the user
@@ -2291,7 +2347,25 @@ Transition probabilities:
</span></dt><dd>
estimate HMM parameters using Baum-Welch algorithm, using the convergence threshold
<span class="emphasis"><em>FLOAT</em></span>, e.g. 1e-10 (experimental)
-</dd></dl></div></div></div><div class="refsect2" title="bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz]"><a id="stats"></a><h3>bcftools stats [<span class="emphasis"><em>OPTIONS</em></span>] <span class="emphasis"><em>A.vcf.gz</em></span> [<span class="emphasis"><em>B.vcf.gz</em></span>]</h3><p>Parses VCF or BCF and produces text file stats which is suitable for machine
+</dd></dl></div></div></div><div class="refsect2" title="bcftools sort [OPTIONS] file.bcf"><a id="sort"></a><h3>bcftools sort [<span class="emphasis"><em>OPTIONS</em></span>] file.bcf</h3><div class="variablelist"><dl><dt><span class="term">
+<span class="strong"><strong>-m, --max-mem</strong></span> <span class="emphasis"><em>FLOAT</em></span>[<span class="emphasis"><em>kMG</em></span>]
+</span></dt><dd>
+ Maximum memory to use. Approximate, affects the number of temporary files written
+ to the disk. Note that if the command fails at this step because of too many open files,
+ your system limit on the number of open files ("ulimit") may need to be increased.
+</dd><dt><span class="term">
+<span class="strong"><strong>-o, --output</strong></span> <span class="emphasis"><em>FILE</em></span>
+</span></dt><dd>
+ see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
+</dd><dt><span class="term">
+<span class="strong"><strong>-O, --output-type</strong></span> <span class="emphasis"><em>b</em></span>|<span class="emphasis"><em>u</em></span>|<span class="emphasis"><em>z</em></span>|<span class="emphasis"><em>v</em></span>
+</span></dt><dd>
+ see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
+</dd><dt><span class="term">
+<span class="strong"><strong>-T, --temp-dir</strong></span> <span class="emphasis"><em>DIR</em></span>
+</span></dt><dd>
+ Use this directory to store temporary files
+</dd></dl></div></div><div class="refsect2" title="bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz]"><a id="stats"></a><h3>bcftools stats [<span class="emphasis"><em>OPTIONS</em></span>] <span class="emphasis"><em>A.vcf.gz</em></span> [<span class="emphasis"><em>B.vcf.gz</em></span>]</h3><p>Parses VCF or BCF and produces text file stats which is suitable for machine
processing and can be plotted using <span class="strong"><strong><a class="link" href="#plot-vcfstats" title="plot-vcfstats [OPTIONS] file.vchk […]">plot-vcfstats</a></strong></span>. When two files are given,
the program generates separate stats for intersection and the complements. By
default only sites are compared, <span class="strong"><strong>-s</strong></span>/<span class="strong"><strong>-S</strong></span> must given to include also sample
@@ -2607,6 +2681,17 @@ using these expressions
missing genotypes can be matched including the phase and ploidy (".|.", "./.", ".")
using these expressions
</p><pre class="literallayout">GT=".|.", GT="./.", GT="."</pre></li><li class="listitem"><p class="simpara">
+sample genotype: homozygous, heterozygous, haploid, ref-ref hom, alt-alt hom,
+ref-alt het, alt-alt het, haploid ref, haploid alt (case-insensitive)
+</p><pre class="literallayout">GT="hom"
+GT="het"
+GT="hap"
+GT="RR"
+GT="AA"
+GT="RA" or GT="AR"
+GT="Aa" or GT="aA"
+GT="R"
+GT="A"</pre></li><li class="listitem"><p class="simpara">
TYPE for variant type in REF,ALT columns (indel,snp,mnp,ref,bnd,other). Use the regex
operator "\~" to require at least one allele of the given type or the equal sign "="
to require that all alleles are of the given type. Compare
@@ -2614,10 +2699,13 @@ to require that all alleles are of the given type. Compare
TYPE~"snp"
TYPE!="snp"
TYPE!~"snp"</pre></li><li class="listitem"><p class="simpara">
-array subscripts, "*" for any field
+array subscripts (0-based), "*" for any field, "-" to indicate a range
</p><pre class="literallayout">(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0.3
DP4[*] == 0
-CSQ[*] ~ "missense_variant.*deleterious"</pre></li><li class="listitem"><p class="simpara">
+CSQ[*] ~ "missense_variant.*deleterious"
+FORMAT/DP[1-3] > 10
+FORMAT/DP[1-] < 7
+FORMAT/DP[0,2-4] > 20</pre></li><li class="listitem"><p class="simpara">
function on FORMAT tags (over samples) and INFO tags (over vector fields)
</p><pre class="literallayout">MAX, MIN, AVG, SUM, STRLEN, ABS</pre></li><li class="listitem"><p class="simpara">
variables calculated on the fly if not present: number of alternate alleles;
diff --git a/doc/bcftools.txt b/doc/bcftools.txt
index a6ef03b..fc5c25a 100644
--- a/doc/bcftools.txt
+++ b/doc/bcftools.txt
@@ -93,6 +93,7 @@ list of available options, run *bcftools* 'COMMAND' without arguments.
- *<<query,query>>* .. transform VCF/BCF into user-defined formats
- *<<reheader,reheader>>* .. modify VCF/BCF header, change sample names
- *<<roh,roh>>* .. identify runs of homo/auto-zygosity
+- *<<sort,sort>>* .. sort VCF/BCF files
- *<<stats,stats>>* .. produce VCF/BCF stats (former vcfcheck)
- *<<view,view>>* .. subset, filter and convert VCF and BCF files
@@ -201,6 +202,7 @@ specific commands to see if they apply.
*-s, --samples* \[^]'LIST'::
Comma-separated list of samples to include or exclude if prefixed
with "^".
+ The sample order is updated to reflect that given on the command line.
Note that in general tags such as INFO/AC, INFO/AN, etc are not updated
to correspond to the subset samples. *<<view,bcftools view>>* is the
exception where some tags will be updated (unless the *-I, --no-update*
@@ -215,6 +217,7 @@ specific commands to see if they apply.
File of sample names to include or exclude if prefixed with "^".
One sample per line. See also the note above for the *-s, --samples*
option.
+ The sample order is updated to reflect that given in the input file.
The command *<<call,bcftools call>>* accepts an optional second
column indicating ploidy (0, 1 or 2) or sex (as defined by
*<<ploidy,--ploidy>>*, for example "F" or "M"), and can parse also PED
@@ -601,10 +604,10 @@ loss), 0 (complete loss), 3 (single-copy gain).
*-f, --AF-file* 'file'::
read allele frequencies from a tab-delimited file with the columns CHR,POS,REF,ALT,AF
-*-o, --output-dir 'path'::
+*-o, --output-dir* 'path'::
output directory
-*-p, --plot-threshold 'float'::
+*-p, --plot-threshold* 'float'::
call *matplotlib* to produce plots for chromosomes with quality at least 'float',
useful for visual inspection of the calls. With *-p 0*, plots for all chromosomes will be
generated. If not given, a *matplotlib* script will be created but not called.
@@ -739,14 +742,44 @@ Note that the program does not act as a primitive variant caller and ignores all
depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
*setGT* plugin.
+*-c, --chain* 'FILE'::
+ write a chain file for liftover
+
+*-e, --exclude* 'EXPRESSION'::
+ exclude sites for which 'EXPRESSION' is true. For valid expressions see
+ *<<expressions,EXPRESSIONS>>*.
+
*-f, --fasta-ref* 'FILE'::
reference sequence in fasta format
-*-H, --haplotype* '1'|'2'::
- apply variants for the given haplotype. This option requires *-s*, unless
- exactly one sample is present in the VCF
+*-H, --haplotype* '1'|'2'|'R'|'A'|'LR'|'LA'|'SR'|'SA'::
+ choose which allele from the FORMAT/GT field to use (the codes are case-insensitive):
+
+ '1';;
+ the first allele
+
+ '2';;
+ the second allele
+
+ 'R';;
+ the REF allele (in heterozygous genotypes)
+
+ 'A';;
+ the ALT allele (in heterozygous genotypes)
+
+ 'LR, LA';;
+ the longer allele. If both have the same length, use the REF allele (LR), or the ALT allele (LA)
+
+ 'SR, SA';;
+ the shorter allele. If both have the same length, use the REF allele (SR), or the ALT allele (SA)
+
+ This option requires *-s*, unless exactly one sample is present in the VCF
+
+*-i, --include* 'EXPRESSION'::
+ include only sites for which 'EXPRESSION' is true. For valid expressions see
+ *<<expressions,EXPRESSIONS>>*.
-*-i, --iupac-codes*::
+*-I, --iupac-codes*::
output variants in the form of IUPAC ambiguity codes
*-m, --mask* 'FILE'::
@@ -1770,10 +1803,9 @@ the *<<fasta_ref,--fasta-ref>>* option is supplied.
can swap alleles and will update genotypes (GT) and AC counts,
but will not attempt to fix PL or other fields.
-*-d, --rm-dup* 'snps'|'indels'|'both'|'all'|'none'::
+*-d, --rm-dup* 'snps'|'indels'|'both'|'any'::
If a record is present multiple times, output only the first instance,
see *--collapse* in *<<common_options,Common Options>>*.
- Requires *-a, --allow-overlaps*.
*-D, --remove-duplicates*::
If a record is present in multiple files, output only the first instance.
@@ -1915,6 +1947,12 @@ By default, appropriate system directories are searched for installed plugins.
*af-dist*::
collect AF deviation stats and GT probability distribution given AF and assuming HWE
+*check-ploidy*::
+ check if ploidy of samples is consistent for all sites
+
+*check-sparsity*::
+ print samples without genotypes in a region or chromosome
+
*color-chrs*::
color shared chromosomal segments, requires trio VCF with phased GTs
@@ -1932,7 +1970,7 @@ By default, appropriate system directories are searched for installed plugins.
fill INFO or REF field based on values in a fasta file
*fill-tags*::
- set INFO tags AF, AN, AC, NS, AC_Hom, AC_Het, AC_Hemi
+ set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, HWE, MAF, NS
*fix-ploidy*::
sets correct ploidy
@@ -1950,12 +1988,18 @@ By default, appropriate system directories are searched for installed plugins.
*impute-info*::
add imputation information metrics to the INFO field based on selected FORMAT tags
+*isecGT*::
+ compare two files and set non-identical genotypes to missing
+
*mendelian*::
count Mendelian consistent / inconsistent genotypes.
*missing2ref*::
sets missing genotypes ("./.") to ref allele ("0/0" or "0|0")
+*prune*::
+ prune sites by missingness or linkage disequilibrium
+
*setGT*::
general tool to set genotypes according to rules requested by the user
@@ -2321,6 +2365,24 @@ Transition probabilities:
'FLOAT', e.g. 1e-10 (experimental)
+[[sort]]
+=== bcftools sort ['OPTIONS'] file.bcf
+
+*-m, --max-mem* 'FLOAT'['kMG']::
+ Maximum memory to use. Approximate, affects the number of temporary files written
+ to the disk. Note that if the command fails at this step because of too many open files,
+ your system limit on the number of open files ("ulimit") may need to be increased.
+
+*-o, --output* 'FILE'::
+ see *<<common_options,Common Options>>*
+
+*-O, --output-type* 'b'|'u'|'z'|'v'::
+ see *<<common_options,Common Options>>*
+
+*-T, --temp-dir* 'DIR'::
+ Use this directory to store temporary files
+
+
[[stats]]
=== bcftools stats ['OPTIONS'] 'A.vcf.gz' ['B.vcf.gz']
@@ -2647,6 +2709,19 @@ using these expressions
GT=".|.", GT="./.", GT="."
+* sample genotype: homozygous, heterozygous, haploid, ref-ref hom, alt-alt hom,
+ref-alt het, alt-alt het, haploid ref, haploid alt (case-insensitive)
+
+ GT="hom"
+ GT="het"
+ GT="hap"
+ GT="RR"
+ GT="AA"
+ GT="RA" or GT="AR"
+ GT="Aa" or GT="aA"
+ GT="R"
+ GT="A"
+
* TYPE for variant type in REF,ALT columns (indel,snp,mnp,ref,bnd,other). Use the regex
operator "\~" to require at least one allele of the given type or the equal sign "="
to require that all alleles are of the given type. Compare
@@ -2656,11 +2731,14 @@ to require that all alleles are of the given type. Compare
TYPE!="snp"
TYPE!~"snp"
-* array subscripts, "*" for any field
+* array subscripts (0-based), "*" for any field, "-" to indicate a range
(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0.3
DP4[*] == 0
CSQ[*] ~ "missense_variant.*deleterious"
+ FORMAT/DP[1-3] > 10
+ FORMAT/DP[1-] < 7
+ FORMAT/DP[0,2-4] > 20
* function on FORMAT tags (over samples) and INFO tags (over vector fields)
diff --git a/filter.c b/filter.c
index 78ff1f1..3dc91a7 100644
--- a/filter.c
+++ b/filter.c
@@ -67,21 +67,23 @@ typedef struct _token_t
char *tag; // for debugging and printout only, VCF tag name
double threshold; // filtering threshold
int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types
- int idx; // 0-based index to VCF vectors, -1: not a vector, -2: any field ([*])
+ int idx; // 0-based index to VCF vectors, -1: not a vector,
+ // -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
+ int *idxs, nidxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited
void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
int (*comparator)(struct _token_t *, struct _token_t *, int op_type, bcf1_t *);
void *hash; // test presence of str value in the hash via comparator
regex_t *regex; // precompiled regex for string comparison
// modified on filter evaluation at each VCF line
- double *values; // In case str_value is set, values[0] is one sample's string length
- char *str_value; // and values[0]*nsamples gives the total length;
+ double *values;
+ kstring_t str_value;
int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues
int pass_site; // -1 not applicable, 0 fails, >0 pass
uint8_t *pass_samples; // status of individual samples
int nsamples; // number of samples
- int nvalues, mvalues; // number of used values, n=0 for missing values, n=1 for scalars
- // for strings, total length of str_value
+ int nvalues, mvalues; // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l
+ int nstr1; // per-sample string length, set only with str_value.l>0 && nsamples>1
}
token_t;
@@ -93,6 +95,7 @@ struct _filter_t
token_t *filters, **flt_stack; // filtering input tokens (in RPN) and evaluation stack
int32_t *tmpi;
float *tmpf;
+ kstring_t tmps;
int max_unpack, mtmpi, mtmpf, nsamples;
};
@@ -169,6 +172,7 @@ static int filters_next_token(char **str, int *len)
return TOK_VAL;
}
+ int square_brackets = 0;
while ( tmp[0] )
{
if ( tmp[0]=='"' ) break;
@@ -183,11 +187,12 @@ static int filters_next_token(char **str, int *len)
if ( tmp[0]=='(' ) break;
if ( tmp[0]==')' ) break;
if ( tmp[0]=='+' ) break;
- // hacky: so that [*] is not split, the tokenizer does not recognise square brackets []
- if ( tmp[0]=='*' && (tmp==*str || tmp[-1]!='[') ) break;
- if ( tmp[0]=='-' ) break;
+ if ( tmp[0]=='*' && !square_brackets ) break;
+ if ( tmp[0]=='-' && !square_brackets ) break;
if ( tmp[0]=='/' ) break;
if ( tmp[0]=='~' ) break;
+ if ( tmp[0]==']' ) { if (square_brackets) tmp++; break; }
+ if ( tmp[0]=='[' ) square_brackets++;
tmp++;
}
if ( tmp > *str )
@@ -270,12 +275,15 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
else if ( line->d.info[i].type==BCF_BT_CHAR )
{
int n = line->d.info[i].len;
- int m = (int)tok->values[0];
- hts_expand(char,n+1,m,tok->str_value);
- memcpy(tok->str_value,line->d.info[i].vptr,n);
- tok->str_value[n] = 0;
- tok->values[0] = m;
- tok->nvalues = n;
+ if ( n >= tok->str_value.m )
+ {
+ tok->str_value.m = n + 1;
+ tok->str_value.s = (char*) realloc(tok->str_value.s, tok->str_value.m);
+ if ( !tok->str_value.s ) error("Failed to alloc %d bytes\n", (int)tok->str_value.m);
+ }
+ memcpy(tok->str_value.s, line->d.info[i].vptr, n);
+ tok->str_value.s[n] = 0;
+ tok->nvalues = tok->str_value.l = n;
}
else if ( line->d.info[i].type==BCF_BT_FLOAT )
{
@@ -285,10 +293,11 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
tok->values[0] = line->d.info[i].v1.f;
tok->nvalues = 1;
}
- tok->str_value = NULL;
+ tok->str_value.l = 0;
}
else
{
+ tok->str_value.l = 0;
if ( line->d.info[i].type==BCF_BT_INT8 && line->d.info[i].v1.i==bcf_int8_missing ) tok->nvalues = 0;
else if ( line->d.info[i].type==BCF_BT_INT16 && line->d.info[i].v1.i==bcf_int16_missing ) tok->nvalues = 0;
else if ( line->d.info[i].type==BCF_BT_INT32 && line->d.info[i].v1.i==bcf_int32_missing ) tok->nvalues = 0;
@@ -297,7 +306,6 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
tok->values[0] = line->d.info[i].v1.i;
tok->nvalues = 1;
}
- tok->str_value = NULL;
}
}
static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
@@ -346,8 +354,8 @@ static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *lin
return ret ? 0 : 1;
}
- if ( op_type==TOK_EQ ) return strcmp(btok->str_value,line->d.id) ? 0 : 1;
- return strcmp(btok->str_value,line->d.id) ? 1 : 0;
+ if ( op_type==TOK_EQ ) return strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
+ return strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
}
/**
@@ -409,13 +417,16 @@ static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok)
{
if ( tok->idx==-2 )
{
- int i;
tok->nvalues = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi);
if ( tok->nvalues<=0 ) tok->nvalues = 0;
else
{
hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
- for (i=0; i<tok->nvalues; i++) tok->values[i] = flt->tmpi[i];
+ int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? tok->nvalues - 1 : tok->nidxs - 1;
+ if ( end >= tok->nvalues ) end = tok->nvalues - 1;
+ for (i=0; i<=end; i++)
+ if ( i>=tok->nidxs || tok->idxs[i] ) tok->values[j++] = flt->tmpi[i];
+ tok->nvalues = j;
}
}
else
@@ -435,15 +446,21 @@ static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok)
{
if ( tok->idx==-2 )
{
- int i;
tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf);
if ( tok->nvalues<=0 ) tok->nvalues = 0;
else
{
hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
- for (i=0; i<tok->nvalues; i++)
- if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[i]);
- else tok->values[i] = flt->tmpf[i];
+ int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? tok->nvalues - 1 : tok->nidxs - 1;
+ if ( end >= tok->nvalues ) end = tok->nvalues - 1;
+ for (i=0; i<=end; i++)
+ if ( i>=tok->nidxs || tok->idxs[i] )
+ {
+ if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[j]);
+ else tok->values[j] = flt->tmpf[i];
+ j++;
+ }
+ tok->nvalues = j;
}
}
else
@@ -461,33 +478,64 @@ static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok)
static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
- int m = (int)tok->values[0];
- int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value,&m);
- if ( n<0 ) { tok->nvalues = 0; return; }
- tok->values[0] = m; // allocated length
+ int32_t m = tok->str_value.m;
+ int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value.s,&m);
+ tok->str_value.m = m;
+ if ( n<0 ) { tok->nvalues = tok->str_value.l = 0; return; }
if ( tok->idx>=0 )
{
// get ith field (i=tok->idx)
int i = 0;
- char *ss = tok->str_value, *se = tok->str_value + n;
+ char *ss = tok->str_value.s, *se = tok->str_value.s + n;
while ( ss<se && i<tok->idx )
{
if ( *ss==',' ) i++;
ss++;
}
- if ( ss==se || i!=tok->idx ) { tok->nvalues = 0; return; }
+ if ( ss==se || i!=tok->idx ) { tok->nvalues = tok->str_value.l = 0; return; }
se = ss;
- while ( se-tok->str_value<n && *se!=',' ) se++;
- if ( ss==tok->str_value ) *se = 0;
+ while ( se - tok->str_value.s < n && *se!=',' ) se++;
+ if ( ss==tok->str_value.s ) *se = 0;
else
{
- memmove(tok->str_value,ss,se-ss);
- tok->str_value[se-ss] = 0;
+ memmove(tok->str_value.s, ss, se-ss);
+ tok->str_value.s[se-ss] = 0;
}
- tok->nvalues = se-ss;
+ tok->str_value.l = se - ss;
}
- else if ( tok->idx==-2 ) tok->nvalues = n;
+ else if ( tok->idx==-2 && tok->idxs[0]==-1 ) // keep all values, TAG[*]
+ tok->str_value.l = n;
+ else if ( tok->idx==-2 )
+ {
+ flt->tmps.l = 0;
+ ks_resize(&flt->tmps, n);
+ int i, end = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1;
+ if ( end >= n ) end = n - 1;
+ char *beg = tok->str_value.s, *dst = flt->tmps.s;
+ for (i=0; i<=end; i++)
+ {
+ char *end = beg;
+ while ( *end && *end!=',' ) end++;
+
+ if ( i>=tok->nidxs || tok->idxs[i] )
+ {
+ memcpy(dst, beg, end - beg);
+ dst += end - beg;
+ dst[0] = ',';
+ dst++;
+ }
+
+ beg = end+1;
+ }
+ dst[0] = 0;
+ tok->str_value.l = dst - flt->tmps.s;
+
+ #define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
+ SWAP(char *, flt->tmps.s, tok->str_value.s);
+ SWAP(size_t, flt->tmps.m, tok->str_value.m);
+ }
+ tok->nvalues = tok->str_value.l;
}
static void filters_set_info_flag(filter_t *flt, bcf1_t *line, token_t *tok)
@@ -503,127 +551,266 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
{
int i;
if ( (tok->nvalues=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 )
- tok->nvalues = 0;
- else
{
+ tok->nvalues = tok->nsamples = 0;
+ return;
+ }
+ if ( tok->idx >= -1 ) // scalar or vector index
+ {
+ hts_expand(double,flt->nsamples,tok->mvalues,tok->values);
+ int nvals = tok->nvalues / line->n_sample;
+ int idx = tok->idx >= 0 ? tok->idx : 0;
int is_missing = 1;
- hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
- for (i=0; i<tok->nvalues; i++)
+ int32_t *ptr = flt->tmpi;
+ for (i=0; i<line->n_sample; i++)
{
- if ( flt->tmpi[i]==bcf_int32_missing || flt->tmpi[i]==bcf_int32_vector_end )
+ if ( ptr[idx]==bcf_int32_missing || ptr[idx]==bcf_int32_vector_end )
bcf_double_set_missing(tok->values[i]);
else
{
- tok->values[i] = flt->tmpi[i];
+ tok->values[i] = ptr[idx];
is_missing = 0;
}
+ ptr += nvals;
}
if ( is_missing ) tok->nvalues = 0;
- else if ( tok->idx >= 0 )
+ else tok->nvalues = line->n_sample;
+ tok->nsamples = tok->nvalues;
+ return;
+ }
+ if ( tok->idx == -2 )
+ {
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ int nvals = tok->nvalues / line->n_sample;
+ int idx = tok->idx >= 0 ? tok->idx : 0;
+ int is_missing = 1;
+ int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1;
+ if ( end >= nvals ) end = nvals - 1;
+ int32_t *ptr = flt->tmpi;
+ for (i=0; i<line->n_sample; i++)
{
- int nsmpl = bcf_hdr_nsamples(flt->hdr);
- int nvals = tok->nvalues / nsmpl;
- if ( tok->idx >= nvals )
- tok->nvalues = 0; // the index is too big
- else
- {
- for (i=0; i<nsmpl; i++)
- tok->values[i] = tok->values[i*nvals+tok->idx];
- tok->nvalues = nsmpl;
- }
+ for (k=0; k<=end; k++)
+ if ( k>=tok->nidxs || tok->idxs[k] )
+ {
+ if ( ptr[k]==bcf_int32_missing || ptr[k]==bcf_int32_vector_end )
+ bcf_double_set_missing(tok->values[j]);
+ else
+ {
+ tok->values[j] = ptr[k];
+ is_missing = 0;
+ }
+ j++;
+ }
+ ptr += nvals;
+ }
+ if ( is_missing ) tok->nvalues = tok->nsamples = 0;
+ else
+ {
+ tok->nsamples = line->n_sample;
+ tok->nvalues = j;
}
+ return;
}
- tok->nsamples = tok->nvalues;
}
static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
{
int i;
- if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<=0 )
+ if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 )
{
- tok->nvalues = tok->nsamples = 0; // missing values
+ tok->nvalues = tok->nsamples = 0;
+ return;
}
- else
+ if ( tok->idx >= -1 ) // scalar or vector index
{
+ hts_expand(double,flt->nsamples,tok->mvalues,tok->values);
+ int nvals = tok->nvalues / line->n_sample;
+ int idx = tok->idx >= 0 ? tok->idx : 0;
int is_missing = 1;
- hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
- for (i=0; i<tok->nvalues; i++)
+ float *ptr = flt->tmpf;
+ for (i=0; i<line->n_sample; i++)
{
- if ( bcf_float_is_missing(flt->tmpf[i]) || bcf_float_is_vector_end(flt->tmpf[i]) )
+ if ( bcf_float_is_missing(ptr[idx]) || bcf_float_is_vector_end(ptr[idx]) )
bcf_double_set_missing(tok->values[i]);
else
{
- tok->values[i] = flt->tmpf[i];
+ tok->values[i] = ptr[idx];
is_missing = 0;
}
+ ptr += nvals;
}
if ( is_missing ) tok->nvalues = 0;
- else if ( tok->idx >= 0 )
+ else tok->nvalues = line->n_sample;
+ tok->nsamples = tok->nvalues;
+ return;
+ }
+ if ( tok->idx == -2 )
+ {
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ int nvals = tok->nvalues / line->n_sample;
+ int idx = tok->idx >= 0 ? tok->idx : 0;
+ int is_missing = 1;
+ int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1;
+ if ( end >= nvals ) end = nvals - 1;
+ float *ptr = flt->tmpf;
+ for (i=0; i<line->n_sample; i++)
{
- int nsmpl = bcf_hdr_nsamples(flt->hdr);
- int nvals = tok->nvalues / nsmpl;
- if ( tok->idx >= nvals )
- tok->nvalues = 0; // the index is too big
- else
- {
- for (i=0; i<nsmpl; i++)
- tok->values[i] = tok->values[i*nvals+tok->idx];
- tok->nvalues = nsmpl;
- }
+ for (k=0; k<=end; k++)
+ if ( k>=tok->nidxs || tok->idxs[k] )
+ {
+ if ( bcf_float_is_missing(ptr[k]) || bcf_float_is_vector_end(ptr[k]) )
+ bcf_double_set_missing(tok->values[j]);
+ else
+ {
+ tok->values[j] = ptr[k];
+ is_missing = 0;
+ }
+ j++;
+ }
+ ptr += nvals;
+ }
+ if ( is_missing ) tok->nvalues = tok->nsamples = 0;
+ else
+ {
+ tok->nsamples = line->n_sample;
+ tok->nvalues = j;
}
+ return;
}
- tok->nsamples = tok->nvalues;
}
static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
- int ndim = tok->nsamples * (int)tok->values[0];
- int ret = bcf_get_format_char(flt->hdr,line,tok->tag,&tok->str_value,&ndim);
+ tok->str_value.l = tok->nvalues = 0;
+ if ( !line->n_sample ) return;
- int nsmpl = bcf_hdr_nsamples(flt->hdr);
- ndim /= nsmpl;
- tok->values[0] = ndim;
+ int ndim = tok->str_value.m;
+ int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim);
+ tok->str_value.m = ndim;
- if ( ret<=0 )
- {
- tok->nvalues = 0;
- return;
- }
+ if ( nstr<=0 ) return;
- if ( tok->idx < 0 ) // scalar
+ if ( tok->idx == -1 || (tok->idx==-2 && tok->idxs[0]==-1) ) // scalar or keep all values of a vector: TAG[*]
{
- tok->nvalues = tok->nsamples = nsmpl;
+ tok->nsamples = line->n_sample;
+ tok->nstr1 = ndim / line->n_sample;
+ tok->nvalues = tok->str_value.l = nstr;
return;
}
- // vector
+ int nstr1 = nstr / line->n_sample;
+
+ // vector, one or multiple indices
int i;
- for (i=0; i<nsmpl; i++)
+ for (i=0; i<line->n_sample; i++)
{
- char *ss = tok->str_value + i*ndim;
- int is = 0, ivec = 0;
- while ( ivec<tok->idx && is<ndim && ss[is] )
- {
- if ( ss[is]==',' ) ivec++;
- is++;
- }
- if ( ivec!=tok->idx || is==ndim || !ss[is] )
+ char *dst = tok->str_value.s + i*nstr1, *str = dst;
+ int nval = 0, ibeg = 0;
+ while ( ibeg < nstr1 )
{
- ss[0] = '.';
- ss[1] = 0;
- continue;
+ int iend = ibeg + 1;
+ while ( iend < nstr1 && str[iend] && str[iend]!=',' ) iend++;
+
+ int keep = 0;
+ if ( tok->idx >=0 )
+ keep = tok->idx==nval ? 1 : 0;
+ else if ( nval < tok->nidxs )
+ keep = tok->idxs[nval] ? 1 : 0;
+ else if ( tok->idxs[tok->nidxs-1] < 0 )
+ keep = 1;
+
+ if ( keep )
+ {
+ if ( ibeg>0 ) memmove(dst, str+ibeg, iend-ibeg+1);
+ dst += iend - ibeg + 1;
+ if ( tok->idx>=0 ) break;
+ }
+ if ( !str[iend] ) break;
+ ibeg = iend + 1;
+ nval++;
}
- int ie = is;
- while ( ie<ndim && ss[ie] && ss[ie]!=',' ) ie++;
- if ( is ) memmove(ss,&ss[is],ie-is);
- if ( ndim-(ie-is) ) memset(ss+ie-is,0,ndim-(ie-is));
+ if ( dst==str ) { dst[0] = '.'; dst+=2; }
+ if ( dst - str < nstr1 ) memset(dst-1, 0, nstr1 - (dst - str));
}
- if ( !ndim )
+ tok->nvalues = tok->str_value.l = nstr;
+ tok->nstr1 = nstr1;
+ tok->nsamples = line->n_sample;
+}
+static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type)
+{
+ bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
+ if ( !fmt )
{
- tok->nvalues = 0;
+ tok->nvalues = tok->str_value.l = 0;
return;
}
- tok->nvalues = ret;
+
+ int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals = type==2 ? 3 : 4;
+ if ( tok->str_value.m <= nvals*nsmpl )
+ {
+ tok->str_value.m = nvals*nsmpl + 1;
+ tok->str_value.s = (char*)realloc(tok->str_value.s, tok->str_value.m);
+ }
+
+#define BRANCH_INT(type_t,vector_end) \
+ { \
+ for (i=0; i<line->n_sample; i++) \
+ { \
+ type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \
+ int is_het = 0, has_ref = 0, missing = 0; \
+ for (j=0; j<fmt->n; j++) \
+ { \
+ if ( ptr[j]==vector_end ) break; /* smaller ploidy */ \
+ if ( bcf_gt_is_missing(ptr[j]) ) { missing=1; break; } /* missing allele */ \
+ int ial = ptr[j]; \
+ if ( bcf_gt_allele(ial)==0 ) has_ref = 1; \
+ if ( j>0 ) \
+ { \
+ int jal = ptr[j-1]; \
+ if ( bcf_gt_allele(ial)!=bcf_gt_allele(jal) ) is_het = 1; \
+ } \
+ } \
+ char *dst = &tok->str_value.s[nvals*i]; \
+ if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \
+ else if ( type==3 ) \
+ { \
+ if ( j==1 ) dst[0]='h', dst[1]='a', dst[2]='p', dst[3] = 0; /* hap, haploid */ \
+ else if ( !is_het ) dst[0]='h', dst[1]='o', dst[2]='m', dst[3] = 0; /* hom */ \
+ else dst[0]='h', dst[1]='e', dst[2]='t', dst[3] = 0; /* het */ \
+ } \
+ else \
+ { \
+ if ( j==1 ) \
+ { \
+ if ( has_ref ) dst[0]='r', dst[1]=0; /* r, haploid */ \
+ else dst[0]='a', dst[1]=0; /* a, haploid */ \
+ } \
+ else if ( !is_het ) \
+ { \
+ if ( has_ref ) dst[0]='r', dst[1]='r', dst[2] = 0; /* rr */ \
+ else dst[0]='a', dst[1]='a', dst[2] = 0; /* aa */ \
+ } \
+ else \
+ { \
+ if ( has_ref ) dst[0]='r', dst[1]='a', dst[2] = 0; /* ra */ \
+ else dst[0]='a', dst[1]='A', dst[2] = 0; /* aA */ \
+ } \
+ } \
+ } \
+ }
+ switch (fmt->type) {
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
+ default: error("The GT type is not lineognised: %d at %s:%d\n",fmt->type, bcf_seqname(flt->hdr,line),line->pos+1); break;
+ }
+#undef BRANCH_INT
tok->nsamples = nsmpl;
+ tok->nvalues = tok->str_value.l = nvals*nsmpl;
+ tok->str_value.s[tok->str_value.l] = 0;
+ tok->nstr1 = nvals;
}
+static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); }
+static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); }
+
static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
@@ -636,67 +823,73 @@ static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *to
kstring_t str;
gt_length_too_big:
- str.s = tok->str_value; str.m = tok->values[0] * nsmpl; str.l = 0;
+ tok->str_value.l = 0;
for (i=0; i<nsmpl; i++)
{
- int plen = str.l;
+ int plen = tok->str_value.l;
- bcf_format_gt(fmt, i, &str);
- kputc_(0,&str);
- if ( str.l - plen > blen )
+ bcf_format_gt(fmt, i, &tok->str_value);
+ kputc_(0, &tok->str_value);
+ if ( tok->str_value.l - plen > blen )
{
// too many alternate alleles or ploidy is too large, the genotype does not fit
// three characters ("0/0" vs "10/10").
- tok->str_value = str.s;
blen *= 2;
goto gt_length_too_big;
}
- plen = str.l - plen;
- while ( plen<blen )
+ plen = tok->str_value.l - plen;
+ while ( plen < blen )
{
- kputc_(0, &str);
+ kputc_(0, &tok->str_value);
plen++;
}
}
- tok->nvalues = str.l;
tok->nsamples = nsmpl;
- tok->values[0] = blen;
- tok->str_value = str.s;
+ tok->nvalues = tok->str_value.l;
+ tok->nstr1 = blen;
}
static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
- kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0;
- kputs(line->d.allele[0], &str);
- tok->nvalues = str.l;
- tok->values[0] = str.m;
- tok->str_value = str.s;
+ tok->str_value.l = 0;
+ kputs(line->d.allele[0], &tok->str_value);
+ tok->nvalues = tok->str_value.l;
}
static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
- kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0;
+ tok->str_value.l = 0;
if ( tok->idx>=0 )
{
- if ( line->n_allele >= tok->idx )
- kputs(line->d.allele[tok->idx], &str);
+ if ( line->n_allele > tok->idx + 1 )
+ kputs(line->d.allele[tok->idx + 1], &tok->str_value);
else
- kputc('.', &str);
+ kputc('.', &tok->str_value);
+ tok->idx = 0;
+ }
+ else if ( tok->idx==-2 )
+ {
+ int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? line->n_allele - 1 : tok->nidxs - 1;
+ if ( end >= line->n_allele - 1 ) end = line->n_allele - 2;
+ for (i=0; i<=end; i++)
+ if ( i>=tok->nidxs || tok->idxs[i] )
+ {
+ if ( tok->str_value.l ) kputc(',', &tok->str_value);
+ kputs(line->d.allele[i+1], &tok->str_value);
+ }
}
else if ( line->n_allele>1 )
{
- kputs(line->d.allele[1], &str);
+ kputs(line->d.allele[1], &tok->str_value);
int i;
for (i=2; i<line->n_allele; i++)
{
- kputc(',', &str);
- kputs(line->d.allele[i], &str);
+ kputc(',', &tok->str_value);
+ kputs(line->d.allele[i], &tok->str_value);
}
}
else if ( line->n_allele==1 )
- kputc('.', &str);
- tok->nvalues = str.l;
- tok->values[0] = str.m;
- tok->str_value = str.s;
+ kputc('.', &tok->str_value);
+ tok->nvalues = tok->str_value.l;
}
static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok)
{
@@ -857,11 +1050,11 @@ static void set_abs(filter_t *flt, bcf1_t *line, token_t *tok)
static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
{
tok->is_str = 0;
- if ( !tok->nvalues ) return;
+ if ( !tok->str_value.l ) return;
if ( tok->idx==-2 )
{
int i = 0;
- char *ss = tok->str_value;
+ char *ss = tok->str_value.s;
while ( *ss )
{
char *se = ss;
@@ -881,9 +1074,10 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
}
else
{
- tok->values[0] = strlen(tok->str_value);
+ tok->values[0] = strlen(tok->str_value.s);
tok->nvalues = 1;
}
+ tok->str_value.l = 0;
}
#define VECTOR_ARITHMETICS(atok,btok,AOP) \
{ \
@@ -1077,7 +1271,16 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
if ( !(atok)->nvalues || !(btok)->nvalues ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
else \
{ \
- if ( (atok)->nsamples && (btok)->nsamples ) \
+ if ( (atok)->idx<=-2 || (btok)->idx<=-2 ) \
+ { \
+ /* any field can match: [*] */ \
+ for (i=0; i<(atok)->nvalues; i++) \
+ { \
+ for (j=0; j<(btok)->nvalues; j++) \
+ if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \
+ } \
+ } \
+ else if ( (atok)->nsamples && (btok)->nsamples ) \
{ \
for (i=0; i<(atok)->nsamples; i++) \
{ \
@@ -1111,15 +1314,6 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
(atok)->nsamples = (btok)->nsamples; \
if ( !has_values ) (atok)->nvalues = 0; \
} \
- else if ( (atok)->idx==-2 || (btok)->idx==-2 ) \
- { \
- /* any field can match: [*] */ \
- for (i=0; i<(atok)->nvalues; i++) \
- { \
- for (j=0; j<(btok)->nvalues; j++) \
- if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \
- } \
- } \
else \
{ \
if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \
@@ -1130,18 +1324,18 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
}
static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // logic: TOK_EQ or TOK_NE
{
- if ( !atok->nvalues ) { return 0; }
- if ( !btok->nvalues ) { atok->nvalues = 0; return 0; }
+ if ( !atok->str_value.l ) { return 0; }
+ if ( !btok->str_value.l ) { atok->str_value.l = 0; return 0; }
int i, pass_site = 0;
if ( atok->nsamples && atok->nsamples==btok->nsamples )
{
for (i=0; i<atok->nsamples; i++)
{
- char *astr = atok->str_value + i*(int)atok->values[0];
- char *bstr = btok->str_value + i*(int)btok->values[0];
- char *aend = astr + (int)atok->values[0], *a = astr;
+ char *astr = atok->str_value.s + i*atok->nstr1;
+ char *bstr = btok->str_value.s + i*btok->nstr1;
+ char *aend = astr + atok->str_value.l, *a = astr;
while ( a<aend && *a ) a++;
- char *bend = bstr + (int)btok->values[0], *b = bstr;
+ char *bend = bstr + btok->str_value.l, *b = bstr;
while ( b<bend && *b ) b++;
if ( a-astr != b-bstr ) atok->pass_samples[i] = 0;
else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0;
@@ -1161,8 +1355,8 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log
token_t *xtok, *ytok; // xtok is scalar, ytok array
if ( btok->idx==-2 ) { xtok = atok; ytok = btok; }
else { xtok = btok; ytok = atok; }
- char *xstr = xtok->str_value, *xend = xstr + xtok->nvalues;
- char *ystr = ytok->str_value, *yend = ystr + ytok->nvalues, *y = ystr;
+ char *xstr = xtok->str_value.s, *xend = xstr + xtok->str_value.l;
+ char *ystr = ytok->str_value.s, *yend = ystr + ytok->str_value.l, *y = ystr;
while ( y<=yend )
{
if ( y==yend || *y==',' )
@@ -1178,7 +1372,7 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log
}
}
else
- pass_site = strcmp(atok->str_value,btok->str_value) ? 0 : 1;
+ pass_site = strcmp(atok->str_value.s,btok->str_value.s) ? 0 : 1;
if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1;
}
else
@@ -1186,19 +1380,26 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log
token_t *xtok, *ytok;
if ( !atok->nsamples ) { xtok = atok; ytok = btok; }
else { xtok = btok; ytok = atok; }
- char *xstr = xtok->str_value;
- char *xend = xstr + (int)xtok->values[0], *x = xstr;
+ char *xstr = xtok->str_value.s;
+ char *xend = xstr + xtok->str_value.l, *x = xstr;
while ( x<xend && *x ) x++;
for (i=0; i<ytok->nsamples; i++)
{
- char *ystr = ytok->str_value + i*(int)ytok->values[0];
- char *yend = ystr + (int)ytok->values[0], *y = ystr;
- while ( y<yend && *y ) y++;
- if ( x-xstr != y-ystr ) atok->pass_samples[i] = 0;
- else atok->pass_samples[i] = strncmp(xstr,ystr,x-xstr)==0 ? 1 : 0;
- if ( logic!=TOK_EQ )
- atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
- pass_site |= atok->pass_samples[i];
+ char *ystr = ytok->str_value.s + i*ytok->nstr1;
+ char *ybeg = ystr, *yend = ystr + ytok->nstr1;
+ int pass = 0;
+ while ( ybeg < yend )
+ {
+ char *y = ybeg;
+ while ( y<yend && *y && *y!=',' ) y++;
+ if ( y - ybeg != x - xstr ) pass = 0;
+ else pass = strncmp(xstr,ybeg,x-xstr)==0 ? 1 : 0;
+ if ( logic!=TOK_EQ ) pass = pass ? 0 : 1;
+ if ( pass || !*y ) break;
+ ybeg = y+1;
+ }
+ atok->pass_samples[i] = pass;
+ pass_site |= pass;
}
if ( !atok->nsamples )
atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? not sure if atok->nvalues should be set
@@ -1212,18 +1413,70 @@ static int regex_vector_strings(token_t *atok, token_t *btok, int negate)
{
for (i=0; i<atok->nsamples; i++)
{
- char *ptr = atok->str_value + i*(int)atok->values[0];
+ char *ptr = atok->str_value.s + i*atok->nstr1;
atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1;
if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
pass_site |= atok->pass_samples[i];
}
return pass_site;
}
- pass_site = regexec(btok->regex, atok->str_value, 0,NULL,0) ? 0 : 1;
+ pass_site = regexec(btok->regex, atok->str_value.s, 0,NULL,0) ? 0 : 1;
if ( negate ) pass_site = pass_site ? 0 : 1;
return pass_site;
}
+static void parse_tag_idx(char *tag, char *tag_idx, token_t *tok) // tag_idx points just after "TAG["
+{
+ // TAG[*] .. any field
+ if ( !strncmp("*]", tag_idx, 3) )
+ {
+ tok->idxs = (int*) malloc(sizeof(int));
+ tok->idxs[0] = -1;
+ tok->nidxs = 1;
+ tok->idx = -2;
+ return;
+ }
+
+ // TAG[integer] .. one field
+ char *end, *beg = tag_idx;
+ tok->idx = strtol(tag_idx, &end, 10);
+ if ( tok->idx >= 0 && *end==']' ) return;
+
+
+ // TAG[0,1] or TAG[0-2] or [1-] etc
+ int i, ibeg = -1;
+ while ( *beg && *beg!=']' )
+ {
+ int idx = strtol(beg, &end, 10);
+ if ( end[0]==',' ) beg = end + 1;
+ else if ( end[0]==']' ) beg = end;
+ else if ( end[0]=='-' ) { beg = end + 1; ibeg = idx; continue; }
+ else error("Could not parse the index: %s[%s\n", tag, tag_idx+1);
+ if ( idx >= tok->nidxs )
+ {
+ tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(idx+1));
+ memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(idx - tok->nidxs + 1));
+ tok->nidxs = idx + 1;
+ }
+ if ( ibeg>=0 )
+ {
+ for (i=ibeg; i<=idx; i++) tok->idxs[i] = 1;
+ ibeg = -1;
+ }
+ tok->idxs[idx] = 1;
+ }
+ if ( ibeg >=0 )
+ {
+ if ( ibeg >= tok->nidxs )
+ {
+ tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(ibeg+1));
+ memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(ibeg - tok->nidxs + 1));
+ tok->nidxs = ibeg + 1;
+ }
+ tok->idxs[ibeg] = -1;
+ }
+ tok->idx = -2;
+}
static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
{
tok->tok_type = TOK_VAL;
@@ -1361,17 +1614,8 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
int i;
for (i=0; i<tmp.l; i++)
if ( tmp.s[i]=='[' ) { tmp.s[i] = 0; is_array = i+1; break; }
- if ( is_array )
- {
- if ( tmp.s[is_array]=='*' )
- tok->idx = -2; // tag[*] .. any field
- else
- {
- char *end;
- tok->idx = strtol(tmp.s+is_array, &end, 10);
- if ( *end!=']' ) error("Could not parse the index: %s[%s\n", tmp.s,tmp.s+is_array);
- }
- }
+ if ( is_array )
+ parse_tag_idx(tmp.s, tmp.s+is_array, tok);
}
tok->hdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s);
if ( is_fmt==-1 )
@@ -1425,7 +1669,13 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
case BCF_HT_STR: tok->setter = &filters_set_info_string; tok->is_str = 1; break;
default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__);
}
- if(!is_array) tok->idx = -2;
+ if (!is_array)
+ {
+ tok->idx = -2;
+ tok->idxs = (int*) malloc(sizeof(int));
+ tok->idxs[0] = -1;
+ tok->nidxs = 1;
+ }
}
}
filter->max_unpack |= BCF_UN_INFO;
@@ -1518,6 +1768,11 @@ static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks)
}
}
+static void str_to_lower(char *str)
+{
+ while ( *str ) { *str = tolower(*str); str++; }
+}
+
// Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
@@ -1538,8 +1793,8 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
ret = filters_next_token(&tmp, &len);
if ( ret==-1 ) error("Missing quotes in: %s\n", str);
- //fprintf(stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len);
- //int i; for (i=0; i<nops; i++) fprintf(stderr," .%c.", TOKEN_STRING[ops[i]]); fprintf(stderr,"\n");
+ // fprintf(stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len);
+ // int i; for (i=0; i<nops; i++) fprintf(stderr," .%c.", TOKEN_STRING[ops[i]]); fprintf(stderr,"\n");
if ( ret==TOK_LFT ) // left bracket
{
@@ -1670,6 +1925,28 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
i = itok;
continue;
}
+ if ( !strcmp(out[i].tag,"GT") )
+ {
+ if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
+ int ival;
+ if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1;
+ else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1;
+ else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = i + 1;
+ else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) ival = i + 1;
+ else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+
+ // assign correct setters and unify expressions, eg ar->ra, HOM->hom, etc
+ if ( !strcasecmp(out[ival].key,"hom") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"het") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"hap") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"rr") ) { out[i].setter = filters_set_genotype2; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"ra") || !strcasecmp(out[ival].key,"ar") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]='a'; } // ra
+ else if ( !strcmp(out[ival].key,"aA") || !strcmp(out[ival].key,"Aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='A'; } // aA
+ else if ( !strcasecmp(out[ival].key,"aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='a'; } // aa
+ else if ( !strcasecmp(out[ival].key,"a") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]=0; } // a
+ else if ( !strcasecmp(out[ival].key,"r") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]=0; } // r
+ continue;
+ }
if ( !strcmp(out[i].tag,"FILTER") )
{
if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
@@ -1728,9 +2005,10 @@ void filter_destroy(filter_t *filter)
int i;
for (i=0; i<filter->nfilters; i++)
{
- //if ( filter->filters[i].key ) free(filter->filters[i].key);
- free(filter->filters[i].str_value);
+ if ( filter->filters[i].key ) free(filter->filters[i].key);
+ free(filter->filters[i].str_value.s);
free(filter->filters[i].tag);
+ free(filter->filters[i].idxs);
free(filter->filters[i].values);
free(filter->filters[i].pass_samples);
if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash);
@@ -1745,6 +2023,7 @@ void filter_destroy(filter_t *filter)
free(filter->str);
free(filter->tmpi);
free(filter->tmpf);
+ free(filter->tmps.s);
free(filter);
}
@@ -1765,16 +2044,15 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
filter->filters[i].setter(filter, line, &filter->filters[i]);
else if ( filter->filters[i].key ) // string constant
{
- filter->filters[i].str_value = filter->filters[i].key;
- filter->filters[i].values[0] = filter->filters[i].values[0];
- filter->filters[i].nvalues = strlen(filter->filters[i].key);
+ filter->filters[i].str_value.l = 0;
+ kputs(filter->filters[i].key, &filter->filters[i].str_value);
+ filter->filters[i].nvalues = filter->filters[i].str_value.l;
}
else // numeric constant
{
filter->filters[i].values[0] = filter->filters[i].threshold;
filter->filters[i].nvalues = 1;
}
-
filter->flt_stack[nstack++] = &filter->filters[i];
continue;
}
diff --git a/kheap.h b/kheap.h
index ac2f9f9..cb5dda4 100644
--- a/kheap.h
+++ b/kheap.h
@@ -57,6 +57,8 @@
// "data_t".
heap_t *heap = khp_init(mh);
+ // When inserting a new element, the heap stores a copy of the memory
+ // area pointed to by the third argument.
for (int i=0; i<3; i++)
khp_insert(mh, heap, &data[i]);
@@ -130,7 +132,8 @@
{ \
heap->mdat = heap->ndat; \
kroundup32(heap->mdat); \
- heap->dat = (kheap_t*)realloc(heap->dat, heap->mdat*sizeof(kheap_t)); \
+ heap->dat = (kheap_t*)realloc(heap->dat, heap->mdat*sizeof(kheap_t)); \
+ memset(heap->dat + heap->ndat, 0, (heap->mdat - heap->ndat)*sizeof(kheap_t)); \
} \
int i = heap->ndat - 1; \
while ( i && __cmp(dat,&heap->dat[khp_parent(i)]) ) \
diff --git a/main.c b/main.c
index 4e3e0e5..03fa6a7 100644
--- a/main.c
+++ b/main.c
@@ -57,6 +57,7 @@ int main_plugin(int argc, char *argv[]);
int main_consensus(int argc, char *argv[]);
int main_csq(int argc, char *argv[]);
int bam_mpileup(int argc, char *argv[]);
+int main_sort(int argc, char *argv[]);
typedef struct
{
@@ -126,6 +127,10 @@ static cmd_t cmds[] =
.alias = "reheader",
.help = "modify VCF/BCF header, change sample names"
},
+ { .func = main_sort,
+ .alias = "sort",
+ .help = "sort VCF/BCF file"
+ },
{ .func = main_vcfview,
.alias = "view",
.help = "VCF/BCF conversion, view, subset and filter VCF/BCF files"
diff --git a/misc/plot-roh.py b/misc/plot-roh.py
index 49cef76..3221109 100755
--- a/misc/plot-roh.py
+++ b/misc/plot-roh.py
@@ -263,22 +263,24 @@ for fname in fnames:
pos = int(row[2])
reg = region_overlap(regs,chr,pos,pos)
if reg==None: continue
- smpl = row[3]
- if samples!=None and smpl not in samples: continue
- gt = row[4]
- x = gt.split('/')
- dsg = 2
- if x[0]!=x[1]: dsg = 1
- elif x[0]=='0': continue # skip HomRef 0/0 genotypes
- if chr not in dat_gt:
- dat_gt[chr] = {}
- chrs.append(chr)
- if smpl not in dat_gt[chr]:
- dat_gt[chr][smpl] = []
- if smpl not in smpl2y:
- y = len(smpl2y)
- smpl2y[smpl] = y
- dat_gt[chr][smpl].append([pos,dsg])
+ for i in range(3,len(row),2):
+ smpl = row[i]
+ if samples!=None and smpl not in samples: continue
+ gt = row[i+1]
+ x = gt.split('/')
+ if x[0]=='.': continue # missing genotype ./.
+ dsg = 2
+ if x[0]!=x[1]: dsg = 1
+ elif x[0]=='0': continue # skip HomRef 0/0 genotypes
+ if chr not in dat_gt:
+ dat_gt[chr] = {}
+ chrs.append(chr)
+ if smpl not in dat_gt[chr]:
+ dat_gt[chr][smpl] = []
+ if smpl not in smpl2y:
+ y = len(smpl2y)
+ smpl2y[smpl] = y
+ dat_gt[chr][smpl].append([pos,dsg])
elif row[0]=='RG':
smpl = row[1]
if samples!=None and smpl not in samples: continue
@@ -326,8 +328,8 @@ for chr in chrs:
off += max_pos + off_sep
off_list.append(off)
-height = len(fnames)
-if len(fnames)>5: heigth = 5
+height = len(smpl2y)
+if len(smpl2y)>5: heigth = 5
wh = 20,height
def bignum(num):
diff --git a/misc/plot-vcfstats b/misc/plot-vcfstats
index 7172f05..d253fd7 100755
--- a/misc/plot-vcfstats
+++ b/misc/plot-vcfstats
@@ -70,11 +70,9 @@ exit;
#--------------------------------
-sub error
+sub usage
{
- my (@msg) = @_;
- if ( scalar @msg ) { confess @msg; }
- die
+ print STDERR
"About: Plots output of \"bcftools stats\"\n",
"Usage: plot-vcfstats [OPTIONS] file.chk ...\n",
" plot-vcfstats -p outdir/ file.chk ...\n",
@@ -91,6 +89,15 @@ sub error
}
+sub error
+{
+ my (@msg) = @_;
+ if ( scalar @msg ) { confess @msg; }
+ usage();
+ exit 1;
+}
+
+
sub parse_params
{
$0 =~ s{^.+/}{};
@@ -244,7 +251,7 @@ sub parse_params
if ( $arg eq '-t' || $arg eq '--title' ) { push @{$$opts{titles}},shift(@ARGV); next; }
if ( $arg eq '-T' || $arg eq '--main-title' ) { $$opts{main_title} = shift(@ARGV); next; }
if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; }
- if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
+ if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { usage(); exit 0; }
if ( -e $arg ) { push @{$$opts{vcfstats}},$arg; next; }
error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n");
}
@@ -1590,8 +1597,8 @@ sub plot_substitutions
\\tfig = plt.figure(figsize=($$opts{img_width},$$opts{img_height}))
\\tcm = mpl.cm.get_cmap('autumn')
\\tn = 12
- \\tcol = range(n)
- \\tfor i in range(n): col[i] = cm(1.*i/n)
+ \\tcol = []
+ \\tfor i in list(range(n)): col.append(cm(1.*i/n))
\\tax1 = fig.add_subplot(111)
\\tax1.bar([row[0] for row in dat], [row[2] for row in dat], color=col)
\\tax1.set_ylabel('Count')
diff --git a/misc/run-roh.pl b/misc/run-roh.pl
index 4a1afa4..1940d2e 100755
--- a/misc/run-roh.pl
+++ b/misc/run-roh.pl
@@ -176,7 +176,7 @@ sub run_roh
push @files,$outfile;
if ( -e $outfile ) { next; }
cmd(
- "bcftools annotate --rename-chrs $chr_fname $$opts{indir}/$file -Ou | " .
+ "bcftools annotate --rename-chrs $chr_fname '$$opts{indir}/$file' -Ou | " .
"bcftools annotate -c CHROM,POS,REF,ALT,AF1KG -h $$opts{af_annots}.hdr -a $$opts{af_annots} -Ob -o $outfile.part && " .
"mv $outfile.part $outfile",%$opts);
}
diff --git a/mpileup.c b/mpileup.c
index ac37dd4..9b6c6eb 100644
--- a/mpileup.c
+++ b/mpileup.c
@@ -909,7 +909,7 @@ int bam_mpileup(int argc, char *argv[])
{"ignore-RG", no_argument, NULL, 5},
{"ignore-rg", no_argument, NULL, 5},
{"gvcf", required_argument, NULL, 'g'},
- {"non-reference", no_argument, NULL, 7},
+ {"no-reference", no_argument, NULL, 7},
{"no-version", no_argument, NULL, 8},
{"threads",required_argument,NULL,9},
{"illumina1.3+", no_argument, NULL, '6'},
@@ -1099,11 +1099,8 @@ int bam_mpileup(int argc, char *argv[])
free(mplp.files);
free(mplp.reg_fname); free(mplp.pl_list);
if (mplp.fai) fai_destroy(mplp.fai);
- if (mplp.bed)
- {
- regidx_destroy(mplp.bed);
- regitr_destroy(mplp.bed_itr);
- }
+ if (mplp.bed) regidx_destroy(mplp.bed);
+ if (mplp.bed_itr) regitr_destroy(mplp.bed_itr);
if (mplp.reg) regidx_destroy(mplp.reg);
bam_smpl_destroy(mplp.bsmpl);
return ret;
diff --git a/plugins/check-ploidy.c b/plugins/check-ploidy.c
new file mode 100644
index 0000000..51c4a3b
--- /dev/null
+++ b/plugins/check-ploidy.c
@@ -0,0 +1,165 @@
+/*
+ Copyright (C) 2017 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <getopt.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include <htslib/kseq.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include "bcftools.h"
+
+typedef struct
+{
+ char *sample;
+ int beg,end,ploidy;
+}
+dat_t;
+
+typedef struct
+{
+ int argc;
+ char **argv;
+ int rid, gt_id, ndat;
+ dat_t *dat;
+ bcf_hdr_t *hdr;
+}
+args_t;
+
+static args_t *args;
+
+const char *about(void)
+{
+ return "Check if ploidy of samples is consistent for all sites\n";
+}
+
+const char *usage(void)
+{
+ return
+ "\n"
+ "About: Check if ploidy of samples is consistent for all sites.\n"
+ "Usage: bcftools +check-ploidy [General Options] -- [Plugin Options]\n"
+ "Options:\n"
+ " run \"bcftools plugin\" for a list of common options\n"
+ "\n"
+ "Example:\n"
+ " bcftools +check-ploidy file.bcf\n"
+ "\n";
+}
+
+int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
+{
+ args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->hdr = in;
+ args->ndat = bcf_hdr_nsamples(args->hdr);
+ args->dat = (dat_t*) calloc(args->ndat,sizeof(dat_t));
+ int i;
+ for (i=0; i<args->ndat; i++) args->dat[i].sample = args->hdr->samples[i];
+ args->rid = -1;
+ args->gt_id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"GT");
+ if ( args->gt_id<0 ) error("Error: GT field is not present\n");
+ printf("# [1]Sample\t[2]Chromosome\t[3]Region Start\t[4]Region End\t[5]Ploidy\n");
+ return 1;
+}
+
+bcf1_t *process(bcf1_t *rec)
+{
+ int i;
+
+ bcf_unpack(rec, BCF_UN_FMT);
+ bcf_fmt_t *fmt_gt = NULL;
+ for (i=0; i<rec->n_fmt; i++)
+ if ( rec->d.fmt[i].id==args->gt_id ) { fmt_gt = &rec->d.fmt[i]; break; }
+ if ( !fmt_gt ) return NULL; // no GT tag
+
+ if ( args->ndat != rec->n_sample )
+ error("Incorrect number of samples at %s:%d .. found %d, expected %d\n",bcf_seqname(args->hdr,rec),rec->pos+1,rec->n_sample,args->ndat);
+
+ if ( args->rid!=rec->rid && args->rid!=-1 )
+ {
+ for (i=0; i<args->ndat; i++)
+ {
+ dat_t *dat = &args->dat[i];
+ if ( dat->ploidy!=0 ) printf("%s\t%s\t%d\t%d\t%d\n", dat->sample,bcf_seqname(args->hdr,rec),dat->beg+1,dat->end+1,dat->ploidy);
+ dat->ploidy = 0;
+ }
+ }
+ args->rid = rec->rid;
+
+ #define BRANCH_INT(type_t,vector_end) \
+ { \
+ for (i=0; i<rec->n_sample; i++) \
+ { \
+ type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
+ int nal, missing = 0; \
+ for (nal=0; nal<fmt_gt->n; nal++) \
+ { \
+ if ( p[nal]==vector_end ) break; /* smaller ploidy */ \
+ if ( bcf_gt_is_missing(p[nal]) ) { missing=1; break; } /* missing allele */ \
+ } \
+ if ( !nal || missing ) continue; /* missing genotype */ \
+ dat_t *dat = &args->dat[i]; \
+ if ( dat->ploidy==nal ) \
+ { \
+ dat->end = rec->pos; \
+ continue; \
+ } \
+ if ( dat->ploidy!=0 ) \
+ printf("%s\t%s\t%d\t%d\t%d\n", dat->sample,bcf_seqname(args->hdr,rec),dat->beg+1,dat->end+1,dat->ploidy); \
+ dat->ploidy = nal; \
+ dat->beg = rec->pos; \
+ dat->end = rec->pos; \
+ } \
+ }
+ switch (fmt_gt->type) {
+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
+ default: error("The GT type is not recognised: %d at %s:%d\n",fmt_gt->type, bcf_seqname(args->hdr,rec),rec->pos+1); break;
+ }
+ #undef BRANCH_INT
+
+ return NULL;
+}
+
+void destroy(void)
+{
+ int i;
+ for (i=0; i<args->ndat; i++)
+ {
+ dat_t *dat = &args->dat[i];
+ if ( dat->ploidy!=0 ) printf("%s\t%s\t%d\t%d\t%d\n", dat->sample,bcf_hdr_id2name(args->hdr,args->rid),dat->beg+1,dat->end+1,dat->ploidy);
+ dat->ploidy = 0;
+ }
+ free(args->dat);
+ free(args);
+}
+
diff --git a/plugins/mendelian.c b/plugins/mendelian.c
index e8ae8e5..da83cda 100644
--- a/plugins/mendelian.c
+++ b/plugins/mendelian.c
@@ -117,7 +117,7 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
case 'T': trio_file = optarg; break;
case 'h':
case '?':
- default: error("%s", usage()); break;
+ default: error(usage()); break;
}
}
if ( optind != argc ) error(usage());
@@ -176,30 +176,96 @@ bcf1_t *process(bcf1_t *rec)
int ngt = bcf_get_genotypes(args.hdr, rec, &args.gt_arr, &args.ngt_arr);
if ( ngt<0 ) return dflt;
- if ( ngt!=2*bcf_hdr_nsamples(args.hdr) ) return dflt;
+ if ( ngt!=2*bcf_hdr_nsamples(args.hdr) && ngt!=bcf_hdr_nsamples(args.hdr) ) return dflt;
+ ngt /= bcf_hdr_nsamples(args.hdr);
int i, has_bad = 0, needs_update = 0;
for (i=0; i<args.ntrios; i++)
{
- int mother,father,child;
int32_t a,b,c,d,e,f;
trio_t *trio = &args.trios[i];
- a = args.gt_arr[2*trio->imother];
- b = args.gt_arr[2*trio->imother+1];
- c = args.gt_arr[2*trio->ifather];
- d = args.gt_arr[2*trio->ifather+1];
- e = args.gt_arr[2*trio->ichild];
- f = args.gt_arr[2*trio->ichild+1];
- if ( bcf_gt_is_missing(a) || bcf_gt_is_missing(b) ) continue;
+ a = args.gt_arr[ngt*trio->imother];
+ b = ngt==2 ? args.gt_arr[ngt*trio->imother+1] : bcf_int32_vector_end;
+ c = args.gt_arr[ngt*trio->ifather];
+ d = ngt==2 ? args.gt_arr[ngt*trio->ifather+1] : bcf_int32_vector_end;
+ e = args.gt_arr[ngt*trio->ichild];
+ f = ngt==2 ? args.gt_arr[ngt*trio->ichild+1] : bcf_int32_vector_end;
+
+ // missing allele in the child: missing data or daugther's chrY
+ if ( bcf_gt_is_missing(e) || bcf_gt_is_missing(f) ) continue;
+
+ // missing data in father
if ( bcf_gt_is_missing(c) || bcf_gt_is_missing(d) ) continue;
- if ( bcf_gt_is_missing(e) || bcf_gt_is_missing(f) ) continue;
- mother = (1<<bcf_gt_allele(a)) | (1<<bcf_gt_allele(b));
- father = (1<<bcf_gt_allele(c)) | (1<<bcf_gt_allele(d));
- child = (1<<bcf_gt_allele(e)) | (1<<bcf_gt_allele(f));
+ uint64_t mother,father,child1,child2;
+
+ // sex chr - father is haploid
+ if ( d==bcf_int32_vector_end )
+ {
+ if ( bcf_gt_is_missing(a) && (bcf_gt_is_missing(b) || b==bcf_int32_vector_end) )
+ {
+ // either the child does not match the father or the child is diploid
+ if ( bcf_gt_allele(c)!=bcf_gt_allele(e) || f!=bcf_int32_vector_end )
+ {
+ trio->nbad++;
+ has_bad = 1;
+ if ( args.mode&MODE_DELETE )
+ {
+ args.gt_arr[ngt*trio->imother] = bcf_gt_missing;
+ if ( b!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->imother+1] = bcf_gt_missing; // should be always true
+ args.gt_arr[ngt*trio->ifather] = bcf_gt_missing;
+ args.gt_arr[ngt*trio->ichild] = bcf_gt_missing;
+ if ( ngt==2 ) args.gt_arr[ngt*trio->ichild+1] = bcf_int32_vector_end;
+ needs_update = 1;
+ }
+ }
+ else
+ trio->nok++;
+ continue;
+ }
+ // son's chrX
+ if ( f==bcf_int32_vector_end )
+ {
+ // chrY - no data in mother
+ if ( bcf_gt_allele(e)!=bcf_gt_allele(a) && bcf_gt_allele(e)!=bcf_gt_allele(b) )
+ {
+ trio->nbad++;
+ has_bad = 1;
+ if ( args.mode&MODE_DELETE )
+ {
+ args.gt_arr[ngt*trio->imother] = bcf_gt_missing;
+ if ( ngt==2 ) args.gt_arr[ngt*trio->imother+1] = bcf_gt_missing;
+ args.gt_arr[ngt*trio->ifather] = bcf_gt_missing;
+ if ( d!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ifather+1] = bcf_gt_missing;
+ args.gt_arr[ngt*trio->ichild] = bcf_gt_missing;
+ needs_update = 1;
+ }
+ }
+ else
+ trio->nok++;
+ continue;
+ }
+
+ // skip genotypes which do not fit in a 64bit bitmask
+ if ( bcf_gt_allele(a) > 63 || bcf_gt_allele(b) > 63 || bcf_gt_allele(c) > 63 ) continue;
+ if ( bcf_gt_allele(e) > 63 || bcf_gt_allele(f) > 63 ) continue;
+
+ // daughter's chrX
+ mother = (1<<bcf_gt_allele(a)) | (1<<bcf_gt_allele(b));
+ father = 1<<bcf_gt_allele(c);
+ child1 = 1<<bcf_gt_allele(e);
+ child2 = 1<<bcf_gt_allele(f);
+ }
+ else
+ {
+ mother = (1<<bcf_gt_allele(a)) | (1<<bcf_gt_allele(b));
+ father = (1<<bcf_gt_allele(c)) | (1<<bcf_gt_allele(d));
+ child1 = 1<<bcf_gt_allele(e);
+ child2 = 1<<bcf_gt_allele(f);
+ }
- if ( (mother&child) && (father&child) )
+ if ( (mother&child1 && father&child2) || (mother&child2 && father&child1) )
{
trio->nok++;
}
@@ -209,18 +275,18 @@ bcf1_t *process(bcf1_t *rec)
has_bad = 1;
if ( args.mode&MODE_DELETE )
{
- args.gt_arr[2*trio->imother] = bcf_gt_missing;
- args.gt_arr[2*trio->imother+1] = bcf_gt_missing;
- args.gt_arr[2*trio->ifather] = bcf_gt_missing;
- args.gt_arr[2*trio->ifather+1] = bcf_gt_missing;
- args.gt_arr[2*trio->ichild] = bcf_gt_missing;
- args.gt_arr[2*trio->ichild+1] = bcf_gt_missing;
+ args.gt_arr[ngt*trio->imother] = bcf_gt_missing;
+ if ( b!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->imother+1] = bcf_gt_missing; // should be always true
+ args.gt_arr[ngt*trio->ifather] = bcf_gt_missing;
+ if ( d!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ifather+1] = bcf_gt_missing;
+ args.gt_arr[ngt*trio->ichild] = bcf_gt_missing;
+ if ( f!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ichild+1] = bcf_gt_missing;
needs_update = 1;
}
}
}
- if ( needs_update && bcf_update_genotypes(args.hdr,rec,args.gt_arr,ngt) )
+ if ( needs_update && bcf_update_genotypes(args.hdr,rec,args.gt_arr,ngt*bcf_hdr_nsamples(args.hdr)) )
error("Could not update GT field at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1);
if ( args.mode&MODE_DELETE ) return rec;
diff --git a/plugins/setGT.c b/plugins/setGT.c
index 5c3d68d..676159d 100644
--- a/plugins/setGT.c
+++ b/plugins/setGT.c
@@ -1,6 +1,6 @@
/* plugins/setGT.c -- set gentoypes to given values
- Copyright (C) 2015-2016 Genome Research Ltd.
+ Copyright (C) 2015-2017 Genome Research Ltd.
Author: Petr Danecek <pd3 at sanger.ac.uk>
@@ -26,8 +26,10 @@ DEALINGS IN THE SOFTWARE. */
#include <stdlib.h>
#include <htslib/vcf.h>
#include <htslib/vcfutils.h>
+#include <htslib/kfunc.h>
#include <inttypes.h>
#include <getopt.h>
+#include <ctype.h>
#include "bcftools.h"
#include "filter.h"
@@ -35,15 +37,32 @@ DEALINGS IN THE SOFTWARE. */
#define FLT_INCLUDE 1
#define FLT_EXCLUDE 2
-bcf_hdr_t *in_hdr, *out_hdr;
-int32_t *gts = NULL, mgts = 0;
-int *arr = NULL, marr = 0;
-uint64_t nchanged = 0;
-int tgt_mask = 0, new_mask = 0, new_gt = 0;
-filter_t *filter = NULL;
-char *filter_str = NULL;
-int filter_logic = 0;
-const uint8_t *smpl_pass = NULL;
+typedef int (*cmp_f)(double a, double b);
+
+static int cmp_eq(double a, double b) { return a==b ? 1 : 0; }
+static int cmp_le(double a, double b) { return a<=b ? 1 : 0; }
+static int cmp_ge(double a, double b) { return a>=b ? 1 : 0; }
+static int cmp_lt(double a, double b) { return a<b ? 1 : 0; }
+static int cmp_gt(double a, double b) { return a>b ? 1 : 0; }
+
+typedef struct
+{
+ bcf_hdr_t *in_hdr, *out_hdr;
+ int32_t *gts, mgts, *iarr, miarr;
+ int *arr, marr;
+ uint64_t nchanged;
+ int tgt_mask, new_mask, new_gt;
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic;
+ const uint8_t *smpl_pass;
+ double binom_val;
+ char *binom_tag;
+ cmp_f binom_cmp;
+}
+args_t;
+
+args_t *args = NULL;
#define GT_MISSING 1
#define GT_PARTIAL (1<<1)
@@ -53,6 +72,7 @@ const uint8_t *smpl_pass = NULL;
#define GT_UNPHASED (1<<5)
#define GT_ALL (1<<6)
#define GT_QUERY (1<<7)
+#define GT_BINOM (1<<8)
const char *about(void)
{
@@ -67,6 +87,7 @@ const char *usage(void)
" ./x .. partially missing (e.g., \"./0\" or \".|1\" but not \"./.\")\n"
" . .. partially or completely missing\n"
" a .. all genotypes\n"
+ " b .. heterozygous genotypes failing two-tailed binomial test (example below)\n"
" q .. select genotypes using -i/-e options\n"
" and the new genotype can be one of:\n"
" . .. missing (\".\" or \"./.\", keeps ploidy)\n"
@@ -93,12 +114,71 @@ const char *usage(void)
"\n"
" # set partially missing genotypes to completely missing\n"
" bcftools +setGT in.vcf -- -t ./x -n .\n"
+ "\n"
+ " # set heterozygous genotypes to 0/0 if binom.test(nAlt,nRef+nAlt,0.5)<1e-3\n"
+ " bcftools +setGT in.vcf -- -t \"b:AD<1e-3\" -n 0\n" // todo: make -i/-e recognise something like is_het or gt="het" so that this can be generalized?
"\n";
}
+void parse_binom_expr(args_t *args, char *str)
+{
+ if ( str[1]!=':' ) goto err;
+
+ char *beg = str+2;
+ while ( *beg && isspace(*beg) ) beg++;
+ if ( !*beg ) goto err;
+ char *end = beg;
+ while ( *end )
+ {
+ if ( isspace(*end) || *end=='<' || *end=='=' || *end=='>' ) break;
+ end++;
+ }
+ if ( !*end ) goto err;
+ args->binom_tag = (char*) calloc(1,end-beg+1);
+ memcpy(args->binom_tag,beg,end-beg);
+ int tag_id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,args->binom_tag);
+ if ( !bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,tag_id) ) error("The FORMAT tag \"%s\" is not present in the VCF\n", args->binom_tag);
+
+ while ( *end && isspace(*end) ) end++;
+ if ( !*end ) goto err;
+
+ if ( !strncmp(end,"<=",2) ) { args->binom_cmp = cmp_le; beg = end+2; }
+ else if ( !strncmp(end,">=",2) ) { args->binom_cmp = cmp_ge; beg = end+2; }
+ else if ( !strncmp(end,"==",2) ) { args->binom_cmp = cmp_eq; beg = end+2; }
+ else if ( !strncmp(end,"<",1) ) { args->binom_cmp = cmp_lt; beg = end+1; }
+ else if ( !strncmp(end,">",1) ) { args->binom_cmp = cmp_gt; beg = end+1; }
+ else if ( !strncmp(end,"=",1) ) { args->binom_cmp = cmp_eq; beg = end+1; }
+ else goto err;
+
+ while ( *beg && isspace(*beg) ) beg++;
+ if ( !*beg ) goto err;
+
+ args->binom_val = strtod(beg, &end);
+ while ( *end && isspace(*end) ) end++;
+ if ( *end ) goto err;
+
+ args->tgt_mask |= GT_BINOM;
+ return;
+
+err:
+ error(
+ "Error parsing the expression: %s\n"
+ "Expected TAG CMP VAL, where\n"
+ " TAG .. one of the format tags\n"
+ " CMP .. operator, one of <, <=, >, >=\n"
+ " VAL .. value\n"
+ "For example:\n"
+ " bcftools +setGT in.vcf -- -t \"b:AD>1e-3\" -n 0\n"
+ "\n", str
+ );
+}
int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{
+ args = (args_t*) calloc(1,sizeof(args_t));
+ args->in_hdr = in;
+ args->out_hdr = out;
+
int c;
static struct option loptions[] =
{
@@ -112,42 +192,41 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{
switch (c)
{
- case 'i': filter_str = optarg; filter_logic = FLT_INCLUDE; break;
- case 'e': filter_str = optarg; filter_logic = FLT_EXCLUDE; break;
- case 'n': new_mask = bcf_gt_phased(0);
- if ( strchr(optarg,'.') ) new_mask |= GT_MISSING;
- if ( strchr(optarg,'0') ) new_mask |= GT_REF;
- if ( strchr(optarg,'M') ) new_mask |= GT_MAJOR;
- if ( strchr(optarg,'p') ) new_mask |= GT_PHASED;
- if ( strchr(optarg,'u') ) new_mask |= GT_UNPHASED;
- if ( new_mask==0 ) error("Unknown parameter to --new-gt: %s\n", optarg);
+ case 'i': args->filter_str = optarg; args->filter_logic = FLT_INCLUDE; break;
+ case 'e': args->filter_str = optarg; args->filter_logic = FLT_EXCLUDE; break;
+ case 'n': args->new_mask = bcf_gt_phased(0);
+ if ( strchr(optarg,'.') ) args->new_mask |= GT_MISSING;
+ if ( strchr(optarg,'0') ) args->new_mask |= GT_REF;
+ if ( strchr(optarg,'M') ) args->new_mask |= GT_MAJOR;
+ if ( strchr(optarg,'p') ) args->new_mask |= GT_PHASED;
+ if ( strchr(optarg,'u') ) args->new_mask |= GT_UNPHASED;
+ if ( args->new_mask==0 ) error("Unknown parameter to --new-gt: %s\n", optarg);
break;
case 't':
- if ( !strcmp(optarg,".") ) tgt_mask |= GT_MISSING|GT_PARTIAL;
- if ( !strcmp(optarg,"./x") ) tgt_mask |= GT_PARTIAL;
- if ( !strcmp(optarg,"./.") ) tgt_mask |= GT_MISSING;
- if ( !strcmp(optarg,"a") ) tgt_mask |= GT_ALL;
- if ( !strcmp(optarg,"q") ) tgt_mask |= GT_QUERY;
- if ( !strcmp(optarg,"?") ) tgt_mask |= GT_QUERY; // for backward compatibility
- if ( tgt_mask==0 ) error("Unknown parameter to --target-gt: %s\n", optarg);
+ if ( !strcmp(optarg,".") ) args->tgt_mask |= GT_MISSING|GT_PARTIAL;
+ if ( !strcmp(optarg,"./x") ) args->tgt_mask |= GT_PARTIAL;
+ if ( !strcmp(optarg,"./.") ) args->tgt_mask |= GT_MISSING;
+ if ( !strcmp(optarg,"a") ) args->tgt_mask |= GT_ALL;
+ if ( !strcmp(optarg,"q") ) args->tgt_mask |= GT_QUERY;
+ if ( !strcmp(optarg,"?") ) args->tgt_mask |= GT_QUERY; // for backward compatibility
+ if ( strchr(optarg,'b') ) parse_binom_expr(args, strchr(optarg,'b'));
+ if ( args->tgt_mask==0 ) error("Unknown parameter to --target-gt: %s\n", optarg);
break;
case 'h':
case '?':
default: fprintf(stderr,"%s", usage()); exit(1); break;
}
}
- in_hdr = in;
- out_hdr = out;
- if ( !new_mask ) error("Expected -n option\n");
- if ( !tgt_mask ) error("Expected -t option\n");
+ if ( !args->new_mask ) error("Expected -n option\n");
+ if ( !args->tgt_mask ) error("Expected -t option\n");
- if ( new_mask & GT_MISSING ) new_gt = bcf_gt_missing;
- if ( new_mask & GT_REF ) new_gt = new_mask>_PHASED ? bcf_gt_phased(0) : bcf_gt_unphased(0);
+ if ( args->new_mask & GT_MISSING ) args->new_gt = bcf_gt_missing;
+ if ( args->new_mask & GT_REF ) args->new_gt = args->new_mask>_PHASED ? bcf_gt_phased(0) : bcf_gt_unphased(0);
- if ( filter_str && tgt_mask!=GT_QUERY ) error("Expected -t? with -i/-e\n");
- if ( !filter_str && tgt_mask>_QUERY ) error("Expected -i/-e with -t?\n");
- if ( filter_str ) filter = filter_init(in,filter_str);
+ if ( args->filter_str && !(args->tgt_mask>_QUERY) ) error("Expected -tq with -i/-e\n");
+ if ( !args->filter_str && args->tgt_mask>_QUERY ) error("Expected -i/-e with -tq\n");
+ if ( args->filter_str ) args->filter = filter_init(in,args->filter_str);
return 0;
}
@@ -184,61 +263,111 @@ static inline int set_gt(int32_t *ptr, int ngts, int gt)
for (j=0; j<ngts; j++)
{
if ( ptr[j]==bcf_int32_vector_end ) break;
+ if ( ptr[j] != gt ) changed++;
ptr[j] = gt;
- changed++;
}
return changed;
}
+static inline double calc_binom(int na, int nb)
+{
+ int N = na + nb;
+ if ( !N ) return 1;
+
+ /*
+ kfunc.h implements kf_betai, which is the regularized beta function I_x(a,b) = P(X<=a/(a+b))
+ */
+ double prob = 2 * kf_betai(na, nb+1, 0.5);
+ if ( prob > 1 ) prob = 1;
+
+ return prob;
+}
+
bcf1_t *process(bcf1_t *rec)
{
if ( !rec->n_sample ) return rec;
- int ngts = bcf_get_genotypes(in_hdr, rec, >s, &mgts);
+ int ngts = bcf_get_genotypes(args->in_hdr, rec, &args->gts, &args->mgts);
ngts /= rec->n_sample;
int i, j, changed = 0;
+
+ int nbinom = 0;
+ if ( args->tgt_mask & GT_BINOM )
+ {
+ nbinom = bcf_get_format_int32(args->in_hdr, rec, args->binom_tag, &args->iarr, &args->miarr);
+ if ( nbinom<0 ) nbinom = 0;
+ nbinom /= rec->n_sample;
+ }
// Calculating allele frequency for each allele and determining major allele
// only do this if use_major is true
int an = 0, maxAC = -1, majorAllele = -1;
- if ( new_mask & GT_MAJOR )
+ if ( args->new_mask & GT_MAJOR )
{
- hts_expand(int,rec->n_allele,marr,arr);
- int ret = bcf_calc_ac(in_hdr,rec,arr,BCF_UN_FMT);
+ hts_expand(int,rec->n_allele,args->marr,args->arr);
+ int ret = bcf_calc_ac(args->in_hdr,rec,args->arr,BCF_UN_FMT);
if ( ret<= 0 )
- error("Could not calculate allele count at %s:%d\n", bcf_seqname(in_hdr,rec),rec->pos+1);
+ error("Could not calculate allele count at %s:%d\n", bcf_seqname(args->in_hdr,rec),rec->pos+1);
for(i=0; i < rec->n_allele; ++i)
{
- an += arr[i];
- if (arr[i] > maxAC)
+ an += args->arr[i];
+ if (args->arr[i] > maxAC)
{
- maxAC = arr[i];
+ maxAC = args->arr[i];
majorAllele = i;
}
}
// replacing new_gt by major allele
- new_gt = new_mask & GT_PHASED ? bcf_gt_phased(majorAllele) : bcf_gt_unphased(majorAllele);
+ args->new_gt = args->new_mask & GT_PHASED ? bcf_gt_phased(majorAllele) : bcf_gt_unphased(majorAllele);
}
// replace gts
- if ( tgt_mask>_QUERY )
+ if ( nbinom && ngts>=2 ) // only diploid genotypes are considered: higher ploidy ignored further, haploid here
+ {
+ if ( args->filter ) filter_test(args->filter,rec,&args->smpl_pass);
+ for (i=0; i<rec->n_sample; i++)
+ {
+ if ( args->smpl_pass )
+ {
+ if ( !args->smpl_pass[i] && args->filter_logic==FLT_INCLUDE ) continue;
+ if ( args->smpl_pass[i] && args->filter_logic==FLT_EXCLUDE ) continue;
+ }
+ int32_t *ptr = args->gts + i*ngts;
+ if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) || ptr[1]==bcf_int32_vector_end ) continue;
+ if ( ptr[0]==ptr[1] ) continue; // a hom
+ int ia = bcf_gt_allele(ptr[0]);
+ int ib = bcf_gt_allele(ptr[1]);
+ if ( ia>=nbinom || ib>=nbinom )
+ error("The sample %s has incorrect number of %s fields at %s:%d\n",
+ args->in_hdr->samples[i],args->binom_tag,bcf_seqname(args->in_hdr,rec),rec->pos+1);
+
+ double prob = calc_binom(args->iarr[i*nbinom+ia],args->iarr[i*nbinom+ib]);
+ if ( !args->binom_cmp(prob,args->binom_val) ) continue;
+
+ if ( args->new_mask>_UNPHASED )
+ changed += unphase_gt(ptr, ngts);
+ else
+ changed += set_gt(ptr, ngts, args->new_gt);
+ }
+ }
+ else if ( args->tgt_mask>_QUERY )
{
- int pass_site = filter_test(filter,rec,&smpl_pass);
- if ( (pass_site && filter_logic==FLT_EXCLUDE) || (!pass_site && filter_logic==FLT_INCLUDE) ) return rec;
+ int pass_site = filter_test(args->filter,rec,&args->smpl_pass);
+ if ( (pass_site && args->filter_logic==FLT_EXCLUDE) || (!pass_site && args->filter_logic==FLT_INCLUDE) ) return rec;
for (i=0; i<rec->n_sample; i++)
{
- if ( smpl_pass )
+ if ( args->smpl_pass )
{
- if ( !smpl_pass[i] && filter_logic==FLT_INCLUDE ) continue;
- if ( smpl_pass[i] && filter_logic==FLT_EXCLUDE ) continue;
+ if ( !args->smpl_pass[i] && args->filter_logic==FLT_INCLUDE ) continue;
+ if ( args->smpl_pass[i] && args->filter_logic==FLT_EXCLUDE ) continue;
}
- if ( new_mask>_UNPHASED )
- changed += unphase_gt(gts + i*ngts, ngts);
+ if ( args->new_mask>_UNPHASED )
+ changed += unphase_gt(args->gts + i*ngts, ngts);
else
- changed += set_gt(gts + i*ngts, ngts, new_gt);
+ changed += set_gt(args->gts + i*ngts, ngts, args->new_gt);
}
}
else
@@ -246,7 +375,7 @@ bcf1_t *process(bcf1_t *rec)
for (i=0; i<rec->n_sample; i++)
{
int ploidy = 0, nmiss = 0;
- int32_t *ptr = gts + i*ngts;
+ int32_t *ptr = args->gts + i*ngts;
for (j=0; j<ngts; j++)
{
if ( ptr[j]==bcf_int32_vector_end ) break;
@@ -255,29 +384,32 @@ bcf1_t *process(bcf1_t *rec)
}
int do_set = 0;
- if ( tgt_mask>_ALL ) do_set = 1;
- else if ( tgt_mask>_PARTIAL && nmiss ) do_set = 1;
- else if ( tgt_mask>_MISSING && ploidy==nmiss ) do_set = 1;
+ if ( args->tgt_mask>_ALL ) do_set = 1;
+ else if ( args->tgt_mask>_PARTIAL && nmiss ) do_set = 1;
+ else if ( args->tgt_mask>_MISSING && ploidy==nmiss ) do_set = 1;
if ( !do_set ) continue;
- if ( new_mask>_UNPHASED )
+ if ( args->new_mask>_UNPHASED )
changed += unphase_gt(ptr, ngts);
else
- changed += set_gt(ptr, ngts, new_gt);
+ changed += set_gt(ptr, ngts, args->new_gt);
}
}
- nchanged += changed;
- if ( changed ) bcf_update_genotypes(out_hdr, rec, gts, ngts*rec->n_sample);
+ args->nchanged += changed;
+ if ( changed ) bcf_update_genotypes(args->out_hdr, rec, args->gts, ngts*rec->n_sample);
return rec;
}
void destroy(void)
{
- if ( filter ) filter_destroy(filter);
- free(arr);
- fprintf(stderr,"Filled %"PRId64" alleles\n", nchanged);
- free(gts);
+ fprintf(stderr,"Filled %"PRId64" alleles\n", args->nchanged);
+ free(args->binom_tag);
+ if ( args->filter ) filter_destroy(args->filter);
+ free(args->arr);
+ free(args->iarr);
+ free(args->gts);
+ free(args);
}
diff --git a/test/check.chk b/test/check.chk
index c04a1bf..865773c 100644
--- a/test/check.chk
+++ b/test/check.chk
@@ -2,6 +2,20 @@
# Definition of sets:
# ID [2]id [3]tab-separated file names
# SN, Summary numbers:
+# number of records .. number of data rows in the VCF
+# number of no-ALTs .. reference-only sites, ALT is either "." or identical to REF
+# number of SNPs .. number of rows with a SNP
+# number of MNPs .. number of rows with a MNP, such as CC>TT
+# number of indels .. number of rows with an indel
+# number of others .. number of rows with other type, for example a symbolic allele or
+# a complex substitution, such as ACT>TCGA
+# number of multiallelic sites .. number of rows with multiple alternate alleles
+# number of multiallelic SNP sites .. number of rows with multiple alternate alleles, all SNPs
+#
+# Note that rows containing multiple types will be counted multiple times, in each
+# counter. For example, a row with a SNP and an indel increments both the SNP and
+# the indel counter.
+#
# SN [2]id [3]key [4]value
SN 0 number of samples: 2
SN 0 number of records: 18
@@ -71,10 +85,10 @@ DP 0 32 4 11.111111 0 0.000000
DP 0 35 4 11.111111 0 0.000000
DP 0 60 0 0.000000 1 33.333333
DP 0 62 0 0.000000 2 66.666667
-# PSC, Per-sample counts
-# PSC [2]id [3]sample [4]nRefHom [5]nNonRefHom [6]nHets [7]nTransitions [8]nTransversions [9]nIndels [10]average depth [11]nSingletons
-PSC 0 A 0 2 4 3 2 9 28.7 1
-PSC 0 B 1 1 4 2 2 9 28.7 0
+# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. Haploid counts include both SNPs and indels.
+# PSC [2]id [3]sample [4]nRefHom [5]nNonRefHom [6]nHets [7]nTransitions [8]nTransversions [9]nIndels [10]average depth [11]nSingletons [12]nHapRef [13]nHapAlt
+PSC 0 A 0 2 4 3 2 9 28.7 1 0 0
+PSC 0 B 1 1 4 2 2 9 28.7 0 0 0
# PSI, Per-Sample Indels
# PSI [2]id [3]sample [4]in-frame [5]out-frame [6]not applicable [7]out/(in+out) ratio [8]nHets [9]nAA
PSI 0 A 0 0 0 0.00 9 0
diff --git a/test/convert.gvcf.out b/test/convert.gvcf.out
index d2c0fc9..4cd237b 100644
--- a/test/convert.gvcf.out
+++ b/test/convert.gvcf.out
@@ -112,7 +112,7 @@
22 83 . A . 0 PASS BLOCKAVG_min30p3a GT:GQX:DP:DPF 0/0:54:19:0
22 84 . C . 0 PASS BLOCKAVG_min30p3a GT:GQX:DP:DPF 0/0:54:19:0
22 85 . G . 0 PASS BLOCKAVG_min30p3a GT:GQX:DP:DPF 0/0:54:19:0
-22 86 . G C 23 LowGQX SNVSB=0;SNVHPOL=2 GT:GQ:GQX:DP:DPF:AD 0/1:56:23:22:0:16,6
+22 86 . G C 23 PASS SNVSB=0;SNVHPOL=2 GT:GQ:GQX:DP:DPF:AD 0/1:56:23:22:0:16,6
22 87 . T . 0 PASS BLOCKAVG_min30p3a GT:GQX:DP:DPF 0/0:69:24:0
22 88 . C . 0 PASS BLOCKAVG_min30p3a GT:GQX:DP:DPF 0/0:69:24:0
22 89 . A . 0 PASS BLOCKAVG_min30p3a GT:GQX:DP:DPF 0/0:69:24:0
diff --git a/test/convert.gvcf.vcf b/test/convert.gvcf.vcf
index 6dfc44d..378edd1 100644
--- a/test/convert.gvcf.vcf
+++ b/test/convert.gvcf.vcf
@@ -80,7 +80,7 @@
22 51 . C . 0 PASS END=55;BLOCKAVG_min30p3a GT:GQX:DP:DPF 0/0:30:11:0
22 56 . G . 0 PASS END=72;BLOCKAVG_min30p3a GT:GQX:DP:DPF 0/0:42:15:0
22 73 . T . 0 PASS END=85;BLOCKAVG_min30p3a GT:GQX:DP:DPF 0/0:54:19:0
-22 86 . G C 23 LowGQX SNVSB=0;SNVHPOL=2 GT:GQ:GQX:DP:DPF:AD 0/1:56:23:22:0:16,6
+22 86 . G C 23 PASS SNVSB=0;SNVHPOL=2 GT:GQ:GQX:DP:DPF:AD 0/1:56:23:22:0:16,6
22 87 . T . 0 PASS END=101;BLOCKAVG_min30p3a GT:GQX:DP:DPF 0/0:69:24:0
22 102 . A . 0 PASS END=140;BLOCKAVG_min30p3a GT:GQX:DP:DPF 0/0:84:29:0
22 141 . G . 0 PASS END=185;BLOCKAVG_min30p3a GT:GQX:DP:DPF 0/0:90:31:0
diff --git a/test/filter.12.out b/test/filter.12.out
new file mode 100644
index 0000000..2726105
--- /dev/null
+++ b/test/filter.12.out
@@ -0,0 +1 @@
+3062915 0/1 2
diff --git a/test/filter.13.out b/test/filter.13.out
new file mode 100644
index 0000000..f79e69c
--- /dev/null
+++ b/test/filter.13.out
@@ -0,0 +1,2 @@
+3177144 0/0 1/1
+3177144 0/0 0/0
diff --git a/test/filter.14.out b/test/filter.14.out
new file mode 100644
index 0000000..a04ac1e
--- /dev/null
+++ b/test/filter.14.out
@@ -0,0 +1,8 @@
+3000150 0/1 0/1
+3000151 0/1 0/1
+3062915 0/1 0/1
+3062915 0/1 2
+3106154 0/1 0/1
+3106154 0/1 0/1
+3162006 0/1 0/1
+3258448 0/1 0/1
diff --git a/test/filter.15.out b/test/filter.15.out
new file mode 100644
index 0000000..829c42c
--- /dev/null
+++ b/test/filter.15.out
@@ -0,0 +1,2 @@
+3157410 1/1 1/1
+3177144 0/0 1/1
diff --git a/test/filter.16.out b/test/filter.16.out
new file mode 100644
index 0000000..1f9f385
--- /dev/null
+++ b/test/filter.16.out
@@ -0,0 +1,3 @@
+3184885 1/2 1/2
+3199812 1/2 1/2
+3212016 1/2 1/2
diff --git a/test/filter.17.out b/test/filter.17.out
new file mode 100644
index 0000000..8b5b20a
--- /dev/null
+++ b/test/filter.17.out
@@ -0,0 +1,3 @@
+3157410 1/1 1/1
+3177144 0/0 1/1
+3177144 0/0 0/0
diff --git a/test/filter.18.out b/test/filter.18.out
new file mode 100644
index 0000000..d14486f
--- /dev/null
+++ b/test/filter.18.out
@@ -0,0 +1,11 @@
+3000150 0/1 0/1
+3000151 0/1 0/1
+3062915 0/1 0/1
+3062915 0/1 2
+3106154 0/1 0/1
+3106154 0/1 0/1
+3162006 0/1 0/1
+3184885 1/2 1/2
+3199812 1/2 1/2
+3212016 1/2 1/2
+3258448 0/1 0/1
diff --git a/test/filter.19.out b/test/filter.19.out
new file mode 100644
index 0000000..2726105
--- /dev/null
+++ b/test/filter.19.out
@@ -0,0 +1 @@
+3062915 0/1 2
diff --git a/test/mendelian.1.out b/test/mendelian.1.out
index 414f79b..e494efd 100644
--- a/test/mendelian.1.out
+++ b/test/mendelian.1.out
@@ -1,9 +1,37 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
-##contig=<ID=1,assembly=b37,length=249250621>
-##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
-#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT mom1 dad1 child1 mom2 dad2 child2
-1 100 . A G 100 PASS . GT ./. ./. ./. 1/1 0/1 1/1
-1 101 . A T 100 PASS . GT 0/0 0/1 0/1 1/1 0/1 1/0
-1 102 . A T 100 PASS . GT 0/0 0/1 0/1 1/1 0/1 1/0
+##contig=<ID=1>
+##contig=<ID=2>
+##contig=<ID=3>
+##contig=<ID=4>
+##contig=<ID=5>
+##contig=<ID=6>
+##contig=<ID=X>
+##contig=<ID=Y>
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT mom1 dad1 child1
+1 100 . A G 100 PASS . GT 0/0 0/0 0/0
+1 101 . A G 100 PASS . GT ./. ./. ./.
+1 102 . A G 100 PASS . GT ./. ./. ./.
+2 200 . A G 100 PASS . GT 0/0 0/1 0/0
+2 201 . A G 100 PASS . GT 0/0 0/1 0/1
+2 202 . A G 100 PASS . GT ./. ./. ./.
+3 300 . A G 100 PASS . GT ./. ./. ./.
+3 301 . A G 100 PASS . GT 0/0 1/1 0/1
+3 302 . A G 100 PASS . GT ./. ./. ./.
+4 400 . A G 100 PASS . GT 0/1 0/0 0/0
+4 401 . A G 100 PASS . GT 0/1 0/0 0/1
+4 402 . A G 100 PASS . GT ./. ./. ./.
+5 500 . A G 100 PASS . GT 0/1 0/1 0/0
+5 501 . A G 100 PASS . GT 0/1 0/1 0/1
+5 502 . A G 100 PASS . GT 0/1 0/1 1/1
+6 600 . A G 100 PASS . GT ./. ./. ./.
+6 601 . A G 100 PASS . GT 0/1 1/1 0/1
+6 602 . A G 100 PASS . GT 0/1 1/1 1/1
+X 100 . A G 100 PASS . GT 0/0 1 0
+X 101 . A G 100 PASS . GT ./. . .
+X 102 . A G 100 PASS . GT 0/0 1 0/1
+X 103 . A G 100 PASS . GT ./. . ./.
+Y 101 . A G 100 PASS . GT . 0 0
+Y 102 . A G 100 PASS . GT . . .
+Y 103 . A G 100 PASS . GT . 0 .
diff --git a/test/mendelian.2.out b/test/mendelian.2.out
index 81155ae..fe15ae7 100644
--- a/test/mendelian.2.out
+++ b/test/mendelian.2.out
@@ -1,8 +1,27 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
-##contig=<ID=1,assembly=b37,length=249250621>
-##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
-#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT mom1 dad1 child1 mom2 dad2 child2
-1 101 . A T 100 PASS . GT 0/0 0/1 0/1 1/1 0/1 1/0
-1 102 . A T 100 PASS . GT 0/0 0/1 0/1 1/1 0/1 1/0
+##contig=<ID=1>
+##contig=<ID=2>
+##contig=<ID=3>
+##contig=<ID=4>
+##contig=<ID=5>
+##contig=<ID=6>
+##contig=<ID=X>
+##contig=<ID=Y>
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT mom1 dad1 child1
+1 100 . A G 100 PASS . GT 0/0 0/0 0/0
+2 200 . A G 100 PASS . GT 0/0 0/1 0/0
+2 201 . A G 100 PASS . GT 0/0 0/1 0/1
+3 301 . A G 100 PASS . GT 0/0 1/1 0/1
+4 400 . A G 100 PASS . GT 0/1 0/0 0/0
+4 401 . A G 100 PASS . GT 0/1 0/0 0/1
+5 500 . A G 100 PASS . GT 0/1 0/1 0/0
+5 501 . A G 100 PASS . GT 0/1 0/1 0/1
+5 502 . A G 100 PASS . GT 0/1 0/1 1/1
+6 601 . A G 100 PASS . GT 0/1 1/1 0/1
+6 602 . A G 100 PASS . GT 0/1 1/1 1/1
+X 100 . A G 100 PASS . GT 0/0 1 0
+X 102 . A G 100 PASS . GT 0/0 1 0/1
+Y 101 . A G 100 PASS . GT . 0 0
+Y 103 . A G 100 PASS . GT . 0 .
diff --git a/test/mendelian.3.out b/test/mendelian.3.out
index 9ea430a..7accace 100644
--- a/test/mendelian.3.out
+++ b/test/mendelian.3.out
@@ -1,7 +1,22 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
-##contig=<ID=1,assembly=b37,length=249250621>
-##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
-#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT mom1 dad1 child1 mom2 dad2 child2
-1 100 . A G 100 PASS . GT 0/0 0/0 1/1 1/1 0/1 1/1
+##contig=<ID=1>
+##contig=<ID=2>
+##contig=<ID=3>
+##contig=<ID=4>
+##contig=<ID=5>
+##contig=<ID=6>
+##contig=<ID=X>
+##contig=<ID=Y>
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT mom1 dad1 child1
+1 101 . A G 100 PASS . GT 0/0 0/0 0/1
+1 102 . A G 100 PASS . GT 0/0 0/0 1/1
+2 202 . A G 100 PASS . GT 0/0 0/1 1/1
+3 300 . A G 100 PASS . GT 0/0 1/1 0/0
+3 302 . A G 100 PASS . GT 0/0 1/1 1/1
+4 402 . A G 100 PASS . GT 0/1 0/0 1/1
+6 600 . A G 100 PASS . GT 0/1 1/1 0/0
+X 101 . A G 100 PASS . GT 0/0 1 1
+X 103 . A G 100 PASS . GT 0/0 0 0/1
+Y 102 . A G 100 PASS . GT . 0 1
diff --git a/test/mendelian.vcf b/test/mendelian.vcf
index c43ba07..a0e90df 100644
--- a/test/mendelian.vcf
+++ b/test/mendelian.vcf
@@ -1,8 +1,36 @@
##fileformat=VCFv4.2
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
-##contig=<ID=1,assembly=b37,length=249250621>
-##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
-#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT mom1 dad1 child1 mom2 dad2 child2
-1 100 . A G 100 PASS . GT 0/0 0/0 1/1 1/1 0/1 1/1
-1 101 . A T 100 PASS . GT 0/0 0/1 0/1 1/1 0/1 1/0
-1 102 . A T 100 PASS . GT 0/0 0/1 0/1 1/1 0/1 1/0
+##contig=<ID=1>
+##contig=<ID=2>
+##contig=<ID=3>
+##contig=<ID=4>
+##contig=<ID=5>
+##contig=<ID=6>
+##contig=<ID=X>
+##contig=<ID=Y>
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT mom1 dad1 child1
+1 100 . A G 100 PASS . GT 0/0 0/0 0/0
+1 101 . A G 100 PASS . GT 0/0 0/0 0/1
+1 102 . A G 100 PASS . GT 0/0 0/0 1/1
+2 200 . A G 100 PASS . GT 0/0 0/1 0/0
+2 201 . A G 100 PASS . GT 0/0 0/1 0/1
+2 202 . A G 100 PASS . GT 0/0 0/1 1/1
+3 300 . A G 100 PASS . GT 0/0 1/1 0/0
+3 301 . A G 100 PASS . GT 0/0 1/1 0/1
+3 302 . A G 100 PASS . GT 0/0 1/1 1/1
+4 400 . A G 100 PASS . GT 0/1 0/0 0/0
+4 401 . A G 100 PASS . GT 0/1 0/0 0/1
+4 402 . A G 100 PASS . GT 0/1 0/0 1/1
+5 500 . A G 100 PASS . GT 0/1 0/1 0/0
+5 501 . A G 100 PASS . GT 0/1 0/1 0/1
+5 502 . A G 100 PASS . GT 0/1 0/1 1/1
+6 600 . A G 100 PASS . GT 0/1 1/1 0/0
+6 601 . A G 100 PASS . GT 0/1 1/1 0/1
+6 602 . A G 100 PASS . GT 0/1 1/1 1/1
+X 100 . A G 100 PASS . GT 0/0 1 0
+X 101 . A G 100 PASS . GT 0/0 1 1
+X 102 . A G 100 PASS . GT 0/0 1 0/1
+X 103 . A G 100 PASS . GT 0/0 0 0/1
+Y 101 . A G 100 PASS . GT . 0 0
+Y 102 . A G 100 PASS . GT . 0 1
+Y 103 . A G 100 PASS . GT . 0 .
diff --git a/test/query.34.out b/test/query.34.out
new file mode 100644
index 0000000..ba8cd61
--- /dev/null
+++ b/test/query.34.out
@@ -0,0 +1 @@
+3162007 AAAAAA,BBBBB,CCCC,DDD,EE,F AAAAAA,BBB,C
diff --git a/test/query.35.out b/test/query.35.out
new file mode 100644
index 0000000..25e8f6b
--- /dev/null
+++ b/test/query.35.out
@@ -0,0 +1 @@
+3162007 1,2,3,4,5,6 1,2,3
diff --git a/test/query.36.out b/test/query.36.out
new file mode 100644
index 0000000..e4ef348
--- /dev/null
+++ b/test/query.36.out
@@ -0,0 +1 @@
+3162007 0.1,0.02,0.003,0.0004,5e-05,6e-06 0.1,0.02,0.003
diff --git a/test/sort.out b/test/sort.out
new file mode 100644
index 0000000..fa723cd
--- /dev/null
+++ b/test/sort.out
@@ -0,0 +1,15 @@
+1 101
+1 102
+1 103
+1 104
+1 105
+2 101
+2 102
+2 103
+2 104
+2 105
+3 101
+3 102
+3 103
+3 104
+3 105
diff --git a/test/sort.vcf b/test/sort.vcf
new file mode 100644
index 0000000..649bd01
--- /dev/null
+++ b/test/sort.vcf
@@ -0,0 +1,21 @@
+##fileformat=VCFv4.2
+##reference=file:///ref.fa
+##contig=<ID=1,length=2147483647>
+##contig=<ID=2,length=2147483647>
+##contig=<ID=3,length=2147483647>
+#CHROM POS ID REF ALT QUAL FILTER INFO
+3 105 . T C . . .
+3 104 . T C . . .
+3 103 . T C . . .
+3 102 . T C . . .
+3 101 . T C . . .
+2 105 . T C . . .
+2 104 . T C . . .
+2 103 . T C . . .
+2 102 . T C . . .
+2 101 . T C . . .
+1 105 . T C . . .
+1 104 . T C . . .
+1 103 . T C . . .
+1 102 . T C . . .
+1 101 . T C . . .
diff --git a/test/stats.B.chk b/test/stats.B.chk
index 3245562..2551b64 100644
--- a/test/stats.B.chk
+++ b/test/stats.B.chk
@@ -77,9 +77,9 @@ GCsS 2 B 75.000 0 1 0 0 3 0 0
GCiS 2 B 0.000 0 0 0 0 0 0 0
GCTs B 0 0 0 0 0 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
GCTi B 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-PSC 0 B 0 0 0 0 0 0 0.0 0
-PSC 1 B 0 0 0 0 0 0 0.0 0
-PSC 2 B 0 0 4 3 1 0 0.0 4
+PSC 0 B 0 0 0 0 0 0 0.0 0 0 0
+PSC 1 B 0 0 0 0 0 0 0.0 0 0 0
+PSC 2 B 0 0 4 3 1 0 0.0 4 0 0
PSI 0 B 0 0 0 0.00 0 0
PSI 1 B 0 0 0 0.00 0 0
PSI 2 B 0 0 0 0.00 0 0
diff --git a/test/stats.chk b/test/stats.chk
index af03c3e..4c1a95c 100644
--- a/test/stats.chk
+++ b/test/stats.chk
@@ -85,15 +85,15 @@ GCTs C 0 0 1 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0
GCTi A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
GCTi B 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
GCTi C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-PSC 0 A 0 0 0 0 0 0 0.0 0
-PSC 0 B 0 0 0 0 0 0 0.0 0
-PSC 0 C 0 0 0 0 0 0 0.0 0
-PSC 1 A 0 0 0 0 0 0 0.0 0
-PSC 1 B 0 0 0 0 0 0 0.0 0
-PSC 1 C 0 0 0 0 0 0 0.0 0
-PSC 2 A 4 0 0 0 0 0 0.0 0
-PSC 2 B 0 0 4 3 1 0 0.0 1
-PSC 2 C 1 3 0 3 0 0 0.0 0
+PSC 0 A 0 0 0 0 0 0 0.0 0 0 0
+PSC 0 B 0 0 0 0 0 0 0.0 0 0 0
+PSC 0 C 0 0 0 0 0 0 0.0 0 0 0
+PSC 1 A 0 0 0 0 0 0 0.0 0 0 0
+PSC 1 B 0 0 0 0 0 0 0.0 0 0 0
+PSC 1 C 0 0 0 0 0 0 0.0 0 0 0
+PSC 2 A 4 0 0 0 0 0 0.0 0 0 0
+PSC 2 B 0 0 4 3 1 0 0.0 1 0 0
+PSC 2 C 1 3 0 3 0 0 0.0 0 0 0
PSI 0 A 0 0 0 0.00 0 0
PSI 0 B 0 0 0 0.00 0 0
PSI 0 C 0 0 0 0.00 0 0
diff --git a/test/stats.counts.chk b/test/stats.counts.chk
new file mode 100644
index 0000000..b14de7d
--- /dev/null
+++ b/test/stats.counts.chk
@@ -0,0 +1,37 @@
+SN 0 number of samples: 3
+SN 0 number of records: 11
+SN 0 number of no-ALTs: 2
+SN 0 number of SNPs: 6
+SN 0 number of MNPs: 2
+SN 0 number of indels: 1
+SN 0 number of others: 2
+SN 0 number of multiallelic sites: 4
+SN 0 number of multiallelic SNP sites: 2
+TSTV 0 4 3 1.33 3 2 1.50
+SiS 0 1 2 1 1 1 0 0 1
+AF 0 0.000000 2 1 1 1 0 0 1
+AF 0 0.330000 2 1 1 0 0 0 0
+AF 0 0.490000 3 2 1 0 0 0 0
+QUAL 0 998 6 3 2 1
+IDD 0 1 1
+ST 0 A>C 0
+ST 0 A>G 0
+ST 0 A>T 0
+ST 0 C>A 1
+ST 0 C>G 1
+ST 0 C>T 1
+ST 0 G>A 3
+ST 0 G>C 1
+ST 0 G>T 0
+ST 0 T>A 0
+ST 0 T>C 0
+ST 0 T>G 0
+PSC 0 A 11 0 0 0 0 0 0.0 0 0 0
+PSC 0 B 1 1 4 3 1 1 0.0 0 1 0
+PSC 0 C 1 5 0 3 1 0 0.0 0 0 1
+PSI 0 A 0 0 0 0.00 0 0
+PSI 0 B 0 0 0 0.00 1 0
+PSI 0 C 0 0 0 0.00 0 0
+HWE 0 0.000000 3 0.000000 0.000000 0.000000
+HWE 0 0.330000 1 0.000000 0.000000 0.000000
+HWE 0 0.490000 7 0.330000 0.330000 0.330000
diff --git a/test/stats.counts.vcf b/test/stats.counts.vcf
new file mode 100644
index 0000000..d4dc075
--- /dev/null
+++ b/test/stats.counts.vcf
@@ -0,0 +1,18 @@
+##fileformat=VCFv4.2
+##ALT=<ID=DEL,Description="Deletion">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=1,assembly=b37,length=249250621>
+##contig=<ID=X,assembly=b37,length=249250621>
+##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C
+1 1000 . G A . PASS . GT 0/0 0/1 1/1
+1 1001 . G A,C . PASS . GT 0/0 1/2 2/2
+1 1002 . G A,GA . PASS . GT 0/0 1/2 1/1
+1 1003 . CC TT . PASS . GT 0/0 0/1 1/1
+1 1003 . CC TT,CT . PASS . GT 0/0 0/1 1/1
+1 1003 . CACC TAT . PASS . GT 0/0 0/1 1/1
+1 1004 . C <DEL> . PASS . GT 0/0 0/1 1/1
+1 1005 . C . . PASS . GT 0/0 0/0 0/0
+1 1006 . CC CC . PASS . GT 0/0 0/1 1/1
+1 1006 . C G,T . PASS . GT 0/0 1/1 2/2
+X 1000 . C A . PASS . GT 0/0 0 1
diff --git a/test/test.pl b/test/test.pl
index e65aa0e..e10b881 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -46,6 +46,7 @@ test_vcf_check($opts,in=>'check',out=>'check.chk');
test_vcf_check_merge($opts,in=>'check',out=>'check_merge.chk');
test_vcf_stats($opts,in=>['stats.a','stats.b'],out=>'stats.chk',args=>'-s -');
test_vcf_stats($opts,in=>['stats.a','stats.b'],out=>'stats.B.chk',args=>'-s B');
+test_vcf_stats($opts,in=>['stats.counts'],out=>'stats.counts.chk',args=>'-s -');
test_vcf_isec($opts,in=>['isec.a','isec.b'],out=>'isec.ab.out',args=>'-n =2');
test_vcf_isec($opts,in=>['isec.a','isec.b'],out=>'isec.ab.flt.out',args=>'-n =2 -i"STRLEN(REF)==2"');
test_vcf_isec($opts,in=>['isec.a','isec.b'],out=>'isec.ab.both.out',args=>'-n =2 -c both');
@@ -110,6 +111,9 @@ test_vcf_query($opts,in=>'filter-missing-floats',out=>'query.30.out',args=>q[-f'
test_vcf_query($opts,in=>'filter-missing-floats',out=>'query.31.out',args=>q[-f'%POS\\t%A_AF\\t%B_AF\\t%C_AF\\n' -e'A_AF>=0.0001 || B_AF >= 0.0001 || C_AF >= 0.0001']);
test_vcf_query($opts,in=>'missing',out=>'query.32.out',args=>q[-i'FMT/FINT!="."' -f'[\t%FINT]\\n']);
test_vcf_query($opts,in=>'query.filter.2',out=>'query.33.out',args=>q[-f'[%GT]\\n' -i'GT~"0/[1-9]" || GT~"[1-9]/0"']);
+test_vcf_query($opts,in=>'view.filter',out=>'query.34.out',args=>q[-f'%POS[ %FGS]\\n' -i'FGS[1-2,4]="EE"']);
+test_vcf_query($opts,in=>'view.filter',out=>'query.35.out',args=>q[-f'%POS[ %FGI]\\n' -i'FGI[1-2,5]=6']);
+test_vcf_query($opts,in=>'view.filter',out=>'query.36.out',args=>q[-f'%POS[ %FGF]\\n' -i'FGF[1-3,4]=5e-5']);
test_vcf_norm($opts,in=>'norm',out=>'norm.out',fai=>'norm',args=>'-cx');
test_vcf_norm($opts,in=>'norm.split',out=>'norm.split.out',args=>'-m-');
test_vcf_norm($opts,in=>'norm.split.2',out=>'norm.split.2.out',args=>'-m-');
@@ -185,6 +189,18 @@ test_vcf_filter($opts,in=>'filter.2',out=>'filter.8.out',args=>q[-i'AC[*]=2 && F
test_vcf_filter($opts,in=>'filter.2',out=>'filter.9.out',args=>q[-i'ALT="."'],fmt=>'%POS\\t%AC[\\t%GT]\\n');
test_vcf_filter($opts,in=>'filter.4',out=>'filter.10.out',args=>q[-S . -i 'FORMAT/TEST3<25']);
test_vcf_filter($opts,in=>'filter.4',out=>'filter.10.out',args=>q[-S . -i 'FORMAT/TEST4<25']);
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.12.out',args=>q[-i'GT="A"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.13.out',args=>q[-i'GT="RR"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.14.out',args=>q[-i'GT="RA"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.14.out',args=>q[-i'GT="AR"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.15.out',args=>q[-i'GT="AA"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.16.out',args=>q[-i'GT="aA"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.16.out',args=>q[-i'GT="Aa"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.17.out',args=>q[-i'GT="HOM"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.18.out',args=>q[-i'GT="HET"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.19.out',args=>q[-i'GT="HAP"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_sort($opts,in=>'sort',out=>'sort.out',args=>q[-m 0],fmt=>'%CHROM\\t%POS\\n');
+test_vcf_sort($opts,in=>'sort',out=>'sort.out',args=>q[-m 1000],fmt=>'%CHROM\\t%POS\\n');
test_vcf_regions($opts,in=>'regions');
test_vcf_annotate($opts,in=>'annotate',tab=>'annotate',out=>'annotate.out',args=>'-c CHROM,POS,REF,ALT,ID,QUAL,INFO/T_INT,INFO/T_FLOAT,INDEL');
test_vcf_annotate($opts,in=>'annotate',tab=>'annotate2',out=>'annotate2.out',args=>'-c CHROM,FROM,TO,T_STR');
@@ -275,8 +291,8 @@ test_vcf_consensus($opts,in=>'consensus',out=>'consensus.1.out',fa=>'consensus.f
test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.1.chain',chain=>'consensus.1.chain',fa=>'consensus.fa',mask=>'consensus.tab',args=>'');
test_vcf_consensus($opts,in=>'consensus',out=>'consensus.2.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-H 1');
test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.2.chain',chain=>'consensus.2.chain',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-H 1');
-test_vcf_consensus($opts,in=>'consensus',out=>'consensus.3.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-i');
-test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.3.chain',chain=>'consensus.3.chain',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-i');
+test_vcf_consensus($opts,in=>'consensus',out=>'consensus.3.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-I');
+test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.3.chain',chain=>'consensus.3.chain',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-I');
test_vcf_consensus($opts,in=>'consensus',out=>'consensus.4.out',fa=>'consensus.fa',args=>'-H 1');
test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.4.chain',chain=>'consensus.4.chain',fa=>'consensus.fa',args=>'-H 1');
test_vcf_consensus($opts,in=>'consensus2',out=>'consensus2.1.out',fa=>'consensus2.fa',args=>'-H 1');
@@ -703,6 +719,13 @@ sub test_vcf_filter
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools filter $args{args} $$opts{path}/$args{in}.vcf | $pipe");
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools filter -Ob $args{args} $$opts{path}/$args{in}.vcf | $$opts{bin}/bcftools view | $pipe");
}
+sub test_vcf_sort
+{
+ my ($opts,%args) = @_;
+ my $pipe = "$$opts{bin}/bcftools query -f '$args{fmt}'";
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools sort $args{args} $$opts{path}/$args{in}.vcf | $pipe");
+ test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools sort -Ob $args{args} $$opts{path}/$args{in}.vcf | $$opts{bin}/bcftools view | $pipe");
+}
sub test_vcf_regions
{
my ($opts,%args) = @_;
@@ -1088,4 +1111,3 @@ sub test_csq_real
}
closedir($dh);
}
-
diff --git a/vcfcnv.c b/vcfcnv.c
index ffe71c4..11c55bd 100644
--- a/vcfcnv.c
+++ b/vcfcnv.c
@@ -212,8 +212,14 @@ static double *init_iprobs(int ndim, double same_prob)
static void init_sample_files(sample_t *smpl, char *dir)
{
smpl->dat_fh = open_file(&smpl->dat_fname,"w","%s/dat.%s.tab",dir,smpl->name);
+ if ( !smpl->dat_fh ) error("Error opening file: %s/dat.%s.tab\n",dir,smpl->name);
+
smpl->cn_fh = open_file(&smpl->cn_fname,"w","%s/cn.%s.tab",dir,smpl->name);
+ if ( !smpl->cn_fh ) error("Error opening file: %s/cn.%s.tab\n",dir,smpl->name);
+
smpl->summary_fh = open_file(&smpl->summary_fname,"w","%s/summary.%s.tab",dir,smpl->name);
+ if ( !smpl->summary_fh ) error("Error opening file: %s/summary.%s.tab\n",dir,smpl->name);
+
fprintf(smpl->dat_fh,"# [1]Chromosome\t[2]Position\t[3]BAF\t[4]LRR\n");
fprintf(smpl->cn_fh,"# [1]Chromosome\t[2]Position\t[3]CN\t[4]P(CN0)\t[5]P(CN1)\t[6]P(CN2)\t[7]P(CN3)\n");
fprintf(smpl->summary_fh,"# RG, Regions [2]Chromosome\t[3]Start\t[4]End\t[5]Copy Number state\t[6]Quality\t[7]nSites\t[8]nHETs\n");
diff --git a/vcfconvert.c b/vcfconvert.c
index 8f596d4..1e28ad8 100644
--- a/vcfconvert.c
+++ b/vcfconvert.c
@@ -1316,12 +1316,13 @@ static void gvcf_to_vcf(args_t *args)
}
// check if alleles compatible with being a gVCF record
+ // ALT must be one of ., <*>, <X>, <NON_REF>
+ // check for INFO/END is below
int i, gallele = -1;
if (line->n_allele==1)
gallele = 0; // illumina/bcftools-call gvcf (if INFO/END present)
- else
+ else if ( line->d.allele[1][0]=='<' )
{
- if ( line->d.allele[1][0]!='<' ) continue;
for (i=1; i<line->n_allele; i++)
{
if ( line->d.allele[i][1]=='*' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // mpileup/spec compliant gVCF
diff --git a/vcfindex.c b/vcfindex.c
index aa60fb2..807fedd 100644
--- a/vcfindex.c
+++ b/vcfindex.c
@@ -32,6 +32,7 @@ DEALINGS IN THE SOFTWARE. */
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <htslib/kstring.h>
+#include <htslib/bgzf.h>
#include "bcftools.h"
#define BCF_LIDX_SHIFT 14
@@ -208,6 +209,12 @@ int main_vcfindex(int argc, char *argv[])
return 1;
}
}
+
+ // check for truncated files, allow only with -f
+ BGZF *fp = bgzf_open(fname, "r");
+ if ( !fp ) error("index: failed to open %s\n", fname);
+ if ( bgzf_check_EOF(fp)!=1 ) error("index: the input is probably truncated, use -f to index anyway: %s\n", fname);
+ if ( bgzf_close(fp)!=0 ) error("index: close failed: %s\n", fname);
}
int ret = bcf_index_build3(fname, idx_fname.s, min_shift, n_threads);
diff --git a/vcfisec.c b/vcfisec.c
index 9eb3a7c..3e0e1e5 100644
--- a/vcfisec.c
+++ b/vcfisec.c
@@ -82,13 +82,13 @@ void mkdir_p(const char *fmt, ...)
while (*p)
{
while (*p && *p!='/') p++;
- if ( *p )
- {
- *p = 0;
- mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
- *p = '/';
- p++;
- }
+ if ( !*p ) break;
+ char ctmp = *p;
+ *p = 0;
+ int ret = mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+ if ( ret!=0 && errno!=EEXIST ) error("Error creating directory %s: %s\n", path,strerror(errno));
+ *p = ctmp;
+ while ( *p && *p=='/' ) p++;
}
free(tmp);
free(path);
diff --git a/vcfmerge.c b/vcfmerge.c
index e9ed5ad..31f5dad 100644
--- a/vcfmerge.c
+++ b/vcfmerge.c
@@ -662,7 +662,6 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb)
}
// new allele
map[i] = *nb;
- if ( b[*nb] ) free(b[*nb]);
b[*nb] = const_ai ? strdup(ai) : ai;
(*nb)++;
}
@@ -1668,6 +1667,11 @@ void gvcf_set_alleles(args_t *args)
bcf_srs_t *files = args->files;
maux_t *maux = args->maux;
gvcf_aux_t *gaux = maux->gvcf;
+ for (i=0; i<maux->nals; i++)
+ {
+ free(maux->als[i]);
+ maux->als[i] = NULL;
+ }
maux->nals = 0;
for (i=0; i<files->nreaders; i++)
@@ -2025,9 +2029,15 @@ int can_merge(args_t *args)
maux_t *maux = args->maux;
gvcf_aux_t *gaux = maux->gvcf;
char *id = NULL, ref = 'N';
+ int i,j,k, ntodo = 0;
+
+ for (i=0; i<maux->nals; i++)
+ {
+ free(maux->als[i]);
+ maux->als[i] = NULL;
+ }
maux->var_types = maux->nals = 0;
- int i,j,k, ntodo = 0;
for (i=0; i<files->nreaders; i++)
{
buffer_t *buf = &maux->buf[i];
diff --git a/vcfnorm.c b/vcfnorm.c
index 86c20ab..bc51018 100644
--- a/vcfnorm.c
+++ b/vcfnorm.c
@@ -1514,6 +1514,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
{
bcf1_t *line;
int i, k;
+ int prev_rid = -1, prev_pos = -1, prev_type = 0;
for (i=0; i<n; i++)
{
k = rbuf_shift(&args->rbuf);
@@ -1534,6 +1535,23 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
continue;
}
}
+ else if ( args->rmdup )
+ {
+ int line_type = bcf_get_variant_types(args->lines[k]);
+ if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos )
+ {
+ if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue;
+ if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
+ if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+ }
+ else
+ {
+ prev_rid = args->lines[k]->rid;
+ prev_pos = args->lines[k]->pos;
+ prev_type = 0;
+ }
+ prev_type |= line_type;
+ }
bcf_write1(file, args->hdr, args->lines[k]);
}
if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n )
diff --git a/vcfquery.c b/vcfquery.c
index ab4c100..04554f8 100644
--- a/vcfquery.c
+++ b/vcfquery.c
@@ -32,6 +32,7 @@ THE SOFTWARE. */
#include <sys/types.h>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
+#include <htslib/khash_str2int.h>
#include <htslib/vcfutils.h>
#include "bcftools.h"
#include "filter.h"
@@ -151,10 +152,26 @@ static void query_vcf(args_t *args)
static void list_columns(args_t *args)
{
+ void *has_sample = NULL;
+ if ( args->sample_list )
+ {
+ has_sample = khash_str2int_init();
+ int i, nsmpl;
+ char **smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl);
+ for (i=0; i<nsmpl; i++) khash_str2int_inc(has_sample, smpl[i]);
+ free(smpl);
+ }
+
int i;
bcf_sr_t *reader = &args->files->readers[0];
for (i=0; i<bcf_hdr_nsamples(reader->header); i++)
+ {
+ if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) continue;
printf("%s\n", reader->header->samples[i]);
+ }
+
+ if ( has_sample )
+ khash_str2int_destroy_free(has_sample);
}
static char **copy_header(bcf_hdr_t *hdr, char **src, int nsrc)
diff --git a/vcfsort.c b/vcfsort.c
new file mode 100644
index 0000000..e41b628
--- /dev/null
+++ b/vcfsort.c
@@ -0,0 +1,306 @@
+/* vcfsort.c -- sort subcommand
+
+ Copyright (C) 2017 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/kstring.h>
+#include "kheap.h"
+#include "bcftools.h"
+
+typedef struct
+{
+ char *fname;
+ htsFile *fh;
+ bcf1_t *rec;
+}
+blk_t;
+
+typedef struct _args_t
+{
+ bcf_hdr_t *hdr;
+ char **argv, *fname, *output_fname, *tmp_dir;
+ int argc, output_type;
+ size_t max_mem, mem;
+ bcf1_t **buf;
+ size_t nbuf, mbuf, nblk;
+ blk_t *blk;
+}
+args_t;
+
+int cmp_bcf_pos(const void *aptr, const void *bptr)
+{
+ bcf1_t *a = *((bcf1_t**)aptr);
+ bcf1_t *b = *((bcf1_t**)bptr);
+ if ( a->rid < b->rid ) return -1;
+ if ( a->rid > b->rid ) return 1;
+ if ( a->pos < b->pos ) return -1;
+ if ( a->pos > b->pos ) return 1;
+ return 0;
+}
+
+void buf_flush(args_t *args)
+{
+ if ( !args->nbuf ) return;
+
+ qsort(args->buf, args->nbuf, sizeof(*args->buf), cmp_bcf_pos);
+
+ args->nblk++;
+ args->blk = (blk_t*) realloc(args->blk, sizeof(blk_t)*args->nblk);
+ blk_t *blk = args->blk + args->nblk - 1;
+
+ kstring_t str = {0,0,0};
+ ksprintf(&str, "%s/%05d.bcf", args->tmp_dir, (int)args->nblk);
+ blk->fname = str.s;
+
+ htsFile *fh = hts_open(blk->fname, "wbu");
+ if ( fh == NULL ) error("Cannot write %s: %s\n", blk->fname, strerror(errno));
+ bcf_hdr_write(fh, args->hdr);
+
+ int i;
+ for (i=0; i<args->nbuf; i++)
+ {
+ bcf_write(fh, args->hdr, args->buf[i]);
+ bcf_destroy(args->buf[i]);
+ }
+ hts_close(fh);
+
+ args->nbuf = 0;
+ args->mem = 0;
+}
+
+void buf_push(args_t *args, bcf1_t *rec)
+{
+ int delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + sizeof(bcf1_t*);
+ if ( args->mem + delta > args->max_mem ) buf_flush(args);
+ args->nbuf++;
+ args->mem += delta;
+ hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf);
+ args->buf[args->nbuf-1] = rec;
+}
+
+void sort_blocks(args_t *args)
+{
+ htsFile *in = hts_open(args->fname, "r");
+ if ( !in ) error("Could not read %s\n", args->fname);
+ args->hdr = bcf_hdr_read(in);
+
+ while ( 1 )
+ {
+ bcf1_t *rec = bcf_init();
+ int ret = bcf_read1(in, args->hdr, rec);
+ if ( ret < -1 ) error("Error encountered while parsing the input\n");
+ if ( ret == -1 )
+ {
+ bcf_destroy(rec);
+ break;
+ }
+ buf_push(args, rec);
+ }
+ buf_flush(args);
+ free(args->buf);
+
+ if ( hts_close(in)!=0 ) error("Close failed: %s\n", args->fname);
+}
+
+static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr)
+{
+ blk_t *a = *aptr;
+ blk_t *b = *bptr;
+ if ( a->rec->rid < b->rec->rid ) return 1;
+ if ( a->rec->rid > b->rec->rid ) return 0;
+ if ( a->rec->pos < b->rec->pos ) return 1;
+ return 0;
+}
+KHEAP_INIT(blk, blk_t*, blk_is_smaller)
+
+void blk_read(khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk)
+{
+ if ( !blk->fh ) return;
+ int ret = bcf_read(blk->fh, hdr, blk->rec);
+ if ( ret < -1 ) error("Error reading %s\n", blk->fname);
+ if ( ret == -1 )
+ {
+ if ( hts_close(blk->fh)!=0 ) error("Close failed: %s\n", blk->fname);
+ blk->fh = 0;
+ return;
+ }
+ khp_insert(blk, bhp, &blk);
+}
+
+void merge_blocks(args_t *args)
+{
+ fprintf(stderr,"Merging %d temporary files\n", (int)args->nblk);
+
+ khp_blk_t *bhp = khp_init(blk);
+
+ int i;
+ for (i=0; i<args->nblk; i++)
+ {
+ blk_t *blk = args->blk + i;
+ blk->fh = hts_open(blk->fname, "r");
+ if ( !blk->fh ) error("Could not read %s: %s\n", blk->fname, strerror(errno));
+ bcf_hdr_t *hdr = bcf_hdr_read(blk->fh);
+ bcf_hdr_destroy(hdr);
+ blk->rec = bcf_init();
+ blk_read(bhp, args->hdr, blk);
+ }
+
+ htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+ bcf_hdr_write(out, args->hdr);
+ while ( bhp->ndat )
+ {
+ blk_t *blk = bhp->dat[0];
+ bcf_write(out, args->hdr, blk->rec);
+ khp_delete(blk, bhp);
+ blk_read(bhp, args->hdr, blk);
+ }
+ if ( hts_close(out)!=0 ) error("Close failed: %s\n", args->output_fname);
+
+ fprintf(stderr,"Cleaning\n");
+ for (i=0; i<args->nblk; i++)
+ {
+ blk_t *blk = args->blk + i;
+ unlink(blk->fname);
+ free(blk->fname);
+ bcf_destroy(blk->rec);
+ }
+ rmdir(args->tmp_dir);
+ free(args->blk);
+ khp_destroy(blk, bhp);
+ fprintf(stderr,"Done\n");
+}
+
+static void usage(args_t *args)
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "About: Sort VCF/BCF file.\n");
+ fprintf(stderr, "Usage: bcftools sort [OPTIONS] <FILE.vcf>\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -m, --max-mem <float>[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6
+ fprintf(stderr, " -o, --output-file <file> output file name [stdout]\n");
+ fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(stderr, " -T, --temp-dir <dir> temporary files [/tmp/bcftools-sort.XXXXXX/]\n");
+ fprintf(stderr, "\n");
+ exit(1);
+}
+
+size_t parse_mem_string(char *str)
+{
+ char *tmp;
+ double mem = strtod(str, &tmp);
+ if ( tmp==str ) error("Could not parse: --max-mem %s\n", str);
+ if ( !strcasecmp("k",tmp) ) mem *= 1000;
+ else if ( !strcasecmp("m",tmp) ) mem *= 1000*1000;
+ else if ( !strcasecmp("g",tmp) ) mem *= 1000*1000*1000;
+ return mem;
+}
+
+void mkdir_p(const char *fmt, ...);
+void init(args_t *args)
+{
+ if ( !args->tmp_dir )
+ {
+ args->tmp_dir = strdup("/tmp/bcftools-sort.XXXXXX");
+ char *tmp_dir = mkdtemp(args->tmp_dir);
+ if ( !tmp_dir ) error("mkdtemp(%s) failed: %s\n", args->tmp_dir,strerror(errno));
+ }
+ else
+ {
+ args->tmp_dir = strdup(args->tmp_dir);
+ mkdir_p(args->tmp_dir);
+ }
+ fprintf(stderr,"Writing to %s\n", args->tmp_dir);
+}
+void destroy(args_t *args)
+{
+ bcf_hdr_destroy(args->hdr);
+ free(args->tmp_dir);
+ free(args);
+}
+
+int main_sort(int argc, char *argv[])
+{
+ int c;
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->max_mem = 768*1000*1000;
+ args->output_fname = "-";
+
+ static struct option loptions[] =
+ {
+ {"max-mem",required_argument,NULL,'m'},
+ {"temp-dir",required_argument,NULL,'T'},
+ {"output-type",required_argument,NULL,'O'},
+ {"output-file",required_argument,NULL,'o'},
+ {"help",no_argument,NULL,'h'},
+ {0,0,0,0}
+ };
+ while ((c = getopt_long(argc, argv, "m:T:O:o:h?",loptions,NULL)) >= 0)
+ {
+ switch (c)
+ {
+ case 'm': args->max_mem = parse_mem_string(optarg); break;
+ case 'T': args->tmp_dir = optarg; break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ };
+ break;
+ case 'h': usage(args);
+ case '?': usage(args);
+ default: error("Unknown argument: %s\n", optarg);
+ }
+ }
+
+ if ( optind>=argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin
+ else usage(args);
+ }
+ else args->fname = argv[optind];
+
+ init(args);
+ sort_blocks(args);
+ merge_blocks(args);
+ destroy(args);
+
+ return 0;
+}
diff --git a/vcfstats.c b/vcfstats.c
index 4041a5a..3b73173 100644
--- a/vcfstats.c
+++ b/vcfstats.c
@@ -87,6 +87,7 @@ typedef struct
int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1;
int subst[15];
int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl;
+ int *smpl_hapRef, *smpl_hapAlt;
int *smpl_indel_hets, *smpl_indel_homs;
int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
unsigned long int *smpl_dp;
@@ -472,6 +473,8 @@ static void init_stats(args_t *args)
stats->smpl_hets = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_homAA = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_hapRef = (int *) calloc(args->files->n_smpl,sizeof(int));
+ stats->smpl_hapAlt = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_indel_hets = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_indel_homs = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_ts = (int *) calloc(args->files->n_smpl,sizeof(int));
@@ -548,17 +551,19 @@ static void destroy_stats(args_t *args)
#endif
free(stats->insertions);
free(stats->deletions);
- if (stats->smpl_hets) free(stats->smpl_hets);
- if (stats->smpl_homAA) free(stats->smpl_homAA);
- if (stats->smpl_homRR) free(stats->smpl_homRR);
- if (stats->smpl_indel_homs) free(stats->smpl_indel_homs);
- if (stats->smpl_indel_hets) free(stats->smpl_indel_hets);
- if (stats->smpl_ts) free(stats->smpl_ts);
- if (stats->smpl_tv) free(stats->smpl_tv);
- if (stats->smpl_indels) free(stats->smpl_indels);
- if (stats->smpl_dp) free(stats->smpl_dp);
- if (stats->smpl_ndp) free(stats->smpl_ndp);
- if (stats->smpl_sngl) free(stats->smpl_sngl);
+ free(stats->smpl_hets);
+ free(stats->smpl_homAA);
+ free(stats->smpl_homRR);
+ free(stats->smpl_hapRef);
+ free(stats->smpl_hapAlt);
+ free(stats->smpl_indel_homs);
+ free(stats->smpl_indel_hets);
+ free(stats->smpl_ts);
+ free(stats->smpl_tv);
+ free(stats->smpl_indels);
+ free(stats->smpl_dp);
+ free(stats->smpl_ndp);
+ free(stats->smpl_sngl);
idist_destroy(&stats->dp);
idist_destroy(&stats->dp_sites);
for (j=0; j<stats->nusr; j++)
@@ -861,6 +866,8 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
assert( ial<line->n_allele );
stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
}
+ if ( gt == GT_HAPL_R ) stats->smpl_hapRef[is]++;
+ if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[is]++;
continue;
}
if ( gt != GT_HOM_RR ) { n_nref++; i_nref = is; }
@@ -873,7 +880,10 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
case GT_HOM_AA: nalt_tot++; break;
}
#endif
- if ( line_type&VCF_SNP || line_type==VCF_REF ) // count ALT=. as SNP
+ int var_type = 0;
+ if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial);
+ if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal);
+ if ( var_type&VCF_SNP || var_type==VCF_REF ) // count ALT=. as SNP
{
if ( gt == GT_HET_RA ) stats->smpl_hets[is]++;
else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++;
@@ -889,7 +899,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
stats->smpl_tv[is]++;
}
}
- if ( line_type&VCF_INDEL )
+ if ( var_type&VCF_INDEL )
{
if ( gt != GT_HOM_RR )
{
@@ -1068,7 +1078,7 @@ static void do_vcf_stats(args_t *args)
if ( line->n_allele>2 )
{
stats->n_mals++;
- if ( line_type == VCF_SNP ) stats->n_snp_mals++;
+ if ( line_type == VCF_SNP ) stats->n_snp_mals++; // note: this will be fooled by C>C,T
}
if ( files->n_smpl )
@@ -1125,7 +1135,22 @@ static void print_header(args_t *args)
static void print_stats(args_t *args)
{
int i, j,k, id;
- printf("# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n");
+ printf("# SN, Summary numbers:\n");
+ printf("# number of records .. number of data rows in the VCF\n");
+ printf("# number of no-ALTs .. reference-only sites, ALT is either \".\" or identical to REF\n");
+ printf("# number of SNPs .. number of rows with a SNP\n");
+ printf("# number of MNPs .. number of rows with a MNP, such as CC>TT\n");
+ printf("# number of indels .. number of rows with an indel\n");
+ printf("# number of others .. number of rows with other type, for example a symbolic allele or\n");
+ printf("# a complex substitution, such as ACT>TCGA\n");
+ printf("# number of multiallelic sites .. number of rows with multiple alternate alleles\n");
+ printf("# number of multiallelic SNP sites .. number of rows with multiple alternate alleles, all SNPs\n");
+ printf("# \n");
+ printf("# Note that rows containing multiple types will be counted multiple times, in each\n");
+ printf("# counter. For example, a row with a SNP and an indel increments both the SNP and\n");
+ printf("# the indel counter.\n");
+ printf("# \n");
+ printf("# SN\t[2]id\t[3]key\t[4]value\n");
for (id=0; id<args->files->nreaders; id++)
printf("SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header));
for (id=0; id<args->nstats; id++)
@@ -1470,16 +1495,18 @@ static void print_stats(args_t *args)
if ( args->files->n_smpl )
{
- printf("# PSC, Per-sample counts\n# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons\n");
+ printf("# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. Haploid counts include both SNPs and indels.\n");
+ printf("# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons"
+ "\t[12]nHapRef\t[13]nHapAlt\n");
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
for (i=0; i<args->files->n_smpl; i++)
{
float dp = stats->smpl_ndp[i] ? stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0;
- printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\n", id,args->files->samples[i],
+ printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\n", id,args->files->samples[i],
stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i],
- stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i]);
+ stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i], stats->smpl_hapAlt[i]);
}
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/bcftools.git
More information about the debian-med-commit
mailing list