[med-svn] [bcftools] 01/05: New upstream version 1.6

Andreas Tille tille at debian.org
Mon Dec 11 14:37:20 UTC 2017


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository bcftools.

commit 01bdfe5461861d96114f4312b341e37e3d1006e0
Author: Andreas Tille <tille at debian.org>
Date:   Mon Dec 11 13:22:00 2017 +0100

    New upstream version 1.6
---
 Makefile               |   4 +-
 NEWS                   |  33 +++
 config.mk.in           |   1 +
 consensus.c            | 129 ++++++++--
 csq.c                  | 133 +++++-----
 doc/bcftools.1         | 179 ++++++++++++--
 doc/bcftools.html      | 116 +++++++--
 doc/bcftools.txt       |  98 +++++++-
 filter.c               | 652 +++++++++++++++++++++++++++++++++++--------------
 kheap.h                |   5 +-
 main.c                 |   5 +
 misc/plot-roh.py       |  38 +--
 misc/plot-vcfstats     |  21 +-
 misc/run-roh.pl        |   2 +-
 mpileup.c              |   9 +-
 plugins/check-ploidy.c | 165 +++++++++++++
 plugins/mendelian.c    | 110 +++++++--
 plugins/setGT.c        | 264 +++++++++++++++-----
 test/check.chk         |  22 +-
 test/convert.gvcf.out  |   2 +-
 test/convert.gvcf.vcf  |   2 +-
 test/filter.12.out     |   1 +
 test/filter.13.out     |   2 +
 test/filter.14.out     |   8 +
 test/filter.15.out     |   2 +
 test/filter.16.out     |   3 +
 test/filter.17.out     |   3 +
 test/filter.18.out     |  11 +
 test/filter.19.out     |   1 +
 test/mendelian.1.out   |  40 ++-
 test/mendelian.2.out   |  29 ++-
 test/mendelian.3.out   |  23 +-
 test/mendelian.vcf     |  40 ++-
 test/query.34.out      |   1 +
 test/query.35.out      |   1 +
 test/query.36.out      |   1 +
 test/sort.out          |  15 ++
 test/sort.vcf          |  21 ++
 test/stats.B.chk       |   6 +-
 test/stats.chk         |  18 +-
 test/stats.counts.chk  |  37 +++
 test/stats.counts.vcf  |  18 ++
 test/test.pl           |  28 ++-
 vcfcnv.c               |   6 +
 vcfconvert.c           |   5 +-
 vcfindex.c             |   7 +
 vcfisec.c              |  14 +-
 vcfmerge.c             |  14 +-
 vcfnorm.c              |  18 ++
 vcfquery.c             |  17 ++
 vcfsort.c              | 306 +++++++++++++++++++++++
 vcfstats.c             |  63 +++--
 52 files changed, 2245 insertions(+), 504 deletions(-)

diff --git a/Makefile b/Makefile
index 7e87b08..8e0f04d 100644
--- a/Makefile
+++ b/Makefile
@@ -39,6 +39,7 @@ OBJS     = main.o vcfindex.o tabix.o \
            vcfcnv.o HMM.o vcfplugin.o consensus.o ploidy.o bin.o hclust.o version.o \
            regidx.o smpl_ilist.o csq.o vcfbuf.o \
            mpileup.o bam2bcf.o bam2bcf_indel.o bam_sample.o \
+           vcfsort.o \
            ccall.o em.o prob1.o kmin.o # the original samtools calling
 
 prefix      = /usr/local
@@ -92,7 +93,7 @@ endif
 
 include config.mk
 
-PACKAGE_VERSION = 1.5
+PACKAGE_VERSION = 1.6
 
 # If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
 # description of the working tree: either a release tag with the same value
@@ -202,6 +203,7 @@ vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vc
 vcfroh.o: vcfroh.c $(roh_h)
 vcfcnv.o: vcfcnv.c $(cnv_h)
 vcfsom.o: vcfsom.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h)
+vcfsort.o: vcfsort.c $(htslib_vcf_h) $(bcftools_h)
 vcfstats.o: vcfstats.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) $(bin_h)
 vcfview.o: vcfview.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h)
 reheader.o: reheader.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_kseq_h) $(bcftools_h)
diff --git a/NEWS b/NEWS
index 5ae6521..c05420b 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,36 @@
+## Release 1.6 (September 2017)
+
+* New `sort` command.
+
+* New options added to the `consensus` command. Note that the `-i, --iupac`
+  option has been renamed to `-I, --iupac`, in favor of the standard 
+  `-i, --include`.
+
+* Filtering expressions (`-i/-e`): support for `GT=<type>` expressions and
+  for lists and ranges (#639) - see the man page for details.
+
+* `csq`: relax some GFF3 parsing restrictions to enable using Ensembl
+  GFF3 files for plants (#667)
+
+* `stats`: add further documentation to output stats files (#316) and
+  include haploid counts in per-sample output (#671).
+
+* `plot-vcfstats`: further fixes for Python3 (@nsoranzo, #645, #666).
+
+* `query` bugfix (#632)
+
+* `+setGT` plugin: new option to set genotypes based on a two-tailed binomial
+  distribution test. Also, allow combining `-i/-e` with `-t q`.
+
+* `mpileup`: fix typo (#636)
+
+* `convert --gvcf2vcf` bugfix (#641)
+
+* `+mendelian`: recognize some mendelian inconsistencies that were
+  being missed (@oronnavon, #660), also add support for multiallelic
+  sites and sex chromosomes.
+
+
 ## Release 1.5 (June 2017)
 
 * Added autoconf support to bcftools. See `INSTALL` for more details.
diff --git a/config.mk.in b/config.mk.in
index df70d55..8b54368 100644
--- a/config.mk.in
+++ b/config.mk.in
@@ -31,6 +31,7 @@
 prefix       = @prefix@
 exec_prefix  = @exec_prefix@
 bindir       = @bindir@
+libexecdir   = @libexecdir@
 datarootdir  = @datarootdir@
 mandir       = @mandir@
 
diff --git a/consensus.c b/consensus.c
index 258ef14..544eca6 100644
--- a/consensus.c
+++ b/consensus.c
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2014 Genome Research Ltd.
+   Copyright (c) 2014-2017 Genome Research Ltd.
 
    Author: Petr Danecek <pd3 at sanger.ac.uk>
    
@@ -39,6 +39,16 @@
 #include "regidx.h"
 #include "bcftools.h"
 #include "rbuf.h"
+#include "filter.h"
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+#define PICK_REF   1
+#define PICK_ALT   2
+#define PICK_LONG  4
+#define PICK_SHORT 8
 
 typedef struct
 {
@@ -75,12 +85,16 @@ typedef struct
     chain_t *chain;     // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
                         // Note that the chain is re-initialised for each chromosome/seq_region
 
+    filter_t *filter;
+    char *filter_str;
+    int filter_logic;   // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+
     bcf_srs_t *files;
     bcf_hdr_t *hdr;
     FILE *fp_out;
     FILE *fp_chain;
     char **argv;
-    int argc, output_iupac, haplotype, isample;
+    int argc, output_iupac, haplotype, allele, isample;
     char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname;
 }
 args_t;
@@ -195,7 +209,7 @@ static void init_data(args_t *args)
         args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample);
         if ( args->isample<0 ) error("No such sample: %s\n", args->sample);
     }
-    if ( args->haplotype && args->isample<0 )
+    if ( (args->haplotype || args->allele) && args->isample<0 )
     {
         if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
         args->isample = 0;
@@ -220,10 +234,14 @@ static void init_data(args_t *args)
         if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
     }
     else args->fp_out = stdout;
+    if ( args->isample<0 ) fprintf(stderr,"Note: the --sample option not given, applying all records\n");
+    if ( args->filter_str )
+        args->filter = filter_init(args->hdr, args->filter_str);
 }
 
 static void destroy_data(args_t *args)
 {
+    if (args->filter) filter_destroy(args->filter);
     bcf_sr_destroy(args->files);
     int i;
     for (i=0; i<args->vcf_rbuf.m; i++)
@@ -287,9 +305,16 @@ static bcf1_t **next_vcf_line(args_t *args)
         int i = rbuf_shift(&args->vcf_rbuf);
         return &args->vcf_buf[i];
     }
-    else if ( bcf_sr_next_line(args->files) )
+    while ( bcf_sr_next_line(args->files) )
+    {
+        if ( args->filter )
+        {
+            int is_ok = filter_test(args->filter, bcf_sr_get_line(args->files,0), NULL);
+            if ( args->filter_logic & FLT_EXCLUDE ) is_ok = is_ok ? 0 : 1;
+            if ( !is_ok ) continue;
+        }
         return &args->files->readers[0].buffer[0];
-
+    }
     return NULL;
 }
 static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr)
@@ -358,33 +383,36 @@ static void apply_variant(args_t *args, bcf1_t *rec)
     int i, ialt = 1;
     if ( args->isample >= 0 )
     {
+        bcf_unpack(rec, BCF_UN_FMT);
         bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
         if ( !fmt ) return;
+
+        if ( fmt->type!=BCF_BT_INT8 )
+            error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%d?\n",bcf_seqname(args->hdr,rec),rec->pos+1);
+        uint8_t *ptr = fmt->p + fmt->size*args->isample;
+
         if ( args->haplotype )
         {
             if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
-            uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + args->haplotype - 1;
-            ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
+            ialt = ptr[args->haplotype-1];
             if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
             ialt = bcf_gt_allele(ialt);
         }
         else if ( args->output_iupac ) 
         {
-            uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample;
-            ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
+            ialt = ptr[0];
             if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
             ialt = bcf_gt_allele(ialt);
 
             int jalt;
             if ( fmt->n>1 )
             {
-                ptr = fmt->p + fmt->size*args->isample + 1;
-                jalt = bcf_dec_int1(ptr, fmt->type, &ignore);
+                jalt = ptr[1];
                 if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt;
                 else jalt = bcf_gt_allele(jalt);
             }
             else jalt = ialt;
-            if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+            if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
             if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
             {
                 char ial = rec->d.allele[ialt][0];
@@ -394,13 +422,40 @@ static void apply_variant(args_t *args, bcf1_t *rec)
         }
         else
         {
+            int is_hom = 1;
             for (i=0; i<fmt->n; i++)
             {
-                uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + i;
-                ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
-                if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
-                ialt = bcf_gt_allele(ialt);
-                if ( ialt ) break;
+                if ( bcf_gt_is_missing(ptr[i]) ) return;  // ignore missing or half-missing genotypes
+                if ( ptr[i]==bcf_int32_vector_end ) break;
+                ialt = bcf_gt_allele(ptr[i]);
+                if ( i>0 && ialt!=bcf_gt_allele(ptr[i-1]) ) { is_hom = 0; break; }
+            }
+            if ( !is_hom )
+            {
+                int prev_len = 0, jalt;
+                for (i=0; i<fmt->n; i++)
+                {
+                    if ( ptr[i]==bcf_int32_vector_end ) break;
+                    jalt = bcf_gt_allele(ptr[i]);
+                    if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+                    if ( args->allele & (PICK_LONG|PICK_SHORT) )
+                    {
+                        int len = jalt==0 ? rec->rlen : strlen(rec->d.allele[jalt]);
+                        if ( i==0 ) ialt = jalt, prev_len = len;
+                        else if ( len == prev_len )
+                        {
+                            if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt, prev_len = len;
+                            else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt, prev_len = len;
+                        }
+                        else if ( args->allele & PICK_LONG && len > prev_len ) ialt = jalt, prev_len = len;
+                        else if ( args->allele & PICK_SHORT && len < prev_len ) ialt = jalt, prev_len = len;
+                    }
+                    else
+                    {
+                        if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt;
+                        else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt;
+                    }
+                }
             }
         }
         if ( !ialt ) return;  // ref allele
@@ -623,12 +678,21 @@ static void usage(args_t *args)
     fprintf(stderr, "       information, such as INFO/AD or FORMAT/AD.\n");
     fprintf(stderr, "Usage:   bcftools consensus [OPTIONS] <file.vcf>\n");
     fprintf(stderr, "Options:\n");
+    fprintf(stderr, "    -c, --chain <file>         write a chain file for liftover\n");
+    fprintf(stderr, "    -e, --exclude <expr>       exclude sites for which the expression is true (see man page for details)\n");
     fprintf(stderr, "    -f, --fasta-ref <file>     reference sequence in fasta format\n");
-    fprintf(stderr, "    -H, --haplotype <1|2>      apply variants for the given haplotype\n");
-    fprintf(stderr, "    -i, --iupac-codes          output variants in the form of IUPAC ambiguity codes\n");
+    fprintf(stderr, "    -H, --haplotype <which>    choose which allele to use from the FORMAT/GT field, note\n");
+    fprintf(stderr, "                               the codes are case-insensitive:\n");
+    fprintf(stderr, "                                   1: first allele from GT\n");
+    fprintf(stderr, "                                   2: second allele\n");
+    fprintf(stderr, "                                   R: REF allele in het genotypes\n");
+    fprintf(stderr, "                                   A: ALT allele\n");
+    fprintf(stderr, "                                   LR,LA: longer allele and REF/ALT if equal length\n");
+    fprintf(stderr, "                                   SR,SA: shorter allele and REF/ALT if equal length\n");
+    fprintf(stderr, "    -i, --include <expr>       select sites for which the expression is true (see man page for details)\n");
+    fprintf(stderr, "    -I, --iupac-codes          output variants in the form of IUPAC ambiguity codes\n");
     fprintf(stderr, "    -m, --mask <file>          replace regions with N\n");
     fprintf(stderr, "    -o, --output <file>        write output to a file [standard output]\n");
-    fprintf(stderr, "    -c, --chain <file>         write a chain file for liftover\n");
     fprintf(stderr, "    -s, --sample <name>        apply variants of the given sample\n");
     fprintf(stderr, "Examples:\n");
     fprintf(stderr, "   # Get the consensus for one region. The fasta header lines are then expected\n");
@@ -645,8 +709,10 @@ int main_consensus(int argc, char *argv[])
 
     static struct option loptions[] = 
     {
+        {"exclude",required_argument,NULL,'e'},
+        {"include",required_argument,NULL,'i'},
         {"sample",1,0,'s'},
-        {"iupac-codes",0,0,'i'},
+        {"iupac-codes",0,0,'I'},
         {"haplotype",1,0,'H'},
         {"output",1,0,'o'},
         {"fasta-ref",1,0,'f'},
@@ -655,19 +721,32 @@ int main_consensus(int argc, char *argv[])
         {0,0,0,0}
     };
     int c;
-    while ((c = getopt_long(argc, argv, "h?s:1iH:f:o:m:c:",loptions,NULL)) >= 0) 
+    while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:",loptions,NULL)) >= 0) 
     {
         switch (c) 
         {
             case 's': args->sample = optarg; break;
             case 'o': args->output_fname = optarg; break;
-            case 'i': args->output_iupac = 1; break;
+            case 'I': args->output_iupac = 1; break;
+            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
             case 'f': args->ref_fname = optarg; break;
             case 'm': args->mask_fname = optarg; break;
             case 'c': args->chain_fname = optarg; break;
             case 'H': 
-                args->haplotype = optarg[0] - '0'; 
-                if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n");
+                if ( !strcasecmp(optarg,"R") ) args->allele |= PICK_REF;
+                else if ( !strcasecmp(optarg,"A") ) args->allele |= PICK_ALT;
+                else if ( !strcasecmp(optarg,"L") ) args->allele |= PICK_LONG|PICK_REF;
+                else if ( !strcasecmp(optarg,"S") ) args->allele |= PICK_SHORT|PICK_REF;
+                else if ( !strcasecmp(optarg,"LR") ) args->allele |= PICK_LONG|PICK_REF;
+                else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT;
+                else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF;
+                else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT;
+                else
+                {
+                    args->haplotype = optarg[0] - '0'; 
+                    if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n");
+                }
                 break;
             default: usage(args); break;
         }
diff --git a/csq.c b/csq.c
index b1db103..94ac442 100644
--- a/csq.c
+++ b/csq.c
@@ -164,17 +164,6 @@
 #define N_SPLICE_REGION_EXON   3 
 #define N_SPLICE_REGION_INTRON 8 
 
-// Ensembl ID format, e.g. 
-//     ENST00000423372 for human .. ENST%011d
-//  ENSMUST00000120394 for mouse .. ENSMUST%011d
-char  ENSID_BUF[32], *ENSID_FMT = NULL;
-static inline char *ENSID(uint32_t id)
-{
-    sprintf(ENSID_BUF,ENSID_FMT,id);
-    return ENSID_BUF;
-}
-
-
 #define N_REF_PAD 10    // number of bases to avoid boundary effects
 
 #define STRAND_REV 0
@@ -509,7 +498,6 @@ hap_t;
         temporary list of all exons, CDS, UTRs 
 */
 KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
-KHASH_MAP_INIT_INT(int2int, int)
 KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
 typedef struct
 {
@@ -522,25 +510,41 @@ typedef struct
     uint32_t iseq:29;
 }
 ftr_t;
+/*
+    Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001)
+    to integer id.  To keep the memory requirements low, the original version
+    relied on IDs in the form of a string prefix and a numerical id.  However,
+    it turns out that this assumption is not valid for some ensembl GFFs, see
+    for example Zea_mays.AGPv4.36.gff3.gz
+ */
+typedef struct
+{
+    void *str2id;       // khash_str2int
+    int nstr, mstr;
+    char **str;         // numeric id to string
+}
+id_tbl_t;
 typedef struct
 {
     // all exons, CDS, UTRs
     ftr_t *ftr;
     int nftr, mftr;
 
-    // mapping from transcript ensembl id to gene id
+    // mapping from gene id to gf_gene_t
     kh_int2gene_t *gid2gene;
 
     // mapping from transcript id to tscript, for quick CDS anchoring
     kh_int2tscript_t *id2tr;
 
     // sequences
-    void *seq2int;
+    void *seq2int;  // str2int hash
     char **seq;
     int nseq, mseq;
 
     // ignored biotypes
     void *ignored_biotypes;
+
+    id_tbl_t gene_ids;   // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx
 }
 aux_t;
 
@@ -590,6 +594,7 @@ typedef struct _args_t
     int nrm_tr, mrm_tr;
     csq_t *csq_buf;             // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
     int ncsq_buf, mcsq_buf;
+    id_tbl_t tscript_ids;       // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
 
     faidx_t *fai;
     kstring_t str, str2;
@@ -694,33 +699,38 @@ static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg,
     if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
     return se+1;
 }
-static inline uint32_t gff_parse_id(const char *line, const char *needle, char *ss)
+static void gff_id_init(id_tbl_t *tbl)
 {
-    ss = strstr(ss,needle);
-    if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
-    ss += strlen(needle);
-    while ( *ss && !isdigit(*ss) ) ss++;
-    if ( !ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line);
-    char *se;
-    uint32_t id = strtol(ss, &se, 10);
-    if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line);
-    if ( *se && *se!=';' && *se!='\t' ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-    assert( id <= 0xffffff );   // see gf_gene_t.id. Ensembl IDs are never that big in practice
-    return id;
+    memset(tbl, 0, sizeof(*tbl));
+    tbl->str2id = khash_str2int_init();
+}
+static void gff_id_destroy(id_tbl_t *tbl)
+{
+    khash_str2int_destroy_free(tbl->str2id);
+    free(tbl->str);
 }
-static void gff_parse_ensid_fmt(const char *line, const char *needle, char *ss)
+static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char *needle, char *ss)
 {
-    ss = strstr(ss,needle);
+    ss = strstr(ss,needle);     // e.g. "ID=transcript:"
     if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
     ss += strlen(needle);
+
     char *se = ss;
-    while ( *se && !isdigit(*se) ) se++;
-    kstring_t str = {0,0,0};
-    kputsn(ss,se-ss,&str);
-    ss = se;
-    while ( *se && isdigit(*se) ) se++;
-    ksprintf(&str,"%%0%dd",(int)(se-ss));
-    ENSID_FMT = str.s;
+    while ( *se && *se!=';' && !isspace(*se) ) se++;
+    char tmp = *se;
+    *se = 0;
+
+    int id;
+    if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 )
+    {
+        id = tbl->nstr++;
+        hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
+        tbl->str[id] = strdup(ss);
+        int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id);
+    }
+    *se = tmp;
+
+    return id;
 }
 static inline int gff_parse_type(char *line)
 {
@@ -880,10 +890,8 @@ void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr)
     }
 
     // create a mapping from transcript_id to gene_id
-    uint32_t trid = gff_parse_id(line, "ID=transcript:", ss);
-    uint32_t gene_id = gff_parse_id(line, "Parent=gene:", ss);
-
-    if ( !ENSID_FMT ) gff_parse_ensid_fmt(line, "ID=transcript:", ss);      // id prefix different across species
+    uint32_t trid = gff_id_parse(&args->tscript_ids, line, "ID=transcript:", ss);
+    uint32_t gene_id = gff_id_parse(&args->init.gene_ids, line, "Parent=gene:", ss);
 
     tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
     tr->id     = trid;
@@ -910,7 +918,7 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha
     aux_t *aux = &args->init;
 
     // substring search for "ID=gene:ENSG00000437963"
-    uint32_t gene_id = gff_parse_id(line, "ID=gene:", ss);
+    uint32_t gene_id = gff_id_parse(&aux->gene_ids, line, "ID=gene:", ss);
     gf_gene_t *gene = gene_init(aux, gene_id);
     assert( !gene->name );      // the gene_id should be unique
 
@@ -918,13 +926,17 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha
 
     // substring search for "Name=OR4F5"
     ss = strstr(chr_end+2,"Name=");
-    if ( !ss ) error("Could not parse the line, \"Name=\" not present: %s\n", line);
-    ss += 5;
-    char *se = ss;
-    while ( *se && *se!=';' && !isspace(*se) ) se++;
-    gene->name = (char*) malloc(se-ss+1);
-    memcpy(gene->name,ss,se-ss);
-    gene->name[se-ss] = 0;
+    if ( ss )
+    {
+        ss += 5;
+        char *se = ss;
+        while ( *se && *se!=';' && !isspace(*se) ) se++;
+        gene->name = (char*) malloc(se-ss+1);
+        memcpy(gene->name,ss,se-ss);
+        gene->name[se-ss] = 0;
+    }
+    else
+        gene->name = strdup(aux->gene_ids.str[gene_id]); // Name=<GeneName> field is not present, use the gene ID instead
 }
 int gff_parse(args_t *args, char *line, ftr_t *ftr)
 {
@@ -999,7 +1011,7 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr)
     ss += 2;
 
     // substring search for "Parent=transcript:ENST00000437963"
-    ftr->trid = gff_parse_id(line, "Parent=transcript:", ss);
+    ftr->trid = gff_id_parse(&args->tscript_ids, line, "Parent=transcript:", ss);
     ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
     return 0;
 }
@@ -1104,7 +1116,7 @@ void tscript_init_cds(args_t *args)
             {
                 int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
                 if ( phase!=len%3)
-                    error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len);
+                    error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
                 assert( phase == len%3 );
                 len += tr->cds[i]->len; 
             }
@@ -1132,7 +1144,7 @@ void tscript_init_cds(args_t *args)
             {
                 int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
                 if ( phase!=len%3)
-                    error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len);
+                    error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
                 len += tr->cds[i]->len;
             }
         }
@@ -1205,6 +1217,8 @@ void init_gff(args_t *args)
     aux->id2tr     = kh_init(int2tscript);   // transcript id to tscript_t
     args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL);
     aux->ignored_biotypes = khash_str2int_init();
+    gff_id_init(&aux->gene_ids);
+    gff_id_init(&args->tscript_ids);
 
     // parse gff
     kstring_t str = {0,0,0};
@@ -1252,7 +1266,7 @@ void init_gff(args_t *args)
         else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr);
         else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr);
         else
-            error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,ENSID(ftr->trid),gf_type2gff_string(ftr->type));
+            error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type));
     }
     tscript_init_cds(args);
 
@@ -1270,6 +1284,7 @@ void init_gff(args_t *args)
     // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
     kh_destroy(int2tscript,aux->id2tr);
     free(aux->seq);
+    gff_id_destroy(&aux->gene_ids);
 
     if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) )
     {
@@ -1409,7 +1424,7 @@ void destroy_data(args_t *args)
     free(args->gt_arr);
     free(args->str.s);
     free(args->str2.s);
-    free(ENSID_FMT);
+    gff_id_destroy(&args->tscript_ids);
 }
 
 /*
@@ -2491,7 +2506,7 @@ exit_duplicate:
 #define node2rend(i) (hap->stack[i].node->sbeg + hap->stack[i].node->rlen)
 #define node2rpos(i) (hap->stack[i].node->rec->pos)
 
-void kput_vcsq(vcsq_t *csq, kstring_t *str)
+void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
 {
     // Remove start/stop from incomplete CDS, but only if there is another
     // consequence as something must be reported
@@ -2520,7 +2535,7 @@ void kput_vcsq(vcsq_t *csq, kstring_t *str)
     if ( csq->gene ) kputs(csq->gene , str);
 
     kputc_('|', str);
-    if ( csq->type & CSQ_PRN_TSCRIPT ) ksprintf(str, "%s",ENSID(csq->trid));
+    if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str);
 
     kputc_('|', str);
     kputs(gf_type2gff_string(csq->biotype), str);
@@ -2889,7 +2904,7 @@ static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap)
         fprintf(args->out,"-");
 
     args->str.l = 0;
-    kput_vcsq(&csq->type, &args->str);
+    kput_vcsq(args, &csq->type, &args->str);
     fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
 }
 static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
@@ -2913,7 +2928,7 @@ static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ih
             fprintf(args->out,"-");
 
         args->str.l = 0;
-        kput_vcsq(&csq->type, &args->str);
+        kput_vcsq(args, &csq->type, &args->str);
         fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
     }
 }
@@ -3057,11 +3072,11 @@ void vbuf_flush(args_t *args)
             }
             
             args->str.l = 0;
-            kput_vcsq(&vrec->vcsq[0], &args->str);
+            kput_vcsq(args, &vrec->vcsq[0], &args->str);
             for (j=1; j<vrec->nvcsq; j++)
             {
                 kputc_(',', &args->str);
-                kput_vcsq(&vrec->vcsq[j], &args->str);
+                kput_vcsq(args, &vrec->vcsq[j], &args->str);
             }
             bcf_update_info_string(args->hdr, vrec->line, args->bcsq_tag, args->str.s);
             if ( args->hdr_nsmpl )
@@ -3665,7 +3680,7 @@ void process(args_t *args, bcf1_t **rec_ptr)
     return;
 }
 
-const char *usage(void)
+static const char *usage(void)
 {
     return 
         "\n"
diff --git a/doc/bcftools.1 b/doc/bcftools.1
index 0df9450..80312cd 100644
--- a/doc/bcftools.1
+++ b/doc/bcftools.1
@@ -2,12 +2,12 @@
 .\"     Title: bcftools
 .\"    Author: [see the "AUTHORS" section]
 .\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
-.\"      Date: 2017-06-21
+.\"      Date: 2017-09-28
 .\"    Manual: \ \&
 .\"    Source: \ \&
 .\"  Language: English
 .\"
-.TH "BCFTOOLS" "1" "2017\-06\-21" "\ \&" "\ \&"
+.TH "BCFTOOLS" "1" "2017\-09\-28" "\ \&" "\ \&"
 .\" -----------------------------------------------------------------
 .\" * Define some portability stuff
 .\" -----------------------------------------------------------------
@@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica
 BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&.
 .SS "VERSION"
 .sp
-This manual page was last updated \fB2017\-06\-21\fR and refers to bcftools git version \fB1\&.5\fR\&.
+This manual page was last updated \fB2017\-09\-28\fR and refers to bcftools git version \fB1\&.6\fR\&.
 .SS "BCF1"
 .sp
 The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&.
@@ -318,6 +318,19 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F
 .IP \(bu 2.3
 .\}
 
+\fBsort\fR
+\&.\&. sort VCF/BCF files
+.RE
+.sp
+.RS 4
+.ie n \{\
+\h'-04'\(bu\h'+03'\c
+.\}
+.el \{\
+.sp -1
+.IP \(bu 2.3
+.\}
+
 \fBstats\fR
 \&.\&. produce VCF/BCF stats (former vcfcheck)
 .RE
@@ -461,7 +474,7 @@ cannot be used in combination with
 .PP
 \fB\-s, \-\-samples\fR [^]\fILIST\fR
 .RS 4
-Comma\-separated list of samples to include or exclude if prefixed with "^"\&. Note that in general tags such as INFO/AC, INFO/AN, etc are not updated to correspond to the subset samples\&.
+Comma\-separated list of samples to include or exclude if prefixed with "^"\&. The sample order is updated to reflect that given on the command line\&. Note that in general tags such as INFO/AC, INFO/AN, etc are not updated to correspond to the subset samples\&.
 \fBbcftools view\fR
 is the exception where some tags will be updated (unless the
 \fB\-I, \-\-no\-update\fR
@@ -486,7 +499,7 @@ into that command\&. For example:
 .RS 4
 File of sample names to include or exclude if prefixed with "^"\&. One sample per line\&. See also the note above for the
 \fB\-s, \-\-samples\fR
-option\&. The command
+option\&. The sample order is updated to reflect that given in the input file\&. The command
 \fBbcftools call\fR
 accepts an optional second column indicating ploidy (0, 1 or 2) or sex (as defined by
 \fB\-\-ploidy\fR, for example "F" or "M"), and can parse also PED files\&. If the second column is not present, the sex "F" is assumed\&. With
@@ -1062,12 +1075,12 @@ option can be used
 read allele frequencies from a tab\-delimited file with the columns CHR,POS,REF,ALT,AF
 .RE
 .PP
-*\-o, \-\-output\-dir \fIpath\fR
+\fB\-o, \-\-output\-dir\fR \fIpath\fR
 .RS 4
 output directory
 .RE
 .PP
-*\-p, \-\-plot\-threshold \fIfloat\fR
+\fB\-p, \-\-plot\-threshold\fR \fIfloat\fR
 .RS 4
 call
 \fBmatplotlib\fR
@@ -1252,18 +1265,78 @@ see
 .sp
 Create consensus sequence by applying VCF variants to a reference fasta file\&. By default, the program will apply all ALT variants to the reference fasta to obtain the consensus sequence\&. Using the \fB\-\-sample\fR (and, optionally, \fB\-\-haplotype\fR) option will apply genotype (haplotype) calls from FORMAT/GT\&. Note that the program does not act as a primitive variant caller and ignores allelic depth information, such as INFO/AD or FORMAT/AD\&. For that, consider using the \fBsetG [...]
 .PP
+\fB\-c, \-\-chain\fR \fIFILE\fR
+.RS 4
+write a chain file for liftover
+.RE
+.PP
+\fB\-e, \-\-exclude\fR \fIEXPRESSION\fR
+.RS 4
+exclude sites for which
+\fIEXPRESSION\fR
+is true\&. For valid expressions see
+\fBEXPRESSIONS\fR\&.
+.RE
+.PP
 \fB\-f, \-\-fasta\-ref\fR \fIFILE\fR
 .RS 4
 reference sequence in fasta format
 .RE
 .PP
-\fB\-H, \-\-haplotype\fR \fI1\fR|\fI2\fR
+\fB\-H, \-\-haplotype\fR \fI1\fR|\fI2\fR|\fIR\fR|\fIA\fR|\fILR\fR|\fILA\fR|\fISR\fR|\fISA\fR
+.RS 4
+choose which allele from the FORMAT/GT field to use (the codes are case\-insensitive):
+.PP
+\fI1\fR
+.RS 4
+the first allele
+.RE
+.PP
+\fI2\fR
+.RS 4
+the second allele
+.RE
+.PP
+\fIR\fR
+.RS 4
+the REF allele (in heterozygous genotypes)
+.RE
+.PP
+\fIA\fR
+.RS 4
+the ALT allele (in heterozygous genotypes)
+.RE
+.PP
+\fILR, LA\fR
+.RS 4
+the longer allele\&. If both have the same length, use the REF allele (LR), or the ALT allele (LA)
+.RE
+.PP
+\fISR, SA\fR
+.RS 4
+the shorter allele\&. If both have the same length, use the REF allele (SR), or the ALT allele (SA)
+.sp
+.if n \{\
+.RS 4
+.\}
+.nf
+This option requires *\-s*, unless exactly one sample is present in the VCF
+.fi
+.if n \{\
+.RE
+.\}
+.RE
+.RE
+.PP
+\fB\-i, \-\-include\fR \fIEXPRESSION\fR
 .RS 4
-apply variants for the given haplotype\&. This option requires
-\fB\-s\fR, unless exactly one sample is present in the VCF
+include only sites for which
+\fIEXPRESSION\fR
+is true\&. For valid expressions see
+\fBEXPRESSIONS\fR\&.
 .RE
 .PP
-\fB\-i, \-\-iupac\-codes\fR
+\fB\-I, \-\-iupac\-codes\fR
 .RS 4
 output variants in the form of IUPAC ambiguity codes
 .RE
@@ -2899,13 +2972,12 @@ and
 can swap alleles and will update genotypes (GT) and AC counts, but will not attempt to fix PL or other fields\&.
 .RE
 .PP
-\fB\-d, \-\-rm\-dup\fR \fIsnps\fR|\fIindels\fR|\fIboth\fR|\fIall\fR|\fInone\fR
+\fB\-d, \-\-rm\-dup\fR \fIsnps\fR|\fIindels\fR|\fIboth\fR|\fIany\fR
 .RS 4
 If a record is present multiple times, output only the first instance, see
 \fB\-\-collapse\fR
 in
-\fBCommon Options\fR\&. Requires
-\fB\-a, \-\-allow\-overlaps\fR\&.
+\fBCommon Options\fR\&.
 .RE
 .PP
 \fB\-D, \-\-remove\-duplicates\fR
@@ -3144,6 +3216,16 @@ find positions with wildly varying ALT allele frequency (Fisher test on FMT/AD)
 collect AF deviation stats and GT probability distribution given AF and assuming HWE
 .RE
 .PP
+\fBcheck\-ploidy\fR
+.RS 4
+check if ploidy of samples is consistent for all sites
+.RE
+.PP
+\fBcheck\-sparsity\fR
+.RS 4
+print samples without genotypes in a region or chromosome
+.RE
+.PP
 \fBcolor\-chrs\fR
 .RS 4
 color shared chromosomal segments, requires trio VCF with phased GTs
@@ -3171,7 +3253,7 @@ fill INFO or REF field based on values in a fasta file
 .PP
 \fBfill\-tags\fR
 .RS 4
-set INFO tags AF, AN, AC, NS, AC_Hom, AC_Het, AC_Hemi
+set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, HWE, MAF, NS
 .RE
 .PP
 \fBfix\-ploidy\fR
@@ -3199,6 +3281,11 @@ determine sample sex by checking genotype likelihoods (GL,PL) or genotypes (GT)
 add imputation information metrics to the INFO field based on selected FORMAT tags
 .RE
 .PP
+\fBisecGT\fR
+.RS 4
+compare two files and set non\-identical genotypes to missing
+.RE
+.PP
 \fBmendelian\fR
 .RS 4
 count Mendelian consistent / inconsistent genotypes\&.
@@ -3209,6 +3296,11 @@ count Mendelian consistent / inconsistent genotypes\&.
 sets missing genotypes ("\&./\&.") to ref allele ("0/0" or "0|0")
 .RE
 .PP
+\fBprune\fR
+.RS 4
+prune sites by missingness or linkage disequilibrium
+.RE
+.PP
 \fBsetGT\fR
 .RS 4
 general tool to set genotypes according to rules requested by the user
@@ -3845,6 +3937,29 @@ estimate HMM parameters using Baum\-Welch algorithm, using the convergence thres
 \fIFLOAT\fR, e\&.g\&. 1e\-10 (experimental)
 .RE
 .RE
+.SS "bcftools sort [\fIOPTIONS\fR] file\&.bcf"
+.PP
+\fB\-m, \-\-max\-mem\fR \fIFLOAT\fR[\fIkMG\fR]
+.RS 4
+Maximum memory to use\&. Approximate, affects the number of temporary files written to the disk\&. Note that if the command fails at this step because of too many open files, your system limit on the number of open files ("ulimit") may need to be increased\&.
+.RE
+.PP
+\fB\-o, \-\-output\fR \fIFILE\fR
+.RS 4
+see
+\fBCommon Options\fR
+.RE
+.PP
+\fB\-O, \-\-output\-type\fR \fIb\fR|\fIu\fR|\fIz\fR|\fIv\fR
+.RS 4
+see
+\fBCommon Options\fR
+.RE
+.PP
+\fB\-T, \-\-temp\-dir\fR \fIDIR\fR
+.RS 4
+Use this directory to store temporary files
+.RE
 .SS "bcftools stats [\fIOPTIONS\fR] \fIA\&.vcf\&.gz\fR [\fIB\&.vcf\&.gz\fR]"
 .sp
 Parses VCF or BCF and produces text file stats which is suitable for machine processing and can be plotted using \fBplot\-vcfstats\fR\&. When two files are given, the program generates separate stats for intersection and the complements\&. By default only sites are compared, \fB\-s\fR/\fB\-S\fR must given to include also sample columns\&. When one VCF file is specified on the command line, then stats by non\-reference allele frequency, depth distribution, stats by quality and per\-sample [...]
@@ -4479,6 +4594,35 @@ GT="\&.|\&.", GT="\&./\&.", GT="\&."
 .sp -1
 .IP \(bu 2.3
 .\}
+sample genotype: homozygous, heterozygous, haploid, ref\-ref hom, alt\-alt hom, ref\-alt het, alt\-alt het, haploid ref, haploid alt (case\-insensitive)
+.sp
+.if n \{\
+.RS 4
+.\}
+.nf
+GT="hom"
+GT="het"
+GT="hap"
+GT="RR"
+GT="AA"
+GT="RA" or GT="AR"
+GT="Aa" or GT="aA"
+GT="R"
+GT="A"
+.fi
+.if n \{\
+.RE
+.\}
+.RE
+.sp
+.RS 4
+.ie n \{\
+\h'-04'\(bu\h'+03'\c
+.\}
+.el \{\
+.sp -1
+.IP \(bu 2.3
+.\}
 TYPE for variant type in REF,ALT columns (indel,snp,mnp,ref,bnd,other)\&. Use the regex operator "\e~" to require at least one allele of the given type or the equal sign "=" to require that all alleles are of the given type\&. Compare
 .sp
 .if n \{\
@@ -4503,7 +4647,7 @@ TYPE!~"snp"
 .sp -1
 .IP \(bu 2.3
 .\}
-array subscripts, "*" for any field
+array subscripts (0\-based), "*" for any field, "\-" to indicate a range
 .sp
 .if n \{\
 .RS 4
@@ -4512,6 +4656,9 @@ array subscripts, "*" for any field
 (DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0\&.3
 DP4[*] == 0
 CSQ[*] ~ "missense_variant\&.*deleterious"
+FORMAT/DP[1\-3] > 10
+FORMAT/DP[1\-]  < 7
+FORMAT/DP[0,2\-4] > 20
 .fi
 .if n \{\
 .RE
diff --git a/doc/bcftools.html b/doc/bcftools.html
index 5104307..ac22e4b 100644
--- a/doc/bcftools.html
+++ b/doc/bcftools.html
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp25137632"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipu [...]
+<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp53408"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulat [...]
 Call Format (VCF) and its binary counterpart BCF. All commands work
 transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.</p><p>Most commands accept VCF, bgzipped VCF and BCF with filetype detected
 automatically even when streaming from a pipe. Indexed VCF and BCF
@@ -8,7 +8,7 @@ will work in all situations. Un-indexed VCF and BCF and streams will
 work in most, but not all situations. In general, whenever multiple VCFs are
 read simultaneously, they must be indexed and therefore also compressed.</p><p>BCFtools is designed to work on a stream. It regards an input file "-" as the
 standard input (stdin) and outputs to the standard output (stdout). Several
-commands can thus be  combined  with  Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2017-06-21</strong></span> and refers to bcftools git version <span class="strong"><strong>1.5</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools <= 0.1.19 is <span class="strong"><strong>not</strong></span>
+commands can thus be  combined  with  Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2017-09-28</strong></span> and refers to bcftools git version <span class="strong"><strong>1.6</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools <= 0.1.19 is <span class="strong"><strong>not</strong></span>
 compatible with this version of bcftools. To read BCF1 files one can use
 the view command from old versions of bcftools packaged with samtools
 versions <= 0.1.19 to convert to VCF, which can then be read by
@@ -57,6 +57,8 @@ list of available options, run <span class="strong"><strong>bcftools</strong></s
 </li><li class="listitem">
 <span class="strong"><strong><a class="link" href="#roh" title="bcftools roh [OPTIONS] file.vcf.gz">roh</a></strong></span>          ..  identify runs of homo/auto-zygosity
 </li><li class="listitem">
+<span class="strong"><strong><a class="link" href="#sort" title="bcftools sort [OPTIONS] file.bcf">sort</a></strong></span>        ..  sort VCF/BCF files
+</li><li class="listitem">
 <span class="strong"><strong><a class="link" href="#stats" title="bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz]">stats</a></strong></span>      ..  produce VCF/BCF stats (former vcfcheck)
 </li><li class="listitem">
 <span class="strong"><strong><a class="link" href="#view" title="bcftools view [OPTIONS] file.vcf.gz [REGION […]]">view</a></strong></span>        ..  subset, filter and convert VCF and BCF files
@@ -167,6 +169,7 @@ specific commands to see if they apply.</p><div class="variablelist"><dl><dt><sp
 </span></dt><dd>
     Comma-separated list of samples to include or exclude if prefixed
     with "^".
+    The sample order is updated to reflect that given on the command line.
     Note that in general tags such as INFO/AC, INFO/AN, etc are not updated
     to correspond to the subset samples. <span class="strong"><strong><a class="link" href="#view" title="bcftools view [OPTIONS] file.vcf.gz [REGION […]]">bcftools view</a></strong></span> is the
     exception where some tags will be updated (unless the <span class="strong"><strong>-I, --no-update</strong></span>
@@ -179,6 +182,7 @@ specific commands to see if they apply.</p><div class="variablelist"><dl><dt><sp
     File of sample names to include or exclude if prefixed with "^".
     One sample per line. See also the note above for the <span class="strong"><strong>-s, --samples</strong></span>
     option.
+    The sample order is updated to reflect that given in the input file.
     The command <span class="strong"><strong><a class="link" href="#call" title="bcftools call [OPTIONS] FILE">bcftools call</a></strong></span> accepts an optional second
     column indicating ploidy (0, 1 or 2) or sex (as defined by
     <span class="strong"><strong><a class="link" href="#ploidy">--ploidy</a></strong></span>, for example "F" or "M"), and can parse also PED
@@ -564,11 +568,11 @@ loss), 0 (complete loss), 3 (single-copy gain).</p><div class="refsect3" title="
 </span></dt><dd>
     read allele frequencies from  a tab-delimited file with the columns CHR,POS,REF,ALT,AF
 </dd><dt><span class="term">
-*-o, --output-dir <span class="emphasis"><em>path</em></span>
+<span class="strong"><strong>-o, --output-dir</strong></span> <span class="emphasis"><em>path</em></span>
 </span></dt><dd>
     output directory
 </dd><dt><span class="term">
-*-p, --plot-threshold <span class="emphasis"><em>float</em></span>
+<span class="strong"><strong>-p, --plot-threshold</strong></span> <span class="emphasis"><em>float</em></span>
 </span></dt><dd>
     call <span class="strong"><strong>matplotlib</strong></span> to produce plots for chromosomes with quality at least <span class="emphasis"><em>float</em></span>,
     useful for visual inspection of the calls. With <span class="strong"><strong>-p 0</strong></span>, plots for all chromosomes will be
@@ -717,16 +721,53 @@ obtain the consensus sequence. Using the <span class="strong"><strong>--sample</
 Note that the program does not act as a primitive variant caller and ignores allelic
 depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
 <span class="strong"><strong>setGT</strong></span> plugin.</p><div class="variablelist"><dl><dt><span class="term">
+<span class="strong"><strong>-c, --chain</strong></span> <span class="emphasis"><em>FILE</em></span>
+</span></dt><dd>
+    write a chain file for liftover
+</dd><dt><span class="term">
+<span class="strong"><strong>-e, --exclude</strong></span> <span class="emphasis"><em>EXPRESSION</em></span>
+</span></dt><dd>
+    exclude sites for which <span class="emphasis"><em>EXPRESSION</em></span> is true. For valid expressions see
+    <span class="strong"><strong><a class="link" href="#expressions" title="EXPRESSIONS">EXPRESSIONS</a></strong></span>.
+</dd><dt><span class="term">
 <span class="strong"><strong>-f, --fasta-ref</strong></span> <span class="emphasis"><em>FILE</em></span>
 </span></dt><dd>
     reference sequence in fasta format
 </dd><dt><span class="term">
-<span class="strong"><strong>-H, --haplotype</strong></span> <span class="emphasis"><em>1</em></span>|<span class="emphasis"><em>2</em></span>
+<span class="strong"><strong>-H, --haplotype</strong></span> <span class="emphasis"><em>1</em></span>|<span class="emphasis"><em>2</em></span>|<span class="emphasis"><em>R</em></span>|<span class="emphasis"><em>A</em></span>|<span class="emphasis"><em>LR</em></span>|<span class="emphasis"><em>LA</em></span>|<span class="emphasis"><em>SR</em></span>|<span class="emphasis"><em>SA</em></span>
+</span></dt><dd><p class="simpara">
+    choose which allele from the FORMAT/GT field to use (the codes are case-insensitive):
+</p><div class="variablelist"><dl><dt><span class="term">
+<span class="emphasis"><em>1</em></span>
+</span></dt><dd>
+            the first allele
+</dd><dt><span class="term">
+<span class="emphasis"><em>2</em></span>
+</span></dt><dd>
+            the second allele
+</dd><dt><span class="term">
+<span class="emphasis"><em>R</em></span>
+</span></dt><dd>
+            the REF allele (in heterozygous genotypes)
+</dd><dt><span class="term">
+<span class="emphasis"><em>A</em></span>
+</span></dt><dd>
+            the ALT allele (in heterozygous genotypes)
+</dd><dt><span class="term">
+<span class="emphasis"><em>LR, LA</em></span>
+</span></dt><dd>
+            the longer allele. If both have the same length, use the REF allele (LR), or the ALT allele  (LA)
+</dd><dt><span class="term">
+<span class="emphasis"><em>SR, SA</em></span>
+</span></dt><dd><p class="simpara">
+            the shorter allele. If both have the same length, use the REF allele (SR), or the ALT allele  (SA)
+</p><pre class="literallayout">This option requires *-s*, unless exactly one sample is present in the VCF</pre></dd></dl></div></dd><dt><span class="term">
+<span class="strong"><strong>-i, --include</strong></span> <span class="emphasis"><em>EXPRESSION</em></span>
 </span></dt><dd>
-    apply variants for the given haplotype. This option requires <span class="strong"><strong>-s</strong></span>, unless
-    exactly one sample is present in the VCF
+    include only sites for which <span class="emphasis"><em>EXPRESSION</em></span> is true. For valid expressions see
+    <span class="strong"><strong><a class="link" href="#expressions" title="EXPRESSIONS">EXPRESSIONS</a></strong></span>.
 </dd><dt><span class="term">
-<span class="strong"><strong>-i, --iupac-codes</strong></span>
+<span class="strong"><strong>-I, --iupac-codes</strong></span>
 </span></dt><dd>
     output variants in the form of IUPAC ambiguity codes
 </dd><dt><span class="term">
@@ -1732,11 +1773,10 @@ the <span class="strong"><strong><a class="link" href="#fasta_ref">--fasta-ref</
     can swap alleles and will update genotypes (GT) and AC counts,
     but will not attempt to fix PL or other fields.
 </dd><dt><span class="term">
-<span class="strong"><strong>-d, --rm-dup</strong></span> <span class="emphasis"><em>snps</em></span>|<span class="emphasis"><em>indels</em></span>|<span class="emphasis"><em>both</em></span>|<span class="emphasis"><em>all</em></span>|<span class="emphasis"><em>none</em></span>
+<span class="strong"><strong>-d, --rm-dup</strong></span> <span class="emphasis"><em>snps</em></span>|<span class="emphasis"><em>indels</em></span>|<span class="emphasis"><em>both</em></span>|<span class="emphasis"><em>any</em></span>
 </span></dt><dd>
     If a record is present multiple times, output only the first instance,
     see <span class="strong"><strong>--collapse</strong></span> in <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>.
-    Requires <span class="strong"><strong>-a, --allow-overlaps</strong></span>.
 </dd><dt><span class="term">
 <span class="strong"><strong>-D, --remove-duplicates</strong></span>
 </span></dt><dd>
@@ -1891,6 +1931,14 @@ the usage examples that each plugin comes with.</p><div class="refsect3" title="
 </span></dt><dd>
     collect AF deviation stats and GT probability distribution given AF and assuming HWE
 </dd><dt><span class="term">
+<span class="strong"><strong>check-ploidy</strong></span>
+</span></dt><dd>
+    check if ploidy of samples is consistent for all sites
+</dd><dt><span class="term">
+<span class="strong"><strong>check-sparsity</strong></span>
+</span></dt><dd>
+    print samples without genotypes in a region or chromosome
+</dd><dt><span class="term">
 <span class="strong"><strong>color-chrs</strong></span>
 </span></dt><dd>
     color shared chromosomal segments, requires trio VCF with phased GTs
@@ -1914,7 +1962,7 @@ the usage examples that each plugin comes with.</p><div class="refsect3" title="
 </dd><dt><span class="term">
 <span class="strong"><strong>fill-tags</strong></span>
 </span></dt><dd>
-    set INFO tags AF, AN, AC, NS, AC_Hom, AC_Het, AC_Hemi
+    set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, HWE, MAF, NS
 </dd><dt><span class="term">
 <span class="strong"><strong>fix-ploidy</strong></span>
 </span></dt><dd>
@@ -1937,6 +1985,10 @@ the usage examples that each plugin comes with.</p><div class="refsect3" title="
 </span></dt><dd>
     add imputation information metrics to the INFO field based on selected FORMAT tags
 </dd><dt><span class="term">
+<span class="strong"><strong>isecGT</strong></span>
+</span></dt><dd>
+    compare two files and set non-identical genotypes to missing
+</dd><dt><span class="term">
 <span class="strong"><strong>mendelian</strong></span>
 </span></dt><dd>
     count Mendelian consistent / inconsistent genotypes.
@@ -1945,6 +1997,10 @@ the usage examples that each plugin comes with.</p><div class="refsect3" title="
 </span></dt><dd>
     sets missing genotypes ("./.") to ref allele ("0/0" or "0|0")
 </dd><dt><span class="term">
+<span class="strong"><strong>prune</strong></span>
+</span></dt><dd>
+    prune sites by missingness or linkage disequilibrium
+</dd><dt><span class="term">
 <span class="strong"><strong>setGT</strong></span>
 </span></dt><dd>
     general tool to set genotypes according to rules requested by the user
@@ -2291,7 +2347,25 @@ Transition probabilities:
 </span></dt><dd>
     estimate HMM parameters using Baum-Welch algorithm, using the convergence threshold
     <span class="emphasis"><em>FLOAT</em></span>, e.g. 1e-10 (experimental)
-</dd></dl></div></div></div><div class="refsect2" title="bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz]"><a id="stats"></a><h3>bcftools stats [<span class="emphasis"><em>OPTIONS</em></span>] <span class="emphasis"><em>A.vcf.gz</em></span> [<span class="emphasis"><em>B.vcf.gz</em></span>]</h3><p>Parses VCF or BCF and produces text file stats which is suitable for machine
+</dd></dl></div></div></div><div class="refsect2" title="bcftools sort [OPTIONS] file.bcf"><a id="sort"></a><h3>bcftools sort [<span class="emphasis"><em>OPTIONS</em></span>] file.bcf</h3><div class="variablelist"><dl><dt><span class="term">
+<span class="strong"><strong>-m, --max-mem</strong></span> <span class="emphasis"><em>FLOAT</em></span>[<span class="emphasis"><em>kMG</em></span>]
+</span></dt><dd>
+    Maximum memory to use. Approximate, affects the number of temporary files written
+    to the disk. Note that if the command fails at this step because of too many open files,
+    your system limit on the number of open files ("ulimit") may need to be increased.
+</dd><dt><span class="term">
+<span class="strong"><strong>-o, --output</strong></span> <span class="emphasis"><em>FILE</em></span>
+</span></dt><dd>
+    see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
+</dd><dt><span class="term">
+<span class="strong"><strong>-O, --output-type</strong></span> <span class="emphasis"><em>b</em></span>|<span class="emphasis"><em>u</em></span>|<span class="emphasis"><em>z</em></span>|<span class="emphasis"><em>v</em></span>
+</span></dt><dd>
+    see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
+</dd><dt><span class="term">
+<span class="strong"><strong>-T, --temp-dir</strong></span> <span class="emphasis"><em>DIR</em></span>
+</span></dt><dd>
+    Use this directory to store temporary files
+</dd></dl></div></div><div class="refsect2" title="bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz]"><a id="stats"></a><h3>bcftools stats [<span class="emphasis"><em>OPTIONS</em></span>] <span class="emphasis"><em>A.vcf.gz</em></span> [<span class="emphasis"><em>B.vcf.gz</em></span>]</h3><p>Parses VCF or BCF and produces text file stats which is suitable for machine
 processing and can be plotted using <span class="strong"><strong><a class="link" href="#plot-vcfstats" title="plot-vcfstats [OPTIONS] file.vchk […]">plot-vcfstats</a></strong></span>.  When two files are given,
 the program generates separate stats for intersection and the complements. By
 default only sites are compared, <span class="strong"><strong>-s</strong></span>/<span class="strong"><strong>-S</strong></span> must given to include also sample
@@ -2607,6 +2681,17 @@ using these expressions
 missing genotypes can be matched including the phase and ploidy (".|.", "./.", ".")
 using these expressions
 </p><pre class="literallayout">GT=".|.", GT="./.", GT="."</pre></li><li class="listitem"><p class="simpara">
+sample genotype: homozygous, heterozygous, haploid, ref-ref hom, alt-alt hom,
+ref-alt het, alt-alt het, haploid ref, haploid alt (case-insensitive)
+</p><pre class="literallayout">GT="hom"
+GT="het"
+GT="hap"
+GT="RR"
+GT="AA"
+GT="RA" or GT="AR"
+GT="Aa" or GT="aA"
+GT="R"
+GT="A"</pre></li><li class="listitem"><p class="simpara">
 TYPE for variant type in REF,ALT columns (indel,snp,mnp,ref,bnd,other). Use the regex
 operator "\~" to require at least one allele of the given type or the equal sign "="
 to require that all alleles are of the given type. Compare
@@ -2614,10 +2699,13 @@ to require that all alleles are of the given type. Compare
 TYPE~"snp"
 TYPE!="snp"
 TYPE!~"snp"</pre></li><li class="listitem"><p class="simpara">
-array subscripts, "*" for any field
+array subscripts (0-based), "*" for any field, "-" to indicate a range
 </p><pre class="literallayout">(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0.3
 DP4[*] == 0
-CSQ[*] ~ "missense_variant.*deleterious"</pre></li><li class="listitem"><p class="simpara">
+CSQ[*] ~ "missense_variant.*deleterious"
+FORMAT/DP[1-3] > 10
+FORMAT/DP[1-]  < 7
+FORMAT/DP[0,2-4] > 20</pre></li><li class="listitem"><p class="simpara">
 function on FORMAT tags (over samples) and INFO tags (over vector fields)
 </p><pre class="literallayout">MAX, MIN, AVG, SUM, STRLEN, ABS</pre></li><li class="listitem"><p class="simpara">
 variables calculated on the fly if not present: number of alternate alleles;
diff --git a/doc/bcftools.txt b/doc/bcftools.txt
index a6ef03b..fc5c25a 100644
--- a/doc/bcftools.txt
+++ b/doc/bcftools.txt
@@ -93,6 +93,7 @@ list of available options, run *bcftools* 'COMMAND' without arguments.
 - *<<query,query>>*      ..  transform VCF/BCF into user-defined formats
 - *<<reheader,reheader>>*   ..  modify VCF/BCF header, change sample names
 - *<<roh,roh>>*          ..  identify runs of homo/auto-zygosity
+- *<<sort,sort>>*        ..  sort VCF/BCF files
 - *<<stats,stats>>*      ..  produce VCF/BCF stats (former vcfcheck)
 - *<<view,view>>*        ..  subset, filter and convert VCF and BCF files
 
@@ -201,6 +202,7 @@ specific commands to see if they apply.
 *-s, --samples* \[^]'LIST'::
     Comma-separated list of samples to include or exclude if prefixed
     with "^".
+    The sample order is updated to reflect that given on the command line.
     Note that in general tags such as INFO/AC, INFO/AN, etc are not updated
     to correspond to the subset samples. *<<view,bcftools view>>* is the
     exception where some tags will be updated (unless the *-I, --no-update*
@@ -215,6 +217,7 @@ specific commands to see if they apply.
     File of sample names to include or exclude if prefixed with "^".
     One sample per line. See also the note above for the *-s, --samples*
     option.
+    The sample order is updated to reflect that given in the input file.
     The command *<<call,bcftools call>>* accepts an optional second
     column indicating ploidy (0, 1 or 2) or sex (as defined by
     *<<ploidy,--ploidy>>*, for example "F" or "M"), and can parse also PED
@@ -601,10 +604,10 @@ loss), 0 (complete loss), 3 (single-copy gain).
 *-f, --AF-file* 'file'::
     read allele frequencies from  a tab-delimited file with the columns CHR,POS,REF,ALT,AF
 
-*-o, --output-dir 'path'::
+*-o, --output-dir* 'path'::
     output directory
 
-*-p, --plot-threshold 'float'::
+*-p, --plot-threshold* 'float'::
     call *matplotlib* to produce plots for chromosomes with quality at least 'float',
     useful for visual inspection of the calls. With *-p 0*, plots for all chromosomes will be
     generated. If not given, a *matplotlib* script will be created but not called.
@@ -739,14 +742,44 @@ Note that the program does not act as a primitive variant caller and ignores all
 depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
 *setGT* plugin.
 
+*-c, --chain* 'FILE'::
+    write a chain file for liftover
+
+*-e, --exclude* 'EXPRESSION'::
+    exclude sites for which 'EXPRESSION' is true. For valid expressions see
+    *<<expressions,EXPRESSIONS>>*.
+
 *-f, --fasta-ref* 'FILE'::
     reference sequence in fasta format
 
-*-H, --haplotype* '1'|'2'::
-    apply variants for the given haplotype. This option requires *-s*, unless
-    exactly one sample is present in the VCF
+*-H, --haplotype* '1'|'2'|'R'|'A'|'LR'|'LA'|'SR'|'SA'::
+    choose which allele from the FORMAT/GT field to use (the codes are case-insensitive):
+    
+        '1';;
+            the first allele
+
+        '2';;
+            the second allele
+
+        'R';;
+            the REF allele (in heterozygous genotypes)
+
+        'A';;
+            the ALT allele (in heterozygous genotypes)
+
+        'LR, LA';;
+            the longer allele. If both have the same length, use the REF allele (LR), or the ALT allele  (LA)
+
+        'SR, SA';;
+            the shorter allele. If both have the same length, use the REF allele (SR), or the ALT allele  (SA)
+
+    This option requires *-s*, unless exactly one sample is present in the VCF
+
+*-i, --include* 'EXPRESSION'::
+    include only sites for which 'EXPRESSION' is true. For valid expressions see
+    *<<expressions,EXPRESSIONS>>*.
 
-*-i, --iupac-codes*::
+*-I, --iupac-codes*::
     output variants in the form of IUPAC ambiguity codes
 
 *-m, --mask* 'FILE'::
@@ -1770,10 +1803,9 @@ the *<<fasta_ref,--fasta-ref>>* option is supplied.
     can swap alleles and will update genotypes (GT) and AC counts,
     but will not attempt to fix PL or other fields.
 
-*-d, --rm-dup* 'snps'|'indels'|'both'|'all'|'none'::
+*-d, --rm-dup* 'snps'|'indels'|'both'|'any'::
     If a record is present multiple times, output only the first instance,
     see *--collapse* in *<<common_options,Common Options>>*.
-    Requires *-a, --allow-overlaps*.
 
 *-D, --remove-duplicates*::
     If a record is present in multiple files, output only the first instance.
@@ -1915,6 +1947,12 @@ By default, appropriate system directories are searched for installed plugins.
 *af-dist*::
     collect AF deviation stats and GT probability distribution given AF and assuming HWE
 
+*check-ploidy*::
+    check if ploidy of samples is consistent for all sites
+
+*check-sparsity*::
+    print samples without genotypes in a region or chromosome
+
 *color-chrs*::
     color shared chromosomal segments, requires trio VCF with phased GTs
 
@@ -1932,7 +1970,7 @@ By default, appropriate system directories are searched for installed plugins.
     fill INFO or REF field based on values in a fasta file
 
 *fill-tags*::
-    set INFO tags AF, AN, AC, NS, AC_Hom, AC_Het, AC_Hemi
+    set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, HWE, MAF, NS
 
 *fix-ploidy*::
     sets correct ploidy
@@ -1950,12 +1988,18 @@ By default, appropriate system directories are searched for installed plugins.
 *impute-info*::
     add imputation information metrics to the INFO field based on selected FORMAT tags
 
+*isecGT*::
+    compare two files and set non-identical genotypes to missing
+
 *mendelian*::
     count Mendelian consistent / inconsistent genotypes.
 
 *missing2ref*::
     sets missing genotypes ("./.") to ref allele ("0/0" or "0|0")
 
+*prune*::
+    prune sites by missingness or linkage disequilibrium
+
 *setGT*::
     general tool to set genotypes according to rules requested by the user
 
@@ -2321,6 +2365,24 @@ Transition probabilities:
     'FLOAT', e.g. 1e-10 (experimental)
 
 
+[[sort]]
+=== bcftools sort ['OPTIONS'] file.bcf
+
+*-m, --max-mem* 'FLOAT'['kMG']::
+    Maximum memory to use. Approximate, affects the number of temporary files written
+    to the disk. Note that if the command fails at this step because of too many open files,
+    your system limit on the number of open files ("ulimit") may need to be increased.
+
+*-o, --output* 'FILE'::
+    see *<<common_options,Common Options>>*
+
+*-O, --output-type* 'b'|'u'|'z'|'v'::
+    see *<<common_options,Common Options>>*
+
+*-T, --temp-dir* 'DIR'::
+    Use this directory to store temporary files
+
+
 
 [[stats]]
 === bcftools stats ['OPTIONS'] 'A.vcf.gz' ['B.vcf.gz']
@@ -2647,6 +2709,19 @@ using these expressions
 
         GT=".|.", GT="./.", GT="."
 
+* sample genotype: homozygous, heterozygous, haploid, ref-ref hom, alt-alt hom,
+ref-alt het, alt-alt het, haploid ref, haploid alt (case-insensitive)
+
+        GT="hom"
+        GT="het"
+        GT="hap"
+        GT="RR"
+        GT="AA"
+        GT="RA" or GT="AR"
+        GT="Aa" or GT="aA"
+        GT="R"
+        GT="A"
+
 * TYPE for variant type in REF,ALT columns (indel,snp,mnp,ref,bnd,other). Use the regex
 operator "\~" to require at least one allele of the given type or the equal sign "="
 to require that all alleles are of the given type. Compare
@@ -2656,11 +2731,14 @@ to require that all alleles are of the given type. Compare
         TYPE!="snp"
         TYPE!~"snp"
 
-* array subscripts, "*" for any field
+* array subscripts (0-based), "*" for any field, "-" to indicate a range
 
         (DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0.3
         DP4[*] == 0
         CSQ[*] ~ "missense_variant.*deleterious"
+        FORMAT/DP[1-3] > 10
+        FORMAT/DP[1-]  < 7
+        FORMAT/DP[0,2-4] > 20
 
 * function on FORMAT tags (over samples) and INFO tags (over vector fields)
 
diff --git a/filter.c b/filter.c
index 78ff1f1..3dc91a7 100644
--- a/filter.c
+++ b/filter.c
@@ -67,21 +67,23 @@ typedef struct _token_t
     char *tag;          // for debugging and printout only, VCF tag name
     double threshold;   // filtering threshold
     int hdr_id, type;   // BCF header lookup ID and one of BCF_HT_* types
-    int idx;            // 0-based index to VCF vectors, -1: not a vector, -2: any field ([*])
+    int idx;            // 0-based index to VCF vectors, -1: not a vector, 
+                        //  -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
+    int *idxs, nidxs;   // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited
     void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
     int (*comparator)(struct _token_t *, struct _token_t *, int op_type, bcf1_t *);
     void *hash;         // test presence of str value in the hash via comparator
     regex_t *regex;     // precompiled regex for string comparison
 
     // modified on filter evaluation at each VCF line
-    double *values;     // In case str_value is set, values[0] is one sample's string length
-    char *str_value;    //  and values[0]*nsamples gives the total length;
+    double *values;
+    kstring_t str_value;
     int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues
     int pass_site;          // -1 not applicable, 0 fails, >0 pass
     uint8_t *pass_samples;  // status of individual samples
     int nsamples;           // number of samples
-    int nvalues, mvalues;   // number of used values, n=0 for missing values, n=1 for scalars
-                            // for strings, total length of str_value
+    int nvalues, mvalues;   // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l
+    int nstr1;              // per-sample string length, set only with str_value.l>0 && nsamples>1
 }
 token_t;
 
@@ -93,6 +95,7 @@ struct _filter_t
     token_t *filters, **flt_stack;  // filtering input tokens (in RPN) and evaluation stack
     int32_t *tmpi;
     float   *tmpf;
+    kstring_t tmps;
     int max_unpack, mtmpi, mtmpf, nsamples;
 };
 
@@ -169,6 +172,7 @@ static int filters_next_token(char **str, int *len)
         return TOK_VAL;
     }
 
+    int square_brackets = 0;
     while ( tmp[0] )
     {
         if ( tmp[0]=='"' ) break;
@@ -183,11 +187,12 @@ static int filters_next_token(char **str, int *len)
         if ( tmp[0]=='(' ) break;
         if ( tmp[0]==')' ) break;
         if ( tmp[0]=='+' ) break;
-        // hacky: so that [*] is not split, the tokenizer does not recognise square brackets []
-        if ( tmp[0]=='*' && (tmp==*str || tmp[-1]!='[') ) break;
-        if ( tmp[0]=='-' ) break;
+        if ( tmp[0]=='*' && !square_brackets ) break;
+        if ( tmp[0]=='-' && !square_brackets ) break;
         if ( tmp[0]=='/' ) break;
         if ( tmp[0]=='~' ) break;
+        if ( tmp[0]==']' ) { if (square_brackets) tmp++; break; }
+        if ( tmp[0]=='[' ) square_brackets++; 
         tmp++;
     }
     if ( tmp > *str )
@@ -270,12 +275,15 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
     else if ( line->d.info[i].type==BCF_BT_CHAR )
     {
         int n = line->d.info[i].len;
-        int m = (int)tok->values[0];
-        hts_expand(char,n+1,m,tok->str_value);
-        memcpy(tok->str_value,line->d.info[i].vptr,n);
-        tok->str_value[n] = 0;
-        tok->values[0] = m;
-        tok->nvalues   = n;
+        if ( n >= tok->str_value.m )
+        {
+            tok->str_value.m = n + 1;
+            tok->str_value.s = (char*) realloc(tok->str_value.s, tok->str_value.m);
+            if ( !tok->str_value.s ) error("Failed to alloc %d bytes\n", (int)tok->str_value.m);
+        }
+        memcpy(tok->str_value.s, line->d.info[i].vptr, n);
+        tok->str_value.s[n] = 0;
+        tok->nvalues = tok->str_value.l = n;
     }
     else if ( line->d.info[i].type==BCF_BT_FLOAT )
     {
@@ -285,10 +293,11 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
             tok->values[0] = line->d.info[i].v1.f;
             tok->nvalues   = 1;
         }
-        tok->str_value = NULL;
+        tok->str_value.l = 0;
     }
     else
     {
+        tok->str_value.l = 0;
         if ( line->d.info[i].type==BCF_BT_INT8 && line->d.info[i].v1.i==bcf_int8_missing ) tok->nvalues = 0;
         else if ( line->d.info[i].type==BCF_BT_INT16 && line->d.info[i].v1.i==bcf_int16_missing ) tok->nvalues = 0;
         else if ( line->d.info[i].type==BCF_BT_INT32 && line->d.info[i].v1.i==bcf_int32_missing ) tok->nvalues = 0;
@@ -297,7 +306,6 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
             tok->values[0] = line->d.info[i].v1.i;
             tok->nvalues   = 1;
         }
-        tok->str_value = NULL;
     }
 }
 static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
@@ -346,8 +354,8 @@ static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *lin
         return ret ? 0 : 1;
     }
 
-    if ( op_type==TOK_EQ ) return strcmp(btok->str_value,line->d.id) ? 0 : 1;
-    return strcmp(btok->str_value,line->d.id) ? 1 : 0;
+    if ( op_type==TOK_EQ ) return strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
+    return strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
 }
 
 /**
@@ -409,13 +417,16 @@ static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok)
 {
     if ( tok->idx==-2 )
     {
-        int i;
         tok->nvalues = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi);
         if ( tok->nvalues<=0 ) tok->nvalues = 0;
         else
         {
             hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
-            for (i=0; i<tok->nvalues; i++) tok->values[i] = flt->tmpi[i];
+            int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? tok->nvalues - 1 : tok->nidxs - 1;
+            if ( end >= tok->nvalues ) end = tok->nvalues - 1;
+            for (i=0; i<=end; i++)
+                if ( i>=tok->nidxs || tok->idxs[i] ) tok->values[j++] = flt->tmpi[i];
+            tok->nvalues = j;
         }
     }
     else
@@ -435,15 +446,21 @@ static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok)
 {
     if ( tok->idx==-2 )
     {
-        int i;
         tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf);
         if ( tok->nvalues<=0 ) tok->nvalues = 0;
         else
         {
             hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
-            for (i=0; i<tok->nvalues; i++)
-                if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[i]);
-                else tok->values[i] = flt->tmpf[i];
+            int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? tok->nvalues - 1 : tok->nidxs - 1;
+            if ( end >= tok->nvalues ) end = tok->nvalues - 1;
+            for (i=0; i<=end; i++)
+                if ( i>=tok->nidxs || tok->idxs[i] )
+                {
+                    if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[j]);
+                    else tok->values[j] = flt->tmpf[i];
+                    j++;
+                }
+            tok->nvalues = j;
         }
     }
     else
@@ -461,33 +478,64 @@ static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok)
 
 static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok)
 {
-    int m = (int)tok->values[0];
-    int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value,&m);
-    if ( n<0 ) { tok->nvalues = 0; return; }
-    tok->values[0] = m;     // allocated length
+    int32_t m = tok->str_value.m;
+    int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value.s,&m);
+    tok->str_value.m = m;
+    if ( n<0 ) { tok->nvalues = tok->str_value.l = 0; return; }
 
     if ( tok->idx>=0 )
     {
         // get ith field (i=tok->idx)
         int i = 0;
-        char *ss = tok->str_value, *se = tok->str_value + n;
+        char *ss = tok->str_value.s, *se = tok->str_value.s + n;
         while ( ss<se && i<tok->idx )
         {
             if ( *ss==',' ) i++;
             ss++;
         }
-        if ( ss==se || i!=tok->idx ) { tok->nvalues = 0; return; }
+        if ( ss==se || i!=tok->idx ) { tok->nvalues = tok->str_value.l = 0; return; }
         se = ss;
-        while ( se-tok->str_value<n && *se!=',' ) se++;
-        if ( ss==tok->str_value ) *se = 0;
+        while ( se - tok->str_value.s < n && *se!=',' ) se++;
+        if ( ss==tok->str_value.s ) *se = 0;
         else
         {
-            memmove(tok->str_value,ss,se-ss);
-            tok->str_value[se-ss] = 0;
+            memmove(tok->str_value.s, ss, se-ss);
+            tok->str_value.s[se-ss] = 0;
         }
-        tok->nvalues = se-ss;
+        tok->str_value.l = se - ss;
     }
-    else if ( tok->idx==-2 ) tok->nvalues = n;
+    else if ( tok->idx==-2 && tok->idxs[0]==-1 )    // keep all values, TAG[*]
+        tok->str_value.l = n;
+    else if ( tok->idx==-2 )
+    {
+        flt->tmps.l = 0;
+        ks_resize(&flt->tmps, n);
+        int i, end = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1;
+        if ( end >= n ) end = n - 1;
+        char *beg = tok->str_value.s, *dst = flt->tmps.s;
+        for (i=0; i<=end; i++)
+        {
+            char *end = beg;
+            while ( *end && *end!=',' ) end++;
+
+            if ( i>=tok->nidxs || tok->idxs[i] )
+            {
+                memcpy(dst, beg, end - beg);
+                dst += end - beg;
+                dst[0] = ',';
+                dst++;
+            }
+
+            beg = end+1;
+        }
+        dst[0] = 0;
+        tok->str_value.l = dst - flt->tmps.s;
+
+        #define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
+        SWAP(char *, flt->tmps.s, tok->str_value.s);
+        SWAP(size_t, flt->tmps.m, tok->str_value.m);
+    }
+    tok->nvalues = tok->str_value.l;
 }
 
 static void filters_set_info_flag(filter_t *flt, bcf1_t *line, token_t *tok)
@@ -503,127 +551,266 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
 {
     int i;
     if ( (tok->nvalues=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 )
-        tok->nvalues = 0;
-    else
     {
+        tok->nvalues = tok->nsamples = 0;
+        return;
+    }
+    if ( tok->idx >= -1 )    // scalar or vector index
+    {
+        hts_expand(double,flt->nsamples,tok->mvalues,tok->values);
+        int nvals = tok->nvalues / line->n_sample;
+        int idx = tok->idx >= 0 ? tok->idx : 0;
         int is_missing = 1;
-        hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
-        for (i=0; i<tok->nvalues; i++)
+        int32_t *ptr = flt->tmpi;
+        for (i=0; i<line->n_sample; i++)
         {
-            if ( flt->tmpi[i]==bcf_int32_missing || flt->tmpi[i]==bcf_int32_vector_end )
+            if ( ptr[idx]==bcf_int32_missing || ptr[idx]==bcf_int32_vector_end ) 
                 bcf_double_set_missing(tok->values[i]);
             else
             {
-                tok->values[i] = flt->tmpi[i];
+                tok->values[i] = ptr[idx];
                 is_missing = 0;
             }
+            ptr += nvals;
         }
         if ( is_missing ) tok->nvalues = 0;
-        else if ( tok->idx >= 0 )
+        else tok->nvalues = line->n_sample;
+        tok->nsamples = tok->nvalues;
+        return;
+    }
+    if ( tok->idx == -2 )
+    {
+        hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+        int nvals = tok->nvalues / line->n_sample;
+        int idx = tok->idx >= 0 ? tok->idx : 0;
+        int is_missing = 1;
+        int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1;
+        if ( end >= nvals ) end = nvals - 1;
+        int32_t *ptr = flt->tmpi;
+        for (i=0; i<line->n_sample; i++)
         {
-            int nsmpl = bcf_hdr_nsamples(flt->hdr);
-            int nvals = tok->nvalues / nsmpl;
-            if ( tok->idx >= nvals )
-                tok->nvalues = 0;  // the index is too big
-            else
-            {
-                for (i=0; i<nsmpl; i++)
-                    tok->values[i] = tok->values[i*nvals+tok->idx];
-                tok->nvalues = nsmpl;
-            }
+            for (k=0; k<=end; k++)
+                if ( k>=tok->nidxs || tok->idxs[k] )
+                {
+                    if ( ptr[k]==bcf_int32_missing || ptr[k]==bcf_int32_vector_end )
+                        bcf_double_set_missing(tok->values[j]);
+                    else
+                    {
+                        tok->values[j] = ptr[k];
+                        is_missing = 0;
+                    }
+                    j++;
+                }
+            ptr += nvals;
+        }
+        if ( is_missing ) tok->nvalues = tok->nsamples = 0;
+        else
+        {
+            tok->nsamples = line->n_sample;
+            tok->nvalues = j;
         }
+        return;
     }
-    tok->nsamples = tok->nvalues;
 }
 static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
 {
     int i;
-    if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<=0 )
+    if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 )
     {
-        tok->nvalues = tok->nsamples = 0;   // missing values
+        tok->nvalues = tok->nsamples = 0;
+        return;
     }
-    else
+    if ( tok->idx >= -1 )    // scalar or vector index
     {
+        hts_expand(double,flt->nsamples,tok->mvalues,tok->values);
+        int nvals = tok->nvalues / line->n_sample;
+        int idx = tok->idx >= 0 ? tok->idx : 0;
         int is_missing = 1;
-        hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
-        for (i=0; i<tok->nvalues; i++)
+        float *ptr = flt->tmpf;
+        for (i=0; i<line->n_sample; i++)
         {
-            if ( bcf_float_is_missing(flt->tmpf[i]) || bcf_float_is_vector_end(flt->tmpf[i]) )
+            if ( bcf_float_is_missing(ptr[idx]) || bcf_float_is_vector_end(ptr[idx]) )
                 bcf_double_set_missing(tok->values[i]);
             else
             {
-                tok->values[i] = flt->tmpf[i];
+                tok->values[i] = ptr[idx];
                 is_missing = 0;
             }
+            ptr += nvals;
         }
         if ( is_missing ) tok->nvalues = 0;
-        else if ( tok->idx >= 0 )
+        else tok->nvalues = line->n_sample;
+        tok->nsamples = tok->nvalues;
+        return;
+    }
+    if ( tok->idx == -2 )
+    {
+        hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+        int nvals = tok->nvalues / line->n_sample;
+        int idx = tok->idx >= 0 ? tok->idx : 0;
+        int is_missing = 1;
+        int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1;
+        if ( end >= nvals ) end = nvals - 1;
+        float *ptr = flt->tmpf;
+        for (i=0; i<line->n_sample; i++)
         {
-            int nsmpl = bcf_hdr_nsamples(flt->hdr);
-            int nvals = tok->nvalues / nsmpl;
-            if ( tok->idx >= nvals )
-                tok->nvalues = 0;  // the index is too big
-            else
-            {
-                for (i=0; i<nsmpl; i++)
-                    tok->values[i] = tok->values[i*nvals+tok->idx];
-                tok->nvalues = nsmpl;
-            }
+            for (k=0; k<=end; k++)
+                if ( k>=tok->nidxs || tok->idxs[k] )
+                {
+                    if ( bcf_float_is_missing(ptr[k]) || bcf_float_is_vector_end(ptr[k]) )
+                        bcf_double_set_missing(tok->values[j]);
+                    else
+                    {
+                        tok->values[j] = ptr[k];
+                        is_missing = 0;
+                    }
+                    j++;
+                }
+            ptr += nvals;
+        }
+        if ( is_missing ) tok->nvalues = tok->nsamples = 0;
+        else
+        {
+            tok->nsamples = line->n_sample;
+            tok->nvalues = j;
         }
+        return;
     }
-    tok->nsamples = tok->nvalues;
 }
 static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
 {
-    int ndim = tok->nsamples * (int)tok->values[0];
-    int ret = bcf_get_format_char(flt->hdr,line,tok->tag,&tok->str_value,&ndim);
+    tok->str_value.l = tok->nvalues = 0;
+    if ( !line->n_sample ) return;
 
-    int nsmpl = bcf_hdr_nsamples(flt->hdr);
-    ndim /= nsmpl;
-    tok->values[0] = ndim;
+    int ndim = tok->str_value.m;
+    int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim);
+    tok->str_value.m = ndim;
 
-    if ( ret<=0 )
-    {
-        tok->nvalues = 0;
-        return;
-    }
+    if ( nstr<=0 ) return;
 
-    if ( tok->idx < 0 ) // scalar
+    if ( tok->idx == -1 || (tok->idx==-2 && tok->idxs[0]==-1) ) // scalar or keep all values of a vector: TAG[*]
     {
-        tok->nvalues = tok->nsamples = nsmpl;
+        tok->nsamples = line->n_sample;
+        tok->nstr1 = ndim / line->n_sample;
+        tok->nvalues = tok->str_value.l = nstr;
         return;
     }
 
-    // vector
+    int nstr1 = nstr / line->n_sample;
+
+    // vector, one or multiple indices
     int i;
-    for (i=0; i<nsmpl; i++)
+    for (i=0; i<line->n_sample; i++)
     {
-        char *ss = tok->str_value + i*ndim;
-        int is = 0, ivec = 0;
-        while ( ivec<tok->idx && is<ndim && ss[is] )
-        {
-            if ( ss[is]==',' ) ivec++;
-            is++;
-        }
-        if ( ivec!=tok->idx || is==ndim || !ss[is] )
+        char *dst = tok->str_value.s + i*nstr1, *str = dst;
+        int nval = 0, ibeg = 0;
+        while ( ibeg < nstr1 )
         {
-            ss[0] = '.';
-            ss[1] = 0;
-            continue;
+            int iend = ibeg + 1;
+            while ( iend < nstr1 && str[iend] && str[iend]!=',' ) iend++;
+
+            int keep = 0;
+            if ( tok->idx >=0 )
+                keep = tok->idx==nval ? 1 : 0;
+            else if ( nval < tok->nidxs )
+                keep = tok->idxs[nval] ? 1 : 0;
+            else if ( tok->idxs[tok->nidxs-1] < 0 )
+                keep = 1;
+
+            if ( keep )
+            {
+                if ( ibeg>0 ) memmove(dst, str+ibeg, iend-ibeg+1);
+                dst += iend - ibeg + 1;
+                if ( tok->idx>=0 ) break;
+            }
+            if ( !str[iend] ) break;
+            ibeg = iend + 1;
+            nval++;
         }
-        int ie = is;
-        while ( ie<ndim && ss[ie] && ss[ie]!=',' ) ie++;
-        if ( is ) memmove(ss,&ss[is],ie-is);
-        if ( ndim-(ie-is) ) memset(ss+ie-is,0,ndim-(ie-is));
+        if ( dst==str ) { dst[0] = '.'; dst+=2; }
+        if ( dst - str < nstr1 ) memset(dst-1, 0, nstr1 - (dst - str));
     }
-    if ( !ndim )
+    tok->nvalues = tok->str_value.l = nstr;
+    tok->nstr1 = nstr1;
+    tok->nsamples = line->n_sample;
+}
+static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type)
+{
+    bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
+    if ( !fmt )
     {
-        tok->nvalues = 0;
+        tok->nvalues = tok->str_value.l = 0;
         return;
     }
-    tok->nvalues  = ret;
+    
+    int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals = type==2 ? 3 : 4;
+    if ( tok->str_value.m <= nvals*nsmpl )
+    {
+        tok->str_value.m = nvals*nsmpl + 1;
+        tok->str_value.s = (char*)realloc(tok->str_value.s, tok->str_value.m);
+    }
+
+#define BRANCH_INT(type_t,vector_end) \
+    { \
+        for (i=0; i<line->n_sample; i++) \
+        { \
+            type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \
+            int is_het = 0, has_ref = 0, missing = 0; \
+            for (j=0; j<fmt->n; j++) \
+            { \
+                if ( ptr[j]==vector_end ) break; /* smaller ploidy */ \
+                if ( bcf_gt_is_missing(ptr[j]) ) { missing=1; break; } /* missing allele */ \
+                int ial = ptr[j]; \
+                if ( bcf_gt_allele(ial)==0 ) has_ref = 1; \
+                if ( j>0 ) \
+                { \
+                    int jal = ptr[j-1]; \
+                    if ( bcf_gt_allele(ial)!=bcf_gt_allele(jal) ) is_het = 1; \
+                } \
+            } \
+            char *dst = &tok->str_value.s[nvals*i]; \
+            if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \
+            else if ( type==3 ) \
+            { \
+                if ( j==1 ) dst[0]='h', dst[1]='a', dst[2]='p', dst[3] = 0; /* hap, haploid */ \
+                else if ( !is_het ) dst[0]='h', dst[1]='o', dst[2]='m', dst[3] = 0; /* hom */ \
+                else dst[0]='h', dst[1]='e', dst[2]='t', dst[3] = 0; /* het */ \
+            } \
+            else \
+            { \
+                if ( j==1 ) \
+                { \
+                    if ( has_ref ) dst[0]='r', dst[1]=0; /* r, haploid */ \
+                    else dst[0]='a', dst[1]=0; /* a, haploid */ \
+                } \
+                else if ( !is_het ) \
+                { \
+                    if ( has_ref ) dst[0]='r', dst[1]='r', dst[2] = 0; /* rr */ \
+                    else dst[0]='a', dst[1]='a', dst[2] = 0; /* aa */ \
+                } \
+                else \
+                { \
+                    if ( has_ref ) dst[0]='r', dst[1]='a', dst[2] = 0; /* ra */ \
+                    else dst[0]='a', dst[1]='A', dst[2] = 0; /* aA */ \
+                } \
+            } \
+        } \
+    }
+    switch (fmt->type) {
+        case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
+        default: error("The GT type is not lineognised: %d at %s:%d\n",fmt->type, bcf_seqname(flt->hdr,line),line->pos+1); break;
+    }
+#undef BRANCH_INT
     tok->nsamples = nsmpl;
+    tok->nvalues = tok->str_value.l = nvals*nsmpl;
+    tok->str_value.s[tok->str_value.l] = 0;
+    tok->nstr1 = nvals;
 }
+static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); }
+static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); }
+
 static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
 {
     bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
@@ -636,67 +823,73 @@ static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *to
     kstring_t str;
 
 gt_length_too_big:
-    str.s = tok->str_value; str.m = tok->values[0] * nsmpl; str.l = 0;
+    tok->str_value.l = 0;
     for (i=0; i<nsmpl; i++)
     {
-        int plen = str.l;
+        int plen = tok->str_value.l;
 
-        bcf_format_gt(fmt, i, &str);
-        kputc_(0,&str);
-        if ( str.l - plen > blen )
+        bcf_format_gt(fmt, i, &tok->str_value);
+        kputc_(0, &tok->str_value);
+        if ( tok->str_value.l - plen > blen )
         {
             // too many alternate alleles or ploidy is too large, the genotype does not fit
             // three characters ("0/0" vs "10/10").
-            tok->str_value = str.s;
             blen *= 2;
             goto gt_length_too_big;
         }
 
-        plen = str.l - plen;
-        while ( plen<blen )
+        plen = tok->str_value.l - plen;
+        while ( plen < blen )
         {
-            kputc_(0, &str);
+            kputc_(0, &tok->str_value);
             plen++;
         }
     }
-    tok->nvalues = str.l;
     tok->nsamples = nsmpl;
-    tok->values[0] = blen;
-    tok->str_value = str.s;
+    tok->nvalues = tok->str_value.l;
+    tok->nstr1 = blen;
 }
 static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok)
 {
-    kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0;
-    kputs(line->d.allele[0], &str);
-    tok->nvalues = str.l;
-    tok->values[0] = str.m;
-    tok->str_value = str.s;
+    tok->str_value.l = 0;
+    kputs(line->d.allele[0], &tok->str_value);
+    tok->nvalues = tok->str_value.l;
 }
 static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok)
 {
-    kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0;
+    tok->str_value.l = 0;
     if ( tok->idx>=0 )
     {
-        if ( line->n_allele >= tok->idx )
-            kputs(line->d.allele[tok->idx], &str);
+        if ( line->n_allele > tok->idx + 1 )
+            kputs(line->d.allele[tok->idx + 1], &tok->str_value);
         else
-            kputc('.', &str);
+            kputc('.', &tok->str_value);
+        tok->idx = 0;
+    }
+    else if ( tok->idx==-2 )
+    {
+        int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? line->n_allele - 1 : tok->nidxs - 1;
+        if ( end >= line->n_allele - 1 ) end = line->n_allele - 2;
+        for (i=0; i<=end; i++)
+            if ( i>=tok->nidxs || tok->idxs[i] )
+            {
+                if ( tok->str_value.l ) kputc(',', &tok->str_value);
+                kputs(line->d.allele[i+1], &tok->str_value);
+            }
     }
     else if ( line->n_allele>1 )
     {
-        kputs(line->d.allele[1], &str);
+        kputs(line->d.allele[1], &tok->str_value);
         int i;
         for (i=2; i<line->n_allele; i++)
         {
-            kputc(',', &str);
-            kputs(line->d.allele[i], &str);
+            kputc(',', &tok->str_value);
+            kputs(line->d.allele[i], &tok->str_value);
         }
     }
     else if ( line->n_allele==1 )
-        kputc('.', &str);
-    tok->nvalues = str.l;
-    tok->values[0] = str.m;
-    tok->str_value = str.s;
+        kputc('.', &tok->str_value);
+    tok->nvalues = tok->str_value.l;
 }
 static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok)
 {
@@ -857,11 +1050,11 @@ static void set_abs(filter_t *flt, bcf1_t *line, token_t *tok)
 static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
 {
     tok->is_str = 0;
-    if ( !tok->nvalues ) return;
+    if ( !tok->str_value.l ) return;
     if ( tok->idx==-2 )
     {
         int i = 0;
-        char *ss = tok->str_value;
+        char *ss = tok->str_value.s;
         while ( *ss )
         {
             char *se = ss;
@@ -881,9 +1074,10 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
     }
     else
     {
-        tok->values[0] = strlen(tok->str_value);
+        tok->values[0] = strlen(tok->str_value.s);
         tok->nvalues = 1;
     }
+    tok->str_value.l = 0;
 }
 #define VECTOR_ARITHMETICS(atok,btok,AOP) \
 { \
@@ -1077,7 +1271,16 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
     if ( !(atok)->nvalues || !(btok)->nvalues ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
     else \
     { \
-        if ( (atok)->nsamples && (btok)->nsamples ) \
+        if ( (atok)->idx<=-2 || (btok)->idx<=-2 ) \
+        { \
+            /* any field can match: [*] */ \
+            for (i=0; i<(atok)->nvalues; i++) \
+            { \
+                for (j=0; j<(btok)->nvalues; j++) \
+                    if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \
+            } \
+        } \
+        else if ( (atok)->nsamples && (btok)->nsamples ) \
         { \
             for (i=0; i<(atok)->nsamples; i++) \
             { \
@@ -1111,15 +1314,6 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
             (atok)->nsamples = (btok)->nsamples; \
             if ( !has_values ) (atok)->nvalues = 0; \
         } \
-        else if ( (atok)->idx==-2 || (btok)->idx==-2 ) \
-        { \
-            /* any field can match: [*] */ \
-            for (i=0; i<(atok)->nvalues; i++) \
-            { \
-                for (j=0; j<(btok)->nvalues; j++) \
-                    if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \
-            } \
-        } \
         else \
         { \
             if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \
@@ -1130,18 +1324,18 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
 }
 static int cmp_vector_strings(token_t *atok, token_t *btok, int logic)    // logic: TOK_EQ or TOK_NE
 {
-    if ( !atok->nvalues ) { return 0; }
-    if ( !btok->nvalues ) { atok->nvalues = 0; return 0; }
+    if ( !atok->str_value.l ) { return 0; }
+    if ( !btok->str_value.l ) { atok->str_value.l = 0; return 0; }
     int i, pass_site = 0;
     if ( atok->nsamples && atok->nsamples==btok->nsamples )
     {
         for (i=0; i<atok->nsamples; i++)
         {
-            char *astr = atok->str_value + i*(int)atok->values[0];
-            char *bstr = btok->str_value + i*(int)btok->values[0];
-            char *aend = astr + (int)atok->values[0], *a = astr;
+            char *astr = atok->str_value.s + i*atok->nstr1;
+            char *bstr = btok->str_value.s + i*btok->nstr1;
+            char *aend = astr + atok->str_value.l, *a = astr;
             while ( a<aend && *a ) a++;
-            char *bend = bstr + (int)btok->values[0], *b = bstr;
+            char *bend = bstr + btok->str_value.l, *b = bstr;
             while ( b<bend && *b ) b++;
             if ( a-astr != b-bstr ) atok->pass_samples[i] = 0;
             else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0;
@@ -1161,8 +1355,8 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic)    // log
             token_t *xtok, *ytok;   // xtok is scalar, ytok array
             if ( btok->idx==-2 ) { xtok = atok; ytok = btok; }
             else { xtok = btok; ytok = atok; }
-            char *xstr = xtok->str_value, *xend = xstr + xtok->nvalues;
-            char *ystr = ytok->str_value, *yend = ystr + ytok->nvalues, *y = ystr;
+            char *xstr = xtok->str_value.s, *xend = xstr + xtok->str_value.l;
+            char *ystr = ytok->str_value.s, *yend = ystr + ytok->str_value.l, *y = ystr;
             while ( y<=yend )
             {
                 if ( y==yend || *y==',' )
@@ -1178,7 +1372,7 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic)    // log
             }
         }
         else
-            pass_site = strcmp(atok->str_value,btok->str_value) ? 0 : 1;
+            pass_site = strcmp(atok->str_value.s,btok->str_value.s) ? 0 : 1;
         if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1;
     }
     else
@@ -1186,19 +1380,26 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic)    // log
         token_t *xtok, *ytok;
         if ( !atok->nsamples ) { xtok = atok; ytok = btok; }
         else { xtok = btok; ytok = atok; }
-        char *xstr = xtok->str_value;
-        char *xend = xstr + (int)xtok->values[0], *x = xstr;
+        char *xstr = xtok->str_value.s;
+        char *xend = xstr + xtok->str_value.l, *x = xstr;
         while ( x<xend && *x ) x++;
         for (i=0; i<ytok->nsamples; i++)
         {
-            char *ystr = ytok->str_value + i*(int)ytok->values[0];
-            char *yend = ystr + (int)ytok->values[0], *y = ystr;
-            while ( y<yend && *y ) y++;
-            if ( x-xstr != y-ystr ) atok->pass_samples[i] = 0;
-            else atok->pass_samples[i] = strncmp(xstr,ystr,x-xstr)==0 ? 1 : 0;
-            if ( logic!=TOK_EQ )
-                atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
-            pass_site |= atok->pass_samples[i];
+            char *ystr = ytok->str_value.s + i*ytok->nstr1;
+            char *ybeg = ystr, *yend = ystr + ytok->nstr1;
+            int pass = 0;
+            while ( ybeg < yend )
+            {
+                char *y = ybeg;
+                while ( y<yend && *y && *y!=',' ) y++;
+                if ( y - ybeg != x - xstr ) pass = 0;
+                else pass = strncmp(xstr,ybeg,x-xstr)==0 ? 1 : 0;
+                if ( logic!=TOK_EQ ) pass = pass ? 0 : 1;
+                if ( pass || !*y ) break;
+                ybeg = y+1;
+            }
+            atok->pass_samples[i] = pass;
+            pass_site |= pass;
         }
         if ( !atok->nsamples )
             atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? not sure if atok->nvalues should be set
@@ -1212,18 +1413,70 @@ static int regex_vector_strings(token_t *atok, token_t *btok, int negate)
     {
         for (i=0; i<atok->nsamples; i++)
         {
-            char *ptr = atok->str_value + i*(int)atok->values[0];
+            char *ptr = atok->str_value.s + i*atok->nstr1;
             atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1;
             if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
             pass_site |= atok->pass_samples[i];
         }
         return pass_site;
     }
-    pass_site = regexec(btok->regex, atok->str_value, 0,NULL,0) ? 0 : 1;
+    pass_site = regexec(btok->regex, atok->str_value.s, 0,NULL,0) ? 0 : 1;
     if ( negate ) pass_site = pass_site ? 0 : 1;
     return pass_site;
 }
 
+static void parse_tag_idx(char *tag, char *tag_idx, token_t *tok)   // tag_idx points just after "TAG["
+{
+    // TAG[*] .. any field
+    if ( !strncmp("*]", tag_idx, 3) )
+    {
+        tok->idxs = (int*) malloc(sizeof(int));
+        tok->idxs[0] = -1;
+        tok->nidxs   = 1;
+        tok->idx     = -2;
+        return;
+    }
+
+    // TAG[integer] .. one field
+    char *end, *beg = tag_idx;
+    tok->idx = strtol(tag_idx, &end, 10);
+    if ( tok->idx >= 0 && *end==']' ) return;
+
+
+    // TAG[0,1] or TAG[0-2] or [1-] etc
+    int i, ibeg = -1;
+    while ( *beg && *beg!=']' )
+    {
+        int idx = strtol(beg, &end, 10);
+        if ( end[0]==',' ) beg = end + 1;
+        else if ( end[0]==']' ) beg = end;
+        else if ( end[0]=='-' ) { beg = end + 1; ibeg = idx; continue; }
+        else error("Could not parse the index: %s[%s\n", tag, tag_idx+1);
+        if ( idx >= tok->nidxs )
+        {
+            tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(idx+1));
+            memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(idx - tok->nidxs + 1));
+            tok->nidxs = idx + 1;
+        }
+        if ( ibeg>=0 )
+        {
+            for (i=ibeg; i<=idx; i++) tok->idxs[i] = 1;
+            ibeg = -1;
+        }
+        tok->idxs[idx] = 1;
+    }
+    if ( ibeg >=0 )
+    {
+        if ( ibeg >= tok->nidxs )
+        {
+            tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(ibeg+1));
+            memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(ibeg - tok->nidxs + 1));
+            tok->nidxs = ibeg + 1;
+        }
+        tok->idxs[ibeg] = -1;
+    }
+    tok->idx = -2;
+}
 static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
 {
     tok->tok_type  = TOK_VAL;
@@ -1361,17 +1614,8 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         int i;
         for (i=0; i<tmp.l; i++)
             if ( tmp.s[i]=='[' ) { tmp.s[i] = 0; is_array = i+1; break; }
-        if ( is_array )
-        {
-            if ( tmp.s[is_array]=='*' )
-                tok->idx = -2;      // tag[*] .. any field
-            else
-            {
-                char *end;
-                tok->idx = strtol(tmp.s+is_array, &end, 10);
-                if ( *end!=']' ) error("Could not parse the index: %s[%s\n", tmp.s,tmp.s+is_array);
-            }
-        }
+        if ( is_array ) 
+            parse_tag_idx(tmp.s, tmp.s+is_array, tok);
     }
     tok->hdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s);
     if ( is_fmt==-1 )
@@ -1425,7 +1669,13 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
                         case BCF_HT_STR:  tok->setter = &filters_set_info_string; tok->is_str = 1; break;
                         default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__);
                     }
-                    if(!is_array) tok->idx = -2;
+                    if (!is_array) 
+                    {
+                        tok->idx = -2;
+                        tok->idxs = (int*) malloc(sizeof(int));
+                        tok->idxs[0] = -1;
+                        tok->nidxs   = 1;
+                    }
                 }
             }
             filter->max_unpack |= BCF_UN_INFO;
@@ -1518,6 +1768,11 @@ static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks)
     }
 }
 
+static void str_to_lower(char *str)
+{
+    while ( *str ) { *str = tolower(*str); str++; }
+}
+
 
 // Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
 filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
@@ -1538,8 +1793,8 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
         ret = filters_next_token(&tmp, &len);
         if ( ret==-1 ) error("Missing quotes in: %s\n", str);
 
-        //fprintf(stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len);
-        //int i; for (i=0; i<nops; i++) fprintf(stderr," .%c.", TOKEN_STRING[ops[i]]); fprintf(stderr,"\n");
+        // fprintf(stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len);
+        // int i; for (i=0; i<nops; i++) fprintf(stderr," .%c.", TOKEN_STRING[ops[i]]); fprintf(stderr,"\n");
 
         if ( ret==TOK_LFT )         // left bracket
         {
@@ -1670,6 +1925,28 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
             i = itok;
             continue;
         }
+        if ( !strcmp(out[i].tag,"GT") )
+        {
+            if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
+            int ival;
+            if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1;
+            else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1;
+            else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = i + 1;
+            else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) ival = i + 1;
+            else error("[%s:%d %s] Could not parse the expression: %s\n",  __FILE__,__LINE__,__FUNCTION__, filter->str);
+
+            // assign correct setters and unify expressions, eg ar->ra, HOM->hom, etc
+            if ( !strcasecmp(out[ival].key,"hom") ) { out[i].setter = filters_set_genotype3;  str_to_lower(out[ival].key); }
+            else if ( !strcasecmp(out[ival].key,"het") ) { out[i].setter = filters_set_genotype3;  str_to_lower(out[ival].key); }
+            else if ( !strcasecmp(out[ival].key,"hap") ) { out[i].setter = filters_set_genotype3;  str_to_lower(out[ival].key); }
+            else if ( !strcasecmp(out[ival].key,"rr") ) { out[i].setter = filters_set_genotype2;  str_to_lower(out[ival].key); }
+            else if ( !strcasecmp(out[ival].key,"ra") || !strcasecmp(out[ival].key,"ar") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]='a'; }   // ra
+            else if ( !strcmp(out[ival].key,"aA") || !strcmp(out[ival].key,"Aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='A'; }   // aA
+            else if ( !strcasecmp(out[ival].key,"aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='a'; }  // aa
+            else if ( !strcasecmp(out[ival].key,"a") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]=0; }  // a
+            else if ( !strcasecmp(out[ival].key,"r") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]=0; }  // r
+            continue;
+        }
         if ( !strcmp(out[i].tag,"FILTER") )
         {
             if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
@@ -1728,9 +2005,10 @@ void filter_destroy(filter_t *filter)
     int i;
     for (i=0; i<filter->nfilters; i++)
     {
-        //if ( filter->filters[i].key ) free(filter->filters[i].key);
-        free(filter->filters[i].str_value);
+        if ( filter->filters[i].key ) free(filter->filters[i].key);
+        free(filter->filters[i].str_value.s);
         free(filter->filters[i].tag);
+        free(filter->filters[i].idxs);
         free(filter->filters[i].values);
         free(filter->filters[i].pass_samples);
         if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash);
@@ -1745,6 +2023,7 @@ void filter_destroy(filter_t *filter)
     free(filter->str);
     free(filter->tmpi);
     free(filter->tmpf);
+    free(filter->tmps.s);
     free(filter);
 }
 
@@ -1765,16 +2044,15 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
                 filter->filters[i].setter(filter, line, &filter->filters[i]);
             else if ( filter->filters[i].key )  // string constant
             {
-                filter->filters[i].str_value = filter->filters[i].key;
-                filter->filters[i].values[0] = filter->filters[i].values[0];
-                filter->filters[i].nvalues   = strlen(filter->filters[i].key);
+                filter->filters[i].str_value.l = 0;
+                kputs(filter->filters[i].key, &filter->filters[i].str_value);
+                filter->filters[i].nvalues = filter->filters[i].str_value.l;
             }
             else    // numeric constant
             {
                 filter->filters[i].values[0] = filter->filters[i].threshold;
                 filter->filters[i].nvalues   = 1;
             }
-
             filter->flt_stack[nstack++] = &filter->filters[i];
             continue;
         }
diff --git a/kheap.h b/kheap.h
index ac2f9f9..cb5dda4 100644
--- a/kheap.h
+++ b/kheap.h
@@ -57,6 +57,8 @@
         // "data_t".
         heap_t *heap = khp_init(mh);
 
+        // When inserting a new element, the heap stores a copy of the memory
+        // area pointed to by the third argument.
         for (int i=0; i<3; i++)
             khp_insert(mh, heap, &data[i]);
 
@@ -130,7 +132,8 @@
         {                                                               \
             heap->mdat = heap->ndat;                                    \
             kroundup32(heap->mdat);                                     \
-            heap->dat = (kheap_t*)realloc(heap->dat, heap->mdat*sizeof(kheap_t));  \
+            heap->dat = (kheap_t*)realloc(heap->dat, heap->mdat*sizeof(kheap_t));         \
+            memset(heap->dat + heap->ndat, 0, (heap->mdat - heap->ndat)*sizeof(kheap_t)); \
         }                                                               \
         int i = heap->ndat - 1;                                         \
         while ( i && __cmp(dat,&heap->dat[khp_parent(i)]) )             \
diff --git a/main.c b/main.c
index 4e3e0e5..03fa6a7 100644
--- a/main.c
+++ b/main.c
@@ -57,6 +57,7 @@ int main_plugin(int argc, char *argv[]);
 int main_consensus(int argc, char *argv[]);
 int main_csq(int argc, char *argv[]);
 int bam_mpileup(int argc, char *argv[]);
+int main_sort(int argc, char *argv[]);
 
 typedef struct
 {
@@ -126,6 +127,10 @@ static cmd_t cmds[] =
       .alias = "reheader",
       .help  = "modify VCF/BCF header, change sample names"
     },
+    { .func  = main_sort,
+      .alias = "sort",
+      .help  = "sort VCF/BCF file"
+    },
     { .func  = main_vcfview,
       .alias = "view",
       .help  = "VCF/BCF conversion, view, subset and filter VCF/BCF files"
diff --git a/misc/plot-roh.py b/misc/plot-roh.py
index 49cef76..3221109 100755
--- a/misc/plot-roh.py
+++ b/misc/plot-roh.py
@@ -263,22 +263,24 @@ for fname in fnames:
             pos  = int(row[2])
             reg  = region_overlap(regs,chr,pos,pos)
             if reg==None: continue
-            smpl = row[3]
-            if samples!=None and smpl not in samples: continue
-            gt   = row[4]
-            x = gt.split('/')
-            dsg = 2
-            if x[0]!=x[1]: dsg = 1
-            elif x[0]=='0': continue        # skip HomRef 0/0 genotypes
-            if chr not in dat_gt: 
-                dat_gt[chr] = {}
-                chrs.append(chr)
-            if smpl not in dat_gt[chr]: 
-                dat_gt[chr][smpl] = []
-            if smpl not in smpl2y:
-                y = len(smpl2y)
-                smpl2y[smpl] = y
-            dat_gt[chr][smpl].append([pos,dsg])
+            for i in range(3,len(row),2):
+                smpl = row[i]
+                if samples!=None and smpl not in samples: continue
+                gt   = row[i+1]
+                x = gt.split('/')
+                if x[0]=='.': continue          # missing genotype ./.
+                dsg = 2
+                if x[0]!=x[1]: dsg = 1
+                elif x[0]=='0': continue        # skip HomRef 0/0 genotypes
+                if chr not in dat_gt: 
+                    dat_gt[chr] = {}
+                    chrs.append(chr)
+                if smpl not in dat_gt[chr]: 
+                    dat_gt[chr][smpl] = []
+                if smpl not in smpl2y:
+                    y = len(smpl2y)
+                    smpl2y[smpl] = y
+                dat_gt[chr][smpl].append([pos,dsg])
         elif row[0]=='RG':
             smpl  = row[1]
             if samples!=None and smpl not in samples: continue
@@ -326,8 +328,8 @@ for chr in chrs:
     off += max_pos + off_sep
     off_list.append(off)
 
-height = len(fnames)
-if len(fnames)>5: heigth = 5
+height = len(smpl2y)
+if len(smpl2y)>5: heigth = 5
 wh = 20,height
 
 def bignum(num):
diff --git a/misc/plot-vcfstats b/misc/plot-vcfstats
index 7172f05..d253fd7 100755
--- a/misc/plot-vcfstats
+++ b/misc/plot-vcfstats
@@ -70,11 +70,9 @@ exit;
 
 #--------------------------------
 
-sub error
+sub usage
 {
-    my (@msg) = @_;
-    if ( scalar @msg ) { confess @msg; }
-    die
+    print STDERR
         "About: Plots output of \"bcftools stats\"\n",
         "Usage: plot-vcfstats [OPTIONS] file.chk ...\n",
         "       plot-vcfstats -p outdir/ file.chk ...\n",
@@ -91,6 +89,15 @@ sub error
 }
 
 
+sub error
+{
+    my (@msg) = @_;
+    if ( scalar @msg ) { confess @msg; }
+    usage();
+    exit 1;
+}
+
+
 sub parse_params
 {
     $0 =~ s{^.+/}{};
@@ -244,7 +251,7 @@ sub parse_params
         if ( $arg eq '-t' || $arg eq '--title' ) { push @{$$opts{titles}},shift(@ARGV); next; }
         if ( $arg eq '-T' || $arg eq '--main-title' ) { $$opts{main_title} = shift(@ARGV); next; }
         if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; }
-        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
+        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { usage(); exit 0; }
         if ( -e $arg ) { push @{$$opts{vcfstats}},$arg; next; }
         error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n");
     }
@@ -1590,8 +1597,8 @@ sub plot_substitutions
             \\tfig = plt.figure(figsize=($$opts{img_width},$$opts{img_height}))
             \\tcm  = mpl.cm.get_cmap('autumn')
             \\tn = 12
-            \\tcol = range(n)
-            \\tfor i in range(n): col[i] = cm(1.*i/n)
+            \\tcol = []
+            \\tfor i in list(range(n)): col.append(cm(1.*i/n))
             \\tax1 = fig.add_subplot(111)
             \\tax1.bar([row[0] for row in dat], [row[2] for row in dat], color=col)
             \\tax1.set_ylabel('Count')
diff --git a/misc/run-roh.pl b/misc/run-roh.pl
index 4a1afa4..1940d2e 100755
--- a/misc/run-roh.pl
+++ b/misc/run-roh.pl
@@ -176,7 +176,7 @@ sub run_roh
         push @files,$outfile;
         if ( -e $outfile ) { next; }
         cmd(
-            "bcftools annotate --rename-chrs $chr_fname $$opts{indir}/$file -Ou | " .
+            "bcftools annotate --rename-chrs $chr_fname '$$opts{indir}/$file' -Ou | " .
             "bcftools annotate -c CHROM,POS,REF,ALT,AF1KG -h $$opts{af_annots}.hdr -a $$opts{af_annots} -Ob -o $outfile.part && " .
             "mv $outfile.part $outfile",%$opts);
     }
diff --git a/mpileup.c b/mpileup.c
index ac37dd4..9b6c6eb 100644
--- a/mpileup.c
+++ b/mpileup.c
@@ -909,7 +909,7 @@ int bam_mpileup(int argc, char *argv[])
         {"ignore-RG", no_argument, NULL, 5},
         {"ignore-rg", no_argument, NULL, 5},
         {"gvcf", required_argument, NULL, 'g'},
-        {"non-reference", no_argument, NULL, 7},
+        {"no-reference", no_argument, NULL, 7},
         {"no-version", no_argument, NULL, 8},
         {"threads",required_argument,NULL,9},
         {"illumina1.3+", no_argument, NULL, '6'},
@@ -1099,11 +1099,8 @@ int bam_mpileup(int argc, char *argv[])
     free(mplp.files);
     free(mplp.reg_fname); free(mplp.pl_list);
     if (mplp.fai) fai_destroy(mplp.fai);
-    if (mplp.bed)
-    {
-        regidx_destroy(mplp.bed);
-        regitr_destroy(mplp.bed_itr);
-    }
+    if (mplp.bed) regidx_destroy(mplp.bed);
+    if (mplp.bed_itr) regitr_destroy(mplp.bed_itr);
     if (mplp.reg) regidx_destroy(mplp.reg);
     bam_smpl_destroy(mplp.bsmpl);
     return ret;
diff --git a/plugins/check-ploidy.c b/plugins/check-ploidy.c
new file mode 100644
index 0000000..51c4a3b
--- /dev/null
+++ b/plugins/check-ploidy.c
@@ -0,0 +1,165 @@
+/* 
+    Copyright (C) 2017 Genome Research Ltd.
+
+    Author: Petr Danecek <pd3 at sanger.ac.uk>
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+    
+    The above copyright notice and this permission notice shall be included in
+    all copies or substantial portions of the Software.
+    
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+    THE SOFTWARE.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <getopt.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/vcfutils.h>
+#include <htslib/kseq.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include "bcftools.h"
+
+typedef struct
+{
+    char *sample;
+    int beg,end,ploidy;
+}
+dat_t;
+
+typedef struct
+{
+    int argc;
+    char **argv;
+    int rid, gt_id, ndat;
+    dat_t *dat;
+    bcf_hdr_t *hdr;
+}
+args_t;
+
+static args_t *args;
+
+const char *about(void)
+{
+    return "Check if ploidy of samples is consistent for all sites\n";
+}
+
+const char *usage(void)
+{
+    return 
+        "\n"
+        "About: Check if ploidy of samples is consistent for all sites.\n"
+        "Usage: bcftools +check-ploidy [General Options] -- [Plugin Options]\n"
+        "Options:\n"
+        "   run \"bcftools plugin\" for a list of common options\n"
+        "\n"
+        "Example:\n"
+        "   bcftools +check-ploidy file.bcf\n"
+        "\n";
+}
+
+int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
+{
+    args = (args_t*) calloc(1,sizeof(args_t));
+    args->argc   = argc; args->argv = argv;
+    args->hdr = in;
+    args->ndat = bcf_hdr_nsamples(args->hdr);
+    args->dat  = (dat_t*) calloc(args->ndat,sizeof(dat_t));
+    int i;
+    for (i=0; i<args->ndat; i++) args->dat[i].sample = args->hdr->samples[i];
+    args->rid = -1;
+    args->gt_id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"GT");
+    if ( args->gt_id<0 ) error("Error: GT field is not present\n");
+    printf("# [1]Sample\t[2]Chromosome\t[3]Region Start\t[4]Region End\t[5]Ploidy\n");
+    return 1;
+}
+
+bcf1_t *process(bcf1_t *rec)
+{
+    int i;
+
+    bcf_unpack(rec, BCF_UN_FMT);
+    bcf_fmt_t *fmt_gt = NULL;
+    for (i=0; i<rec->n_fmt; i++)
+        if ( rec->d.fmt[i].id==args->gt_id ) { fmt_gt = &rec->d.fmt[i]; break; }
+    if ( !fmt_gt ) return NULL;    // no GT tag
+
+    if ( args->ndat != rec->n_sample ) 
+        error("Incorrect number of samples at %s:%d .. found %d, expected %d\n",bcf_seqname(args->hdr,rec),rec->pos+1,rec->n_sample,args->ndat);
+
+    if ( args->rid!=rec->rid && args->rid!=-1 )
+    {
+        for (i=0; i<args->ndat; i++)
+        {
+            dat_t *dat = &args->dat[i];
+            if ( dat->ploidy!=0 ) printf("%s\t%s\t%d\t%d\t%d\n", dat->sample,bcf_seqname(args->hdr,rec),dat->beg+1,dat->end+1,dat->ploidy); 
+            dat->ploidy = 0;
+        }
+    }
+    args->rid = rec->rid;
+
+    #define BRANCH_INT(type_t,vector_end) \
+    { \
+        for (i=0; i<rec->n_sample; i++) \
+        { \
+            type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
+            int nal, missing = 0; \
+            for (nal=0; nal<fmt_gt->n; nal++) \
+            { \
+                if ( p[nal]==vector_end ) break; /* smaller ploidy */ \
+                if ( bcf_gt_is_missing(p[nal]) ) { missing=1; break; } /* missing allele */ \
+            } \
+            if ( !nal || missing ) continue; /* missing genotype */ \
+            dat_t *dat = &args->dat[i]; \
+            if ( dat->ploidy==nal ) \
+            { \
+                dat->end = rec->pos; \
+                continue; \
+            } \
+            if ( dat->ploidy!=0 ) \
+                printf("%s\t%s\t%d\t%d\t%d\n", dat->sample,bcf_seqname(args->hdr,rec),dat->beg+1,dat->end+1,dat->ploidy); \
+            dat->ploidy = nal; \
+            dat->beg = rec->pos; \
+            dat->end = rec->pos; \
+        } \
+    }
+    switch (fmt_gt->type) {
+        case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
+        default: error("The GT type is not recognised: %d at %s:%d\n",fmt_gt->type, bcf_seqname(args->hdr,rec),rec->pos+1); break;
+    }
+    #undef BRANCH_INT
+
+    return NULL;
+}
+
+void destroy(void)
+{
+    int i;
+    for (i=0; i<args->ndat; i++)
+    {
+        dat_t *dat = &args->dat[i];
+        if ( dat->ploidy!=0 ) printf("%s\t%s\t%d\t%d\t%d\n", dat->sample,bcf_hdr_id2name(args->hdr,args->rid),dat->beg+1,dat->end+1,dat->ploidy); 
+        dat->ploidy = 0;
+    }
+    free(args->dat);
+    free(args);
+}
+
diff --git a/plugins/mendelian.c b/plugins/mendelian.c
index e8ae8e5..da83cda 100644
--- a/plugins/mendelian.c
+++ b/plugins/mendelian.c
@@ -117,7 +117,7 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
             case 'T': trio_file = optarg; break;
             case 'h':
             case '?':
-            default: error("%s", usage()); break;
+            default: error(usage()); break;
         }
     }
     if ( optind != argc ) error(usage());
@@ -176,30 +176,96 @@ bcf1_t *process(bcf1_t *rec)
 
     int ngt = bcf_get_genotypes(args.hdr, rec, &args.gt_arr, &args.ngt_arr);
     if ( ngt<0 ) return dflt;
-    if ( ngt!=2*bcf_hdr_nsamples(args.hdr) ) return dflt;
+    if ( ngt!=2*bcf_hdr_nsamples(args.hdr) && ngt!=bcf_hdr_nsamples(args.hdr) ) return dflt;
+    ngt /= bcf_hdr_nsamples(args.hdr);
 
     int i, has_bad = 0, needs_update = 0;
     for (i=0; i<args.ntrios; i++)
     {
-        int mother,father,child;
         int32_t a,b,c,d,e,f;
         trio_t *trio = &args.trios[i];
 
-        a = args.gt_arr[2*trio->imother];
-        b = args.gt_arr[2*trio->imother+1];
-        c = args.gt_arr[2*trio->ifather];
-        d = args.gt_arr[2*trio->ifather+1];
-        e = args.gt_arr[2*trio->ichild];
-        f = args.gt_arr[2*trio->ichild+1];
-        if ( bcf_gt_is_missing(a) || bcf_gt_is_missing(b) ) continue; 
+        a = args.gt_arr[ngt*trio->imother];
+        b = ngt==2 ? args.gt_arr[ngt*trio->imother+1] : bcf_int32_vector_end;
+        c = args.gt_arr[ngt*trio->ifather];
+        d = ngt==2 ? args.gt_arr[ngt*trio->ifather+1] : bcf_int32_vector_end;
+        e = args.gt_arr[ngt*trio->ichild];
+        f = ngt==2 ? args.gt_arr[ngt*trio->ichild+1] : bcf_int32_vector_end;
+
+        // missing allele in the child: missing data or daugther's chrY
+        if ( bcf_gt_is_missing(e) || bcf_gt_is_missing(f) ) continue;
+
+        // missing data in father
         if ( bcf_gt_is_missing(c) || bcf_gt_is_missing(d) ) continue; 
-        if ( bcf_gt_is_missing(e) || bcf_gt_is_missing(f) ) continue; 
 
-        mother = (1<<bcf_gt_allele(a)) | (1<<bcf_gt_allele(b));
-        father = (1<<bcf_gt_allele(c)) | (1<<bcf_gt_allele(d));
-        child  = (1<<bcf_gt_allele(e)) | (1<<bcf_gt_allele(f));
+        uint64_t mother,father,child1,child2;
+
+        // sex chr - father is haploid
+        if ( d==bcf_int32_vector_end )
+        {
+            if ( bcf_gt_is_missing(a) && (bcf_gt_is_missing(b) || b==bcf_int32_vector_end) )
+            {
+                // either the child does not match the father or the child is diploid
+                if ( bcf_gt_allele(c)!=bcf_gt_allele(e) || f!=bcf_int32_vector_end )
+                {
+                    trio->nbad++;
+                    has_bad = 1;
+                    if ( args.mode&MODE_DELETE )
+                    {   
+                        args.gt_arr[ngt*trio->imother] = bcf_gt_missing;
+                        if ( b!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->imother+1] = bcf_gt_missing; // should be always true 
+                        args.gt_arr[ngt*trio->ifather] = bcf_gt_missing;
+                        args.gt_arr[ngt*trio->ichild]  = bcf_gt_missing;
+                        if ( ngt==2 ) args.gt_arr[ngt*trio->ichild+1] = bcf_int32_vector_end;
+                        needs_update = 1;
+                    }
+                }
+                else
+                    trio->nok++;
+                continue;
+            }
+            // son's chrX
+            if ( f==bcf_int32_vector_end )
+            {
+                // chrY - no data in mother
+                if ( bcf_gt_allele(e)!=bcf_gt_allele(a) && bcf_gt_allele(e)!=bcf_gt_allele(b) )
+                {
+                    trio->nbad++;
+                    has_bad = 1;
+                    if ( args.mode&MODE_DELETE )
+                    {
+                        args.gt_arr[ngt*trio->imother] = bcf_gt_missing;
+                        if ( ngt==2 ) args.gt_arr[ngt*trio->imother+1] = bcf_gt_missing;
+                        args.gt_arr[ngt*trio->ifather] = bcf_gt_missing;
+                        if ( d!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ifather+1] = bcf_gt_missing;
+                        args.gt_arr[ngt*trio->ichild]  = bcf_gt_missing;
+                        needs_update = 1;
+                    }
+                }
+                else
+                    trio->nok++;
+                continue;
+            }
+
+            // skip genotypes which do not fit in a 64bit bitmask
+            if ( bcf_gt_allele(a) > 63 || bcf_gt_allele(b) > 63 || bcf_gt_allele(c) > 63 ) continue; 
+            if ( bcf_gt_allele(e) > 63 || bcf_gt_allele(f) > 63 ) continue; 
+
+            // daughter's chrX
+            mother = (1<<bcf_gt_allele(a)) | (1<<bcf_gt_allele(b));
+            father = 1<<bcf_gt_allele(c);
+            child1 = 1<<bcf_gt_allele(e);
+            child2 = 1<<bcf_gt_allele(f);
+        }
+        else
+        {
+            mother = (1<<bcf_gt_allele(a)) | (1<<bcf_gt_allele(b));
+            father = (1<<bcf_gt_allele(c)) | (1<<bcf_gt_allele(d));
+            child1 = 1<<bcf_gt_allele(e);
+            child2 = 1<<bcf_gt_allele(f);
+        }
 
-        if ( (mother&child) && (father&child) ) 
+        if ( (mother&child1 && father&child2) || (mother&child2 && father&child1) )
         {
             trio->nok++;
         }
@@ -209,18 +275,18 @@ bcf1_t *process(bcf1_t *rec)
             has_bad = 1;
             if ( args.mode&MODE_DELETE )
             {
-                args.gt_arr[2*trio->imother]   = bcf_gt_missing;
-                args.gt_arr[2*trio->imother+1] = bcf_gt_missing;
-                args.gt_arr[2*trio->ifather]   = bcf_gt_missing;
-                args.gt_arr[2*trio->ifather+1] = bcf_gt_missing;
-                args.gt_arr[2*trio->ichild]    = bcf_gt_missing;
-                args.gt_arr[2*trio->ichild+1]  = bcf_gt_missing;
+                args.gt_arr[ngt*trio->imother] = bcf_gt_missing;
+                if ( b!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->imother+1] = bcf_gt_missing; // should be always true 
+                args.gt_arr[ngt*trio->ifather] = bcf_gt_missing;
+                if ( d!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ifather+1] = bcf_gt_missing;
+                args.gt_arr[ngt*trio->ichild] = bcf_gt_missing;
+                if ( f!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ichild+1]  = bcf_gt_missing;
                 needs_update = 1;
             }
         }
     }
 
-    if ( needs_update && bcf_update_genotypes(args.hdr,rec,args.gt_arr,ngt) )
+    if ( needs_update && bcf_update_genotypes(args.hdr,rec,args.gt_arr,ngt*bcf_hdr_nsamples(args.hdr)) )
         error("Could not update GT field at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1);
 
     if ( args.mode&MODE_DELETE ) return rec;
diff --git a/plugins/setGT.c b/plugins/setGT.c
index 5c3d68d..676159d 100644
--- a/plugins/setGT.c
+++ b/plugins/setGT.c
@@ -1,6 +1,6 @@
 /*  plugins/setGT.c -- set gentoypes to given values
 
-    Copyright (C) 2015-2016 Genome Research Ltd.
+    Copyright (C) 2015-2017 Genome Research Ltd.
 
     Author: Petr Danecek <pd3 at sanger.ac.uk>
 
@@ -26,8 +26,10 @@ DEALINGS IN THE SOFTWARE.  */
 #include <stdlib.h>
 #include <htslib/vcf.h>
 #include <htslib/vcfutils.h>
+#include <htslib/kfunc.h>
 #include <inttypes.h>
 #include <getopt.h>
+#include <ctype.h>
 #include "bcftools.h"
 #include "filter.h"
 
@@ -35,15 +37,32 @@ DEALINGS IN THE SOFTWARE.  */
 #define FLT_INCLUDE 1
 #define FLT_EXCLUDE 2
 
-bcf_hdr_t *in_hdr, *out_hdr;
-int32_t *gts = NULL, mgts = 0;
-int *arr = NULL, marr = 0;
-uint64_t nchanged = 0;
-int tgt_mask = 0, new_mask = 0, new_gt = 0;
-filter_t *filter = NULL;
-char *filter_str = NULL;
-int filter_logic = 0;
-const uint8_t *smpl_pass = NULL;
+typedef int (*cmp_f)(double a, double b);
+
+static int cmp_eq(double a, double b) { return a==b ? 1 : 0; }
+static int cmp_le(double a, double b) { return a<=b ? 1 : 0; }
+static int cmp_ge(double a, double b) { return a>=b ? 1 : 0; }
+static int cmp_lt(double a, double b) { return a<b ? 1 : 0; }
+static int cmp_gt(double a, double b) { return a>b ? 1 : 0; }
+
+typedef struct
+{
+    bcf_hdr_t *in_hdr, *out_hdr;
+    int32_t *gts, mgts, *iarr, miarr;
+    int *arr, marr;
+    uint64_t nchanged;
+    int tgt_mask, new_mask, new_gt;
+    filter_t *filter;
+    char *filter_str;
+    int filter_logic;
+    const uint8_t *smpl_pass;
+    double binom_val;
+    char *binom_tag;
+    cmp_f binom_cmp;
+}
+args_t;
+
+args_t *args = NULL;
 
 #define GT_MISSING   1
 #define GT_PARTIAL  (1<<1)
@@ -53,6 +72,7 @@ const uint8_t *smpl_pass = NULL;
 #define GT_UNPHASED (1<<5)
 #define GT_ALL      (1<<6)
 #define GT_QUERY    (1<<7)
+#define GT_BINOM    (1<<8)
 
 const char *about(void)
 {
@@ -67,6 +87,7 @@ const char *usage(void)
         "           ./x  .. partially missing (e.g., \"./0\" or \".|1\" but not \"./.\")\n"
         "           .    .. partially or completely missing\n"
         "           a    .. all genotypes\n"
+        "           b    .. heterozygous genotypes failing two-tailed binomial test (example below)\n"
         "           q    .. select genotypes using -i/-e options\n"
         "       and the new genotype can be one of:\n"
         "           .    .. missing (\".\" or \"./.\", keeps ploidy)\n"
@@ -93,12 +114,71 @@ const char *usage(void)
         "\n"
         "   # set partially missing genotypes to completely missing\n"
         "   bcftools +setGT in.vcf -- -t ./x -n .\n"
+        "\n"
+        "   # set heterozygous genotypes to 0/0 if binom.test(nAlt,nRef+nAlt,0.5)<1e-3\n"
+        "   bcftools +setGT in.vcf -- -t \"b:AD<1e-3\" -n 0\n"  // todo: make -i/-e recognise something like is_het or gt="het" so that this can be generalized?
         "\n";
 }
 
+void parse_binom_expr(args_t *args, char *str)
+{
+    if ( str[1]!=':' ) goto err;
+
+    char *beg = str+2;
+    while ( *beg && isspace(*beg) ) beg++;
+    if ( !*beg ) goto err;
+    char *end = beg;
+    while ( *end )
+    {
+        if ( isspace(*end) || *end=='<' || *end=='=' || *end=='>' ) break;
+        end++;
+    }
+    if ( !*end ) goto err;
+    args->binom_tag = (char*) calloc(1,end-beg+1);
+    memcpy(args->binom_tag,beg,end-beg);
+    int tag_id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,args->binom_tag);
+    if ( !bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,tag_id) ) error("The FORMAT tag \"%s\" is not present in the VCF\n", args->binom_tag);
+    
+    while ( *end && isspace(*end) ) end++;
+    if ( !*end ) goto err;
+
+    if ( !strncmp(end,"<=",2) ) { args->binom_cmp = cmp_le; beg = end+2; }
+    else if ( !strncmp(end,">=",2) ) { args->binom_cmp = cmp_ge; beg = end+2; }
+    else if ( !strncmp(end,"==",2) ) { args->binom_cmp = cmp_eq; beg = end+2; }
+    else if ( !strncmp(end,"<",1) ) { args->binom_cmp = cmp_lt; beg = end+1; }
+    else if ( !strncmp(end,">",1) ) { args->binom_cmp = cmp_gt; beg = end+1; }
+    else if ( !strncmp(end,"=",1) ) { args->binom_cmp = cmp_eq; beg = end+1; }
+    else goto err;
+
+    while ( *beg && isspace(*beg) ) beg++;
+    if ( !*beg ) goto err;
+
+    args->binom_val = strtod(beg, &end);
+    while ( *end && isspace(*end) ) end++;
+    if ( *end ) goto err;
+
+    args->tgt_mask |= GT_BINOM;
+    return;
+
+err:
+    error(
+        "Error parsing the expression: %s\n"
+        "Expected TAG CMP VAL, where\n"
+        "   TAG .. one of the format tags\n"
+        "   CMP .. operator, one of <, <=, >, >=\n"
+        "   VAL .. value\n"
+        "For example:\n"
+        "   bcftools +setGT in.vcf -- -t \"b:AD>1e-3\" -n 0\n"
+        "\n", str
+        );
+}
 
 int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
 {
+    args = (args_t*) calloc(1,sizeof(args_t));
+    args->in_hdr  = in;
+    args->out_hdr = out;
+
     int c;
     static struct option loptions[] =
     {
@@ -112,42 +192,41 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
     {
         switch (c) 
         {
-            case 'i': filter_str = optarg; filter_logic = FLT_INCLUDE; break;
-            case 'e': filter_str = optarg; filter_logic = FLT_EXCLUDE; break;
-            case 'n': new_mask = bcf_gt_phased(0); 
-                if ( strchr(optarg,'.') ) new_mask |= GT_MISSING;
-                if ( strchr(optarg,'0') ) new_mask |= GT_REF;
-                if ( strchr(optarg,'M') ) new_mask |= GT_MAJOR;
-                if ( strchr(optarg,'p') ) new_mask |= GT_PHASED;
-                if ( strchr(optarg,'u') ) new_mask |= GT_UNPHASED;
-                if ( new_mask==0 ) error("Unknown parameter to --new-gt: %s\n", optarg);
+            case 'i': args->filter_str = optarg; args->filter_logic = FLT_INCLUDE; break;
+            case 'e': args->filter_str = optarg; args->filter_logic = FLT_EXCLUDE; break;
+            case 'n': args->new_mask = bcf_gt_phased(0); 
+                if ( strchr(optarg,'.') ) args->new_mask |= GT_MISSING;
+                if ( strchr(optarg,'0') ) args->new_mask |= GT_REF;
+                if ( strchr(optarg,'M') ) args->new_mask |= GT_MAJOR;
+                if ( strchr(optarg,'p') ) args->new_mask |= GT_PHASED;
+                if ( strchr(optarg,'u') ) args->new_mask |= GT_UNPHASED;
+                if ( args->new_mask==0 ) error("Unknown parameter to --new-gt: %s\n", optarg);
                 break;
             case 't':
-                if ( !strcmp(optarg,".") ) tgt_mask |= GT_MISSING|GT_PARTIAL;
-                if ( !strcmp(optarg,"./x") ) tgt_mask |= GT_PARTIAL;
-                if ( !strcmp(optarg,"./.") ) tgt_mask |= GT_MISSING;
-                if ( !strcmp(optarg,"a") ) tgt_mask |= GT_ALL;
-                if ( !strcmp(optarg,"q") ) tgt_mask |= GT_QUERY;
-                if ( !strcmp(optarg,"?") ) tgt_mask |= GT_QUERY;        // for backward compatibility
-                if ( tgt_mask==0 ) error("Unknown parameter to --target-gt: %s\n", optarg);
+                if ( !strcmp(optarg,".") ) args->tgt_mask |= GT_MISSING|GT_PARTIAL;
+                if ( !strcmp(optarg,"./x") ) args->tgt_mask |= GT_PARTIAL;
+                if ( !strcmp(optarg,"./.") ) args->tgt_mask |= GT_MISSING;
+                if ( !strcmp(optarg,"a") ) args->tgt_mask |= GT_ALL;
+                if ( !strcmp(optarg,"q") ) args->tgt_mask |= GT_QUERY;
+                if ( !strcmp(optarg,"?") ) args->tgt_mask |= GT_QUERY;        // for backward compatibility
+                if ( strchr(optarg,'b') ) parse_binom_expr(args, strchr(optarg,'b'));
+                if ( args->tgt_mask==0 ) error("Unknown parameter to --target-gt: %s\n", optarg);
                 break;
             case 'h':
             case '?':
             default: fprintf(stderr,"%s", usage()); exit(1); break;
         }
     }
-    in_hdr  = in;
-    out_hdr = out;
 
-    if ( !new_mask ) error("Expected -n option\n");
-    if ( !tgt_mask ) error("Expected -t option\n");
+    if ( !args->new_mask ) error("Expected -n option\n");
+    if ( !args->tgt_mask ) error("Expected -t option\n");
 
-    if ( new_mask & GT_MISSING ) new_gt = bcf_gt_missing;
-    if ( new_mask & GT_REF ) new_gt = new_mask&GT_PHASED ? bcf_gt_phased(0) : bcf_gt_unphased(0);
+    if ( args->new_mask & GT_MISSING ) args->new_gt = bcf_gt_missing;
+    if ( args->new_mask & GT_REF ) args->new_gt = args->new_mask&GT_PHASED ? bcf_gt_phased(0) : bcf_gt_unphased(0);
 
-    if ( filter_str  && tgt_mask!=GT_QUERY ) error("Expected -t? with -i/-e\n");
-    if ( !filter_str && tgt_mask&GT_QUERY ) error("Expected -i/-e with -t?\n");
-    if ( filter_str ) filter = filter_init(in,filter_str);
+    if ( args->filter_str  && !(args->tgt_mask&GT_QUERY) ) error("Expected -tq with -i/-e\n");
+    if ( !args->filter_str && args->tgt_mask&GT_QUERY ) error("Expected -i/-e with -tq\n");
+    if ( args->filter_str ) args->filter = filter_init(in,args->filter_str);
 
     return 0;
 }
@@ -184,61 +263,111 @@ static inline int set_gt(int32_t *ptr, int ngts, int gt)
     for (j=0; j<ngts; j++)
     {
         if ( ptr[j]==bcf_int32_vector_end ) break;
+        if ( ptr[j] != gt ) changed++;
         ptr[j] = gt;
-        changed++;
     }
     return changed;
 }
 
+static inline double calc_binom(int na, int nb)
+{
+    int N = na + nb;
+    if ( !N ) return 1;
+
+    /*
+        kfunc.h implements kf_betai, which is the regularized beta function I_x(a,b) = P(X<=a/(a+b))
+    */
+    double prob = 2 * kf_betai(na, nb+1, 0.5);
+    if ( prob > 1 ) prob = 1;
+
+    return prob;
+}
+
 bcf1_t *process(bcf1_t *rec)
 {
     if ( !rec->n_sample ) return rec;
 
-    int ngts = bcf_get_genotypes(in_hdr, rec, &gts, &mgts);
+    int ngts = bcf_get_genotypes(args->in_hdr, rec, &args->gts, &args->mgts);
     ngts /= rec->n_sample;
     int i, j, changed = 0;
+
+    int nbinom = 0;
+    if ( args->tgt_mask & GT_BINOM )
+    {
+        nbinom = bcf_get_format_int32(args->in_hdr, rec, args->binom_tag, &args->iarr, &args->miarr);
+        if ( nbinom<0 ) nbinom = 0;
+        nbinom /= rec->n_sample;
+    }
     
     // Calculating allele frequency for each allele and determining major allele
     // only do this if use_major is true
     int an = 0, maxAC = -1, majorAllele = -1;
-    if ( new_mask & GT_MAJOR )
+    if ( args->new_mask & GT_MAJOR )
     {
-        hts_expand(int,rec->n_allele,marr,arr);
-        int ret = bcf_calc_ac(in_hdr,rec,arr,BCF_UN_FMT);
+        hts_expand(int,rec->n_allele,args->marr,args->arr);
+        int ret = bcf_calc_ac(args->in_hdr,rec,args->arr,BCF_UN_FMT);
         if ( ret<= 0 )
-            error("Could not calculate allele count at %s:%d\n", bcf_seqname(in_hdr,rec),rec->pos+1);
+            error("Could not calculate allele count at %s:%d\n", bcf_seqname(args->in_hdr,rec),rec->pos+1);
 
         for(i=0; i < rec->n_allele; ++i)
         {
-            an += arr[i];
-            if (arr[i] > maxAC)
+            an += args->arr[i];
+            if (args->arr[i] > maxAC)
             {
-                maxAC = arr[i];
+                maxAC = args->arr[i];
                 majorAllele = i;
             }
         }
 
         // replacing new_gt by major allele
-        new_gt = new_mask & GT_PHASED ?  bcf_gt_phased(majorAllele) : bcf_gt_unphased(majorAllele);
+        args->new_gt = args->new_mask & GT_PHASED ?  bcf_gt_phased(majorAllele) : bcf_gt_unphased(majorAllele);
     }
 
     // replace gts
-    if ( tgt_mask&GT_QUERY )
+    if ( nbinom && ngts>=2 )    // only diploid genotypes are considered: higher ploidy ignored further, haploid here
+    {
+        if ( args->filter ) filter_test(args->filter,rec,&args->smpl_pass);
+        for (i=0; i<rec->n_sample; i++)
+        {
+            if ( args->smpl_pass )
+            {
+                if ( !args->smpl_pass[i] && args->filter_logic==FLT_INCLUDE ) continue;
+                if (  args->smpl_pass[i] && args->filter_logic==FLT_EXCLUDE ) continue;
+            }
+            int32_t *ptr = args->gts + i*ngts;
+            if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) || ptr[1]==bcf_int32_vector_end ) continue;
+            if ( ptr[0]==ptr[1] ) continue; // a hom
+            int ia = bcf_gt_allele(ptr[0]); 
+            int ib = bcf_gt_allele(ptr[1]); 
+            if ( ia>=nbinom || ib>=nbinom ) 
+                error("The sample %s has incorrect number of %s fields at %s:%d\n",
+                        args->in_hdr->samples[i],args->binom_tag,bcf_seqname(args->in_hdr,rec),rec->pos+1);
+
+            double prob = calc_binom(args->iarr[i*nbinom+ia],args->iarr[i*nbinom+ib]);
+            if ( !args->binom_cmp(prob,args->binom_val) ) continue;
+
+            if ( args->new_mask&GT_UNPHASED )
+                changed += unphase_gt(ptr, ngts);
+            else
+                changed += set_gt(ptr, ngts, args->new_gt);
+        }
+    }
+    else if ( args->tgt_mask&GT_QUERY )
     {
-        int pass_site = filter_test(filter,rec,&smpl_pass);
-        if ( (pass_site && filter_logic==FLT_EXCLUDE) || (!pass_site && filter_logic==FLT_INCLUDE) ) return rec;
+        int pass_site = filter_test(args->filter,rec,&args->smpl_pass);
+        if ( (pass_site && args->filter_logic==FLT_EXCLUDE) || (!pass_site && args->filter_logic==FLT_INCLUDE) ) return rec;
         for (i=0; i<rec->n_sample; i++)
         {
-            if ( smpl_pass )
+            if ( args->smpl_pass )
             {
-                if ( !smpl_pass[i] && filter_logic==FLT_INCLUDE ) continue;
-                if (  smpl_pass[i] && filter_logic==FLT_EXCLUDE ) continue;
+                if ( !args->smpl_pass[i] && args->filter_logic==FLT_INCLUDE ) continue;
+                if (  args->smpl_pass[i] && args->filter_logic==FLT_EXCLUDE ) continue;
             }
 
-            if ( new_mask&GT_UNPHASED )
-                changed += unphase_gt(gts + i*ngts, ngts);
+            if ( args->new_mask&GT_UNPHASED )
+                changed += unphase_gt(args->gts + i*ngts, ngts);
             else
-                changed += set_gt(gts + i*ngts, ngts, new_gt);
+                changed += set_gt(args->gts + i*ngts, ngts, args->new_gt);
         }
     }
     else
@@ -246,7 +375,7 @@ bcf1_t *process(bcf1_t *rec)
         for (i=0; i<rec->n_sample; i++)
         {
             int ploidy = 0, nmiss = 0;
-            int32_t *ptr = gts + i*ngts;
+            int32_t *ptr = args->gts + i*ngts;
             for (j=0; j<ngts; j++)
             {
                 if ( ptr[j]==bcf_int32_vector_end ) break;
@@ -255,29 +384,32 @@ bcf1_t *process(bcf1_t *rec)
             }
 
             int do_set = 0;
-            if ( tgt_mask&GT_ALL ) do_set = 1;
-            else if ( tgt_mask&GT_PARTIAL && nmiss ) do_set = 1;
-            else if ( tgt_mask&GT_MISSING && ploidy==nmiss ) do_set = 1;
+            if ( args->tgt_mask&GT_ALL ) do_set = 1;
+            else if ( args->tgt_mask&GT_PARTIAL && nmiss ) do_set = 1;
+            else if ( args->tgt_mask&GT_MISSING && ploidy==nmiss ) do_set = 1;
 
             if ( !do_set ) continue;
 
-            if ( new_mask&GT_UNPHASED )
+            if ( args->new_mask&GT_UNPHASED )
                 changed += unphase_gt(ptr, ngts);
             else
-                changed += set_gt(ptr, ngts, new_gt);
+                changed += set_gt(ptr, ngts, args->new_gt);
         }
     }
-    nchanged += changed;
-    if ( changed ) bcf_update_genotypes(out_hdr, rec, gts, ngts*rec->n_sample);
+    args->nchanged += changed;
+    if ( changed ) bcf_update_genotypes(args->out_hdr, rec, args->gts, ngts*rec->n_sample);
     return rec;
 }
 
 void destroy(void)
 {
-    if ( filter ) filter_destroy(filter);
-    free(arr);
-    fprintf(stderr,"Filled %"PRId64" alleles\n", nchanged);
-    free(gts);
+    fprintf(stderr,"Filled %"PRId64" alleles\n", args->nchanged);
+    free(args->binom_tag);
+    if ( args->filter ) filter_destroy(args->filter);
+    free(args->arr);
+    free(args->iarr);
+    free(args->gts);
+    free(args);
 }
 
 
diff --git a/test/check.chk b/test/check.chk
index c04a1bf..865773c 100644
--- a/test/check.chk
+++ b/test/check.chk
@@ -2,6 +2,20 @@
 # Definition of sets:
 # ID	[2]id	[3]tab-separated file names
 # SN, Summary numbers:
+#   number of records   .. number of data rows in the VCF
+#   number of no-ALTs   .. reference-only sites, ALT is either "." or identical to REF
+#   number of SNPs      .. number of rows with a SNP
+#   number of MNPs      .. number of rows with a MNP, such as CC>TT
+#   number of indels    .. number of rows with an indel
+#   number of others    .. number of rows with other type, for example a symbolic allele or
+#                          a complex substitution, such as ACT>TCGA
+#   number of multiallelic sites     .. number of rows with multiple alternate alleles
+#   number of multiallelic SNP sites .. number of rows with multiple alternate alleles, all SNPs
+# 
+#   Note that rows containing multiple types will be counted multiple times, in each
+#   counter. For example, a row with a SNP and an indel increments both the SNP and
+#   the indel counter.
+# 
 # SN	[2]id	[3]key	[4]value
 SN	0	number of samples:	2
 SN	0	number of records:	18
@@ -71,10 +85,10 @@ DP	0	32	4	11.111111	0	0.000000
 DP	0	35	4	11.111111	0	0.000000
 DP	0	60	0	0.000000	1	33.333333
 DP	0	62	0	0.000000	2	66.666667
-# PSC, Per-sample counts
-# PSC	[2]id	[3]sample	[4]nRefHom	[5]nNonRefHom	[6]nHets	[7]nTransitions	[8]nTransversions	[9]nIndels	[10]average depth	[11]nSingletons
-PSC	0	A	0	2	4	3	2	9	28.7	1
-PSC	0	B	1	1	4	2	2	9	28.7	0
+# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. Haploid counts include both SNPs and indels.
+# PSC	[2]id	[3]sample	[4]nRefHom	[5]nNonRefHom	[6]nHets	[7]nTransitions	[8]nTransversions	[9]nIndels	[10]average depth	[11]nSingletons	[12]nHapRef	[13]nHapAlt
+PSC	0	A	0	2	4	3	2	9	28.7	1	0	0
+PSC	0	B	1	1	4	2	2	9	28.7	0	0	0
 # PSI, Per-Sample Indels
 # PSI	[2]id	[3]sample	[4]in-frame	[5]out-frame	[6]not applicable	[7]out/(in+out) ratio	[8]nHets	[9]nAA
 PSI	0	A	0	0	0	0.00	9	0
diff --git a/test/convert.gvcf.out b/test/convert.gvcf.out
index d2c0fc9..4cd237b 100644
--- a/test/convert.gvcf.out
+++ b/test/convert.gvcf.out
@@ -112,7 +112,7 @@
 22	83	.	A	.	0	PASS	BLOCKAVG_min30p3a	GT:GQX:DP:DPF	0/0:54:19:0
 22	84	.	C	.	0	PASS	BLOCKAVG_min30p3a	GT:GQX:DP:DPF	0/0:54:19:0
 22	85	.	G	.	0	PASS	BLOCKAVG_min30p3a	GT:GQX:DP:DPF	0/0:54:19:0
-22	86	.	G	C	23	LowGQX	SNVSB=0;SNVHPOL=2	GT:GQ:GQX:DP:DPF:AD	0/1:56:23:22:0:16,6
+22	86	.	G	C	23	PASS	SNVSB=0;SNVHPOL=2	GT:GQ:GQX:DP:DPF:AD	0/1:56:23:22:0:16,6
 22	87	.	T	.	0	PASS	BLOCKAVG_min30p3a	GT:GQX:DP:DPF	0/0:69:24:0
 22	88	.	C	.	0	PASS	BLOCKAVG_min30p3a	GT:GQX:DP:DPF	0/0:69:24:0
 22	89	.	A	.	0	PASS	BLOCKAVG_min30p3a	GT:GQX:DP:DPF	0/0:69:24:0
diff --git a/test/convert.gvcf.vcf b/test/convert.gvcf.vcf
index 6dfc44d..378edd1 100644
--- a/test/convert.gvcf.vcf
+++ b/test/convert.gvcf.vcf
@@ -80,7 +80,7 @@
 22	51	.	C	.	0	PASS	END=55;BLOCKAVG_min30p3a	GT:GQX:DP:DPF	0/0:30:11:0
 22	56	.	G	.	0	PASS	END=72;BLOCKAVG_min30p3a	GT:GQX:DP:DPF	0/0:42:15:0
 22	73	.	T	.	0	PASS	END=85;BLOCKAVG_min30p3a	GT:GQX:DP:DPF	0/0:54:19:0
-22	86	.	G	C	23	LowGQX	SNVSB=0;SNVHPOL=2	GT:GQ:GQX:DP:DPF:AD	0/1:56:23:22:0:16,6
+22	86	.	G	C	23	PASS	SNVSB=0;SNVHPOL=2	GT:GQ:GQX:DP:DPF:AD	0/1:56:23:22:0:16,6
 22	87	.	T	.	0	PASS	END=101;BLOCKAVG_min30p3a	GT:GQX:DP:DPF	0/0:69:24:0
 22	102	.	A	.	0	PASS	END=140;BLOCKAVG_min30p3a	GT:GQX:DP:DPF	0/0:84:29:0
 22	141	.	G	.	0	PASS	END=185;BLOCKAVG_min30p3a	GT:GQX:DP:DPF	0/0:90:31:0
diff --git a/test/filter.12.out b/test/filter.12.out
new file mode 100644
index 0000000..2726105
--- /dev/null
+++ b/test/filter.12.out
@@ -0,0 +1 @@
+3062915	0/1	2
diff --git a/test/filter.13.out b/test/filter.13.out
new file mode 100644
index 0000000..f79e69c
--- /dev/null
+++ b/test/filter.13.out
@@ -0,0 +1,2 @@
+3177144	0/0	1/1
+3177144	0/0	0/0
diff --git a/test/filter.14.out b/test/filter.14.out
new file mode 100644
index 0000000..a04ac1e
--- /dev/null
+++ b/test/filter.14.out
@@ -0,0 +1,8 @@
+3000150	0/1	0/1
+3000151	0/1	0/1
+3062915	0/1	0/1
+3062915	0/1	2
+3106154	0/1	0/1
+3106154	0/1	0/1
+3162006	0/1	0/1
+3258448	0/1	0/1
diff --git a/test/filter.15.out b/test/filter.15.out
new file mode 100644
index 0000000..829c42c
--- /dev/null
+++ b/test/filter.15.out
@@ -0,0 +1,2 @@
+3157410	1/1	1/1
+3177144	0/0	1/1
diff --git a/test/filter.16.out b/test/filter.16.out
new file mode 100644
index 0000000..1f9f385
--- /dev/null
+++ b/test/filter.16.out
@@ -0,0 +1,3 @@
+3184885	1/2	1/2
+3199812	1/2	1/2
+3212016	1/2	1/2
diff --git a/test/filter.17.out b/test/filter.17.out
new file mode 100644
index 0000000..8b5b20a
--- /dev/null
+++ b/test/filter.17.out
@@ -0,0 +1,3 @@
+3157410	1/1	1/1
+3177144	0/0	1/1
+3177144	0/0	0/0
diff --git a/test/filter.18.out b/test/filter.18.out
new file mode 100644
index 0000000..d14486f
--- /dev/null
+++ b/test/filter.18.out
@@ -0,0 +1,11 @@
+3000150	0/1	0/1
+3000151	0/1	0/1
+3062915	0/1	0/1
+3062915	0/1	2
+3106154	0/1	0/1
+3106154	0/1	0/1
+3162006	0/1	0/1
+3184885	1/2	1/2
+3199812	1/2	1/2
+3212016	1/2	1/2
+3258448	0/1	0/1
diff --git a/test/filter.19.out b/test/filter.19.out
new file mode 100644
index 0000000..2726105
--- /dev/null
+++ b/test/filter.19.out
@@ -0,0 +1 @@
+3062915	0/1	2
diff --git a/test/mendelian.1.out b/test/mendelian.1.out
index 414f79b..e494efd 100644
--- a/test/mendelian.1.out
+++ b/test/mendelian.1.out
@@ -1,9 +1,37 @@
 ##fileformat=VCFv4.2
 ##FILTER=<ID=PASS,Description="All filters passed">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
-##contig=<ID=1,assembly=b37,length=249250621>
-##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	mom1	dad1	child1	mom2	dad2	child2
-1	100	.	A	G	100	PASS	.	GT	./.	./.	./.	1/1	0/1	1/1
-1	101	.	A	T	100	PASS	.	GT	0/0	0/1	0/1	1/1	0/1	1/0
-1	102	.	A	T	100	PASS	.	GT	0/0	0/1	0/1	1/1	0/1	1/0
+##contig=<ID=1>
+##contig=<ID=2>
+##contig=<ID=3>
+##contig=<ID=4>
+##contig=<ID=5>
+##contig=<ID=6>
+##contig=<ID=X>
+##contig=<ID=Y>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	mom1	dad1	child1
+1	100	.	A	G	100	PASS	.	GT	0/0	0/0	0/0
+1	101	.	A	G	100	PASS	.	GT	./.	./.	./.
+1	102	.	A	G	100	PASS	.	GT	./.	./.	./.
+2	200	.	A	G	100	PASS	.	GT	0/0	0/1	0/0
+2	201	.	A	G	100	PASS	.	GT	0/0	0/1	0/1
+2	202	.	A	G	100	PASS	.	GT	./.	./.	./.
+3	300	.	A	G	100	PASS	.	GT	./.	./.	./.
+3	301	.	A	G	100	PASS	.	GT	0/0	1/1	0/1
+3	302	.	A	G	100	PASS	.	GT	./.	./.	./.
+4	400	.	A	G	100	PASS	.	GT	0/1	0/0	0/0
+4	401	.	A	G	100	PASS	.	GT	0/1	0/0	0/1
+4	402	.	A	G	100	PASS	.	GT	./.	./.	./.
+5	500	.	A	G	100	PASS	.	GT	0/1	0/1	0/0
+5	501	.	A	G	100	PASS	.	GT	0/1	0/1	0/1
+5	502	.	A	G	100	PASS	.	GT	0/1	0/1	1/1
+6	600	.	A	G	100	PASS	.	GT	./.	./.	./.
+6	601	.	A	G	100	PASS	.	GT	0/1	1/1	0/1
+6	602	.	A	G	100	PASS	.	GT	0/1	1/1	1/1
+X	100	.	A	G	100	PASS	.	GT	0/0	1	0
+X	101	.	A	G	100	PASS	.	GT	./.	.	.
+X	102	.	A	G	100	PASS	.	GT	0/0	1	0/1
+X	103	.	A	G	100	PASS	.	GT	./.	.	./.
+Y	101	.	A	G	100	PASS	.	GT	.	0	0
+Y	102	.	A	G	100	PASS	.	GT	.	.	.
+Y	103	.	A	G	100	PASS	.	GT	.	0	.
diff --git a/test/mendelian.2.out b/test/mendelian.2.out
index 81155ae..fe15ae7 100644
--- a/test/mendelian.2.out
+++ b/test/mendelian.2.out
@@ -1,8 +1,27 @@
 ##fileformat=VCFv4.2
 ##FILTER=<ID=PASS,Description="All filters passed">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
-##contig=<ID=1,assembly=b37,length=249250621>
-##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	mom1	dad1	child1	mom2	dad2	child2
-1	101	.	A	T	100	PASS	.	GT	0/0	0/1	0/1	1/1	0/1	1/0
-1	102	.	A	T	100	PASS	.	GT	0/0	0/1	0/1	1/1	0/1	1/0
+##contig=<ID=1>
+##contig=<ID=2>
+##contig=<ID=3>
+##contig=<ID=4>
+##contig=<ID=5>
+##contig=<ID=6>
+##contig=<ID=X>
+##contig=<ID=Y>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	mom1	dad1	child1
+1	100	.	A	G	100	PASS	.	GT	0/0	0/0	0/0
+2	200	.	A	G	100	PASS	.	GT	0/0	0/1	0/0
+2	201	.	A	G	100	PASS	.	GT	0/0	0/1	0/1
+3	301	.	A	G	100	PASS	.	GT	0/0	1/1	0/1
+4	400	.	A	G	100	PASS	.	GT	0/1	0/0	0/0
+4	401	.	A	G	100	PASS	.	GT	0/1	0/0	0/1
+5	500	.	A	G	100	PASS	.	GT	0/1	0/1	0/0
+5	501	.	A	G	100	PASS	.	GT	0/1	0/1	0/1
+5	502	.	A	G	100	PASS	.	GT	0/1	0/1	1/1
+6	601	.	A	G	100	PASS	.	GT	0/1	1/1	0/1
+6	602	.	A	G	100	PASS	.	GT	0/1	1/1	1/1
+X	100	.	A	G	100	PASS	.	GT	0/0	1	0
+X	102	.	A	G	100	PASS	.	GT	0/0	1	0/1
+Y	101	.	A	G	100	PASS	.	GT	.	0	0
+Y	103	.	A	G	100	PASS	.	GT	.	0	.
diff --git a/test/mendelian.3.out b/test/mendelian.3.out
index 9ea430a..7accace 100644
--- a/test/mendelian.3.out
+++ b/test/mendelian.3.out
@@ -1,7 +1,22 @@
 ##fileformat=VCFv4.2
 ##FILTER=<ID=PASS,Description="All filters passed">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
-##contig=<ID=1,assembly=b37,length=249250621>
-##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	mom1	dad1	child1	mom2	dad2	child2
-1	100	.	A	G	100	PASS	.	GT	0/0	0/0	1/1	1/1	0/1	1/1
+##contig=<ID=1>
+##contig=<ID=2>
+##contig=<ID=3>
+##contig=<ID=4>
+##contig=<ID=5>
+##contig=<ID=6>
+##contig=<ID=X>
+##contig=<ID=Y>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	mom1	dad1	child1
+1	101	.	A	G	100	PASS	.	GT	0/0	0/0	0/1
+1	102	.	A	G	100	PASS	.	GT	0/0	0/0	1/1
+2	202	.	A	G	100	PASS	.	GT	0/0	0/1	1/1
+3	300	.	A	G	100	PASS	.	GT	0/0	1/1	0/0
+3	302	.	A	G	100	PASS	.	GT	0/0	1/1	1/1
+4	402	.	A	G	100	PASS	.	GT	0/1	0/0	1/1
+6	600	.	A	G	100	PASS	.	GT	0/1	1/1	0/0
+X	101	.	A	G	100	PASS	.	GT	0/0	1	1
+X	103	.	A	G	100	PASS	.	GT	0/0	0	0/1
+Y	102	.	A	G	100	PASS	.	GT	.	0	1
diff --git a/test/mendelian.vcf b/test/mendelian.vcf
index c43ba07..a0e90df 100644
--- a/test/mendelian.vcf
+++ b/test/mendelian.vcf
@@ -1,8 +1,36 @@
 ##fileformat=VCFv4.2
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
-##contig=<ID=1,assembly=b37,length=249250621>
-##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	mom1	dad1	child1	mom2	dad2	child2
-1	100	.	A	G	100	PASS	.	GT	0/0	0/0	1/1	1/1	0/1	1/1
-1	101	.	A	T	100	PASS	.	GT	0/0	0/1	0/1	1/1	0/1	1/0
-1	102	.	A	T	100	PASS	.	GT	0/0	0/1	0/1	1/1	0/1	1/0
+##contig=<ID=1>
+##contig=<ID=2>
+##contig=<ID=3>
+##contig=<ID=4>
+##contig=<ID=5>
+##contig=<ID=6>
+##contig=<ID=X>
+##contig=<ID=Y>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	mom1	dad1	child1
+1	100	.	A	G	100	PASS	.	GT	0/0	0/0	0/0
+1	101	.	A	G	100	PASS	.	GT	0/0	0/0	0/1
+1	102	.	A	G	100	PASS	.	GT	0/0	0/0	1/1
+2	200	.	A	G	100	PASS	.	GT	0/0	0/1	0/0
+2	201	.	A	G	100	PASS	.	GT	0/0	0/1	0/1
+2	202	.	A	G	100	PASS	.	GT	0/0	0/1	1/1
+3	300	.	A	G	100	PASS	.	GT	0/0	1/1	0/0
+3	301	.	A	G	100	PASS	.	GT	0/0	1/1	0/1
+3	302	.	A	G	100	PASS	.	GT	0/0	1/1	1/1
+4	400	.	A	G	100	PASS	.	GT	0/1	0/0	0/0
+4	401	.	A	G	100	PASS	.	GT	0/1	0/0	0/1
+4	402	.	A	G	100	PASS	.	GT	0/1	0/0	1/1
+5	500	.	A	G	100	PASS	.	GT	0/1	0/1	0/0
+5	501	.	A	G	100	PASS	.	GT	0/1	0/1	0/1
+5	502	.	A	G	100	PASS	.	GT	0/1	0/1	1/1
+6	600	.	A	G	100	PASS	.	GT	0/1	1/1	0/0
+6	601	.	A	G	100	PASS	.	GT	0/1	1/1	0/1
+6	602	.	A	G	100	PASS	.	GT	0/1	1/1	1/1
+X	100	.	A	G	100	PASS	.	GT	0/0	1	0
+X	101	.	A	G	100	PASS	.	GT	0/0	1	1
+X	102	.	A	G	100	PASS	.	GT	0/0	1	0/1
+X	103	.	A	G	100	PASS	.	GT	0/0	0	0/1
+Y	101	.	A	G	100	PASS	.	GT	.	0	0
+Y	102	.	A	G	100	PASS	.	GT	.	0	1
+Y	103	.	A	G	100	PASS	.	GT	.	0	.
diff --git a/test/query.34.out b/test/query.34.out
new file mode 100644
index 0000000..ba8cd61
--- /dev/null
+++ b/test/query.34.out
@@ -0,0 +1 @@
+3162007 AAAAAA,BBBBB,CCCC,DDD,EE,F AAAAAA,BBB,C
diff --git a/test/query.35.out b/test/query.35.out
new file mode 100644
index 0000000..25e8f6b
--- /dev/null
+++ b/test/query.35.out
@@ -0,0 +1 @@
+3162007 1,2,3,4,5,6 1,2,3
diff --git a/test/query.36.out b/test/query.36.out
new file mode 100644
index 0000000..e4ef348
--- /dev/null
+++ b/test/query.36.out
@@ -0,0 +1 @@
+3162007 0.1,0.02,0.003,0.0004,5e-05,6e-06 0.1,0.02,0.003
diff --git a/test/sort.out b/test/sort.out
new file mode 100644
index 0000000..fa723cd
--- /dev/null
+++ b/test/sort.out
@@ -0,0 +1,15 @@
+1	101
+1	102
+1	103
+1	104
+1	105
+2	101
+2	102
+2	103
+2	104
+2	105
+3	101
+3	102
+3	103
+3	104
+3	105
diff --git a/test/sort.vcf b/test/sort.vcf
new file mode 100644
index 0000000..649bd01
--- /dev/null
+++ b/test/sort.vcf
@@ -0,0 +1,21 @@
+##fileformat=VCFv4.2
+##reference=file:///ref.fa
+##contig=<ID=1,length=2147483647>
+##contig=<ID=2,length=2147483647>
+##contig=<ID=3,length=2147483647>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+3	105	.	T	C	.	.	.
+3	104	.	T	C	.	.	.
+3	103	.	T	C	.	.	.
+3	102	.	T	C	.	.	.
+3	101	.	T	C	.	.	.
+2	105	.	T	C	.	.	.
+2	104	.	T	C	.	.	.
+2	103	.	T	C	.	.	.
+2	102	.	T	C	.	.	.
+2	101	.	T	C	.	.	.
+1	105	.	T	C	.	.	.
+1	104	.	T	C	.	.	.
+1	103	.	T	C	.	.	.
+1	102	.	T	C	.	.	.
+1	101	.	T	C	.	.	.
diff --git a/test/stats.B.chk b/test/stats.B.chk
index 3245562..2551b64 100644
--- a/test/stats.B.chk
+++ b/test/stats.B.chk
@@ -77,9 +77,9 @@ GCsS	2	B	75.000	0	1	0	0	3	0	0
 GCiS	2	B	0.000	0	0	0	0	0	0	0
 GCTs	B	0	0	0	0	0	2	1	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
 GCTi	B	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
-PSC	0	B	0	0	0	0	0	0	0.0	0
-PSC	1	B	0	0	0	0	0	0	0.0	0
-PSC	2	B	0	0	4	3	1	0	0.0	4
+PSC	0	B	0	0	0	0	0	0	0.0	0	0	0
+PSC	1	B	0	0	0	0	0	0	0.0	0	0	0
+PSC	2	B	0	0	4	3	1	0	0.0	4	0	0
 PSI	0	B	0	0	0	0.00	0	0
 PSI	1	B	0	0	0	0.00	0	0
 PSI	2	B	0	0	0	0.00	0	0
diff --git a/test/stats.chk b/test/stats.chk
index af03c3e..4c1a95c 100644
--- a/test/stats.chk
+++ b/test/stats.chk
@@ -85,15 +85,15 @@ GCTs	C	0	0	1	0	0	0	0	0	0	0	3	0	0	0	0	0	0	0	0	0	0	0	0	0	0
 GCTi	A	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
 GCTi	B	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
 GCTi	C	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
-PSC	0	A	0	0	0	0	0	0	0.0	0
-PSC	0	B	0	0	0	0	0	0	0.0	0
-PSC	0	C	0	0	0	0	0	0	0.0	0
-PSC	1	A	0	0	0	0	0	0	0.0	0
-PSC	1	B	0	0	0	0	0	0	0.0	0
-PSC	1	C	0	0	0	0	0	0	0.0	0
-PSC	2	A	4	0	0	0	0	0	0.0	0
-PSC	2	B	0	0	4	3	1	0	0.0	1
-PSC	2	C	1	3	0	3	0	0	0.0	0
+PSC	0	A	0	0	0	0	0	0	0.0	0	0	0
+PSC	0	B	0	0	0	0	0	0	0.0	0	0	0
+PSC	0	C	0	0	0	0	0	0	0.0	0	0	0
+PSC	1	A	0	0	0	0	0	0	0.0	0	0	0
+PSC	1	B	0	0	0	0	0	0	0.0	0	0	0
+PSC	1	C	0	0	0	0	0	0	0.0	0	0	0
+PSC	2	A	4	0	0	0	0	0	0.0	0	0	0
+PSC	2	B	0	0	4	3	1	0	0.0	1	0	0
+PSC	2	C	1	3	0	3	0	0	0.0	0	0	0
 PSI	0	A	0	0	0	0.00	0	0
 PSI	0	B	0	0	0	0.00	0	0
 PSI	0	C	0	0	0	0.00	0	0
diff --git a/test/stats.counts.chk b/test/stats.counts.chk
new file mode 100644
index 0000000..b14de7d
--- /dev/null
+++ b/test/stats.counts.chk
@@ -0,0 +1,37 @@
+SN	0	number of samples:	3
+SN	0	number of records:	11
+SN	0	number of no-ALTs:	2
+SN	0	number of SNPs:	6
+SN	0	number of MNPs:	2
+SN	0	number of indels:	1
+SN	0	number of others:	2
+SN	0	number of multiallelic sites:	4
+SN	0	number of multiallelic SNP sites:	2
+TSTV	0	4	3	1.33	3	2	1.50
+SiS	0	1	2	1	1	1	0	0	1
+AF	0	0.000000	2	1	1	1	0	0	1
+AF	0	0.330000	2	1	1	0	0	0	0
+AF	0	0.490000	3	2	1	0	0	0	0
+QUAL	0	998	6	3	2	1
+IDD	0	1	1
+ST	0	A>C	0
+ST	0	A>G	0
+ST	0	A>T	0
+ST	0	C>A	1
+ST	0	C>G	1
+ST	0	C>T	1
+ST	0	G>A	3
+ST	0	G>C	1
+ST	0	G>T	0
+ST	0	T>A	0
+ST	0	T>C	0
+ST	0	T>G	0
+PSC	0	A	11	0	0	0	0	0	0.0	0	0	0
+PSC	0	B	1	1	4	3	1	1	0.0	0	1	0
+PSC	0	C	1	5	0	3	1	0	0.0	0	0	1
+PSI	0	A	0	0	0	0.00	0	0
+PSI	0	B	0	0	0	0.00	1	0
+PSI	0	C	0	0	0	0.00	0	0
+HWE	0	0.000000	3	0.000000	0.000000	0.000000
+HWE	0	0.330000	1	0.000000	0.000000	0.000000
+HWE	0	0.490000	7	0.330000	0.330000	0.330000
diff --git a/test/stats.counts.vcf b/test/stats.counts.vcf
new file mode 100644
index 0000000..d4dc075
--- /dev/null
+++ b/test/stats.counts.vcf
@@ -0,0 +1,18 @@
+##fileformat=VCFv4.2
+##ALT=<ID=DEL,Description="Deletion">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=1,assembly=b37,length=249250621>
+##contig=<ID=X,assembly=b37,length=249250621>
+##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	A	B	C
+1	1000	.	G	A	.	PASS	.	GT	0/0	0/1	1/1
+1	1001	.	G	A,C	.	PASS	.	GT	0/0	1/2	2/2
+1	1002	.	G	A,GA	.	PASS	.	GT	0/0	1/2	1/1
+1	1003	.	CC	TT	.	PASS	.	GT	0/0	0/1	1/1
+1	1003	.	CC	TT,CT	.	PASS	.	GT	0/0	0/1	1/1
+1	1003	.	CACC	TAT	.	PASS	.	GT	0/0	0/1	1/1
+1	1004	.	C	<DEL>	.	PASS	.	GT	0/0	0/1	1/1
+1	1005	.	C	.	.	PASS	.	GT	0/0	0/0	0/0
+1	1006	.	CC	CC	.	PASS	.	GT	0/0	0/1	1/1
+1	1006	.	C	G,T	.	PASS	.	GT	0/0	1/1	2/2
+X	1000	.	C	A	.	PASS	.	GT	0/0	0	1
diff --git a/test/test.pl b/test/test.pl
index e65aa0e..e10b881 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -46,6 +46,7 @@ test_vcf_check($opts,in=>'check',out=>'check.chk');
 test_vcf_check_merge($opts,in=>'check',out=>'check_merge.chk');
 test_vcf_stats($opts,in=>['stats.a','stats.b'],out=>'stats.chk',args=>'-s -');
 test_vcf_stats($opts,in=>['stats.a','stats.b'],out=>'stats.B.chk',args=>'-s B');
+test_vcf_stats($opts,in=>['stats.counts'],out=>'stats.counts.chk',args=>'-s -');
 test_vcf_isec($opts,in=>['isec.a','isec.b'],out=>'isec.ab.out',args=>'-n =2');
 test_vcf_isec($opts,in=>['isec.a','isec.b'],out=>'isec.ab.flt.out',args=>'-n =2 -i"STRLEN(REF)==2"');
 test_vcf_isec($opts,in=>['isec.a','isec.b'],out=>'isec.ab.both.out',args=>'-n =2 -c both');
@@ -110,6 +111,9 @@ test_vcf_query($opts,in=>'filter-missing-floats',out=>'query.30.out',args=>q[-f'
 test_vcf_query($opts,in=>'filter-missing-floats',out=>'query.31.out',args=>q[-f'%POS\\t%A_AF\\t%B_AF\\t%C_AF\\n' -e'A_AF>=0.0001 || B_AF >= 0.0001 || C_AF >= 0.0001']);
 test_vcf_query($opts,in=>'missing',out=>'query.32.out',args=>q[-i'FMT/FINT!="."' -f'[\t%FINT]\\n']);
 test_vcf_query($opts,in=>'query.filter.2',out=>'query.33.out',args=>q[-f'[%GT]\\n' -i'GT~"0/[1-9]" || GT~"[1-9]/0"']);
+test_vcf_query($opts,in=>'view.filter',out=>'query.34.out',args=>q[-f'%POS[ %FGS]\\n' -i'FGS[1-2,4]="EE"']);
+test_vcf_query($opts,in=>'view.filter',out=>'query.35.out',args=>q[-f'%POS[ %FGI]\\n' -i'FGI[1-2,5]=6']);
+test_vcf_query($opts,in=>'view.filter',out=>'query.36.out',args=>q[-f'%POS[ %FGF]\\n' -i'FGF[1-3,4]=5e-5']);
 test_vcf_norm($opts,in=>'norm',out=>'norm.out',fai=>'norm',args=>'-cx');
 test_vcf_norm($opts,in=>'norm.split',out=>'norm.split.out',args=>'-m-');
 test_vcf_norm($opts,in=>'norm.split.2',out=>'norm.split.2.out',args=>'-m-');
@@ -185,6 +189,18 @@ test_vcf_filter($opts,in=>'filter.2',out=>'filter.8.out',args=>q[-i'AC[*]=2 && F
 test_vcf_filter($opts,in=>'filter.2',out=>'filter.9.out',args=>q[-i'ALT="."'],fmt=>'%POS\\t%AC[\\t%GT]\\n');
 test_vcf_filter($opts,in=>'filter.4',out=>'filter.10.out',args=>q[-S . -i 'FORMAT/TEST3<25']);
 test_vcf_filter($opts,in=>'filter.4',out=>'filter.10.out',args=>q[-S . -i 'FORMAT/TEST4<25']);
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.12.out',args=>q[-i'GT="A"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.13.out',args=>q[-i'GT="RR"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.14.out',args=>q[-i'GT="RA"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.14.out',args=>q[-i'GT="AR"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.15.out',args=>q[-i'GT="AA"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.16.out',args=>q[-i'GT="aA"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.16.out',args=>q[-i'GT="Aa"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.17.out',args=>q[-i'GT="HOM"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.18.out',args=>q[-i'GT="HET"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_filter($opts,in=>'filter.2',out=>'filter.19.out',args=>q[-i'GT="HAP"'],fmt=>'%POS[\\t%GT]\\n');
+test_vcf_sort($opts,in=>'sort',out=>'sort.out',args=>q[-m 0],fmt=>'%CHROM\\t%POS\\n');
+test_vcf_sort($opts,in=>'sort',out=>'sort.out',args=>q[-m 1000],fmt=>'%CHROM\\t%POS\\n');
 test_vcf_regions($opts,in=>'regions');
 test_vcf_annotate($opts,in=>'annotate',tab=>'annotate',out=>'annotate.out',args=>'-c CHROM,POS,REF,ALT,ID,QUAL,INFO/T_INT,INFO/T_FLOAT,INDEL');
 test_vcf_annotate($opts,in=>'annotate',tab=>'annotate2',out=>'annotate2.out',args=>'-c CHROM,FROM,TO,T_STR');
@@ -275,8 +291,8 @@ test_vcf_consensus($opts,in=>'consensus',out=>'consensus.1.out',fa=>'consensus.f
 test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.1.chain',chain=>'consensus.1.chain',fa=>'consensus.fa',mask=>'consensus.tab',args=>'');
 test_vcf_consensus($opts,in=>'consensus',out=>'consensus.2.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-H 1');
 test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.2.chain',chain=>'consensus.2.chain',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-H 1');
-test_vcf_consensus($opts,in=>'consensus',out=>'consensus.3.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-i');
-test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.3.chain',chain=>'consensus.3.chain',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-i');
+test_vcf_consensus($opts,in=>'consensus',out=>'consensus.3.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-I');
+test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.3.chain',chain=>'consensus.3.chain',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-I');
 test_vcf_consensus($opts,in=>'consensus',out=>'consensus.4.out',fa=>'consensus.fa',args=>'-H 1');
 test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.4.chain',chain=>'consensus.4.chain',fa=>'consensus.fa',args=>'-H 1');
 test_vcf_consensus($opts,in=>'consensus2',out=>'consensus2.1.out',fa=>'consensus2.fa',args=>'-H 1');
@@ -703,6 +719,13 @@ sub test_vcf_filter
     test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools filter $args{args} $$opts{path}/$args{in}.vcf | $pipe");
     test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools filter -Ob $args{args} $$opts{path}/$args{in}.vcf | $$opts{bin}/bcftools view | $pipe");
 }
+sub test_vcf_sort
+{
+    my ($opts,%args) = @_;
+    my $pipe = "$$opts{bin}/bcftools query -f '$args{fmt}'";
+    test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools sort $args{args} $$opts{path}/$args{in}.vcf | $pipe");
+    test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools sort -Ob $args{args} $$opts{path}/$args{in}.vcf | $$opts{bin}/bcftools view | $pipe");
+}
 sub test_vcf_regions
 {
     my ($opts,%args) = @_;
@@ -1088,4 +1111,3 @@ sub test_csq_real
     }
     closedir($dh);
 }
-
diff --git a/vcfcnv.c b/vcfcnv.c
index ffe71c4..11c55bd 100644
--- a/vcfcnv.c
+++ b/vcfcnv.c
@@ -212,8 +212,14 @@ static double *init_iprobs(int ndim, double same_prob)
 static void init_sample_files(sample_t *smpl, char *dir)
 {
     smpl->dat_fh = open_file(&smpl->dat_fname,"w","%s/dat.%s.tab",dir,smpl->name);
+    if ( !smpl->dat_fh ) error("Error opening file: %s/dat.%s.tab\n",dir,smpl->name);
+
     smpl->cn_fh  = open_file(&smpl->cn_fname,"w","%s/cn.%s.tab",dir,smpl->name);
+    if ( !smpl->cn_fh ) error("Error opening file: %s/cn.%s.tab\n",dir,smpl->name);
+
     smpl->summary_fh = open_file(&smpl->summary_fname,"w","%s/summary.%s.tab",dir,smpl->name);
+    if ( !smpl->summary_fh ) error("Error opening file: %s/summary.%s.tab\n",dir,smpl->name);
+
     fprintf(smpl->dat_fh,"# [1]Chromosome\t[2]Position\t[3]BAF\t[4]LRR\n");
     fprintf(smpl->cn_fh,"# [1]Chromosome\t[2]Position\t[3]CN\t[4]P(CN0)\t[5]P(CN1)\t[6]P(CN2)\t[7]P(CN3)\n");
     fprintf(smpl->summary_fh,"# RG, Regions [2]Chromosome\t[3]Start\t[4]End\t[5]Copy Number state\t[6]Quality\t[7]nSites\t[8]nHETs\n");
diff --git a/vcfconvert.c b/vcfconvert.c
index 8f596d4..1e28ad8 100644
--- a/vcfconvert.c
+++ b/vcfconvert.c
@@ -1316,12 +1316,13 @@ static void gvcf_to_vcf(args_t *args)
         }
 
         // check if alleles compatible with being a gVCF record
+        // ALT must be one of ., <*>, <X>, <NON_REF>
+        // check for INFO/END is below
         int i, gallele = -1;
         if (line->n_allele==1)
             gallele = 0; // illumina/bcftools-call gvcf (if INFO/END present)
-        else
+        else if ( line->d.allele[1][0]=='<' )
         {
-            if ( line->d.allele[1][0]!='<' ) continue;
             for (i=1; i<line->n_allele; i++)
             {
                 if ( line->d.allele[i][1]=='*' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // mpileup/spec compliant gVCF
diff --git a/vcfindex.c b/vcfindex.c
index aa60fb2..807fedd 100644
--- a/vcfindex.c
+++ b/vcfindex.c
@@ -32,6 +32,7 @@ DEALINGS IN THE SOFTWARE.  */
 #define __STDC_FORMAT_MACROS
 #include <inttypes.h>
 #include <htslib/kstring.h>
+#include <htslib/bgzf.h>
 #include "bcftools.h"
 
 #define BCF_LIDX_SHIFT    14
@@ -208,6 +209,12 @@ int main_vcfindex(int argc, char *argv[])
                 return 1;
             }
         }
+
+        // check for truncated files, allow only with -f
+        BGZF *fp = bgzf_open(fname, "r");
+        if ( !fp ) error("index: failed to open %s\n", fname);
+        if ( bgzf_check_EOF(fp)!=1 ) error("index: the input is probably truncated, use -f to index anyway: %s\n", fname);
+        if ( bgzf_close(fp)!=0 ) error("index: close failed: %s\n", fname);
     }
 
     int ret = bcf_index_build3(fname, idx_fname.s, min_shift, n_threads);
diff --git a/vcfisec.c b/vcfisec.c
index 9eb3a7c..3e0e1e5 100644
--- a/vcfisec.c
+++ b/vcfisec.c
@@ -82,13 +82,13 @@ void mkdir_p(const char *fmt, ...)
     while (*p)
     {
         while (*p && *p!='/') p++;
-        if ( *p )
-        {
-            *p = 0;
-            mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
-            *p = '/';
-            p++;
-        }
+        if ( !*p ) break;
+        char ctmp = *p;
+        *p = 0;
+        int ret = mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+        if ( ret!=0 && errno!=EEXIST ) error("Error creating directory %s: %s\n", path,strerror(errno));
+        *p = ctmp;
+        while ( *p && *p=='/' ) p++;
     }
     free(tmp);
     free(path);
diff --git a/vcfmerge.c b/vcfmerge.c
index e9ed5ad..31f5dad 100644
--- a/vcfmerge.c
+++ b/vcfmerge.c
@@ -662,7 +662,6 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb)
         }
         // new allele
         map[i] = *nb;
-        if ( b[*nb] ) free(b[*nb]);
         b[*nb] = const_ai ? strdup(ai) : ai;
         (*nb)++;
     }
@@ -1668,6 +1667,11 @@ void gvcf_set_alleles(args_t *args)
     bcf_srs_t *files = args->files;
     maux_t *maux = args->maux;
     gvcf_aux_t *gaux = maux->gvcf;
+    for (i=0; i<maux->nals; i++) 
+    {
+        free(maux->als[i]);
+        maux->als[i] = NULL;
+    }
     maux->nals = 0;
 
     for (i=0; i<files->nreaders; i++)
@@ -2025,9 +2029,15 @@ int can_merge(args_t *args)
     maux_t *maux = args->maux;
     gvcf_aux_t *gaux = maux->gvcf;
     char *id = NULL, ref = 'N';
+    int i,j,k, ntodo = 0;
+
+    for (i=0; i<maux->nals; i++) 
+    {
+        free(maux->als[i]);
+        maux->als[i] = NULL;
+    }
     maux->var_types = maux->nals = 0;
 
-    int i,j,k, ntodo = 0;
     for (i=0; i<files->nreaders; i++)
     {
         buffer_t *buf = &maux->buf[i];
diff --git a/vcfnorm.c b/vcfnorm.c
index 86c20ab..bc51018 100644
--- a/vcfnorm.c
+++ b/vcfnorm.c
@@ -1514,6 +1514,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
 {
     bcf1_t *line;
     int i, k;
+    int prev_rid = -1, prev_pos = -1, prev_type = 0;
     for (i=0; i<n; i++)
     {
         k = rbuf_shift(&args->rbuf);
@@ -1534,6 +1535,23 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
                 continue;
             }
         }
+        else if ( args->rmdup )
+        {
+            int line_type = bcf_get_variant_types(args->lines[k]);
+            if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos )
+            {
+                if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue;
+                if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
+                if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+            }
+            else
+            {
+                prev_rid  = args->lines[k]->rid;
+                prev_pos  = args->lines[k]->pos;
+                prev_type = 0;
+            }
+            prev_type |= line_type;
+        }
         bcf_write1(file, args->hdr, args->lines[k]);
     }
     if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n )
diff --git a/vcfquery.c b/vcfquery.c
index ab4c100..04554f8 100644
--- a/vcfquery.c
+++ b/vcfquery.c
@@ -32,6 +32,7 @@ THE SOFTWARE.  */
 #include <sys/types.h>
 #include <htslib/vcf.h>
 #include <htslib/synced_bcf_reader.h>
+#include <htslib/khash_str2int.h>
 #include <htslib/vcfutils.h>
 #include "bcftools.h"
 #include "filter.h"
@@ -151,10 +152,26 @@ static void query_vcf(args_t *args)
 
 static void list_columns(args_t *args)
 {
+    void *has_sample = NULL;
+    if ( args->sample_list )
+    {
+        has_sample = khash_str2int_init();
+        int i, nsmpl;
+        char **smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl);
+        for (i=0; i<nsmpl; i++) khash_str2int_inc(has_sample, smpl[i]);
+        free(smpl);
+    }
+
     int i;
     bcf_sr_t *reader = &args->files->readers[0];
     for (i=0; i<bcf_hdr_nsamples(reader->header); i++)
+    {
+        if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) continue;
         printf("%s\n", reader->header->samples[i]);
+    }
+
+    if ( has_sample )
+        khash_str2int_destroy_free(has_sample);
 }
 
 static char **copy_header(bcf_hdr_t *hdr, char **src, int nsrc)
diff --git a/vcfsort.c b/vcfsort.c
new file mode 100644
index 0000000..e41b628
--- /dev/null
+++ b/vcfsort.c
@@ -0,0 +1,306 @@
+/*  vcfsort.c -- sort subcommand
+
+   Copyright (C) 2017 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3 at sanger.ac.uk>
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <math.h>
+#include <htslib/vcf.h>
+#include <htslib/kstring.h>
+#include "kheap.h"
+#include "bcftools.h"
+
+typedef struct
+{
+    char *fname;
+    htsFile *fh;
+    bcf1_t *rec;
+}
+blk_t;
+
+typedef struct _args_t
+{
+    bcf_hdr_t *hdr;
+    char **argv, *fname, *output_fname, *tmp_dir;
+    int argc, output_type;
+    size_t max_mem, mem;
+    bcf1_t **buf;
+    size_t nbuf, mbuf, nblk;
+    blk_t *blk;
+}
+args_t;
+
+int cmp_bcf_pos(const void *aptr, const void *bptr)
+{
+    bcf1_t *a = *((bcf1_t**)aptr);
+    bcf1_t *b = *((bcf1_t**)bptr);
+    if ( a->rid < b->rid ) return -1;
+    if ( a->rid > b->rid ) return 1;
+    if ( a->pos < b->pos ) return -1;
+    if ( a->pos > b->pos ) return 1;
+    return 0;
+}
+
+void buf_flush(args_t *args)
+{
+    if ( !args->nbuf ) return;
+
+    qsort(args->buf, args->nbuf, sizeof(*args->buf), cmp_bcf_pos);
+
+    args->nblk++;
+    args->blk = (blk_t*) realloc(args->blk, sizeof(blk_t)*args->nblk);
+    blk_t *blk = args->blk + args->nblk - 1;
+
+    kstring_t str = {0,0,0};
+    ksprintf(&str, "%s/%05d.bcf", args->tmp_dir, (int)args->nblk);
+    blk->fname = str.s;
+
+    htsFile *fh = hts_open(blk->fname, "wbu");
+    if ( fh == NULL ) error("Cannot write %s: %s\n", blk->fname, strerror(errno));
+    bcf_hdr_write(fh, args->hdr);
+    
+    int i;
+    for (i=0; i<args->nbuf; i++)
+    {
+        bcf_write(fh, args->hdr, args->buf[i]);
+        bcf_destroy(args->buf[i]);
+    }
+    hts_close(fh);
+
+    args->nbuf = 0;
+    args->mem  = 0;
+}
+
+void buf_push(args_t *args, bcf1_t *rec)
+{
+    int delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + sizeof(bcf1_t*);
+    if ( args->mem + delta > args->max_mem ) buf_flush(args);
+    args->nbuf++;
+    args->mem += delta;
+    hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf);
+    args->buf[args->nbuf-1] = rec;
+}
+
+void sort_blocks(args_t *args) 
+{
+    htsFile *in = hts_open(args->fname, "r");
+    if ( !in ) error("Could not read %s\n", args->fname);
+    args->hdr = bcf_hdr_read(in);
+
+    while ( 1 )
+    {
+        bcf1_t *rec = bcf_init();
+        int ret = bcf_read1(in, args->hdr, rec);
+        if ( ret < -1 ) error("Error encountered while parsing the input\n");
+        if ( ret == -1 )
+        {
+            bcf_destroy(rec);
+            break;
+        }
+        buf_push(args, rec);
+    }
+    buf_flush(args);
+    free(args->buf);
+
+    if ( hts_close(in)!=0 ) error("Close failed: %s\n", args->fname);
+}
+
+static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr)
+{
+    blk_t *a = *aptr;
+    blk_t *b = *bptr;
+    if ( a->rec->rid < b->rec->rid ) return 1;
+    if ( a->rec->rid > b->rec->rid ) return 0;
+    if ( a->rec->pos < b->rec->pos ) return 1;
+    return 0;
+}
+KHEAP_INIT(blk, blk_t*, blk_is_smaller)
+
+void blk_read(khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk)
+{
+    if ( !blk->fh ) return;
+    int ret = bcf_read(blk->fh, hdr, blk->rec);
+    if ( ret < -1 ) error("Error reading %s\n", blk->fname);
+    if ( ret == -1 )
+    {
+        if ( hts_close(blk->fh)!=0 ) error("Close failed: %s\n", blk->fname);
+        blk->fh = 0;
+        return;
+    }
+    khp_insert(blk, bhp, &blk);
+}
+
+void merge_blocks(args_t *args) 
+{
+    fprintf(stderr,"Merging %d temporary files\n", (int)args->nblk);
+
+    khp_blk_t *bhp = khp_init(blk);
+
+    int i;
+    for (i=0; i<args->nblk; i++)
+    {
+        blk_t *blk = args->blk + i;
+        blk->fh = hts_open(blk->fname, "r");
+        if ( !blk->fh ) error("Could not read %s: %s\n", blk->fname, strerror(errno));
+        bcf_hdr_t *hdr = bcf_hdr_read(blk->fh);
+        bcf_hdr_destroy(hdr);
+        blk->rec = bcf_init();
+        blk_read(bhp, args->hdr, blk);
+    }
+
+    htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+    bcf_hdr_write(out, args->hdr);
+    while ( bhp->ndat )
+    {
+        blk_t *blk = bhp->dat[0];
+        bcf_write(out, args->hdr, blk->rec);
+        khp_delete(blk, bhp);
+        blk_read(bhp, args->hdr, blk);
+    }
+    if ( hts_close(out)!=0 ) error("Close failed: %s\n", args->output_fname);
+
+    fprintf(stderr,"Cleaning\n");
+    for (i=0; i<args->nblk; i++)
+    {
+        blk_t *blk = args->blk + i;
+        unlink(blk->fname);
+        free(blk->fname);
+        bcf_destroy(blk->rec);
+    }
+    rmdir(args->tmp_dir);
+    free(args->blk);
+    khp_destroy(blk, bhp);
+    fprintf(stderr,"Done\n");
+}
+
+static void usage(args_t *args)
+{
+    fprintf(stderr, "\n");
+    fprintf(stderr, "About:   Sort VCF/BCF file.\n");
+    fprintf(stderr, "Usage:   bcftools sort [OPTIONS] <FILE.vcf>\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "Options:\n");
+    fprintf(stderr, "    -m, --max-mem <float>[kMG]    maximum memory to use [768M]\n");    // using metric units, 1M=1e6
+    fprintf(stderr, "    -o, --output-file <file>      output file name [stdout]\n");
+    fprintf(stderr, "    -O, --output-type <b|u|z|v>   b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+    fprintf(stderr, "    -T, --temp-dir <dir>          temporary files [/tmp/bcftools-sort.XXXXXX/]\n");
+    fprintf(stderr, "\n");
+    exit(1);
+}
+
+size_t parse_mem_string(char *str) 
+{
+    char *tmp;
+    double mem = strtod(str, &tmp);
+    if ( tmp==str ) error("Could not parse: --max-mem %s\n", str);
+    if ( !strcasecmp("k",tmp) ) mem *= 1000;
+    else if ( !strcasecmp("m",tmp) ) mem *= 1000*1000;
+    else if ( !strcasecmp("g",tmp) ) mem *= 1000*1000*1000;
+    return mem;
+}
+
+void mkdir_p(const char *fmt, ...);
+void init(args_t *args)
+{
+    if ( !args->tmp_dir )
+    {
+        args->tmp_dir = strdup("/tmp/bcftools-sort.XXXXXX");
+        char *tmp_dir = mkdtemp(args->tmp_dir);
+        if ( !tmp_dir ) error("mkdtemp(%s) failed: %s\n", args->tmp_dir,strerror(errno));
+    }
+    else
+    {
+        args->tmp_dir = strdup(args->tmp_dir);
+        mkdir_p(args->tmp_dir);
+    }
+    fprintf(stderr,"Writing to %s\n", args->tmp_dir);
+}
+void destroy(args_t *args)
+{
+    bcf_hdr_destroy(args->hdr);
+    free(args->tmp_dir);
+    free(args);
+}
+
+int main_sort(int argc, char *argv[])
+{
+    int c;
+    args_t *args  = (args_t*) calloc(1,sizeof(args_t));
+    args->argc    = argc; args->argv = argv;
+    args->max_mem = 768*1000*1000;
+    args->output_fname = "-";
+
+    static struct option loptions[] =
+    {
+        {"max-mem",required_argument,NULL,'m'},
+        {"temp-dir",required_argument,NULL,'T'},
+        {"output-type",required_argument,NULL,'O'},
+        {"output-file",required_argument,NULL,'o'},
+        {"help",no_argument,NULL,'h'},
+        {0,0,0,0}
+    };
+    while ((c = getopt_long(argc, argv, "m:T:O:o:h?",loptions,NULL)) >= 0)
+    {
+        switch (c)
+        {
+            case 'm': args->max_mem = parse_mem_string(optarg); break;
+            case 'T': args->tmp_dir = optarg; break;
+            case 'o': args->output_fname = optarg; break;
+            case 'O':
+                      switch (optarg[0]) {
+                          case 'b': args->output_type = FT_BCF_GZ; break;
+                          case 'u': args->output_type = FT_BCF; break;
+                          case 'z': args->output_type = FT_VCF_GZ; break;
+                          case 'v': args->output_type = FT_VCF; break;
+                          default: error("The output type \"%s\" not recognised\n", optarg);
+                      };
+                      break;
+            case 'h': usage(args);
+            case '?': usage(args);
+            default: error("Unknown argument: %s\n", optarg);
+        }
+    }
+
+    if ( optind>=argc )
+    {
+        if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-";  // reading from stdin
+        else usage(args);
+    }
+    else args->fname = argv[optind];
+
+    init(args);
+    sort_blocks(args);
+    merge_blocks(args);
+    destroy(args);
+
+    return 0;
+}
diff --git a/vcfstats.c b/vcfstats.c
index 4041a5a..3b73173 100644
--- a/vcfstats.c
+++ b/vcfstats.c
@@ -87,6 +87,7 @@ typedef struct
     int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1;
     int subst[15];
     int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl;
+    int *smpl_hapRef, *smpl_hapAlt;
     int *smpl_indel_hets, *smpl_indel_homs;
     int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
     unsigned long int *smpl_dp;
@@ -472,6 +473,8 @@ static void init_stats(args_t *args)
             stats->smpl_hets   = (int *) calloc(args->files->n_smpl,sizeof(int));
             stats->smpl_homAA  = (int *) calloc(args->files->n_smpl,sizeof(int));
             stats->smpl_homRR  = (int *) calloc(args->files->n_smpl,sizeof(int));
+            stats->smpl_hapRef = (int *) calloc(args->files->n_smpl,sizeof(int));
+            stats->smpl_hapAlt = (int *) calloc(args->files->n_smpl,sizeof(int));
             stats->smpl_indel_hets = (int *) calloc(args->files->n_smpl,sizeof(int));
             stats->smpl_indel_homs = (int *) calloc(args->files->n_smpl,sizeof(int));
             stats->smpl_ts     = (int *) calloc(args->files->n_smpl,sizeof(int));
@@ -548,17 +551,19 @@ static void destroy_stats(args_t *args)
         #endif
         free(stats->insertions);
         free(stats->deletions);
-        if (stats->smpl_hets) free(stats->smpl_hets);
-        if (stats->smpl_homAA) free(stats->smpl_homAA);
-        if (stats->smpl_homRR) free(stats->smpl_homRR);
-        if (stats->smpl_indel_homs) free(stats->smpl_indel_homs);
-        if (stats->smpl_indel_hets) free(stats->smpl_indel_hets);
-        if (stats->smpl_ts) free(stats->smpl_ts);
-        if (stats->smpl_tv) free(stats->smpl_tv);
-        if (stats->smpl_indels) free(stats->smpl_indels);
-        if (stats->smpl_dp) free(stats->smpl_dp);
-        if (stats->smpl_ndp) free(stats->smpl_ndp);
-        if (stats->smpl_sngl) free(stats->smpl_sngl);
+        free(stats->smpl_hets);
+        free(stats->smpl_homAA);
+        free(stats->smpl_homRR);
+        free(stats->smpl_hapRef);
+        free(stats->smpl_hapAlt);
+        free(stats->smpl_indel_homs);
+        free(stats->smpl_indel_hets);
+        free(stats->smpl_ts);
+        free(stats->smpl_tv);
+        free(stats->smpl_indels);
+        free(stats->smpl_dp);
+        free(stats->smpl_ndp);
+        free(stats->smpl_sngl);
         idist_destroy(&stats->dp);
         idist_destroy(&stats->dp_sites);
         for (j=0; j<stats->nusr; j++)
@@ -861,6 +866,8 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
                     assert( ial<line->n_allele );
                     stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++;
                 }
+                if ( gt == GT_HAPL_R ) stats->smpl_hapRef[is]++;
+                if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[is]++;
                 continue;
             }
             if ( gt != GT_HOM_RR ) { n_nref++; i_nref = is; }
@@ -873,7 +880,10 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
                     case GT_HOM_AA: nalt_tot++; break;
                 }
             #endif
-            if ( line_type&VCF_SNP || line_type==VCF_REF )  // count ALT=. as SNP
+            int var_type = 0;
+            if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial);
+            if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal);
+            if ( var_type&VCF_SNP || var_type==VCF_REF )  // count ALT=. as SNP
             {
                 if ( gt == GT_HET_RA ) stats->smpl_hets[is]++;
                 else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++;
@@ -889,7 +899,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
                         stats->smpl_tv[is]++;
                 }
             }
-            if ( line_type&VCF_INDEL )
+            if ( var_type&VCF_INDEL )
             {
                 if ( gt != GT_HOM_RR )
                 {
@@ -1068,7 +1078,7 @@ static void do_vcf_stats(args_t *args)
         if ( line->n_allele>2 )
         {
             stats->n_mals++;
-            if ( line_type == VCF_SNP ) stats->n_snp_mals++;
+            if ( line_type == VCF_SNP ) stats->n_snp_mals++;    // note: this will be fooled by C>C,T
         }
 
         if ( files->n_smpl )
@@ -1125,7 +1135,22 @@ static void print_header(args_t *args)
 static void print_stats(args_t *args)
 {
     int i, j,k, id;
-    printf("# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n");
+    printf("# SN, Summary numbers:\n");
+    printf("#   number of records   .. number of data rows in the VCF\n");
+    printf("#   number of no-ALTs   .. reference-only sites, ALT is either \".\" or identical to REF\n");
+    printf("#   number of SNPs      .. number of rows with a SNP\n");
+    printf("#   number of MNPs      .. number of rows with a MNP, such as CC>TT\n");
+    printf("#   number of indels    .. number of rows with an indel\n");
+    printf("#   number of others    .. number of rows with other type, for example a symbolic allele or\n");
+    printf("#                          a complex substitution, such as ACT>TCGA\n");
+    printf("#   number of multiallelic sites     .. number of rows with multiple alternate alleles\n");
+    printf("#   number of multiallelic SNP sites .. number of rows with multiple alternate alleles, all SNPs\n");
+    printf("# \n");
+    printf("#   Note that rows containing multiple types will be counted multiple times, in each\n");
+    printf("#   counter. For example, a row with a SNP and an indel increments both the SNP and\n");
+    printf("#   the indel counter.\n");
+    printf("# \n");
+    printf("# SN\t[2]id\t[3]key\t[4]value\n");
     for (id=0; id<args->files->nreaders; id++)
         printf("SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header));
     for (id=0; id<args->nstats; id++)
@@ -1470,16 +1495,18 @@ static void print_stats(args_t *args)
 
     if ( args->files->n_smpl )
     {
-        printf("# PSC, Per-sample counts\n# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons\n");
+        printf("# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. Haploid counts include both SNPs and indels.\n");
+        printf("# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons"
+            "\t[12]nHapRef\t[13]nHapAlt\n");
         for (id=0; id<args->nstats; id++)
         {
             stats_t *stats = &args->stats[id];
             for (i=0; i<args->files->n_smpl; i++)
             {
                 float dp = stats->smpl_ndp[i] ? stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0;
-                printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\n", id,args->files->samples[i],
+                printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\n", id,args->files->samples[i],
                     stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i],
-                    stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i]);
+                    stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i], stats->smpl_hapAlt[i]);
             }
         }
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/bcftools.git



More information about the debian-med-commit mailing list